def soap_my(self, data, tag, attr='a', href='href'):
    """Return the first link whose markup matches `tag`, or [] if none is found."""
    site = urlparse2(self.url).hostname
    soup = BeautifulSoup(data)
    # Collect every element of type `attr` that carries the `href` attribute.
    links = soup.findAll(attr, {href: True})
    print links
    if not links:
        links = soup.findAll(attr, {href: True})
    done = 0
    for everytext in links:
        if re.findall(tag, str(everytext)):
            print everytext
            print everytext[href]
            # Relative links are resolved against the page's host.
            if not (re.findall('www', everytext[href]) or re.findall('http://', everytext[href])):
                f_name = urlparse.urljoin("http://" + site, everytext[href])
            else:
                f_name = everytext[href]
            print f_name
            text = ''.join(everytext.findAll(text=True))
            data = text.strip()
            done = 1
            return f_name
    if done == 0:
        return []
def build_opener(self, url, proxy=[], User_Pass=[], postdata=None, extraheaders={}, forbid_redirect=False):
    txheaders = {
        'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
        'Accept-Language': 'en,hu;q=0.8,en-us;q=0.5,hu-hu;q=0.3',
        # 'Accept-Encoding': 'gzip, deflate',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        # 'Keep-Alive': '300',
        # 'Connection': 'keep-alive',
        # 'Cache-Control': 'max-age=0',
    }
    for key, value in extraheaders.iteritems():
        txheaders[key] = value
    req = urllib2.Request(url, postdata, txheaders)
    self.cookies.add_cookie_header(req)
    if forbid_redirect:
        redirector = HTTPNoRedirector()
    else:
        redirector = urllib2.HTTPRedirectHandler()
    # Build the proxy handler, optionally with user:pass authentication.
    if proxy != [] and (not re.findall("None", proxy)) and proxy != '':
        if User_Pass != [] and User_Pass != '':
            proxies = {"http": "http://" + User_Pass + "@" + proxy}
        else:
            proxies = {"http": "http://%s" % proxy}
        proxy_support = urllib2.ProxyHandler(proxies)
    else:
        proxy_support = urllib2.ProxyHandler()
    http_handler = urllib2.HTTPHandler(debuglevel=self.debug)
    https_handler = urllib2.HTTPSHandler(debuglevel=self.debug)
    u = urllib2.build_opener(proxy_support, http_handler, https_handler,
                             urllib2.HTTPCookieProcessor(self.cookies), redirector)
    urllib2.install_opener(u)
    u.addheaders = [
        ('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; hu-HU; rv:1.7.8) Gecko/20050511 Firefox/1.0.4')]
    if postdata is not None:
        req.add_data(postdata)
    # Create a per-site LWP cookie file under ./cookies/<site>/ if none is set yet.
    if self.cookie3 == '':
        fo = os.getcwd().replace('\\', '/')
        site = urlparse2(url).hostname
        if not os.path.isdir(fo + "/cookies/" + site):
            os.mkdir(fo + "/cookies/" + site)
        chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
        self.cookie3 = fo + "/cookies/" + site + '/' + ''.join([random.choice(chars) for x in range(5)]) + ".txt"
    self.cookies.save(self.cookie3)
    return (req, u, self.cookie3)
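# Usage sketch (assumption: `self` is an instance of the downloader class that
# defines build_opener above, with self.cookies, self.cookie3 and self.debug set;
# the URL and proxy values are illustrative only). The tuple layout follows the
# `return (req, u, self.cookie3)` line above.
#
#   req, opener, cookie_path = self.build_opener('http://example.com/abs.jsp',
#                                                proxy='1.2.3.4:8080')
#   html = opener.open(req).read()
#   # cookie_path points at the LWP cookie file saved under ./cookies/<site>/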
def soap_my(self, **kwargs):
    data = kwargs['data']
    tag = kwargs['tag']
    attr = kwargs.get('attr', 'a')
    href = kwargs.get('href', 'href')
    if 'url' in kwargs:
        url = kwargs['url']
    else:
        url = "http://" + urlparse2(self.url).hostname
    soup = BeautifulSoup(data)
    # Collect every element of type `attr` that carries the `href` attribute.
    links = soup.findAll(attr, {href: True})
    if not links:
        links = soup.findAll(attr, {href: True})
    done = 0
    for everytext in links:
        if re.findall(tag, str(everytext)):
            print " link url found for downloading...\n\t%s" % everytext
            if not (re.findall('www', everytext[href]) or re.findall('http://', everytext[href])):
                # f_name = urlparse.urljoin(url, everytext[href])
                f_name = url + '/' + everytext[href]
            else:
                f_name = everytext[href]
            print unicode(f_name)
            text = ''.join(everytext.findAll(text=True))
            data = text.strip()
            done = 1
            return f_name
    if done == 0:
        return []
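# A minimal standalone sketch of the findAll-based filtering used by soap_my,
# assuming the legacy BeautifulSoup 3 package ("from BeautifulSoup import
# BeautifulSoup") that this module relies on. The HTML snippet and the
# 'Full Text as PDF' label are illustrative values only.
#
#   sample = ('<a href="/stamp/stamp.jsp?arnumber=1">Full Text as PDF</a>'
#             '<a href="/xpls/abs_all.jsp?arnumber=1">Abstract</a>')
#   soup = BeautifulSoup(sample)
#   for a in soup.findAll('a', href=True):
#       if re.findall('Full Text as PDF', str(a)):
#           print a['href']   # -> /stamp/stamp.jsp?arnumber=1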
def url2Path(**kwargs): myhost="http://127.0.0.1/" url=kwargs['url'] site = urlparse2(url).hostname myhost="http://"+site+"/" try: kwargs['pdf_dir'] pdf_dir=kwargs['pdf_dir'] except:pdf_dir=url.split(myhost)[1] ph=pdf_dir.split('/')[0] f_ph=pdf_dir.split('/')[-1] rp=os.getcwd().replace('\\','/').replace('%20',' ').split(ph)[0] path=rp+pdf_dir.split(f_ph)[0].replace('%20',' ')+f_ph return path
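# Example of the mapping url2Path performs, assuming the module-level urlparse2
# helper used above. Values are illustrative:
#
#   url2Path(url='http://127.0.0.1/PDF_Files/sub%20dir/01274437.pdf')
#   # -> '<cwd prefix>/PDF_Files/sub dir/01274437.pdf'
#
# i.e. the path component of the URL is re-rooted below the current working
# directory and '%20' escapes are turned back into spaces.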
def get_pdf_link(self, proxy='', user_pass=''):
    url = self.url
    if proxy == '':
        fo = os.getcwd()
        pr_h, proxy_h, user_pass_h = self.proxy_checker3.make_returning_proxy("configs//sites_proxy//", url)
        os.chdir(fo)
    else:
        pr_h = [proxy]
        user_pass_h = [user_pass]
    try:
        i = pr_h.index("")
        del pr_h[i]
    except ValueError:
        pass
    don_flg = -1
    if pr_h != []:
        i = -1
        site = urlparse2(url).hostname
        listhandle = self.file_rd(self.sites_list, 'r')
        file_listhandle = self.file_rd(self.sites_list_files, 'r')
        link_done = 0
        url_pdf = {}
        links = []
        cookies = ''
        for j in range(i + 1, len(pr_h)):
            if don_flg != 1 and not url.endswith('.pdf') and not url.endswith('.zip') and link_done == 0:
                time0 = time.time()
                res = self.dowload_basePr_userpass_link(url, pr_h[j], user_pass_h[j], cookies='')
                html, cookies, mech = res['html'], res['cookies'], res['mechanizm']
                proxy0, user_pass = res['proxy'], res['user_pass']
                if link_done == 0 and html != []:
                    # First try the direct IEEE "Full Text as PDF" anchor.
                    links = self.soap_my(data=html, tag='Full Text as PDF', attr='a', href='href', url=url)
                    if links != [] and mech != 1:
                        # Follow the stamp page and pull the PDF out of its frame.
                        res = self.dowload_basePr_userpass_link(url, pr_h[j], user_pass_h[j], cookies='')
                        html, cookies, mech = res['html'], res['cookies'], res['mechanizm']
                        proxy0, user_pass = res['proxy'], res['user_pass']
                        links = self.soap_my(data=html, tag='<frame src="http://ieeexplore.ieee.org',
                                             attr='frame', href='src', url=str(links))
                    if links == [] or links is None:
                        pass
                    else:
                        link_done = 1
                        print '---------------we found Proper link which is :------------\n' + str(links) + \
                            '\n ----with proxy-------\n' + str(pr_h[j]) + ':' + str(user_pass_h[j])
                        print '----------------- Link Found -------------------------'
                        break
                    # Fall back to the per-site TAG1/TAG2 rules from the sites list.
                    for line in listhandle:
                        if re.findall(site, line) and link_done == 0 and (not re.findall("#", line.split("TAG:")[0])):
                            if re.findall("TAG1:", line):
                                try:
                                    Tag = line.split("TAG1:")[1].split("---")[0].replace("+++", '')
                                    atrr = line.split("Attr1:")[1].split("---")[0].replace("+++", '')
                                    href = line.split('Href1:')[1].split("---")[0].replace("+++", '')
                                    links = self.soap_my(data=html, tag=Tag, attr=atrr, href=href, url=url)
                                    if links != [] and link_done != None and mech != 1:
                                        try:
                                            Tag = line.split("TAG2:")[1].split("---")[0].replace("---", '').replace("+++", '')
                                            atrr = line.split("Attr2:")[1].split("---")[0].replace('---', '').replace("+++", '')
                                            href = line.split('Href2:')[1].split("---")[0].replace("+++", '')
                                            res = self.dowload_basePr_userpass_link(url, pr_h[j], user_pass_h[j], cookies='')
                                            html, cookies, mech = res['html'], res['cookies'], res['mechanizm']
                                            proxy0, user_pass = res['proxy'], res['user_pass']
                                        except:
                                            pass
                                    if links != [] and links is not None:
                                        link_done = 1
                                        print '---------------we found Proper link which is :------------\n' + str(links) + \
                                            '\n ----with proxy-------\n' + str(pr_h[j]) + ':' + str(user_pass_h[j])
                                        print '----------------- Link Found -------------------------'
                                        return links, pr_h[j], user_pass_h[j]
                                except:
                                    pass
                        elif link_done == 1:
                            print "<li><a>tag found</a></li>"
                            print links
                            break
                elif link_done == 0:
                    # Record the failing proxy so it is skipped next time.
                    if not os.path.isdir('configs/sites_proxy/' + site):
                        os.mkdir('configs/sites_proxy/' + site)
                    time_diff = str(round(time.time() - time0, 2))
                    if len(user_pass) != 0:
                        self.proxy_checker3.make_txt_file('configs/sites_proxy/' + site + "/badproxylist.txt",
                                                          str(pr_h[j]) + '@' + str(user_pass_h[j]), site, time_diff)
                    else:
                        self.proxy_checker3.make_txt_file('configs/sites_proxy/' + site + "/badproxylist.txt",
                                                          str(pr_h[j]), site, time_diff)
            elif url != [] or (url.endswith('.pdf') or url.endswith('.zip')):
                cookies = ''
                return url, '', '', cookies
        if link_done == 0:
            links = []
            pr_h[j] = []
            user_pass_h[j] = []
            print "we couldn't find a link because no proxy was able to download; find a good proxy over the internet"
        return links, pr_h[j], user_pass_h[j], cookies
    else:
        # There is no trusted proxy for this site; try a direct download.
        res = self.dowload_basePr_userpass_link(url, "None:None", [], cookies='')
        html, cookies, mech = res['html'], res['cookies'], res['mechanizm']
        proxy0, user_pass = res['proxy'], res['user_pass']
        links = LINK(url).soap_my(html, 'Full Text as PDF', 'a', 'href')
        if links == []:
            res = self.dowload_basePr_userpass_link(links, "None:None", [], cookies=cookies)
            html, cookies, mech = res['html'], res['cookies'], res['mechanizm']
            proxy0, user_pass = res['proxy'], res['user_pass']
            links2 = LINK(links).soap_my(html, '<frame src="http://ieeexplore.ieee.org', 'frame', 'src')
            link = links2
        if links == [] or links is None:
            print 'there is no trusted proxy for downloading it'
        else:
            link_done = 1
        return links, [], [], cookies
def BROWSER(self, cookie3=''):
    """Build a mechanize.Browser configured with cookies, headers and the instance proxy."""
    br = mechanize.Browser()
    # Cookie jar: reuse an existing LWP cookie file, or create a new one
    # under ./cookies/<site>/ with a random 5-character name.
    if self.cookie3 == '':
        fo = os.getcwd().replace('\\', '/')
        site = urlparse2(self.url).hostname
        if not os.path.isdir(fo + "/cookies/" + site):
            os.mkdir(fo + "/cookies/" + site)
        chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
        self.cookie3 = fo + "/cookies/" + site + '/' + ''.join([random.choice(chars) for x in range(5)]) + ".txt"
        self.cj = cookielib.LWPCookieJar()
    else:
        self.cj = cookielib.LWPCookieJar()
        self.cj.revert(self.cookie3)
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(self.cj))
    br.set_cookiejar(self.cj)
    # Browser options
    br.set_handle_equiv(True)
    br.set_handle_gzip(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)   # ignore robots.txt
    br.set_handle_refresh(True)   # can sometimes hang without this
    br.set_handle_redirect(True)
    # Follows refresh 0 but does not hang on refresh > 0
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    # Debugging, if needed:
    # br.set_debug_http(True)
    # br.set_debug_redirects(True)
    # br.set_debug_responses(True)
    # User-Agent and default request headers
    br.addheaders = [
        ('User-Agent', 'Mozilla/5.0 (Linux; U; Android 2.3.4; en-us; T-Mobile myTouch 3G Slide Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'),
        ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
        ('Accept-Language', 'en-gb,en;q=0.5'),
        ('Accept-Encoding', 'gzip,deflate'),
        ('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7'),
        ('Keep-Alive', '115'),
        ('Connection', 'keep-alive'),
        ('Cache-Control', 'max-age=0'),
        ('Referer', 'http://yahoo.com')]
    # Proxy and optional proxy authentication
    if self.proxy != [] and self.proxy != '' and not re.findall('None', self.proxy):
        br.proxies = br.set_proxies({"http": self.proxy})
        if self.User_Pass != [] and self.User_Pass != '' and not re.findall('None:None', self.User_Pass):
            br.add_proxy_password(self.User_Pass.split(":")[0], self.User_Pass.split(":")[1])
    self.cj.save(self.cookie3)
    return br
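# Usage sketch (assumption: `self` is the class instance holding self.url,
# self.proxy, self.User_Pass and self.cookie3 that BROWSER above relies on).
#
#   br = self.BROWSER()
#   response = br.open(self.url)   # mechanize follows redirects/refreshes
#   html = response.read()
#   # the session cookies are persisted to the LWP file at self.cookie3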
def download_link(self, **kwargs):
    if kwargs:
        CurrentDir = os.path.dirname(os.path.realpath(__file__)).replace('\\', '/')
        # Normalise the 'link' argument: it may be a list or a bare www. address.
        if 'link' in kwargs:
            url0 = kwargs['link']
            if type(url0) is list:
                url = url0[0]
            elif url0[:4] == 'www.':
                url = 'http://' + url0[4:]
            else:
                url = url0
        else:
            url = ''
        html = kwargs.get('html', '')
        pr = kwargs.get('proxy', '')
        us_pss = kwargs.get('user_pass', '')
        pdf_download_location = kwargs.get('pdfdir', CurrentDir + '/PDF_Files')
        wat_locatton = kwargs.get('water_pdfdir', CurrentDir + '/Watermarked_PDF_Files')
        root = kwargs.get('root', CurrentDir)
        need_watermarker = kwargs.get('need_watermarker', True)
        server_cdn = kwargs.get('server', '')
        cookies = kwargs.get('cookies', '')
        ftp_upload = kwargs.get('ftp_upload', '')
        log_out = kwargs.get('log_out', '')
        if log_out != '':
            link = {'link': url, 'log_out': log_out}
        else:
            link = url
        url_watermark = kwargs.get('url_watermark', 'www.free-peprs.elasa.ir')
        main_url = kwargs.get('main_url', url)
        done = 0
        if link != [] and link is not None:
            # Build the link used for naming the file on disk.
            if main_url != url:
                data_base_host = str(urlparse2(main_url).hostname)
                try:
                    ez_host = str(urlparse2(url).hostname)
                except:
                    ez_host = str(urlparse2(url[0]).hostname)
                base_url = 'http://' + data_base_host
                try:
                    file_name_link = base_url + url.split(ez_host)[1]
                except:
                    try:
                        file_name_link = base_url + url[0].split(ez_host)[1]
                    except:
                        file_name_link = url
            else:
                file_name_link = url
            os.chdir(CurrentDir)
            # Download the document unless `html` already holds a PDF body or a file path.
            if (not html.endswith('.pdf')) and (html[:4] != '%PDF' or len(re.findall('%%EOF', html)) == 0):
                [html, proxy, user_pass, cookies] = self.Find_Link(
                    main_url, pdfdir=pdf_download_location, water_pdfdir=wat_locatton,
                    cookies=cookies).dowload_basePr_userpass(pr, us_pss, cookies, url=link)
            else:
                proxy = pr
                user_pass = us_pss
            try:
                file_is = 1 if os.path.isfile(html) else 0
            except:
                file_is = 0
            if (html != [] and html[:4] == '%PDF') or file_is == 1:
                PDF_File = import_mod(from_module='save_source', from_module2='PDF_File')
                if not html.endswith('.pdf'):
                    file_name = self.Find_Link(file_name_link).find_name(pdf_download_location, wat_locatton)
                    file_name.url_watermark = url_watermark
                else:
                    os.remove(cookies)
                    file_name = self.Find_Link(file_name_link).find_name(pdf_download_location, wat_locatton)
                    file_name.filename = html.split('/')[-1]
                    file_name.url_watermark = url_watermark
                if not need_watermarker == False:
                    # A watermarked copy was requested.
                    os.chdir(CurrentDir)
                    if not os.path.isdir(pdf_download_location):
                        os.mkdir(pdf_download_location)
                    if not os.path.isdir(wat_locatton):
                        os.mkdir(wat_locatton)
                    pdf_dw_dir, pdf_dw_Wr_dir = PDF_File(url, pdf_download_location, wat_locatton).finall_file_saving(
                        html, file_name, pdf_download_location, no_watermarker=0)
                    pdf_size = size(os.path.getsize(pdf_dw_dir))
                    pdf_dw_li = self.path2url(file_name.pdf_Folder_filename, server_cdn, pdf_download_location, root)
                    if file_is == 1 and html.endswith('.pdf'):
                        wt_pdf_size = size(os.path.getsize(pdf_dw_Wr_dir))
                        pdf_dw_Wr_li = self.path2url(file_name.W_pdf_Folder_filename, server_cdn, wat_locatton, root)
                    elif file_is == 1 and not html.endswith('.pdf'):
                        wt_pdf_size = pdf_size
                        pdf_dw_Wr_li = pdf_dw_li
                    else:
                        wt_pdf_size = size(os.path.getsize(pdf_dw_Wr_dir))
                        pdf_dw_Wr_li = self.path2url(file_name.W_pdf_Folder_filename, server_cdn, wat_locatton, root)
                    try:
                        os.remove(cookies)
                    except:
                        pass
                    print "fetching main paper link url ...\n\t%s" % pdf_dw_li[:]
                    print "fetching watermarked paper link url ...\n\t%s" % pdf_dw_Wr_li
                else:
                    # No watermarked copy requested.
                    if not os.path.isdir(pdf_download_location):
                        os.mkdir(pdf_download_location)
                    pdf_dw_dir, pdf_dw_Wr_dir = PDF_File(url, pdf_download_location, wat_locatton).finall_file_saving(
                        html, file_name, pdf_download_location, no_watermarker=1)
                    pdf_size = size(os.path.getsize(pdf_dw_dir))
                    wt_pdf_size = ''
                    pdf_dw_li = self.path2url(file_name.pdf_Folder_filename, server_cdn, pdf_download_location, root)
                    print "fetching main paper link url ...\n\t%s" % pdf_dw_li[:]
                    pdf_dw_Wr_li = "No watermark requested, maybe because of big size or lack of time"
                    print "fetching watermarked paper link url ...\n\t%s" % pdf_dw_Wr_li
                done = 1
                if ftp_upload == '1':
                    if need_watermarker == True:
                        public_url = self.upload_file(water_pdfdir=pdf_dw_Wr_dir, METODE='FTP')
                    else:
                        public_url = self.upload_file(water_pdfdir=pdf_dw_li, METODE='FTP')
                else:
                    public_url = 'None'
                if public_url != 'None':
                    # The files were uploaded, so the local copies can be removed.
                    os.remove(pdf_dw_dir)
                    os.remove(pdf_dw_Wr_dir)
                    pdf_dw_Wr_li = public_url
                    pdf_dw_li = public_url
                else:
                    public_url = pdf_dw_Wr_li
                address = {'url': str(url),
                           'pdf_name': file_name.filename,
                           'W_pdf_name': file_name.filename,
                           'W_pdf_local': wat_locatton,
                           'pdf_size': pdf_size,
                           'wt_pdf_size': wt_pdf_size,
                           'pdf_dir': pdf_dw_dir,
                           'wt_pdf_dir': pdf_dw_Wr_dir,
                           'pdf_dw_li': pdf_dw_li,
                           'pdf_dw_Wr_li': pdf_dw_Wr_li,
                           'public_url': public_url,
                           'proxy_worked': proxy,
                           'user_pass_worked': user_pass}
                return address
            elif html[:4] != "%PDF" and html != []:
                print 'file is not in PDF format; do you want to save it as an html file?'
                print 'format is ' + html[:4]
                print '*************html is :***********\n\n'
                print html
                print '************* end of html :***********\n\n'
                print '\n file link which was found is :\n' + str(link) + '\nbut the file can not be downloaded '
            else:
                print 'file link which was found is :'
                print 'but the file can not be downloaded '
        if done == 0:
            print 'we are unable to download from this address because we can not find a proper link '
            address = {'url': str(url),
                       'pdf_dir': '',
                       'pdf_size': '',
                       'wt_pdf_size': '',
                       'wt_pdf_dir': '',
                       'pdf_dw_li': '',
                       'pdf_dw_Wr_li': '',
                       'public_url': '',
                       'proxy_worked': '',
                       'user_pass_worked': ''}
            return address
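# Usage sketch for download_link, assuming an instance `dl` of this class and a
# link already resolved by find_link/get_pdf_link. All keyword values below are
# illustrative; the returned dict carries the keys built above
# ('pdf_dir', 'pdf_dw_li', 'pdf_dw_Wr_li', 'pdf_size', 'proxy_worked', ...).
#
#   address = dl.download_link(link='http://example.com/paper.pdf',
#                              pdfdir='PDF_Files',
#                              water_pdfdir='Watermarked_PDF_Files',
#                              need_watermarker=True,
#                              ftp_upload='')
#   print address['pdf_dw_li'], address['pdf_size']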
def get_pdf_link(self, proxy='', user_pass=''):
    url = self.url
    if proxy == '':
        fo = os.getcwd()
        pr_h, proxy_h, user_pass_h = self.proxy_checker3.make_returning_proxy("configs//sites_proxy//", url)
        os.chdir(fo)
    else:
        pr_h = [proxy]
        user_pass_h = [user_pass]
    try:
        i = pr_h.index("")
        del pr_h[i]
    except ValueError:
        pass
    don_flg = -1
    if pr_h != []:
        i = -1
        site = urlparse2(url).hostname
        listhandle = self.file_rd(self.sites_list, 'r')
        file_listhandle = self.file_rd(self.sites_list_files, 'r')
        link_done = 0
        url_pdf = {}
        cookies = ''
        links = []
        for j in range(i + 1, len(pr_h)):
            if don_flg != 1 and not url.endswith('.pdf') and link_done == 0:
                time0 = time.time()
                [html, proxy0, user_pass] = self.dowload_basePr_userpass(url, pr_h[j], user_pass_h[j])
                if link_done == 0 and html != []:
                    # Look for the site's "FullTextPdf" anchor first.
                    links = self.soap_my(data=html, tag='FullTextPdf', attr='a', href='href', url=url)
                    if links == [] or links is None:
                        pass
                    else:
                        link_done = 1
                        print '---------------we found Proper link which is :------------\n' + str(links) + \
                            '\n ----with proxy-------\n' + str(pr_h[j]) + ':' + str(user_pass_h[j])
                        print '----------------- Link Found -------------------------'
                        break
                    # Fall back to the per-site TAG1 rules from the sites list.
                    for line in listhandle:
                        if re.findall(site, line) and link_done == 0 and (not re.findall("#", line.split("TAG:")[0])):
                            if re.findall("TAG1:", line):
                                try:
                                    Tag = line.split("TAG1:")[1].split("---")[0].replace("+++", '')
                                    atrr = line.split("Attr1:")[1].split("---")[0].replace("+++", '')
                                    href = line.split('Href1:')[1].split("---")[0].replace("+++", '')
                                    links = self.soap_my(data=html, tag=Tag, attr=atrr, href=href, url=url)
                                    if links != [] and links is not None:
                                        link_done = 1
                                        print '---------------we found Proper link which is :------------\n' + str(links) + \
                                            '\n ----with proxy-------\n' + str(pr_h[j]) + ':' + str(user_pass_h[j])
                                        print '----------------- Link Found -------------------------'
                                        return links, pr_h[j], user_pass_h[j]
                                except:
                                    pass
                        elif link_done == 1:
                            print "<li><a>tag found</a></li>"
                            print links
                            break
                elif link_done == 0:
                    # Record the failing proxy so it is skipped next time.
                    if not os.path.isdir('configs/sites_proxy/' + site):
                        os.mkdir('configs/sites_proxy/' + site)
                    time_diff = str(round(time.time() - time0, 2))
                    if len(user_pass) != 0:
                        self.proxy_checker3.make_txt_file('configs/sites_proxy/' + site + "/badproxylist.txt",
                                                          str(pr_h[j]) + '@' + str(user_pass_h[j]), site, time_diff)
                    else:
                        self.proxy_checker3.make_txt_file('configs/sites_proxy/' + site + "/badproxylist.txt",
                                                          str(pr_h[j]), site, time_diff)
            elif url != [] and url.endswith('.pdf'):
                return url, '', ''
        if link_done == 0:
            links = []
            pr_h[j] = []
            user_pass_h[j] = []
            cookies = ''
            print "we couldn't find a link because no proxy was able to download; find a good proxy over the internet"
        return links, pr_h[j], user_pass_h[j], cookies
    else:
        # There is no trusted proxy for this site; try a direct download.
        html = self.dowload_basePr_userpass(url, 'None')
        links = LINK(url).soap_my(html, 'Full Text as PDF', 'a', 'href')
        if links == []:
            html = self.dowload_basePr_userpass(links, 'None')
            links2 = LINK(links).soap_my(html, '<frame src="http://ieeexplore.ieee.org', 'frame', 'src')
            link = links2
        if links == [] or links is None:
            print 'there is no trusted proxy for downloading it'
        else:
            link_done = 1
        return links, [], []
def __init__(self, url='', sites_list='configs/sites_list_pdf_tags.txt',
             sites_list_files="configs/sites_list_files.txt",
             site_proxy="configs//sites_proxy//", **kwargs):
    self.sites_list = sites_list
    self.sites_list_files = sites_list_files
    self.site_proxy = site_proxy
    self.url = url
    CurrentDir = os.path.dirname(os.path.realpath(__file__)).replace('\\', '/')
    self.cookies = kwargs.get('cookies') or ''
    if 'pdfdir' in kwargs:
        self.pdf_download_location = kwargs['pdfdir']
    else:
        self.pdf_download_location = CurrentDir + '/PDF_Files'
    if 'water_pdfdir' in kwargs:
        self.wat_locatton = kwargs['water_pdfdir']
    else:
        self.wat_locatton = CurrentDir + '/Watermarked_PDF_Files'
    if 'root' in kwargs:
        self.root = kwargs['root']
    else:
        self.root = CurrentDir
    proxy_checker3_all_function = import_mod(from_module='proxy_checker3_all_function')
    self.proxy_checker3 = proxy_checker3_all_function
    print 'url is ' + url
    site = urlparse2(url).hostname
    fo = os.getcwd()
    CurrentDir = os.path.dirname(os.path.realpath(__file__))
    s = CurrentDir.replace('\\', '/') + '/configs/Links_site/'
    print site
    self.file_exist = 0
    # Load the per-site module configs/Links_site/<host with dots replaced by underscores>.py, if present.
    if os.path.isfile(s + site.replace('.', '_') + '.py'):
        sys.path.insert(0, s)
        si = sys.modules
        if site.replace('.', '_') in si:
            print "@@@@@@@@@@@@@@ module already exists for " + site + ' is \n: @@@@@@@@@@@@@@\n\n'
            self.new_module = si[site.replace('.', '_')]
        else:
            print "@@@@@@@@@@@@@@ module inserted for " + site + ' is \n: @@@@@@@@@@@@@@\n\n'
            self.new_module = __import__(site.replace('.', '_'), {}, {}, [], 2)
        print self.new_module
        self.file_exist = 1
    else:
        print "@@@@@@@@@@@@@@ module " + CurrentDir.replace('\\', '/') + '/configs/Links_site/' + \
            site.replace('.', '_') + '.py' + '\n Not found: @@@@@@@@@@@@@@\n\n'
    os.chdir(fo)
def find_link(self, proxy='', user_pass=''):
    url = self.url
    site = urlparse2(url).hostname
    link_done = 0
    url_pdf = {}
    if not url.endswith('.pdf'):
        if self.file_exist == 1:
            # Delegate to the per-site module loaded in __init__.
            res = self.new_module.LINK(url).get_pdf_link(proxy, user_pass)
            link = res['links']
            proxy = res['proxy']
            user_pass = res['user_pass']
            cookies = res['cookies']
            title = res['title']
            html = res['html']
            try:
                log_out = res['log_out']
            except:
                log_out = ''
            try:
                form = res['form']
            except:
                form = ''
            responce = {
                'html': html,
                'url': url,
                'link': link,
                'title': title,
                'proxy': proxy,
                'user_pass': user_pass,
                'cookies': cookies,
                'form': form,
                'log_out': log_out
            }
        else:
            print "No " + site + '.py in config in ' + os.getcwd() + "\\configs\\Links_site"
            link = []
            responce = {
                'html': '',
                'url': url,
                'link': link,
                'title': '',
                'proxy': proxy,
                'user_pass': user_pass,
                'cookies': '',
            }
    else:
        # The address already ends with .pdf, so the link is the URL itself.
        print 'the address you have entered ends with .pdf and the link is the same'
        link = url
        responce = {
            'html': [],
            'url': url,
            'link': link,
            'title': '',
            'proxy': [],
            'user_pass': [],
            'cookies': '',
        }
    return responce
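# Usage sketch for find_link, following the commented call in download_link
# ("Find_Link(url).find_link(pr, us_pss)"); `Find_Link` is assumed here to be
# the class holding these methods, and the URL is illustrative. The keys follow
# the `responce` dict built above.
#
#   res = Find_Link('http://ieeexplore.ieee.org/xpl/login.jsp?arnumber=1274437').find_link(proxy='', user_pass='')
#   print res['link'], res['proxy'], res['cookies']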
def make_returning_proxy(input_file, test_url, proxy_alive='configs//proxy_alive.txt'):
    # NOTE: the enclosing signature is an assumption, reconstructed from the call
    # sites elsewhere in this code (make_returning_proxy("configs//sites_proxy//", url)
    # and make_returning_proxy('configs//sites_proxy//', url, 'configs//proxy_alive.txt')).
    def writer(f, rq):
        # Drain a queue of lines into an open file object.
        while True:
            line = rq.get()
            f.write(line + '\n')

    site = urlparse2(test_url).hostname
    l = 0
    proxy_handler = []
    pr_h = []
    proxy_h = []
    user_pass_h = []
    try:
        # Prefer the per-site proxy list.
        listhandle = file_rd(input_file + site + ".txt", 'r')
        for line in listhandle:
            if re.findall("For Site:", line):
                proxy1 = line.split("For Site:")[0]
                proxy2 = line.split("For Site:")[1]
                pr, proxy_han, user_pass = make_proxy_handler(proxy1)
                pr_h.append(pr)
                proxy_h.append(proxy_han)
                user_pass_h.append(user_pass)
    except:
        # Fall back to the global list of alive proxies, filtered by site.
        listhandle = open(proxy_alive).readlines()
        for line in listhandle:
            if re.findall(site, line) and re.findall("For Site:", line):
                proxy1 = line.split("For Site:")[0]
                proxy2 = line.split("For Site:")[1]
                if re.findall(site, proxy2):
                    pr, proxy_han, user_pass = make_proxy_handler(proxy1)
                    pr_h.append(pr)
                    proxy_h.append(proxy_han)
                    user_pass_h.append(user_pass)
    if pr_h == []:
        # Nothing usable yet: re-check the raw proxy list and read the result once more.
        proxy_checker(test_url, input_file + "proxylist.txt", proxy_alive, input_file, 30)
        try:
            listhandle = open(input_file + site + ".txt").readlines()
            for line in listhandle:
                if re.findall("For Site:", line):
                    proxy1 = line.split("For Site:")[0]
                    proxy2 = line.split("For Site:")[1]
                    pr, proxy_han, user_pass = make_proxy_handler(proxy1)
                    pr_h.append(pr)
                    proxy_h.append(proxy_han)
                    user_pass_h.append(user_pass)
        except:
            pass
    try:
        listhandle = open(input_file + site + ".txt").readlines()
    except:
        proxy_checker(test_url, input_file + "proxylist.txt", proxy_alive, input_file, 30)
    return pr_h, proxy_h, user_pass_h
def download(self, url='', proxy='', user_pass='', location='PDF_Files/', **kwargs):
    """Fetch `url` through the first working proxy; return (page, proxy, user_pass, cookies)."""
    cookies = kwargs.get('cookies', '')
    site = urlparse2(url).hostname
    if proxy == '' or proxy == []:
        proxy_checker3_all_function = import_mod(from_module='proxy_checker3_all_function')
        fo = os.getcwd().replace('\\', '/')
        pr_h, proxy_h, user_pass_h = proxy_checker3_all_function.make_returning_proxy(
            "configs//sites_proxy//" + site + '//', url)
        os.chdir(fo)
    else:
        pr_h = [proxy]
        user_pass_h = [user_pass]
    try:
        i = pr_h.index("")
        del pr_h[i]
    except ValueError:
        pass
    pdf_dw_li = pdf_dw_Wr_li = []
    frontpage = []
    don_flg = -1
    if pr_h != []:
        i = -1
        for j in range(i + 1, len(pr_h)):
            if don_flg != 1:
                debug = True
                cash = None
                dl = MozillaEmulator(cash, 0, cookies=cookies)
                try:
                    if user_pass_h[j] != '':
                        frontpage, cookies = dl.download(url, pr_h[j], user_pass_h[j])
                        pr = pr_h[j]
                        upss = user_pass_h[j]
                    else:
                        frontpage, cookies = dl.download(url, pr_h[j])
                        pr = pr_h[j]
                        upss = ''
                except:
                    print "we can't download because of an invalid tag or an invalid proxy" + "\n"
                if frontpage != []:
                    if len(user_pass_h[j]) != 0:
                        print "file downloaded with " + str(pr_h[j]) + '@' + str(user_pass_h[j])
                    else:
                        print "file downloaded with " + str(pr_h[j])
                    don_flg = 1
                    break
                else:
                    print "we could not download the file with proxy: " + pr_h[j]
        if don_flg != 1:
            print "we are unable to download your file now!!" + '\n'
            frontpage = []
            pr = ''
            upss = ''
    else:
        print "we are unable to download your file now!! because the proxy list is empty" + '\n'
        pr = ''
        upss = ''
    return frontpage, pr, upss, cookies
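# Usage sketch for download, assuming a proxy list is available under
# configs/sites_proxy/<site>/ (or an explicit proxy is passed). The 4-tuple
# mirrors the `return frontpage, pr, upss, cookies` line above; the URL is
# illustrative only.
#
#   frontpage, proxy_used, user_pass_used, cookie_file = self.download(
#       url='http://example.com/abs.jsp', proxy='', user_pass='', cookies='')
#   if frontpage != []:
#       print 'got %d bytes via %s' % (len(frontpage), proxy_used)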
def get_pdf_link(self, proxy='', user_pass=''):
    url = self.url
    if proxy == '':
        fo = os.getcwd()
        pr_h, proxy_h, user_pass_h = self.proxy_checker3.make_returning_proxy("configs//sites_proxy//", url)
        os.chdir(fo)
    else:
        pr_h = [proxy]
        user_pass_h = [user_pass]
        try:
            i = user_pass_h.index("")
            del user_pass_h[i]
        except ValueError:
            pass
    try:
        i = pr_h.index("")
        del pr_h[i]
    except ValueError:
        pass
    don_flg = -1
    if pr_h != []:
        i = -1
        site = urlparse2(url).hostname
        listhandle = self.file_rd(self.sites_list, 'r')
        file_listhandle = self.file_rd(self.sites_list_files, 'r')
        link_done = 0
        url_pdf = {}
        links = []
        for j in range(i + 1, len(pr_h)):
            if don_flg != 1 and not url.endswith('.pdf') and link_done == 0:
                [html, proxy0, user_pass] = self.dowload_basePr_userpass(url, pr_h[j], user_pass_h[j])
                if link_done == 0:
                    # Direct IEEE attempt: "Full Text as PDF" anchor, then the PDF frame.
                    links = self.soap_my(html, 'Full Text as PDF', 'a', 'href')
                    if links != []:
                        [html, proxy0, user_pass] = self.dowload_basePr_userpass(links, pr_h[j], user_pass_h[j])
                        links3 = self.soap_my(html, '<frame src="http://ieeexplore.ieee.org', 'frame', 'src')
                    if links == [] or links is None:
                        pass
                    else:
                        link_done = 1
                        break
                    # Fall back to the per-site TAG1/TAG2 rules from the sites list.
                    for line in listhandle:
                        if re.findall(site, line) and link_done == 0 and (not re.findall("#", line.split("TAG:")[0])):
                            if re.findall("TAG1:", line):
                                try:
                                    Tag = line.split("TAG1:")[1].split("---")[0].replace("+++", '')
                                    atrr = line.split("Attr1:")[1].split("---")[0].replace("+++", '')
                                    href = line.split('Href1:')[1].split("---")[0].replace("+++", '')
                                    links = self.soap_my(html, Tag, atrr, href)
                                    if links != [] and link_done != None:
                                        try:
                                            Tag = line.split("TAG2:")[1].split("---")[0].replace("---", '').replace("+++", '')
                                            atrr = line.split("Attr2:")[1].split("---")[0].replace('---', '').replace("+++", '')
                                            href = line.split('Href2:')[1].split("---")[0].replace("+++", '')
                                        except:
                                            return links, pr_h[j], user_pass_h[j]
                                        [html, proxy0] = self.dowload_basePr_userpass(links, pr_h[j], user_pass_h[j])
                                        links = self.soap_my(html, Tag, atrr, href)
                                        link_done = 1
                                except:
                                    # No TAG1 rule matched cleanly: try a regex/lxml fallback.
                                    Tag = line.split("TAG1:")[1].replace("---", '')
                                    try:
                                        abstract_match = re.search("Full Text as PDF([^\']+)", html, re.IGNORECASE)
                                        abstract_url = "http://ieeexplore.ieee.org%s" % abstract_match.group(0)
                                        import lxml.html, codecs
                                        abs_text = []
                                        root = lxml.html.fromstring(html)
                                        for div in root:
                                            t = div.text_content()
                                            if t:
                                                abs_text.append(t)
                                        links = LINK(url).soap_my(html, Tag)
                                        if links != [] and link_done != None:
                                            link_done = 1
                                    except:
                                        pass
                                    break
                        elif link_done == 1:
                            print "<li><a>tag found</a></li>"
                            print links
                            break
            elif url != [] and url.endswith('.pdf'):
                return url, '', ''
        if link_done == 0:
            links = []
            pr_h[j] = []
            user_pass_h[j] = []
            print "we couldn't find a link because no proxy was able to download; find a good proxy over the internet"
        return links, pr_h[j], user_pass_h[j]
    else:
        # There is no trusted proxy for this site; try a direct download.
        html = self.dowload_basePr_userpass(url, 'None')
        links = LINK(url).soap_my(html, 'Full Text as PDF', 'a', 'href')
        if links == []:
            html = self.dowload_basePr_userpass(links, 'None')
            links3 = LINK(links).soap_my(html, '<frame src="http://ieeexplore.ieee.org', 'frame', 'src')
        if links == [] or links is None:
            print 'there is no trusted proxy for downloading it'
        else:
            link_done = 1
        return links, [], []
# url = 'http://johnny.heliohost.org:2082/login/'
# url = 'http://127.0.0.1/trash/test/elec-lab.tk%20Mover-201312290438/'
url = 'http://ieeexplore.ieee.org/ielx5/8981/28518/01274437.pdf?tp=&arnumber=1274437&isnumber=28518'
url = 'http://ieeexplore.ieee.org/xpl/login.jsp?tp=&arnumber=1274437&url=http%3A%2F%2Fieeexplore.ieee.org%2Fxpls%2Fabs_all.jsp%3Farnumber%3D1274437'
url = 'http://cpanel.1freehosting.com/'
[proxy, proxy_h, prx_pass] = make_returning_proxy('configs//sites_proxy//', url, 'configs//proxy_alive.txt')
proxy = 'http://' + ''.join(proxy)
if proxy != 'http://':
    os.environ['http_proxy'] = proxy
    # os.environ['http_proxy'] = 'http://222.66.115.233:80'
listform = file_rd(site_list_form, 'r')
host = urlparse2(url).hostname
# timeout in seconds, see http://docs.python.org/library/socket.html#socket.setdefaulttimeout
socket.setdefaulttimeout(100)
for line in listform:
    if line.find(host) != -1:
        form_data = usr_tag(line)
        # html = login_to_site(url, form_data)
        import twill.commands
        # t1 = TWILL_Browser(url, form_data).splinter()
proxy_checker3_all_function = import_mod(from_module='proxy_checker3_all_function')
try:
    [proxy, proxy_h, prx_pass] = proxy_checker3_all_function.make_returning_proxy(
        "configs//sites_proxy//", url)
    proxy = 'http://' + ''.join(proxy)
    os.environ['http_proxy'] = proxy
    # os.environ['http_proxy'] = 'http://222.66.115.233:80'
except:
    pass
listform = file_rd(site_list_form, 'r')
host = urlparse2(url).hostname
# timeout in seconds, see http://docs.python.org/library/socket.html#socket.setdefaulttimeout
socket.setdefaulttimeout(100)
for line in listform:
    if line.find(host) != -1:
        form_data = usr_tag(line)
        # html = login_to_site(url, form_data)
        import twill.commands
        # t1 = TWILL_Browser(url, form_data).splinter()