Python urlparse2示例，urlparse.urlparse2 Python示例

示例#1

0

显示文件

    def soap_my(self, data, tag, attr='a', href='href'):
        # from BeautifulSoup import BeautifulSoup
        # import re

        site = urlparse2(self.url).hostname
        soup = BeautifulSoup(data)
        ###################
        links = soup.findAll(attr, href == True)
        print links
        try:
            if links == []:
                links = soup.findAll(attr, href == True)
        except:
            pass
        done = 0
        for everytext in links:

            if re.findall(tag, str(everytext)):
                print everytext
                print everytext[href]
                if not (re.findall('www', everytext[href]) or re.findall('http://', everytext[href])):
                    f_nmae = urlparse.urljoin("http://" + site, everytext[href])
                    print f_nmae
                else:
                    f_nmae = everytext[href]
                    print f_nmae
                text = ''.join(everytext.findAll(text=True))
                data = text.strip()
                done = 1
                return f_nmae

                ###############
        if done == 0:
            link = []
            return link

示例#2

0

显示文件

文件： pr4ss_tk - befor title.py 项目： 001101/heroku-buildpack-python-ieee

    def build_opener(self, url, proxy=[], User_Pass=[], postdata=None, extraheaders={}, forbid_redirect=False):

        txheaders = {
            'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
            'Accept-Language': 'en,hu;q=0.8,en-us;q=0.5,hu-hu;q=0.3',
            #            'Accept-Encoding': 'gzip, deflate',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
            #            'Keep-Alive': '300',
            #            'Connection': 'keep-alive',
            #            'Cache-Control': 'max-age=0',
        }
        for key, value in extraheaders.iteritems():
            txheaders[key] = value
        req = urllib2.Request(url, postdata, txheaders)
        self.cookies.add_cookie_header(req)
        if forbid_redirect:
            redirector = HTTPNoRedirector()
        else:
            redirector = urllib2.HTTPRedirectHandler()

        if proxy != [] and (not re.findall("None", proxy)) and proxy!='':
            if User_Pass != [] and User_Pass!='':
                proxies = {"http": "http://" + User_Pass + "@" + proxy}
            else:
                proxies = {"http": "http://%s" % proxy}
            proxy_support = urllib2.ProxyHandler(proxies)
            # opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler(debuglevel=1))
        else:
            proxy_support = urllib2.ProxyHandler()
            # opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler(debuglevel=1))
            # url=link.absolute_url
            # headers={'User-agent' : 'Mozilla/5.0'}

        http_handler = urllib2.HTTPHandler(debuglevel=self.debug)
        https_handler = urllib2.HTTPSHandler(debuglevel=self.debug)

        # default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
        #                    HTTPDefaultErrorHandler, HTTPRedirectHandler,
        #                    FTPHandler, FileHandler, HTTPErrorProcessor]


        u = urllib2.build_opener(proxy_support, http_handler, https_handler, urllib2.HTTPCookieProcessor(self.cookies),
                                 redirector)
        urllib2.install_opener(u)

        u.addheaders = [
            ('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; hu-HU; rv:1.7.8) Gecko/20050511 Firefox/1.0.4')]
        if not postdata is None:
            req.add_data(postdata)

        if self.cookie3=='':
            fo = os.getcwd().replace('\\','/')
            # pathname = os.path.join("cookies", cookie3)
            site = urlparse2(url).hostname
            if not os.path.isdir(fo + "/cookies/"+site ):os.mkdir(fo + "/cookies/"+site )
            chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
            self.cookie3 = fo + "/cookies/"+site +'/'+''.join([random.choice(chars) for x in range(5)]) + ".txt"
        self.cookies.save(self.cookie3)
        return (req, u,self.cookie3)

示例#3

0

显示文件

    def soap_my(self, **kwargs):
        data = kwargs['data']
        tag = kwargs['tag']
        try:
            attr = kwargs['attr']
        except:
            attr = 'a'
        try:
            href = kwargs['href']
        except:
            href = 'href'
        try:
            url = kwargs['url']
        except:
            url = "http://" + urlparse2(self.url).hostname

        # from BeautifulSoup import BeautifulSoup
        # import re

        # site = urlparse2(self.url).hostname
        soup = BeautifulSoup(data)
        ###################
        links = soup.findAll(attr, href == True)
        # print links
        try:
            if links == []:
                links = soup.findAll(attr, href == True)
        except:
            pass
        done = 0
        for everytext in links:

            if re.findall(tag, str(everytext)):
                print " link url finded for downloading...\n\t%s" % everytext
                # print everytext
                if not (re.findall('www', everytext[href])
                        or re.findall('http://', everytext[href])):
                    # f_nmae = urlparse.urljoin( url, everytext[href])
                    f_nmae = url + '/' + everytext[href]

                else:
                    f_nmae = everytext[href]

                print unicode(f_nmae)
                text = ''.join(everytext.findAll(text=True))
                data = text.strip()
                done = 1
                return f_nmae

                ###############
        if done == 0:
            link = []
            return link

示例#4

0

显示文件

文件： async_tornado_test.py 项目： Heroku-elasa/heroku-buildpack-python-ieee-new

def url2Path(**kwargs):
    myhost="http://127.0.0.1/"
    url=kwargs['url']
    site = urlparse2(url).hostname
    myhost="http://"+site+"/"
    try:
        kwargs['pdf_dir']
        pdf_dir=kwargs['pdf_dir']
    except:pdf_dir=url.split(myhost)[1]
    ph=pdf_dir.split('/')[0]
    f_ph=pdf_dir.split('/')[-1]
    rp=os.getcwd().replace('\\','/').replace('%20',' ').split(ph)[0]
    path=rp+pdf_dir.split(f_ph)[0].replace('%20',' ')+f_ph
    return path

示例#5

0

显示文件

def url2Path(**kwargs):
    myhost = "http://127.0.0.1/"
    url = kwargs['url']
    site = urlparse2(url).hostname
    myhost = "http://" + site + "/"
    try:
        kwargs['pdf_dir']
        pdf_dir = kwargs['pdf_dir']
    except:
        pdf_dir = url.split(myhost)[1]
    ph = pdf_dir.split('/')[0]
    f_ph = pdf_dir.split('/')[-1]
    rp = os.getcwd().replace('\\', '/').replace('%20', ' ').split(ph)[0]
    path = rp + pdf_dir.split(f_ph)[0].replace('%20', ' ') + f_ph
    return path

示例#6

0

显示文件

文件： diy4ng4django4php-freepaper_rhcloud_com.py 项目： Heroku-elasa/heroku-buildpack-python-ieee-new

    def soap_my(self, **kwargs):
        data=kwargs['data']
        tag=kwargs['tag']
        try:attr=kwargs['attr']
        except:attr='a'
        try:href=kwargs['href']
        except:href='href'
        try:url=kwargs['url']
        except:
            url = "http://"+urlparse2(self.url).hostname


        # from BeautifulSoup import BeautifulSoup
        # import re

        # site = urlparse2(self.url).hostname
        soup = BeautifulSoup(data)
        ###################
        links = soup.findAll(attr, href == True)
        print links
        try:
            if links == []:
                links = soup.findAll(attr, href == True)
        except:
            pass
        done = 0
        for everytext in links:

            if re.findall(tag, str(everytext)):
                print " link url finded for downloading...\n\t%s" % everytext
                # print everytext
                if not (re.findall('www', everytext[href]) or re.findall('http://', everytext[href])):
                    f_nmae = urlparse.urljoin( url, everytext[href])

                else:
                    f_nmae = everytext[href]

                print unicode(f_nmae)
                text = ''.join(everytext.findAll(text=True))
                data = text.strip()
                done = 1
                return f_nmae

                ###############
        if done == 0:
            link = []
            return link

示例#7

0

显示文件

文件： pr4ss_tk - befor title.py 项目： 001101/heroku-buildpack-python-ieee

    def get_pdf_link(self,proxy='', user_pass=''):

        url=self.url


        if proxy == '':

            fo = os.getcwd()
            pr_h, proxy_h, user_pass_h = self.proxy_checker3.make_returning_proxy("configs//sites_proxy//", url)
            os.chdir(fo)
        else:
            pr_h = []
            user_pass_h = []
            pr_h.append(proxy)
            user_pass_h.append(user_pass)
            # i = user_pass_h.index("")
            # del user_pass_h[i]
            try:
                i = pr_h.index("")
                del pr_h[i]
            except:
                pass

        don_flg = -1
        if pr_h != []:
            i = -1
            site = urlparse2(url).hostname
            listhandle = self.file_rd(self.sites_list, 'r')
            file_listhandle = self.file_rd(self.sites_list_files, 'r')
            link_done = 0
            url_pdf = {}
            for j in range(i + 1, len(pr_h)):
                if don_flg != 1 and not url.endswith('.pdf') \
                    and not url.endswith('.zip') and link_done == 0:
                    time0=time.time()

                    res=self.dowload_basePr_userpass_link(url,pr_h[j],user_pass_h[j],cookies='')
                    html=res['html'];proxy0=res['proxy'];user_pass=res['user_pass'];cookies=res['cookies'];mech=res['mechanizm']


                    if link_done == 0 and html!=[] :
                        # try:
                        #     if os.path.isfile(html):
                        #         h=open(html)
                        #         ht=h.read()
                        #         h.close()
                        #         os.remove(html)
                        #         html=ht
                        # except:
                        #     pass

                        links =self.soap_my(data=html, tag='Full Text as PDF', attr='a', href='href',url=url)
                        # links = self.soap_my(html, 'Full Text as PDF', 'a', 'href')
                        # if links == [] or links==None:
                        #     links =self.soap_my(data=html, tag='Full Text', attr='a', href='href',url=url)

                        if links!=[] and mech!=1 :
                            res=self.dowload_basePr_userpass_link(url,pr_h[j],user_pass_h[j],cookies='')
                            html=res['html'];proxy0=res['proxy'];user_pass=res['user_pass'];cookies=res['cookies'];mech=res['mechanizm']
                            # try:
                            #     if os.path.isfile(html):
                            #         h=open(html)
                            #         ht=h.read()
                            #         h.close()
                            #         os.remove(html)
                            #         html=ht
                            # except:
                            #     pass

                            links =self.soap_my(data=html, tag='<frame src="http://ieeexplore.ieee.org', attr='frame', href='src',url=str(links))
                            # links2=self.soap_my(html,'<frame src="http://ieeexplore.ieee.org','frame','src')
                            # links=links2
                        if links == [] or links==None:
                            pass
                        else:
                            link_done = 1
                            print '---------------we found Proper link which is :------------\n'+str(links)+ \
                                  '\n ----with proxy-------\n'+str(pr_h[j])+':'+str(user_pass_h[j])
                            print '----------------- Link Found -------------------------'
                            break

                        for line in listhandle:
                            if re.findall(site, line) and link_done == 0 and (not re.findall("#", line.split("TAG:")[0])) :
                                if re.findall("TAG1:", line):
                                    try:
                                        Tag = line.split("TAG1:")[1].split("---")[0]
                                        Tag=Tag.replace("+++",'')
                                        atrr = line.split("Attr1:")[1].split("---")[0]
                                        atrr=atrr.replace("+++",'')
                                        href=line.split('Href1:')[1].split("---")[0]
                                        href=href.replace("+++",'')
                                        links =self.soap_my(data=html, tag=Tag, attr=atrr, href=href,url=url)
                                        # links = self.soap_my(html, Tag, atrr,href)
                                        if links != [] and link_done!=None and mech!=1:
                                            try:
                                                Tag = line.split("TAG2:")[1].split("---")[0]
                                                Tag=Tag.replace("---",'').replace("+++",'')

                                                atrr = line.split("Attr2:")[1].split("---")[0]
                                                atrr=atrr.replace('---','').replace("+++",'')
                                                href=line.split('Href2:')[1].split("---")[0]
                                                href=href.replace("+++",'')
                                                res=self.dowload_basePr_userpass_link(url,pr_h[j],user_pass_h[j],cookies='')
                                                html=res['html'];proxy0=res['proxy'];user_pass=res['user_pass'];cookies=res['cookies'];mech=res['mechanizm']
                                                # links = self.soap_my(html, Tag, atrr,href)
                                            except:pass
                                            # [html,proxy0,user_pass]=self.dowload_basePr_userpass(links,pr_h[j],user_pass_h[j])
                                            # links =self.soap_my(data=html, tag=Tag, attr=atrr, href=href,url=url)
                                            # links = self.soap_my(html, Tag, atrr,href)
                                            if links != [] or links!=None:
                                                link_done = 1
                                                print '---------------we found Proper link which is :------------\n'+str(links)+ \
                                                      '\n ----with proxy-------\n'+str(pr_h[j])+':'+str(user_pass_h[j])
                                                print '----------------- Link Found -------------------------'
                                                return links,pr_h[j],user_pass_h[j]

                                    except:
                                        pass
                                        #     Tag = line.split("TAG1:")[1]
                                        #     Tag=Tag.replace("---",'')
                                        #     try:
                                        #         abstract_match = re.search("Full Text as PDF([^\']+)", html, re.IGNORECASE)
                                        #         abstract_url = "http://ieeexplore.ieee.org%s" % abstract_match.group(0)
                                        #         import lxml.html, codecs
                                        #         abs = []
                                        #         root = lxml.html.fromstring(html)
                                        #         for div in root:
                                        #             t = div.text_content()
                                        #             if t:
                                        #                 abs.append(t)
                                        #
                                        #         links = LINK(url).soap_my(html, Tag)
                                        #         if links != []  and link_done!=None:
                                        #             link_done = 1
                                        #     except:
                                        #         pass
                                        # break


                    elif link_done == 1:
                        print "<li><a>tag found</a></li>"
                        print links
                        break
                    elif link_done==0:
                        if not os.path.isdir('configs/sites_proxy/'+site):
                            os.mkdir('configs/sites_proxy/'+site)
                        time_diff = str(round(time.time() - time0, 2))
                        if len(user_pass)!=0:
                            self.proxy_checker3.make_txt_file('configs/sites_proxy/'+site+"/badproxylist.txt", str(pr_h[j])+'@'+str(user_pass_h[j]), site, time_diff)
                        else:
                            self.proxy_checker3.make_txt_file('configs/sites_proxy/'+site+"/badproxylist.txt", str(pr_h[j]), site, time_diff)


                elif url!=[] or (url.endswith('.pdf') or  url.endswith('.zip')):
                    cookies=''
                    return url,'','',cookies


            if link_done==0:
                links=[]
                pr_h[j]=[]
                user_pass_h[j]=[]
                print "we couldnt find link beacuase of no proxy is able to download .find good proxy over internet"


            return links,pr_h[j],user_pass_h[j],cookies

        else: # pr_h[j]=[] there is no trusted proxy for it
            res=self.dowload_basePr_userpass_link(url,"None:None",[],cookies='')
            html=res['html'];proxy0=res['proxy'];user_pass=res['user_pass'];cookies=res['cookies'];mech=res['mechanizm']
            # [html,proxy0,user_pass,cookies]=self.dowload_basePr_userpass_link(url,"None:None",[],cookies='')
            links = LINK(url).soap_my(html, 'Full Text as PDF', 'a', 'href')
            if links==[]:
                res=self.dowload_basePr_userpass_link(links,"None:None",[],cookies=cookies)
                html=res['html'];proxy0=res['proxy'];user_pass=res['user_pass'];cookies=res['cookies'];mech=res['mechanizm']
                # [html,proxy0,user_pass,cookies]=self.dowload_basePr_userpass_link(links,"None:None",[],cookies=cookies)
                links2=LINK(links).soap_my(html,'<frame src="http://ieeexplore.ieee.org','frame','src')
                link=links2
                if links == [] or links==None:
                    print'there is no trusted proxy for downloading it'
                else:
                    link_done = 1
            return links,[],[],cookies

示例#8

0

显示文件

文件： pr4ss_tk - befor title.py 项目： 001101/heroku-buildpack-python-ieee

    def BROWSER(self,cookie3=''):
        """
        :param url:
        """
        # global br, cj, r, proxy, User_Pass


        br = mechanize.Browser()
        # print br

        # Cookie Jar
        # fo=os.getcwd()+"\\cookies\\"
        # try :
        #     os.mkdir(fo)
        # except:
        #     pass
        # os.chdir(fo)
        # folder=sys.path.insert(0,'/cookies')
        if self.cookie3=='':
            fo = os.getcwd().replace('\\','/')
            # pathname = os.path.join("cookies", cookie3)
            site = urlparse2(self.url).hostname
            if not os.path.isdir(fo + "/cookies/"+site ):os.mkdir(fo + "/cookies/"+site )
            chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
            self.cookie3 = fo + "/cookies/"+site +'/'+''.join([random.choice(chars) for x in range(5)]) + ".txt"
            self.cj = cookielib.LWPCookieJar()
        else:
            self.cj = cookielib.LWPCookieJar()
            self.cj.revert(self.cookie3)
        opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(self.cj))

        br.set_cookiejar(self.cj)
        # os.chdir(..)


        # Browser options
        br.set_handle_equiv(True)
        br.set_handle_gzip(True)
        br.set_handle_referer(True)    # no allow everything to be written to
        br.set_handle_robots(False)   # no robots
        br.set_handle_refresh(True)  # can sometimes hang without this
        br.set_handle_redirect(True)

        # Follows refresh 0 but not hangs on refresh > 0
        br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

        # Want debugging messages?
        #br.set_debug_http(True)
        #br.set_debug_redirects(True)
        #br.set_debug_responses(True)

        # User-Agent (this is cheating, ok?)
        br.addheaders = [('User-Agent', 'Mozilla/5.0 (Linux; U; Android 2.3.4; en-us; T-Mobile myTouch 3G Slide Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'),
                         ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
                         ('Accept-Language', 'en-gb,en;q=0.5'),
                         ('Accept-Encoding', 'gzip,deflate'),
                         ('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7'),
                         ('Keep-Alive', '115'),
                         ('Connection', 'keep-alive'),
                         ('Cache-Control', 'max-age=0'),
                         ('Referer', 'http://yahoo.com')]
        # # If the protected site didn't receive the authentication data you would
        # # end up with a 410 error in your face
        # br.add_password('http://safe-site.domain', 'username', 'password')
        # br.open('http://safe-site.domain')

        # Open some site, let's pick a random one, the first that pops in mind:
        # Proxy and user/password
        #proxy = "61.233.25.166:80"

        # proxy = "202.202.0.163:3128"
        # proxy=self.proxy
        # Proxy
        # dd=re.findall('None:None', proxy)
        if self.proxy != [] and self.proxy != '' and not (re.findall('None', self.proxy)):
            br.proxies = br.set_proxies({"http": self.proxy})
            # br.proxies=br.set_proxies( proxy)

        if self.User_Pass != [] and self.User_Pass != '' and not (re.findall('None:None', self.User_Pass)):
            br.add_proxy_password(self.User_Pass.split(":")[0], self.User_Pass.split(":")[1])

        # if  r!={}:
        # rr = br.open(url)

        # c= cookielib.Cookie(version=0, name='PON', value="xxx.xxx.xxx.111", expires=365, port=None, port_specified=False, domain='xxxx', domain_specified=True, domain_initial_dot=False, path='/', path_specified=True, secure=True, discard=False, comment=None, comment_url=None, rest={'HttpOnly': False}, rfc2109=False)
        # cj.set_cookie(c0)

        self.cj.save( self.cookie3)

        return br

示例#9

0

显示文件

文件： main_core.py 项目： 001101/heroku-buildpack-python-ieee

    def download_link(self, **kwargs):
        # global form,main_url
        if kwargs:
            CurrentDir = os.path.dirname(os.path.realpath(__file__)).replace(
                '\\', '/')
            if 'link' in kwargs:
                url0 = kwargs['link']
                if type(url0) is list:
                    url = url0[0]
                else:
                    if url0[:4] == 'www.':
                        url = 'http://' + url0[4:]
                    else:
                        url = url0
            else:
                url = ''
            if 'html' in kwargs: html = kwargs['html']
            else: html = ''
            if 'proxy' in kwargs: pr = kwargs['proxy']
            else: pr = ''
            if 'user_pass' in kwargs: us_pss = kwargs['user_pass']
            else: us_pss = ''
            if 'pdfdir' in kwargs: pdf_download_location = kwargs['pdfdir']
            else: pdf_download_location = CurrentDir + '/PDF_Files'
            if 'water_pdfdir' in kwargs: wat_locatton = kwargs['water_pdfdir']
            else: wat_locatton = CurrentDir + '/Watermarked_PDF_Files'
            if 'root' in kwargs: root = kwargs['root']
            else: root = CurrentDir
            if 'need_watermarker' in kwargs:
                need_watermarker = kwargs['need_watermarker']
            else:
                need_watermarker = True
            if 'server' in kwargs: server_cdn = kwargs['server']
            else: server_cdn = ''
            if 'cookies' in kwargs: cookies = kwargs['cookies']
            else: cookies = ''
            if 'ftp_upload' in kwargs: ftp_upload = kwargs['ftp_upload']
            else: ftp_upload = ''
            if 'log_out' in kwargs:
                log_out = kwargs['log_out']
                if log_out != '':
                    link = {'link': url, 'log_out': log_out}
                else:
                    link = url
                    log_out = ''
            else:
                link = url
                log_out = ''
        try:

            url_watermark = kwargs['url_watermark']
        except:
            # url_watermark='Please insert Your url to add as watermark'
            url_watermark = 'www.free-peprs.elasa.ir'

        done = 0
        try:
            main_url = kwargs['main_url']

        except:
            main_url = url

        # link,proxy,user_pass = Find_Link(url).find_link(pr,us_pss)
        # link,proxy,user_pass=self.valid_link()
        # link=url

        if link != [] and link != None:
            if main_url != url:
                data_base_host = str(urlparse2(main_url).hostname)
                try:
                    ez_host = str(urlparse2(url).hostname)
                except:
                    ez_host = str(urlparse2(url[0]).hostname)
                try:
                    base_url = 'http://' + data_base_host
                    file_name_link = base_url + url.split(ez_host)[1]
                except:
                    base_url = 'http://' + data_base_host
                    try:
                        file_name_link = base_url + url[0].split(ez_host)[1]
                    except:
                        file_name_link = url
            else:
                file_name_link = url

            os.chdir(CurrentDir)
            # file_name = self.Find_Link(file_name_link).find_name(pdf_download_location,wat_locatton)
            # file_name.url_watermark=url_watermark

            # [html,proxy,user_pass,cookies]=self.Find_Link(link,pdfdir=pdf_download_location,water_pdfdir=wat_locatton,cookies=cookies).dowload_basePr_userpass(pr,us_pss,cookies)
            if (not (html.endswith('.pdf'))) and (html[:4] != '%PDF' or len(
                    re.findall('%%EOF', html)) == 0):
                [html, proxy, user_pass, cookies] = self.Find_Link(
                    main_url,
                    pdfdir=pdf_download_location,
                    water_pdfdir=wat_locatton,
                    cookies=cookies).dowload_basePr_userpass(pr,
                                                             us_pss,
                                                             cookies,
                                                             url=link)
            else:
                proxy = pr
                user_pass = us_pss
            try:
                if os.path.isfile(html):
                    file_is = 1
                else:
                    file_is = 0
            except:
                file_is = 0
            if (html != [] and html[:4] == '%PDF') or file_is == 1:
                PDF_File = import_mod(from_module='save_source',
                                      from_module2='PDF_File')
                if not (html.endswith('.pdf')):

                    # from save_source import PDF_File
                    file_name = self.Find_Link(file_name_link).find_name(
                        pdf_download_location, wat_locatton)
                    # file_name['url_watermark']=url_watermark
                    file_name.url_watermark = url_watermark
                else:
                    os.remove(cookies)
                    file_name = self.Find_Link(file_name_link).find_name(
                        pdf_download_location, wat_locatton)
                    file_name.filename = html.split('/')[-1]
                    # file_name.pdf_Folder_filename=file_name.pdf_Folder_filename.split('/')[-1]
                    file_name.url_watermark = url_watermark
                # file_name = PDF_File(link,pdf_download_location,wat_locatton).filename(link)
                # file_name = self.Find_Link(link).find_name(pdf_download_location,wat_locatton)

                if not need_watermarker == False:  #need wtaremarker is ok
                    os.chdir(CurrentDir)
                    if not os.path.isdir(pdf_download_location):
                        os.mkdir(pdf_download_location)
                    if not os.path.isdir(wat_locatton):
                        os.mkdir(wat_locatton)
                    pdf_dw_dir, pdf_dw_Wr_dir = PDF_File(
                        url, pdf_download_location,
                        wat_locatton).finall_file_saving(html,
                                                         file_name,
                                                         pdf_download_location,
                                                         no_watermarker=0)
                    # photo=PDF_File(url,pdf_download_location,wat_locatton).pdf_to_image(pdf=pdf_dw_dir,pages=0)
                    pdf_size = size(os.path.getsize(pdf_dw_dir))

                    pdf_dw_li = self.path2url(file_name.pdf_Folder_filename,
                                              server_cdn,
                                              pdf_download_location, root)
                    if file_is == 1 and html.endswith('.pdf'):
                        wt_pdf_size = size(os.path.getsize(pdf_dw_Wr_dir))
                        pdf_dw_Wr_li = self.path2url(
                            file_name.W_pdf_Folder_filename, server_cdn,
                            wat_locatton, root)
                    elif file_is == 1 and not html.endswith('.pdf'):
                        wt_pdf_size = pdf_size
                        pdf_dw_Wr_li = pdf_dw_li

                    else:
                        wt_pdf_size = size(os.path.getsize(pdf_dw_Wr_dir))
                        pdf_dw_Wr_li = self.path2url(
                            file_name.W_pdf_Folder_filename, server_cdn,
                            wat_locatton, root)

                    try:
                        os.remove(cookies)
                    except:
                        pass
                    print "fetching main paper link url ...\n\t%s" % pdf_dw_li[:]
                    print "fetching waterarker paper link url ...\n\t%s" % pdf_dw_Wr_li
                else:
                    if not os.path.isdir(pdf_download_location):
                        os.mkdir(pdf_download_location)

                    pdf_dw_dir, pdf_dw_Wr_dir = PDF_File(
                        url, pdf_download_location,
                        wat_locatton).finall_file_saving(html,
                                                         file_name,
                                                         pdf_download_location,
                                                         no_watermarker=1)
                    pdf_size = size(os.path.getsize(pdf_dw_dir))
                    # pdf_size=len(html)/1024 #in kbit
                    wt_pdf_size = ''
                    pdf_dw_li = self.path2url(file_name.pdf_Folder_filename,
                                              server_cdn,
                                              pdf_download_location, root)
                    print "fetching main paper link url ...\n\t%s" % pdf_dw_li[:]
                    pdf_dw_Wr_li = "No watter marker requested my be becuase of big size or lack of time"
                    print "fetching waterarker paper link url ...\n\t%s" % pdf_dw_Wr_li
                done = 1
                if ftp_upload == '1':
                    public_url = 'None'

                    if need_watermarker == True:  #need wtaremarker is ok
                        public_url = self.upload_file(
                            water_pdfdir=pdf_dw_Wr_dir, METODE='FTP')
                    else:
                        public_url = self.upload_file(water_pdfdir=pdf_dw_li,
                                                      METODE='FTP')
                else:
                    public_url = 'None'
                if public_url != 'None':
                    # try:
                    #     file=open(pdf_dw_dir);
                    #     file.close()
                    #     file=open(pdf_dw_Wr_dir);
                    #     file.close()
                    # except:
                    #     print 'pdfs are closed and reasy to removed from loal host!'
                    # os.close(pdf_dw_dir);os.close(pdf_dw_Wr_dir);
                    os.remove(pdf_dw_dir)
                    os.remove(pdf_dw_Wr_dir)
                    pdf_dw_Wr_li = public_url
                    pdf_dw_li = public_url
                else:
                    public_url = pdf_dw_Wr_li
                address = {
                    'url': str(url),
                    'pdf_name': file_name.filename,
                    'W_pdf_name': file_name.filename,
                    'W_pdf_local': wat_locatton,
                    'pdf_size': pdf_size,
                    'wt_pdf_size': wt_pdf_size,
                    'pdf_dir': pdf_dw_dir,
                    'wt_pdf_dir': pdf_dw_Wr_dir,
                    'pdf_dw_li': pdf_dw_li,
                    "pdf_dw_Wr_li": pdf_dw_Wr_li,
                    'public_url': public_url,
                    'proxy_worked': proxy,
                    'user_pass_worked': user_pass
                }
                return address

            # elif os.path.isfile(html):

            elif html[:4] != "%PDF" and html != []:

                print 'file is not in PDF Format do you want to make a save it as html file'
                print 'format is ' + html[:4]
                print '*************html is :***********\n\n'
                print html
                print '************* end of html :***********\n\n'
                print '\n file link which found is :\n' + link + '\nbut file can not be downloaded '
            else:
                print 'file link which found is :\n'
                #print str(link['link']);
                print 'but file can not be downloaded '

        if done == 0:
            print 'we are unable to download from this address because can not find proper link '
            address = {
                'url': str(url),
                'pdf_dir': '',
                'pdf_size': '',
                'wt_pdf_size': '',
                'wt_pdf_dir': '',
                'pdf_dw_li': '',
                "pdf_dw_Wr_li": '',
                'public_url': '',
                'proxy_worked': '',
                'user_pass_worked': ''
            }
            return address

示例#10

0

显示文件

文件： dl_acm_org beffor cookies.py 项目： 001101/heroku-buildpack-python-ieee

    def get_pdf_link(self, proxy='', user_pass=''):

        url = self.url

        if proxy == '':

            fo = os.getcwd()
            pr_h, proxy_h, user_pass_h = self.proxy_checker3.make_returning_proxy(
                "configs//sites_proxy//", url)
            os.chdir(fo)
        else:
            pr_h = []
            user_pass_h = []
            pr_h.append(proxy)
            user_pass_h.append(user_pass)
            # i = user_pass_h.index("")
            # del user_pass_h[i]
            try:
                i = pr_h.index("")
                del pr_h[i]
            except:
                pass

        don_flg = -1
        if pr_h != []:
            i = -1
            site = urlparse2(url).hostname
            listhandle = self.file_rd(self.sites_list, 'r')
            file_listhandle = self.file_rd(self.sites_list_files, 'r')
            link_done = 0
            url_pdf = {}
            cookies = ''
            for j in range(i + 1, len(pr_h)):
                if don_flg != 1 and not url.endswith(
                        '.pdf') and link_done == 0:
                    time0 = time.time()

                    [html, proxy0, user_pass
                     ] = self.dowload_basePr_userpass(url, pr_h[j],
                                                      user_pass_h[j])

                    if link_done == 0 and html != []:
                        # links =self.soap_my(data=html, tag='FullTextPDF', attr='a', href='href',url=url)

                        links = self.soap_my(data=html,
                                             tag='FullTextPdf',
                                             attr='a',
                                             href='href',
                                             url=url)

                        if links == [] or links == None:
                            pass
                        else:
                            link_done = 1
                            print '---------------we found Proper link which is :------------\n'+str(links)+ \
                                  '\n ----with proxy-------\n'+str(pr_h[j])+':'+str(user_pass_h[j])
                            print '----------------- Link Found -------------------------'
                            break

                        for line in listhandle:
                            if re.findall(site, line) and link_done == 0 and (
                                    not re.findall("#",
                                                   line.split("TAG:")[0])):
                                if re.findall("TAG1:", line):
                                    try:
                                        Tag = line.split("TAG1:")[1].split(
                                            "---")[0]
                                        Tag = Tag.replace("+++", '')
                                        atrr = line.split("Attr1:")[1].split(
                                            "---")[0]
                                        atrr = atrr.replace("+++", '')
                                        href = line.split('Href1:')[1].split(
                                            "---")[0]
                                        href = href.replace("+++", '')
                                        links = self.soap_my(data=html,
                                                             tag=Tag,
                                                             attr=atrr,
                                                             href=href,
                                                             url=url)
                                        # links = self.soap_my(html, Tag, atrr,href)
                                        if links != [] or links != None:
                                            link_done = 1
                                            print '---------------we found Proper link which is :------------\n'+str(links)+ \
                                                  '\n ----with proxy-------\n'+str(pr_h[j])+':'+str(user_pass_h[j])
                                            print '----------------- Link Found -------------------------'
                                            return links, pr_h[j], user_pass_h[
                                                j]

                                    except:
                                        pass

                    elif link_done == 1:
                        print "<li><a>tag found</a></li>"

                        print links
                        break
                    elif link_done == 0:
                        if not os.path.isdir('configs/sites_proxy/' + site):
                            os.mkdir('configs/sites_proxy/' + site)
                        time_diff = str(round(time.time() - time0, 2))
                        if len(user_pass) != 0:
                            self.proxy_checker3.make_txt_file(
                                'configs/sites_proxy/' + site +
                                "/badproxylist.txt",
                                str(pr_h[j]) + '@' + str(user_pass_h[j]), site,
                                time_diff)
                        else:
                            self.proxy_checker3.make_txt_file(
                                'configs/sites_proxy/' + site +
                                "/badproxylist.txt", str(pr_h[j]), site,
                                time_diff)

                elif url != [] and url.endswith('.pdf'):

                    return url, '', ''

            if link_done == 0:
                links = []
                pr_h[j] = []
                user_pass_h[j] = []
                cookies = ''
                print "we couldnt find link beacuase of no proxy is able to download .find good proxy over internet"

            return links, pr_h[j], user_pass_h[j], cookies

        else:  # pr_h[j]=[] there is no trusted proxy for it
            html = self.dowload_basePr_userpass(url, 'None')
            links = LINK(url).soap_my(html, 'Full Text as PDF', 'a', 'href')
            if links == []:
                html = self.dowload_basePr_userpass(links, 'None')
                links2 = LINK(links).soap_my(
                    html, '<frame src="http://ieeexplore.ieee.org', 'frame',
                    'src')
                link = links2
                if links == [] or links == None:
                    print 'there is no trusted proxy for downloading it'
                else:
                    link_done = 1
            return links, [], []

示例#11

0

显示文件

文件： find_link.py 项目： 001101/heroku-buildpack-python-ieee

    def __init__(self,
                 url='',
                 sites_list='configs/sites_list_pdf_tags.txt',
                 sites_list_files="configs/sites_list_files.txt",
                 site_proxy="configs//sites_proxy//",
                 **kwargs):
        self.sites_list = sites_list
        self.sites_list_files = sites_list_files
        self.site_proxy = site_proxy
        self.url = url

        CurrentDir = os.path.dirname(os.path.realpath(__file__)).replace(
            '\\', '/')
        try:
            if kwargs['cookies']:
                self.cookies = kwargs['cookies']
            else:
                self.cookies = ''
        except:
            self.cookies = ''
        try:
            if 'pdfdir' in kwargs:
                self.pdf_download_location = kwargs['pdfdir']
        except:
            self.pdf_download_location = CurrentDir + '/PDF_Files'
        try:
            if 'water_pdfdir' in kwargs:
                self.wat_locatton = kwargs['water_pdfdir']

        except:
            self.wat_locatton = CurrentDir + '/Watermarked_PDF_Files'
        try:
            if 'root' in kwargs: self.root = kwargs['root']
        except:
            self.root = CurrentDir

        # from  download_mozilla import web
        proxy_checker3_all_function = import_mod(
            from_module='proxy_checker3_all_function')
        # import proxy_checker3_all_function
        self.proxy_checker3 = proxy_checker3_all_function
        # self.Mozilla_Web=web
        print 'url is ' + url
        site = urlparse2(url).hostname
        fo = os.getcwd()
        CurrentDir = os.path.dirname(os.path.realpath(__file__))
        s = CurrentDir.replace('\\', '/') + '/configs/Links_site/'
        print site

        # s2=os.getcwd()+'\\configs\\Links_site\\'
        self.file_exist = 0
        if os.path.isfile(s + site.replace('.', '_') + '.py'):
            # sys.path.insert(0, s)
            # ss=sys.path
            # print ss
            sys.path.insert(0, s)
            # os.chdir(s)
            # import importlib
            # module2 = importlib.import_module(site.replace('.','_'), package=None)
            # del module2
            si = sys.modules
            if site.replace('.', '_') in si:
                print "@@@@@@@@@@@@@@ module already exist  for  " + site + ' is \n: @@@@@@@@@@@@@@\n\n'
                self.new_module = si[site.replace('.', '_')]
            else:
                print "@@@@@@@@@@@@@@ module inserted for  " + site + ' is \n: @@@@@@@@@@@@@@\n\n'
                self.new_module = __import__(site.replace('.', '_'), {}, {},
                                             [], 2)

            # import imp
            # try:
            #     imp.find_module(site.replace('.','_'))
            #     found = True
            #     self.new_module=site.replace('.','_')
            #     print "@@@@@@@@@@@@@@ module already exist  for  "+site+' is \n: @@@@@@@@@@@@@@\n\n'
            # except ImportError:
            #     found = False
            #     self.new_module = __import__(site.replace('.','_'),{},{},[],2)
            #     print "@@@@@@@@@@@@@@ module inserted for  "+site+' is \n: @@@@@@@@@@@@@@\n\n'

            # self.new_module = __import__(site.replace('.','_'),{},{},[],2)
            print self.new_module
            self.file_exist = 1
        else:
            print "@@@@@@@@@@@@@@ module " + CurrentDir.replace(
                '\\', '/') + '/configs/Links_site/' + site.replace(
                    '.', '_') + '.py' + '\n Not found: @@@@@@@@@@@@@@\n\n'
        os.chdir(fo)

示例#12

0

显示文件

文件： find_link.py 项目： 001101/heroku-buildpack-python-ieee

    def find_link(self, proxy='', user_pass=''):
        url = self.url
        site = urlparse2(url).hostname
        link_done = 0
        url_pdf = {}
        if not url.endswith('.pdf'):
            # from  download_mozilla import web
            # from importlib import import_module
            # html=web().download(url)
            # html = br.open(url).read()

            # s=os.getcwd().replace('\\','/')+'/configs/Links_site/'
            # s2=os.getcwd()+'\\configs\\Links_site\\'
            # if os.path.isfile(os.getcwd().replace('\\','/')+'/configs/Links_site/'+site.replace('.','_')+'.py'):
            #         fo = os.getcwd().replace('\\','/')
            #         sys.path.insert(0, s)
            #         # CurrentDir = os.path.dirname(os.path.realpath(__file__))
            #         # os.chdir(s)
            #         # import importlib
            #         # module2 = importlib.import_module(site.replace('.','_'), package=None)
            #         new_module = __import__(site.replace('.','_'))
            #
            if self.file_exist == 1:

                res = self.new_module.LINK(url).get_pdf_link(proxy, user_pass)
                link = res['links']
                proxy = res['proxy']
                user_pass = res['user_pass']
                cookies = res['cookies']
                title = res['title']
                html = res['html']
                try:
                    log_out = res['log_out']
                except:
                    log_out = ''
                try:
                    form = res['form']
                except:
                    form = ''
                responce = {
                    'html': html,
                    'url': url,
                    'link': link,
                    'title': title,
                    'proxy': proxy,
                    'user_pass': user_pass,
                    'cookies': cookies,
                    'form': form,
                    'log_out': log_out
                }

            else:
                print "No " + site + '.py in config in ' + os.getcwd(
                ) + "\configs\Links_site"
                link = []
                responce = {
                    'html': html,
                    'url': url,
                    'link': link,
                    'title': title,
                    'proxy': proxy,
                    'user_pass': user_pass,
                    'cookies': cookies,
                }

                # for line in listhandle:
                #     if re.findall(site, line) and link_done == 0 and (not re.findall("#", line.split("TAG:")[0])) :
                #         new_module = __import__(line)
                #     import importlib
                #     module = importlib.import_module(line, package=None)

                # # lookup in a set is in constant time
                # safe_names = {"file1.py", "file2.py", "file3.py", ...}
                #
                # user_input = ...
                #
                # if user_input in safe_names:
                #     file = import_module(user_input)
                # else:
                #     print("Nope, not doing this.")

                #
                # if re.findall(site, line) and link_done == 0 and (not re.findall("#", line.split("TAG:")[0])) :
        else:  #if not  url.endwith( '.pdf'):
            print 'address you have entered is end with .pdf and link is the same'
            link = url

            responce = {
                'html': [],
                'url': url,
                'link': link,
                'title': '',
                'proxy': [],
                'user_pass': [],
                'cookies': '',
            }
        return responce

示例#13

0

显示文件

def writer(f, rq):
    while True:
        line = rq.get()
        f.write(line + '\n')

    site = urlparse2(test_url).hostname
    l = 0
    proxy_handler = []
    pr_h = []
    proxy_h = []
    user_pass_h = []
    try:

        listhandle = file_rd(input_file + site + ".txt", 'r')

        # sort_file(name,text_to_sort)
        for line in listhandle:
            if re.findall("For Site:", line):
                proxy1 = line.split("For Site:")[0]
                proxy2 = line.split("For Site:")[1]

                # proxyList.append(proxy1)
                pr, proxy_han, user_pass = make_proxy_handler(proxy1)
                # if pr!=[]:
                pr_h.append(pr)
                # if proxy_han!=[]:
                proxy_h.append(proxy_han)
                # if user_pass!=[]:
                user_pass_h.append(user_pass)
    except:
        listhandle = open(proxy_alive).readlines()
        for line in listhandle:
            if re.findall(site, line):
                if re.findall("For Site:", line):
                    proxy1 = line.split("For Site:")[0]
                    proxy2 = line.split("For Site:")[1]
                    if re.findall(site, proxy2):
                        # proxyList.append(proxy1)
                        pr, proxy_han, user_pass = make_proxy_handler(proxy1)
                        # if pr!=[]:
                        pr_h.append(pr)
                        # if proxy_han!=[]:
                        proxy_h.append(proxy_han)
                        # if user_pass!=[]:
                        user_pass_h.append(user_pass)
        if pr_h == []:
            proxy_checker(test_url, input_file + "proxylist.txt", proxy_alive,
                          input_file, 30)
            try:
                listhandle = open(input_file + site + ".txt").readlines()

                for line in listhandle:
                    if re.findall("For Site:", line):
                        proxy1 = line.split("For Site:")[0]
                        proxy2 = line.split("For Site:")[1]

                        # proxyList.append(proxy1)
                        pr, proxy_han, user_pass = make_proxy_handler(proxy1)
                        # if pr!=[]:
                        pr_h.append(pr)
                        # if proxy_han!=[]:
                        proxy_h.append(proxy_han)
                        # if user_pass!=[]:
                        user_pass_h.append(user_pass)
            except:
                pass
        try:
            listhandle = open(input_file + site + ".txt").readlines()
        except:
            proxy_checker(test_url, input_file + "proxylist.txt", proxy_alive,
                          input_file, 30)
            #             pip=proxy1
            # # if l==1:
            #             try :
            #                 pip.split('@')[1]
            #                 proxy_info = {
            #                     'user' : pip.split('@')[1].split(":")[0],
            #                     'pass' : pip.split('@')[1].split(":")[1].replace('\n', ''),
            #                     'host' : pip.split('@')[0].split(":")[0],
            #                     'port' : pip.split('@')[0].split(":")[1] # or 8080 or whatever
            #                 }
            #                 proxy_handler.append("http://%(user)s:%(pass)s@%(host)s:%(port)s" % proxy_info)
            #             except:
            #                 proxy_info = {
            #                     'host' : pip.split(":")[0],
            #                     'port' : pip.split(":")[1].replace('\n', '') # or 8080 or whatever
            #                 }
            #                 proxy_handler.append("http://%(host)s:%(port)s" % proxy_info)
    return pr_h, proxy_h, user_pass_h

示例#14

0

显示文件

    def download(self,
                 url='',
                 proxy='',
                 user_pass='',
                 location='PDF_Files/',
                 **kwargs):
        """

        :param url:
        """
        if kwargs['cookies']: cookies = kwargs['cookies']
        else: cookies = ''
        site = urlparse2(url).hostname

        # if proxy == '':
        #
        #     fo = os.getcwd()
        #     pr_h, proxy_h, user_pass_h = self.proxy_checker3.make_returning_proxy("configs//sites_proxy//"+site+'//', url)
        #     os.chdir(fo)
        if proxy == '' or proxy == []:
            proxy_checker3_all_function = import_mod(
                from_module='proxy_checker3_all_function')
            # import proxy_checker3_all_function
            fo = os.getcwd().replace('\\', '/')
            pr_h, proxy_h, user_pass_h = proxy_checker3_all_function.make_returning_proxy(
                "configs//sites_proxy//" + site + '//', url)
            os.chdir(fo)
        else:
            pr_h = []
            user_pass_h = []
            pr_h.append(proxy)
            user_pass_h.append(user_pass)
            # try:
            #     i = user_pass_h.index("")
            #     del user_pass_h[i]
            # except:
            #     print 'there is no empty lsit in user_password list'
            try:
                i = pr_h.index("")
                del pr_h[i]
            except:
                pass
                # print 'there is no empty list in proxy list'

        # pr_h=['222.66.115.233:80 ', '202.202.0.163:3128 ', '151.236.14.48:80']

        pdf_dw_li = pdf_dw_Wr_li = []
        frontpage = []
        don_flg = -1
        if pr_h != []:
            i = -1
            for j in range(i + 1, len(pr_h)):
                if don_flg != 1:
                    debug = True
                    cash = None
                    # dl = MozillaEmulator(cash,0,debug)
                    dl = MozillaEmulator(cash, 0, cookies=cookies)
                    try:
                        if user_pass_h[j] != '':

                            frontpage, cookies = dl.download(
                                url, pr_h[j], user_pass_h[j])
                            pr = pr_h[j]
                            upss = user_pass_h[j]
                        else:
                            frontpage, cookies = dl.download(url, pr_h[j])
                            pr = pr_h[j]
                            upss = ''

                    except:
                        print "we cant download because of invalid tag or invalid proxy line 620" + "\n"

                    if frontpage != []:
                        if len(user_pass_h[j]) != 0:
                            print "file downloaded with " + str(
                                pr_h[j]) + '@' + str(user_pass_h[j])
                        else:
                            print "file downloaded with " + str(pr_h[j])
                        don_flg = 1
                        # pr = pr_h[j]
                        # upss = user_pass_h[j]
                        break
                else:
                    print "we could not download file with  proxy:" + pr_h[j]
            if don_flg != 1:
                print "we are unable to download your file Now!!" + '\n'
                frontpage = []
                pr = ''
                upss = ''
                # cookies=''

        else:
            print "we are unable to download your file Now!! Beacouse proxy is empty" + '\n'

        return frontpage, pr, upss, cookies

示例#15

0

显示文件

文件： main_core.py 项目： Heroku-elasa/heroku-buildpack-python-ieee-new

    def download_link(self,**kwargs):
        # global form,main_url
        if kwargs:
            CurrentDir=os.path.dirname(os.path.realpath(__file__)).replace('\\','/')
            if 'link' in kwargs:
                url0=kwargs['link']
                if type(url0) is list:
                    url=url0[0]
                else:
                    if url0[:4]=='www.':
                        url='http://'+url0[4:]
                    else:
                        url=url0
            else:
                url=''
            if 'html' in kwargs:html=kwargs['html']
            else:html=''
            if 'proxy' in kwargs:pr=kwargs['proxy']
            else:pr=''
            if 'user_pass' in kwargs:us_pss=kwargs['user_pass']
            else:us_pss=''
            if 'pdfdir' in kwargs:pdf_download_location=kwargs['pdfdir']
            else:pdf_download_location=CurrentDir+'/PDF_Files'
            if 'water_pdfdir' in kwargs:wat_locatton=kwargs['water_pdfdir']
            else:wat_locatton=CurrentDir+'/Watermarked_PDF_Files'
            if 'root' in kwargs:root=kwargs['root']
            else:root=CurrentDir
            if 'need_watermarker' in kwargs:need_watermarker=kwargs['need_watermarker']
            else:need_watermarker=True
            if 'server' in kwargs:server_cdn=kwargs['server']
            else:server_cdn=''
            if 'cookies' in kwargs:cookies=kwargs['cookies']
            else:cookies=''
            if 'ftp_upload' in kwargs:ftp_upload=kwargs['ftp_upload']
            else:ftp_upload=''
            if 'log_out' in kwargs:
                log_out=kwargs['log_out']
                if log_out!='':
                    link={
                        'link':url,
                        'log_out':log_out
                    }
                else:
                    link=url
                    log_out=''
            else:
                link=url
                log_out=''
        try:

            url_watermark=kwargs['url_watermark']
        except:
            # url_watermark='Please insert Your url to add as watermark'
            url_watermark='www.free-peprs.elasa.ir'

        done=0
        try:
            main_url=kwargs['main_url']

        except:
            main_url=url

        # link,proxy,user_pass = Find_Link(url).find_link(pr,us_pss)
        # link,proxy,user_pass=self.valid_link()
        # link=url


        if link!=[] and link!=None:
            if main_url!=url:
                data_base_host = str(urlparse2(main_url).hostname)
                try:
                    ez_host = str(urlparse2(url).hostname)
                except:
                    ez_host = str(urlparse2(url[0]).hostname)
                try:
                    base_url='http://'+data_base_host
                    file_name_link=base_url+url.split(ez_host)[1]
                except:
                    base_url='http://'+data_base_host
                    try:
                        file_name_link=base_url+url[0].split(ez_host)[1]
                    except:
                        file_name_link=url
            else:
                file_name_link=url

            os.chdir(CurrentDir)
            # file_name = self.Find_Link(file_name_link).find_name(pdf_download_location,wat_locatton)
            # file_name.url_watermark=url_watermark

            # [html,proxy,user_pass,cookies]=self.Find_Link(link,pdfdir=pdf_download_location,water_pdfdir=wat_locatton,cookies=cookies).dowload_basePr_userpass(pr,us_pss,cookies)
            if (not (html.endswith('.pdf'))) and (html[:4]!='%PDF' or len ( re.findall('%%EOF', html ))==0)  :
                [html,proxy,user_pass,cookies]=self.Find_Link(main_url,pdfdir=pdf_download_location,water_pdfdir=wat_locatton,cookies=cookies).dowload_basePr_userpass(pr,us_pss,cookies,url=link)
            else:
                proxy=pr;user_pass=us_pss;
            try:
                if os.path.isfile(html):
                    file_is=1
                else:file_is=0
            except:
                file_is=0
            if (html!=[] and html[:4]=='%PDF')or file_is==1 :
                PDF_File=import_mod(from_module='save_source',from_module2='PDF_File')
                if  not (html.endswith('.pdf')):

                    # from save_source import PDF_File
                    file_name = self.Find_Link(file_name_link).find_name(pdf_download_location,wat_locatton)
                    # file_name['url_watermark']=url_watermark
                    file_name.url_watermark=url_watermark
                else:
                    os.remove(cookies)
                    file_name = self.Find_Link(file_name_link).find_name(pdf_download_location,wat_locatton)
                    file_name.filename=html.split('/')[-1]
                    # file_name.pdf_Folder_filename=file_name.pdf_Folder_filename.split('/')[-1]
                    file_name.url_watermark=url_watermark
                # file_name = PDF_File(link,pdf_download_location,wat_locatton).filename(link)
                # file_name = self.Find_Link(link).find_name(pdf_download_location,wat_locatton)

                if not need_watermarker==False:#need wtaremarker is ok
                    os.chdir(CurrentDir)
                    if not os.path.isdir(pdf_download_location):
                        os.mkdir(pdf_download_location)
                    if not os.path.isdir(wat_locatton):
                        os.mkdir(wat_locatton)
                    pdf_dw_dir, pdf_dw_Wr_dir = PDF_File(url,pdf_download_location,wat_locatton).finall_file_saving(html, file_name,pdf_download_location,no_watermarker=0)
                    # photo=PDF_File(url,pdf_download_location,wat_locatton).pdf_to_image(pdf=pdf_dw_dir,pages=0)
                    pdf_size=size(os.path.getsize(pdf_dw_dir))

                    pdf_dw_li =self.path2url(file_name.pdf_Folder_filename,server_cdn,pdf_download_location,root)
                    if file_is==1 and html.endswith('.pdf'):
                        wt_pdf_size=size(os.path.getsize(pdf_dw_Wr_dir))
                        pdf_dw_Wr_li = self.path2url(file_name.W_pdf_Folder_filename,server_cdn,wat_locatton,root)
                    elif file_is==1 and not html.endswith('.pdf'):
                        wt_pdf_size=pdf_size
                        pdf_dw_Wr_li=pdf_dw_li

                    else:
                        wt_pdf_size=size(os.path.getsize(pdf_dw_Wr_dir))
                        pdf_dw_Wr_li = self.path2url(file_name.W_pdf_Folder_filename,server_cdn,wat_locatton,root)

                    try:
                        os.remove(cookies)
                    except:
                        pass
                    print "fetching main paper link url ...\n\t%s" % pdf_dw_li[:]
                    print "fetching waterarker paper link url ...\n\t%s" % pdf_dw_Wr_li
                else:
                    if not os.path.isdir(pdf_download_location):
                        os.mkdir(pdf_download_location)

                    pdf_dw_dir, pdf_dw_Wr_dir = PDF_File(url,pdf_download_location,wat_locatton).finall_file_saving(html, file_name,pdf_download_location,no_watermarker=1)
                    pdf_size=size(os.path.getsize(pdf_dw_dir))
                    # pdf_size=len(html)/1024 #in kbit
                    wt_pdf_size=''
                    pdf_dw_li =self.path2url(file_name.pdf_Folder_filename,server_cdn,pdf_download_location,root)
                    print "fetching main paper link url ...\n\t%s" % pdf_dw_li[:]
                    pdf_dw_Wr_li="No watter marker requested my be becuase of big size or lack of time"
                    print "fetching waterarker paper link url ...\n\t%s" % pdf_dw_Wr_li
                done=1
                if ftp_upload=='1':
                    public_url='None'

                    if  need_watermarker==True:#need wtaremarker is ok
                        public_url=self.upload_file(water_pdfdir=pdf_dw_Wr_dir,METODE='FTP')
                    else:
                        public_url=self.upload_file(water_pdfdir=pdf_dw_li,METODE='FTP')
                else:
                    public_url='None'
                if public_url!='None':
                    # try:
                    #     file=open(pdf_dw_dir);
                    #     file.close()
                    #     file=open(pdf_dw_Wr_dir);
                    #     file.close()
                    # except:
                    #     print 'pdfs are closed and reasy to removed from loal host!'
                    # os.close(pdf_dw_dir);os.close(pdf_dw_Wr_dir);
                    os.remove(pdf_dw_dir);os.remove(pdf_dw_Wr_dir);
                    pdf_dw_Wr_li=public_url;pdf_dw_li=public_url;
                else:
                    public_url=pdf_dw_Wr_li;
                address={
                    'url':str(url),
                    'pdf_name':file_name.filename,
                    'W_pdf_name':file_name.filename,
                    'W_pdf_local':wat_locatton,
                    'pdf_size':pdf_size,
                    'wt_pdf_size':wt_pdf_size,
                    'pdf_dir':pdf_dw_dir,
                    'wt_pdf_dir':pdf_dw_Wr_dir,
                    'pdf_dw_li':pdf_dw_li,
                    "pdf_dw_Wr_li":pdf_dw_Wr_li,
                    'public_url':public_url,
                    'proxy_worked':proxy,
                    'user_pass_worked':user_pass}
                return address

            # elif os.path.isfile(html):

            elif html[:4]!="%PDF" and html!=[]:

                print 'file is not in PDF Format do you want to make a save it as html file'
                print 'format is '+html[:4]
                print '*************html is :***********\n\n'
                print html
                print '************* end of html :***********\n\n'
                print '\n file link which found is :\n'+link+'\nbut file can not be downloaded '
            else:
                print 'file link which found is :\n';#print str(link['link']);
                print 'but file can not be downloaded '

        if done==0:
            print 'we are unable to download from this address because can not find proper link '
            address={
                'url':str(url),
                'pdf_dir':'',
                'pdf_size':'',
                'wt_pdf_size':'',
                'wt_pdf_dir':'',
                'pdf_dw_li':'',
                "pdf_dw_Wr_li":'',
                'public_url':'',
                'proxy_worked':'',
                'user_pass_worked':''}
            return address

示例#16

0

显示文件

    def get_pdf_link(self,proxy='', user_pass=''):

        url=self.url


        if proxy == '':

            fo = os.getcwd()
            pr_h, proxy_h, user_pass_h = self.proxy_checker3.make_returning_proxy("configs//sites_proxy//", url)
            os.chdir(fo)
        else:
            pr_h = []
            user_pass_h = []
            pr_h.append(proxy)
            user_pass_h.append(user_pass)
            i = user_pass_h.index("")
            del user_pass_h[i]
            try:
                i = pr_h.index("")
                del pr_h[i]
            except:
                pass

        don_flg = -1
        if pr_h != []:
            i = -1
            site = urlparse2(url).hostname
            listhandle = self.file_rd(self.sites_list, 'r')
            file_listhandle = self.file_rd(self.sites_list_files, 'r')
            link_done = 0
            url_pdf = {}
            for j in range(i + 1, len(pr_h)):
                if don_flg != 1 and not url.endswith('.pdf') and link_done == 0:

                    [html,proxy0,user_pass]=self.dowload_basePr_userpass(url,pr_h[j],user_pass_h[j])


                    if link_done == 0 :
                        links = self.soap_my(html, 'Full Text as PDF', 'a', 'href')
                        if links!=[]:
                            [html,proxy0,user_pass]=self.dowload_basePr_userpass(links,pr_h[j],user_pass_h[j])
                            links3=self.soap_my(html,'<frame src="http://ieeexplore.ieee.org','frame','src')
                        if links == [] or links==None:
                            pass
                        else:
                            link_done = 1
                            break

                        for line in listhandle:
                                if re.findall(site, line) and link_done == 0 and (not re.findall("#", line.split("TAG:")[0])) :
                                    if re.findall("TAG1:", line):
                                        try:
                                            Tag = line.split("TAG1:")[1].split("---")[0]
                                            Tag=Tag.replace("+++",'')
                                            atrr = line.split("Attr1:")[1].split("---")[0]
                                            atrr=atrr.replace("+++",'')
                                            href=line.split('Href1:')[1].split("---")[0]
                                            href=href.replace("+++",'')
                                            links = self.soap_my(html, Tag, atrr,href)
                                            if links != [] and link_done!=None:
                                                try:
                                                    Tag = line.split("TAG2:")[1].split("---")[0]
                                                    Tag=Tag.replace("---",'').replace("+++",'')

                                                    atrr = line.split("Attr2:")[1].split("---")[0]
                                                    atrr=atrr.replace('---','').replace("+++",'')
                                                    href=line.split('Href2:')[1].split("---")[0]
                                                    href=href.replace("+++",'')
                                                except:
                                                    return links,pr_h[j],user_pass_h[j]

                                                [html,proxy0]=self.dowload_basePr_userpass(links,pr_h[j],user_pass_h[j])

                                                links = self.soap_my(html, Tag, atrr,href)
                                                link_done = 1


                                        except:
                                            Tag = line.split("TAG1:")[1]
                                            Tag=Tag.replace("---",'')
                                            try:
                                                abstract_match = re.search("Full Text as PDF([^\']+)", html, re.IGNORECASE)
                                                abstract_url = "http://ieeexplore.ieee.org%s" % abstract_match.group(0)
                                                import lxml.html, codecs
                                                abs = []
                                                root = lxml.html.fromstring(html)
                                                for div in root:
                                                    t = div.text_content()
                                                    if t:
                                                        abs.append(t)

                                                links = LINK(url).soap_my(html, Tag)
                                                if links != []  and link_done!=None:
                                                    link_done = 1
                                            except:
                                                pass
                                        break


                    elif link_done == 1:
                        print "<li><a>tag found</a></li>"
                        print links
                        break

                elif url!=[] and url.endswith('.pdf'):

                    return url,'',''


            if link_done==0:
                links=[]
                pr_h[j]=[]
                user_pass_h[j]=[]
                print "we couldnt find link beacuase of no proxy is able to download .find good proxy over internet"


            return links,pr_h[j],user_pass_h[j]

        else: # pr_h[j]=[] there is no trusted proxy for it
            html=self.dowload_basePr_userpass(url,'None')
            links = LINK(url).soap_my(html, 'Full Text as PDF', 'a', 'href')
            if links==[]:
                html=self.dowload_basePr_userpass(links,'None')
                links3=LINK(links).soap_my(html,'<frame src="http://ieeexplore.ieee.org','frame','src')
                if links == [] or links==None:
                    print'there is no trusted proxy for downloading it'
                else:
                    link_done = 1
            return links,[],[]

示例#17

0

显示文件

文件： dl_acm_org beffor cookies.py 项目： Heroku-elasa/heroku-buildpack-python-ieee-new

    def get_pdf_link(self,proxy='', user_pass=''):

        url=self.url


        if proxy == '':

            fo = os.getcwd()
            pr_h, proxy_h, user_pass_h = self.proxy_checker3.make_returning_proxy("configs//sites_proxy//", url)
            os.chdir(fo)
        else:
            pr_h = []
            user_pass_h = []
            pr_h.append(proxy)
            user_pass_h.append(user_pass)
            # i = user_pass_h.index("")
            # del user_pass_h[i]
            try:
                i = pr_h.index("")
                del pr_h[i]
            except:
                pass

        don_flg = -1
        if pr_h != []:
            i = -1
            site = urlparse2(url).hostname
            listhandle = self.file_rd(self.sites_list, 'r')
            file_listhandle = self.file_rd(self.sites_list_files, 'r')
            link_done = 0
            url_pdf = {}
            cookies=''
            for j in range(i + 1, len(pr_h)):
                if don_flg != 1 and not url.endswith('.pdf') and link_done == 0:
                    time0=time.time()

                    [html,proxy0,user_pass]=self.dowload_basePr_userpass(url,pr_h[j],user_pass_h[j])


                    if link_done == 0 and html!=[]:
                        # links =self.soap_my(data=html, tag='FullTextPDF', attr='a', href='href',url=url)

                        links =self.soap_my(data=html, tag='FullTextPdf', attr='a', href='href',url=url)


                        if links == [] or links==None:
                            pass
                        else:
                            link_done = 1
                            print '---------------we found Proper link which is :------------\n'+str(links)+ \
                                  '\n ----with proxy-------\n'+str(pr_h[j])+':'+str(user_pass_h[j])
                            print '----------------- Link Found -------------------------'
                            break

                        for line in listhandle:
                            if re.findall(site, line) and link_done == 0 and (not re.findall("#", line.split("TAG:")[0])) :
                                if re.findall("TAG1:", line):
                                    try:
                                        Tag = line.split("TAG1:")[1].split("---")[0]
                                        Tag=Tag.replace("+++",'')
                                        atrr = line.split("Attr1:")[1].split("---")[0]
                                        atrr=atrr.replace("+++",'')
                                        href=line.split('Href1:')[1].split("---")[0]
                                        href=href.replace("+++",'')
                                        links =self.soap_my(data=html, tag=Tag, attr=atrr, href=href,url=url)
                                        # links = self.soap_my(html, Tag, atrr,href)
                                        if links != [] or links!=None:
                                                link_done = 1
                                                print '---------------we found Proper link which is :------------\n'+str(links)+ \
                                                      '\n ----with proxy-------\n'+str(pr_h[j])+':'+str(user_pass_h[j])
                                                print '----------------- Link Found -------------------------'
                                                return links,pr_h[j],user_pass_h[j]

                                    except:
                                        pass


                    elif link_done == 1:
                        print "<li><a>tag found</a></li>"

                        print links
                        break
                    elif link_done==0:
                        if not os.path.isdir('configs/sites_proxy/'+site):
                            os.mkdir('configs/sites_proxy/'+site)
                        time_diff = str(round(time.time() - time0, 2))
                        if len(user_pass)!=0:
                            self.proxy_checker3.make_txt_file('configs/sites_proxy/'+site+"/badproxylist.txt", str(pr_h[j])+'@'+str(user_pass_h[j]), site, time_diff)
                        else:
                            self.proxy_checker3.make_txt_file('configs/sites_proxy/'+site+"/badproxylist.txt", str(pr_h[j]), site, time_diff)


                elif url!=[] and url.endswith('.pdf'):

                    return url,'',''


            if link_done==0:
                links=[]
                pr_h[j]=[]
                user_pass_h[j]=[]
                cookies=''
                print "we couldnt find link beacuase of no proxy is able to download .find good proxy over internet"


            return links,pr_h[j],user_pass_h[j],cookies

        else: # pr_h[j]=[] there is no trusted proxy for it
            html=self.dowload_basePr_userpass(url,'None')
            links = LINK(url).soap_my(html, 'Full Text as PDF', 'a', 'href')
            if links==[]:
                html=self.dowload_basePr_userpass(links,'None')
                links2=LINK(links).soap_my(html,'<frame src="http://ieeexplore.ieee.org','frame','src')
                link=links2
                if links == [] or links==None:
                    print'there is no trusted proxy for downloading it'
                else:
                    link_done = 1
            return links,[],[]

示例#18

0

显示文件

文件： find_link.py 项目： Heroku-elasa/heroku-buildpack-python-ieee-new

    def __init__(self, url='', sites_list='configs/sites_list_pdf_tags.txt',
                 sites_list_files="configs/sites_list_files.txt",
                 site_proxy="configs//sites_proxy//", **kwargs):
        self.sites_list = sites_list
        self.sites_list_files = sites_list_files
        self.site_proxy = site_proxy
        self.url = url

        CurrentDir = os.path.dirname(os.path.realpath(__file__)).replace('\\', '/')
        try:
            if kwargs['cookies']:
                self.cookies = kwargs['cookies']
            else:
                self.cookies = ''
        except:
            self.cookies = ''
        try:
            if 'pdfdir' in kwargs: self.pdf_download_location = kwargs['pdfdir']
        except:
            self.pdf_download_location = CurrentDir + '/PDF_Files'
        try:
            if 'water_pdfdir' in kwargs: self.wat_locatton = kwargs['water_pdfdir']

        except:
            self.wat_locatton = CurrentDir + '/Watermarked_PDF_Files'
        try:
            if 'root' in kwargs: self.root = kwargs['root']
        except:
            self.root = CurrentDir


        # from  download_mozilla import web
        proxy_checker3_all_function = import_mod(from_module='proxy_checker3_all_function')
        # import proxy_checker3_all_function
        self.proxy_checker3 = proxy_checker3_all_function
        # self.Mozilla_Web=web
        print 'url is ' + url
        site = urlparse2(url).hostname
        fo = os.getcwd()
        CurrentDir = os.path.dirname(os.path.realpath(__file__))
        s = CurrentDir.replace('\\', '/') + '/configs/Links_site/'
        print site

        # s2=os.getcwd()+'\\configs\\Links_site\\'
        self.file_exist = 0
        if os.path.isfile(s + site.replace('.', '_') + '.py'):
            # sys.path.insert(0, s)
            # ss=sys.path
            # print ss
            sys.path.insert(0, s)
            # os.chdir(s)
            # import importlib
            # module2 = importlib.import_module(site.replace('.','_'), package=None)
            # del module2
            si = sys.modules
            if site.replace('.', '_') in si:
                print "@@@@@@@@@@@@@@ module already exist  for  " + site + ' is \n: @@@@@@@@@@@@@@\n\n'
                self.new_module = si[site.replace('.', '_')]
            else:
                print "@@@@@@@@@@@@@@ module inserted for  " + site + ' is \n: @@@@@@@@@@@@@@\n\n'
                self.new_module = __import__(site.replace('.', '_'), {}, {}, [], 2)

            # import imp
            # try:
            #     imp.find_module(site.replace('.','_'))
            #     found = True
            #     self.new_module=site.replace('.','_')
            #     print "@@@@@@@@@@@@@@ module already exist  for  "+site+' is \n: @@@@@@@@@@@@@@\n\n'
            # except ImportError:
            #     found = False
            #     self.new_module = __import__(site.replace('.','_'),{},{},[],2)
            #     print "@@@@@@@@@@@@@@ module inserted for  "+site+' is \n: @@@@@@@@@@@@@@\n\n'

            # self.new_module = __import__(site.replace('.','_'),{},{},[],2)
            print self.new_module
            self.file_exist = 1
        else:
            print "@@@@@@@@@@@@@@ module " + CurrentDir.replace('\\', '/') + '/configs/Links_site/' + site.replace('.',
                                                                                                                   '_') + '.py' + '\n Not found: @@@@@@@@@@@@@@\n\n'
        os.chdir(fo)

示例#19

0

显示文件

文件： find_link.py 项目： Heroku-elasa/heroku-buildpack-python-ieee-new

    def find_link(self, proxy='', user_pass=''):
        url = self.url
        site = urlparse2(url).hostname
        link_done = 0
        url_pdf = {}
        if not url.endswith('.pdf'):
            # from  download_mozilla import web
            # from importlib import import_module
            # html=web().download(url)
            # html = br.open(url).read()

            # s=os.getcwd().replace('\\','/')+'/configs/Links_site/'
            # s2=os.getcwd()+'\\configs\\Links_site\\'
            # if os.path.isfile(os.getcwd().replace('\\','/')+'/configs/Links_site/'+site.replace('.','_')+'.py'):
            #         fo = os.getcwd().replace('\\','/')
            #         sys.path.insert(0, s)
            #         # CurrentDir = os.path.dirname(os.path.realpath(__file__))
            #         # os.chdir(s)
            #         # import importlib
            #         # module2 = importlib.import_module(site.replace('.','_'), package=None)
            #         new_module = __import__(site.replace('.','_'))
            #
            if self.file_exist == 1:

                res = self.new_module.LINK(url).get_pdf_link(proxy, user_pass)
                link = res['links'];
                proxy = res['proxy'];
                user_pass = res['user_pass'];
                cookies = res['cookies']
                title = res['title'];
                html = res['html'];
                try:
                    log_out = res['log_out']
                except:
                    log_out = ''
                try:
                    form = res['form']
                except:
                    form = ''
                responce = {
                    'html': html,
                    'url': url,
                    'link': link,
                    'title': title,
                    'proxy': proxy,
                    'user_pass': user_pass,
                    'cookies': cookies,
                    'form': form,
                    'log_out': log_out
                }

            else:
                print "No " + site + '.py in config in ' + os.getcwd() + "\configs\Links_site"
                link = []
                responce = {
                    'html': html,
                    'url': url,
                    'link': link,
                    'title': title,
                    'proxy': proxy,
                    'user_pass': user_pass,
                    'cookies': cookies,
                }




                # for line in listhandle:
                #     if re.findall(site, line) and link_done == 0 and (not re.findall("#", line.split("TAG:")[0])) :
                #         new_module = __import__(line)
                #     import importlib
                #     module = importlib.import_module(line, package=None)


                # # lookup in a set is in constant time
                # safe_names = {"file1.py", "file2.py", "file3.py", ...}
                #
                # user_input = ...
                #
                # if user_input in safe_names:
                #     file = import_module(user_input)
                # else:
                #     print("Nope, not doing this.")

                #
                # if re.findall(site, line) and link_done == 0 and (not re.findall("#", line.split("TAG:")[0])) :
        else:#if not  url.endwith( '.pdf'):
            print 'address you have entered is end with .pdf and link is the same'
            link = url

            responce = {
                'html': [],
                'url': url,
                'link': link,
                'title': '',
                'proxy': [],
                'user_pass': [],
                'cookies': '',
            }
        return responce

示例#20

0

显示文件

文件： form_test_goodworking (2).py 项目： Heroku-elasa/heroku-buildpack-python-ieee-new

    # url='http://johnny.heliohost.org:2082/login/'
    # url='http://127.0.0.1/trash/test/elec-lab.tk%20Mover-201312290438/'
    url='http://ieeexplore.ieee.org/ielx5/8981/28518/01274437.pdf?tp=&arnumber=1274437&isnumber=28518'
    url='http://ieeexplore.ieee.org/xpl/login.jsp?tp=&arnumber=1274437&url=http%3A%2F%2Fieeexplore.ieee.org%2Fxpls%2Fabs_all.jsp%3Farnumber%3D1274437'
    url='http://cpanel.1freehosting.com/'
    [proxy,proxy_h,prx_pass]=make_returning_proxy('configs//sites_proxy//',url,'configs//proxy_alive.txt')
    proxy='http://'+''.join(proxy)

    if proxy!='http://':
        os.environ['http_proxy'] = proxy
    # os.environ['http_proxy'] = 'http://222.66.115.233:80'


    listform=file_rd(site_list_form,'r')
    # file_input='configs/sites_proxy'
    host= urlparse2(url).hostname

    #timeout in seconds see http://docs.python.org/library/socket.html#socket.setdefaulttimeout
    socket.setdefaulttimeout(100)
    # os.environ['http_proxy']=''
    # import requests
    # content = requests.get(url)
    # print content

    # Navigate to Google
    for line in listform:
        if line.find(host)!=-1:
            form_data=usr_tag(line)
            # html=login_to_site(url,form_data)
            import twill.commands
            # t1=TWILL_Browser(url,form_data).splinter()

示例#21

0

显示文件

文件： diy4ng4django4php-freepaper_rhcloud_com.py 项目： Heroku-elasa/heroku-buildpack-python-ieee-new

    def get_pdf_link(self,proxy='', user_pass=''):

        url=self.url


        if proxy == '':

            fo = os.getcwd()
            pr_h, proxy_h, user_pass_h = self.proxy_checker3.make_returning_proxy("configs//sites_proxy//", url)
            os.chdir(fo)
        else:
            pr_h = []
            user_pass_h = []
            pr_h.append(proxy)
            user_pass_h.append(user_pass)
            i = user_pass_h.index("")
            del user_pass_h[i]
            try:
                i = pr_h.index("")
                del pr_h[i]
            except:
                pass

        don_flg = -1
        if pr_h != []:
            i = -1
            site = urlparse2(url).hostname
            listhandle = self.file_rd(self.sites_list, 'r')
            file_listhandle = self.file_rd(self.sites_list_files, 'r')
            link_done = 0
            url_pdf = {}
            for j in range(i + 1, len(pr_h)):
                if don_flg != 1 and not url.endswith('.pdf') and link_done == 0:

                    [html,proxy0,user_pass]=self.dowload_basePr_userpass(url,pr_h[j],user_pass_h[j])


                    if link_done == 0 and html!=[]:
                        links =self.soap_my(data=html, tag='Full Text as PDF', attr='a', href='href',url=url)
                        # links = self.soap_my(html, 'Full Text as PDF', 'a', 'href')
                        if links!=[]:
                            [html,proxy0,user_pass]=self.dowload_basePr_userpass(links,pr_h[j],user_pass_h[j])
                            links =self.soap_my(data=html, tag='<frame src="http://ieeexplore.ieee.org', attr='frame', href='src',url=str(links))
                            # links2=self.soap_my(html,'<frame src="http://ieeexplore.ieee.org','frame','src')
                            # links=links2
                        if links == [] or links==None:
                            pass
                        else:
                            link_done = 1
                            break

                        for line in listhandle:
                            if re.findall(site, line) and link_done == 0 and (not re.findall("#", line.split("TAG:")[0])) :
                                if re.findall("TAG1:", line):
                                    try:
                                        Tag = line.split("TAG1:")[1].split("---")[0]
                                        Tag=Tag.replace("+++",'')
                                        atrr = line.split("Attr1:")[1].split("---")[0]
                                        atrr=atrr.replace("+++",'')
                                        href=line.split('Href1:')[1].split("---")[0]
                                        href=href.replace("+++",'')
                                        links =self.soap_my(data=html, tag=Tag, attr=atrr, href=href,url=url)
                                        # links = self.soap_my(html, Tag, atrr,href)
                                        if links != [] and link_done!=None:
                                            try:
                                                Tag = line.split("TAG2:")[1].split("---")[0]
                                                Tag=Tag.replace("---",'').replace("+++",'')

                                                atrr = line.split("Attr2:")[1].split("---")[0]
                                                atrr=atrr.replace('---','').replace("+++",'')
                                                href=line.split('Href2:')[1].split("---")[0]
                                                href=href.replace("+++",'')
                                                [html,proxy0,user_pass]=self.dowload_basePr_userpass(links,pr_h[j],user_pass_h[j])
                                                links =self.soap_my(data=html, tag=Tag, attr=atrr, href=href,url=str(links))
                                                # links = self.soap_my(html, Tag, atrr,href)
                                            except:pass
                                                # [html,proxy0,user_pass]=self.dowload_basePr_userpass(links,pr_h[j],user_pass_h[j])
                                                # links =self.soap_my(data=html, tag=Tag, attr=atrr, href=href,url=url)
                                                # links = self.soap_my(html, Tag, atrr,href)
                                            if links != [] or links!=None:
                                                link_done = 1
                                                print '---------------we found Proper link which is :------------\n'+str(links)+\
                                                      '\n ----with proxy-------\n'+str(pr_h[j])+':'+str(user_pass_h[j])
                                                print '----------------- Link Found -------------------------'
                                                return links,pr_h[j],user_pass_h[j]

                                    except:
                                        pass
                                        #     Tag = line.split("TAG1:")[1]
                                        #     Tag=Tag.replace("---",'')
                                        #     try:
                                        #         abstract_match = re.search("Full Text as PDF([^\']+)", html, re.IGNORECASE)
                                        #         abstract_url = "http://ieeexplore.ieee.org%s" % abstract_match.group(0)
                                        #         import lxml.html, codecs
                                        #         abs = []
                                        #         root = lxml.html.fromstring(html)
                                        #         for div in root:
                                        #             t = div.text_content()
                                        #             if t:
                                        #                 abs.append(t)
                                        #
                                        #         links = LINK(url).soap_my(html, Tag)
                                        #         if links != []  and link_done!=None:
                                        #             link_done = 1
                                        #     except:
                                        #         pass
                                        # break


                    elif link_done == 1:
                        print "<li><a>tag found</a></li>"
                        print links
                        break

                elif url!=[] and url.endswith('.pdf'):

                    return url,'',''


            if link_done==0:
                links=[]
                pr_h[j]=[]
                user_pass_h[j]=[]
                print "we couldnt find link beacuase of no proxy is able to download .find good proxy over internet"


            return links,pr_h[j],user_pass_h[j]

        else: # pr_h[j]=[] there is no trusted proxy for it
            html=self.dowload_basePr_userpass(url,'None')
            links = LINK(url).soap_my(html, 'Full Text as PDF', 'a', 'href')
            if links==[]:
                html=self.dowload_basePr_userpass(links,'None')
                links2=LINK(links).soap_my(html,'<frame src="http://ieeexplore.ieee.org','frame','src')
                link=links2
                if links == [] or links==None:
                    print'there is no trusted proxy for downloading it'
                else:
                    link_done = 1
            return links,[],[]

示例#22

0

显示文件

文件： form_test_goodworking.py 项目： 001101/heroku-buildpack-python-ieee

    proxy_checker3_all_function = import_mod(
        from_module='proxy_checker3_all_function')
    try:
        [proxy, proxy_h,
         prx_pass] = proxy_checker3_all_function.make_returning_proxy(
             "configs//sites_proxy//", url)
        proxy = 'http://' + ''.join(proxy)
        os.environ['http_proxy'] = proxy
        # os.environ['http_proxy'] = 'http://222.66.115.233:80'
    except:
        pass
    # [proxy,proxy_h,prx_pass]=proxy_checker3_all_function.make_returning_proxy('configs//sites_proxy//',url,'configs//proxy_alive.txt')

    listform = file_rd(site_list_form, 'r')
    # file_input='configs/sites_proxy'
    host = urlparse2(url).hostname

    #timeout in seconds see http://docs.python.org/library/socket.html#socket.setdefaulttimeout
    socket.setdefaulttimeout(100)
    # os.environ['http_proxy']=''
    # import requests
    # content = requests.get(url)
    # print content

    # Navigate to Google
    for line in listform:
        if line.find(host) != -1:
            form_data = usr_tag(line)
            # html=login_to_site(url,form_data)
            import twill.commands
            # t1=TWILL_Browser(url,form_data).splinter()