Пример #1
0
    def download_pdf(self,record,force_overwrite=False,pdf_choice=None,**kwargs):
        """Locate and download the pdf belonging to a Web of Science record.

        The record page is fetched with ``self.br`` and scanned for a DOI.
        When a DOI is found it is resolved with a second, non proxied
        browser, and every link on the landing page whose url contains both
        'pdf' and the DOI becomes a candidate.  When no DOI is found, or the
        landing page offers no usable candidate, the record title is
        submitted to Google and the first non-Google hit is examined instead.

        Parameters:
            record: passed to ``self.get_recid``; ``record['Title']`` is
                read only in the Google fallback.
            force_overwrite: when false and the target pdf already exists,
                nothing is downloaded.
            pdf_choice: index into the list of candidate pdf links.  When it
                is None and more than one candidate is found, the candidates
                are returned instead of downloading, so the caller can call
                again with an explicit index.
            **kwargs: accepted and ignored.

        Returns:
            None after a download (or when the pdf already exists), or the
            list of candidate pdf urls when the choice is ambiguous.

        Raises:
            IndexError: when the Google fallback yields no result link, no
                pdf link, or ``pdf_choice`` is out of range there.
        """
        import os
        import re

        recid = self.get_recid(record)

        #first, check to see if we already have the pdf
        pdf_path = self.pdfdir + '/' + recid + '.pdf'
        if not force_overwrite and os.path.exists(pdf_path):
            print('already exists: ' + pdf_path)
            return

        #if not, make sure pdfdir exists so the retrieve at the end cannot
        #fail merely because the directory is missing
        if self.pdfdir and not os.path.isdir(self.pdfdir):
            os.makedirs(self.pdfdir)

        #only needed once we know that a download is really required
        from citco import setup_browser

        #start a non proxied browser for later use
        non_proxy_br = setup_browser()

        record_page = 'http://apps.isiknowledge.com/CitedFullRecord.do?product=WOS&SID='+self.isi_ID+'&search_mode=CitedFullRecord&isickref=' + recid + '&db_id=WOS&colname=WOS'
        html = self.br.open(record_page).read()

        #try to get the doi
        pdf_link = None
        doi_matches = re.findall('DOI.*>(.*)<', html)
        if doi_matches:
            doi = doi_matches[0].strip()

            #the proxied browser was refused ('Forbidden') on the doi pages, so use no proxy here
            non_proxy_br.open('http://dx.doi.org/'+doi)
            possible_pdfs = [link.absolute_url for link in non_proxy_br.links()
                             if 'pdf' in link.url and doi in link.url]
            if len(possible_pdfs) > 1 and pdf_choice is None:
                return possible_pdfs
            #pdf_choice itself is left untouched so that the fallback below
            #can still tell "caller made no choice" apart from "index 0"
            try:
                pdf_link = possible_pdfs[pdf_choice or 0]
            except IndexError:
                print('No usable pdf link on the doi page! Googling for the title. Wish me luck!')
        else:
            print('Unable to get a doi for this record! Googling for the title. Wish me luck!')

        if pdf_link is None:
            #fallback: google the title and inspect the first external hit
            self.br.open('https://www.google.com')
            self.br.select_form(nr=0)
            self.br.form['q'] = record['Title']
            self.br.submit()
            links = [link for link in self.br.links()
                     if 'http' in link.url and 'google' not in link.url]
            if not links:
                raise IndexError('no search results found for title %r' % (record['Title'],))

            #follow redirects without the proxy to learn the final url
            non_proxy_br.open(links[0].url)
            new_url = non_proxy_br.geturl()
            if new_url.endswith('.pdf'):
                pdf_link = new_url
            else:
                self.br.open(new_url)
                possible_pdfs = [link.absolute_url for link in self.br.links()
                                 if 'pdf' in link.url]
                if len(possible_pdfs) > 1 and pdf_choice is None:
                    return possible_pdfs
                if not possible_pdfs:
                    raise IndexError('no pdf links found on ' + new_url)
                pdf_link = possible_pdfs[pdf_choice or 0]

        print('attempting to download '+pdf_link)
        print('saving to ' + pdf_path)
        try:
            self.br.retrieve(pdf_link,filename=pdf_path)
        except Exception:
            #second attempt without the proxy; KeyboardInterrupt and
            #SystemExit are deliberately not intercepted
            non_proxy_br.retrieve(pdf_link,filename=pdf_path)
Пример #2
0
    def search(self,search_term):
        """Run a Web of Science advanced search and return the parsed hits.

        A fresh browser is created with ``setup_browser(**rcParams)``, the
        advanced search form is filled with ``search_term`` and submitted.
        The first ``summary.do`` link in the response that mentions
        'AdvancedSearch' is then re-requested with a page size of 200, and
        that page plus the browser's links are passed to ``self.isi2dict``.

        Returns an empty list when the response has no such summary link,
        otherwise whatever ``self.isi2dict`` returns.
        """
        import re
        from citco import rcParams, setup_browser

        site = 'http://apps.isiknowledge.com/'
        form_url = site + 'WOS_AdvancedSearch_input.do?product=WOS&SID=' + self.isi_ID + '&search_mode=AdvancedSearch'

        browser = setup_browser(**rcParams)
        browser.open(form_url)
        browser.select_form(name='WOS_AdvancedSearch_input_form')
        browser['value(input1)'] = search_term
        response_html = browser.submit().read()

        # each match runs from 'summary.do' up to and including the closing quote
        for href in re.findall(r'summary\.do.*?"', response_html):
            if 'AdvancedSearch' in href:
                break
        else:
            # no summary link for this search
            return []

        # drop the trailing quote and ask for 200 results on one page
        results_url = site + href[:-1] + '&page=1&action=changePageSize&pageSize=200'
        results_html = browser.open(results_url).read()
        return self.isi2dict(results_html, browser.links())