Пример #1
0
    def findPdfFromInfo(self, infoPageUrl):
        """Scrape a Scholar info page for a downloadable PDF.

        Returns a PdfObj with a direct URL when a trusted "[PDF]" link is
        present, otherwise a PdfObj obtained through the Get It!@Waterloo
        flow, or None when neither path yields anything.
        """
        page = self.session.get(infoPageUrl, headers=self.headers)
        soup = BeautifulSoup(page.content, 'lxml')

        container = soup.find('div', attrs={'id': 'gsc_title_gg'})
        if container is None:
            return None

        # A "[PDF]" label on a non-blacklisted source gives us a direct URL.
        label = container.find('span', attrs={'class': 'gsc_title_ggt'})
        if label is not None:
            if label.text == "[PDF]" and not self.badSource(
                    container.find('a')):
                return PdfObj('url', container.find('a')['href'])
            print('Non-PDF tag or bad source, using get it @ waterloo')

        # Fall back to the library-proxy links.
        candidates = container.findAll('div',
                                       attrs={'class': 'gsc_title_ggi'})
        for candidate in candidates:
            if candidate.text.strip() == 'Get It!@Waterloo':
                fetched = self.getWatPDF(candidate.find('a')['href'])
                if fetched is not None:
                    return fetched
        return None
Пример #2
0
 def getWatPDF(self, url, title=None):
     """Fetch *url* through the Waterloo library Selenium parser.

     Pauses 15s before downloading (presumably throttling — confirm),
     saves to 'paper.pdf', and returns a local PdfObj on success or
     None when the download failed.
     """
     print(url)
     time.sleep(15)
     outcome = WatLibSeleniumParser.downloadFromWatLib(url, 'paper.pdf')
     if outcome is None:
         return None
     return PdfObj('local', 'paper.pdf')
 def getWatPDF(self, url, title=None, pdfName='paper.pdf'):
     """Download *url* via WATPARSER into *pdfName* and wrap it as a PdfObj.

     Args:
         url: Waterloo library link to download from.
         title: unused here; kept for interface compatibility with callers.
         pdfName: local filename to save the PDF under.

     Returns:
         A local PdfObj on success, None when the download failed, or the
         result of WATPARSER.reset() when interrupted by the user.
     """
     print('Getting pdf from WatLib')
     print(url)
     # Bug fix: the download previously always wrote to 'paper.pdf' even
     # when a different pdfName was requested, so the returned PdfObj
     # could point at a stale or missing file.
     status = WATPARSER.downloadFromWatLib(url, pdfName)
     print('finish here')
     if status is None:
         print('None status')
         return None
     try:
         return PdfObj('local', pdfName)
     except KeyboardInterrupt:
         # A manual interrupt resets the parser's state instead of dying.
         return WATPARSER.reset()
    def findPapersFromCitations(self, url, toload):
        """Collect up to *toload* papers from a citations listing page.

        Each <li> result becomes a PdfObj: downloaded through the Waterloo
        library when a GetIt!@Waterloo outward link exists, otherwise an
        empty local PdfObj carrying only the title.

        Args:
            url: citations listing page URL.
            toload: maximum number of papers to return.

        Returns:
            list of PdfObj, at most *toload* entries.
        """
        response = SESSION.get(url)
        soup = BeautifulSoup(response.content, 'lxml')

        papers_ul = soup.find('ul', attrs={'id': 'documentListUl'})
        paper_divs = papers_ul.findAll('li')

        papers_list = []

        count = 0
        for pdiv in paper_divs:
            title = pdiv.find('span', attrs={
                'class': 'docTitle'
            }).text.replace('\n', '')

            # Bug fix: the original looped on the same pdiv.find(...) call,
            # which returns the identical first anchor every iteration --
            # an infinite loop when that anchor lacked the GetIt! image,
            # and an AttributeError when no outward link existed at all.
            # Scan every outward link once and keep the first Waterloo one.
            link = None
            for anchor in pdiv.findAll('a',
                                       attrs={'class': 'outwardLink'},
                                       href=True):
                img = anchor.find(
                    'img',
                    attrs={'title':
                           'GetIt!@Waterloo(opens in a new window)'})
                if img is not None:
                    link = anchor
                    break

            new_pdf = None
            if link is not None:
                new_pdf = self.getWatPDF(link['href'])

            if new_pdf is None:
                # No downloadable content; keep a title-only placeholder.
                new_pdf = PdfObj('local')

            new_pdf.setTitle(title)
            papers_list.append(new_pdf)

            count += 1
            # Only load the number of papers the caller asked for.
            if count >= toload:
                break

        return papers_list
Пример #5
0
    def findPapersFromCitations(self, citationsUrl):
        """Collect papers from a Google Scholar citations page.

        For every 'gs_r' result, try in order: a direct "[PDF]" link from
        a non-blacklisted source, then the Get It!@Waterloo library flow.
        Results with no retrievable content still yield a title-only PdfObj.

        Args:
            citationsUrl: the Scholar citations page to scrape.

        Returns:
            list of PdfObj, one per parsed result.
        """
        response = self.session.get(citationsUrl, headers=self.headers)
        soup = BeautifulSoup(response.content, 'lxml')

        linkExtracts = soup.findAll('div', attrs={'class': 'gs_r'})
        pdfList = []

        if linkExtracts is None:
            return pdfList

        for extract in linkExtracts:
            # Bug fix: calling .text on a missing <h3> raised
            # AttributeError, and the old `if title is not None` check was
            # dead code (.text never returns None). Also use a raw string
            # for the regex to avoid an invalid-escape DeprecationWarning.
            title_tag = extract.find('h3', attrs={'class': 'gs_rt'})
            title = None
            if title_tag is not None:
                # Strip "[PDF]"/"[HTML]"-style markers from the title.
                title = re.sub(r'(\[.*\])', '', title_tag.text)
            extract = extract.find('div', attrs={'class': 'gs_ggsm'})
            pdf_obj = PdfObj('local')
            pdf_obj.setTitle(title)
            print(pdf_obj.getTitle())

            if extract is None:
                print(
                    'Found PDF title but no PDF link. Returning only title: ' +
                    str(pdf_obj.getTitle()))
                pdfList.append(pdf_obj)
                continue

            # This code will skip links with [HTML] tag and fall through to
            # the "Get it at UWaterloo" flow for non-PDF / bad sources.
            tag = extract.find('span', attrs={'class': 'gs_ctg2'})
            if tag is not None and tag.text == "[PDF]" and not self.badSource(
                    extract.find('a')):
                pdf_obj.resetContent('url', extract.find('a')['href'])
                print('pdf url: ' + pdf_obj.getPathUrl() + ' has title ' +
                      str(pdf_obj.getTitle()))
                pdfList.append(pdf_obj)
                continue
            elif tag is not None:
                print('Non-PDF tag, using get it @ waterloo')

            potential_links = extract.findAll('a')

            notFound = True
            for link in potential_links:
                if link.text.strip() == "Get It!@Waterloo":
                    print('Get It!@Waterloo')
                    url = SessionInitializer.ROOT_URL + link['href']
                    pdf_obj = self.getWatPDF(url)
                    if pdf_obj is not None:
                        pdf_obj.setTitle(title)
                        notFound = False
                    else:
                        # Download failed; fall back to a title-only object.
                        pdf_obj = PdfObj('local')
                        pdf_obj.setTitle(title)
                    break

            if notFound:
                print(
                    'Found PDF title but no PDF content. Returning only title.'
                    + str(pdf_obj.getTitle()))
            pdfList.append(pdf_obj)

        pdfList = [p for p in pdfList if p is not None]
        return pdfList