Пример #1
0
    def findPdfFromInfo(self, infoPageUrl):
        """Scrape a Scholar info page for a downloadable PDF.

        Returns a PdfObj with a direct URL when a trusted "[PDF]" link is
        present, otherwise a PdfObj obtained through the Get It!@Waterloo
        flow, or None when neither path yields anything.
        """
        page = self.session.get(infoPageUrl, headers=self.headers)
        soup = BeautifulSoup(page.content, 'lxml')

        container = soup.find('div', attrs={'id': 'gsc_title_gg'})
        if container is None:
            return None

        # A "[PDF]" label on a non-blacklisted source gives us a direct URL.
        label = container.find('span', attrs={'class': 'gsc_title_ggt'})
        if label is not None:
            if label.text == "[PDF]" and not self.badSource(
                    container.find('a')):
                return PdfObj('url', container.find('a')['href'])
            print('Non-PDF tag or bad source, using get it @ waterloo')

        # Fall back to the library-proxy links.
        candidates = container.findAll('div',
                                       attrs={'class': 'gsc_title_ggi'})
        for candidate in candidates:
            if candidate.text.strip() == 'Get It!@Waterloo':
                fetched = self.getWatPDF(candidate.find('a')['href'])
                if fetched is not None:
                    return fetched
        return None
Пример #2
0
 def getWatPDF(self, url, title=None):
     """Fetch *url* through the Waterloo library Selenium parser.

     Pauses 15s before downloading (presumably throttling — confirm),
     saves to 'paper.pdf', and returns a local PdfObj on success or
     None when the download failed.
     """
     print(url)
     time.sleep(15)
     outcome = WatLibSeleniumParser.downloadFromWatLib(url, 'paper.pdf')
     if outcome is None:
         return None
     return PdfObj('local', 'paper.pdf')
 def getWatPDF(self, url, title=None, pdfName='paper.pdf'):
     """Download *url* via WATPARSER into *pdfName* and wrap it as a PdfObj.

     Args:
         url: Waterloo library link to download from.
         title: unused here; kept for interface compatibility with callers.
         pdfName: local filename to save the PDF under.

     Returns:
         A local PdfObj on success, None when the download failed, or the
         result of WATPARSER.reset() when interrupted by the user.
     """
     print('Getting pdf from WatLib')
     print(url)
     # Bug fix: the download previously always wrote to 'paper.pdf' even
     # when a different pdfName was requested, so the returned PdfObj
     # could point at a stale or missing file.
     status = WATPARSER.downloadFromWatLib(url, pdfName)
     print('finish here')
     if status is None:
         print('None status')
         return None
     try:
         return PdfObj('local', pdfName)
     except KeyboardInterrupt:
         # A manual interrupt resets the parser's state instead of dying.
         return WATPARSER.reset()
    def findPapersFromCitations(self, url, toload):
        """Collect up to *toload* papers from a citations listing page.

        Each <li> result becomes a PdfObj: downloaded through the Waterloo
        library when a GetIt!@Waterloo outward link exists, otherwise an
        empty local PdfObj carrying only the title.

        Args:
            url: citations listing page URL.
            toload: maximum number of papers to return.

        Returns:
            list of PdfObj, at most *toload* entries.
        """
        response = SESSION.get(url)
        soup = BeautifulSoup(response.content, 'lxml')

        papers_ul = soup.find('ul', attrs={'id': 'documentListUl'})
        paper_divs = papers_ul.findAll('li')

        papers_list = []

        count = 0
        for pdiv in paper_divs:
            title = pdiv.find('span', attrs={
                'class': 'docTitle'
            }).text.replace('\n', '')

            # Bug fix: the original looped on the same pdiv.find(...) call,
            # which returns the identical first anchor every iteration --
            # an infinite loop when that anchor lacked the GetIt! image,
            # and an AttributeError when no outward link existed at all.
            # Scan every outward link once and keep the first Waterloo one.
            link = None
            for anchor in pdiv.findAll('a',
                                       attrs={'class': 'outwardLink'},
                                       href=True):
                img = anchor.find(
                    'img',
                    attrs={'title':
                           'GetIt!@Waterloo(opens in a new window)'})
                if img is not None:
                    link = anchor
                    break

            new_pdf = None
            if link is not None:
                new_pdf = self.getWatPDF(link['href'])

            if new_pdf is None:
                # No downloadable content; keep a title-only placeholder.
                new_pdf = PdfObj('local')

            new_pdf.setTitle(title)
            papers_list.append(new_pdf)

            count += 1
            # Only load the number of papers the caller asked for.
            if count >= toload:
                break

        return papers_list
Пример #5
0
    def findPapersFromCitations(self, citationsUrl):
        """Collect papers from a Google Scholar citations page.

        For every 'gs_r' result, try in order: a direct "[PDF]" link from
        a non-blacklisted source, then the Get It!@Waterloo library flow.
        Results with no retrievable content still yield a title-only PdfObj.

        Args:
            citationsUrl: the Scholar citations page to scrape.

        Returns:
            list of PdfObj, one per parsed result.
        """
        response = self.session.get(citationsUrl, headers=self.headers)
        soup = BeautifulSoup(response.content, 'lxml')

        linkExtracts = soup.findAll('div', attrs={'class': 'gs_r'})
        pdfList = []

        if linkExtracts is None:
            return pdfList

        for extract in linkExtracts:
            # Bug fix: calling .text on a missing <h3> raised
            # AttributeError, and the old `if title is not None` check was
            # dead code (.text never returns None). Also use a raw string
            # for the regex to avoid an invalid-escape DeprecationWarning.
            title_tag = extract.find('h3', attrs={'class': 'gs_rt'})
            title = None
            if title_tag is not None:
                # Strip "[PDF]"/"[HTML]"-style markers from the title.
                title = re.sub(r'(\[.*\])', '', title_tag.text)
            extract = extract.find('div', attrs={'class': 'gs_ggsm'})
            pdf_obj = PdfObj('local')
            pdf_obj.setTitle(title)
            print(pdf_obj.getTitle())

            if extract is None:
                print(
                    'Found PDF title but no PDF link. Returning only title: ' +
                    str(pdf_obj.getTitle()))
                pdfList.append(pdf_obj)
                continue

            # This code will skip links with [HTML] tag and fall through to
            # the "Get it at UWaterloo" flow for non-PDF / bad sources.
            tag = extract.find('span', attrs={'class': 'gs_ctg2'})
            if tag is not None and tag.text == "[PDF]" and not self.badSource(
                    extract.find('a')):
                pdf_obj.resetContent('url', extract.find('a')['href'])
                print('pdf url: ' + pdf_obj.getPathUrl() + ' has title ' +
                      str(pdf_obj.getTitle()))
                pdfList.append(pdf_obj)
                continue
            elif tag is not None:
                print('Non-PDF tag, using get it @ waterloo')

            potential_links = extract.findAll('a')

            notFound = True
            for link in potential_links:
                if link.text.strip() == "Get It!@Waterloo":
                    print('Get It!@Waterloo')
                    url = SessionInitializer.ROOT_URL + link['href']
                    pdf_obj = self.getWatPDF(url)
                    if pdf_obj is not None:
                        pdf_obj.setTitle(title)
                        notFound = False
                    else:
                        # Download failed; fall back to a title-only object.
                        pdf_obj = PdfObj('local')
                        pdf_obj.setTitle(title)
                    break

            if notFound:
                print(
                    'Found PDF title but no PDF content. Returning only title.'
                    + str(pdf_obj.getTitle()))
            pdfList.append(pdf_obj)

        pdfList = [p for p in pdfList if p is not None]
        return pdfList