Example #1
    def _ids_received(self, message, callback, error_callback):

        if not message.status_code == Soup.KnownStatusCode.OK:
            error_callback('Pubmed replied with error code %d.' %
                           message.status_code)
        else:
            response_data = message.response_body.flatten().get_data()
            parsed_response = BeautifulSoup.BeautifulStoneSoup(response_data)

            # Check whether there were any hits at all
            if int(parsed_response.esearchresult.count.string) == 0:
                return  # Nothing to do anymore

            # Continue with a second request asking for the summaries
            web_env = parsed_response.esearchresult.webenv.string
            query_key = parsed_response.esearchresult.querykey.string
            log_debug('Continuing Pubmed query (downloading summaries)')
            query = BASE_URL + ESUMMARY_QUERY % (query_key, web_env)

            message = Soup.Message.new(method='GET', uri_string=query)

            def mycallback(session, message, user_data):
                self._summaries_received(message, callback, error_callback)

            soup_session.queue_message(message, mycallback, None)
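For reference, the query templates used above live at module level. A minimal sketch of what they could look like for NCBI's E-utilities; the exact URL and template strings are assumptions, not taken from the original module:

# Assumed query templates for NCBI's E-utilities (illustrative, not from the source).
# The esearch call stores the matching IDs on the server ('usehistory=y'); the
# follow-up esummary call refers to that result set via query_key and WebEnv.
BASE_URL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
ESEARCH_QUERY = 'esearch.fcgi?db=pubmed&usehistory=y&term=%s'
ESUMMARY_QUERY = 'esummary.fcgi?db=pubmed&query_key=%s&WebEnv=%s'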
Example #2
    def parse_response(self, response):
        node = BeautifulSoup.BeautifulSoup(
            response,
            convertEntities=BeautifulSoup.BeautifulSoup.HTML_ENTITIES)
        papers = []
        for result in node.findAll('div', attrs={'class': 'gs_r'}):
            paper = {}
            try:
                title_node = result.findAll('h3', attrs={'class': 'gs_rt'})[0]
                #Can be a link or plain text
                title_link = title_node.findAll('a')
                if title_link:
                    log_debug('title_link: %s' % \
                              title_link[0].prettify())
                    paper['title'] = title_link[0].string
                    paper['import_url'] = title_link[0]['href']
                else:
                    paper['title'] = title_node.string
                    paper['import_url'] = ''

                if not paper['import_url'].startswith('http'):
                    paper['import_url'] = BASE_URL + paper['import_url']

                try:
                    author_journal_publisher = result.findAll(
                        'div', attrs={'class': 'gs_a'})[0]
                    log_debug('Author string: %s' % \
                                     str(author_journal_publisher.text))
                    authors, journal, publisher = \
                              author_journal_publisher.text.split(' - ')
                    paper['authors'] = authors.split(',')
                    journal_year = journal.split(',')
                    if len(journal_year) == 2:
                        paper['journal'] = journal_year[0]
                        paper['year'] = journal_year[1]
                    elif len(
                            journal_year) == 1:  # might be a year or a journal
                        try:
                            paper['year'] = str(int(journal_year[0]))
                        except ValueError:
                            paper['journal'] = journal_year[0]
                    paper['publisher'] = publisher
                except:
                    pass

                try:
                    paper['abstract'] = html_strip(
                        result.findAll('div', attrs='gs_rs')[0].text)
                except:
                    pass

                # Also attach the html data so it can be used later for
                # importing the document
                paper['data'] = result
            except:
                traceback.print_exc()
            papers.append(paper)

        return papers
Example #3
 def save_file(self, filename, raw_contents, save=True):
     log_debug('Generating md5 sum')
     self.full_text_md5 = hashlib.md5(raw_contents).hexdigest()
     log_debug('Saving file content')
     self.full_text.save(filename,
                         django.core.files.base.ContentFile(raw_contents),
                         save)
     self.save()
Example #4
    def parse_response(self, response):
        node = BeautifulSoup.BeautifulSoup(response,
                                           convertEntities=BeautifulSoup.BeautifulSoup.HTML_ENTITIES)
        papers = []
        for result in node.findAll('div', attrs={'class': 'gs_r'}):
            paper = {}
            try:
                title_node = result.findAll('h3',
                                            attrs={'class': 'gs_rt'})[0]
                #Can be a link or plain text
                title_link = title_node.findAll('a')
                if title_link:
                    log_debug('title_link: %s' % \
                              title_link[0].prettify())
                    paper['title'] = title_link[0].string
                    paper['import_url'] = title_link[0]['href']
                else:
                    paper['title'] = title_node.string
                    paper['import_url'] = ''

                if not paper['import_url'].startswith('http'):
                    paper['import_url'] = BASE_URL + paper['import_url']

                try:
                    author_journal_publisher = result.findAll('div',
                                             attrs={'class': 'gs_a'})[0]
                    log_debug('Author string: %s' % \
                                     str(author_journal_publisher.text))
                    authors, journal, publisher = \
                              author_journal_publisher.text.split(' - ')
                    paper['authors'] = authors.split(',')
                    journal_year = journal.split(',')
                    if len(journal_year) == 2:
                        paper['journal'] = journal_year[0]
                        paper['year'] = journal_year[1]
                    elif len(journal_year) == 1:  # might be a year or a journal
                        try:
                            paper['year'] = str(int(journal_year[0]))
                        except ValueError:
                            paper['journal'] = journal_year[0]
                    paper['publisher'] = publisher
                except:
                    pass

                try:
                    paper['abstract'] = html_strip(result.findAll('div',
                                                                  attrs='gs_rs')[0].text)
                except:
                    pass

                # Also attach the html data so it can be used later for
                # importing the document
                paper['data'] = result
            except:
                traceback.print_exc()
            papers.append(paper)

        return papers
Example #5
    def parse_response(self, response):
        """
        Parse the arXiv response, which is in Atom XML format.
        
        The feed itself provides more or less all the
        information required without needing any extra requests.
        """

        papers = []
        try:
            parsed = feedparser.parse(response)
        except Exception as ex:
            log_error("arxiv: error while parsing response: %s" % ex[0])
            return papers

        log_debug("arxiv: received response containing %d results" %
                  len(parsed.entries))
        for entry in parsed.entries:
            paper = {}

            try:
                paper['title'] = entry['title']
                for link in entry['links']:
                    if link.get('title', None) == 'pdf':
                        paper['import_url'] = link['href']
                        break

                paper['authors'] = [a['name'] for a in entry['authors']]
                if 'arxiv_journal_ref' in entry:
                    paper['journal'] = entry['arxiv_journal_ref']
                if 'arxiv_doi' in entry:
                    paper['doi'] = entry['arxiv_doi']
                if 'arxiv_comment' in entry:
                    paper['notes'] = entry['arxiv_comment']
                paper['year'] = entry['published_parsed'].tm_year
                paper['arxiv_id'] = entry['id']
                paper['url'] = entry['id']
                paper['abstract'] = entry['summary'].replace('\n', ' ')
                if 'arxiv_primary_category' in entry:
                    paper['arxiv_type'] = entry['arxiv_primary_category'].get(
                        'term', '')

                paper['created'] = datetime.datetime(
                    year=entry['published_parsed'].tm_year,
                    month=entry['published_parsed'].tm_mon,
                    day=entry['published_parsed'].tm_mday)
                paper['updated'] = datetime.datetime(
                    year=entry['updated_parsed'].tm_year,
                    month=entry['updated_parsed'].tm_mon,
                    day=entry['updated_parsed'].tm_mday)

                paper['data'] = paper  #messy

                papers += [paper]
            except Exception as ex:
                log_error("arxiv: error while reading item: %s" % ex[0])

        return papers
Example #6
 def callback_wrapper(search_results):
     '''
     Before calling the actual callback, save the result in the
     cache and add `user_data` (tuple identifying request and search
     provider) to the call.
     '''
     log_debug('Saving %s in cache for "%s"' % (search_results, search_string))
     self.search_cache[search_string] = search_results
     callback(user_data, search_results)
Example #7
    def _got_bibtex(self, message, callback, user_data):
        if message.status_code == Soup.KnownStatusCode.OK:
            bibtex_data = message.response_body.flatten().get_data()

            log_debug('Received BibTeX data:\n%s' % bibtex_data)
            paper_info = paper_info_from_bibtex(bibtex_data)
        else:
            log_error('google scholar got status code %d' % message.status_code)
            paper_info = None
        callback(paper_info, None, user_data)
Example #8
def paper_info_from_bibtex(data):
    
    if data is None:
        return {}

    # ieee puts <br>s in their bibtex
    data = data.replace('<br>', '\n')

    paper_info = {}

    result = parse_str(data)
    if len(result) == 0:
        log_warn('Could not parse BibTeX data')
        return {}
    
    bibtex = {}
    # FIXME: This does not handle special cases well...
    for i, r in enumerate(result[0][2:]):
        bibtex[r[0].lower()] = ''.join([str(r_part) for r_part in r[1:]])

    # Strip DOI resolver URL prefixes that some publishers include
    if bibtex.get('doi', '').startswith('http://dx.doi.org/'):
        bibtex['doi'] = bibtex['doi'][len('http://dx.doi.org/'):]
    if bibtex.get('doi', '').startswith('http://doi.acm.org/'):
        bibtex['doi'] = bibtex['doi'][len('http://doi.acm.org/'):]

    # Mappings from BibTeX to our keys
    # TODO: Handle more fields
    mappings = {'doi': 'doi',
                'url': 'import_url',
                'title': 'title',
                'pages': 'pages',
                'abstract': 'abstract',
                'journal': 'journal',
                'year': 'year',
                'publisher': 'publisher'}

    for bibtex_key, our_key in mappings.items():
        if bibtex_key in bibtex:
            log_debug('Have key %s' % bibtex_key)
            # replace newlines with spaces and remove superfluous spaces
            paper_info[our_key] = bibtex[bibtex_key].replace('\n', ' ').strip()

    # TODO: Handle editors, etc.?
    if 'author' in bibtex:
        if ' AND ' in bibtex['author']:
            paper_info['authors'] = bibtex['author'].split(' AND ')
        else:
            paper_info['authors'] = bibtex['author'].split(' and ')

    paper_info['bibtex'] = data
    log_info('imported paper_info: %s\nFrom bibtex: %s' % (str(paper_info), str(bibtex)))

    return paper_info
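A minimal usage sketch for the function above; the BibTeX entry is made up and only serves to show which keys end up in the returned dictionary:

# Illustrative call (the BibTeX entry is invented).
sample = """@article{doe2010example,
  author = {Doe, John and Smith, Jane},
  title = {An Example Title},
  journal = {Journal of Examples},
  year = {2010},
  doi = {http://dx.doi.org/10.1000/example}
}"""
info = paper_info_from_bibtex(sample)
# Given the mappings above, `info` should contain 'title', 'journal', 'year',
# 'authors' (split on ' and '), 'doi' (resolver prefix stripped) and 'bibtex'.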
Example #9
 def parse_response(self, response):
     """
     Parse the arXiv response, which is in Atom XML format.
     
     The feed itself provides more or less all the
     information required without needing any extra requests.
     """
     
     papers = []
     try:
         parsed = feedparser.parse(response)
     except Exception as ex:
         log_error("arxiv: error while parsing response: %s" % ex[0])
         return papers
     
     log_debug("arxiv: received response containing %d results" % len(parsed.entries))    
     for entry in parsed.entries:
         paper = {}
         
         try:
             paper['title'] = entry['title']
             for link in entry['links']:
                 if link.get('title', None) == 'pdf':
                     paper['import_url'] = link['href']
                     break
                 
             paper['authors'] = [a['name'] for a in entry['authors']]
             if 'arxiv_journal_ref' in entry:
                 paper['journal'] = entry['arxiv_journal_ref']
             if 'arxiv_doi' in entry:
                 paper['doi'] = entry['arxiv_doi']
             if 'arxiv_comment' in entry:
                 paper['notes'] = entry['arxiv_comment']
             paper['year'] = entry['published_parsed'].tm_year
             paper['arxiv_id'] = entry['id']
             paper['url'] = entry['id']
             paper['abstract'] = entry['summary'].replace('\n', ' ')
             if 'arxiv_primary_category' in entry:
                 paper['arxiv_type'] = entry['arxiv_primary_category'].get('term', '')
             
             paper['created'] = datetime.datetime(year=entry['published_parsed'].tm_year,
                                                  month=entry['published_parsed'].tm_mon,
                                                  day=entry['published_parsed'].tm_mday)
             paper['updated'] = datetime.datetime(year=entry['updated_parsed'].tm_year,
                                                  month=entry['updated_parsed'].tm_mon,
                                                  day=entry['updated_parsed'].tm_mday)
             
             paper['data'] = paper #messy
             
             papers += [paper]
         except Exception as ex:
             log_error("arxiv: error while reading item: %s" % ex[0])
     
     return papers
Example #10
    def _got_bibtex(self, message, callback, user_data):
        if message.status_code == Soup.KnownStatusCode.OK:
            bibtex_data = message.response_body.flatten().get_data()

            log_debug('Received BibTeX data:\n%s' % bibtex_data)
            paper_info = paper_info_from_bibtex(bibtex_data)
        else:
            log_error('google scholar got status code %d' %
                      message.status_code)
            paper_info = None
        callback(paper_info, None, user_data)
Example #11
 def prepare_search_message(self, search_string):
     
     # If the query already uses arXiv field syntax, don't prepend an
     # "all:" query
     if not re.match(r"^(ti|au|abs|co|jr|cat|rn|id):\w+", search_string):
         search_string = 'all:' + search_string
     
     uri_string = BASE_URL + urllib.urlencode({'sortBy': SORT_BY,
                                               'sortOrder': SORT_ORDER,
                                               'max_results': MAX_RESULTS,
                                               'search_query': search_string})
     
     log_debug("arxiv: requesting %s" % uri_string)
     return Soup.Message.new(method='GET', uri_string=uri_string)
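The constants referenced above are defined elsewhere in the module; a sketch of plausible values for the arXiv API query (assumed, not copied from the source):

# Assumed module-level constants for the arXiv API (illustrative values).
BASE_URL = 'http://export.arxiv.org/api/query?'
SORT_BY = 'relevance'      # the API also accepts 'lastUpdatedDate' / 'submittedDate'
SORT_ORDER = 'descending'
MAX_RESULTS = 20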
Example #12
    def search_async(self, search_text, callback, error_callback):
        '''
        Asynchronously query PubMed for the given search text and deliver the
        results (a list of dictionaries) to `callback`.
        '''

        # First do a query for the IDs only; the result set is saved on the
        # server and referenced by the follow-up summary request
        log_debug('Starting Pubmed query for string "%s"' % search_text)
        query = BASE_URL + ESEARCH_QUERY % urllib.quote_plus(search_text)
        message = Soup.Message.new(method='GET', uri_string=query)

        def mycallback(session, message, user_data):
            self._ids_received(message, callback, error_callback)

        soup_session.queue_message(message, mycallback, None)
Example #13
    def parse_response(self, response):
        parsed = BeautifulStoneSoup(response)
        papers = []
        for result in parsed.find('srw:records').findAll('srw:record'):
            result = result.find('srw:recorddata')
            log_debug('Single result: %s' % result.prettify())

            paper = self._parse_result(result)

            log_debug('JSTOR paper info: %s' % str(paper))

            # Add the full data, useful for later import
            paper['data'] = result
            papers.append(paper)

        return papers
Example #14
    def prepare_search_message(self, search_string):

        # If the query already uses arXiv field syntax, don't prepend an
        # "all:" query
        if not re.match(r"^(ti|au|abs|co|jr|cat|rn|id):\w+", search_string):
            search_string = 'all:' + search_string

        uri_string = BASE_URL + urllib.urlencode({
            'sortBy': SORT_BY,
            'sortOrder': SORT_ORDER,
            'max_results': MAX_RESULTS,
            'search_query': search_string
        })

        log_debug("arxiv: requesting %s" % uri_string)
        return Soup.Message.new(method='GET', uri_string=uri_string)
Example #15
    def parse_response(self, response):
        parsed = BeautifulStoneSoup(response)
        papers = []
        for result in parsed.find('srw:records').findAll('srw:record'):
            result = result.find('srw:recorddata')
            log_debug('Single result: %s' % result.prettify())

            paper = self._parse_result(result)

            log_debug('JSTOR paper info: %s' % str(paper))

            # Add the full data, useful for later import
            paper['data'] = result
            papers.append(paper)

        return papers
Example #16
 def import_paper_after_search(self, paper_obj, callback):
     '''
     This method is called when a search result is requested to be imported.
     The given `paper_obj` is a :class:`VirtualPaper` which has all the
     information previously returned by the search as attributes, e.g.
     `paper_obj.doi` is its DOI. The special attribute `data` should be used
     for information that can be useful for importing the paper, in addition
     to the default paper attributes. For example,
     :class:`GoogleScholarSearch` saves the complete HTML code for a search
     result, which contains a link to BibTeX data and possibly to a PDF
     document.
     
     If this method is not overridden, it asynchronously downloads the
     document given in `import_url` (if any) and passes the original
     `paper_obj` and possibly the PDF document to the callback. In case the
     search provider does not have any info to add to the initial search
     result, this is all that is needed. In cases where the search provider
     can add more information (e.g. the :class:`PubMedSearch` only requests
     summaries for the search, but when a specific paper is requested it
     gets the full record), this method should be overridden.
     '''
     # in case the paper already had an import URL, download from this URL
     if hasattr(paper_obj, 'import_url') and paper_obj.import_url:
         message = Soup.Message.new(method='GET',
                                    uri_string=paper_obj.import_url)
         
         def mycallback(session, message, user_data):
             if message.status_code == Soup.KnownStatusCode.OK:
                 paper_data = message.response_body.flatten().get_data()
                 callback(paper_obj=paper_obj,
                          paper_data=paper_data,
                          user_data=user_data)
             else:
                 log_error("%: got status %s while trying to fetch PDF" % (self.__class__.__name__,
                                                                           message.status_code))
                 callback(paper_obj=paper_obj, user_data=user_data)
         
         log_debug("%s: trying to fetch %s" % (self.__class__.__name__,
                                               paper_obj.import_url))
         soup_session.queue_message(message, mycallback,
                                    (self.label, paper_obj.import_url))
     else:
         callback(paper_obj=paper_obj, user_data=self.label)
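As a rough illustration of the override case described in the docstring, a provider that has to fetch the full record first could look roughly like the sketch below; the base class name `SearchProvider` and the helper `_fetch_full_record` are assumptions, not names from the original code:

# Hypothetical sketch; `SearchProvider` and `_fetch_full_record` are assumed names.
class FullRecordSearch(SearchProvider):

    def import_paper_after_search(self, paper_obj, callback):
        def record_received(paper_data):
            # Hand the enriched result back to the GUI.
            callback(paper_obj=paper_obj, paper_data=paper_data,
                     user_data=self.label)

        # Ask the provider's backend for the complete record first.
        self._fetch_full_record(paper_obj, record_received)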
Example #17
    def import_paper_after_search(self, paper, callback):
        log_info('Trying to import google scholar citation')
        try:
            data = paper.data
            citations = data.findAll('div', {'class': 'gs_fl'})[0]
            log_debug('Citations: %s' % str(citations))
            for link in citations.findAll('a'):
                log_debug('Link: %s' % str(link))
                if link['href'].startswith('/scholar.bib'):
                    log_debug('Found BibTex link: %s' % link['href'])

                    def bibtex_callback(session, message, user_data):
                        self._got_bibtex(message, callback, user_data)

                    message = Soup.Message.new(method='GET',
                                               uri_string=BASE_URL +
                                               link['href'])
                    message.request_headers.append(
                        'Cookie', 'GSP=ID=%s:CF=4' % self.google_id)
                    soup_session.queue_message(message, bibtex_callback,
                                               self.label)
                    #FIXME: Google scholar does not always seem to include the
                    #       URL in the bibtex data -- in this case add a link
        except:
            traceback.print_exc()
Example #18
    def search(self, search_string, callback, error_callback):
        '''
        This method will be called by the GUI with the `search_string` when a
        search is initiated. Returns search results from the cache or initiates
        a new search using :meth:`search_async` if the search has not been
        performed before. Before calling the `callback`, saves the search
        results to the cache.
        
        This method should normally not be overridden.
        '''
        # A tuple identifying the search, making it possible for the callback
        # function to deal with the results properly (otherwise results arriving
        # out of order could lead to wrongly displayed results)
        user_data = (self.label, search_string)

        if not search_string:
            callback(user_data, [])
            return

        if search_string in self.search_cache:
            log_debug('Result for "%s" already in cache.' % search_string)
            callback(user_data, self.search_cache[search_string])
            return

        log_info('Search for "%s" is not cached by this provider, starting new search' % search_string)

        try:
            def callback_wrapper(search_results):
                '''
                Before calling the actual callback, save the result in the
                cache and add `user_data` (tuple identifying request and search
                provider) to the call.
                '''
                log_debug('Saving %s in cache for "%s"' % (search_results, search_string))
                self.search_cache[search_string] = search_results
                callback(user_data, search_results)

            self.search_async(search_string, callback_wrapper, error_callback)
        except Exception as ex:
            error_callback(ex, None)
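A minimal caller-side sketch of this flow; :class:`GoogleScholarSearch` is mentioned in an earlier docstring, but the callback functions here are assumptions:

# Hypothetical caller sketch (callback names are invented).
def show_results(user_data, results):
    label, query = user_data
    print('%s returned %d results for "%s"' % (label, len(results), query))

def show_error(exception, user_data):
    print('Search failed: %s' % exception)

provider = GoogleScholarSearch()
provider.search('synaptic plasticity', show_results, show_error)
# Repeating the same query is answered from provider.search_cache without a
# new network request.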
Example #19
    def _ids_received(self, message, callback, error_callback):

        if not message.status_code == Soup.KnownStatusCode.OK:
            error_callback('Pubmed replied with error code %d.' % message.status_code)
        else:
            response_data = message.response_body.flatten().get_data()
            parsed_response = BeautifulSoup.BeautifulStoneSoup(response_data)

            # Check whether there were any hits at all
            if int(parsed_response.esearchresult.count.string) == 0:
                return # Nothing to do anymore

            # Continue with a second request asking for the summaries
            web_env = parsed_response.esearchresult.webenv.string
            query_key = parsed_response.esearchresult.querykey.string
            log_debug('Continuing Pubmed query (downloading summaries)')
            query = BASE_URL + ESUMMARY_QUERY % (query_key, web_env)

            message = Soup.Message.new(method='GET', uri_string=query)

            def mycallback(session, message, user_data):
                self._summaries_received(message, callback, error_callback)

            soup_session.queue_message(message, mycallback, None)
Example #20
    def import_paper_after_search(self, paper, callback):
        log_info('Trying to import google scholar citation')
        try:
            data = paper.data
            citations = data.findAll('div', {'class': 'gs_fl'})[0]
            log_debug('Citations: %s' % str(citations))
            for link in citations.findAll('a'):
                log_debug('Link: %s' % str(link))
                if link['href'].startswith('/scholar.bib'):
                    log_debug('Found BibTex link: %s' % link['href'])

                    def bibtex_callback(session, message, user_data):
                        self._got_bibtex(message, callback, user_data)

                    message = Soup.Message.new(method='GET',
                                               uri_string=BASE_URL + link['href'])
                    message.request_headers.append('Cookie',
                                       'GSP=ID=%s:CF=4' % self.google_id)
                    soup_session.queue_message(message, bibtex_callback,
                                               self.label)
                    #FIXME: Google scholar does not always seem to include the
                    #       URL in the bibtex data -- in this case add a link
        except:
            traceback.print_exc()
Example #21
    def _paper_info_received(self, message, callback, user_data):
        if not message.status_code == Soup.KnownStatusCode.OK:
            log_error('Pubmed replied with error code %d for paper_info with id: %s' % \
                      (message.status_code, user_data[1]))
            paper_info = None
        else:
            parsed_response = BeautifulStoneSoup(message.response_body.data)
            paper_info = {}

            # Journal
            try:
                journal = parsed_response.findAll('journal')[0]
                paper_info['journal'] = journal.findAll('title')[0].text
                try:
                    paper_info['issue'] = journal.findAll('issue')[0].text
                except:
                    pass

                paper_info['pages'] = parsed_response.findAll('medlinepgn')[0].text
                log_debug('Pages: %s' % paper_info['pages'])
            except Exception as ex:
                pass

            # Publication date
            try:
                articledate = parsed_response.findAll('articledate')[0]
                paper_info['year'] = articledate.year.text
            except:
                pass

            # Title and abstract
            try:
                paper_info['title'] = parsed_response.findAll('articletitle')[0].text
                log_debug('Title: %s' % paper_info['title'])
                paper_info['abstract'] = parsed_response.findAll('abstracttext')[0].text
                log_debug('Abstract: %s' % paper_info['abstract'])
            except Exception as ex:
                pass

            # Authors
            try:
                all_authors = []
                authors = parsed_response.findAll('author')
                for author in authors:
                    author_name = author.forename.text + ' ' + \
                                            author.lastname.text
                    log_debug('\tAuthor: %s' % author_name)
                    all_authors.append(author_name)
                if all_authors:
                    paper_info['authors'] = all_authors
            except Exception as ex:
                pass

            # URL + IDs
            try:
                articleids = parsed_response.findAll('articleid')
                for articleid in articleids:
                    if articleid['idtype'] == 'doi':
                        paper_info['doi'] = articleid.text
                    elif articleid['idtype'] == 'pubmed':
                        paper_info['pubmed_id'] = articleid.text
            except:
                pass

        callback(paper_info=paper_info, user_data=user_data)
Example #22
def get_paper_info_from_pdf(data):
    fp = BytesIO(data)
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument()
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.set_parser(parser)
    # Initialize
    doc.initialize()
    # Extract the document info dictionary from the PDF metadata, if present
    info = None
    for xref in doc.xrefs:
        info_ref = xref.trailer.get('Info')
        if info_ref:
            info = resolve1(info_ref)

    paper_info = {}
    if info:
        authors = info.get('Author')
        if authors:
            if ';' in authors:
                author_list = authors.split(';')
            elif ' AND ' in authors:
                author_list = authors.split(' AND ')
            elif ',' in authors:
                #FIXME: This cuts 'LastName, FirstName' in two...
                author_list = authors.split(',')
            else:
                author_list = [authors]

            paper_info['authors'] = author_list
        title = info.get('Title')
        if title:
            # Some PDFs have the doi as a title
            if title.lower().startswith('doi:'):
                paper_info['doi'] = title[4:]
            else:
                paper_info['title'] = title

        #TODO: Additional metadata?
        #TODO: What about embedded BibTeX (as done by JabRef)?

    #Extract text
    rsrcmgr = PDFResourceManager()
    content = cStringIO.StringIO()
    device = TextConverter(rsrcmgr,
                           content,
                           codec='utf-8',
                           laparams=LAParams())
    process_pdf(rsrcmgr, device, fp, check_extractable=True, caching=True)

    paper_info['extracted_text'] = content.getvalue()

    if 'doi' not in paper_info:  # Try to find a DOI in the text
        doi = p_doi.search(paper_info['extracted_text'])
        if doi is not None:
            doi = doi.group(1)
            log_debug('Found a DOI: %s' % doi)
            paper_info['doi'] = doi

    device.close()
    content.close()

    log_debug('Extracted paper_info from PDF: %s' % paper_info)

    return paper_info
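A minimal usage sketch for the function above, assuming a PDF file on disk (the file name is illustrative):

# Illustrative call (file name is made up).
with open('some_paper.pdf', 'rb') as pdf_file:
    info = get_paper_info_from_pdf(pdf_file.read())
print('DOI: %s, title: %s' % (info.get('doi'), info.get('title')))
print('%d characters of text extracted' % len(info['extracted_text']))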
Example #23
    def _paper_info_received(self, message, callback, user_data):
        if not message.status_code == Soup.KnownStatusCode.OK:
            log_error('Pubmed replied with error code %d for paper_info with id: %s' % \
                      (message.status_code, user_data[1]))
            paper_info = None
        else:
            parsed_response = BeautifulStoneSoup(message.response_body.data)
            paper_info = {}

            # Journal
            try:
                journal = parsed_response.findAll('journal')[0]
                paper_info['journal'] = journal.findAll('title')[0].text
                try:
                    paper_info['issue'] = journal.findAll('issue')[0].text
                except:
                    pass

                paper_info['pages'] = parsed_response.findAll(
                    'medlinepgn')[0].text
                log_debug('Pages: %s' % paper_info['pages'])
            except Exception as ex:
                pass

            # Publication date
            try:
                articledate = parsed_response.findAll('articledate')[0]
                paper_info['year'] = articledate.year.text
            except:
                pass

            # Title and abstract
            try:
                paper_info['title'] = parsed_response.findAll(
                    'articletitle')[0].text
                log_debug('Title: %s' % paper_info['title'])
                paper_info['abstract'] = parsed_response.findAll(
                    'abstracttext')[0].text
                log_debug('Abstract: %s' % paper_info['abstract'])
            except Exception as ex:
                pass

            # Authors
            try:
                all_authors = []
                authors = parsed_response.findAll('author')
                for author in authors:
                    author_name = author.forename.text + ' ' + \
                                            author.lastname.text
                    log_debug('\tAuthor: %s' % author_name)
                    all_authors.append(author_name)
                if all_authors:
                    paper_info['authors'] = all_authors
            except Exception as ex:
                pass

            # URL + IDs
            try:
                articleids = parsed_response.findAll('articleid')
                for articleid in articleids:
                    if articleid['idtype'] == 'doi':
                        paper_info['doi'] = articleid.text
                    elif articleid['idtype'] == 'pubmed':
                        paper_info['pubmed_id'] = articleid.text
            except:
                pass

        callback(paper_info=paper_info, user_data=user_data)
Example #24
def get_paper_info_from_pdf(data):
    fp = BytesIO(data)
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument()
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.set_parser(parser)
    # Initialize
    doc.initialize()
    # Extract the document info dictionary from the PDF metadata, if present
    info = None
    for xref in doc.xrefs:
        info_ref = xref.trailer.get('Info')
        if info_ref:
            info = resolve1(info_ref)

    paper_info = {}
    if info:
        authors = info.get('Author')
        if authors:
            if ';' in authors:
                author_list = authors.split(';')
            elif ' AND ' in authors:
                author_list = authors.split(' AND ')
            elif ',' in authors:
                #FIXME: This cuts 'LastName, FirstName' in two...
                author_list = authors.split(',')
            else:
                author_list = [authors]

            paper_info['authors'] = author_list
        title = info.get('Title')
        if title:
            # Some PDFs have the doi as a title
            if title.lower().startswith('doi:'):
                paper_info['doi'] = title[4:]
            else:
                paper_info['title'] = title

        #TODO: Additional metadata?
        #TODO: What about embedded BibTeX (as done by JabRef)?

    #Extract text
    rsrcmgr = PDFResourceManager()
    content = cStringIO.StringIO()
    device = TextConverter(rsrcmgr, content, codec='utf-8', laparams=LAParams())
    process_pdf(rsrcmgr, device, fp, check_extractable=True, caching=True)

    paper_info['extracted_text'] = content.getvalue()

    if 'doi' not in paper_info:  # Try to find a DOI in the text
        doi = p_doi.search(paper_info['extracted_text'])
        if doi is not None:
            doi = doi.group(1)
            log_debug('Found a DOI: %s' % doi)
            paper_info['doi'] = doi

    device.close()
    content.close()

    log_debug('Extracted paper_info from PDF: %s' % paper_info)

    return paper_info