Example #1
    def search_async(self, search_string, callback, error_callback):
        '''
        Asynchronously search for `search_string` and hand over a list of
        search results to the callback. Each search result is a dictionary
        containing all the information that could be fetched from the
        webpage, e.g.::

            [{'title': 'A paper title', 'authors': ['Author A', 'Author B']},
             {'title': 'Another paper', 'authors': ['Author C'],
              'import_url': 'http://example.com/paper.pdf'}]

        In addition, each paper can contain arbitrary additional data as the
        value for a 'data' key. This can, for example, be used to save the
        full HTML code of a search result (which might be useful for a later
        import of this paper), as opposed to only the extracted information.
        This method should not block but use the :class:`AsyncSoupSession`
        object `importer.soup_session` for getting the information.
        '''

        try:
            # Call the method defined in the subclass
            message = self.prepare_search_message(search_string)

            def my_callback(session, message, user_data):
                self.handle_response_received(message, callback, error_callback)

            soup_session.queue_message(message, my_callback, None)
        except Exception as ex:
            error_callback(ex, search_string)
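
To illustrate the contract described in the docstring, here is a minimal sketch of a caller. The `provider` instance and both callback functions are hypothetical and not part of the original code:

    def on_results(results):
        # `results` is the list of dictionaries described in the docstring
        for paper in results:
            print('%s (%s)' % (paper['title'],
                               ', '.join(paper.get('authors', []))))

    def on_error(exception, search_string):
        print('Search for "%s" failed: %s' % (search_string, exception))

    # `provider` is assumed to be an instance of a concrete search provider
    provider.search_async('ion channels', on_results, on_error)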
Example #2
    def import_paper_after_search(self, paper, callback):
        log_info('Trying to import Google Scholar citation')
        try:
            data = paper.data
            citations = data.findAll('div', {'class': 'gs_fl'})[0]
            log_debug('Citations: %s' % str(citations))
            for link in citations.findAll('a'):
                log_debug('Link: %s' % str(link))
                if link['href'].startswith('/scholar.bib'):
                    log_debug('Found BibTeX link: %s' % link['href'])

                    def bibtex_callback(session, message, user_data):
                        self._got_bibtex(message, callback, user_data)

                    message = Soup.Message.new(method='GET',
                                               uri_string=BASE_URL +
                                               link['href'])
                    message.request_headers.append(
                        'Cookie', 'GSP=ID=%s:CF=4' % self.google_id)
                    soup_session.queue_message(message, bibtex_callback,
                                               self.label)
                    #FIXME: Google scholar does not always seem to include the
                    #       URL in the bibtex data -- in this case add a link
        except Exception:
            traceback.print_exc()
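
The `_got_bibtex` handler is not part of this excerpt. A plausible sketch, assuming it only needs to check the status code and hand the raw BibTeX text on to the callback (the actual implementation may differ):

    def _got_bibtex(self, message, callback, user_data):
        # Hypothetical sketch; the real handler is not shown in the excerpt
        if message.status_code == Soup.KnownStatusCode.OK:
            bibtex = message.response_body.flatten().get_data()
            callback(bibtex, user_data=user_data)
        else:
            log_error('Fetching BibTeX data failed with status %s' %
                      message.status_code)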
Example #3
    def _ids_received(self, message, callback, error_callback):

        if message.status_code != Soup.KnownStatusCode.OK:
            error_callback('PubMed replied with error code %d.' %
                           message.status_code)
        else:
            response_data = message.response_body.flatten().get_data()
            parsed_response = BeautifulSoup.BeautifulStoneSoup(response_data)

            # Check whether there were any hits at all
            if int(parsed_response.esearchresult.count.string) == 0:
                return  # Nothing to do anymore

            # Continue with a second request asking for the summaries
            web_env = parsed_response.esearchresult.webenv.string
            query_key = parsed_response.esearchresult.querykey.string
            log_debug('Continuing PubMed query (downloading summaries)')
            query = BASE_URL + ESUMMARY_QUERY % (query_key, web_env)

            message = Soup.Message.new(method='GET', uri_string=query)

            def mycallback(session, message, user_data):
                self._summaries_received(message, callback, error_callback)

            soup_session.queue_message(message, mycallback, None)
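
The query constants are defined elsewhere in the module. A sketch of plausible values, based on the public NCBI E-utilities interface (with `usehistory=y`, the server stores the result set and returns the `WebEnv`/`QueryKey` pair used above); the exact strings in the original module may differ:

    BASE_URL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
    ESEARCH_QUERY = 'esearch.fcgi?db=pubmed&usehistory=y&term=%s'
    ESUMMARY_QUERY = 'esummary.fcgi?db=pubmed&query_key=%s&WebEnv=%s'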
Example #4
    def import_paper_after_search(self, paper, callback):
        pubmed_id = paper.data
        log_info('Trying to import PubMed citation with id %s' % pubmed_id)
        query = BASE_URL + EFETCH_QUERY % pubmed_id
        message = Soup.Message.new(method='GET', uri_string=query)

        def mycallback(session, message, user_data):
            self._paper_info_received(message, callback, user_data)

        soup_session.queue_message(message, mycallback,
                                   (self.label, pubmed_id))
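
`EFETCH_QUERY` is likewise defined elsewhere; a plausible value following the same E-utilities scheme (an assumption, not the original constant):

    EFETCH_QUERY = 'efetch.fcgi?db=pubmed&retmode=xml&id=%s'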
Example #5
    def search_async(self, search_text, callback, error_callback):
        '''
        Asynchronously search PubMed for `search_text` and hand over the
        results, a list of dictionaries, to the callback.
        '''

        # First, query only for the IDs; the result set is stored on the
        # server and fetched via WebEnv/QueryKey in the second step
        log_debug('Starting PubMed query for string "%s"' % search_text)
        query = BASE_URL + ESEARCH_QUERY % urllib.quote_plus(search_text)
        message = Soup.Message.new(method='GET', uri_string=query)

        def mycallback(session, message, user_data):
            self._ids_received(message, callback, error_callback)

        soup_session.queue_message(message, mycallback, None)
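
With the `ESEARCH_QUERY` value sketched earlier, a search for 'ion channels' would queue a GET request for a URL of roughly this form (spaces are encoded by `urllib.quote_plus`):

    http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&usehistory=y&term=ion+channels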
Example #6
    def import_paper_after_search(self, paper_obj, callback):
        '''
        This method is called when a search result is requested to be
        imported. The given `paper_obj` is a :class:`VirtualPaper` which has
        all the information previously returned by the search as attributes,
        e.g. `paper_obj.doi` is its DOI. The special attribute `data` should
        be used for information that can be useful for importing the paper,
        in addition to the default paper attributes. For example,
        :class:`GoogleScholarSearch` saves the complete HTML code for a
        search result, which contains a link to BibTeX data and possibly to
        a PDF document.

        If this method is not overridden, it asynchronously downloads the
        document given in `import_url` (if any) and hands the original
        `paper_obj`, and possibly the PDF document, to the callback. In case
        the search provider does not have any info to add to the initial
        search result, this is all that is needed. In cases where the search
        provider can add more information (e.g. :class:`PubMedSearch` only
        requests summaries during the search, but when a specific paper is
        requested it fetches the full record), this method should be
        overridden.
        '''
        # In case the paper already has an import URL, download from it
        if hasattr(paper_obj, 'import_url') and paper_obj.import_url:
            message = Soup.Message.new(method='GET',
                                       uri_string=paper_obj.import_url)

            def mycallback(session, message, user_data):
                if message.status_code == Soup.KnownStatusCode.OK:
                    paper_data = message.response_body.flatten().get_data()
                    callback(paper_obj=paper_obj,
                             paper_data=paper_data,
                             user_data=user_data)
                else:
                    log_error("%s: got status %s while trying to fetch "
                              "PDF" % (self.__class__.__name__,
                                       message.status_code))
                    callback(paper_obj=paper_obj, user_data=user_data)

            log_debug("%s: trying to fetch %s" % (self.__class__.__name__,
                                                  paper_obj.import_url))
            soup_session.queue_message(message, mycallback,
                                       (self.label, paper_obj.import_url))
        else:
            callback(paper_obj=paper_obj, user_data=self.label)
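
A sketch of a callback matching the keyword arguments used above; the callback name and the `save_document` helper are hypothetical:

    def import_done(paper_obj=None, paper_data=None, user_data=None):
        if paper_data is not None:
            # `paper_data` holds the raw bytes fetched from `import_url`,
            # typically a PDF document
            save_document(paper_obj, paper_data)  # hypothetical helper
        else:
            log_info('No document was fetched for "%s"' % paper_obj.title)

    provider.import_paper_after_search(paper, import_done)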