def _ids_received(self, message, callback, error_callback):
    if message.status_code != Soup.KnownStatusCode.OK:
        error_callback('Pubmed replied with error code %d.' %
                       message.status_code)
    else:
        response_data = message.response_body.flatten().get_data()
        parsed_response = BeautifulSoup.BeautifulStoneSoup(response_data)
        # Check whether there were any hits at all
        if int(parsed_response.esearchresult.count.string) == 0:
            return  # Nothing to do anymore

        # Continue with a second request asking for the summaries
        web_env = parsed_response.esearchresult.webenv.string
        query_key = parsed_response.esearchresult.querykey.string
        log_debug('Continuing Pubmed query (downloading summaries)')
        query = BASE_URL + ESUMMARY_QUERY % (query_key, web_env)
        message = Soup.Message.new(method='GET', uri_string=query)

        def mycallback(session, message, user_data):
            self._summaries_received(message, callback, error_callback)

        soup_session.queue_message(message, mycallback, None)
def parse_response(self, response):
    node = BeautifulSoup.BeautifulSoup(
        response, convertEntities=BeautifulSoup.BeautifulSoup.HTML_ENTITIES)
    papers = []
    for result in node.findAll('div', attrs={'class': 'gs_r'}):
        paper = {}
        try:
            title_node = result.findAll('h3', attrs={'class': 'gs_rt'})[0]
            # Can be a link or plain text
            title_link = title_node.findAll('a')
            if title_link:
                log_debug('title_link: %s' % title_link[0].prettify())
                paper['title'] = title_link[0].string
                paper['import_url'] = title_link[0]['href']
            else:
                paper['title'] = title_node.string
                paper['import_url'] = ''
            if not paper['import_url'].startswith('http'):
                paper['import_url'] = BASE_URL + paper['import_url']
            try:
                author_journal_publisher = result.findAll(
                    'div', attrs={'class': 'gs_a'})[0]
                log_debug('Author string: %s' %
                          str(author_journal_publisher.text))
                authors, journal, publisher = \
                    author_journal_publisher.text.split(' - ')
                paper['authors'] = authors.split(',')
                journal_year = journal.split(',')
                if len(journal_year) == 2:
                    paper['journal'] = journal_year[0]
                    paper['year'] = journal_year[1]
                elif len(journal_year) == 1:  # might be a year or a journal
                    try:
                        paper['year'] = str(int(journal_year[0]))
                    except ValueError:
                        paper['journal'] = journal_year[0]
                paper['publisher'] = publisher
            except:
                pass
            try:
                paper['abstract'] = html_strip(
                    result.findAll('div', attrs='gs_rs')[0].text)
            except:
                pass
            # Also attach the html data so it can be used later for
            # importing the document
            paper['data'] = result
        except:
            traceback.print_exc()
        papers.append(paper)
    return papers
def save_file(self, filename, raw_contents, save=True):
    log_debug('Generating md5 sum')
    self.full_text_md5 = hashlib.md5(raw_contents).hexdigest()
    log_debug('Saving file content')
    self.full_text.save(filename,
                        django.core.files.base.ContentFile(raw_contents),
                        save)
    self.save()
def parse_response(self, response): """ Parse the arXiv response, which is in Atom XML format. The feed provides itself provides more-or-less all the information required without needing any extra requests. """ papers = [] try: parsed = feedparser.parse(response) except Exception as ex: log_error("arxiv: error while parsing response: %s" % ex[0]) return papers log_debug("arxiv: received response containing %d results" % len(parsed.entries)) for entry in parsed.entries: paper = {} try: paper['title'] = entry['title'] for link in entry['links']: if link.get('title', None) == 'pdf': paper['import_url'] = link['href'] break paper['authors'] = [a['name'] for a in entry['authors']] if 'arxiv_journal_ref' in entry: paper['journal'] = entry['arxiv_journal_ref'] if 'arxiv_doi' in entry: paper['doi'] = entry['arxiv_doi'] if 'arxiv_comment' in entry: paper['notes'] = entry['arxiv_comment'] paper['year'] = entry['published_parsed'].tm_year paper['arxiv_id'] = entry['id'] paper['url'] = entry['id'] paper['abstract'] = entry['summary'].replace('\n', ' ') if 'arxiv_primary_category' in entry: paper['arxiv_type'] = entry['arxiv_primary_category'].get( 'term', '') paper['created'] = datetime.datetime( year=entry['published_parsed'].tm_year, month=entry['published_parsed'].tm_mon, day=entry['published_parsed'].tm_mday) paper['updated'] = datetime.datetime( year=entry['updated_parsed'].tm_year, month=entry['updated_parsed'].tm_mon, day=entry['updated_parsed'].tm_mday) paper['data'] = paper #messy papers += [paper] except Exception as ex: log_error("arxiv: error while reading item: %s" % ex[0]) return papers
def _got_bibtex(self, message, callback, user_data):
    if message.status_code == Soup.KnownStatusCode.OK:
        bibtex_data = message.response_body.flatten().get_data()
        log_debug('Received BibTeX data:\n%s' % bibtex_data)
        paper_info = paper_info_from_bibtex(bibtex_data)
    else:
        log_error('google scholar got status code %d' % message.status_code)
        paper_info = None
    callback(paper_info, None, user_data)
def paper_info_from_bibtex(data):
    if data is None:
        return {}

    # IEEE puts <br>s in their BibTeX
    data = data.replace('<br>', '\n')

    paper_info = {}
    result = parse_str(data)
    if len(result) == 0:
        log_warn('Could not parse BibTeX data')
        return {}
    bibtex = {}
    # FIXME: This does not handle special cases well...
    for i, r in enumerate(result[0][2:]):
        bibtex[r[0].lower()] = ''.join([str(r_part) for r_part in r[1:]])

    # Strip the DOI resolver prefixes that some publishers (e.g. ACM)
    # include in their BibTeX
    if bibtex.get('doi', '').startswith('http://dx.doi.org/'):
        bibtex['doi'] = bibtex['doi'][len('http://dx.doi.org/'):]
    if bibtex.get('doi', '').startswith('http://doi.acm.org/'):
        bibtex['doi'] = bibtex['doi'][len('http://doi.acm.org/'):]

    # Mappings from BibTeX to our keys
    # TODO: Handle more fields
    mappings = {'doi': 'doi',
                'url': 'import_url',
                'title': 'title',
                'pages': 'pages',
                'abstract': 'abstract',
                'journal': 'journal',
                'year': 'year',
                'publisher': 'publisher'}

    for bibtex_key, our_key in mappings.items():
        if bibtex_key in bibtex:
            log_debug('Have key %s' % bibtex_key)
            # Replace newlines with spaces and remove superfluous spaces
            paper_info[our_key] = bibtex[bibtex_key].replace('\n', ' ').strip()

    # TODO: Handle editors, etc.?
    if 'author' in bibtex:
        if ' AND ' in bibtex['author']:
            paper_info['authors'] = bibtex['author'].split(' AND ')
        else:
            paper_info['authors'] = bibtex['author'].split(' and ')

    paper_info['bibtex'] = data

    log_info('imported paper_info: %s\nFrom bibtex: %s' % (str(paper_info),
                                                           str(bibtex)))
    return paper_info
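# Usage sketch (not part of the original module; the entry below and the
# helper name `_example_bibtex_import` are illustrative placeholders): shows
# which keys paper_info_from_bibtex() is expected to fill for a typical entry,
# assuming parse_str() handles it.
def _example_bibtex_import():
    sample = """@article{doe2010example,
    author = {Doe, John and Smith, Jane},
    title = {An example article},
    journal = {Journal of Examples},
    year = {2010},
    doi = {http://dx.doi.org/10.1000/xyz123}
}"""
    info = paper_info_from_bibtex(sample)
    # Roughly expected: 'title', 'journal', 'year', a 'doi' with the resolver
    # prefix stripped, 'authors' as a two-element list (split on ' and '),
    # and 'bibtex' holding the raw string.
    return info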
def parse_response(self, response): """ Parse the arXiv response, which is in Atom XML format. The feed provides itself provides more-or-less all the information required without needing any extra requests. """ papers = [] try: parsed = feedparser.parse(response) except Exception as ex: log_error("arxiv: error while parsing response: %s" % ex[0]) return papers log_debug("arxiv: received response containing %d results" % len(parsed.entries)) for entry in parsed.entries: paper = {} try: paper['title'] = entry['title'] for link in entry['links']: if link.get('title', None) == 'pdf': paper['import_url'] = link['href'] break paper['authors'] = [a['name'] for a in entry['authors']] if 'arxiv_journal_ref' in entry: paper['journal'] = entry['arxiv_journal_ref'] if 'arxiv_doi' in entry: paper['doi'] = entry['arxiv_doi'] if 'arxiv_comment' in entry: paper['notes'] = entry['arxiv_comment'] paper['year'] = entry['published_parsed'].tm_year paper['arxiv_id'] = entry['id'] paper['url'] = entry['id'] paper['abstract'] = entry['summary'].replace('\n', ' ') if 'arxiv_primary_category' in entry: paper['arxiv_type'] = entry['arxiv_primary_category'].get('term', '') paper['created'] = datetime.datetime(year=entry['published_parsed'].tm_year, month=entry['published_parsed'].tm_mon, day=entry['published_parsed'].tm_mday) paper['updated'] = datetime.datetime(year=entry['updated_parsed'].tm_year, month=entry['updated_parsed'].tm_mon, day=entry['updated_parsed'].tm_mday) paper['data'] = paper #messy papers += [paper] except Exception as ex: log_error("arxiv: error while reading item: %s" % ex[0]) return papers
def prepare_search_message(self, search_string):
    # If the query already uses arXiv syntax, don't add an
    # "all:" query on the front
    if not re.match(r"^(ti|au|abs|co|jr|cat|rn|id):\w+", search_string):
        search_string = 'all:' + search_string
    uri_string = BASE_URL + urllib.urlencode({'sortBy': SORT_BY,
                                              'sortOrder': SORT_ORDER,
                                              'max_results': MAX_RESULTS,
                                              'search_query': search_string})
    log_debug("arxiv: requesting %s" % uri_string)
    return Soup.Message.new(method='GET', uri_string=uri_string)
def search_async(self, search_text, callback, error_callback):
    '''
    Asynchronously searches PubMed for the given query string. The results
    (a list of dictionaries) are passed to `callback`.
    '''
    # First do a query that only returns ids; the result set is kept on the
    # server (WebEnv/query_key) and fetched by the follow-up esummary request
    log_debug('Starting Pubmed query for string "%s"' % search_text)
    query = BASE_URL + ESEARCH_QUERY % urllib.quote_plus(search_text)
    message = Soup.Message.new(method='GET', uri_string=query)

    def mycallback(session, message, user_data):
        self._ids_received(message, callback, error_callback)

    soup_session.queue_message(message, mycallback, None)
def parse_response(self, response):
    parsed = BeautifulStoneSoup(response)
    papers = []
    for result in parsed.find('srw:records').findAll('srw:record'):
        result = result.find('srw:recorddata')
        log_debug('Single result: %s' % result.prettify())
        paper = self._parse_result(result)
        log_debug('JSTOR paper info: %s' % str(paper))
        # Add the full data, useful for later import
        paper['data'] = result
        papers.append(paper)
    return papers
def import_paper_after_search(self, paper_obj, callback):
    '''
    This method is called when a search result is requested to be imported.
    The given `paper_obj` is a :class:`VirtualPaper` which has all the
    information previously returned by the search as attributes, e.g.
    `paper_obj.doi` is its DOI. The special attribute `data` should be used
    for information that can be useful for importing the paper, in addition
    to the default paper attributes. For example, :class:`GoogleScholarSearch`
    saves the complete HTML code for a search result, which contains a link
    to BibTeX data and possibly to a PDF document.

    If this method is not overridden, it asynchronously downloads a document
    given in `import_url` (if any) and passes the original `paper_obj` and
    possibly the PDF document to the callback. In case the search provider
    does not have any info to add to the initial search result, this is all
    that is needed. In cases where the search provider can add more
    information (e.g. :class:`PubMedSearch` only requests summaries for the
    search, but when a specific paper is requested it gets the full record),
    this method should be overridden.
    '''
    # In case the paper already has an import URL, download from this URL
    if hasattr(paper_obj, 'import_url') and paper_obj.import_url:
        message = Soup.Message.new(method='GET',
                                   uri_string=paper_obj.import_url)

        def mycallback(session, message, user_data):
            if message.status_code == Soup.KnownStatusCode.OK:
                paper_data = message.response_body.flatten().get_data()
                callback(paper_obj=paper_obj, paper_data=paper_data,
                         user_data=user_data)
            else:
                log_error('%s: got status %s while trying to fetch PDF' %
                          (self.__class__.__name__, message.status_code))
                callback(paper_obj=paper_obj, user_data=user_data)

        log_debug('%s: trying to fetch %s' % (self.__class__.__name__,
                                              paper_obj.import_url))
        soup_session.queue_message(message, mycallback,
                                   (self.label, paper_obj.import_url))
    else:
        callback(paper_obj=paper_obj, user_data=self.label)
def import_paper_after_search(self, paper, callback):
    log_info('Trying to import google scholar citation')
    try:
        data = paper.data
        citations = data.findAll('div', {'class': 'gs_fl'})[0]
        log_debug('Citations: %s' % str(citations))
        for link in citations.findAll('a'):
            log_debug('Link: %s' % str(link))
            if link['href'].startswith('/scholar.bib'):
                log_debug('Found BibTex link: %s' % link['href'])

                def bibtex_callback(session, message, user_data):
                    self._got_bibtex(message, callback, user_data)

                message = Soup.Message.new(method='GET',
                                           uri_string=BASE_URL + link['href'])
                message.request_headers.append(
                    'Cookie', 'GSP=ID=%s:CF=4' % self.google_id)
                soup_session.queue_message(message, bibtex_callback,
                                           self.label)
        # FIXME: Google scholar does not always seem to include the
        # URL in the bibtex data -- in this case add a link
    except:
        traceback.print_exc()
def search(self, search_string, callback, error_callback):
    '''
    This method will be called by the GUI with the `search_string` when a
    search is initiated. Returns search results from the cache or initiates
    a new search using :meth:`search_async` if the search has not been
    performed before. Before calling the `callback`, saves the search
    results to the cache.

    This method should normally not be overridden.
    '''
    # A tuple identifying the search, making it possible for the callback
    # function to deal with the results properly (otherwise results arriving
    # out of order could lead to wrongly displayed results)
    user_data = (self.label, search_string)

    if not search_string:
        callback(user_data, [])
        return

    if search_string in self.search_cache:
        log_debug('Result for "%s" already in cache.' % search_string)
        callback(user_data, self.search_cache[search_string])
        return

    log_info('Search for "%s" is not cached by this provider, '
             'starting new search' % search_string)

    try:
        def callback_wrapper(search_results):
            '''
            Before calling the actual callback, save the result in the cache
            and add `user_data` (tuple identifying request and search
            provider) to the call.
            '''
            log_debug('Saving %s in cache for "%s"' % (search_results,
                                                       search_string))
            self.search_cache[search_string] = search_results
            callback(user_data, search_results)

        self.search_async(search_string, callback_wrapper, error_callback)
    except Exception as ex:
        error_callback(ex, None)
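# Provider sketch (illustration only; `DummySearch` is not part of the project
# and the real base class is not shown here): a minimal provider needs a
# `label`, a `search_cache` dict and a search_async() implementation for the
# generic search() method above to work.
class DummySearch(object):
    # In the project this class would derive from the search provider base
    # class that defines search() and import_paper_after_search().
    label = 'Dummy'

    def __init__(self):
        self.search_cache = {}

    def search_async(self, search_string, callback, error_callback):
        # A real provider queues an asynchronous HTTP request here (see the
        # Pubmed and arXiv providers) and calls `callback` with a list of
        # result dictionaries from the response handler.
        callback([{'title': 'Example result for "%s"' % search_string,
                   'authors': ['A. Author'],
                   'import_url': ''}])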
def _paper_info_received(self, message, callback, user_data):
    if message.status_code != Soup.KnownStatusCode.OK:
        log_error('Pubmed replied with error code %d for paper_info '
                  'with id: %s' % (message.status_code, user_data[1]))
        paper_info = None
    else:
        parsed_response = BeautifulStoneSoup(message.response_body.data)
        paper_info = {}
        # Journal
        try:
            journal = parsed_response.findAll('journal')[0]
            paper_info['journal'] = journal.findAll('title')[0].text
            try:
                paper_info['issue'] = journal.findAll('issue')[0].text
            except:
                pass
            paper_info['pages'] = parsed_response.findAll('medlinepgn')[0].text
            log_debug('Pages: %s' % paper_info['pages'])
        except Exception as ex:
            pass
        # Publication date
        try:
            articledate = parsed_response.findAll('articledate')[0]
            paper_info['year'] = articledate.year.text
        except:
            pass
        # Title and abstract
        try:
            paper_info['title'] = \
                parsed_response.findAll('articletitle')[0].text
            log_debug('Title: %s' % paper_info['title'])
            paper_info['abstract'] = \
                parsed_response.findAll('abstracttext')[0].text
            log_debug('Abstract: %s' % paper_info['abstract'])
        except Exception as ex:
            pass
        # Authors
        try:
            all_authors = []
            authors = parsed_response.findAll('author')
            for author in authors:
                author_name = author.forename.text + ' ' + author.lastname.text
                log_debug('\tAuthor: %s' % author_name)
                all_authors.append(author_name)
            if all_authors:
                paper_info['authors'] = all_authors
        except Exception as ex:
            pass
        # URL + IDs
        try:
            articleids = parsed_response.findAll('articleid')
            for articleid in articleids:
                if articleid['idtype'] == 'doi':
                    paper_info['doi'] = articleid.text
                elif articleid['idtype'] == 'pubmed':
                    paper_info['pubmed_id'] = articleid.text
        except:
            pass

    callback(paper_info=paper_info, user_data=user_data)
def get_paper_info_from_pdf(data):
    fp = BytesIO(data)
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument()
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.set_parser(parser)
    # Initialize
    doc.initialize()

    # Extract the metadata
    info = None  # guard against PDFs without an Info dictionary
    for xref in doc.xrefs:
        info_ref = xref.trailer.get('Info')
        if info_ref:
            info = resolve1(info_ref)

    paper_info = {}

    if info:
        authors = info.get('Author')
        if authors:
            if ';' in authors:
                author_list = authors.split(';')
            elif ' AND ' in authors:
                author_list = authors.split(' AND ')
            elif ',' in authors:
                # FIXME: This cuts 'LastName, FirstName' in two...
                author_list = authors.split(',')
            else:
                author_list = [authors]
            paper_info['authors'] = author_list
        title = info.get('Title')
        if title:
            # Some PDFs have the doi as a title
            if title.lower().startswith('doi:'):
                paper_info['doi'] = title[4:]
            else:
                paper_info['title'] = title
        # TODO: Additional metadata?
        # TODO: What about embedded BibTeX (as done by JabRef)?

    # Extract text
    rsrcmgr = PDFResourceManager()
    content = cStringIO.StringIO()
    device = TextConverter(rsrcmgr, content, codec='utf-8',
                           laparams=LAParams())
    process_pdf(rsrcmgr, device, fp, check_extractable=True, caching=True)
    paper_info['extracted_text'] = content.getvalue()

    if 'doi' not in paper_info:
        # Try to find a DOI in the text
        doi = p_doi.search(paper_info['extracted_text'])
        if doi is not None:
            doi = doi.group(1)
            log_debug('Found a DOI: %s' % doi)
            paper_info['doi'] = doi

    device.close()
    content.close()

    log_debug('Extracted paper_info from PDF: %s' % paper_info)

    return paper_info
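# Usage sketch (the file name and the helper name `_example_pdf_import` are
# placeholders, not part of the project): extracts metadata and text from a
# PDF on disk.
def _example_pdf_import(path='some_paper.pdf'):
    with open(path, 'rb') as f:
        pdf_info = get_paper_info_from_pdf(f.read())
    # Keys that may be present, depending on the PDF: 'title', 'authors',
    # 'doi'; 'extracted_text' always holds the raw text.
    return pdf_info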