def parse_semantic_scholar_corpus_file(path, database_path="aip.db"):
    """Parse one (possibly gzipped) Semantic Scholar corpus file into the database.

    Each line of the file is a standalone JSON object describing a publication;
    every record with a non-empty venue is upserted into the paper table.

    :param path: Path to the corpus file; a ``.gz`` suffix selects the gzip loader.
    :param database_path: Location of the SQLite database file.
    :return: True (also when the file was already parsed before).
    """
    database = DatabaseManager(location=database_path)
    # Renamed from `hash` to avoid shadowing the builtin.
    file_hash, parsed = database.did_parse_file(path)
    if parsed:
        return True

    # The json files contain stacked json objects, which is bad practice. It
    # should be wrapped in a JSON array. Libraries will throw errors if you
    # attempt to load the file, so we lazy load each object line by line.
    file_iterator_func = iterload_file_lines_gzip if path.endswith("gz") else iterload_file_lines

    doi_marker = "doi.org/"
    for publication in file_iterator_func(path):
        if publication is None:
            # Corrupt JSON line possibly. Skip it.
            continue
        if "venue" not in publication:
            # While parsing we sometimes get KeyError: 'venue'...
            continue

        # Wrap in str() as it sometimes is an int (???)
        venue_string = str(publication['venue'])
        if not venue_string:
            continue

        # Use .get() with safe defaults: corpus records do not reliably carry
        # every field, and the AMiner parser in this file guards the same way.
        publication_title = publication.get('title', "")
        publication_abstract = publication.get('paperAbstract', "")
        publication_year = publication.get('year', -1)
        # Empty for conferences; guard against a missing/None value so
        # .replace() cannot raise.
        journal_volume = publication.get('journalVolume') or ""
        publication_journal_volume = journal_volume.replace(" ", "_")
        publication_id = publication['id']
        num_citations = len(publication.get("inCitations", []))

        publication_doi = publication.get('doi')
        if not publication_doi:
            # Fall back to extracting the DOI from the doi.org URL, if any.
            # `or ""` guards against an absent or None doiUrl.
            doi_url = publication.get('doiUrl') or ""
            if doi_marker in doi_url:
                publication_doi = doi_url[doi_url.index(doi_marker) + len(doi_marker):]

        database.update_or_insert_paper(
            id=publication_id, doi=publication_doi, title=publication_title,
            abstract=publication_abstract, raw_venue_string=venue_string,
            year=publication_year, volume=publication_journal_volume,
            num_citations=num_citations)

    database.add_parsed_file(file_hash)
    database.close()
    return True
def parse_aminer_corpus_file(path, database_path="aip", logger_disabled=False):
    """Parse one (possibly gzipped) AMiner corpus file into the database.

    Each line is a standalone JSON object describing a publication; records
    with both a venue and a title are upserted into the paper table.

    :param path: Path to the corpus file; a ``.gz`` suffix selects the gzip loader.
    :param database_path: Location of the database.
        NOTE(review): default is "aip" while the sibling parsers default to
        "aip.db" — confirm this difference is intentional.
    :param logger_disabled: When True, silences the module logger.
    :return: True (also when the file was already parsed before).
    """
    logger.disabled = logger_disabled
    database = DatabaseManager(location=database_path)
    file_hash, already_parsed = database.did_parse_file(path)
    if already_parsed:
        return True

    # The json files contain stacked json objects instead of a JSON array,
    # so each line is lazily parsed on its own.
    iterate_lines = iterload_file_lines_gzip if path.endswith("gz") else iterload_file_lines

    for record in tqdm(iterate_lines(path)):
        if record is None:
            # Corrupt JSON line possibly. Skip it.
            continue

        # Warning: contrary to the documentation, the key is "venue" NOT "venue.raw"!
        if 'venue' not in record:
            logger.warning("Skipping line missing venue: %s in %s.", record, path)
            continue
        if 'title' not in record:
            logger.warning("Skipping line missing title: %s in %s.", record, path)
            continue

        raw_venue = record['venue']
        # Sometimes the venue string is yet another dict holding the raw name.
        if isinstance(raw_venue, dict) and "raw" in raw_venue:
            raw_venue = raw_venue["raw"]

        title = str(record['title']).rstrip(".")
        abstract = record.get('abstract', "")
        year = record.get('year')
        volume = record.get('volume')
        record_id = record['id']
        doi = record.get('doi')

        # Sometimes a doi.org link appears among the record's URLs; if no DOI
        # was given directly, try to extract one from the first such link.
        if doi is None or len(doi) == 0:
            for candidate_url in record.get('url', []):
                if "doi.org/" in candidate_url:
                    doi = candidate_url[candidate_url.index("doi.org/") + len("doi.org/"):]
                    break

        database.update_or_insert_paper(
            id=record_id, doi=doi, title=title, abstract=abstract,
            raw_venue_string=raw_venue, year=year, volume=volume)

    database.add_parsed_file(file_hash)
    database.close()
    return True
def parse(dblp_file, database_path="aip.db"):
    """Parse a DBLP XML dump into the database.

    Streams the dump with ``etree.iterparse`` (DTD validated), extracting
    article/inproceedings/proceedings records plus their authors. A record is
    stored only when it has a title, a year and a venue.

    :param dblp_file: Path to the DBLP XML file.
    :param database_path: Location of the SQLite database file.
    :return: True (also when the file was already parsed before).
    """
    database = DatabaseManager(location=database_path)
    # Renamed from `hash` to avoid shadowing the builtin.
    file_hash, parsed = database.did_parse_file(dblp_file)
    if parsed:
        return True

    counter = 0  # counter for synthesized keys when 'key' is absent.
    doi_marker = "doi.org/"
    for _event, element in etree.iterparse(dblp_file, load_dtd=True, dtd_validation=True):
        if element.tag not in ('article', 'inproceedings', 'proceedings'):
            continue

        # Renamed from `id` to avoid shadowing the builtin.
        if 'key' in element.attrib:
            article_id = str(element.attrib['key'])
        else:
            article_id = "id" + str(counter)
            counter += 1

        title = element.find('title')  # type: Optional[str]
        if title is not None:
            title = str(title.text).rstrip(".")

        year = element.find('year')  # type: Optional[int]
        if year is not None:
            try:
                year = int(re.search(r'\d+', str(year.text)).group())
                if 20 < year < 100:  # Weird cases like 92-93
                    year += 1900
                elif year < 20:  # weird cases like '12
                    year += 2000
            # Narrowed from a bare except: AttributeError when re.search finds
            # no digits (returns None), ValueError for non-numeric text.
            except (AttributeError, ValueError):
                year = None

        volume = element.find('volume')  # type: Optional[int]
        if volume is not None:
            try:
                volume = int(volume.text)
            # Narrowed from a bare except: TypeError when the element is empty
            # (text is None), ValueError for non-numeric text.
            except (TypeError, ValueError):
                volume = None

        # Conferences carry 'booktitle'; journals carry 'journal'. find()
        # already returns None when absent, so no findall() pre-check needed.
        venue = element.find('booktitle')  # type: Optional[str]
        if venue is None:
            venue = element.find('journal')
        if venue is not None and venue.text is not None:
            venue = str(venue.text)
        else:
            venue = None

        # Extract a DOI from the first 'ee' link pointing at doi.org. Guard on
        # ee.text (findall never yields None elements — the old `ee is not None`
        # check was dead) and match the full "doi.org/" marker so that index()
        # cannot raise for a link containing "doi.org" without the slash.
        doi = None
        for ee in element.findall('ee'):
            ee_str = ee.text
            if ee_str is not None and doi_marker in ee_str:
                doi = ee_str[ee_str.index(doi_marker) + len(doi_marker):]
                break

        if title is not None and year is not None and venue is not None:
            # Clean the title which may have HTML elements
            database.update_or_insert_paper(
                id=article_id, doi=doi, title=title, abstract="",
                raw_venue_string=venue, year=year, volume=volume,
                num_citations=-1)
            # Get the authors for this paper and add them to the database
            authors = []  # tuples of (name, orcid)
            for author_element in element.findall('author'):
                orcid = None
                if "orcid" in author_element.attrib:
                    orcid = str(author_element.attrib['orcid'])
                authors.append((author_element.text, orcid))
            database.add_authors_for_article(authors=authors, article_id=article_id)

        # Free the processed element so memory stays bounded on huge dumps.
        element.clear()

    database.add_parsed_file(file_hash)
    database.close()
    return True