def run(input_directory, articles_filename, citations_filename):
    # Write one CSV of article metadata and one CSV of citation edges.
    writer = csv.writer(open(articles_filename, 'w'))
    citations_writer = csv.writer(open(citations_filename, 'w'))

    last_journal, i, j, started = None, 0, 0, time.time()
    for journal, filename, xml in get_graphs(input_directory):
        # Print a progress line each time we move on to a new journal.
        if last_journal != journal:
            last_journal, i, duration = journal, i + 1, time.time() - started
            if last_journal:
                print "%4d %4d %6.2f %6.4f %s" % (i, j, duration, (duration / j) if j else 0, last_journal)
            j, started = 0, time.time()
        j += 1

        # One row per article node, keeping only the whitelisted data keys.
        for article in xml.xpath("/article-data/node[@type='article']"):
            fields = dict((n, '') for n in article_field_set)
            for datum in article.xpath('data'):
                if datum.attrib['key'] not in article_field_set:
                    continue
                fields[datum.attrib['key']] = ' '.join((datum.text or '').split()).encode('utf-8')
            fields['id'] = article.attrib['id']
            fields['filename'] = filename
            # fields['title'] = ' '.join(fields['title'].split())
            # fields['abstract'] = ' '.join(fields['abstract'].split())

            article = Article(**fields)
            writer.writerow(article)
            del fields, article

        # One row per 'cites' edge: (source id, target id, citation count).
        for citation in xml.xpath("/article-data/edge[@type='cites']"):
            try:
                count = citation.xpath("data[@key='count']")[0].text
            except IndexError:
                count = ''
            citations_writer.writerow([citation.attrib['source'], citation.attrib['target'], count])
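
# A small usage sketch, not part of the original code: reading the citation
# edges back out of the CSV written by run() above.  Each row is
# [source article id, target article id, count], where count is '' when the
# edge had no data[@key='count'] element.  The helper name iter_citations is
# hypothetical.
def iter_citations(citations_filename):
    for row in csv.reader(open(citations_filename)):
        # (source, target, count) in the same order run() wrote them.
        yield tuple(row)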

def run(input_directory, articles_filename):
    seen = set()
    parse_lock_filename = os.path.join(os.path.expanduser("~"), ".pubmed", "parse.lock")
    if os.path.exists(articles_filename) and os.path.exists(parse_lock_filename):
        os.rename(articles_filename, articles_filename + ".old")
    tar = tarfile.open(articles_filename, "w:gz")

    if os.path.exists(articles_filename + ".old"):
        print "Loading previous progress"
        try:
            old_tar = tarfile.open(articles_filename + ".old", "r:gz")
        except:
            print "Failed to load previous progress"
        else:
            for tar_info in old_tar:
                if tar_info.name.endswith(".json"):
                    seen.add(tar_info.name.replace(".json", ".nxml").rsplit("/", 1)[-1])
                    data = old_tar.extractfile(tar_info)
                    tar.addfile(tar_info, data)
            old_tar.close()
            os.unlink(articles_filename + ".old")
            print "Done loading previous progress, found %d articles" % len(seen)

    # Touch the lock file to say we've started.
    with open(parse_lock_filename, "w") as f:
        pass

    last_journal, i, j, started = None, 0, 0, time.time()
    url_mapping = get_source_url_mapping()

    for journal, filename, xml in get_graphs(input_directory, filter=seen_before(seen)):
        if last_journal != journal:
            last_journal, i, duration = journal, i + 1, time.time() - started
            if last_journal:
                print "%4d %4d %6.2f %6.4f %s" % (i, j, duration, (duration / j) if j else 0, last_journal)
            j, started = 0, time.time()
        j += 1

        record_list = []
        dataset = {"recordList": record_list}

        pmc = xml.xpath("/article-data/node[1]/data[@key='pmc']")
        source_url = None
        if not len(pmc):
            print "No PMC found for article"
        elif pmc[0].text in url_mapping:
            source_url = url_mapping[pmc[0].text]
        else:
            print "Couldn't find source URL for PMC%s (%s)" % (pmc[0].text, filename)

        for node in xml.xpath("/article-data/node"):
            data = Data(node)
            if not data._id:
                print "Missing id:", filename
            if data._type == "article":
                record = article_record(xml, node, data)
            elif data._type == "person":
                record = person_record(xml, node, data)
            elif data._type == "journal":
                record = journal_record(xml, node, data)
            elif data._type in ("organisation", "organization"):  # I can't spell
                record = organisation_record(xml, node, data)
            else:
                print data._type, filename
                print etree.tostring(node)
                continue
            if source_url:
                record["x-source-url"] = source_url
            record_list.append(record)

        tar_info = tarfile.TarInfo("pmc_open_access/%s/%s" % (journal, filename.replace(".nxml", ".json")))
        data = StringIO.StringIO()
        simplejson.dump(dataset, data)
        tar_info.size = data.len
        data.seek(0)
        tar.addfile(tar_info, data)

    if os.path.exists(parse_lock_filename):
        os.unlink(parse_lock_filename)
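
# seen_before() is called above but not defined in this section.  A plausible
# sketch, assuming get_graphs() passes each .nxml filename to the filter and
# skips entries for which it returns False; the name _seen_before_sketch is
# hypothetical, used here to avoid shadowing the real helper.
def _seen_before_sketch(seen):
    def accept(filename):
        # Only process files whose .json output was not already copied over
        # from the previous archive.
        return filename.rsplit("/", 1)[-1] not in seen
    return accept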

def run(input_directory, articles_filename):
    #
    # restore old progress:
    #
    seen = set()
    parse_lock_filename = os.path.join(os.path.expanduser('~'), '.pubmed', 'parse.lock')
    if os.path.exists(articles_filename) and os.path.exists(parse_lock_filename):
        os.rename(articles_filename, articles_filename + '.old')
    tar = tarfile.open(articles_filename, 'w:gz')

    if os.path.exists(articles_filename + '.old'):
        print "Loading previous progress"
        try:
            old_tar = tarfile.open(articles_filename + '.old', 'r:gz')
        except:
            print "Failed to load previous progress"
        else:
            for tar_info in old_tar:
                if tar_info.name.endswith('.json'):
                    seen.add(tar_info.name.replace('.json', '.nxml').rsplit('/', 1)[-1])
                    data = old_tar.extractfile(tar_info)
                    tar.addfile(tar_info, data)
            old_tar.close()
            os.unlink(articles_filename + '.old')
            print "Done loading previous progress, found %d articles" % len(seen)

    # Touch the lock file to say we've started.
    with open(parse_lock_filename, 'w') as f:
        pass

    # for status reporting
    last_journal, i, j, started = None, 0, 0, time.time()

    # URL map is used for PMC_ID restoring
    url_mapping = get_source_url_mapping()

    # we disable directory filtering
    # for journal, filename, xml in get_graphs(input_directory, filter=subset): #seen_before(seen)):
    for journal, filename, xml in get_graphs(input_directory): #seen_before(seen)):
        #
        # We are looping through articles in input_dir parsed into etree objects: xml
        #

        # status report
        if last_journal != journal:
            last_journal, i, duration = journal, i + 1, time.time() - started
            if last_journal:
                print "%4d %4d %6.2f %6.4f %s" % (i, j, duration, (duration / j) if j else 0, last_journal)
            j, started = 0, time.time()
        j += 1

        record_list = []
        dataset = {
            'recordList': record_list,
        }

        #
        # Get PMC ID for current article
        #
        pmc = xml.xpath("/article-data/node[1]/data[@key='pmc']")
        source_url = None
        if not len(pmc):
            print "No PMC found for article"
        elif pmc[0].text in url_mapping:
            source_url = url_mapping[pmc[0].text]
        else:
            print "Couldn't find source URL for PMC%s (%s)" % (pmc[0].text, filename)

        for node in xml.xpath("/article-data/node"):
            data = Data(node)
            if not data._id:
                print "Missing id:", filename
            if data._type == 'article':
                record = article_record(xml, node, data)
            elif data._type == 'person':
                record = person_record(xml, node, data)
            elif data._type == 'journal':
                record = journal_record(xml, node, data)
            elif data._type in ('organisation', 'organization'):  # I can't spell
                record = organisation_record(xml, node, data)
            else:
                print data._type, filename
                print etree.tostring(node)
                continue
            if source_url:
                record['x-source-url'] = source_url
            record_list.append(record)

        tar_info = tarfile.TarInfo('pmc_open_access/%s/%s' % (journal, filename.replace('.nxml', '.json')))
        data = StringIO.StringIO()
        # There were errors using indent = ' '.
        # It seems to work with an integer now.
        #simplejson.dump(dataset, data, indent=' ')
        simplejson.dump(dataset, data, indent=2)
        tar_info.size = data.len
        data.seek(0)
        tar.addfile(tar_info, data)

    if os.path.exists(parse_lock_filename):
        os.unlink(parse_lock_filename)
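
# A small usage sketch, not part of the original code: pulling the first
# record set back out of the .tar.gz archive that run() produces above.
# Member names follow the 'pmc_open_access/<journal>/<name>.json' pattern and
# each member holds a {'recordList': [...]} object; the helper name
# read_first_dataset is hypothetical.
def read_first_dataset(articles_filename):
    archive = tarfile.open(articles_filename, 'r:gz')
    try:
        for tar_info in archive:
            if tar_info.name.endswith('.json'):
                dataset = simplejson.load(archive.extractfile(tar_info))
                return tar_info.name, dataset['recordList']
        return None, []
    finally:
        archive.close()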