def generate_corpus_file(pmids, medline_dir, fn):
    """Write the processed abstracts for *pmids* to a tab-separated file.

    The output starts with the header line ``## pmid<TAB>stop_stem`` and is
    followed by one ``pmid<TAB>abstract`` line per pair yielded by
    ``medline.medline(medline_dir).get_processed_abstracts(pmids)``.

    Args:
        pmids: Passed through unchanged to ``get_processed_abstracts``.
        medline_dir: Passed to the ``medline.medline`` constructor.
        fn: Path of the output file; it is opened in ``"w"`` mode, so any
            existing content is overwritten.

    NOTE(review): the ``stop_stem`` header suggests the abstracts are
    stop-word filtered and stemmed -- confirm against the ``medline`` module.
    """
    print("generating corpus file")
    my_medline = medline.medline(medline_dir)
    # The context manager closes the file even when reading or writing
    # raises part-way through.
    with open(fn, "w") as fh:
        fh.write("\t".join(["## pmid", "stop_stem"]) + "\n")
        processed = my_medline.get_processed_abstracts(pmids)
        for i, (pmid, abstract) in enumerate(processed):
            # Progress indicator: report the record index every million
            # records (including the very first one, index 0).
            if i % 1000000 == 0:
                print(i)
            fh.write("\t".join([pmid, abstract]) + "\n")
def determine_stats(medline_dir,
                    mesh_fn="../data/medline_mesh.txt",
                    log_fn="../data/medline_stats2.txt"):
    """Determine stats for medline files and dump their MeSH annotations.

    Iterates over the ``(pmid, d_info)`` pairs yielded by
    ``medline.medline(medline_dir)._read_pmids(log_fn=log_fn)`` and, for every
    entry found under ``d_info["source"]["MH"]``, writes one tab-separated
    line to *mesh_fn*: the pmid, then the ``major``, ``key`` and ``minors``
    attributes of ``medline.mesh(entry)``. The file starts with the header
    line ``## pmid<TAB>major<TAB>key<TAB>minors``. Records without an ``MH``
    entry produce no output lines.

    Args:
        medline_dir: Passed to the ``medline.medline`` constructor.
        mesh_fn: Path of the output file, opened in ``"w"`` mode. Defaults to
            the location that was previously hard-coded.
        log_fn: Forwarded to ``_read_pmids`` as its ``log_fn`` argument.
            Defaults to the location that was previously hard-coded.
            NOTE(review): presumably a log/statistics file written by the
            reader -- confirm against the ``medline`` module.
    """
    my_medline = medline.medline(medline_dir)
    # The context manager closes the file even when parsing or writing
    # raises part-way through.
    with open(mesh_fn, "w") as fh_m:
        fh_m.write("\t".join(["## pmid", "major", "key", "minors"]) + "\n")
        for pmid, d_info in my_medline._read_pmids(log_fn=log_fn):
            for mesh in d_info["source"].get("MH", []):
                mymesh = medline.mesh(mesh)
                # ``minors`` is a list, so each minor becomes its own
                # trailing column; ``key`` is stringified for the join.
                fields = [pmid, mymesh.major, str(mymesh.key)] + mymesh.minors
                fh_m.write("\t".join(fields) + "\n")