def strain_paragraph_starter():
    """Yield one paragraph record per entry of ``strain_paragraphs``.

    For every ``(strain_key, paragraph)`` pair in the module-level
    ``strain_paragraphs`` mapping whose key matches ``Strain.unique_key()``
    of a strain in the NEX database, a dict with the keys ``source``,
    ``text``, ``html`` and ``strain`` is yielded.  ``text`` is
    ``paragraph[0]``; ``html`` is that text passed through
    ``link_gene_names`` and then ``link_strain_names``.

    When the key matches no strain, a message is printed and ``None`` is
    yielded in place of a record, so consumers must be prepared to skip
    ``None`` items.

    The NEX session is closed when the generator finishes, is closed early
    by its consumer, or fails with an exception.

    NOTE(review): a second definition of this function appears later in the
    file; being later, that one is what the module name refers to after
    import.
    """
    nex_session = nex_session_maker()
    try:
        key_to_source = {x.unique_key(): x for x in nex_session.query(Source).all()}
        key_to_strain = {x.unique_key(): x for x in nex_session.query(Strain).all()}

        # Strain
        for strain_key, paragraph in strain_paragraphs.iteritems():
            if strain_key not in key_to_strain:
                print("Strain not found: " + str(strain_key))
                yield None
                continue

            strain = key_to_strain[strain_key]
            text = paragraph[0]
            # The second argument of each link_* helper is a set of names;
            # presumably names to leave unlinked -- confirm in the helpers.
            html = link_gene_names(text, {"HO"}, nex_session)
            html = link_strain_names(html, {strain.display_name}, nex_session)
            yield {"source": key_to_source["SGD"], "text": text, "html": html, "strain": strain}
    finally:
        # Runs on exhaustion, on generator.close() and on errors alike.
        nex_session.close()
def reference_paragraph_starter():
    """Yield one paragraph record per ``Abstract`` row of the BUD database.

    Abstracts are read through ``make_db_starter(query, 1000)`` (the second
    argument is presumably a chunk size -- confirm in ``make_db_starter``).
    For each abstract whose ``reference_id`` equals the ``id`` of a
    ``Reference`` in the NEX database, a dict with the keys ``source``,
    ``text``, ``html`` and ``reference`` is yielded; ``html`` is the
    abstract text passed through ``link_gene_names`` with an empty name set.

    When no matching reference exists, a message is printed and ``None`` is
    yielded in place of a record.

    Both sessions are closed when the generator finishes, is closed early by
    its consumer, or fails with an exception.
    """
    bud_session = bud_session_maker()
    nex_session = nex_session_maker()
    try:
        key_to_source = {x.unique_key(): x for x in nex_session.query(Source).all()}
        id_to_reference = {x.id: x for x in nex_session.query(Reference).all()}

        for old_abstract in make_db_starter(bud_session.query(Abstract), 1000)():
            reference_id = old_abstract.reference_id
            if reference_id not in id_to_reference:
                print('Reference not found: ' + str(reference_id))
                yield None
                continue

            yield {
                'source': key_to_source['SGD'],
                'text': old_abstract.text,
                'html': link_gene_names(old_abstract.text, set(), nex_session),
                'reference': id_to_reference[reference_id],
            }
    finally:
        # Close both sessions even if closing the first one raises.
        try:
            bud_session.close()
        finally:
            nex_session.close()
def strain_paragraph_starter():
    """Yield one paragraph record per entry of ``strain_paragraphs``.

    For every ``(strain_key, paragraph)`` pair in the module-level
    ``strain_paragraphs`` mapping whose key matches ``Strain.unique_key()``
    of a strain in the NEX database, a dict with the keys ``source``,
    ``text``, ``html`` and ``strain`` is yielded.  ``text`` is
    ``paragraph[0]``; ``html`` is that text passed through
    ``link_gene_names`` and then ``link_strain_names``.

    When the key matches no strain, a message is printed and ``None`` is
    yielded in place of a record, so consumers must be prepared to skip
    ``None`` items.

    The NEX session is closed when the generator finishes, is closed early
    by its consumer, or fails with an exception.

    NOTE(review): an earlier definition with the same name exists above in
    this file and is shadowed by this one.
    """
    nex_session = nex_session_maker()
    try:
        key_to_source = {x.unique_key(): x for x in nex_session.query(Source).all()}
        key_to_strain = {x.unique_key(): x for x in nex_session.query(Strain).all()}

        # Strain
        for strain_key, paragraph in strain_paragraphs.iteritems():
            if strain_key not in key_to_strain:
                print('Strain not found: ' + str(strain_key))
                yield None
                continue

            strain = key_to_strain[strain_key]
            text = paragraph[0]
            # The second argument of each link_* helper is a set of names;
            # presumably names to leave unlinked -- confirm in the helpers.
            html = link_gene_names(text, {'HO'}, nex_session)
            html = link_strain_names(html, {strain.display_name}, nex_session)
            yield {
                'source': key_to_source['SGD'],
                'text': text,
                'html': html,
                'strain': strain,
            }
    finally:
        # Runs on exhaustion, on generator.close() and on errors alike.
        nex_session.close()
def _latest_date_edited(paragraph_feats):
    """Return the ``date_edited`` of the most recently edited paragraph.

    Dates are compared by calendar day only: the part of
    ``str(date_edited)`` before the first space is split on ``-`` and its
    first three fields are read as integers (assumed to be year, month and
    day).  When several paragraphs share the latest day, the first one in
    ``paragraph_feats`` wins.  Returns ``None`` for an empty sequence.
    """
    latest = None
    latest_day = None
    for paragraph_feat in paragraph_feats:
        edited = paragraph_feat.paragraph.date_edited
        # int() accepts zero-padded fields, so no padding has to be stripped.
        day = tuple(int(part) for part in str(edited).split(' ')[0].split('-')[:3])
        if latest_day is None or day > latest_day:
            latest = edited
            latest_day = day
    return latest


def _locus_summary_paragraphs(file_names, text_column, category, key_to_bioentity, key_to_source, nex_session):
    """Yield one record (or ``None``) per row of the given summary files.

    ``row[0]`` of every row is looked up as ``(row[0], 'LOCUS')`` in
    ``key_to_bioentity``; ``row[text_column]`` supplies the summary text.
    Rows whose locus is unknown yield ``None`` without logging.
    """
    for file_name in file_names:
        for row in make_file_starter(file_name)():
            bioentity_key = (row[0], 'LOCUS')
            if bioentity_key not in key_to_bioentity:
                yield None
                continue

            bioentity = key_to_bioentity[bioentity_key]
            text = row[text_column]
            # The locus's own names, plus each with a trailing 'P', are handed
            # to link_gene_names -- presumably so the summary does not link to
            # its own locus/protein; confirm in link_gene_names.
            own_names = {
                bioentity.display_name,
                bioentity.format_name,
                bioentity.display_name + 'P',
                bioentity.format_name + 'P',
            }
            yield {
                'bioentity': bioentity,
                'source': key_to_source['SGD'],
                'text': text,
                'html': link_gene_names(text, own_names, nex_session),
                'category': category,
            }


def bioentity_paragraph_starter():
    """Yield paragraph records for loci, one category after another.

    Every record is a dict holding at least ``bioentity``, ``source``,
    ``text``, ``html`` and ``category``.  Categories are produced in this
    order:

    * ``LSP`` - one record per BUD ``Feature`` that has paragraphs and whose
      id matches a NEX ``Locus``.  The paragraph texts are joined in
      ``order`` and passed through ``clean_paragraph``; ``date_edited`` is
      the latest edit date among the paragraphs, while ``date_created`` and
      ``created_by`` come from the first paragraph.  Features without a
      match are skipped silently.
    * ``GODATE`` - per locus, the ``date_last_reviewed`` of the first
      ``GoFeature`` encountered whose ``annotation_type`` is
      ``'manually curated'``.
    * ``GO`` - the ``go_annotation_summary`` entry found in the
      ``gp_information.559292_sgd`` file.
    * ``REGULATION`` and ``PHENOTYPE`` - summaries read from the listed
      summary files.

    For every category except ``LSP``, ``None`` is yielded in place of a
    record when the locus cannot be found, so consumers must be prepared to
    skip ``None`` items.

    Both sessions are closed when the generator finishes, is closed early by
    its consumer, or fails with an exception.

    NOTE(review): a second definition of this function appears later in the
    file; being later, that one is what the module name refers to after
    import.
    """
    bud_session = bud_session_maker()
    nex_session = nex_session_maker()
    try:
        key_to_source = {x.unique_key(): x for x in nex_session.query(Source).all()}
        # Load the loci once and index them two ways.
        loci = nex_session.query(Locus).all()
        key_to_bioentity = {x.unique_key(): x for x in loci}
        id_to_bioentity = {x.id: x for x in loci}
        sgdid_to_reference = {x.sgdid: x for x in nex_session.query(Reference).all()}
        sgdid_to_bioentity = {x.sgdid: x for x in nex_session.query(Bioentity).all()}
        # The first three characters of go_id are dropped before int();
        # presumably a 'GO:' prefix -- confirm against the Go model.
        goid_to_go = {int(x.go_id[3:]): x for x in nex_session.query(Go).all()}

        # LSP
        for feature in bud_session.query(Feature).all():
            paragraph_feats = feature.paragraph_feats
            if len(paragraph_feats) > 0 and feature.id in id_to_bioentity:
                paragraph_feats.sort(key=lambda x: x.order)
                paragraph_html, paragraph_text = clean_paragraph(
                    id_to_bioentity[feature.id],
                    '<p>' + ('</p><p>'.join([x.paragraph.text for x in paragraph_feats])) + '</p>',
                    str([x.paragraph.id for x in paragraph_feats]),
                    sgdid_to_reference,
                    sgdid_to_bioentity,
                    goid_to_go,
                )
                date_edited = _latest_date_edited(paragraph_feats)

                yield {
                    'bioentity': id_to_bioentity[feature.id],
                    'source': key_to_source['SGD'],
                    'text': paragraph_text,
                    'html': paragraph_html,
                    'date_edited': date_edited,
                    'date_created': paragraph_feats[0].paragraph.date_created,
                    'created_by': paragraph_feats[0].paragraph.created_by,
                    'category': 'LSP',
                }

        # Go
        bioentity_key_to_date = dict()
        for gofeature in bud_session.query(GoFeature).all():
            bioentity_key = (gofeature.feature.name, 'LOCUS')
            # Only the first manually curated annotation per locus is kept.
            if gofeature.annotation_type == 'manually curated' and bioentity_key not in bioentity_key_to_date:
                bioentity_key_to_date[bioentity_key] = gofeature.date_last_reviewed

        for bioentity_key, date_last_reviewed in bioentity_key_to_date.iteritems():
            if bioentity_key in key_to_bioentity:
                yield {
                    'bioentity': key_to_bioentity[bioentity_key],
                    'source': key_to_source['SGD'],
                    'text': str(date_last_reviewed),
                    'html': str(date_last_reviewed),
                    'date_created': None,
                    'created_by': None,
                    'category': 'GODATE',
                }
            else:
                yield None

        for pieces in make_file_starter('src/sgd/convert/data/gp_information.559292_sgd')():
            # Fields 8 and 9 are both read below, so ten fields are required;
            # shorter rows are skipped instead of raising IndexError.
            if len(pieces) < 10:
                continue
            sgdid = pieces[8]
            if not sgdid.startswith('SGD:'):
                continue
            sgdid = sgdid[4:]
            # x[22:] drops the 21-character key 'go_annotation_summary' plus
            # the single separator character that follows it.
            go_annotation = [x[22:].strip() for x in pieces[9].split('|') if x.startswith('go_annotation_summary')]
            if len(go_annotation) != 1:
                continue
            if sgdid in sgdid_to_bioentity:
                yield {
                    'bioentity': sgdid_to_bioentity[sgdid],
                    'source': key_to_source['SGD'],
                    'text': go_annotation[0],
                    'html': go_annotation[0],
                    'date_created': None,
                    'created_by': None,
                    'category': 'GO',
                }
            else:
                print('Bioentity not found: ' + sgdid)
                yield None

        # Regulation
        file_names = [
            'src/sgd/convert/data/regulationSummaries',
            'src/sgd/convert/data/15-8regulationSummaries.txt',
            'src/sgd/convert/data/15-9regulationSummaries.txt',
            'src/sgd/convert/data/15-10regulationSummaries.txt',
            'src/sgd/convert/data/15-11regulationSummaries.txt',
            'src/sgd/convert/data/16-1regulationSummaries.txt',
            'src/sgd/convert/data/16-2regulationSummaries.txt',
            'src/sgd/convert/data/16-3regulationSummaries.txt',
            'src/sgd/convert/data/16-4regulationSummaries.txt',
            'src/sgd/convert/data/16-5regulationSummaries.txt',
        ]
        # The regulation summary text is read from the third column.
        for record in _locus_summary_paragraphs(file_names, 2, 'REGULATION', key_to_bioentity, key_to_source, nex_session):
            yield record

        # Phenotype
        file_names = [
            'src/sgd/convert/data/PhenotypeSummaries032015.txt',
            'src/sgd/convert/data/15-6phenoSummariesTyposFixed.txt',
            'src/sgd/convert/data/15-7phenoSummaries.txt',
            'src/sgd/convert/data/15-8phenoSummaries.txt',
            'src/sgd/convert/data/15-9phenoSummaries.txt',
            'src/sgd/convert/data/15-10phenoSummaries.txt',
            'src/sgd/convert/data/15-11phenoSummaries.txt',
            'src/sgd/convert/data/15-12phenoSummaries.txt',
            'src/sgd/convert/data/16-1phenoSummaries.txt',
            'src/sgd/convert/data/16-2phenoSummaries.txt',
            'src/sgd/convert/data/16-3phenoSummaries.txt',
            'src/sgd/convert/data/16-4phenoSummaries.txt',
            'src/sgd/convert/data/16-5phenoSummaries.txt',
            'src/sgd/convert/data/16-6phenoSummaries.txt',
            'src/sgd/convert/data/16-7phenoSummaries.txt',
            'src/sgd/convert/data/16-9phenoSummaries.txt',
            'src/sgd/convert/data/16-10phenoSummaries.txt',
        ]
        # The phenotype summary text is read from the second column.
        for record in _locus_summary_paragraphs(file_names, 1, 'PHENOTYPE', key_to_bioentity, key_to_source, nex_session):
            yield record
    finally:
        # Close both sessions even if closing the first one raises.
        try:
            bud_session.close()
        finally:
            nex_session.close()
def _locus_summary_paragraphs(file_names, text_column, category, key_to_bioentity, key_to_source, nex_session):
    """Yield one record (or ``None``) per row of the given summary files.

    ``row[0]`` of every row is looked up as ``(row[0], "LOCUS")`` in
    ``key_to_bioentity``; ``row[text_column]`` supplies the summary text.
    Rows whose locus is unknown yield ``None`` without logging.
    """
    for file_name in file_names:
        for row in make_file_starter(file_name)():
            bioentity_key = (row[0], "LOCUS")
            if bioentity_key not in key_to_bioentity:
                yield None
                continue

            bioentity = key_to_bioentity[bioentity_key]
            text = row[text_column]
            # The locus's own names, plus each with a trailing "P", are handed
            # to link_gene_names -- presumably so the summary does not link to
            # its own locus/protein; confirm in link_gene_names.
            own_names = {
                bioentity.display_name,
                bioentity.format_name,
                bioentity.display_name + "P",
                bioentity.format_name + "P",
            }
            yield {
                "bioentity": bioentity,
                "source": key_to_source["SGD"],
                "text": text,
                "html": link_gene_names(text, own_names, nex_session),
                "category": category,
            }


def bioentity_paragraph_starter():
    """Yield paragraph records for loci, one category after another.

    Every record is a dict holding at least ``bioentity``, ``source``,
    ``text``, ``html`` and ``category``.  Categories are produced in this
    order:

    * ``LSP`` - one record per BUD ``Feature`` that has paragraphs and whose
      id matches a NEX ``Locus``.  The paragraph texts are joined in
      ``order`` and passed through ``clean_paragraph``; ``date_edited``,
      ``date_created`` and ``created_by`` all come from the first paragraph
      after sorting.  Features without a match are skipped silently.
    * ``GODATE`` - per locus, the ``date_last_reviewed`` of the first
      ``GoFeature`` encountered whose ``annotation_type`` is
      ``"manually curated"``.
    * ``GO`` - the ``go_annotation_summary`` entry found in the
      ``gp_information.559292_sgd`` file.
    * ``REGULATION`` and ``PHENOTYPE`` - summaries read from the listed
      summary files.

    For every category except ``LSP``, ``None`` is yielded in place of a
    record when the locus cannot be found, so consumers must be prepared to
    skip ``None`` items.

    Both sessions are closed when the generator finishes, is closed early by
    its consumer, or fails with an exception.

    NOTE(review): an earlier definition with the same name exists above in
    this file and is shadowed by this one.
    """
    bud_session = bud_session_maker()
    nex_session = nex_session_maker()
    try:
        key_to_source = {x.unique_key(): x for x in nex_session.query(Source).all()}
        # Load the loci once and index them two ways.
        loci = nex_session.query(Locus).all()
        key_to_bioentity = {x.unique_key(): x for x in loci}
        id_to_bioentity = {x.id: x for x in loci}
        sgdid_to_reference = {x.sgdid: x for x in nex_session.query(Reference).all()}
        sgdid_to_bioentity = {x.sgdid: x for x in nex_session.query(Bioentity).all()}
        # The first three characters of go_id are dropped before int();
        # presumably a "GO:" prefix -- confirm against the Go model.
        goid_to_go = {int(x.go_id[3:]): x for x in nex_session.query(Go).all()}

        # LSP
        for feature in bud_session.query(Feature).all():
            paragraph_feats = feature.paragraph_feats
            if len(paragraph_feats) > 0 and feature.id in id_to_bioentity:
                paragraph_feats.sort(key=lambda x: x.order)
                paragraph_html, paragraph_text = clean_paragraph(
                    id_to_bioentity[feature.id],
                    "<p>" + ("</p><p>".join([x.paragraph.text for x in paragraph_feats])) + "</p>",
                    str([x.paragraph.id for x in paragraph_feats]),
                    sgdid_to_reference,
                    sgdid_to_bioentity,
                    goid_to_go,
                )
                first_paragraph = paragraph_feats[0].paragraph

                yield {
                    "bioentity": id_to_bioentity[feature.id],
                    "source": key_to_source["SGD"],
                    "text": paragraph_text,
                    "html": paragraph_html,
                    "date_edited": first_paragraph.date_edited,
                    "date_created": first_paragraph.date_created,
                    "created_by": first_paragraph.created_by,
                    "category": "LSP",
                }

        # Go
        bioentity_key_to_date = dict()
        for gofeature in bud_session.query(GoFeature).all():
            bioentity_key = (gofeature.feature.name, "LOCUS")
            # Only the first manually curated annotation per locus is kept.
            if gofeature.annotation_type == "manually curated" and bioentity_key not in bioentity_key_to_date:
                bioentity_key_to_date[bioentity_key] = gofeature.date_last_reviewed

        for bioentity_key, date_last_reviewed in bioentity_key_to_date.iteritems():
            if bioentity_key in key_to_bioentity:
                yield {
                    "bioentity": key_to_bioentity[bioentity_key],
                    "source": key_to_source["SGD"],
                    "text": str(date_last_reviewed),
                    "html": str(date_last_reviewed),
                    "date_created": None,
                    "created_by": None,
                    "category": "GODATE",
                }
            else:
                yield None

        for pieces in make_file_starter("src/sgd/convert/data/gp_information.559292_sgd")():
            # Fields 8 and 9 are both read below, so ten fields are required;
            # shorter rows are skipped instead of raising IndexError.
            if len(pieces) < 10:
                continue
            sgdid = pieces[8]
            if not sgdid.startswith("SGD:"):
                continue
            sgdid = sgdid[4:]
            # x[22:] drops the 21-character key "go_annotation_summary" plus
            # the single separator character that follows it.
            go_annotation = [
                x[22:].strip() for x in pieces[9].split("|") if x.startswith("go_annotation_summary")
            ]
            if len(go_annotation) != 1:
                continue
            if sgdid in sgdid_to_bioentity:
                yield {
                    "bioentity": sgdid_to_bioentity[sgdid],
                    "source": key_to_source["SGD"],
                    "text": go_annotation[0],
                    "html": go_annotation[0],
                    "date_created": None,
                    "created_by": None,
                    "category": "GO",
                }
            else:
                print("Bioentity not found: " + sgdid)
                yield None

        # Regulation
        file_names = [
            "src/sgd/convert/data/regulationSummaries",
            "src/sgd/convert/data/15-8regulationSummaries.txt",
            "src/sgd/convert/data/15-9regulationSummaries.txt",
            "src/sgd/convert/data/15-10regulationSummaries.txt",
            "src/sgd/convert/data/15-11regulationSummaries.txt",
        ]
        # The regulation summary text is read from the third column.
        for record in _locus_summary_paragraphs(file_names, 2, "REGULATION", key_to_bioentity, key_to_source, nex_session):
            yield record

        # Phenotype
        file_names = [
            "src/sgd/convert/data/PhenotypeSummaries032015.txt",
            "src/sgd/convert/data/15-6phenoSummariesTyposFixed.txt",
            "src/sgd/convert/data/15-7phenoSummaries.txt",
            "src/sgd/convert/data/15-8phenoSummaries.txt",
            "src/sgd/convert/data/15-9phenoSummaries.txt",
            "src/sgd/convert/data/15-10phenoSummaries.txt",
            "src/sgd/convert/data/15-11phenoSummaries.txt",
        ]
        # The phenotype summary text is read from the second column.
        for record in _locus_summary_paragraphs(file_names, 1, "PHENOTYPE", key_to_bioentity, key_to_source, nex_session):
            yield record
    finally:
        # Close both sessions even if closing the first one raises.
        try:
            bud_session.close()
        finally:
            nex_session.close()