def create_evidence(row, row_id, key_to_experiment, key_to_bioent, pubmed_to_reference_id):
    """Build a Regulationevidence from one row of a regulation data file.

    Columns read from ``row``: 1 and 3 (format names of the two
    bioentities), 4 (experiment name), 5 (experiment ECO id),
    6 (conditions), 10 (PubMed id) and 11 (source).

    Each bioentity is looked up in ``key_to_bioent`` first as a 'LOCUS' and
    then as a 'BIOENTITY'. The experiment is looked up by name and, failing
    that, by ECO id.

    Returns a one-element list holding the new evidence, or None (after
    printing a message) when a bioentity, the experiment or the reference
    cannot be resolved.

    NOTE(review): ``pubmed_to_strain`` is not a parameter and is not defined
    in this function; it is expected to exist at module level.
    """
    from model_new_schema.regulation import Regulationevidence

    bioent1_format_name = row[1].upper().strip()
    bioent2_format_name = row[3].upper().strip()
    experiment_format_name = create_format_name(row[4].strip())
    experiment_eco_id = row[5].strip()
    conditions = row[6].strip()
    pubmed_id = int(row[10].strip())
    source = row[11].strip()

    # Resolve both bioentities with the same LOCUS-then-BIOENTITY rule.
    resolved_ids = []
    for format_name in (bioent1_format_name, bioent2_format_name):
        for class_type in ('LOCUS', 'BIOENTITY'):
            if (format_name, class_type) in key_to_bioent:
                resolved_ids.append(key_to_bioent[(format_name, class_type)].id)
                break
        else:
            print('Bioent does not exist ' + str(format_name))
            return None
    bioent1_id, bioent2_id = resolved_ids

    # Prefer the experiment name; fall back to the ECO id.
    experiment_key = experiment_format_name
    if experiment_key not in key_to_experiment:
        experiment_key = create_format_name(experiment_eco_id)
        if experiment_key not in key_to_experiment:
            print('Experiment does not exist ' + str(experiment_key))
            return None
    experiment_id = key_to_experiment[experiment_key].id

    if pubmed_id not in pubmed_to_reference_id:
        print('Reference does not exist ' + str(pubmed_id))
        return None
    reference_id = pubmed_to_reference_id[pubmed_id]

    if conditions == '""':
        conditions = None
    else:
        # str.replace returns a new string, so the result must be assigned.
        # '??' is swapped for the micro sign (U+00B5); the previous literal
        # "\00b5" was a NUL character followed by "b5".
        conditions = conditions.replace('??', u'\u00b5')

    strain_id = None
    if pubmed_id in pubmed_to_strain:
        strain_id = pubmed_to_strain[pubmed_id]

    new_evidence = Regulationevidence(create_evidence_id(row_id), experiment_id, reference_id,
                                      strain_id, source, bioent1_id, bioent2_id,
                                      conditions, None, None)
    return [new_evidence]
# Maps the source label found in the data file to a pair:
# (source name stored on the Domain, URL prefix for the link or None).
_DOMAIN_SOURCE_INFO = {
    'JASPAR': ('JASPAR', 'http://jaspar.binf.ku.dk/cgi-bin/jaspar_db.pl?rm=present&collection=CORE&ID='),
    'HMMSmart': ('SMART', "http://smart.embl-heidelberg.de/smart/do_annotation.pl?DOMAIN="),
    'HMMPfam': ('Pfam', "http://pfam.sanger.ac.uk/family?type=Family&entry="),
    'Gene3D': ('Gene3D', "http://www.cathdb.info/version/latest/superfamily/"),
    'superfamily': ('SUPERFAMILY', "http://supfam.org/SUPERFAMILY/cgi-bin/scop.cgi?ipid="),
    'Seg': ('Seg', None),
    'Coil': ('Coil', None),
    'HMMPanther': ('PANTHER', "http://www.pantherdb.org/panther/family.do?clsAccession="),
    'HMMTigr': ('TIGRFAMs', "http://cmr.tigr.org/tigr-scripts/CMR/HmmReport.cgi?hmm_acc="),
    'FPrintScan': ('PRINTS', "http://www.bioinf.man.ac.uk/cgi-bin/dbbrowser/sprint/searchprintss.cgi?display_opts=Prints&category=None&queryform=false&prints_accn="),
    'BlastProDom': ('ProDom', "http://prodom.prabi.fr/prodom/current/cgi-bin/request.pl?question=DBEN&query="),
    'HMMPIR': ('PIR superfamily', "http://pir.georgetown.edu/cgi-bin/ipcSF?"),
    'ProfileScan': ('PROSITE', "http://prodom.prabi.fr/prodom/cgi-bin/prosite-search-ac?"),
    'PatternScan': ('PROSITE', "http://prodom.prabi.fr/prodom/cgi-bin/prosite-search-ac?"),
}


def create_domain(row):
    """Build a Domain from one row of a protein-domain data file.

    Columns read from ``row``: 3 (domain name), 4 (description),
    5 (InterPro id), 6 (InterPro description) and 13 (source).

    The source label is translated to the name stored on the Domain and to
    an external link via ``_DOMAIN_SOURCE_INFO``. A description of
    'no description' and an InterPro description of 'NULL' are stored as
    None.

    Returns a one-element list holding the Domain, or None (after printing
    a message) when the source label is not recognised.
    """
    from model_new_schema.protein import Domain

    source = row[13].strip()
    format_name = create_format_name(row[3].strip())
    display_name = row[3].strip()
    description = row[4].strip()
    interpro_id = row[5].strip()
    interpro_description = row[6].strip()

    # Need to check these links
    if source not in _DOMAIN_SOURCE_INFO:
        print('No link for source = ' + source + ' ' + str(display_name))
        return None

    raw_source = source
    source, link_prefix = _DOMAIN_SOURCE_INFO[raw_source]
    if link_prefix is None:
        link = None
    elif raw_source == 'Gene3D':
        # Gene3D links use the name with its first six characters removed.
        link = link_prefix + display_name[6:]
    else:
        link = link_prefix + display_name

    if description == 'no description':
        description = None
    if interpro_description == 'NULL':
        interpro_description = None

    domain = Domain(format_name, display_name, description, interpro_id,
                    interpro_description, link, source)
    return [domain]
def create_experiment_from_binding_row(display_name, row_id):
    """Build an Experiment for an experiment name taken from a binding row.

    The format name is derived from ``display_name`` and is used both for
    the link and, together with ``row_id``, for the experiment id.

    Returns a one-element list holding the new Experiment.
    """
    from model_new_schema.evelement import Experiment

    key = create_format_name(display_name)
    experiment_url = experiment_link(key)
    experiment_id = create_experiment_id_from_binding_row(row_id, key)
    return [Experiment(experiment_id, display_name, key, experiment_url,
                       None, None, None, None)]
def create_author(old_author):
    """Convert an old-schema author into a new-schema Author.

    The author's name becomes the display name; the format name and link
    are derived from it. Creation date and creator are carried over.

    Returns a one-element list holding the new Author.
    """
    from model_new_schema.reference import Author as NewAuthor

    name = old_author.name
    key = create_format_name(name)
    url = author_link(key)
    return [NewAuthor(create_author_id(old_author.id), name, key, url,
                      old_author.date_created, old_author.created_by)]
def create_chemical(expt_property):
    """Build a Chemical from an experiment property.

    The property's ``value`` is used as the display name; the source is
    recorded as 'SGD'. Creation date and creator are carried over.

    Returns a one-element list holding the new Chemical.
    """
    from model_new_schema.chemical import Chemical as NewChemical

    name = expt_property.value
    key = create_format_name(name)
    url = chemical_link(key)
    return [NewChemical(name, key, url, 'SGD',
                        expt_property.date_created, expt_property.created_by)]
def create_chemical(old_cv_term):
    """Build a Chemical from an old-schema CV term.

    The term's name is the display name and the source is recorded as
    'EBI'. Unlike most converters here, this returns the Chemical itself
    rather than a list.
    """
    from model_new_schema.chemical import Chemical as NewChemical

    name = old_cv_term.name
    key = create_format_name(name)
    return NewChemical(name, key, 'EBI',
                       old_cv_term.date_created, old_cv_term.created_by)
def create_chemrels(old_cv_term, key_to_chemical):
    """Build ChemicalRelation objects linking a CV term to its parents.

    The term is the child of every relation. Parents that are not present
    in ``key_to_chemical`` are reported and skipped.

    Returns the list of relations; an empty list when the child chemical
    itself is not in ``key_to_chemical``.
    """
    from model_new_schema.chemical import ChemicalRelation as NewChemicalRelations

    child_key = create_format_name(old_cv_term.name)
    if child_key not in key_to_chemical:
        print('Chemical does not exist.')
        return []
    child_id = key_to_chemical[child_key].id

    relations = []
    for rel in old_cv_term.parent_rels:
        parent_key = create_format_name(rel.parent.name)
        if parent_key in key_to_chemical:
            parent_id = key_to_chemical[parent_key].id
            relations.append(NewChemicalRelations(rel.id, parent_id, child_id,
                                                  rel.date_created, rel.created_by))
        else:
            print('Chemical does not exist.')
    return relations
def create_experiment(old_cv_term):
    """Convert an old-schema CV term into a new-schema Experiment.

    The term's name is the display name and its definition is the
    description. Creation date and creator are carried over.

    Returns a one-element list holding the new Experiment.
    """
    from model_new_schema.evelement import Experiment as NewExperiment

    name = old_cv_term.name
    key = create_format_name(name)
    definition = old_cv_term.definition
    url = experiment_link(key)
    return [NewExperiment(create_experiment_id(old_cv_term.id), name, key, url,
                          definition, None,
                          old_cv_term.date_created, old_cv_term.created_by)]
def create_experiment_relation(old_cv_term, key_to_experiment):
    """Build ExperimentRelation objects linking a CV term to its parents.

    The term is the child of every relation. Parents missing from
    ``key_to_experiment`` are reported and skipped.

    Returns the list of relations, or None when the child experiment
    itself is missing from ``key_to_experiment``.
    """
    from model_new_schema.evelement import ExperimentRelation as NewExperimentRelation

    child_name = create_format_name(old_cv_term.name)
    if child_name not in key_to_experiment:
        print('Experiment does not exist.')
        return None
    child_id = key_to_experiment[child_name].id

    relations = []
    for rel in old_cv_term.parent_rels:
        parent_name = create_format_name(rel.parent.name)
        if parent_name in key_to_experiment:
            parent_id = key_to_experiment[parent_name].id
            relations.append(NewExperimentRelation(create_experiment_relation_id(rel.id),
                                                   parent_id, child_id,
                                                   rel.date_created, rel.created_by))
        else:
            print('Experiment does not exist.')
    return relations
def create_strain(old_cv_term):
    """Convert an old-schema CV term into a new-schema Strain.

    The term's name is the display name and its definition is the
    description. Creation date and creator are carried over.

    Returns a one-element list holding the new Strain.
    """
    from model_new_schema.evelement import Strain as NewStrain

    name = old_cv_term.name
    key = create_format_name(name)
    definition = old_cv_term.definition
    url = strain_link(key)
    return [NewStrain(create_strain_id(old_cv_term.id), name, key, url, definition,
                      old_cv_term.date_created, old_cv_term.created_by)]
def create_alias(old_cv_term, key_to_chemical):
    """Build one ChemicalAlias per synonym of a CV term.

    Every alias is recorded with source 'EBI' and points at the chemical
    found in ``key_to_chemical`` under the term's format name.

    Returns the list of aliases; an empty list when the chemical is not in
    ``key_to_chemical``.
    """
    from model_new_schema.chemical import ChemicalAlias as NewChemicalAlias

    key = create_format_name(old_cv_term.name)
    if key not in key_to_chemical:
        print('Chemical does not exist.')
        return []
    chemical_id = key_to_chemical[key].id

    aliases = []
    for syn in old_cv_term.cv_synonyms:
        aliases.append(NewChemicalAlias(syn.synonym, 'EBI', chemical_id,
                                        syn.date_created, syn.created_by))
    return aliases
def create_experiment_alias(old_cv_term, key_to_experiment):
    """Build one Experimentalias per dbxref of a CV term.

    Every alias is recorded with source 'SGD' and category 'APOID', and
    points at the experiment found in ``key_to_experiment`` under the
    term's format name.

    Returns the list of aliases, or None when the experiment is not in
    ``key_to_experiment``.
    """
    from model_new_schema.evelement import Experimentalias as NewExperimentalias

    key = create_format_name(old_cv_term.name)
    if key not in key_to_experiment:
        print('Experiment does not exist.')
        return None
    experiment_id = key_to_experiment[key].id

    aliases = []
    for xref in old_cv_term.dbxrefs:
        aliases.append(NewExperimentalias(xref.dbxref_id, 'SGD', 'APOID', experiment_id,
                                          xref.date_created, xref.created_by))
    return aliases
def create_altids(old_cv_term, key_to_chemical):
    """Build one ChemicalAltid per dbxref of a CV term.

    Every alternate id is recorded with source 'EBI' and category 'CHEBI',
    and points at the chemical found in ``key_to_chemical`` under the
    term's format name.

    Returns the list of alternate ids; an empty list when the chemical is
    not in ``key_to_chemical``.
    """
    from model_new_schema.chemical import ChemicalAltid as NewChemicalAltid

    key = create_format_name(old_cv_term.name)
    if key not in key_to_chemical:
        print('Chemical does not exist.')
        return []
    chemical_id = key_to_chemical[key].id

    altids = []
    for xref in old_cv_term.dbxrefs:
        altids.append(NewChemicalAltid(xref.dbxref_id, 'EBI', 'CHEBI', chemical_id,
                                       xref.date_created, xref.created_by))
    return altids
def create_domain_from_tf_file(row):
    """Build a JASPAR Domain from one row of a transcription-factor file.

    Column 0 supplies the display name (and, through create_format_name,
    the format name); columns 4 and 3 are combined into the description as
    'Class: ..., Family: ...'. No InterPro information is set.

    Returns a one-element list holding the Domain.
    """
    from model_new_schema.protein import Domain

    name = row[0]
    key = create_format_name(row[0])
    summary = 'Class: ' + row[4] + ', Family: ' + row[3]
    url = 'http://jaspar.binf.ku.dk/cgi-bin/jaspar_db.pl?rm=present&collection=CORE&ID=' + name
    return [Domain(key, name, summary, None, None, url, 'JASPAR')]
def create_phenotype(old_phenotype):
    """Convert an old-schema phenotype into a new-schema Phenotype.

    The display name is built from the observable, qualifier and mutant
    type; the format name and link are derived from the display name, and
    the phenotype type is derived from the observable.

    Returns a one-element list holding the new Phenotype.
    """
    from model_new_schema.phenotype import Phenotype as NewPhenotype

    obs = old_phenotype.observable
    qual = old_phenotype.qualifier
    mutant = old_phenotype.mutant_type

    name = create_phenotype_display_name(obs, qual, mutant)
    key = create_format_name(name)
    url = biocon_link("Phenotype", key)

    return [NewPhenotype(create_phenotype_id(old_phenotype.id), name, key, url,
                         obs, qual, mutant,
                         create_phenotype_type(old_phenotype.observable),
                         old_phenotype.date_created, old_phenotype.created_by)]
def create_experiment_from_reg_row(display_name, eco_id, row_id):
    """Build an Experiment for an experiment named in a regulation row.

    When ``display_name`` is None the ECO id is used as the name. The
    format name is computed from the full name before the name is
    shortened for display.

    Returns a one-element list holding the new Experiment.
    """
    from model_new_schema.evelement import Experiment

    name = eco_id if display_name is None else display_name
    key = create_format_name(name)

    # Drops the trailing 'evidence' together with the one character before
    # it (nine characters in total).
    if name.endswith('evidence'):
        name = name[:-9]

    url = experiment_link(key)
    experiment_id = create_experiment_id_from_reg_row(row_id, key)
    return [Experiment(experiment_id, name, key, url, None, eco_id, None, None)]
def create_domain_evidence(row, row_id, key_to_bioentity, key_to_domain):
    """Build a Domainevidence from one row of a protein-domain data file.

    Columns read from ``row``: 1 (bioentity format name), 3 (domain name),
    10 and 11 (start and end, converted to int), 12 (e-value) and
    13 (source). The protein is looked up under the bioentity format name
    with 'P' appended and class 'PROTEIN'. Some source labels are replaced
    by the names in ``renamed_sources`` below.

    Returns a one-element list holding the evidence, or None (after
    printing a message) when the protein or the domain is not found.
    """
    from model_new_schema.protein import Domainevidence

    bioent_name = row[1].strip()
    source = row[13].strip()
    domain_key = create_format_name(row[3].strip())
    start = row[10].strip()
    end = row[11].strip()
    evalue = row[12].strip()

    protein_key = (bioent_name + 'P', 'PROTEIN')
    if protein_key not in key_to_bioentity:
        print('Protein not found. ' + bioent_name + 'P')
        return None
    protein_id = key_to_bioentity[protein_key].id

    # Labels not listed here are kept unchanged.
    renamed_sources = {
        'HMMSmart': 'SMART',
        'HMMPanther': 'PANTHER',
        'FPrintScan': 'PRINTS',
        'HMMPfam': 'Pfam',
        'PatternScan': 'PROSITE',
        'ProfileScan': 'PROSITE',
        'BlastProDom': 'ProDom',
        'HMMTigr': 'TIGRFAMs',
        'HMMPIR': 'PIR superfamily',
    }
    if source in renamed_sources:
        source = renamed_sources[source]

    if domain_key not in key_to_domain:
        print('Domain not found. ' + domain_key)
        return None
    domain_id = key_to_domain[domain_key].id

    # S288C
    strain_id = 1

    evidence = Domainevidence(create_domain_evidence_id(row_id), None, strain_id, source,
                              int(start), int(end), evalue, None, None,
                              protein_id, domain_id, None, None)
    return [evidence]
def create_physical_interevidence(old_interaction, key_to_experiment, reference_ids, bioent_ids):
    """Convert an old-schema interaction into a Physinteractionevidence.

    Only interactions whose ``interaction_type`` is 'physical interactions'
    are converted; any other type yields None.

    ``reference_ids`` and ``bioent_ids`` are the collections of known
    reference and bioentity ids. The interaction's own ids are kept in
    separate local names so that the existence checks really test against
    these arguments (previously the arguments were overwritten by the
    interaction's own ids, which made the checks always pass).

    The two feature ids are sorted so that ``bioent1_id <= bioent2_id``;
    the bait/hit string joins the feature actions in feature-id order.

    Returns a one-element list holding the evidence, or None (after
    printing a message) when the interaction does not have exactly one
    reference, or when its reference, a bioentity or its experiment type
    is unknown.
    """
    from model_new_schema.interaction import Physinteractionevidence as NewPhysinteractionevidence

    if old_interaction.interaction_type != 'physical interactions':
        return None

    interaction_reference_ids = old_interaction.reference_ids
    if len(interaction_reference_ids) != 1:
        print('Too many references')
        return None
    reference_id = interaction_reference_ids[0]
    note = old_interaction.interaction_references[0].note
    if reference_id not in reference_ids:
        print('Reference does not exist.')
        return None

    # Sorting guarantees bioent1_id <= bioent2_id, so no separate
    # "out of order" check is needed.
    feature_ids = sorted(old_interaction.feature_ids)
    bioent1_id = feature_ids[0]
    bioent2_id = feature_ids[1]
    if bioent1_id not in bioent_ids:
        print('Bioentity does not exist.')
        return None
    if bioent2_id not in bioent_ids:
        print('Bioentity does not exist.')
        return None

    experiment_key = create_format_name(old_interaction.experiment_type)
    if experiment_key not in key_to_experiment:
        print('Experiment does not exist. ' + str(experiment_key))
        return None
    experiment_id = key_to_experiment[experiment_key].id

    feat_interacts = sorted(old_interaction.feature_interactions, key=lambda x: x.feature_id)
    bait_hit = '-'.join([x.action for x in feat_interacts])

    new_physical_interevidence = NewPhysinteractionevidence(
        create_physical_evidence_id(old_interaction.id), experiment_id, reference_id, None,
        old_interaction.source, bioent1_id, bioent2_id,
        old_interaction.annotation_type, old_interaction.modification, bait_hit, note,
        old_interaction.date_created, old_interaction.created_by)
    return [new_physical_interevidence]
def create_evidence_chemical(old_evidence, key_to_chemical, id_to_phenoevidence):
    """Build EvidenceChemical links for the chemicals of a phenotype evidence.

    Each entry of ``old_evidence.experiment.chemicals`` is read as
    (chemical name, amount). Chemicals missing from ``key_to_chemical`` are
    reported and skipped. Every link is tagged 'PHENOTYPE'.

    Returns the list of links (empty when there is no experiment or no
    chemical information), or None when the converted evidence id is not
    in ``id_to_phenoevidence``.
    """
    from model_new_schema.evidence import EvidenceChemical as NewEvidenceChemical

    links = []
    phenoevidence_id = create_evidence_id(old_evidence.id)
    if phenoevidence_id not in id_to_phenoevidence:
        print('Phenoevidence does not exist. ' + str(phenoevidence_id))
        return None

    if old_evidence.experiment is None:
        return links
    chemical_infos = old_evidence.experiment.chemicals
    if chemical_infos is None:
        return links

    for info in chemical_infos:
        key = create_format_name(info[0])
        if key in key_to_chemical:
            chemical_id = key_to_chemical[key].id
            amount = info[1]
            links.append(NewEvidenceChemical(phenoevidence_id, chemical_id, amount, 'PHENOTYPE'))
        else:
            print('Chemical does not exist. ' + key)
    return links
def create_evidence(row, row_id, key_to_experiment, key_to_bioent, pubmed_to_reference_id):
    """Build a Bindingevidence from one row of a binding-site data file.

    Columns 2, 3, 6, 8, 9 and 10 are read with their first and last
    character removed. Rows whose expert confidence is not 'High' are
    skipped. The bioentity is looked up first as a 'LOCUS' and then as a
    'BIOENTITY'. A PubMed id with no known reference leaves the reference
    id as None rather than rejecting the row.

    Returns a one-element list holding the evidence, or None when the row
    is skipped or when the bioentity or experiment is unknown (a message is
    printed in the latter two cases).
    """
    from model_new_schema.sequence import Bindingevidence

    def unwrap(field):
        # Drop the first and last character of the field.
        return field[1:-1]

    bioent_name = unwrap(row[2])
    motif_id = unwrap(row[3])
    total_score = unwrap(row[6])
    expert_confidence = unwrap(row[8])
    experiment_key = create_format_name(unwrap(row[9]))
    pubmed_id = int(unwrap(row[10]))

    if expert_confidence != 'High':
        return None

    locus_key = (bioent_name, 'LOCUS')
    bioentity_key = (bioent_name, 'BIOENTITY')
    if locus_key in key_to_bioent:
        bioent_id = key_to_bioent[locus_key].id
    elif bioentity_key in key_to_bioent:
        bioent_id = key_to_bioent[bioentity_key].id
    else:
        print('Bioent does not exist ' + str(bioent_name))
        return None

    if experiment_key not in key_to_experiment:
        print('Experiment does not exist ' + str(experiment_key))
        return None
    experiment_id = key_to_experiment[experiment_key].id

    reference_id = pubmed_to_reference_id[pubmed_id] if pubmed_id in pubmed_to_reference_id else None

    img_url = "/static/img/yetfasco/" + bioent_name + "_" + motif_id + ".0.png"

    return [Bindingevidence(create_evidence_id(row_id), experiment_id, reference_id, None,
                            'YeTFaSCo', bioent_id, total_score, expert_confidence,
                            img_url, motif_id, None, None)]
def convert_author_reference(old_session_maker, new_session_maker, chunk_size):
    """Synchronise new-schema AuthorReference rows with the old schema.

    Old AuthorReference rows are processed in id ranges of ``chunk_size``.
    For each range, old rows are converted with create_author_reference and
    passed to create_or_update; new-schema rows in the same id range that
    were not matched by id or by unique key are deleted. Each range is
    committed separately. Only 'author_type' is compared when updating.

    Any exception is logged and swallowed; both sessions are closed in all
    cases.
    """
    from model_new_schema.reference import Author as NewAuthor, Reference as NewReference, AuthorReference as NewAuthorReference
    from model_old_schema.reference import AuthorReference as OldAuthorReference, Author as OldAuthor

    log = logging.getLogger('convert.reference_in_depth.author_reference')
    log.info('begin')
    output_creator = OutputCreator(log)

    # Bound up front so the finally clause is safe even if a session maker
    # itself raises.
    new_session = None
    old_session = None
    try:
        new_session = new_session_maker()
        old_session = old_session_maker()

        # Values to check
        values_to_check = ['author_type']

        # Grab cached dictionaries
        reference_ids = set([x.id for x in new_session.query(NewReference).all()])

        # Simplify author conversion: old author id -> format name -> new author id
        old_id_to_key = dict([(x.id, create_format_name(x.name)) for x in old_session.query(OldAuthor).all()])
        new_key_to_id = dict([(x.unique_key(), x.id) for x in new_session.query(NewAuthor).all()])
        old_id_to_new_id_author = dict([(old_id, new_key_to_id[key]) for old_id, key in old_id_to_key.items()])

        used_unique_keys = set()

        max_id = old_session.query(func.max(OldAuthorReference.id)).first()[0]
        if max_id is None:
            # Empty source table: nothing to convert.
            num_chunks = 0
        else:
            # Ranges are [min_id, min_id + chunk_size) starting at 0, so
            # max_id // chunk_size + 1 ranges are needed to include max_id
            # itself (also when it is an exact multiple of chunk_size).
            # This is an int, as range() requires.
            num_chunks = int(max_id) // chunk_size + 1

        min_id = 0
        for i in range(0, num_chunks):
            # Grab all current objects
            current_objs = new_session.query(NewAuthorReference).filter(
                NewAuthorReference.id >= min_id).filter(
                NewAuthorReference.id < min_id + chunk_size).all()
            id_to_current_obj = dict([(x.id, x) for x in current_objs])
            key_to_current_obj = dict([(x.unique_key(), x) for x in current_objs])

            untouched_obj_ids = set(id_to_current_obj.keys())

            # Grab old objects
            old_objs = old_session.query(OldAuthorReference).filter(
                OldAuthorReference.id >= min_id).filter(
                OldAuthorReference.id < min_id + chunk_size).all()

            for old_obj in old_objs:
                # Convert old objects into new ones
                newly_created_objs = create_author_reference(old_obj, old_id_to_new_id_author, reference_ids)
                if newly_created_objs is None:
                    continue

                # Edit or add new objects
                for newly_created_obj in newly_created_objs:
                    unique_key = newly_created_obj.unique_key()
                    if unique_key in used_unique_keys:
                        continue

                    current_obj_by_id = id_to_current_obj.get(newly_created_obj.id)
                    current_obj_by_key = key_to_current_obj.get(unique_key)
                    create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key,
                                     values_to_check, new_session, output_creator)

                    # Anything matched by id or key is still in use.
                    if current_obj_by_id is not None:
                        untouched_obj_ids.discard(current_obj_by_id.id)
                    if current_obj_by_key is not None:
                        untouched_obj_ids.discard(current_obj_by_key.id)
                    used_unique_keys.add(unique_key)

            # Delete untouched objs
            for untouched_obj_id in untouched_obj_ids:
                new_session.delete(id_to_current_obj[untouched_obj_id])
                output_creator.removed()

            output_creator.finished(str(i + 1) + "/" + str(num_chunks))
            new_session.commit()
            min_id = min_id + chunk_size

    except Exception:
        log.exception('Unexpected error:' + str(sys.exc_info()[0]))
    finally:
        if new_session is not None:
            new_session.close()
        if old_session is not None:
            old_session.close()

    log.info('complete')
def _join_name_value_pairs(pairs):
    """Render (name, value) pairs as 'name- value' items joined by ', '.

    A pair whose value is None contributes only its name. Returns None for
    an empty (or None) collection.
    """
    if not pairs:
        return None
    return ', '.join([name if value is None else name + '- ' + value
                      for (name, value) in pairs])


def create_evidence(old_phenotype_feature, key_to_reflink, key_to_phenotype, reference_ids, bioent_ids, key_to_strain, key_to_experiment, key_to_allele):
    """Convert an old-schema phenotype annotation into a Phenotypeevidence.

    The reference is taken from ``key_to_reflink`` under
    ('PHENO_ANNOTATION_NO', annotation id). The phenotype is looked up by
    the key built from observable, qualifier and mutant type, and the
    experiment by the format name of the experiment type.

    When the annotation has an experiment, its reporter, strain, allele,
    comment, conditions and details are copied onto the evidence. A strain
    name that is not in ``key_to_strain`` leaves ``strain_id`` as None.

    Returns a one-element list holding the evidence, or None (after
    printing a message) when the reference, bioentity, phenotype or
    experiment is unknown.

    Raises KeyError when the annotation has no entry in ``key_to_reflink``
    or when its allele name is not in ``key_to_allele``.
    """
    from model_new_schema.phenotype import Phenotypeevidence as NewPhenotypeevidence

    evidence_id = create_evidence_id(old_phenotype_feature.id)
    reference_id = key_to_reflink[('PHENO_ANNOTATION_NO', old_phenotype_feature.id)].reference_id
    if reference_id not in reference_ids:
        print('Reference does not exist. ' + str(reference_id))
        return None

    bioent_id = old_phenotype_feature.feature_id
    if bioent_id not in bioent_ids:
        print('Bioentity does not exist. ' + str(bioent_id))
        return None

    phenotype_key = create_phenotype_key(old_phenotype_feature.observable,
                                         old_phenotype_feature.qualifier,
                                         old_phenotype_feature.mutant_type)
    if phenotype_key not in key_to_phenotype:
        print('Phenotype does not exist. ' + str(phenotype_key))
        return None
    biocon_id = key_to_phenotype[phenotype_key].id

    experiment_key = create_format_name(old_phenotype_feature.experiment_type)
    if experiment_key not in key_to_experiment:
        print('Experiment does not exist. ' + str(experiment_key))
        return None
    experiment_id = key_to_experiment[experiment_key].id

    strain_id = None
    mutant_allele_id = None
    allele_info = None
    reporter = None
    reporter_desc = None
    strain_details = None
    experiment_details = None
    conditions = None
    details = None

    experiment = old_phenotype_feature.experiment
    if experiment is not None:
        # reporter, strain and allele are each read as a
        # (name, description) pair or None.
        reporter_info = experiment.reporter
        if reporter_info is not None:
            reporter = reporter_info[0]
            reporter_desc = reporter_info[1]

        strain_info = experiment.strain
        if strain_info is not None:
            strain_details = strain_info[1]
            if strain_info[0] in key_to_strain:
                strain_id = key_to_strain[strain_info[0]].id

        allele_info = experiment.allele
        if allele_info is not None:
            mutant_allele_id = key_to_allele[allele_info[0]].id
            allele_info = allele_info[1]

        experiment_details = experiment.experiment_comment
        conditions = _join_name_value_pairs(experiment.condition)
        details = _join_name_value_pairs(experiment.details)

    new_phenoevidence = NewPhenotypeevidence(
        evidence_id, experiment_id, reference_id, strain_id,
        old_phenotype_feature.source, bioent_id, biocon_id,
        mutant_allele_id, allele_info,
        reporter, reporter_desc, strain_details, experiment_details,
        conditions, details,
        old_phenotype_feature.date_created, old_phenotype_feature.created_by)
    return [new_phenoevidence]
def create_phenotype_key(observable, qualifier, mutant_type):
    """Return the lookup key for a phenotype.

    The key is the tuple (format name, 'PHENOTYPE'), where the format name
    is derived from the display name built from the three arguments.
    """
    name = create_phenotype_display_name(observable, qualifier, mutant_type)
    return (create_format_name(name), 'PHENOTYPE')
def create_genetic_interevidence(old_interaction, key_to_experiment, key_to_phenotype, reference_ids, bioent_ids):
    """Convert an old-schema interaction into a Geninteractionevidence.

    Only interactions whose ``interaction_type`` is "genetic interactions"
    are converted; any other type yields None.

    ``reference_ids`` and ``bioent_ids`` are the collections of known
    reference and bioentity ids. The interaction's own ids are kept in
    separate local names so that the existence checks really test against
    these arguments (previously the arguments were overwritten by the
    interaction's own ids, which made the checks always pass).

    The two feature ids are sorted so that ``bioent1_id <= bioent2_id``.
    An interaction may carry zero or one phenotype; with none, the
    phenotype id is None.

    Returns a one-element list holding the evidence, or None (after
    printing a message) when the interaction does not have exactly one
    reference, has more than one phenotype, or refers to an unknown
    reference, bioentity, phenotype or experiment type.
    """
    from model_new_schema.interaction import Geninteractionevidence as NewGeninteractionevidence

    if old_interaction.interaction_type != "genetic interactions":
        return None

    interaction_reference_ids = old_interaction.reference_ids
    if len(interaction_reference_ids) != 1:
        print("Too many references")
        return None
    reference_id = interaction_reference_ids[0]
    if reference_id not in reference_ids:
        print("Reference does not exist.")
        return None
    note = old_interaction.interaction_references[0].note

    # Sorting guarantees bioent1_id <= bioent2_id, so no separate
    # "out of order" check is needed.
    feature_ids = sorted(old_interaction.feature_ids)
    bioent1_id = feature_ids[0]
    bioent2_id = feature_ids[1]
    if bioent1_id not in bioent_ids:
        print("Bioentity does not exist.")
        return None
    if bioent2_id not in bioent_ids:
        print("Bioentity does not exist.")
        return None

    old_phenotypes = old_interaction.interaction_phenotypes
    phenotype_id = None
    if len(old_phenotypes) == 1:
        old_phenotype = old_phenotypes[0].phenotype
        phenotype_key = create_phenotype_key(
            old_phenotype.observable, old_phenotype.qualifier, old_phenotype.mutant_type
        )
        if phenotype_key not in key_to_phenotype:
            print("Phenotype does not exist. " + str(phenotype_key))
            return None
        phenotype_id = key_to_phenotype[phenotype_key].id
    elif len(old_phenotypes) > 1:
        print("Too many phenotypes.")
        return None

    experiment_key = create_format_name(old_interaction.experiment_type)
    if experiment_key not in key_to_experiment:
        print("Experiment does not exist. " + str(experiment_key))
        return None
    experiment_id = key_to_experiment[experiment_key].id

    feat_interacts = sorted(old_interaction.feature_interactions, key=lambda x: x.feature_id)
    bait_hit = "-".join([x.action for x in feat_interacts])

    new_genetic_interevidence = NewGeninteractionevidence(
        create_genetic_evidence_id(old_interaction.id),
        experiment_id,
        reference_id,
        None,
        old_interaction.source,
        bioent1_id,
        bioent2_id,
        phenotype_id,
        old_interaction.annotation_type,
        bait_hit,
        note,
        old_interaction.date_created,
        old_interaction.created_by,
    )
    return [new_genetic_interevidence]