def parse_ecocyc(strain, session):
    """Create ortholog-derived interactions for ``strain`` from EcoCyc SIF files.

    For every pathway name in the module-level ``ecocyc_paths`` the file
    ``Data/Ecoli/ecocyc_files/interactions_sif/<path>_interactions.txt`` is
    read with ``csv.DictReader`` (missing files are skipped). Each row's
    ``PARTICIPANT_A`` / ``PARTICIPANT_B`` is resolved either to the strain
    proteins that are orthologs of the E. coli protein, or to a metabolite.
    Protein-protein and protein-metabolite pairs are stored as
    ``Interaction`` rows (metabolite-metabolite pairs are ignored), and one
    ``InteractionReference`` per listed PubMed id is attached.

    Args:
        strain: value matched against ``OrthologEcoli.strain_protein`` and
            stored as ``Interaction.strain``.
        session: database session used for all queries, adds and commits.
    """
    # the EcoCyc source row is the same for every interaction: fetch it once
    source = session.query(InteractionSource).filter_by(
        data_source='EcoCyc').first()

    for path in ecocyc_paths:
        interaction_file_name = ("Data/Ecoli/ecocyc_files/interactions_sif/"
                                 + path + "_interactions.txt")
        # if there was a problem with obtaining the sif files for a pathway,
        # they may not exist
        if not exists(interaction_file_name):
            continue

        with open(interaction_file_name) as file:
            for interaction_row in csv.DictReader(file):
                id_A = interaction_row['PARTICIPANT_A']
                id_B = interaction_row['PARTICIPANT_B']

                interactors_A, new_metabolite_A = _resolve_ecocyc_participant(
                    id_A, strain, session)
                interactors_B, new_metabolite_B = _resolve_ecocyc_participant(
                    id_B, strain, session)

                # store new interactor pairs from which to create
                # interactions here
                interactor_pairs = []
                if new_metabolite_A is None and new_metabolite_B is None:
                    # no unknown metabolites: pair up all known interactors,
                    # skipping metabolite-metabolite pairs
                    for interactor_A in interactors_A:
                        for interactor_B in interactors_B:
                            if (interactor_A[0].type != 'm'
                                    or interactor_B[0].type != 'm'):
                                interactor_pairs.append(
                                    [interactor_A, interactor_B])
                elif new_metabolite_A is not None:
                    # participant A is a metabolite not yet in the database;
                    # it is only created if B contributes a non-metabolite
                    for interactor_B in interactors_B:
                        if interactor_B[0].type != 'm':
                            metabolite = _get_or_create_ecocyc_metabolite(
                                id_A, new_metabolite_A, session)
                            # keep the compound name for the reference later
                            interactor_pairs.append(
                                [interactor_B, [metabolite, id_A]])
                else:
                    # same as previous case, but the new metabolite is B
                    for interactor_A in interactors_A:
                        if interactor_A[0].type != 'm':
                            metabolite = _get_or_create_ecocyc_metabolite(
                                id_B, new_metabolite_B, session)
                            interactor_pairs.append(
                                [interactor_A, [metabolite, id_B]])

                # note interactor_pairs will be empty if:
                # 1) both interactors were new metabolites
                # 2) one or both ecoli interactors did not have orthologs
                for interactor_pair in interactor_pairs:
                    first = interactor_pair[0][0]
                    second = interactor_pair[1][0]
                    homogenous = (first == second)
                    interaction = session.query(Interaction).filter(
                        Interaction.interactors.contains(first),
                        Interaction.interactors.contains(second),
                        Interaction.homogenous == homogenous).first()

                    if interaction is None:
                        # created for the first time: mark it as ortholog
                        # derived from Ecoli
                        interaction = Interaction(
                            type=(first.type + '-' + second.type),
                            strain=strain,
                            homogenous=homogenous,
                            interactors=[first, second],
                            ortholog_derived='Ecoli')
                        interaction.sources.append(source)
                        session.add(interaction)
                        session.commit()
                    elif source not in interaction.sources:
                        interaction.sources.append(source)

                    # the interaction may already have existed with its
                    # interactors in the opposite order; line up
                    # interactor_a / interactor_b with the stored order so
                    # each reference id matches the right interactor
                    if interaction.interactors[0] == first:
                        interactor_a = interactor_pair[0][1]
                        interactor_b = interactor_pair[1][1]
                    else:
                        interactor_a = interactor_pair[1][1]
                        interactor_b = interactor_pair[0][1]

                    comment = (interactor_pair[0][1]
                               + interaction_row["INTERACTION_TYPE"]
                               + interactor_pair[1][1])

                    # one reference per pmid listed for the interaction
                    pmids = interaction_row["INTERACTION_PUBMED_ID"].split(';')
                    for pmid in pmids:
                        reference = session.query(
                            InteractionReference).filter_by(
                                pmid=pmid,
                                source_db='ecocyc',
                                comment=comment,
                                interactor_a=interactor_a,
                                interactor_b=interactor_b).first()
                        if reference is None:
                            reference = InteractionReference(
                                pmid=pmid,
                                source_db='ecocyc',
                                comment=comment,
                                interactor_a=interactor_a,
                                interactor_b=interactor_b)
                            interaction.references.append(reference)
                            reference.sources.append(source)
                        else:
                            if interaction not in reference.interactions:
                                reference.interactions.append(interaction)
                            if source not in reference.sources:
                                reference.sources.append(source)

    session.commit()
    print('ecocyc', session.query(Interaction).count())


def _resolve_ecocyc_participant(participant_id, strain, session):
    """Resolve one SIF participant to known interactors or a new metabolite.

    Returns a ``(interactors, new_metabolite_id)`` tuple. ``interactors`` is
    a list of ``[interactor, reference_id]`` pairs; ``new_metabolite_id`` is
    the EcoCyc id of a metabolite that is not in the database yet, else None.
    """
    # ids missing from ecocyc_compounds are treated as E. coli uniprot ids
    if participant_id not in ecocyc_compounds:
        orthologs = session.query(OrthologEcoli).filter_by(
            ortholog_uniprot=participant_id, strain_protein=strain).all()
        # keep the ortholog id next to the strain protein; it is needed
        # later to create the interaction reference
        return [[ortholog.protein, ortholog.ortholog_id]
                for ortholog in orthologs], None

    ecocyc_id = ecocyc_compounds[participant_id]['ecocyc']
    metabolite = session.query(Metabolite).filter_by(ecocyc=ecocyc_id).first()
    if metabolite is not None:
        return [[metabolite, metabolite.name]], None
    # do not create the metabolite yet: if the other participant turns out
    # to be invalid there is no need for it
    return [], ecocyc_id


def _get_or_create_ecocyc_metabolite(compound_key, ecocyc_id, session):
    """Return the metabolite with ``ecocyc_id``, adding it to the session if new."""
    # re-query every time: an earlier pair from the same row may already
    # have added this metabolite
    metabolite = session.query(Metabolite).filter_by(ecocyc=ecocyc_id).first()
    if metabolite is None:
        compound = ecocyc_compounds[compound_key]
        metabolite = Metabolite(
            id=ecocyc_id,
            name=compound_key,
            ecocyc=ecocyc_id,
            pubchem=compound['pubchem'],
            kegg=compound['kegg'],
            cas=compound['cas'],
            chebi=compound['chebi'])
        session.add(metabolite)
    return metabolite
def parse_ecoli_imex(session):
    """Create ortholog-derived interactions from the E. coli IMEx PSICQUIC file.

    Reads ``Data/Ecoli/PSICQUIC/IMEx.txt`` as a tab-delimited file. Interactor
    B must resolve (by uniprotkb or refseq id) to at least one
    ``OrthologEcoli`` row; interactor A may resolve to orthologs or, for a
    ``chebi`` id, to a ``Metabolite`` (created if missing). Existing
    interactions get ``'cfe'`` recorded in ``ortholog_derived``; new ones are
    created with ``ortholog_derived='fe'``.

    Args:
        session: database session used for all queries, adds and commits.
    """
    with open('Data/Ecoli/PSICQUIC/IMEx.txt') as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t')
        for row in reader:
            # rows with a missing interactor cannot produce an interaction
            if (row['#ID(s) interactor A'] == '-'
                    or row['ID(s) interactor B'] == '-'):
                continue

            orthologs_B = []
            id_B = row['ID(s) interactor B'].split(':')
            if id_B[0] == 'uniprotkb':
                orthologs_B = session.query(OrthologEcoli).filter(
                    OrthologEcoli.ortholog_uniprot == id_B[1]).all()
            elif id_B[0] == 'refseq':
                orthologs_B = session.query(OrthologEcoli).filter(
                    OrthologEcoli.ortholog_refseq == id_B[1]).all()
            if not orthologs_B:
                continue

            orthologs_A = []
            metabolite = None
            id_A = row['#ID(s) interactor A'].split(':')
            if id_A[0] == 'uniprotkb':
                orthologs_A = session.query(OrthologEcoli).filter(
                    OrthologEcoli.ortholog_uniprot == id_A[1]).all()
            elif id_A[0] == 'refseq':
                orthologs_A = session.query(OrthologEcoli).filter(
                    OrthologEcoli.ortholog_refseq == id_A[1]).all()
            elif id_A[0] == 'chebi':
                # NOTE(review): id_A[1] is only the text between the first
                # and second ':' of the field - confirm that this is the
                # ChEBI id as stored in Metabolite.chebi
                metabolite = session.query(Metabolite).filter(
                    Metabolite.chebi == id_A[1]).first()
                if metabolite is None:
                    metabolite = Metabolite(id=id_A[1], chebi=id_A[1])
                    session.add(metabolite)
                    session.commit()

            # each entry: [[interactor, reference id], [interactor, ref id]]
            interactors = []
            for ortholog_A in orthologs_A:
                for ortholog_B in orthologs_B:
                    # only pair orthologs that belong to the same strain
                    if ortholog_A.strain_protein == ortholog_B.strain_protein:
                        interactors.append(
                            [[ortholog_A.protein, ortholog_A.ortholog_id],
                             [ortholog_B.protein, ortholog_B.ortholog_id]])
            if metabolite is not None:
                for ortholog_B in orthologs_B:
                    interactors.append(
                        [[metabolite, metabolite.id],
                         [ortholog_B.protein, ortholog_B.ortholog_id]])

            for interactor_pair in interactors:
                first = interactor_pair[0][0]
                second = interactor_pair[1][0]
                homogenous = (first == second)
                interaction = session.query(Interaction).filter(
                    Interaction.interactors.contains(first),
                    Interaction.interactors.contains(second),
                    Interaction.homogenous == homogenous).first()

                if interaction is not None:
                    if interaction.ortholog_derived is None:
                        interaction.ortholog_derived = 'cfe'
                    elif 'fe' not in interaction.ortholog_derived:
                        interaction.ortholog_derived += ', cfe'
                    session.commit()
                else:
                    # take the strain from whichever interactor is a protein
                    if first.type == 'p':
                        strain = first.strain
                    else:
                        strain = second.strain
                    # homogenous is stored explicitly because the lookup
                    # above filters on it
                    interaction = Interaction(
                        strain=strain,
                        homogenous=homogenous,
                        interactors=[first, second],
                        type=(first.type + '-' + second.type),
                        ortholog_derived='fe')
                    session.add(interaction)
                    session.commit()

                # TODO: no InteractionReference / InteractionSource rows are
                # created for IMEx interactions in this function.

    session.commit()
    print(session.query(Interaction).count())
def parse_ecoli_uniprot(session):
    """Create ortholog-derived interactions from the E. coli UniProt PSICQUIC file.

    Reads ``Ecoli/PSICQUIC/UniProt.txt`` as a tab-delimited file. Interactor B
    must resolve (by uniprotkb id) to at least one ``OrthologEcoli`` row;
    interactor A may resolve to orthologs or, for a ``chebi`` id, to a
    ``Metabolite`` (created if missing). For every resulting pair the
    interaction is looked up or created, an ``InteractionReference`` is
    added, and a ``'UniProt'`` ``InteractionSource`` is added if missing.

    Args:
        session: database session used for all queries, adds and commits.
    """
    # NOTE(review): the other PSICQUIC parsers in this file read from
    # 'Data/Ecoli/PSICQUIC/'; confirm this path is intentionally different.
    with open('Ecoli/PSICQUIC/UniProt.txt') as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t')
        for row in reader:
            orthologs_B = []
            id_B = row['ID(s) interactor B'].split(':')
            if id_B[0] == 'uniprotkb':
                orthologs_B = session.query(OrthologEcoli).filter(
                    OrthologEcoli.ortholog_uniprot == id_B[1]).all()
            if not orthologs_B:
                continue

            orthologs_A = []
            metabolite = None
            id_A = row['#ID(s) interactor A'].split(':')
            if id_A[0] == 'uniprotkb':
                orthologs_A = session.query(OrthologEcoli).filter(
                    OrthologEcoli.ortholog_uniprot == id_A[1]).all()
            elif id_A[0] == 'chebi':
                metabolite = session.query(Metabolite).filter(
                    Metabolite.chebi == id_A[1]).first()
                if metabolite is None:
                    metabolite = Metabolite(id=id_A[1], chebi=id_A[1])
                    session.add(metabolite)
                    session.commit()

            # each entry: [[interactor, reference id], [interactor, ref id]]
            interactors = []
            for ortholog_A in orthologs_A:
                for ortholog_B in orthologs_B:
                    # only pair orthologs that belong to the same strain
                    if ortholog_A.strain_protein == ortholog_B.strain_protein:
                        interactors.append(
                            [[ortholog_A.protein, ortholog_A.ortholog_id],
                             [ortholog_B.protein, ortholog_B.ortholog_id]])
            if metabolite is not None:
                for ortholog_B in orthologs_B:
                    interactors.append(
                        [[metabolite, metabolite.id],
                         [ortholog_B.protein, ortholog_B.ortholog_id]])

            detection = row['Interaction detection method(s)']
            for interactor_pair in interactors:
                first = interactor_pair[0][0]
                second = interactor_pair[1][0]
                homogenous = (first == second)
                interaction = session.query(Interaction).filter(
                    Interaction.interactors.contains(first),
                    Interaction.interactors.contains(second),
                    Interaction.homogenous == homogenous).first()

                if interaction is not None:
                    if interaction.ortholog_derived is None:
                        interaction.ortholog_derived = 'cfe'
                    elif 'fe' not in interaction.ortholog_derived:
                        interaction.ortholog_derived += ', cfe'
                    session.commit()
                else:
                    # the first interactor may be the metabolite, so take
                    # the strain from whichever interactor is a protein
                    if first.type == 'p':
                        strain = first.strain
                    else:
                        strain = second.strain
                    # the type string is built from both interactors' .type;
                    # homogenous is stored because the lookup filters on it
                    interaction = Interaction(
                        strain=strain,
                        homogenous=homogenous,
                        interactors=[first, second],
                        type=(first.type + '-' + second.type),
                        ortholog_derived='fe')
                    if 'MI:' in detection:
                        # the 4 characters after 'MI:' are passed as the
                        # PSI-MI detection method id
                        if is_experimental_psimi(
                                detection.split('MI:')[1][:4]):
                            interaction.is_experimental = 1
                    session.add(interaction)
                    session.commit()

                # line up the reference ids with the stored interactor order
                if interaction.interactors[0] == first:
                    interactor_a = interactor_pair[0][1]
                    interactor_b = interactor_pair[1][1]
                else:
                    interactor_a = interactor_pair[1][1]
                    interactor_b = interactor_pair[0][1]

                # NOTE(review): the splits below raise IndexError when a
                # column lacks the expected 'MI:' / '(' / 'pubmed:' marker.
                first_author = row['Publication 1st author(s)']
                interaction_type = row['Interaction type(s)']
                source_db = row['Source database(s)']
                reference = InteractionReference(
                    interaction_id=interaction.id,
                    psimi_detection=detection.split('MI:')[1][:4],
                    detection_method=detection.split('(')[1][:-1],
                    author_ln=first_author.split(' ')[0],
                    pub_date=first_author.split('(')[1][:-1],
                    pmid=row['Publication Identifier(s)'].split(
                        'pubmed:')[1].split('|')[0],
                    psimi_type=interaction_type.split('MI:')[1][:4],
                    interaction_type=interaction_type.split('(')[1][:-1],
                    psimi_db=source_db.split('MI:')[1][:4],
                    source_db=source_db.split('(')[1][:-1],
                    confidence_score=row['Confidence value(s)'],
                    interactor_a_id=interactor_a,
                    interactor_b_id=interactor_b)
                session.add(reference)

                source = session.query(InteractionSource).filter(
                    InteractionSource.interaction_id == interaction.id,
                    InteractionSource.data_source == 'UniProt').first()
                if source is None:
                    source = InteractionSource(interaction_id=interaction.id,
                                               data_source='UniProt')
                    session.add(source)

    session.commit()
    print(session.query(Interaction).count())
def parse_ecoli_bindingdb(session):
    """Create ortholog-derived protein-metabolite interactions from BindingDB.

    Reads ``Data/Ecoli/PSICQUIC/BindingDB.txt`` as a tab-delimited file.
    Interactor B must carry a uniprotkb id that resolves to at least one
    ``OrthologEcoli`` row. Interactor A is treated as a metabolite, looked up
    by ChEBI id and then by PubChem id, and created if neither lookup finds
    it. For every ortholog the ``'p-m'`` interaction is looked up or created,
    an ``InteractionReference`` is added, and a ``'BindingDB'``
    ``InteractionSource`` is added if missing.

    Args:
        session: database session used for all queries, adds and commits.
    """
    with open('Data/Ecoli/PSICQUIC/BindingDB.txt') as csvfile:
        # tab-delimited, like the other PSICQUIC files read in this module
        reader = csv.DictReader(csvfile, delimiter='\t')
        # iterate through each interaction
        for row in reader:
            # interactor B must have a uniprot id
            if 'uniprotkb' not in row['ID(s) interactor B']:
                continue
            uniprot_protein = row['ID(s) interactor B'].split(
                'uniprotkb:')[1].split('|')[0]

            orthologs = [
                [ecoli_ortholog.protein, ecoli_ortholog.ortholog_id]
                for ecoli_ortholog in session.query(OrthologEcoli).filter(
                    OrthologEcoli.ortholog_uniprot == uniprot_protein).all()
            ]
            if not orthologs:
                continue

            chebi_metabolite, pubchem_metabolite = None, None
            # check if interactor A has ChEBI id
            for metabolite_id in row['#ID(s) interactor A'].split('|'):
                id_parts = metabolite_id.split(':')
                if id_parts[0] == 'chebi':
                    # NOTE(review): this keeps only the text between the
                    # first and second ':' minus its first and last
                    # character - confirm against the file's ChEBI format
                    chebi_metabolite = id_parts[1][1:-1]

            metabolite = None
            # if interactor A has ChEBI id, query for matching metabolite
            if chebi_metabolite is not None:
                metabolite = session.query(Metabolite).filter(
                    Metabolite.chebi == chebi_metabolite).first()

            # if unable to identify metabolite based on ChEBI id, try using
            # pubchem id (the last pubchem id listed wins)
            if metabolite is None:
                for alt_id in row['Alt. ID(s) interactor A'].split('|'):
                    id_parts = alt_id.split(':')
                    if id_parts[0] == 'pubchem':
                        pubchem_metabolite = id_parts[1]
                if pubchem_metabolite is not None:
                    metabolite = session.query(Metabolite).filter(
                        Metabolite.id == pubchem_metabolite).first()

            # if unable to find interactor A in database, create new
            # metabolite
            if metabolite is None:
                # prefer the pubchem id as primary id and fall back to the
                # ChEBI id so that the metabolite never gets id=None
                if pubchem_metabolite is not None:
                    new_id = pubchem_metabolite
                else:
                    new_id = chebi_metabolite
                if new_id is None:
                    # interactor A has no usable id: nothing to link to
                    continue
                metabolite = Metabolite(id=new_id,
                                        pubchem=pubchem_metabolite,
                                        chebi=chebi_metabolite)
                session.add(metabolite)
                session.commit()

            detection = row['Interaction detection method(s)']
            for interactor in orthologs:
                protein, ortholog_id = interactor
                interaction = session.query(Interaction).filter(
                    Interaction.interactors.contains(protein),
                    Interaction.interactors.contains(metabolite)).first()

                if interaction is not None:
                    if interaction.ortholog_derived is None:
                        interaction.ortholog_derived = 'cfe'
                    elif 'fe' not in interaction.ortholog_derived:
                        interaction.ortholog_derived += ', cfe'
                    session.commit()
                else:
                    # the strain comes from the protein object, not from the
                    # [protein, ortholog_id] pair; a protein-metabolite
                    # interaction is never homogenous
                    interaction = Interaction(
                        strain=protein.strain,
                        homogenous=False,
                        interactors=[metabolite, protein],
                        type='p-m',
                        ortholog_derived='fe')
                    # should ortholog interactions be marked as experimental?
                    if 'MI:' in detection and is_experimental_psimi(
                            detection.split('MI:')[1][:4]):
                        interaction.is_experimental = 1
                    session.add(interaction)
                    session.commit()

                # line up the reference ids with the stored interactor order
                if interaction.interactors[0] == metabolite:
                    interactor_a = metabolite.id
                    interactor_b = ortholog_id
                else:
                    interactor_a = ortholog_id
                    interactor_b = metabolite.id

                author, date, pmid = None, None, None
                first_author = row['Publication 1st author(s)']
                if first_author != '-':
                    author = first_author.split(' ')[0]
                    date = first_author.split('(')[1][:-1]
                if 'pubmed:' in row['Publication Identifier(s)']:
                    # cut at the next '|' so a trailing separator is never
                    # stored as part of the pmid
                    pmid = row['Publication Identifier(s)'].split(
                        'pubmed:')[1].split('|')[0]

                reference = InteractionReference(
                    interaction_id=interaction.id,
                    detection_method=detection.split('(')[1][:-1],
                    author_ln=author,
                    pmid=pmid,
                    pub_date=date,
                    interaction_type=row['Interaction type(s)'].split(
                        '(')[1][:-1],
                    source_db=row['Source database(s)'].split('(')[1][:-1],
                    confidence=row['Confidence value(s)'].split('(')[0],
                    interactor_a=interactor_a,
                    interactor_b=interactor_b)

                source = session.query(InteractionSource).filter(
                    InteractionSource.interaction_id == interaction.id,
                    InteractionSource.data_source == 'BindingDB').first()
                if source is None:
                    source = InteractionSource(interaction_id=interaction.id,
                                               data_source='BindingDB')
                    session.add(source)
                session.add(reference)

    session.commit()
def parse_kegg(org_id, strain, sourcedb, session):
    """Create interactions for ``strain`` from the KEGG pathways of ``org_id``.

    Lists the organism's pathways via ``kegg_list``, fetches each pathway's
    KGML via ``kegg_get``/``read`` and walks its relations (``maplink``
    relations and relations with an ``undefined`` entry are skipped). Relation
    entries are resolved to existing proteins, to metabolites (created on
    demand from ``kegg_compounds``) or, when ``org_id`` is ``'eco'``, to the
    strain orthologs of the E. coli protein. Protein-protein and
    protein-metabolite pairs, plus protein-intermediate-compound pairs, are
    stored as ``Interaction`` rows with a per-pathway ``InteractionReference``.

    Args:
        org_id: KEGG organism code; ``'eco'`` switches to ortholog mapping
            and marks new interactions as ``ortholog_derived='Ecoli'``.
        strain: stored as ``Interaction.strain`` and matched against
            ``OrthologEcoli.strain_protein`` for ``'eco'``.
        sourcedb: ``InteractionSource.data_source`` value to attach.
        session: database session used for all queries, adds and commits.
    """
    # the source row is the same for every interaction: fetch it once
    source = session.query(InteractionSource).filter_by(
        data_source=sourcedb).first()

    # get pathways for organism specified by org_id
    pathways = kegg_list(database='pathway', org=org_id).read().split('path:')
    # the first 8 characters of each chunk are kept as the pathway id
    path_ids = [path[:8] for path in pathways if path != '']

    # iterate through each path and obtain interactions
    for path in path_ids:
        # get kgml representation of path
        kgml_path = read(kegg_get(path, option='kgml'))
        path_name = kgml_path._getname()
        reference_comment = 'in ' + path_name + ' path'

        # dictionary of compounds in current path (node_id: kegg_id)
        # compound._getid() returns node id (only relevant in context of
        # current path); compound._getname() returns kegg id (relevant in
        # overall KEGG DB)
        compound_ids = {compound._getid(): compound._getname()[-6:]
                        for compound in kgml_path.compounds}

        # go through each relation in path
        for relation in kgml_path.relations:
            # ignore maplink relations
            if relation.element.attrib['type'] == 'maplink':
                continue

            # relation._getentry1/2() returns protein id (locus) or compound
            # id (KEGG id)
            entries = [relation._getentry1()._getname(),
                       relation._getentry2()._getname()]
            # if one or both interactors are listed as undefined, move on
            if entries[0] == 'undefined' or entries[1] == 'undefined':
                continue

            # existing interactors, per relation side
            interactors = [[], []]
            # kegg ids of metabolites not yet in the database, per side
            new_metabolites = [[], []]

            for num in range(2):
                # each entry may contain >1 id; go through all of them
                for entry_id in entries[num].split(' '):
                    if entry_id == '':
                        continue
                    id_parts = entry_id.split(':')
                    prefix, local_id = id_parts[0], id_parts[1]
                    is_compound = local_id in kegg_compounds
                    # if interactor is not protein or compound, continue
                    if prefix != org_id and not is_compound:
                        continue

                    if is_compound:
                        interactor = session.query(Metabolite).filter_by(
                            kegg=local_id).first()
                        if interactor is None:
                            # remember the kegg id; the metabolite is only
                            # created once it ends up in a valid pair
                            new_metabolites[num].append(local_id)
                        else:
                            interactors[num].append(
                                [interactor, interactor.id])
                    elif org_id != 'eco':
                        interactor = session.query(Interactor).get(local_id)
                        if interactor is not None:
                            # None instead of an id: the interactor is not
                            # an ortholog, so the reference stores no id
                            interactors[num].append([interactor, None])
                    else:
                        # parsing E. coli path: add all orthologs, keeping
                        # the ecoli protein id for the reference later
                        orthologs = session.query(OrthologEcoli).filter_by(
                            ortholog_id=local_id,
                            strain_protein=strain).all()
                        for ortholog in orthologs:
                            interactors[num].append(
                                [ortholog.protein, local_id])

            interactor_pairs = []
            # pairs of interactors which already exist in the db; m-m pairs
            # are ignored
            for interactor1 in interactors[0]:
                for interactor2 in interactors[1]:
                    if (interactor1[0].type != 'm'
                            or interactor2[0].type != 'm'):
                        interactor_pairs.append([interactor1, interactor2])

            # pairs of an existing interactor on one side and a new
            # metabolite on the other side
            for known_side, new_side in ((0, 1), (1, 0)):
                for interactor1 in interactors[known_side]:
                    # ignore pairs which would result in m-m interactions
                    if interactor1[0].type == 'm':
                        continue
                    for new_kegg_id in new_metabolites[new_side]:
                        # an earlier pair may already have added this
                        # metabolite, so query before creating it
                        metabolite = session.query(Metabolite).filter_by(
                            kegg=new_kegg_id).first()
                        if metabolite is None:
                            compound = kegg_compounds[new_kegg_id]
                            metabolite = Metabolite(
                                id=new_kegg_id,
                                kegg=new_kegg_id,
                                pubchem=compound['pubchem'],
                                chebi=compound['chebi'])
                            session.add(metabolite)
                        interactor_pairs.append(
                            [interactor1, [metabolite, metabolite.id]])

            # if no interactor pairs were found, move on to the next relation
            if not interactor_pairs:
                continue

            # get all intermediates in reaction of type compound
            intermeds = []
            for subtype in relation.element.iter(tag='subtype'):
                # read the node id per subtype so that a subtype without a
                # 'compound' attribute never reuses the previous one's id
                compound_node_id = subtype.attrib.get('compound')
                if compound_node_id is None:
                    continue
                # skip node ids that are not compounds of this path
                if int(compound_node_id) not in compound_ids:
                    continue
                kegg_id = compound_ids[int(compound_node_id)]
                metabolite = session.query(Metabolite).filter_by(
                    kegg=kegg_id).first()
                if metabolite is None:
                    compound = kegg_compounds[kegg_id]
                    metabolite = Metabolite(
                        id=kegg_id,
                        name=compound['name'],
                        pubchem=compound['pubchem'],
                        chebi=compound['chebi'],
                        kegg=kegg_id)
                    session.add(metabolite)
                intermeds.append([metabolite, metabolite.id])

            # add protein - intermediate interactor pairs
            for interactor_list in interactors:
                for interactor in interactor_list:
                    if interactor[0].type != 'm':
                        for intermed in intermeds:
                            interactor_pairs.append([interactor, intermed])

            # go through each pair and add interaction if it doesnt exist yet
            for interactor_pair in interactor_pairs:
                first = interactor_pair[0][0]
                second = interactor_pair[1][0]
                homogenous = (first == second)
                interaction = session.query(Interaction).filter(
                    Interaction.interactors.contains(first),
                    Interaction.interactors.contains(second),
                    Interaction.homogenous == homogenous).first()

                if interaction is None:
                    interaction = Interaction(
                        type=first.type + '-' + second.type,
                        strain=strain,
                        homogenous=homogenous,
                        interactors=[first, second])
                    interaction.sources.append(source)
                    if org_id == 'eco':
                        interaction.ortholog_derived = 'Ecoli'
                    session.add(interaction)
                    session.commit()
                elif source not in interaction.sources:
                    interaction.sources.append(source)

                # reference ids are only stored for E. coli derived
                # interactions; line them up with the stored interactor
                # order so each id matches the right interactor
                interactor_a, interactor_b = None, None
                if org_id == 'eco':
                    if interaction.interactors[0] == first:
                        interactor_a = interactor_pair[0][1]
                        interactor_b = interactor_pair[1][1]
                    else:
                        interactor_a = interactor_pair[1][1]
                        interactor_b = interactor_pair[0][1]

                reference = session.query(InteractionReference).filter_by(
                    source_db='kegg',
                    comment=reference_comment,
                    interactor_a=interactor_a,
                    interactor_b=interactor_b).first()
                if reference is None:
                    reference = InteractionReference(
                        source_db='kegg',
                        comment=reference_comment,
                        interactor_a=interactor_a,
                        interactor_b=interactor_b)
                    interaction.references.append(reference)
                    reference.sources.append(source)
                else:
                    if interaction not in reference.interactions:
                        reference.interactions.append(interaction)
                    if source not in reference.sources:
                        reference.sources.append(source)

    session.commit()
    print(sourcedb, session.query(Interaction).count())
def _psimi_protein_orthologs(session, interactor_field):
    """Look up OrthologEcoli rows for one PSI-MI interactor column.

    Args:
        session: database session used to query OrthologEcoli.
        interactor_field: raw text of the interactor id column; may be None
            or empty when the row is short.

    Returns:
        A tuple (orthologs, has_protein_id). orthologs is a non-empty list of
        OrthologEcoli rows, or None when nothing matched. has_protein_id is
        True when the column carried a uniprotkb or refseq id.
    """
    field = interactor_field or ''
    uniprot, refseq = None, None
    if 'uniprotkb' in field:
        uniprot = field.split('uniprotkb:')[1].split('|')[0]
    if 'refseq' in field:
        refseq = field.split('refseq:')[1].split('|')[0]

    orthologs = []
    if uniprot is not None:
        orthologs = session.query(OrthologEcoli).filter_by(
            ortholog_uniprot=uniprot).all()
    # Query.all() returns an empty list (never None) when nothing matches, so
    # the refseq fallback has to test for emptiness.
    if not orthologs and refseq is not None:
        orthologs = session.query(OrthologEcoli).filter_by(
            ortholog_refseq=refseq).all()

    has_protein_id = (uniprot is not None) or (refseq is not None)
    # normalise "no match" to None so callers can use a single `is None` test
    return (orthologs or None), has_protein_id


def _psimi_metabolite_ids(interactor_field, alt_id_field):
    """Extract (chebi, pubchem) ids from an interactor's id and altID columns.

    Either element of the returned tuple is None when that id is absent.
    """
    field = interactor_field or ''
    alt_ids = alt_id_field or ''
    chebi, pubchem = None, None
    # The [:-1] drops the last character of the extracted token; presumably
    # the closing quote of a chebi:"CHEBI:<n>" entry - confirm against input.
    if 'chebi' in field:
        chebi = field.split('CHEBI:')[1].split('|')[0][:-1]
    if 'pubchem' in alt_ids:
        pubchem = alt_ids.split('pubchem:')[1].split('|')[0]
    if chebi is None and 'chebi' in alt_ids:
        chebi = alt_ids.split('CHEBI:')[1].split('|')[0][:-1]
    return chebi, pubchem


def _psimi_get_or_create_metabolite(session, chebi, pubchem):
    """Return the Metabolite matching chebi (preferred) or pubchem.

    If no metabolite matches, a new one is created and added to the session.
    If one is found, its missing chebi/pubchem attributes are filled in.
    """
    metabolite = None
    if chebi is not None:
        metabolite = session.query(Metabolite).filter_by(chebi=chebi).first()
    if metabolite is None and pubchem is not None:
        metabolite = session.query(Metabolite).filter_by(
            pubchem=pubchem).first()

    if metabolite is None:
        # preferentially use the chebi id as the id of a new metabolite and
        # only fall back to pubchem when there is no chebi id at all
        metabolite_id = chebi if chebi is not None else pubchem
        metabolite = Metabolite(id=metabolite_id, chebi=chebi, pubchem=pubchem)
        session.add(metabolite)
    else:
        if metabolite.pubchem is None:
            metabolite.pubchem = pubchem
        if metabolite.chebi is None:
            metabolite.chebi = chebi
    return metabolite


def parse_psimi(session, file, source):
    """Create Ecoli-ortholog-derived interactions from a PSI-MI TAB file.

    Each row is read with the module-level `cols` as field names. Protein
    interactors are mapped to OrthologEcoli rows by uniprot id, then refseq
    id; an interactor with no protein id is treated as a metabolite if it has
    a chebi or pubchem id. For every resulting interactor pair the
    Interaction, InteractionSource and InteractionReference rows are created
    or updated.

    Rows are skipped when a protein interactor has no orthologs, when neither
    interactor is a protein with orthologs, or when a non-protein interactor
    has no chebi/pubchem id.

    Args:
        session: database session used for all queries and commits.
        file: path of the tab-delimited PSI-MI file.
        source: data source name stored on InteractionSource.data_source.
    """
    # newline='' is what the csv module asks for when handing it a file object
    with open(file, newline='') as csvfile:
        reader = csv.DictReader(csvfile, fieldnames=cols, delimiter='\t')
        # iterate through each interaction
        for row in reader:
            # a protein id without any ortholog means this interaction cannot
            # be transferred, so move on before creating anything
            orthologs_A, has_protein_id_A = _psimi_protein_orthologs(
                session, row['interactor_A'])
            if orthologs_A is None and has_protein_id_A:
                continue
            orthologs_B, has_protein_id_B = _psimi_protein_orthologs(
                session, row['interactor_B'])
            if orthologs_B is None and has_protein_id_B:
                continue
            # no protein interactor at all for this interaction
            if orthologs_A is None and orthologs_B is None:
                continue

            # if one side has no orthologs (and no protein id), it may be a
            # metabolite; the other side then supplies the protein orthologs
            metabolite, orthologs = None, None
            if orthologs_A is None or orthologs_B is None:
                if orthologs_A is None:
                    chebi, pubchem = _psimi_metabolite_ids(
                        row['interactor_A'], row['altID_A'])
                    orthologs = orthologs_B
                else:
                    chebi, pubchem = _psimi_metabolite_ids(
                        row['interactor_B'], row['altID_B'])
                    orthologs = orthologs_A
                # the non-protein interactor could not be identified
                if chebi is None and pubchem is None:
                    continue
                # Safe to create a metabolite here: the other interactor is
                # guaranteed to have at least one protein ortholog.
                metabolite = _psimi_get_or_create_metabolite(
                    session, chebi, pubchem)

            # list of interactor pairs, each interactor being
            # [database object, id used in the interaction reference]
            interactors = []
            if metabolite is None:
                # p-p interaction: pair up orthologs whose protein strains match
                for ortholog_A in orthologs_A:
                    for ortholog_B in orthologs_B:
                        if ortholog_A.strain_protein == ortholog_B.strain_protein:
                            interactors.append(
                                [[ortholog_A.protein, ortholog_A.ortholog_id],
                                 [ortholog_B.protein, ortholog_B.ortholog_id]])
            else:
                # pair the metabolite with every protein ortholog
                for ortholog in orthologs:
                    interactors.append(
                        [[metabolite, metabolite.id],
                         [ortholog.protein, ortholog.ortholog_id]])

            # for each interactor pair, create interaction if it doesnt exist,
            # otherwise update attributes
            for interactor_pair in interactors:
                first, second = interactor_pair[0][0], interactor_pair[1][0]
                homogenous = (first == second)
                interaction = session.query(Interaction).filter(
                    Interaction.interactors.contains(first),
                    Interaction.interactors.contains(second),
                    Interaction.homogenous == homogenous).first()
                if interaction is None:
                    # since one of the interactors may be a metabolite, set
                    # strain to match strain of protein
                    if first.type == 'p':
                        strain = first.strain
                    else:
                        strain = second.strain
                    # if interaction did not exist, set it to Ecoli ortholog
                    # derived; store the homogenous flag so the lookup above
                    # can match this interaction on later rows
                    interaction = Interaction(
                        strain=strain,
                        interactors=[first, second],
                        type=(first.type + '-' + second.type),
                        homogenous=homogenous,
                        ortholog_derived='Ecoli')
                    session.add(interaction)
                    session.commit()

                ref_parameter_list = get_psimi_ref_list(row)

                # in case the interaction already existed, make sure
                # interactor_a and interactor_b for the reference line up with
                # the first and second interactors of the existing interaction
                if interaction.interactors[0] == first:
                    interactor_a = interactor_pair[0][1]
                    interactor_b = interactor_pair[1][1]
                else:
                    interactor_b = interactor_pair[0][1]
                    interactor_a = interactor_pair[1][1]

                is_experimental = is_experimental_interaction(row)
                # check to see if source exists
                nsource = session.query(InteractionSource).filter_by(
                    data_source=source,
                    is_experimental=is_experimental).first()
                if nsource is None:
                    # source doesn't exist: create it and attach it
                    nsource = InteractionSource(
                        data_source=source, is_experimental=is_experimental)
                    interaction.sources.append(nsource)
                elif nsource not in interaction.sources:
                    interaction.sources.append(nsource)

                # go through each reference in the ref_parameter list, search
                # for it, and if it doesnt exist create it
                for ref in ref_parameter_list:
                    nref = session.query(InteractionReference).filter_by(
                        detection_method=ref[0], author_ln=ref[1],
                        pub_date=ref[2], pmid=ref[3],
                        interaction_type=ref[4], source_db=ref[5],
                        confidence=ref[6], interactor_a=interactor_a,
                        interactor_b=interactor_b).first()
                    if nref is None:
                        # create the reference and link it to the interaction
                        # and the source
                        nref = InteractionReference(
                            detection_method=ref[0], author_ln=ref[1],
                            pub_date=ref[2], pmid=ref[3],
                            interaction_type=ref[4], source_db=ref[5],
                            confidence=ref[6], interactor_a=interactor_a,
                            interactor_b=interactor_b)
                        interaction.references.append(nref)
                        nref.sources.append(nsource)
                    else:
                        # reference exists: link the interaction and source to
                        # it if they aren't linked already
                        if interaction not in nref.interactions:
                            nref.interactions.append(interaction)
                        if nsource not in nref.sources:
                            nref.sources.append(nsource)
    session.commit()
    print(source, session.query(Interaction).count())