def parse(session):
    """Import the Zhang interaction predictions for PAO1.

    Reads 'Data/PAO1/Zhang.csv'. For every row with a 'Confidence' of at
    least 0.9 whose 'Protein1' and 'Protein2' ids both resolve to an existing
    Interactor, makes sure an Interaction between the two exists and is
    linked to the 'Zhang' source and to a reference carrying the row's
    confidence and comment. Prints the total interaction count when done.

    Args:
        session: database session used for all queries and commits.
    """
    with open('Data/PAO1/Zhang.csv') as csvfile:
        # every row shares the same source, so it is created once up front
        zhang_source = InteractionSource(data_source='Zhang', is_experimental=0)
        session.add(zhang_source)
        session.commit()

        for record in csv.DictReader(csvfile):
            # low-confidence predictions are ignored
            if float(record['Confidence']) < 0.9:
                continue

            protein_a = session.query(Interactor).get(record['Protein1'])
            if protein_a is None:
                continue
            protein_b = session.query(Interactor).get(record['Protein2'])
            if protein_b is None:
                continue

            same_interactor = (protein_a == protein_b)
            found = session.query(Interaction).filter(
                Interaction.interactors.contains(protein_a),
                Interaction.interactors.contains(protein_b),
                Interaction.homogenous == same_interactor).first()

            if found is not None:
                interaction = found
                if zhang_source not in interaction.sources:
                    interaction.sources.append(zhang_source)
            else:
                interaction = Interaction(
                    strain='PAO1',
                    homogenous=same_interactor,
                    type='p-p',
                    interactors=[protein_a, protein_b])
                interaction.sources.append(zhang_source)
                session.add(interaction)
                session.commit()

            # the fields that identify a reference are shared by the lookup
            # and by the constructor
            ref_key = dict(
                detection_method='computational prediction',
                pmid='22848443',
                interaction_type='predicted',
                confidence=record['Confidence'],
                comment=record['Comment'])
            reference = session.query(InteractionReference).filter_by(
                **ref_key).first()

            if reference is None:
                reference = InteractionReference(
                    author_ln='Zhang', pub_date='2012', **ref_key)
                interaction.references.append(reference)
                reference.sources.append(zhang_source)
                continue

            # existing reference: only add the links that are missing
            if reference not in interaction.references:
                interaction.references.append(reference)
            if zhang_source not in reference.sources:
                reference.sources.append(zhang_source)

        session.commit()
    print('zhang', session.query(Interaction).count())
def parse(session):
    """Import the XLinkDB cross-linking interactions for PAO1.

    Reads the tab-separated file 'Data/PAO1/xlinkdb.txt'. Each row names two
    proteins ('proA', 'proB'); each is looked up first as an Interactor id
    and then as a Protein UniProtKB accession. Rows where either lookup
    fails are skipped. All interactions share one source and one reference,
    both created before the rows are processed. Prints the total interaction
    count when done.

    Args:
        session: database session used for all queries and commits.
    """
    with open('Data/PAO1/xlinkdb.txt') as csvfile:
        rows = csv.DictReader(csvfile, delimiter='\t')

        xlink_reference = InteractionReference(
            detection_method='chemical cross-linking mass spectrometry',
            interaction_type='physical association',
            author_ln='Navari',
            pub_date='2015',
            pmid='25800553',
            source_db='xlinkdb')
        xlink_source = InteractionSource(data_source='XLinkDB',
                                         is_experimental=1)
        xlink_source.references.append(xlink_reference)
        session.add(xlink_source)
        session.add(xlink_reference)
        session.commit()

        for record in rows:
            # resolve both partners in order; stop at the first one that
            # cannot be found so the second lookup is not attempted
            partners = []
            for column in ('proA', 'proB'):
                match = session.query(Interactor).get(record[column])
                if match is None:
                    match = session.query(Protein).filter_by(
                        uniprotkb=record[column]).first()
                if match is None:
                    break
                partners.append(match)
            if len(partners) != 2:
                continue
            first, second = partners

            same_interactor = (first == second)
            interaction = session.query(Interaction).filter(
                Interaction.interactors.contains(first),
                Interaction.interactors.contains(second),
                Interaction.homogenous == same_interactor).first()

            if interaction is not None:
                # existing interaction: only add the links that are missing
                if xlink_reference not in interaction.references:
                    interaction.references.append(xlink_reference)
                if xlink_source not in interaction.sources:
                    interaction.sources.append(xlink_source)
                continue

            interaction = Interaction(
                strain='PAO1',
                homogenous=same_interactor,
                type='p-p',
                interactors=[first, second])
            interaction.references.append(xlink_reference)
            interaction.sources.append(xlink_source)
            session.add(interaction)
            session.commit()

        session.commit()
    print('xlinkdb', session.query(Interaction).count())
def parse(session):
    """Import the GeoffWinsor interactions for PAO1.

    'Data/PAO1/GeoffWinsor.csv' is read two rows at a time: the first row of
    a pair gives interactor A and the second gives interactor B (both via
    'locus_tag'). The reference fields ('experimental_type', 'pmid') are
    taken from the second row of the pair. Pairs in which either locus tag
    is not a known Interactor are skipped. Prints the total interaction
    count when done.

    Args:
        session: database session used for all queries and commits.
    """
    with open('Data/PAO1/GeoffWinsor.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        source = InteractionSource(data_source='Geoff', is_experimental=1)
        session.add(source)
        session.commit()

        for row_a in reader:
            # Always consume the partner row before deciding to skip, so a
            # missing interactor A cannot shift every later pair by one row.
            row_b = next(reader, None)
            if row_b is None:
                # trailing row without a partner: nothing left to pair up
                break

            interactor_A = session.query(Interactor).get(row_a['locus_tag'])
            if interactor_A is None:
                continue
            interactor_B = session.query(Interactor).get(row_b['locus_tag'])
            if interactor_B is None:
                continue

            homogenous = (interactor_A == interactor_B)
            interaction = session.query(Interaction).filter(
                Interaction.interactors.contains(interactor_A),
                Interaction.interactors.contains(interactor_B),
                Interaction.homogenous == homogenous).first()
            if interaction is None:
                interaction = Interaction(
                    strain='PAO1',
                    homogenous=homogenous,
                    type='p-p',
                    interactors=[interactor_A, interactor_B])
                interaction.sources.append(source)
                session.add(interaction)
                session.commit()
            elif source not in interaction.sources:
                interaction.sources.append(source)

            reference = session.query(InteractionReference).filter_by(
                detection_method=row_b['experimental_type'],
                pmid=row_b['pmid']).first()
            if reference is None:
                reference = InteractionReference(
                    detection_method=row_b['experimental_type'],
                    pmid=row_b['pmid'])
                interaction.references.append(reference)
                reference.sources.append(source)
            else:
                # existing reference: only add the links that are missing
                if interaction not in reference.interactions:
                    reference.interactions.append(interaction)
                if source not in reference.sources:
                    reference.sources.append(source)

        session.commit()
    print('geoff', session.query(Interaction).count())
def parse(session):
    """Import RegulonDB (E. coli) regulatory interactions via orthologs.

    For each row of 'Data/Ecoli/RegulonDB.csv', the 'TF name' (with its first
    letter lower-cased) and the 'Regulated gene' are matched against
    OrthologEcoli.ortholog_name. Every pair of orthologs whose
    strain_protein values match yields an interaction between their
    proteins, with one reference per evidence entry in the row. Prints the
    total interaction count when done.

    Args:
        session: database session used for all queries and commits.
    """
    with open('Data/Ecoli/RegulonDB.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        # since all the interactions from here will use the same source,
        # create and add it at the beginning
        # Note: since no references are available, is_experimental is set to 2
        source = InteractionSource(data_source='RegulonDB(Ecoli)',
                                   is_experimental=2)
        session.add(source)
        session.commit()

        for row in reader:
            # slicing (rather than indexing [0]) keeps an empty TF name from
            # raising IndexError; it simply matches no ortholog
            tf_name = row['TF name']
            orthologs_A = session.query(OrthologEcoli).filter_by(
                ortholog_name=tf_name[:1].lower() + tf_name[1:]).all()
            # .all() returns a list (never None), so test for emptiness
            if not orthologs_A:
                continue
            orthologs_B = session.query(OrthologEcoli).filter_by(
                ortholog_name=row['Regulated gene']).all()
            if not orthologs_B:
                continue

            # pair up the proteins of orthologs A and B whose strains match;
            # the ortholog ids are kept for the interaction references
            interactors = [
                [[ortholog_A.protein, ortholog_A.ortholog_id],
                 [ortholog_B.protein, ortholog_B.ortholog_id]]
                for ortholog_A in orthologs_A
                for ortholog_B in orthologs_B
                if ortholog_A.strain_protein == ortholog_B.strain_protein
            ]

            for (protein_1, ortholog_id_1), (protein_2, ortholog_id_2) \
                    in interactors:
                homogenous = (protein_1 == protein_2)
                interaction = session.query(Interaction).filter(
                    Interaction.interactors.contains(protein_1),
                    Interaction.interactors.contains(protein_2),
                    Interaction.homogenous == homogenous).first()
                if interaction is None:
                    # new interaction: mark it as derived from Ecoli.
                    # homogenous is stored because the lookup above filters
                    # on it (the other loaders set it the same way).
                    interaction = Interaction(
                        strain=protein_1.strain,
                        homogenous=homogenous,
                        interactors=[protein_1, protein_2],
                        type='p-p',
                        ortholog_derived='Ecoli')
                    interaction.sources.append(source)
                    session.add(interaction)
                    session.commit()
                elif source not in interaction.sources:
                    interaction.sources.append(source)

                # in case the interaction already existed, make interactor_a
                # and interactor_b follow the order of the existing
                # interaction's interactors, so each Pseudomonas interactor
                # lines up with its Ecoli ortholog
                if interaction.interactors[0] == protein_1:
                    interactor_a, interactor_b = ortholog_id_1, ortholog_id_2
                else:
                    interactor_a, interactor_b = ortholog_id_2, ortholog_id_1

                interaction_type = ('TF/sigma-binding site (' +
                                    row['Regulatory effect'] + 'regulation)')
                comment = (ortholog_id_1 + ' regulates(' +
                           row['Regulatory effect'] + ') ' + ortholog_id_2)

                # one reference per evidence entry; [1:-1] drops the first
                # and last character of the 'Evidence' field
                for evidence in row['Evidence'][1:-1].split(', '):
                    reference = session.query(InteractionReference).filter_by(
                        detection_method=evidence,
                        interaction_type=interaction_type,
                        source_db='regulondb',
                        confidence=row['Evidence type'],
                        comment=comment,
                        interactor_a=interactor_a,
                        interactor_b=interactor_b).first()
                    if reference is None:
                        reference = InteractionReference(
                            detection_method=evidence,
                            interaction_type=interaction_type,
                            comment=comment,
                            source_db='regulondb',
                            confidence=row['Evidence type'],
                            interactor_a=interactor_a,
                            interactor_b=interactor_b)
                        interaction.references.append(reference)
                        reference.sources.append(source)
                    else:
                        # existing reference: add only the missing links
                        if interaction not in reference.interactions:
                            interaction.references.append(reference)
                        if source not in reference.sources:
                            reference.sources.append(source)

        session.commit()
    print('regulondb', session.query(Interaction).count())
def parse_ecoli_imex(session):
    """Import E. coli IMEx interactions and map them onto ortholog proteins.

    Reads the tab-separated file 'Data/Ecoli/PSICQUIC/IMEx.txt'. Interactor
    B must be a uniprotkb or refseq id with at least one OrthologEcoli match.
    Interactor A may be a uniprotkb/refseq id (giving protein-protein pairs
    for orthologs with matching strain_protein) or a chebi id (giving
    metabolite-protein pairs). Existing interactions get their
    ortholog_derived field updated; missing ones are created. Prints the
    total interaction count when done.

    NOTE: this function records no InteractionReference or InteractionSource
    rows; only Interaction (and Metabolite) rows are written.

    Args:
        session: database session used for all queries and commits.
    """
    with open('Data/Ecoli/PSICQUIC/IMEx.txt') as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t')
        for row in reader:
            if (row['#ID(s) interactor A'] == '-'
                    or row['ID(s) interactor B'] == '-'):
                continue

            orthologs_B = []
            id_B = row['ID(s) interactor B'].split(':')
            if id_B[0] == 'uniprotkb':
                orthologs_B = session.query(OrthologEcoli).filter(
                    OrthologEcoli.ortholog_uniprot == id_B[1]).all()
            elif id_B[0] == 'refseq':
                orthologs_B = session.query(OrthologEcoli).filter(
                    OrthologEcoli.ortholog_refseq == id_B[1]).all()
            if not orthologs_B:
                continue

            orthologs_A = []
            metabolite = None
            id_A = row['#ID(s) interactor A'].split(':')
            if id_A[0] == 'uniprotkb':
                orthologs_A = session.query(OrthologEcoli).filter(
                    OrthologEcoli.ortholog_uniprot == id_A[1]).all()
            elif id_A[0] == 'refseq':
                orthologs_A = session.query(OrthologEcoli).filter(
                    OrthologEcoli.ortholog_refseq == id_A[1]).all()
            elif id_A[0] == 'chebi':
                # NOTE(review): id_A[1] is only the text between the first
                # and second ':'; if the file writes chebi ids as
                # chebi:"CHEBI:nnnn" this keeps just '"CHEBI' -- confirm
                # against the data file.
                metabolite = session.query(Metabolite).filter(
                    Metabolite.chebi == id_A[1]).first()
                if metabolite is None:
                    metabolite = Metabolite(id=id_A[1], chebi=id_A[1])
                    session.add(metabolite)
                    session.commit()

            # protein-protein pairs: only orthologs whose strains match
            interactors = [
                [[ortholog_A.protein, ortholog_A.ortholog_id],
                 [ortholog_B.protein, ortholog_B.ortholog_id]]
                for ortholog_A in orthologs_A
                for ortholog_B in orthologs_B
                if ortholog_A.strain_protein == ortholog_B.strain_protein
            ]
            # metabolite-protein pairs: the metabolite with every ortholog B
            if metabolite is not None:
                for ortholog_B in orthologs_B:
                    interactors.append(
                        [[metabolite, metabolite.id],
                         [ortholog_B.protein, ortholog_B.ortholog_id]])

            for (first, _first_id), (second, _second_id) in interactors:
                homogenous = (first == second)
                interaction = session.query(Interaction).filter(
                    Interaction.interactors.contains(first),
                    Interaction.interactors.contains(second),
                    Interaction.homogenous == homogenous).first()
                if interaction is not None:
                    # existing interaction: flag it with 'cfe' unless its
                    # ortholog_derived value already contains 'fe'
                    if interaction.ortholog_derived is None:
                        interaction.ortholog_derived = 'cfe'
                    elif 'fe' not in interaction.ortholog_derived:
                        interaction.ortholog_derived += ', cfe'
                    session.commit()
                else:
                    # a metabolite has no strain, so take it from whichever
                    # interactor is a protein
                    strain = first.strain if first.type == 'p' else second.strain
                    # homogenous is stored because the lookup above filters
                    # on it (other loaders in this file set it the same way)
                    interaction = Interaction(
                        strain=strain,
                        homogenous=homogenous,
                        interactors=[first, second],
                        type=(first.type + '-' + second.type),
                        ortholog_derived='fe')
                    session.add(interaction)
                    session.commit()

        session.commit()
    print(session.query(Interaction).count())
def parse_ecoli_uniprot(session):
    """Import E. coli UniProt (PSICQUIC) interactions via ortholog proteins.

    Reads the tab-separated file 'Ecoli/PSICQUIC/UniProt.txt'. Interactor B
    must be a uniprotkb id with at least one OrthologEcoli match. Interactor
    A may be a uniprotkb id (protein-protein pairs for orthologs with
    matching strain_protein) or a chebi id (metabolite-protein pairs). For
    every pair the interaction is created or updated, a new
    InteractionReference is added, and a 'UniProt' InteractionSource is
    added if the interaction has none. Prints the total interaction count
    when done.

    NOTE(review): the reference fields are sliced out of the PSI-MI columns
    with split('MI:') / split('(') and no guards, so a row missing those
    markers raises IndexError -- confirm the file always provides them.

    Args:
        session: database session used for all queries and commits.
    """
    with open('Ecoli/PSICQUIC/UniProt.txt') as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t')
        for row in reader:
            orthologs_B = []
            id_B = row['ID(s) interactor B'].split(':')
            if id_B[0] == 'uniprotkb':
                orthologs_B = session.query(OrthologEcoli).filter(
                    OrthologEcoli.ortholog_uniprot == id_B[1]).all()
            if not orthologs_B:
                continue

            orthologs_A = []
            metabolite = None
            id_A = row['#ID(s) interactor A'].split(':')
            if id_A[0] == 'uniprotkb':
                orthologs_A = session.query(OrthologEcoli).filter(
                    OrthologEcoli.ortholog_uniprot == id_A[1]).all()
            elif id_A[0] == 'chebi':
                # NOTE(review): id_A[1] is only the text between the first
                # and second ':' -- confirm this is the intended chebi id.
                metabolite = session.query(Metabolite).filter(
                    Metabolite.chebi == id_A[1]).first()
                if metabolite is None:
                    metabolite = Metabolite(id=id_A[1], chebi=id_A[1])
                    session.add(metabolite)
                    session.commit()

            # protein-protein pairs: only orthologs whose strains match
            interactors = [
                [[ortholog_A.protein, ortholog_A.ortholog_id],
                 [ortholog_B.protein, ortholog_B.ortholog_id]]
                for ortholog_A in orthologs_A
                for ortholog_B in orthologs_B
                if ortholog_A.strain_protein == ortholog_B.strain_protein
            ]
            # metabolite-protein pairs: the metabolite with every ortholog B
            if metabolite is not None:
                for ortholog_B in orthologs_B:
                    interactors.append(
                        [[metabolite, metabolite.id],
                         [ortholog_B.protein, ortholog_B.ortholog_id]])

            detection = row['Interaction detection method(s)']
            for (first, first_id), (second, second_id) in interactors:
                homogenous = (first == second)
                interaction = session.query(Interaction).filter(
                    Interaction.interactors.contains(first),
                    Interaction.interactors.contains(second),
                    Interaction.homogenous == homogenous).first()
                if interaction is not None:
                    # existing interaction: flag it with 'cfe' unless its
                    # ortholog_derived value already contains 'fe'
                    if interaction.ortholog_derived is None:
                        interaction.ortholog_derived = 'cfe'
                    elif 'fe' not in interaction.ortholog_derived:
                        interaction.ortholog_derived += ', cfe'
                    session.commit()
                else:
                    # the first interactor may be a metabolite, so take the
                    # strain from whichever interactor is a protein (same
                    # rule as parse_ecoli_imex)
                    strain = first.strain if first.type == 'p' else second.strain
                    interaction = Interaction(
                        strain=strain,
                        homogenous=homogenous,
                        interactors=[first, second],
                        # both sides need .type; concatenating the interactor
                        # object itself raised TypeError
                        type=(first.type + '-' + second.type),
                        ortholog_derived='fe')
                    if 'MI:' in detection:
                        if is_experimental_psimi(
                                detection.split('MI:')[1][:4]):
                            interaction.is_experimental = 1
                    session.add(interaction)
                    session.commit()

                # order the ortholog ids to follow the stored interaction's
                # interactor order
                if interaction.interactors[0] == first:
                    interactor_a, interactor_b = first_id, second_id
                else:
                    interactor_a, interactor_b = second_id, first_id

                author = row['Publication 1st author(s)']
                interaction_types = row['Interaction type(s)']
                source_dbs = row['Source database(s)']
                reference = InteractionReference(
                    interaction_id=interaction.id,
                    psimi_detection=detection.split('MI:')[1][:4],
                    detection_method=detection.split('(')[1][:-1],
                    author_ln=author.split(' ')[0],
                    pub_date=author.split('(')[1][:-1],
                    pmid=row['Publication Identifier(s)'].split(
                        'pubmed:')[1].split('|')[0],
                    psimi_type=interaction_types.split('MI:')[1][:4],
                    interaction_type=interaction_types.split('(')[1][:-1],
                    psimi_db=source_dbs.split('MI:')[1][:4],
                    source_db=source_dbs.split('(')[1][:-1],
                    confidence_score=row['Confidence value(s)'],
                    interactor_a_id=interactor_a,
                    interactor_b_id=interactor_b)
                session.add(reference)

                source = session.query(InteractionSource).filter(
                    InteractionSource.interaction_id == interaction.id,
                    InteractionSource.data_source == 'UniProt').first()
                if source is None:
                    source = InteractionSource(interaction_id=interaction.id,
                                               data_source='UniProt')
                    session.add(source)

        session.commit()
    print(session.query(Interaction).count())
def parse(session):
    """Import the PAO1/PA14 regulatory network interactions.

    Each row of 'Data/PAO1_PA14/regulatory_network.csv' may list several
    comma-separated strains; only 'PAO1' and 'PA14' entries are processed.
    For each such strain the 'Regulator' and 'Target' are looked up as a
    Protein by name and strain, falling back to an Interactor id. The
    interaction is created if missing and linked to that strain's source,
    and one new reference is added per evidence entry (or a single one with
    no detection method when 'evidence' is empty). Prints the total
    interaction count when done.

    Args:
        session: database session used for all queries and commits.
    """
    # the sources are shared by every interaction, so create them first
    # Note: is_experimental is 2 because it cannot be confirmed whether the
    # detection method was experimental
    source_PAO1 = InteractionSource(data_source='Galan-Vasquez(PAO1)',
                                    is_experimental=2)
    source_PA14 = InteractionSource(data_source='Galan-Vasquez(PA14)',
                                    is_experimental=2)
    session.add(source_PAO1)
    session.add(source_PA14)
    session.commit()
    strain_sources = {'PAO1': source_PAO1, 'PA14': source_PA14}

    with open('Data/PAO1_PA14/regulatory_network.csv') as csvfile:
        for record in csv.DictReader(csvfile):
            for strain in record['Strain'].split(','):
                # only PAO1 and PA14 interactions are of interest
                if strain not in strain_sources:
                    continue

                # resolve regulator then target: by protein name first, then
                # by interactor id (the value may be a gene locus); stop at
                # the first one that cannot be found
                partners = []
                for column in ('Regulator', 'Target'):
                    match = session.query(Protein).filter_by(
                        name=record[column], strain=strain).first()
                    if match is None:
                        match = session.query(Interactor).get(record[column])
                    if match is None:
                        break
                    partners.append(match)
                if len(partners) != 2:
                    continue
                regulator, target = partners

                same_interactor = (regulator == target)
                interaction = session.query(Interaction).filter(
                    Interaction.interactors.contains(regulator),
                    Interaction.interactors.contains(target),
                    Interaction.homogenous == same_interactor).first()
                if interaction is None:
                    interaction = Interaction(
                        strain=strain,
                        type='p-p',
                        homogenous=same_interactor,
                        interactors=[regulator, target])
                    session.add(interaction)
                    session.commit()

                # the source follows the strain of the interaction
                strain_source = strain_sources[strain]
                if strain_source not in interaction.sources:
                    interaction.sources.append(strain_source)

                # optional columns: empty strings mean "not provided"
                source_db = (record['source_db']
                             if record['source_db'] != '' else None)
                detections = (record['evidence'].split(', ')
                              if record['evidence'] != '' else [None])

                interaction_label = ('TF/sigma-binding site (' +
                                     record['mode'] + 'regulation)')
                regulation_comment = (regulator.id + ' regulates(' +
                                      record['mode'] + ') ' + target.id)

                # one new reference per detection, each tied to the source
                for detection in detections:
                    reference = InteractionReference(
                        detection_method=detection,
                        pmid=record['pmid'],
                        interaction_type=interaction_label,
                        source_db=source_db,
                        comment=regulation_comment)
                    interaction.references.append(reference)
                    reference.sources.append(strain_source)

        session.commit()
    print('regnet', session.query(Interaction).count())
def _mpidb_find_interactor(session, identifier):
    """Return the interactor matching identifier, or None.

    The identifier is tried as an Interactor id first and then as a Protein
    UniProtKB accession. Like the first()/one() pair it replaces,
    one_or_none() raises if several rows match, but it needs one query
    instead of two.
    """
    interactor = session.query(Interactor).filter(
        Interactor.id == identifier).one_or_none()
    if interactor is None:
        interactor = session.query(Protein).filter(
            Protein.uniprotkb == identifier).one_or_none()
    return interactor


def parse_mpidb(session):
    """Import the MPIDB (PSICQUIC) interactions for PAO1.

    Reads the tab-separated file 'PAO1/PSICQUIC/MPIDB.txt'. Rows are kept
    only when both interactors have 'taxid:208964(pseae)' as their first
    taxid and both ids resolve to a known interactor. For each kept row the
    interaction is created or updated, a new InteractionReference is added,
    and an InteractionXref (per interaction identifier) and an 'MPIDB'
    InteractionSource are added if missing. Prints the total interaction
    count when done.

    Args:
        session: database session used for all queries and commits.
    """
    with open('PAO1/PSICQUIC/MPIDB.txt') as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t')
        for row in reader:
            taxid_a = row['Taxid interactor A'].split('|')[0]
            taxid_b = row['Taxid interactor B'].split('|')[0]
            if (taxid_a != 'taxid:208964(pseae)'
                    or taxid_b != 'taxid:208964(pseae)'):
                continue

            A_id = row['#ID(s) interactor A'].split(':')[1]
            B_id = row['ID(s) interactor B'].split(':')[1]
            interactor_A = _mpidb_find_interactor(session, A_id)
            if interactor_A is None:
                continue
            interactor_B = _mpidb_find_interactor(session, B_id)
            if interactor_B is None:
                continue
            interactors = [interactor_A, interactor_B]

            detection = row['Interaction detection method(s)']
            experimental = is_experimental_psimi(
                detection.split('MI:')[1][:4])

            homogenous = (interactor_A == interactor_B)
            interaction = session.query(Interaction).filter(
                Interaction.interactors.contains(interactor_A),
                Interaction.interactors.contains(interactor_B),
                Interaction.homogenous == homogenous).first()
            if interaction is None:
                interaction = Interaction(
                    strain='PAO1',
                    type=interactor_A.type + '-' + interactor_B.type,
                    homogenous=homogenous,
                    interactors=interactors)
                interaction.is_experimental = 1 if experimental else 0
                session.add(interaction)
                session.commit()
            elif experimental:
                # an existing interaction is only ever upgraded to
                # experimental, never downgraded
                interaction.is_experimental = 1

            author = row['Publication 1st author(s)']
            reference = InteractionReference(
                interaction_id=interaction.id,
                detection_method=detection.split('(')[1][:-1],
                author_ln=author.split(' ')[0],
                pub_date=author.split('(')[1][:-1],
                # cut at the next '|' instead of taking a fixed 8 characters,
                # which kept a trailing '|' for shorter ids (matches
                # parse_ecoli_uniprot)
                pmid=row['Publication Identifier(s)'].split(
                    'pubmed:')[1].split('|')[0],
                confidence=row['Confidence value(s)'],
                interaction_type=row['Interaction type(s)'].split(
                    '(')[1][:-1],
                source_db=row['Source database(s)'])
            session.add(reference)

            # each '|'-separated identifier is split on ':' into
            # data source and accession
            for identifier in row['Interaction identifier(s)'].split('|'):
                xref_field = identifier.split(':')
                xref = session.query(InteractionXref).filter(
                    InteractionXref.accession == xref_field[1],
                    InteractionXref.interaction_id == interaction.id).first()
                if xref is None:
                    xref = InteractionXref(interaction_id=interaction.id,
                                           accession=xref_field[1],
                                           data_source=xref_field[0])
                    session.add(xref)

            source = session.query(InteractionSource).filter(
                InteractionSource.interaction_id == interaction.id,
                InteractionSource.data_source == 'MPIDB').first()
            if source is None:
                source = InteractionSource(interaction_id=interaction.id,
                                           data_source='MPIDB')
                session.add(source)

        session.commit()
    print(session.query(Interaction).count())
def _psimi_find_orthologs(session, id_field):
    """Look up the Ecoli orthologs named by a PSI-MI interactor id field.

    Returns a tuple (orthologs, has_protein_id). orthologs is a non-empty
    list of OrthologEcoli rows, or None when nothing matched.
    has_protein_id tells whether the field held a uniprotkb or refseq id.
    """
    uniprot, refseq = None, None
    if 'uniprotkb' in id_field:
        uniprot = id_field.split('uniprotkb:')[1].split('|')[0]
    if 'refseq' in id_field:
        refseq = id_field.split('refseq:')[1].split('|')[0]

    orthologs = None
    # .all() returns [] when nothing matches; "or None" turns that into None
    # so the refseq fallback and the callers' "is None" checks see a miss
    if uniprot is not None:
        orthologs = session.query(OrthologEcoli).filter_by(
            ortholog_uniprot=uniprot).all() or None
    if orthologs is None and refseq is not None:
        orthologs = session.query(OrthologEcoli).filter_by(
            ortholog_refseq=refseq).all() or None
    return orthologs, (uniprot is not None or refseq is not None)


def _psimi_metabolite_ids(id_field, alt_id_field):
    """Extract (chebi, pubchem) ids from an interactor's id and altID fields."""
    chebi, pubchem = None, None
    if 'chebi' in id_field:
        # [:-1] drops the character following the id
        chebi = id_field.split('CHEBI:')[1].split('|')[0][:-1]
    if 'pubchem' in alt_id_field:
        pubchem = alt_id_field.split('pubchem:')[1].split('|')[0]
    if chebi is None and 'chebi' in alt_id_field:
        chebi = alt_id_field.split('CHEBI:')[1].split('|')[0][:-1]
    return chebi, pubchem


def _psimi_get_or_create_metabolite(session, chebi, pubchem):
    """Return the Metabolite with this chebi or pubchem id, creating it if absent."""
    metabolite = None
    if chebi is not None:
        metabolite = session.query(Metabolite).filter_by(chebi=chebi).first()
    # if no metabolite with chebi was found, try the pubchem id
    if metabolite is None and pubchem is not None:
        metabolite = session.query(Metabolite).filter_by(
            pubchem=pubchem).first()

    if metabolite is None:
        # the id of a new metabolite is preferentially its chebi id; pubchem
        # is used only when no chebi id is known
        metabolite = Metabolite(
            id=chebi if chebi is not None else pubchem,
            chebi=chebi,
            pubchem=pubchem)
        session.add(metabolite)
    else:
        # fill in whichever id the existing metabolite is missing
        if metabolite.pubchem is None:
            metabolite.pubchem = pubchem
        if metabolite.chebi is None:
            metabolite.chebi = chebi
    return metabolite


def parse_psimi(session, file, source):
    """Import Ecoli PSI-MI TAB interactions and map them onto orthologs.

    Each row of the tab-separated file (read with the module-level column
    list cols) describes an interaction between two interactors. Protein
    interactors are resolved to OrthologEcoli rows by uniprotkb id, then by
    refseq id. A non-protein interactor is treated as a metabolite
    identified by chebi/pubchem id. Rows are skipped when a protein has no
    orthologs, when neither side is a protein, or when the non-protein side
    has no metabolite id. For every resulting interactor pair the
    interaction, its source and its references are created if missing.
    Prints source and the total interaction count when done.

    Args:
        session: database session used for all queries and commits.
        file: path of the PSI-MI TAB file to read.
        source: data source name stored on the InteractionSource rows.
    """
    with open(file) as csvfile:
        reader = csv.DictReader(csvfile, fieldnames=cols, delimiter='\t')

        # iterate through each interaction
        for row in reader:
            # a protein interactor without orthologs cannot be mapped, so
            # the interaction is skipped
            orthologs_A, a_is_protein = _psimi_find_orthologs(
                session, row['interactor_A'])
            if orthologs_A is None and a_is_protein:
                continue
            orthologs_B, b_is_protein = _psimi_find_orthologs(
                session, row['interactor_B'])
            if orthologs_B is None and b_is_protein:
                continue

            # no protein interactor on either side: nothing to map
            if orthologs_A is None and orthologs_B is None:
                continue

            # list of interactor pairs for this interaction
            interactors = []
            if orthologs_A is None or orthologs_B is None:
                # exactly one side is not a protein: it must be a metabolite
                if orthologs_A is None:
                    chebi, pubchem = _psimi_metabolite_ids(
                        row['interactor_A'], row['altID_A'])
                    orthologs = orthologs_B
                else:
                    chebi, pubchem = _psimi_metabolite_ids(
                        row['interactor_B'], row['altID_B'])
                    orthologs = orthologs_A
                # no metabolite id: that interactor is unidentified
                if chebi is None and pubchem is None:
                    continue
                # the other side has protein orthologs, so the metabolite
                # is certain to get an interaction partner
                metabolite = _psimi_get_or_create_metabolite(
                    session, chebi, pubchem)
                for ortholog in orthologs:
                    interactors.append(
                        [[metabolite, metabolite.id],
                         [ortholog.protein, ortholog.ortholog_id]])
            else:
                # p-p interaction: pair orthologs whose protein strains match
                for ortholog_A in orthologs_A:
                    for ortholog_B in orthologs_B:
                        if ortholog_A.strain_protein == ortholog_B.strain_protein:
                            interactors.append(
                                [[ortholog_A.protein, ortholog_A.ortholog_id],
                                 [ortholog_B.protein, ortholog_B.ortholog_id]])

            # for each interactor pair, create the interaction if it doesn't
            # exist, otherwise update its attributes
            for (first, first_id), (second, second_id) in interactors:
                homogenous = (first == second)
                interaction = session.query(Interaction).filter(
                    Interaction.interactors.contains(first),
                    Interaction.interactors.contains(second),
                    Interaction.homogenous == homogenous).first()
                if interaction is None:
                    # one interactor may be a metabolite, so take the strain
                    # from whichever interactor is a protein
                    strain = first.strain if first.type == 'p' else second.strain
                    # new interactions are Ecoli ortholog derived; homogenous
                    # is stored because the lookup above filters on it
                    interaction = Interaction(
                        strain=strain,
                        homogenous=homogenous,
                        interactors=[first, second],
                        type=(first.type + '-' + second.type),
                        ortholog_derived='Ecoli')
                    session.add(interaction)
                    session.commit()

                ref_parameter_list = get_psimi_ref_list(row)

                # in case the interaction already existed, order the ortholog
                # ids to follow the existing interaction's interactors, so
                # each Pseudomonas interactor lines up with its Ecoli ortholog
                if interaction.interactors[0] == first:
                    interactor_a, interactor_b = first_id, second_id
                else:
                    interactor_a, interactor_b = second_id, first_id

                is_experimental = is_experimental_interaction(row)
                # reuse the source with this name and experimental flag if it
                # exists, otherwise create it
                nsource = session.query(InteractionSource).filter_by(
                    data_source=source,
                    is_experimental=is_experimental).first()
                if nsource is None:
                    nsource = InteractionSource(
                        data_source=source, is_experimental=is_experimental)
                    interaction.sources.append(nsource)
                elif nsource not in interaction.sources:
                    interaction.sources.append(nsource)

                # search for each reference and create it if it doesn't exist
                for ref in ref_parameter_list:
                    ref_fields = dict(
                        detection_method=ref[0],
                        author_ln=ref[1],
                        pub_date=ref[2],
                        pmid=ref[3],
                        interaction_type=ref[4],
                        source_db=ref[5],
                        confidence=ref[6],
                        interactor_a=interactor_a,
                        interactor_b=interactor_b)
                    nref = session.query(InteractionReference).filter_by(
                        **ref_fields).first()
                    if nref is None:
                        nref = InteractionReference(**ref_fields)
                        interaction.references.append(nref)
                        nref.sources.append(nsource)
                    else:
                        # existing reference: add only the missing links
                        if interaction not in nref.interactions:
                            nref.interactions.append(interaction)
                        if nsource not in nref.sources:
                            nref.sources.append(nsource)

        session.commit()
    print(source, session.query(Interaction).count())
def parse_mentha(file, strain, taxid, session):
    """Import interactions from a tab-delimited mentha export.

    A row is used only when both taxid columns start with ``taxid`` and both
    interactor ids resolve to a stored ``Interactor`` or ``Protein``. For each
    such row the matching ``Interaction`` is created when missing, a new
    ``InteractionReference`` is always added, and an ``InteractionXref`` and a
    'mentha' ``InteractionSource`` are added when not yet stored for that
    interaction. The total interaction count is printed at the end.

    Args:
        file: path of the tab-delimited mentha file.
        strain: strain value stored on newly created interactions.
        taxid: value the first '|'-separated taxid entry of both interactors
            must equal.
        session: SQLAlchemy session used for all queries and writes.
    """

    def find_interactor(identifier):
        # The id may be an interactor primary key or a protein's UniProtKB
        # accession. The result of .first() is reused directly instead of
        # re-running the query with .one(), which doubled the queries and
        # could raise when an accession matched more than one protein.
        found = session.query(Interactor).filter(
            Interactor.id == identifier).first()
        if found is None:
            found = session.query(Protein).filter(
                Protein.uniprotkb == identifier).first()
        return found

    with open(file) as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t')
        for row in reader:
            if (row['Taxid interactor A'].split('|')[0] != taxid
                    or row['Taxid interactor B'].split('|')[0] != taxid):
                continue

            interactor_a = find_interactor(
                row['#ID(s) interactor A'].split(':')[1])
            interactor_b = find_interactor(
                row['ID(s) interactor B'].split(':')[1])
            if interactor_a is None or interactor_b is None:
                continue
            interactors = [interactor_a, interactor_b]

            experimental = is_experimental_psimi(
                row['Interaction detection method(s)'].split('MI:')[1][:4])

            homogenous = (interactors[0] == interactors[1])
            interaction = session.query(Interaction).filter(
                Interaction.interactors.contains(interactors[0]),
                Interaction.interactors.contains(interactors[1]),
                Interaction.homogenous == homogenous).first()
            if interaction is None:
                interaction = Interaction(
                    strain=strain,
                    type=interactors[0].type + '-' + interactors[1].type,
                    homogenous=homogenous,
                    interactors=interactors)
                interaction.is_experimental = 1 if experimental else 0
                # commit so that interaction.id is populated for the rows below
                session.add(interaction)
                session.commit()
            elif experimental:
                # an existing interaction is only ever upgraded to experimental
                interaction.is_experimental = 1

            reference = InteractionReference(
                interaction_id=interaction.id,
                detection_method=row['Interaction detection method(s)'].split('(')[1][:-1],
                pmid=row['Publication Identifier(s)'].split('pubmed:')[1][:8],
                interaction_type=row['Interaction type(s)'].split('(')[1][:-1],
                source_db=row['Source database(s)'].split('(')[1][:-1],
                confidence_score=row['Confidence value(s)'])
            session.add(reference)

            xref_field = row['Interaction identifier(s)'].split(':')
            xref = session.query(InteractionXref).filter(
                InteractionXref.accession == xref_field[1],
                InteractionXref.interaction_id == interaction.id).first()
            if xref is None:
                xref = InteractionXref(interaction_id=interaction.id,
                                       accession=xref_field[1],
                                       data_source=xref_field[0])
                session.add(xref)

            source = session.query(InteractionSource).filter(
                InteractionSource.interaction_id == interaction.id,
                InteractionSource.data_source == 'mentha').first()
            if source is None:
                source = InteractionSource(interaction_id=interaction.id,
                                           data_source='mentha')
                session.add(source)

    # Persist references/xrefs/sources added after the last interaction commit;
    # previously they were only flushed, never committed.
    session.commit()
    print(session.query(Interaction).count())
def parse_irefindex(file, strain, taxid, session):
    """Import interactions from a tab-delimited iRefIndex export.

    A row is used only when both taxid columns start with ``taxid`` and both
    interactor ids (``uniprotkb:`` or ``refseq:``) resolve to a stored
    ``Interactor`` or ``Protein``. The matching ``Interaction`` is created
    when missing; one ``InteractionReference`` is added per
    (pmid, detection method) combination, plus any missing
    ``InteractionXref`` rows and an 'iRefIndex' ``InteractionSource``.
    The total interaction count is printed at the end.

    Args:
        file: path of the tab-delimited iRefIndex file.
        strain: strain value stored on newly created interactions.
        taxid: value the first '|'-separated taxid entry of both interactors
            must equal.
        session: SQLAlchemy session used for all queries and writes.
    """

    def find_interactor(id_fields):
        # id_fields is the raw id column split on ':'; only uniprotkb and
        # refseq namespaces are resolved, anything else yields None.
        namespace = id_fields[0]
        if namespace == 'uniprotkb':
            found = session.query(Interactor).filter(
                Interactor.id == id_fields[1]).first()
            if found is None:
                found = session.query(Protein).filter(
                    Protein.uniprotkb == id_fields[1]).first()
            return found
        if namespace == 'refseq':
            return session.query(Protein).filter(
                Protein.ncbi_acc == id_fields[1]).first()
        return None

    with open(file) as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t')
        for row in reader:
            if (row['Taxid interactor A'].split('|')[0] != taxid
                    or row['Taxid interactor B'].split('|')[0] != taxid):
                continue

            interactor_a = find_interactor(
                row['#ID(s) interactor A'].split(':'))
            interactor_b = find_interactor(
                row['ID(s) interactor B'].split(':'))
            if interactor_a is None or interactor_b is None:
                continue
            interactors = [interactor_a, interactor_b]

            detection_field = row['Interaction detection method(s)']
            has_detection = detection_field != '-'
            # Only parse the MI code when a method is present; '-' has no
            # 'MI:' part and indexing the split result would raise IndexError.
            experimental = has_detection and is_experimental_psimi(
                detection_field.split('MI:')[1][:4])

            homogenous = (interactors[0] == interactors[1])
            interaction = session.query(Interaction).filter(
                Interaction.interactors.contains(interactors[0]),
                Interaction.interactors.contains(interactors[1]),
                Interaction.homogenous == homogenous).first()
            if interaction is None:
                interaction = Interaction(
                    strain=strain,
                    type=interactors[0].type + '-' + interactors[1].type,
                    homogenous=homogenous,
                    interactors=interactors)
                if has_detection:
                    interaction.is_experimental = 1 if experimental else 0
                # Add and commit so interaction.id is populated before it is
                # copied into the reference/xref/source rows below.
                session.add(interaction)
                session.commit()
            elif experimental:
                interaction.is_experimental = 1
            elif not has_detection and interaction.is_experimental == 0:
                interaction.is_experimental = None

            detections = [None]
            if has_detection:
                detections = [method.split('(')[1][:-1]
                              for method in detection_field.split('|')]

            interaction_type = None
            if row['Interaction type(s)'] != '-':
                interaction_type = row['Interaction type(s)'].split('(')[1][:-1]

            author, date = None, None
            if row['Publication 1st author(s)'] != '-':
                author_fields = row['Publication 1st author(s)'].split('-')
                # capitalise the first letter of the author's last name
                author = author_fields[0][0].upper() + author_fields[0][1:]
                date = author_fields[1]

            pmids = [None]
            if row['Publication Identifier(s)'] != '-':
                pmids = [pmid.split('pubmed:')[1][:8]
                         for pmid in row['Publication Identifier(s)'].split('|')]

            source_db = row['Source database(s)'].split('(')[1][:-1]
            for pmid in pmids:
                for detection in detections:
                    reference = InteractionReference(
                        interaction_id=interaction.id,
                        detection_method=detection,
                        author_ln=author,
                        pub_date=date,
                        pmid=pmid,
                        interaction_type=interaction_type,
                        source_db=source_db,
                        confidence_score=row['Confidence value(s)'])
                    session.add(reference)

            for identifier in row['Interaction identifier(s)'].split('|'):
                xref_field = identifier.split(':')
                xref = session.query(InteractionXref).filter(
                    InteractionXref.accession == xref_field[1],
                    InteractionXref.interaction_id == interaction.id).first()
                if xref is None:
                    xref = InteractionXref(interaction_id=interaction.id,
                                           accession=xref_field[1],
                                           data_source=xref_field[0])
                    session.add(xref)

            source = session.query(InteractionSource).filter(
                InteractionSource.interaction_id == interaction.id,
                InteractionSource.data_source == 'iRefIndex').first()
            if source is None:
                source = InteractionSource(interaction_id=interaction.id,
                                           data_source='iRefIndex')
                session.add(source)

    # persist everything added since the last interaction commit
    session.commit()
    print(session.query(Interaction).count())
def parse_ecoli_bindingdb(session):
    """Import protein-metabolite interactions from the E. coli BindingDB file.

    For every row whose interactor B has a UniProtKB id with stored
    ``OrthologEcoli`` entries, interactor A is resolved to a ``Metabolite``
    (by ChEBI id, then by PubChem id, creating it when unknown). One
    interaction per ortholog protein is then updated or created, and an
    ``InteractionReference`` plus, if missing, a 'BindingDB'
    ``InteractionSource`` are added for it.

    Args:
        session: SQLAlchemy session used for all queries and writes.
    """
    # NOTE(review): this reader uses the default ',' delimiter while the
    # EBI-GOA PSICQUIC file in this module is read with '\t' - confirm the
    # BindingDB file really is comma-separated.
    with open('Data/Ecoli/PSICQUIC/BindingDB.txt') as csvfile:
        reader = csv.DictReader(csvfile)
        # iterate through each interaction
        for row in reader:
            # interactor B must have a uniprot id
            if 'uniprotkb' not in row['ID(s) interactor B']:
                continue
            uniprot_protein = row['ID(s) interactor B'].split(
                'uniprotkb:')[1].split('|')[0]

            # [protein, E. coli ortholog id] for every stored ortholog
            orthologs = [
                [ecoli_ortholog.protein, ecoli_ortholog.ortholog_id]
                for ecoli_ortholog in session.query(OrthologEcoli).filter(
                    OrthologEcoli.ortholog_uniprot == uniprot_protein).all()
                if ecoli_ortholog is not None]
            if not orthologs:
                continue

            chebi_metabolite, pubchem_metabolite = None, None
            # check if interactor A has ChEBI id
            for metabolite_id in row['#ID(s) interactor A'].split('|'):
                if metabolite_id.split(':')[0] == 'chebi':
                    # drop the first and last character around the ChEBI value
                    chebi_metabolite = metabolite_id.split(':')[1][1:-1]

            metabolite = None
            # if interactor A has ChEBI id, query for matching metabolite
            if chebi_metabolite is not None:
                metabolite = session.query(Metabolite).filter(
                    Metabolite.chebi == chebi_metabolite).first()

            # if unable to identify metabolite based on ChEBI id, try pubchem id
            if metabolite is None:
                for alt_id in row['Alt. ID(s) interactor A'].split('|'):
                    if alt_id.split(':')[0] == 'pubchem':
                        pubchem_metabolite = alt_id.split(':')[1]
                # Without a PubChem id there is nothing to look up and no
                # usable id for a new metabolite (it would get id=None).
                if pubchem_metabolite is None:
                    continue
                metabolite = session.query(Metabolite).filter(
                    Metabolite.id == pubchem_metabolite).first()

            # if unable to find interactor A in database, create new metabolite
            if metabolite is None:
                metabolite = Metabolite(id=pubchem_metabolite,
                                        pubchem=pubchem_metabolite,
                                        chebi=chebi_metabolite)
                session.add(metabolite)
                session.commit()

            for interactor in orthologs:
                interaction = session.query(Interaction).filter(
                    Interaction.interactors.contains(interactor[0]),
                    Interaction.interactors.contains(metabolite)).first()

                if interaction is not None:
                    if interaction.ortholog_derived is None:
                        interaction.ortholog_derived = 'cfe'
                    elif 'fe' not in interaction.ortholog_derived:
                        interaction.ortholog_derived += ', cfe'
                    session.commit()
                else:
                    interaction = Interaction(
                        # interactor is [protein, ortholog_id]; the strain
                        # lives on the protein (interactor.strain raised
                        # AttributeError on the list)
                        strain=interactor[0].strain,
                        interactors=[metabolite, interactor[0]],
                        type='p-m',
                        ortholog_derived='fe')
                    # should ortholog interactions be marked as experimental?
                    if is_experimental_psimi(
                            row['Interaction detection method(s)'].split(
                                'MI:')[1][:4]):
                        interaction.is_experimental = 1
                    session.add(interaction)
                    session.commit()

                # order the reference's interactor ids like the interaction's
                # own interactor list
                if interaction.interactors[0] == metabolite:
                    interactor_a, interactor_b = metabolite.id, interactor[1]
                else:
                    interactor_a, interactor_b = interactor[1], metabolite.id

                author, date, pmid = None, None, None
                if row['Publication 1st author(s)'] != '-':
                    author = row['Publication 1st author(s)'].split(' ')[0]
                    date = row['Publication 1st author(s)'].split('(')[1][:-1]
                if 'pubmed:' in row['Publication Identifier(s)']:
                    pmid = row['Publication Identifier(s)'].split(
                        'pubmed:')[1][:8]

                reference = InteractionReference(
                    interaction_id=interaction.id,
                    detection_method=row['Interaction detection method(s)'].split('(')[1][:-1],
                    author_ln=author,
                    pmid=pmid,
                    pub_date=date,
                    interaction_type=row['Interaction type(s)'].split('(')[1][:-1],
                    source_db=row['Source database(s)'].split('(')[1][:-1],
                    confidence=row['Confidence value(s)'].split('(')[0],
                    interactor_a=interactor_a,
                    interactor_b=interactor_b)

                source = session.query(InteractionSource).filter(
                    InteractionSource.interaction_id == interaction.id,
                    InteractionSource.data_source == 'BindingDB').first()
                if source is None:
                    source = InteractionSource(interaction_id=interaction.id,
                                               data_source='BindingDB')
                    session.add(source)
                session.add(reference)
        session.commit()
def parse_psimi(file, strain, source, session):
    """Import protein-protein interactions from a tab-delimited PSI-MI file.

    The file is read with the module-level ``cols`` as field names and its
    first line is skipped. Rows whose two interactors cannot both be resolved
    to a stored ``Interactor``/``Protein`` are ignored. For the rest, the
    interaction, its ``InteractionSource``, its ``InteractionReference`` rows
    (from ``get_psimi_ref_list``) and its ``InteractionXref`` rows are created
    or linked when not already present. The total interaction count is
    printed at the end.

    Args:
        file: path of the tab-delimited PSI-MI file.
        strain: strain value stored on newly created interactions.
        source: data source name used for ``InteractionSource.data_source``.
        session: SQLAlchemy session used for all queries and writes.
    """

    def find_interactor(id_field):
        uniprot, refseq, interactor = None, None, None
        # store the uniprot and/or refseq id if the field has them
        if 'uniprotkb' in id_field:
            uniprot = id_field.split('uniprotkb:')[1].split('|')[0]
        if 'refseq' in id_field:
            refseq = id_field.split('refseq:')[1].split('|')[0]

        if uniprot is not None:
            # check if there is a protein complex with this uniprot id,
            # otherwise a protein matching the uniprot id
            interactor = session.query(Interactor).get(uniprot)
            if interactor is None:
                interactor = session.query(Protein).filter_by(
                    uniprotkb=uniprot).first()
        # fall back to the refseq accession when the uniprot id gave nothing
        if interactor is None and refseq is not None:
            interactor = session.query(Protein).filter_by(
                ncbi_acc=refseq).first()
        return interactor

    with open(file) as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t', fieldnames=cols)
        next(reader)  # skip the file's own header line
        for row in reader:
            # if either interactor is unknown, move on to the next interaction
            interactor_A = find_interactor(row['interactor_A'])
            if interactor_A is None:
                continue
            interactor_B = find_interactor(row['interactor_B'])
            if interactor_B is None:
                continue

            homogenous = (interactor_A == interactor_B)
            interaction = session.query(Interaction).filter(
                Interaction.interactors.contains(interactor_A),
                Interaction.interactors.contains(interactor_B),
                Interaction.homogenous == homogenous).first()
            # if no interaction was found with the interactors, create one
            if interaction is None:
                interaction = Interaction(
                    strain=strain,
                    type='p-p',
                    homogenous=homogenous,
                    interactors=[interactor_A, interactor_B])
                session.add(interaction)
                session.commit()

            ref_parameter_list = get_psimi_ref_list(row)
            is_experimental = is_experimental_interaction(row)

            # find or create the source, then make sure the interaction has it
            nsource = session.query(InteractionSource).filter_by(
                data_source=source, is_experimental=is_experimental).first()
            if nsource is None:
                nsource = InteractionSource(data_source=source,
                                            is_experimental=is_experimental)
                interaction.sources.append(nsource)
            elif nsource not in interaction.sources:
                interaction.sources.append(nsource)

            # search for each reference; create it if it doesn't exist
            for ref in ref_parameter_list:
                nref = session.query(InteractionReference).filter_by(
                    detection_method=ref[0],
                    author_ln=ref[1],
                    pub_date=ref[2],
                    pmid=ref[3],
                    interaction_type=ref[4],
                    source_db=ref[5],
                    confidence=ref[6],
                    interactor_a=None,
                    interactor_b=None).first()
                if nref is None:
                    nref = InteractionReference(detection_method=ref[0],
                                                author_ln=ref[1],
                                                pub_date=ref[2],
                                                pmid=ref[3],
                                                interaction_type=ref[4],
                                                source_db=ref[5],
                                                confidence=ref[6])
                    interaction.references.append(nref)
                    nref.sources.append(nsource)
                else:
                    if interaction not in nref.interactions:
                        nref.interactions.append(interaction)
                    if nsource not in nref.sources:
                        nref.sources.append(nsource)

            # collect all the cross references for the interaction
            for identifier in row['identifier'].split('|'):
                xref_field = identifier.split(':')
                # Entries without a 'db:accession' shape (e.g. the '-'
                # placeholder) carry no accession; skip them instead of
                # failing with IndexError.
                if len(xref_field) < 2:
                    continue
                xref = session.query(InteractionXref).filter_by(
                    accession=xref_field[1],
                    interaction_id=interaction.id).first()
                if xref is None:
                    xref = InteractionXref(interaction_id=interaction.id,
                                           accession=xref_field[1],
                                           data_source=xref_field[0])
                    session.add(xref)
        session.commit()
    print(source, session.query(Interaction).count())
def parse_ecoli_ebi_goa_nonintact(session):
    """Import ortholog-derived interactions from the E. coli EBI-GOA file.

    For each row with UniProtKB ids on both sides, every pair of stored
    ``OrthologEcoli`` entries that share the same ``strain_protein`` yields
    one protein pair. The pair's interaction is updated or created, an
    ``InteractionReference`` is added, and an 'EBI-GOA non-IntAct'
    ``InteractionSource`` is added when missing. The total interaction count
    is printed at the end.

    Args:
        session: SQLAlchemy session used for all queries and writes.
    """
    with open('Ecoli/PSICQUIC/EBI-GOA-nonIntAct.txt') as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t')
        for row in reader:
            uniprot_A, uniprot_B = None, None
            if 'uniprotkb:' in row['#ID(s) interactor A']:
                uniprot_A = row['#ID(s) interactor A'].split('uniprotkb:')[1]
            if 'uniprotkb:' in row['ID(s) interactor B']:
                uniprot_B = row['ID(s) interactor B'].split('uniprotkb:')[1]
            if uniprot_A is None or uniprot_B is None:
                continue

            orthologs_A = session.query(OrthologEcoli).filter(
                OrthologEcoli.ortholog_uniprot == uniprot_A).all()
            orthologs_B = session.query(OrthologEcoli).filter(
                OrthologEcoli.ortholog_uniprot == uniprot_B).all()

            # [[protein_A, ortholog_id_A], [protein_B, ortholog_id_B]] for
            # every ortholog combination with matching strain_protein
            interactors = []
            for ortholog_A in orthologs_A:
                for ortholog_B in orthologs_B:
                    if ortholog_A is None or ortholog_B is None:
                        continue
                    if ortholog_A.strain_protein == ortholog_B.strain_protein:
                        interactors.append(
                            [[ortholog_A.protein, ortholog_A.ortholog_id],
                             [ortholog_B.protein, ortholog_B.ortholog_id]])

            for interactor_pair in interactors:
                protein_A, protein_B = interactor_pair[0][0], interactor_pair[1][0]
                homogenous = (protein_A == protein_B)
                interaction = session.query(Interaction).filter(
                    Interaction.interactors.contains(protein_A),
                    Interaction.interactors.contains(protein_B),
                    Interaction.homogenous == homogenous).first()

                if interaction is not None:
                    if interaction.ortholog_derived is None:
                        interaction.ortholog_derived = 'cfe'
                    elif 'fe' not in interaction.ortholog_derived:
                        interaction.ortholog_derived += ', cfe'
                    session.commit()
                else:
                    interaction = Interaction(
                        strain=protein_A.strain,
                        # pass the two proteins; interactor_pair itself holds
                        # [protein, ortholog_id] lists, not interactors
                        interactors=[protein_A, protein_B],
                        type='p-p',
                        ortholog_derived='fe')
                    if is_experimental_psimi(
                            row['Interaction detection method(s)'].split(
                                'MI:')[1][:4]):
                        interaction.is_experimental = 1
                    session.add(interaction)
                    session.commit()

                # NOTE(review): the reference stores the raw ids from the row
                # rather than the per-pair ortholog ids; the previously
                # computed (and unused) interactor_a/interactor_b locals were
                # dropped - confirm which ids the reference should carry.
                reference = InteractionReference(
                    interaction_id=interaction.id,
                    detection_method=row['Interaction detection method(s)'].split('(')[1][:-1],
                    author_ln=row['Publication 1st author(s)'].split(' ')[0],
                    # strip the closing ')' left over after splitting on '('
                    pub_date=row['Publication 1st author(s)'].split('(')[1][:-1],
                    pmid=row['Publication Identifier(s)'].split('pubmed:')[1],
                    interaction_type=row['Interaction type(s)'].split('(')[1][:-1],
                    source_db=row['Source database(s)'].split('(')[1][:-1],
                    interactor_a_id=row['#ID(s) interactor A'].split(':')[1],
                    interactor_b_id=row['ID(s) interactor B'].split(':')[1])
                session.add(reference)

                source = session.query(InteractionSource).filter(
                    InteractionSource.interaction_id == interaction.id,
                    InteractionSource.data_source == 'EBI-GOA non-IntAct').first()
                if source is None:
                    source = InteractionSource(
                        interaction_id=interaction.id,
                        data_source='EBI-GOA non-IntAct')
                    session.add(source)
        session.commit()
    print(session.query(Interaction).count())
def parse_kegg(org_id, strain, sourcedb, session):
    """Import interactions from the KEGG pathways of one organism.

    Every relation (except 'maplink') of every pathway KGML is turned into
    interactor pairs: protein/metabolite entries of the relation paired with
    each other, plus each protein paired with the relation's compound
    subtypes. Pairs of two metabolites are never created. Each pair's
    interaction is created when missing and linked to the ``sourcedb`` source
    and to a per-pathway 'kegg' ``InteractionReference``. When ``org_id`` is
    'eco', entries are mapped to ``strain`` proteins through
    ``OrthologEcoli`` and new interactions are marked as derived from Ecoli.
    The total interaction count is printed at the end.

    Args:
        org_id: KEGG organism code whose pathways are listed.
        strain: strain stored on new interactions (and used to select
            orthologs when ``org_id`` is 'eco').
        sourcedb: ``data_source`` of the ``InteractionSource`` to attach.
        session: SQLAlchemy session used for all queries and writes.
    """

    def get_or_create_metabolite(kegg_id):
        # Return the metabolite with this KEGG id, creating it from
        # kegg_compounds when needed; None if the id is unknown there.
        # All creations go through here so every new metabolite gets the
        # same fields (name included) no matter which branch created it.
        metabolite = session.query(Metabolite).filter_by(kegg=kegg_id).first()
        if metabolite is None:
            if kegg_id not in kegg_compounds:
                return None
            info = kegg_compounds[kegg_id]
            metabolite = Metabolite(id=kegg_id,
                                    name=info['name'],
                                    pubchem=info['pubchem'],
                                    chebi=info['chebi'],
                                    kegg=kegg_id)
            session.add(metabolite)
        return metabolite

    # the source does not depend on the pair, so look it up once
    source = session.query(InteractionSource).filter_by(
        data_source=sourcedb).first()

    # get pathways for organism specified by org_id
    pathways = kegg_list(database='pathway', org=org_id).read().split('path:')
    # the first 8 characters of each non-empty chunk are the path id
    path_ids = [path[:8] for path in pathways if path != '']

    # iterate through each path and obtain interactions
    for path in path_ids:
        # get kgml representation of path
        kgml_path = read(kegg_get(path, option='kgml'))
        path_name = kgml_path._getname()
        comment = 'in ' + path_name + ' path'

        # dictionary of compounds in current path (node_id: kegg_id)
        # compound._getid() returns node id (only relevant in context of current path)
        # compound._getname() returns kegg id (relevant in overall KEGG DB)
        compound_ids = {}
        for compound in kgml_path.compounds:
            compound_ids[compound._getid()] = compound._getname()[-6:]

        # go through each relation in path
        for relation in kgml_path.relations:
            # ignore maplink relations
            if relation.element.attrib['type'] == 'maplink':
                continue

            # relation._getentry1/2() returns protein id (locus) or compound id (KEGG id)
            entries = [relation._getentry1()._getname(),
                       relation._getentry2()._getname()]
            # if one or both interactors are undefined, move on
            if entries[0] == 'undefined' or entries[1] == 'undefined':
                continue

            # existing interactors per side, as [interactor, reference id]
            interactors = [[], []]
            # kegg ids of metabolites not yet in the database, per side
            new_metabolites = [[], []]

            for num, entry in enumerate(entries):
                # each entry may contain >1 id; go through all of them
                for entry_id in entry.split(' '):
                    if entry_id == '':
                        continue
                    fields = entry_id.split(':')
                    prefix, local_id = fields[0], fields[1]
                    # if interactor is not protein or compound, continue
                    if prefix != org_id and local_id not in kegg_compounds:
                        continue
                    kegg_id = local_id if local_id in kegg_compounds else None

                    if kegg_id is None and org_id != 'eco':
                        # protein of the organism itself
                        interactor = session.query(Interactor).get(local_id)
                        if interactor is not None:
                            # None instead of an id: the interactor is not an
                            # ortholog, so the reference records no ortholog id
                            interactors[num].append([interactor, None])
                    elif kegg_id is not None:
                        interactor = session.query(Metabolite).filter_by(
                            kegg=kegg_id).first()
                        if interactor is None:
                            # created later, only if it ends up in a pair
                            new_metabolites[num].append(kegg_id)
                        else:
                            interactors[num].append([interactor, interactor.id])
                    elif org_id == 'eco':
                        # E. coli path: add all orthologs, keeping the E. coli
                        # protein id for the interaction reference
                        for ortholog in session.query(OrthologEcoli).filter_by(
                                ortholog_id=local_id,
                                strain_protein=strain).all():
                            if ortholog is not None:
                                interactors[num].append(
                                    [ortholog.protein, local_id])

            interactor_pairs = []
            # pairs of interactors which already exist in db (never m-m)
            for interactor1 in interactors[0]:
                for interactor2 in interactors[1]:
                    if interactor1[0].type != 'm' or interactor2[0].type != 'm':
                        interactor_pairs.append([interactor1, interactor2])

            # pairs of an existing non-metabolite interactor on one side with
            # a new metabolite on the other side
            for known, new_ids in ((interactors[0], new_metabolites[1]),
                                   (interactors[1], new_metabolites[0])):
                for interactor1 in known:
                    if interactor1[0].type == 'm':
                        continue
                    for new_id in new_ids:
                        # new_id was taken from kegg_compounds above, so this
                        # always yields a metabolite
                        metabolite = get_or_create_metabolite(new_id)
                        interactor_pairs.append(
                            [interactor1, [metabolite, metabolite.id]])

            # if no interactor pairs were found, move on to the next relation
            if not interactor_pairs:
                continue

            # get all intermediates in reaction of type compound
            intermeds = []
            for subtype in relation.element.iter(tag='subtype'):
                compound_node_id = subtype.attrib.get('compound')
                if compound_node_id is None:
                    continue
                # node ids not stored for this path cannot be resolved
                if int(compound_node_id) not in compound_ids:
                    continue
                metabolite = get_or_create_metabolite(
                    compound_ids[int(compound_node_id)])
                # unknown to both the database and kegg_compounds: skip
                # instead of failing with KeyError
                if metabolite is None:
                    continue
                intermeds.append([metabolite, metabolite.id])

            # add protein - intermediate interactor pairs
            for interactor_list in interactors:
                for interactor in interactor_list:
                    if interactor[0].type != 'm':
                        for intermed in intermeds:
                            interactor_pairs.append([interactor, intermed])

            # add the interaction for each pair if it doesn't exist yet
            for interactor_pair in interactor_pairs:
                first, second = interactor_pair[0][0], interactor_pair[1][0]
                homogenous = (first == second)
                interaction = session.query(Interaction).filter(
                    Interaction.interactors.contains(first),
                    Interaction.interactors.contains(second),
                    Interaction.homogenous == homogenous).first()

                if interaction is None:
                    interaction = Interaction(
                        type=first.type + '-' + second.type,
                        strain=strain,
                        homogenous=homogenous,
                        interactors=[first, second])
                    interaction.sources.append(source)
                    if org_id == 'eco':
                        interaction.ortholog_derived = 'Ecoli'
                    session.add(interaction)
                    session.commit()
                elif source not in interaction.sources:
                    interaction.sources.append(source)

                # For eco, order the ortholog ids like the interactors of the
                # (possibly pre-existing) interaction so it is easy to see
                # which interactor matches which Ecoli ortholog.
                interactor_a, interactor_b = None, None
                if org_id == 'eco':
                    if interaction.interactors[0] == first:
                        interactor_a = interactor_pair[0][1]
                        interactor_b = interactor_pair[1][1]
                    else:
                        interactor_b = interactor_pair[0][1]
                        interactor_a = interactor_pair[1][1]

                reference = session.query(InteractionReference).filter_by(
                    source_db='kegg',
                    comment=comment,
                    interactor_a=interactor_a,
                    interactor_b=interactor_b).first()
                if reference is None:
                    reference = InteractionReference(
                        source_db='kegg',
                        comment=comment,
                        interactor_a=interactor_a,
                        interactor_b=interactor_b)
                    interaction.references.append(reference)
                    reference.sources.append(source)
                else:
                    if interaction not in reference.interactions:
                        reference.interactions.append(interaction)
                    if source not in reference.sources:
                        reference.sources.append(source)

    session.commit()
    print(sourcedb, session.query(Interaction).count())
def parse_ecocyc(strain, session):
    """Import ortholog-derived interactions from the EcoCyc SIF files.

    For every path in ``ecocyc_paths`` whose interaction file exists, each
    row's two participants are resolved either to ``strain`` proteins (via
    ``OrthologEcoli``) or to a ``Metabolite`` (participants listed in
    ``ecocyc_compounds``), creating unknown metabolites only when they end up
    paired with a protein. Pairs of two metabolites are never created. Each
    pair's interaction is created when missing and linked to the 'EcoCyc'
    source and to one 'ecocyc' ``InteractionReference`` per listed pmid.
    The total interaction count is printed at the end.

    Args:
        strain: strain whose orthologs are used and which is stored on new
            interactions.
        session: SQLAlchemy session used for all queries and writes.
    """

    def resolve_participant(participant_id):
        # Returns (known, new_ecocyc_id): known is a list of
        # [interactor, label] entries already in the database; new_ecocyc_id
        # is the EcoCyc id of a metabolite that is not stored yet (else None).
        if participant_id not in ecocyc_compounds:
            # not a compound, so it's a uniprot id: collect the orthologs and
            # keep the ortholog id for the interaction reference
            orthologs = session.query(OrthologEcoli).filter_by(
                ortholog_uniprot=participant_id, strain_protein=strain).all()
            known = [[ortholog.protein, ortholog.ortholog_id]
                     for ortholog in orthologs if ortholog is not None]
            return known, None

        ecocyc_id = ecocyc_compounds[participant_id]['ecocyc']
        metabolite = session.query(Metabolite).filter_by(
            ecocyc=ecocyc_id).first()
        if metabolite is None:
            # create it later, only if the other participant is valid
            return [], ecocyc_id
        # Metabolites stored by other loaders may have no name; fall back to
        # the compound key (the name new metabolites get here) so building
        # the reference comment cannot fail on None.
        label = metabolite.name
        if label is None:
            label = participant_id
        return [[metabolite, label]], None

    def pair_with_new_metabolite(known, ecocyc_id, compound_key):
        # Pair every known non-metabolite interactor with the (possibly
        # newly created) metabolite identified by ecocyc_id.
        pairs = []
        for interactor in known:
            # don't pair two metabolites
            if interactor[0].type == 'm':
                continue
            # re-query each time so the metabolite is created only once even
            # when several orthologs are paired with it
            metabolite = session.query(Metabolite).filter_by(
                ecocyc=ecocyc_id).first()
            if metabolite is None:
                info = ecocyc_compounds[compound_key]
                metabolite = Metabolite(id=ecocyc_id,
                                        name=compound_key,
                                        ecocyc=ecocyc_id,
                                        pubchem=info['pubchem'],
                                        kegg=info['kegg'],
                                        cas=info['cas'],
                                        chebi=info['chebi'])
                session.add(metabolite)
            pairs.append([interactor, [metabolite, compound_key]])
        return pairs

    # the source does not depend on the row, so look it up once
    source = session.query(InteractionSource).filter_by(
        data_source='EcoCyc').first()

    for path in ecocyc_paths:
        interaction_file_name = "Data/Ecoli/ecocyc_files/interactions_sif/" + path + "_interactions.txt"
        # the sif file may be missing if it could not be obtained for a pathway
        if not exists(interaction_file_name):
            continue
        with open(interaction_file_name) as file:
            reader = csv.DictReader(file)
            for interaction_row in reader:
                id_A = interaction_row['PARTICIPANT_A']
                id_B = interaction_row['PARTICIPANT_B']
                interactors_A, new_metabolite_A = resolve_participant(id_A)
                interactors_B, new_metabolite_B = resolve_participant(id_B)

                # interactor pairs from which to create interactions; stays
                # empty if both participants are new metabolites or if a
                # participant has no ortholog in the strain
                interactor_pairs = []
                if new_metabolite_A is None and new_metabolite_B is None:
                    for interactor_A in interactors_A:
                        for interactor_B in interactors_B:
                            # at least one side must not be a metabolite
                            if (interactor_A[0].type != 'm'
                                    or interactor_B[0].type != 'm'):
                                interactor_pairs.append(
                                    [interactor_A, interactor_B])
                elif new_metabolite_A is not None:
                    interactor_pairs = pair_with_new_metabolite(
                        interactors_B, new_metabolite_A, id_A)
                elif new_metabolite_B is not None:
                    interactor_pairs = pair_with_new_metabolite(
                        interactors_A, new_metabolite_B, id_B)

                for interactor_pair in interactor_pairs:
                    first, second = interactor_pair[0][0], interactor_pair[1][0]
                    homogenous = (first == second)
                    interaction = session.query(Interaction).filter(
                        Interaction.interactors.contains(first),
                        Interaction.interactors.contains(second),
                        Interaction.homogenous == homogenous).first()

                    if interaction is None:
                        # created for the first time: mark it as ortholog
                        # derived from Ecoli
                        interaction = Interaction(
                            type=(first.type + '-' + second.type),
                            strain=strain,
                            homogenous=homogenous,
                            interactors=[first, second],
                            ortholog_derived='Ecoli')
                        interaction.sources.append(source)
                        session.add(interaction)
                        session.commit()
                    elif source not in interaction.sources:
                        interaction.sources.append(source)

                    # Order the labels like the interactors of the (possibly
                    # pre-existing) interaction so it is easy to see which
                    # interactor matches which Ecoli ortholog.
                    if interaction.interactors[0] == first:
                        interactor_a = interactor_pair[0][1]
                        interactor_b = interactor_pair[1][1]
                    else:
                        interactor_b = interactor_pair[0][1]
                        interactor_a = interactor_pair[1][1]

                    comment = interactor_pair[0][1] + interaction_row[
                        "INTERACTION_TYPE"] + interactor_pair[1][1]

                    # one reference per pmid listed for the interaction
                    for pmid in interaction_row["INTERACTION_PUBMED_ID"].split(';'):
                        reference = session.query(
                            InteractionReference).filter_by(
                                pmid=pmid,
                                source_db='ecocyc',
                                comment=comment,
                                interactor_a=interactor_a,
                                interactor_b=interactor_b).first()
                        if reference is None:
                            reference = InteractionReference(
                                pmid=pmid,
                                source_db='ecocyc',
                                comment=comment,
                                interactor_a=interactor_a,
                                interactor_b=interactor_b)
                            interaction.references.append(reference)
                            reference.sources.append(source)
                        else:
                            if interaction not in reference.interactions:
                                reference.interactions.append(interaction)
                            if source not in reference.sources:
                                reference.sources.append(source)

    session.commit()
    print('ecocyc', session.query(Interaction).count())
def parse_ecoli_dip(session):
    """Import E. coli DIP interactions as ortholog-derived interactions.

    Reads the tab-delimited file 'Ecoli/DIP.txt'. The refseq / uniprotkb
    accessions of both interactors of a row are looked up in OrthologEcoli
    (UniProtKB first, RefSeq only when that lookup finds nothing). Every pair
    of orthologs sharing the same ``strain_protein`` value yields one
    interaction, which is created with ``ortholog_derived='fe'`` or, when it
    already exists, tagged with 'cfe'. Each such interaction receives
    InteractionReference rows built from the row's detection, publication and
    type columns, plus a 'DIP' InteractionSource row. The function finally
    commits and prints the total number of interactions.

    Args:
        session: SQLAlchemy session used for all queries, inserts and commits.
    """
    with open('Ecoli/DIP.txt') as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t')
        for row in reader:
            # resolve the E. coli orthologs of interactor A, then interactor B
            ortholog_lists = []
            for column in ('ID interactor A', 'ID interactor B'):
                refseq, uniprotkb = '', ''
                for identifier in row[column].split('|'):
                    fields = identifier.split(':')
                    if fields[0] == 'refseq':
                        refseq = fields[1]
                    elif fields[0] == 'uniprotkb':
                        uniprotkb = fields[1]

                orthologs = []
                if uniprotkb != '':
                    orthologs = session.query(OrthologEcoli).filter(
                        OrthologEcoli.ortholog_uniprot == uniprotkb).all()
                # fall back to the RefSeq accession only if UniProtKB gave nothing
                if not orthologs and refseq != '':
                    orthologs = session.query(OrthologEcoli).filter(
                        OrthologEcoli.ortholog_refseq == refseq).all()
                ortholog_lists.append(orthologs)
            orthologs_A, orthologs_B = ortholog_lists

            # each entry is [[protein, E. coli ortholog id], [protein, E. coli ortholog id]]
            interactors = []
            for ortholog_A in orthologs_A:
                for ortholog_B in orthologs_B:
                    if ortholog_A is None or ortholog_B is None:
                        continue
                    if ortholog_A.strain_protein == ortholog_B.strain_protein:
                        interactors.append(
                            [[ortholog_A.protein, ortholog_A.ortholog_id],
                             [ortholog_B.protein, ortholog_B.ortholog_id]])
            if not interactors:
                continue

            detections, pmids, types = [], [], []
            if row['Interaction detection method(s)'] != '-':
                detections = row['Interaction detection method(s)'].split('|')
            if row['Publication Identifier(s)'] != '-':
                pmids = row['Publication Identifier(s)'].split('|')
            if row['Interaction type(s)'] != '-':
                types = row['Interaction type(s)'].split('|')
            # One reference per detection method, limited to what the other two
            # columns can supply (pmids are read with a stride of two below).
            # A row with a missing ('-') column therefore produces no
            # references instead of raising IndexError.
            reference_count = min(len(detections), len(types),
                                  (len(pmids) + 1) // 2)

            for interactor_pair in interactors:
                protein_1, ecoli_id_1 = interactor_pair[0]
                protein_2, ecoli_id_2 = interactor_pair[1]

                is_new = False
                homogenous = (protein_1 == protein_2)
                interaction = session.query(Interaction).filter(
                    Interaction.interactors.contains(protein_1),
                    Interaction.interactors.contains(protein_2),
                    Interaction.homogenous == homogenous).first()
                if interaction is not None:
                    if interaction.ortholog_derived is None:
                        interaction.ortholog_derived = 'cfe'
                    elif 'fe' not in interaction.ortholog_derived:
                        interaction.ortholog_derived += ', cfe'
                    session.commit()
                else:
                    is_new = True
                    # homogenous must be stored: the lookup above filters on
                    # it, so leaving it unset would make this interaction
                    # invisible to later rows.
                    interaction = Interaction(
                        strain=protein_1.strain,
                        interactors=[protein_1, protein_2],
                        homogenous=homogenous,
                        type='p-p',
                        ortholog_derived='fe')
                    session.add(interaction)
                    session.commit()

                # keep interactor_a / interactor_b in the same order as the
                # stored interaction's interactors
                if interaction.interactors[0] == protein_1:
                    interactor_a, interactor_b = ecoli_id_1, ecoli_id_2
                else:
                    interactor_a, interactor_b = ecoli_id_2, ecoli_id_1

                for num in range(reference_count):
                    # NOTE(review): the publication column is read with a
                    # stride of two, as in the original code ("there are more
                    # than one pmid sometimes"); confirm against the DIP
                    # export format.
                    reference = InteractionReference(
                        interaction_id=interaction.id,
                        detection_method=detections[num].split('(')[1][:-1],
                        pmid=pmids[num * 2].split('pubmed:')[1],
                        interaction_type=types[num].split('(')[1][:-1],
                        source_db=row['Source database(s)'].split('(')[1][:-1],
                        interactor_a=interactor_a,
                        interactor_b=interactor_b)
                    session.add(reference)

                    if is_new:
                        # NOTE(review): only the first PSI-MI code of the
                        # detection column is inspected, regardless of num.
                        experimental = is_experimental_psimi(
                            row['Interaction detection method(s)'].split(
                                'MI:')[1][:4])
                        if interaction.is_experimental is None:
                            interaction.is_experimental = 1 if experimental else 0
                        elif experimental:
                            interaction.is_experimental = 1

                source = session.query(InteractionSource).filter(
                    InteractionSource.interaction_id == interaction.id,
                    InteractionSource.data_source == 'DIP').first()
                if source is None:
                    source = InteractionSource(interaction_id=interaction.id,
                                               data_source='DIP')
                    session.add(source)
    session.commit()
    print(session.query(Interaction).count())
def parse_ortholog_interactions(session):
    """Copy every stored interaction onto the orthologs of its interactors.

    For each existing Interaction, protein interactors (type 'p') are replaced
    by their ``pseudomonas_orthologs``; other interactors are kept as they
    are. Every combination of the resulting first and second interactors is
    looked up and, if missing, created with the opposite strain
    ('PAO1' <-> 'PA14') and ``ortholog_derived`` set to the original strain.
    The original interaction's references are duplicated with
    ``interactor_a`` / ``interactor_b`` set to the ids of the original
    interactors, and its sources are attached to the new interaction.
    Finally commits and prints the total number of interactions.

    Args:
        session: SQLAlchemy session used for all queries, inserts and commits.
    """
    # snapshot taken before any ortholog-derived interaction is added
    all_interactions = session.query(Interaction).all()

    for interaction in all_interactions:
        # ortholog_interactors[i] holds [candidate interactor, original interactor id]
        # entries for the i-th interactor of the original interaction
        ortholog_interactors = [[], []]
        for num, interactor in enumerate(interaction.interactors):
            if interactor.type == 'p':
                for ortholog in interactor.pseudomonas_orthologs:
                    if ortholog is None:
                        continue
                    ortholog_interactor = session.query(Interactor).get(
                        ortholog.ortholog_id)
                    # an ortholog id without a matching Interactor row cannot
                    # take part in an interaction
                    if ortholog_interactor is None:
                        continue
                    ortholog_interactors[num].append(
                        [ortholog_interactor, interactor.id])
            else:
                # non-protein interactors are carried over unchanged
                ortholog_interactors[num].append([interactor, interactor.id])

        for candidate_1, original_id_1 in ortholog_interactors[0]:
            for candidate_2, original_id_2 in ortholog_interactors[1]:
                # The pair entries are [interactor, original id] lists, so the
                # interactor objects themselves (not the lists) must be used
                # for comparison, lookup and creation.
                homogenous = (candidate_1 == candidate_2)
                new_interaction = session.query(Interaction).filter(
                    Interaction.interactors.contains(candidate_1),
                    Interaction.interactors.contains(candidate_2),
                    Interaction.homogenous == homogenous).first()
                if new_interaction is None:
                    # new interaction lives in the opposite strain and records
                    # the strain it was derived from
                    strain = 'PA14' if interaction.strain == 'PAO1' else 'PAO1'
                    new_interaction = Interaction(
                        strain=strain,
                        type=interaction.type,
                        interactors=[candidate_1, candidate_2],
                        homogenous=homogenous,
                        ortholog_derived=interaction.strain)
                    session.add(new_interaction)
                    session.commit()

                # keep interactor_a / interactor_b aligned with the order of
                # the (possibly pre-existing) interaction's interactors
                if new_interaction.interactors[0] == candidate_1:
                    interactor_a, interactor_b = original_id_1, original_id_2
                else:
                    interactor_a, interactor_b = original_id_2, original_id_1

                for reference in interaction.references:
                    ref_fields = dict(
                        detection_method=reference.detection_method,
                        author_ln=reference.author_ln,
                        pub_date=reference.pub_date,
                        pmid=reference.pmid,
                        interaction_type=reference.interaction_type,
                        source_db=reference.source_db,
                        confidence=reference.confidence,
                        comment=reference.comment,
                        interactor_a=interactor_a,
                        interactor_b=interactor_b)
                    new_ref = session.query(InteractionReference).filter_by(
                        **ref_fields).first()
                    if new_ref is None:
                        # create the derived reference and give it the
                        # original reference's sources
                        new_ref = InteractionReference(**ref_fields)
                        new_interaction.references.append(new_ref)
                        new_ref.sources = list(reference.sources)
                    else:
                        # reuse the existing reference, adding whatever links
                        # are still missing
                        if new_interaction not in new_ref.interactions:
                            new_ref.interactions.append(new_interaction)
                        for source in reference.sources:
                            if source is not None and source not in new_ref.sources:
                                new_ref.sources.append(source)

                # the new interaction inherits the original interaction's sources
                for source in interaction.sources:
                    if source not in new_interaction.sources:
                        new_interaction.sources.append(source)
    session.commit()
    print('p_orthologs', session.query(Interaction).count())
def parse_ecoli_mentha(session):
    """Import E. coli mentha interactions as ortholog-derived interactions.

    Reads the tab-delimited file 'Ecoli/PSICQUIC/mentha.txt'. Rows whose
    interactor A or B id is '-' are skipped. Both ids are looked up in
    OrthologEcoli by UniProtKB accession; every pair of orthologs sharing the
    same ``strain_protein`` value yields one interaction, created with
    ``ortholog_derived='fe'`` or, when it already exists, tagged with 'cfe'.
    One InteractionReference is added per pair and row, and a 'mentha'
    InteractionSource row is added if missing. Finally commits and prints the
    total number of interactions.

    Args:
        session: SQLAlchemy session used for all queries, inserts and commits.
    """
    with open('Ecoli/PSICQUIC/mentha.txt') as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t')
        for row in reader:
            if (row['#ID(s) interactor A'] == '-'
                    or row['ID(s) interactor B'] == '-'):
                continue

            orthologs_A = session.query(OrthologEcoli).filter(
                OrthologEcoli.ortholog_uniprot ==
                row['#ID(s) interactor A'].split(':')[1]).all()
            orthologs_B = session.query(OrthologEcoli).filter(
                OrthologEcoli.ortholog_uniprot ==
                row['ID(s) interactor B'].split(':')[1]).all()

            # each entry is [[protein, E. coli ortholog id], [protein, E. coli ortholog id]]
            interactors = []
            for ortholog_A in orthologs_A:
                for ortholog_B in orthologs_B:
                    if ortholog_A is None or ortholog_B is None:
                        continue
                    if ortholog_A.strain_protein == ortholog_B.strain_protein:
                        interactors.append(
                            [[ortholog_A.protein, ortholog_A.ortholog_id],
                             [ortholog_B.protein, ortholog_B.ortholog_id]])

            for interactor_pair in interactors:
                protein_1, ecoli_id_1 = interactor_pair[0]
                protein_2, ecoli_id_2 = interactor_pair[1]
                detection_field = row['Interaction detection method(s)']
                type_field = row['Interaction type(s)']
                source_field = row['Source database(s)']

                homogenous = (protein_1 == protein_2)
                interaction = session.query(Interaction).filter(
                    Interaction.interactors.contains(protein_1),
                    Interaction.interactors.contains(protein_2),
                    Interaction.homogenous == homogenous).first()
                if interaction is not None:
                    if interaction.ortholog_derived is None:
                        interaction.ortholog_derived = 'cfe'
                    elif 'fe' not in interaction.ortholog_derived:
                        interaction.ortholog_derived += ', cfe'
                    session.commit()
                else:
                    # type joins the two interactors' type codes (the second
                    # operand must be the .type string, not the object), and
                    # homogenous is stored because the lookup above filters
                    # on it
                    interaction = Interaction(
                        strain=protein_1.strain,
                        interactors=[protein_1, protein_2],
                        homogenous=homogenous,
                        type=protein_1.type + '-' + protein_2.type,
                        ortholog_derived='fe')
                    # TODO (from original author): ask about marking E. coli
                    # ortholog interactions as experimental, and iterate
                    # through all methods; only the first PSI-MI code is
                    # checked here.
                    if 'MI:' in detection_field:
                        if is_experimental_psimi(
                                detection_field.split('MI:')[1][:4]):
                            interaction.is_experimental = 1
                    session.add(interaction)
                    session.commit()

                # keep interactor_a / interactor_b in the same order as the
                # stored interaction's interactors
                if interaction.interactors[0] == protein_1:
                    interactor_a, interactor_b = ecoli_id_1, ecoli_id_2
                else:
                    interactor_a, interactor_b = ecoli_id_2, ecoli_id_1

                # NOTE(review): `confidence` matches the keyword the other
                # parsers in this file use for InteractionReference; the
                # psimi_* keywords appear only in the PSICQUIC parsers, so
                # confirm them against the model.
                reference = InteractionReference(
                    interaction_id=interaction.id,
                    psimi_detection=detection_field.split('MI:')[1][:4],
                    detection_method=detection_field.split('(')[1][:-1],
                    pmid=row['Publication Identifier(s)'].split('pubmed:')[1],
                    psimi_type=type_field.split('MI:')[1][:4],
                    interaction_type=type_field.split('(')[1][:-1],
                    psimi_db=source_field.split('MI:')[1][:4],
                    source_db=source_field.split('(')[1][:-1],
                    confidence=row['Confidence value(s)'],
                    interactor_a=interactor_a,
                    interactor_b=interactor_b)
                session.add(reference)

                source = session.query(InteractionSource).filter(
                    InteractionSource.interaction_id == interaction.id,
                    InteractionSource.data_source == 'mentha').first()
                if source is None:
                    source = InteractionSource(interaction_id=interaction.id,
                                               data_source='mentha')
                    session.add(source)
    session.commit()
    print(session.query(Interaction).count())
def parse_ecoli_irefindex(session):
    """Import E. coli iRefIndex interactions as ortholog-derived interactions.

    Reads the tab-delimited file 'Ecoli/PSICQUIC/iRefIndex.txt'. Rows whose
    interactor A or B id is '-' are skipped. Each id is looked up in
    OrthologEcoli by UniProtKB or RefSeq accession, depending on its prefix.
    Every pair of orthologs sharing the same ``strain_protein`` value yields
    one interaction, created with ``ortholog_derived='fe'`` or, when it
    already exists, tagged with 'cfe'. One InteractionReference is added for
    every combination of publication identifier and detection method of the
    row, and an 'iRefIndex' InteractionSource row is added if missing.
    Finally commits and prints the total number of interactions.

    Args:
        session: SQLAlchemy session used for all queries, inserts and commits.
    """
    with open('Ecoli/PSICQUIC/iRefIndex.txt') as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t')
        for row in reader:
            if (row['#ID(s) interactor A'] == '-'
                    or row['ID(s) interactor B'] == '-'):
                continue

            orthologs_A = []
            id_A = row['#ID(s) interactor A'].split(':')
            if id_A[0] == 'uniprotkb':
                orthologs_A = session.query(OrthologEcoli).filter(
                    OrthologEcoli.ortholog_uniprot == id_A[1]).all()
            elif id_A[0] == 'refseq':
                orthologs_A = session.query(OrthologEcoli).filter(
                    OrthologEcoli.ortholog_refseq == id_A[1]).all()
            if not orthologs_A:
                continue

            orthologs_B = []
            id_B = row['ID(s) interactor B'].split(':')
            if id_B[0] == 'uniprotkb':
                orthologs_B = session.query(OrthologEcoli).filter(
                    OrthologEcoli.ortholog_uniprot == id_B[1]).all()
            elif id_B[0] == 'refseq':
                orthologs_B = session.query(OrthologEcoli).filter(
                    OrthologEcoli.ortholog_refseq == id_B[1]).all()

            # each entry is [[protein, E. coli ortholog id], [protein, E. coli ortholog id]]
            interactors = []
            for ortholog_A in orthologs_A:
                for ortholog_B in orthologs_B:
                    if ortholog_A is None or ortholog_B is None:
                        continue
                    if ortholog_A.strain_protein == ortholog_B.strain_protein:
                        interactors.append(
                            [[ortholog_A.protein, ortholog_A.ortholog_id],
                             [ortholog_B.protein, ortholog_B.ortholog_id]])
            if not interactors:
                continue

            # Row-level reference fields. The [None] placeholders make sure a
            # reference is still written when a column is '-'.
            author, pub_date, psimi_type, interaction_type = None, None, None, None
            pmids, detections, psimi_detections = [None], [None], [None]

            first_author = row['Publication 1st author(s)']
            if first_author != '-':
                author = first_author.split(' ')[0]
                pub_date = first_author.split('(')[1][:-1]

            type_field = row['Interaction type(s)']
            if type_field != '-':
                interaction_type = type_field.split('(')[1][:-1]
                if 'MI:' in type_field:
                    psimi_type = type_field.split('MI:')[1][:4]

            publication_field = row['Publication Identifier(s)']
            if publication_field != '-':
                pmids = [entry.split(':')[1]
                         for entry in publication_field.split('|')]

            detection_field = row['Interaction detection method(s)']
            if detection_field != '-':
                # detection methods come from the detection column itself,
                # not from the publication identifiers
                detections, psimi_detections = [], []
                for entry in detection_field.split('|'):
                    detections.append(entry.split('(')[1][:-1])
                    psimi_detections.append(entry.split('MI:')[1][:4])

            source_field = row['Source database(s)']

            for interactor_pair in interactors:
                protein_1, ecoli_id_1 = interactor_pair[0]
                protein_2, ecoli_id_2 = interactor_pair[1]

                homogenous = (protein_1 == protein_2)
                interaction = session.query(Interaction).filter(
                    Interaction.interactors.contains(protein_1),
                    Interaction.interactors.contains(protein_2),
                    Interaction.homogenous == homogenous).first()
                if interaction is not None:
                    if interaction.ortholog_derived is None:
                        interaction.ortholog_derived = 'cfe'
                    elif 'fe' not in interaction.ortholog_derived:
                        interaction.ortholog_derived += ', cfe'
                    session.commit()
                else:
                    # type joins the two interactors' type codes (the second
                    # operand must be the .type string, not the object), and
                    # homogenous is stored because the lookup above filters
                    # on it
                    interaction = Interaction(
                        strain=protein_1.strain,
                        interactors=[protein_1, protein_2],
                        homogenous=homogenous,
                        type=protein_1.type + '-' + protein_2.type,
                        ortholog_derived='fe')
                    # TODO (from original author): iterate through all
                    # methods; only the first PSI-MI code is checked here.
                    if 'MI:' in detection_field:
                        if is_experimental_psimi(
                                detection_field.split('MI:')[1][:4]):
                            interaction.is_experimental = 1
                    session.add(interaction)
                    session.commit()

                # keep interactor_a / interactor_b in the same order as the
                # stored interaction's interactors
                if interaction.interactors[0] == protein_1:
                    interactor_a, interactor_b = ecoli_id_1, ecoli_id_2
                else:
                    interactor_a, interactor_b = ecoli_id_2, ecoli_id_1

                # NOTE(review): `pub_date` matches the keyword the other
                # parsers in this file use for InteractionReference; the
                # psimi_* keywords appear only in the PSICQUIC parsers, so
                # confirm them against the model. No confidence value is
                # parsed for iRefIndex rows, so confidence stays None.
                for pmid in pmids:
                    for detection, psimi_detection in zip(detections,
                                                          psimi_detections):
                        reference = InteractionReference(
                            interaction_id=interaction.id,
                            psimi_detection=psimi_detection,
                            detection_method=detection,
                            author_ln=author,
                            pub_date=pub_date,
                            pmid=pmid,
                            psimi_type=psimi_type,
                            interaction_type=interaction_type,
                            psimi_db=source_field.split('MI:')[1][:4],
                            source_db=source_field.split('(')[1][:-1],
                            confidence=None,
                            interactor_a=interactor_a,
                            interactor_b=interactor_b)
                        session.add(reference)

                source = session.query(InteractionSource).filter(
                    InteractionSource.interaction_id == interaction.id,
                    InteractionSource.data_source == 'iRefIndex').first()
                if source is None:
                    source = InteractionSource(interaction_id=interaction.id,
                                               data_source='iRefIndex')
                    session.add(source)
    session.commit()
    print(session.query(Interaction).count())