def main(logger):
    """Build the ELM-based PTM network and export it as a PSI-MI SQLite file."""
    db_api = PsimiSQL(SQL_SEED)
    node_names_to_id = {}

    loadUniprotFile(UNIPROT_DATA_FILE)

    # Collect every dssp file from the LAB folder into the (module-level) file_list.
    lab_folder = 'PTM/databases/ELMpred/dssp/LAB/'
    for entry in os.listdir(lab_folder):
        file_list.append(lab_folder + entry)

    # Run the matcher on at most the first 14999 files.
    for counter, dssp_file in enumerate(file_list, start=1):
        if counter == 15000:
            break
        get_match(dssp_file)

    get_scores()
    get_protein_id()
    get_taxid()
    get_domain()
    logging.debug('Done creating elm map. Starting adding to DB structure')

    # SELECT elm_prot_id, domain_prot_id, taxid from elm_to_prot
    for match in ELMmaches:
        # Skip matches that lack either endpoint.
        if not match.domain_prot_id or not match.elm_prot_id:
            continue
        for elm_prot_id in match.elm_prot_id:
            for domain_prot_id in match.domain_prot_id:
                # Fetch (or create) the node dictionaries; existing db rows are reused.
                source_dict = insert_or_get_node_dict(
                    elm_prot_id, "Uniprot", match.taxid, node_names_to_id, db_api)
                target_dict = insert_or_get_node_dict(
                    domain_prot_id, "Uniprot", match.taxid, node_names_to_id, db_api)

                # Newly created node dicts have no 'id' yet -> insert them.
                if 'id' not in source_dict:
                    db_api.insert_node(source_dict)
                if 'id' not in target_dict:
                    db_api.insert_node(target_dict)

                edge_dict = {
                    'publication_ids': 'pubmed:26615199',
                    'layer': '2',
                    'source_db': DB_TYPE,  # ontology database citation
                    'interaction_identifiers': None,
                    'confidence_scores': None,  # if available
                    'interaction_detection_method': None,  # probably exp type
                    'interaction_types': 'MI:0190(interaction type)',
                    'first_author': None
                }
                db_api.insert_edge(source_dict, target_dict, edge_dict)

    # Saving to a DB_TYPE.db file
    db_api.save_db_to_file(EXPORT_DB_LOCATION)
def main(logger):
    """Parse the PTM data file (human-human rows only) into a PSI-MI SQLite db."""
    db_api = PsimiSQL(SQL_SEED)
    node_names_to_id = {}

    with open(DATA_FILE, encoding='ISO-8859-1') as data:
        # The first four lines are header material.
        for _ in range(4):
            data.readline()

        for line in data:
            columns = line.split('\t')
            # Keep only human-human interactions.
            if columns[3] != 'human' or columns[8] != 'human':
                continue
            taxid = 'taxid:9606'

            # Fetch (or create) the node dictionaries; existing db rows are reused.
            source_dict = insert_or_get_node_dict(columns[2], "Uniprot", taxid,
                                                  node_names_to_id, db_api)
            target_dict = insert_or_get_node_dict(columns[6], "Uniprot", taxid,
                                                  node_names_to_id, db_api)

            # Newly created node dicts have no 'id' yet -> insert them.
            if 'id' not in source_dict:
                db_api.insert_node(source_dict)
            if 'id' not in target_dict:
                db_api.insert_node(target_dict)

            edge_dict = {
                'publication_ids': 'pubmed:22135298',
                'layer': '2',
                'source_db': DB_TYPE,
                'interaction_identifiers': None,
                'confidence_scores': None,  # if available
                'interaction_detection_method': None,  # probably exp type
                'interaction_types': 'MI:0217(phosphorylation)|is_directed:true|is_direct:false',
                'first_author': None
            }
            db_api.insert_edge(source_dict, target_dict, edge_dict)

    # Saving to a DB_TYPE.db file
    db_api.save_db_to_file(EXPORT_DB_LOCATION)
def main(logger):
    """Parse the layer-1 data file into a PSI-MI SQLite database."""
    db_api = PsimiSQL(SQL_SEED)

    with open(DATA_FILE) as data:
        data.readline()  # header

        for line in data:
            columns = line.split('\t')
            taxid = 'taxid:9606'

            # Fetch (or create) the node dictionaries; existing db rows are reused.
            source_dict = get_node_a(columns[0], taxid, db_api)
            target_dict = get_node_b(columns[2], taxid, db_api)

            # Newly created node dicts have no 'id' yet -> insert them.
            if 'id' not in source_dict:
                db_api.insert_node(source_dict)
            if 'id' not in target_dict:
                db_api.insert_node(target_dict)

            edge_dict = {
                'publication_ids': 'pubmed:20005715',
                'layer': '1',
                'source_db': DB_TYPE,
                'interaction_identifiers': None,
                'confidence_scores': None,
                'interaction_detection_method': None,
                'interaction_types': 'MI:0190(interaction type)|is_directed:false|is_direct:false',
                'first_author': None
            }
            db_api.insert_edge(source_dict, target_dict, edge_dict)

    # Saving to a DB_TYPE.db file
    db_api.save_db_to_file(EXPORT_DB_LOCATION)
def main(logger):
    """Parse the SignaLink 2.1 tsv export into a PSI-MI SQLite database."""
    inserted_nodes = {}
    UNIPROT_TO_TAX_ID_MAP = get_taxid_map_dict(TAX_ID_MAP_FILE_LOCATION)

    db_api = PsimiSQL(SQL_SEED)

    slk21_reader = csv.reader(open(SLK_21_FILE_LOCATION, encoding="ISO-8859-1"),
                              delimiter='\t', quotechar='"')
    next(slk21_reader)  # header

    node_names_to_id = {}

    for row in slk21_reader:
        mitab_source_pathways = get_mitab_pathways_list_string(row[1])
        mitab_target_pathways = get_mitab_pathways_list_string(row[4])

        # Rows whose endpoints have no known tax id cannot be mapped -> skip.
        if row[0] not in UNIPROT_TO_TAX_ID_MAP or row[3] not in UNIPROT_TO_TAX_ID_MAP:
            continue

        # Fetch (or create) the node dictionaries; existing db rows are reused.
        source_dict = insert_or_get_node_dict(UNIPROT_TO_TAX_ID_MAP, row[0],
                                              mitab_source_pathways, row[2],
                                              node_names_to_id, db_api)
        target_dict = insert_or_get_node_dict(UNIPROT_TO_TAX_ID_MAP, row[3],
                                              mitab_target_pathways, row[5],
                                              node_names_to_id, db_api)

        effect = EFFECT_MAP[row[8]]
        is_direct = ("true"
                     if "MI:0407(direct interaction)" in IS_DIRECT_MAP[row[6].lower()]
                     else "false")
        is_directed = ("true"
                       if IS_DIRECTED_MAP[row[7].lower()] == "directed"
                       else "false")

        edge_dict = {
            'interaction_detection_method': None,
            'first_author': None,
            'publication_ids': get_mitab_publication_list_string(row[9]),
            'interaction_types': "%s|is_directed:%s|is_direct:%s" % (effect, is_directed, is_direct),
            'source_db': 'SLKv2.1',
            'interaction_identifiers': None,
            'confidence_scores': None,
            'layer': "8"
        }
        db_api.insert_edge(source_dict, target_dict, edge_dict)

    # Saving to a DB_TYPE.db file
    db_api.save_db_to_file(DB_DESTINATION)
def main(logger):
    """Parse the miRDeathDB file (human and fruit-fly rows) into a PSI-MI db."""
    db_api = PsimiSQL(SQL_SEED)

    with open(DATA_FILE) as data:
        data.readline()  # header
        node_names_to_id = {}

        for line in data:
            columns = line.strip().split('\t')
            if len(columns) == 1:
                continue
            # Keep only human (9606) and Drosophila (7227) rows.
            if columns[9] not in ('9606', '7227'):
                continue

            prefix = "hsa-" if columns[9] == '9606' else "dme-"
            mirbase_id = prefix + columns[0]
            # there are two malformed ID in the database:
            # hsa-miR-149* --> hsa-miR-149
            # hsa-"miR-34a,b,c" --> hsa-"miR-34"
            mirbase_id = mirbase_id.replace('*', '').replace('"', '').replace('a,b,c', '')

            taxid = 'taxid:' + columns[9]
            source_dict = insert_or_get_node_dict(mirbase_id, 'miRBase', taxid,
                                                  node_names_to_id, db_api)
            target_dict = insert_or_get_node_dict(columns[3], 'GeneCards', taxid,
                                                  node_names_to_id, db_api)

            pubmed_ids = ['22743998']  # miRDeathDB publication
            extra_ref = columns[8].strip()
            if extra_ref:
                pubmed_ids.append(extra_ref)
            pubmed_ids = {'pubmed:' + pid for pid in pubmed_ids}

            # Inserting edges
            edge_dict = {
                'publication_ids': "|".join(pubmed_ids),
                'layer': '5',
                'source_db': 'miRDeathDB',
                'interaction_identifiers': None,
                'confidence_scores': None,
                'interaction_detection_method': None,
                'interaction_types': "is_directed:true|is_direct:true|MI:0571(mrna cleavage)",
                'first_author': None
            }
            db_api.insert_edge(source_dict, target_dict, edge_dict)

    # Saving to a DB_TYPE.db file
    db_api.save_db_to_file(DB_DESTINATION)
    print("miRDeathDB finished")
def main(logger):
    """Parse the PSSM-based lncRNA->gene predictions into a PSI-MI SQLite db."""
    db_api = PsimiSQL(SQL_SEED)

    with open(DATA_FILE) as data:
        node_names_to_id = {}

        for line in data:
            columns = line.split('\t')
            # The lncRNA name is the fourth '|'-separated field of column 0.
            lnc_name = columns[0].split('|')[3]
            taxid = 'taxid:9606'

            # Fetch (or create) the node dictionaries; existing db rows are reused.
            source_dict = insert_or_get_node_dict(lnc_name, taxid, node_names_to_id, db_api)
            target_dict = insert_or_get_node_dict(columns[2], taxid, node_names_to_id, db_api)

            # NOTE(review): 'trascriptional' looks like a typo for 'transcriptional',
            # but the string is kept as-is because downstream code may match on it.
            edge_dict = {
                'publication_ids': 'pubmed:28591841|pubmed:29140473',
                'layer': '6',
                'source_db': 'PSSMprediction',
                'interaction_identifiers': None,
                'confidence_scores': None,
                'interaction_detection_method': 'MI:1176(sequence based prediction of gene regulatory region binding sites)',
                'interaction_types': 'MI:2247(trascriptional regulation)|is_directed:true|is_direct:false',
                'first_author': None
            }
            db_api.insert_edge(source_dict, target_dict, edge_dict)

    # Saving to a DB_TYPE.db file
    db_api.save_db_to_file(EXPORT_DB_LOCATION)
def main(logger):
    """Parse the curated autophagy interaction csv into a PSI-MI SQLite database.

    :param logger: unused here; kept for interface consistency with the other parsers
    """
    db_api = PsimiSQL(SQL_SEED)
    # NOTE(review): this connection is never queried; kept because connecting
    # creates the destination file as a side effect — confirm before removing.
    dest_conn = sqlite3.connect(DB_DESTINATION)

    # Loop-invariant lookup tables, hoisted out of the per-line loop.
    # Layer mapping
    layer_map_dict = {'Interaction between autophagy proteins': 0}
    # Effect mapping
    effect_map_dict = {
        'unknown': 'unknown',
        "stimulation": 'MI:0624(stimulation)',
    }
    # Directedness mapping
    direct_map_dict = {
        "direct": "MI:0407(directed)",
        "indirect": "MI:2246(indirect)",
    }
    # Source database name normalization
    source_map = {
        'BioGRID': 'TheBiogrid',
        'Behrends et Al. 2010': 'Behrends'
    }

    # Parsing file
    with open(DATA_FILE, encoding='ISO-8859-1') as data:
        data.readline()  # header

        for line in data:
            line = line.strip().split(',')
            source_uniprot = line[2]
            target_uniprot = line[3]
            source_genename = line[0]
            target_genename = line[1]

            # Creating the node dicts; if the node is already in the db that row is reused.
            source_dict = get_node_a('Uniprot:' + source_uniprot, 'taxid:9606',
                                     source_genename, db_api)
            target_dict = get_node_b('Uniprot:' + target_uniprot, 'taxid:9606',
                                     target_genename, db_api)

            # Newly created node dicts have no 'id' yet -> insert them.
            if 'id' not in source_dict:
                db_api.insert_node(source_dict)
            if 'id' not in target_dict:
                db_api.insert_node(target_dict)

            # Setting up identifiers
            directness = direct_map_dict[line[5]]
            effect = effect_map_dict[line[6]]
            ident = ('effect:' + effect + '|is_direct:' + directness)
            # Publications
            pubs = '|pubmed:'.join(line[7].split('|'))
            # Source db: strip the parenthesised suffix and quotes, then normalize.
            sourcedb = line[8].split('(')[0].replace('"', '')
            source = source_map.get(sourcedb, sourcedb)

            edge_dict = {
                'publication_ids': 'pubmed:' + pubs,
                'layer': layer_map_dict[line[4]],
                'source_db': source,
                'interaction_identifiers': ident,
                'confidence_scores': None,
                'interaction_detection_method': None,
                'interaction_types': None,
                'first_author': None
            }
            db_api.insert_edge(source_dict, target_dict, edge_dict)

    # Saving to a DB_TYPE.db file
    db_api.save_db_to_file(DB_DESTINATION)
def main(logger):
    """Parse miR2Disease data, resolving publications via the NCBI eutils service.

    A json file next to the script caches the resolved search-term -> pubmed id
    mapping so repeated runs do not hit the NCBI service again.

    :param logger: unused here; kept for interface consistency with the other parsers
    """
    db_api = PsimiSQL(SQL_SEED)

    all_data = []
    pubmed_id_map = {}
    cache_file_name = './pubmed_id_map_cache_for_miR2Disease.json'
    if os.path.exists(cache_file_name):
        with open(cache_file_name) as cache:
            pubmed_id_map = json.load(cache)

    with open(DATA_FILE, encoding='ISO-8859-1') as data:
        # Skipping the header (three lines).
        for _ in range(3):
            data.readline()
        node_names_to_id = {}
        lines = 0
        for line in data:
            columns = line.split('\t')
            if len(columns) <= 1:
                continue
            lines += 1
            if lines % 50 == 0:
                print("processed lines (miR2Disease): %d" % lines)
            columns[3] = columns[3].strip()
            if not is_well_formed_id(columns[0].strip()) or \
                    not is_well_formed_id(columns[1].strip()):
                print("Warning: malformed ID, link skipped")
                continue
            all_data.append(columns)

            if columns[3] not in pubmed_id_map:
                # Build an esearch term from the publication year and cleaned title.
                search_term = columns[3].replace(".", ' ').replace(
                    ' and ', ' ').replace(' or ', ' ').replace("'", '').strip()
                search_term = "%s[pdat] AND %s" % (columns[2].strip(), search_term)
                search_term = urllib.parse.quote(search_term, safe='')
                URL = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&report=uilist&retmode=json&term=' + search_term
                resp = requests.get(URL)
                pubmed_id_list = json.loads(resp.text)['esearchresult']['idlist']
                # Accept the result only if it is unambiguous (exactly one hit).
                if pubmed_id_list and len(pubmed_id_list) == 1:
                    pubmed_id_map[columns[3]] = pubmed_id_list[0]
                else:
                    print("WARNING: pmid not found")
                    pubmed_id_map[columns[3]] = None

    print("processed lines (miR2Disease): %d" % lines)
    print("saving output db")
    for columns in all_data:
        source_dict = insert_or_get_node_dict(columns[0], 'miRBase', 'taxid:9606',
                                              node_names_to_id, db_api)
        target_dict = insert_or_get_node_dict(columns[1], 'GeneCards', 'taxid:9606',
                                              node_names_to_id, db_api)

        pubmed_ids = ['18927107']  # mir2Disease publication
        # dict.get collapses the 'in map and value is truthy' double lookup.
        if pubmed_id_map.get(columns[3]):
            pubmed_ids.append(str(pubmed_id_map[columns[3]]).strip())
        pubmed_ids = set(map(lambda x: ("pubmed:" + x).strip(), pubmed_ids))

        # Inserting edges
        edge_dict = {
            'publication_ids': "|".join(pubmed_ids),
            'layer': '5',
            'source_db': 'miR2Disease',
            'interaction_identifiers': None,
            'confidence_scores': None,
            'interaction_detection_method': None,
            'interaction_types': "is_directed:true|is_direct:true|MI:0571(mrna cleavage)",
            'first_author': None
        }
        db_api.insert_edge(source_dict, target_dict, edge_dict)

    # Saving to a DB_TYPE.db file
    db_api.save_db_to_file(DB_DESTINATION)

    # save pubmed_id_map, so that we can re-use next time
    with open(cache_file_name, 'w') as cache:
        json.dump(pubmed_id_map, cache, indent=4, sort_keys=True)
def main(logger):
    """Parse the NPInter RNA-RNA interaction file into a PSI-MI SQLite database.

    :param logger: unused here; kept for interface consistency with the other parsers
    """
    db_api = PsimiSQL(SQL_SEED)
    node_names_to_id = {}

    # Loop-invariant mapping from NPInter database names to our id types,
    # hoisted out of the per-line loop (it was rebuilt for every matching row).
    id_type_map = {
        'NONCODE': {'id_type': 'NONCODE', 'use_name': False},
        'miRBase': {'id_type': 'miRBase', 'use_name': True},
        'UniProt': {'id_type': 'Uniprot', 'use_name': False},
        'UniGene': {'id_type': 'GeneCards', 'use_name': True},
        'RefSeq': {'id_type': 'RefSeq', 'use_name': False},
    }

    with open(DATA_FILE, encoding='ISO-8859-1') as data:
        lines = 0
        for line in data:
            lines += 1
            if lines % 50000 == 0:
                print("processed lines: %d" % lines)
            columns = line.strip().split('\t')
            # Keep only RNA-RNA rows of known species.
            if columns[-1] != 'RNA-RNA':
                continue
            if columns[12] not in SPECIES_DICT:
                continue
            tax_id = 'taxid:' + SPECIES_DICT[columns[12]]['tax_id']

            source_id = fix_id(columns[2].strip(), columns[3].strip(),
                               columns[4].strip(), SPECIES_DICT[columns[12]],
                               id_type_map)
            target_id = fix_id(columns[6].strip(), columns[7].strip(),
                               columns[4].strip(), SPECIES_DICT[columns[12]],
                               id_type_map)
            # Skip rows whose ids could not be normalized.
            if not source_id or not target_id:
                continue

            source_dict = insert_or_get_node_dict(source_id, tax_id,
                                                  node_names_to_id, db_api)
            target_dict = insert_or_get_node_dict(target_id, tax_id,
                                                  node_names_to_id, db_api)

            pubmed_ids = ['27087310']  # NPInter publication
            pubmed_id = columns[11].strip()
            # Accept only purely numeric pubmed ids from the data file.
            if len(pubmed_id) > 0 and re.search(r"^\d+$", pubmed_id):
                pubmed_ids.append(pubmed_id)
            pubmed_ids = set(map(lambda x: 'pubmed:' + x, pubmed_ids))

            edge_dict = {
                'publication_ids': "|".join(pubmed_ids),
                'layer': '7',
                'source_db': 'NPInter',
                'interaction_identifiers': None,
                'confidence_scores': None,
                'interaction_detection_method': None,
                'interaction_types': "MI:0407(direct interaction)|is_directed:true|is_direct:true",
                'first_author': None
            }
            db_api.insert_edge(source_dict, target_dict, edge_dict)

    print("processed lines: %d" % lines)
    # Saving to a DB_TYPE.db file
    db_api.save_db_to_file(DB_DESTINATION)
    print("NPInter finished")
class MolecularIDMapper:
    """Maps node identifiers of a parsed source db to Uniprot / RNACentral ids."""

    # Shared query: map a foreign id to its swissprot uniprot ac for one taxid.
    # Parameterized (?) instead of %-interpolated to avoid quoting bugs/injection.
    PROTEIN_MAP_QUERY = (
        "SELECT UNIPROT_AC.uniprot_ac, UNIPROT_AC.uniprot_ac_alt_acc FROM UNIPROT_AC "
        "JOIN MAPP ON MAPP.uniprot_ac=UNIPROT_AC.id "
        "JOIN SPECIES ON SPECIES.id=UNIPROT_AC.taxon "
        "WHERE SPECIES.tax_id=? AND MAPP.foreign_id=? GROUP BY MAPP.foreign_id")

    def __init__(self, db, layer, PROT_DBname, LNCRNAMAP_DBname):
        """
        :param db: name of the parsed source database
        :param layer: network layer of the source db (e.g. 'lncRNA', 'miRNA')
        :param PROT_DBname: if value is not None, database is created in memory
        :param LNCRNAMAP_DBname: location of the lncRNA/miRNA mapping db
        :argument DICTIONARY_DB_LOCATION: location of the mapping db, output of create_mapping_db
        :argument SQL_SEED_LOCATION
        :argument SOURCE_DB_LOCATION: location of the parsed source database
        :argument DESTINATION_DB_LOCATION: location where the mapped db will be saved
        """
        # Declaring, and assigning constants
        self.DICTIONARY_DB_LOCATION = PROT_DBname
        self.SQL_SEED_LOCATION = '../../SLKlib/SQLiteDBApi/network-db-seed.sql'
        self.SOURCE_DB_TYPE = db
        self.layer = layer
        # The db we want to map
        self.SOURCE_DB_LOCATION = 'all_output/' + db + '.db'
        # Saving location
        self.DESTINATION_DB_LOCATION = '../../SLKlib/mapper/protein/output/' + db + '_mapped.db'
        # Protein map db
        self.DICTIONARY_DB = sqlite3.connect(self.DICTIONARY_DB_LOCATION)
        self.DICTIONARY_DB_CURSOR = self.DICTIONARY_DB.cursor()
        # lncRNA map db
        if self.layer == 'lncRNA' or self.layer == 'miRNA':
            self.LNCRNAMAP_DB_LOCATION = LNCRNAMAP_DBname
            self.LNCRNAMAP_DB = sqlite3.connect(self.LNCRNAMAP_DB_LOCATION)

        self.PROT_DBname = PROT_DBname
        if self.PROT_DBname is not None:
            # Read database to tempfile
            self.con = sqlite3.connect(self.PROT_DBname)
            tempfile = io.StringIO()
            for line in self.con.iterdump():
                tempfile.write('%s\n' % line)
            self.con.close()
            tempfile.seek(0)
            # Create a database in memory and import from tempfile
            self.PROT_DB = sqlite3.connect(":memory:")
            with self.PROT_DB:
                self.PROT_DB.cursor().executescript(tempfile.read())
                self.PROT_DB.cursor().execute(
                    "CREATE INDEX map_uniprot ON MAPP(uniprot_ac);")
                self.PROT_DB.cursor().execute(
                    "CREATE INDEX uniprotac_id ON UNIPROT_AC(id);")
                self.PROT_DB.cursor().execute(
                    "CREATE INDEX taxid ON SPECIES(tax_id);")
                self.PROT_DB.cursor().execute(
                    "CREATE INDEX map_foreign ON MAPP(foreign_id);")
        else:
            self.PROT_DB = sqlite3.connect(self.DICTIONARY_DB_LOCATION)
            self.PROT_DB.cursor().execute(
                "CREATE INDEX index_name ON mapp (foreign_id);")

        # For lncRNA and miRNA
        if self.layer == 'lncRNA' or self.layer == 'miRNA':
            self.LNCRNAMAP_DBname = LNCRNAMAP_DBname
            if self.LNCRNAMAP_DBname is not None:
                # Read database to tempfile
                self.con = sqlite3.connect(self.LNCRNAMAP_DBname)
                tempfile = io.StringIO()
                for line in self.con.iterdump():
                    tempfile.write('%s\n' % line)
                self.con.close()
                tempfile.seek(0)
                # Create a database in memory and import from tempfile
                self.LNCRNAMAP_DB = sqlite3.connect(":memory:")
                with self.LNCRNAMAP_DB:
                    self.LNCRNAMAP_DB.cursor().executescript(tempfile.read())
                    self.LNCRNAMAP_DB.cursor().execute(
                        "CREATE INDEX index_name ON mapper (orig_ac);")
            else:
                self.LNCRNAMAP_DB = sqlite3.connect(self.LNCRNAMAP_DB_LOCATION)

        self.new_db = PsimiSQL(self.SQL_SEED_LOCATION)
        # iterating through the old_db's nodes
        self.source_db = sqlite3.connect(self.SOURCE_DB_LOCATION)
        self.source_db.row_factory = sqlite3.Row
        self.cur = self.source_db.cursor()

    def add_node(self, old_node_id, old_to_new_node_ids_dict, new_name,
                 new_taxid, new_pathways, new_topo, new_db_api):
        """
        :param old_node_id: node id from the source db's node table
        :param old_to_new_node_ids_dict: A dictionary that contains an old node id as key and new node ids as values
        :param new_name: mapped uniprot ac of the mapped node
        :param new_taxid: taxid
        :param new_pathways: pathway
        :param new_topo: topology
        :param new_db_api: A PsimiSQL object
        """
        new_node_dict = {
            "name": new_name,
            "alt_accession": None,  # we don't use it anymore
            "tax_id": new_taxid,
            "pathways": new_pathways,
            "aliases": None,  # we don't use it anymore
            "topology": new_topo
        }
        # inserting the node to the PSI-MI SQLite
        new_db_api.insert_unique_node(new_node_dict)
        # getting the new last row id of the inserted node
        new_node_id = new_node_dict['id']
        # if the node maps to more than one swissprot uniprot id it will be
        # inserted for every swissprot id and this function will be called for
        # every insertion; only the first mapping is remembered.
        if old_node_id not in old_to_new_node_ids_dict:
            old_to_new_node_ids_dict[old_node_id] = new_node_id

    def main(self):
        """Map all nodes then all edges of the source db into the new db.

        :return: (source db type, total edge count, unmappable edge count)
        """
        old_node_ids_dict = {}
        invalid_edge_counter = 0

        # MAPPING NODES
        self.cur.execute("SELECT * FROM node")
        node_counter = 0
        while True:
            # Getting data for each node
            node_row = self.cur.fetchone()
            node_counter += 1
            # Until the last row
            if node_row is None:
                break
            # Getting the old information into a dictionary
            old_node_dict = dict(node_row)
            # For all other databases
            foreign_id = old_node_dict['name'].split(':')[1].strip()
            # Taxid
            taxid = old_node_dict['tax_id'].split(':')[1].split('(')[0]

            # miRNA and lncRNA mapping
            if self.layer == 'lncRNA' or self.layer == 'miRNA':
                with self.LNCRNAMAP_DB:
                    c = self.LNCRNAMAP_DB.cursor()
                    for indiv_id in foreign_id.split(','):
                        indiv_id = indiv_id.replace('"', '').lower()
                        # BUGFIX: parameterized query (was %-interpolated)
                        c.execute(
                            "SELECT mapped_ac FROM MAPPER WHERE ? = MAPPER.orig_ac "
                            "GROUP BY MAPPER.orig_ac", (indiv_id,))
                        firstrow = c.fetchone()
                        if firstrow:
                            # BUGFIX: was m.add_node (module-level global); use self
                            self.add_node(node_row['id'], old_node_ids_dict,
                                          'RNACentral:' + firstrow[0],
                                          node_row['tax_id'],
                                          node_row['pathways'],
                                          node_row['topology'], self.new_db)
                with self.PROT_DB:
                    c2 = self.PROT_DB.cursor()
                    foreign_id = foreign_id.split(".")[0]
                    c2.execute(self.PROTEIN_MAP_QUERY, (taxid, foreign_id.lower()))
                    firstrow = c2.fetchone()
                    if firstrow:
                        self.add_node(node_row['id'], old_node_ids_dict,
                                      'Uniprot:' + firstrow[0],
                                      node_row['tax_id'],
                                      node_row['pathways'],
                                      node_row['topology'], self.new_db)
            # Protein mapping
            else:
                with self.PROT_DB:
                    c = self.PROT_DB.cursor()
                    # Getting uniprot acs for each node and adding the node
                    # with new data to the new database
                    foreign_id = foreign_id.split(".")[0]
                    c.execute(self.PROTEIN_MAP_QUERY, (taxid, foreign_id.lower()))
                    firstrow = c.fetchone()
                    if firstrow:
                        self.add_node(node_row['id'], old_node_ids_dict,
                                      'Uniprot:' + firstrow[0],
                                      node_row['tax_id'],
                                      node_row['pathways'],
                                      node_row['topology'], self.new_db)

        # MAPPING EDGES
        # Since we get the old interactor id's from this query we can simply
        # look up their new id(s) in the old_node_ids dict; if both nodes
        # mapped we add them as an edge to the new db.
        self.cur.execute("SELECT * from EDGE")
        edge_counter = 0
        while True:
            edge_row = self.cur.fetchone()
            if edge_row is None:
                break
            edge_counter += 1
            if edge_row['interactor_a_node_id'] in old_node_ids_dict and \
                    edge_row['interactor_b_node_id'] in old_node_ids_dict:
                new_node_id_a = old_node_ids_dict[edge_row['interactor_a_node_id']]
                new_node_id_b = old_node_ids_dict[edge_row['interactor_b_node_id']]
                new_node_a_dict = self.new_db.get_node_by_id(new_node_id_a)
                new_node_b_dict = self.new_db.get_node_by_id(new_node_id_b)
                new_edge_dict = dict(edge_row)
                new_edge_dict['interactor_a_node_id'] = new_node_id_a
                new_edge_dict['interactor_b_node_id'] = new_node_id_b
                new_edge_dict['source_db'] = edge_row['source_db']
                # inserting the new edge
                self.new_db.insert_edge(new_node_a_dict, new_node_b_dict,
                                        new_edge_dict)
            else:
                invalid_edge_counter += 1

        # Saving the mapped database
        self.new_db.save_db_to_file(self.DESTINATION_DB_LOCATION)
        print("\nmapping finished for: %s total edges: %d (unable to map: %d)\n"
              % (self.SOURCE_DB_TYPE, edge_counter, invalid_edge_counter))

        import slk3_db_validator
        valid = slk3_db_validator.validate_db_file(self.DESTINATION_DB_LOCATION)
        if not valid:
            print("ERROR! invalid db file created by the mapper: "
                  + self.DESTINATION_DB_LOCATION)
            sys.exit(1)
        return self.SOURCE_DB_TYPE, edge_counter, invalid_edge_counter
def main(logger):
    """Parse the SignaLink 2.0 csv export into a PSI-MI SQLite database."""
    db_api = PsimiSQL(SQL_SEED)

    csv_rows = csv.reader(open(FILE_LOCATION), delimiter=';')
    next(csv_rows)  # header
    node_names_to_id = {}

    for row in csv_rows:
        source_pathways = get_mitab_pathways_list_string(row[5])
        target_pathways = get_mitab_pathways_list_string(row[11])

        # Fetch (or create) the node dictionaries; existing db rows are reused.
        source_dict = insert_or_get_node_dict(row[1], row[0], source_pathways,
                                              row[4].strip(), row[3],
                                              node_names_to_id, db_api)
        target_dict = insert_or_get_node_dict(row[7], row[6], target_pathways,
                                              row[10].strip(), row[9],
                                              node_names_to_id, db_api)

        effect = EFFECT_MAP[row[15]]
        is_direct = ("true"
                     if "MI:0407(direct interaction)" in IS_DIRECT_MAP[row[14].lower()]
                     else "false")
        is_directed = ("true"
                       if IS_DIRECTED_MAP[row[13].lower()] == "directed"
                       else "false")

        # Setting up the interaction type
        interaction_types = "%s|is_directed:%s|is_direct:%s" % (effect, is_directed, is_direct)

        # Confidence scores: "name:value" pairs, whitespace removed, deduplicated.
        new_scores = []
        if row[18] != '':
            for raw_score in row[18].split(","):
                score_name = raw_score.split(":")[0].replace(" ", "")
                score_value = raw_score.split(":")[1].replace(" ", "")
                score = f'SLK2 {score_name}:{score_value}'
                if score not in new_scores:
                    new_scores.append(score)

        edge_dict = {
            'interaction_detection_method': None,
            'first_author': None,
            'publication_ids': get_mitab_publication_list_string(row[16]),
            'interaction_types': interaction_types,
            'source_db': "SLKv2.0",
            'interaction_identifiers': None,
            'confidence_scores': "|".join(new_scores),
            'layer': "8"
        }
        db_api.insert_edge(source_dict, target_dict, edge_dict)

    # Saving to a DB_TYPE.db file
    db_api.save_db_to_file(DB_DESTINATION)
def _upsert_signor_node(db_api, name, alt_accession, tax_id, pathway):
    """Insert a node, or extend an existing node's pathway list.

    Returns the (possibly updated) node dictionary.
    """
    node_dict = db_api.get_node(name, tax_id)
    if node_dict:
        # Node exists: only its pathway list may need extending.
        if pathway not in node_dict['pathways']:
            node_dict['pathways'] += '|' + pathway
            db_api.update_node(node_dict)
    else:
        node_dict = {
            'name': name,
            'alt_accession': alt_accession,
            'tax_id': tax_id,
            'pathways': pathway,
            'aliases': None,
            'topology': ""
        }
        db_api.insert_node(node_dict)
    return node_dict


def main(logger):
    """Parse the Signor pathway csv files into a PSI-MI SQLite database.

    Only human (taxid 9606) protein-protein rows are kept.

    :param logger: unused here; kept for interface consistency with the other parsers
    """
    # Initiating a PsimiSQL class
    db_api = PsimiSQL(SQL_SEED)

    # Making the script user friendly
    file_counter = 1
    print("Started parsing .csv files")

    # Parsing data files
    for csv_file_location in CSV_LIST:
        csv_file_name = csv_file_location.split('/')[-1]
        sys.stdout.write("Parsing '%s' (%d/%d)\r" %
                         (csv_file_name, file_counter, NUMBER_OF_FILES))
        file_counter += 1  # BUGFIX: the progress counter was never incremented

        csv_file = csv.reader(open(csv_file_location, encoding="ISO-8859-1"),
                              delimiter=';', quotechar='"')
        pathway = FILENAME_TO_PATHWAY_MAP[csv_file_name]

        # NOTE(review): the header row is not skipped explicitly; it is dropped
        # by the protein/protein/9606 filter below.
        for cells in csv_file:
            type_a = cells[1].lower()
            type_b = cells[5].lower()
            taxids = cells[12].split(';')[0]
            if type_a != 'protein' or type_b != 'protein' or taxids != '9606':
                continue

            node_a_dict = _upsert_signor_node(
                db_api, f'Uniprot:{cells[2]}',
                'entrez gene/locuslink:' + cells[0], 'taxid:' + taxids, pathway)
            # BUGFIX: interactor B's uniprot ac is in cells[6]; the original
            # reused cells[2] (interactor A's id) by copy-paste — cells[4..6]
            # mirror cells[0..2] for the B side.
            node_b_dict = _upsert_signor_node(
                db_api, f'Uniprot:{cells[6]}',
                'entrez gene/locuslink:' + cells[4], 'taxid:' + taxids, pathway)

            # Publications: the row's own reference plus the Signor paper.
            publication_id = ['pubmed:' + cells[21]]
            publication_id.append("pubmed:26467481")

            effect = EFFECT_MAP[cells[8]]
            molecular_background = MOLECULAR_MAP[cells[9]]
            inttype_final = effect + '|' + molecular_background

            is_direct = IS_DIRECT_MAP[cells[22]].strip()
            is_direct = ("true"
                         if "MI:0407(direct interaction)" in is_direct
                         else "false")

            # Setting up the interaction type
            interaction_types = "%s|is_directed:%s|is_direct:%s" \
                                % (inttype_final, "true", is_direct)

            edge_dict = {
                'interaction_detection_method': None,
                'first_author': None,
                'publication_ids': "|".join(publication_id),
                'interaction_types': interaction_types,
                'source_db': 'Signor',
                'interaction_identifiers': None,
                'confidence_scores': None,
                'layer': "8"
            }
            db_api.insert_edge(node_a_dict, node_b_dict, edge_dict)

    print("Parsing files finished!")
    print("Finished parsing Signor. Saving db to %s.db" % (DB_TYPE))
    db_api.save_db_to_file(DB_DESTINATION)
def main(logger):
    """Parse the SignaLink 2.0 layer-1 csv into a PSI-MI SQLite database."""
    db_api = PsimiSQL(SQL_SEED)

    with open(DATA_FILE) as data:
        data.readline()  # header

        for line in data:
            columns = line.strip().split(';')
            taxid = 'taxid:9606'

            # Fetch (or create) the node dictionaries; existing db rows are reused.
            source_pathways = '|'.join(columns[4].replace('d ', 'd').split(','))
            target_pathways = '|'.join(columns[10].replace('d ', 'd').split(','))
            source_dict = get_node_a(columns[1], taxid, source_pathways, db_api)
            target_dict = get_node_b(columns[7], taxid, target_pathways, db_api)

            # Newly created node dicts have no 'id' yet -> insert them.
            if 'id' not in source_dict:
                db_api.insert_node(source_dict)
            if 'id' not in target_dict:
                db_api.insert_node(target_dict)

            # Pubmed references
            pub_id = '|pubmed:'.join(columns[16].split('|'))

            # Directedness
            isdirect = 'true' if columns[14] == 'direct' else 'false'
            isdirected = 'true' if columns[13] == 'PPI directed' else 'false'

            # Effect (only stimulation/inhibition carry an MI term)
            effect_terms = {
                'stimulation': 'MI:0624(stimulation)',
                'inhibition': 'MI:0623(inhibition)',
            }
            if columns[15] in effect_terms:
                interaction_types = "%s|is_directed:%s|is_direct:%s" \
                                    % (effect_terms[columns[15]], isdirected, isdirect)
            else:
                interaction_types = "is_directed:%s|is_direct:%s" \
                                    % (isdirected, isdirect)

            edge_dict = {
                'publication_ids': 'pubmed:' + pub_id,
                'layer': '1',
                'source_db': 'SLKv2.0',
                'interaction_identifiers': None,
                'confidence_scores': None,
                'interaction_detection_method': None,
                'interaction_types': interaction_types,
                'first_author': None
            }
            db_api.insert_edge(source_dict, target_dict, edge_dict)

    # Saving to a DB_TYPE.db file
    db_api.save_db_to_file(EXPORT_DB_LOCATION)
def main(logger):
    """Parse BioGRID mitab files into a PSI-MI SQLite database."""
    db_api = PsimiSQL(SQL_SEED)

    accepted_taxids = ('taxid:9606', 'taxid:7227', 'taxid:6239', 'taxid:7955')
    # interaction types in biogrid:
    # direct:
    #  - psi-mi:"MI:0407"(direct interaction)
    #  - psi-mi:"MI:0915"(physical association)
    #  - psi-mi:"MI:0914"(association)
    # indirect:
    #  - psi-mi:"MI:0799"(additive genetic interaction defined by inequality)
    #  - psi-mi:"MI:0403"(colocalization)
    #  - psi-mi:"MI:0796"(suppressive genetic interaction defined by inequality)
    #  - psi-mi:"MI:0794"(synthetic genetic interaction defined by inequality)
    indirect_mi_numbers = ("0799", "0403", "0796", "0794")
    known_mi_numbers = ("0407", "0915", "0914") + indirect_mi_numbers

    for file in RAW_FILE_LIST:
        with open(file) as data:
            data.readline()  # header
            node_names_to_id = {}
            lines = 0
            for line in data:
                if line.strip() == '' or line.strip().startswith("#"):
                    continue
                lines += 1
                if lines % 50000 == 0:
                    print("processed lines (biogrid): %d" % lines)

                # A lone '-' cell means "no value" -> replace with empty string.
                columns = ["" if c.strip() == "-" else c for c in line.split('\t')]

                tax_id_a = columns[9].strip().lower()
                tax_id_b = columns[10].strip().lower()
                if tax_id_a not in accepted_taxids or tax_id_b not in accepted_taxids:
                    continue

                biogrid_id_a = [x for x in columns[2].split("|")
                                if x.strip().lower().startswith("biogrid:")][0][8:]
                biogrid_id_b = [x for x in columns[3].split("|")
                                if x.strip().lower().startswith("biogrid:")][0][8:]

                # Fetch (or create) the node dictionaries.
                source_dict = insert_or_get_node_dict(
                    biogrid_id_a, 'BioGrid', tax_id_a, node_names_to_id, db_api)
                target_dict = insert_or_get_node_dict(
                    biogrid_id_b, 'BioGrid', tax_id_b, node_names_to_id, db_api)

                mi_number = columns[11][11:15]
                if mi_number not in known_mi_numbers:
                    print("warning: unknown interaction type: " + columns[11])
                is_direct = mi_number not in indirect_mi_numbers

                # we add the MI term to the interaction_types
                # but we skip MI:0407(direct interaction) -> this info is
                # already presented in the is_direct attribute
                mi_term = "|" + columns[11].replace("psi-mi:", "").replace("\"", "")
                if "MI:0407" in mi_term:
                    mi_term = ""
                interaction_types = "is_directed:false|is_direct:%s%s" % (
                    str(is_direct).lower(), mi_term)

                # Interaction detection methods: psi-mi:"MI:0018"(two hybrid)
                detection_methods = []
                for method in columns[6].split("|"):
                    if method.lower().startswith('psi-mi'):
                        method = method[7:]
                    detection_methods.append(method.replace("\"", ""))

                # pubmed ids: pubmed:10747089
                pubmed_ids = set()
                for ref in columns[8].split("|"):
                    if ref.lower().startswith('pubmed'):
                        ref = ref[7:]
                    if re.search("^\\d+$", ref):
                        pubmed_ids.add(ref)
                pubmed_ids.add("30476227")  # latest biogrid publication
                publication_ids = "|".join("pubmed:" + pid for pid in pubmed_ids)

                edge_dict = {
                    'publication_ids': publication_ids,
                    'layer': '3',
                    'source_db': 'TheBiogrid',
                    'interaction_identifiers': None,
                    'confidence_scores': None,
                    'interaction_detection_method': "|".join(detection_methods),
                    'interaction_types': interaction_types,
                    'first_author': None
                }
                db_api.insert_edge(source_dict, target_dict, edge_dict)

        print("processed lines: %d" % lines)

    # Saving to a DB_TYPE.db file
    db_api.save_db_to_file(EXPORT_DB_LOCATION)
def main(logger):
    """Parse the miRecords interaction file into a PsimiSQL database.

    Each data row yields a miRBase miRNA -> RefSeq target edge; malformed
    miRBase identifiers are normalized or, failing that, skipped with a
    warning.

    :param logger: logger instance (unused here; kept for a uniform parser API)
    """
    db_api = PsimiSQL(SQL_SEED)
    node_names_to_id = {}
    known_prefixes = ("hsa-", "dme-", "cel-", "dre-")

    with open(DATA_FILE) as data:
        data.readline()  # header row
        for row in data:
            fields = row.split('\t')
            # both species columns must be recognized
            if fields[1] not in SPECIES_DICT or fields[5] not in SPECIES_DICT:
                continue

            # there are a few kinds of malformed miRbase IDs, like:
            #   [has-let-7a3b]  --> hsa-let-7a3b
            #   hsa-miR-34b*    --> hsa-miR-34b
            #   miR-143         --> <tax_id>-miR-143
            mirbase_id = fields[6].replace("[", "").replace("]", "").replace(
                "has-", "hsa-")
            if not mirbase_id.startswith(known_prefixes):
                if not mirbase_id.startswith("miR-"):
                    print(
                        "WARNING: skipping interaction due to malformed miRBase ID: "
                        + mirbase_id)
                    continue
                # bare 'miR-...' id: prepend the species prefix
                mirbase_id = SPECIES_DICT[
                    fields[5]]['id_prefix'] + "-" + mirbase_id
            mirbase_id = mirbase_id.replace("*", "")

            source_dict = insert_or_get_node_dict(
                mirbase_id, 'miRBase',
                'taxid:' + SPECIES_DICT[fields[5]]['tax_id'],
                node_names_to_id, db_api)
            target_dict = insert_or_get_node_dict(
                fields[3], 'RefSeq',
                'taxid:' + SPECIES_DICT[fields[1]]['tax_id'],
                node_names_to_id, db_api)

            interaction_types = "is_directed:true|is_direct:true|MI:0571(mrna cleavage)"

            # pubmed id example: 15105502.0
            pid_list = ['18996891']  # miRecords publication
            pubmed_id = fields[0].split('.')[0].strip()
            if pubmed_id:
                pid_list.append(pubmed_id)
            publication_ids = {'pubmed:' + pid for pid in pid_list}

            edge_dict = {
                'publication_ids': "|".join(publication_ids),
                'layer': '5',
                'source_db': 'miRecords',
                'interaction_identifiers': None,
                'confidence_scores': None,
                'interaction_detection_method': None,
                'interaction_types': interaction_types,
                'first_author': None
            }
            db_api.insert_edge(source_dict, target_dict, edge_dict)

    # Saving the to a DB_TYPE.db files
    db_api.save_db_to_file(DB_DESTINATION)
    print("miRecords finished")
def main(logger):
    """Parse the SLK3 TSV exports and build a PsimiSQL node/edge database.

    BUGFIX: each input file was opened with a bare ``open()`` passed straight
    to ``csv.reader`` and never closed; the handles are now managed by a
    ``with`` block so they are always released.

    :param logger: logger instance (unused here; kept for a uniform parser API)
    """
    db_api = PsimiSQL(SQL_SEED)

    # looping through SLK3 files
    for SLK_3_FILE_LOCATION in TSV_LIST:
        # opening each slk file and looping through it (handle now closed
        # deterministically by the context manager)
        with open(SLK_3_FILE_LOCATION, encoding="ISO-8859-1") as slk3_handle:
            SLK_3_FILE = csv.reader(slk3_handle, delimiter='\t', quotechar='"')
            next(SLK_3_FILE)  # Skipping the header
            for line in SLK_3_FILE:
                # normalize pathway names (strip spaces or quotes) and map them
                # onto the accepted pathway vocabulary
                pathways_a = get_mitab_pathways_list(line[4])
                new_pathways_a = []
                for p in pathways_a:
                    pathway_a = p
                    if " " in p:
                        pathway_a = p.replace(" ", "")
                    elif '"' in p:
                        pathway_a = p.replace('"', "")
                    new_p = accepted_pathways[pathway_a]
                    new_pathways_a.append(new_p)

                new_node_a = line[3]
                if " " in line[3]:
                    new_node_a = line[3].replace(" ", "")

                # fix a recurring typo in the curation ("Meditor")
                if line[5] == "Meditor":
                    line[5] = "Mediator"
                topologies_a = set(map(lambda x: x.strip(), line[5].split(",")))

                source_dict = {
                    "name": "Uniprot:" + new_node_a,
                    "alt_accession": "gene symbol:" + line[2],
                    "tax_id": ORGANISM_NAME_TO_MITAB_ID_MAP[line[0]],
                    "aliases": '-',
                    "pathways": "|".join(new_pathways_a),
                    "topology": "|".join(topologies_a)
                }
                db_api.insert_node(source_dict)

                pathways_b = get_mitab_pathways_list(line[9])
                new_pathways_b = []
                for p in pathways_b:
                    pathway_b = p
                    if " " in p:
                        pathway_b = p.replace(" ", "")
                    elif '"' in p:
                        pathway_b = p.replace('"', "")
                    new_p = accepted_pathways[pathway_b]
                    new_pathways_b.append(new_p)

                new_node_b = line[8]
                if " " in line[8]:
                    new_node_b = line[8].replace(" ", "")
                topologies_b = set(map(lambda x: x.strip(), line[10].split(",")))

                target_dict = {
                    "name": "Uniprot:" + new_node_b,
                    "alt_accession": "gene symbol:" + line[7],
                    "tax_id": ORGANISM_NAME_TO_MITAB_ID_MAP[line[0]],
                    "aliases": '-',
                    "pathways": "|".join(new_pathways_b),
                    "topology": "|".join(topologies_b)
                }
                db_api.insert_node(target_dict)

                # effect + molecular background make up the MI interaction terms
                effect = EFFECT_MAP[line[14].lower()]
                molecular_background = MOLECULAR_MAP[line[13].lower()]
                inttype_final = effect + '|' + molecular_background

                # directness: anything other than MI:0407 counts as indirect
                is_direct = IS_DIRECT_MAP[line[12].lower()]
                if "MI:0407(direct interaction)" in is_direct:
                    is_direct = "true"
                else:
                    is_direct = "false"

                interaction_types = "%s|is_directed:%s|is_direct:%s" % (
                    inttype_final, "true", is_direct)

                edge_dict = {
                    'interaction_detection_method': None,
                    'first_author': None,
                    'publication_ids': 'pubmed:' + line[15],
                    'interaction_types': interaction_types,
                    'source_db': 'SLKv3.0',
                    'interaction_identifiers': None,
                    'confidence_scores': None,
                    'layer': "8"
                }
                db_api.insert_edge(source_dict, target_dict, edge_dict)

    db_api.save_db_to_file(DESTINATION)
def main(logger):
    """Build a PsimiSQL database from the curated protein/PPI list.

    Three passes over CURATED_PROTEIN_LIST_FILE_LOCATION:
      1. split node names into valid / not-valid (ambiguous) sets,
      2. collect candidate edges keyed by 'geneA@geneB',
      3. insert nodes and edges for the collected candidates.

    :param logger: logger instance (unused here; kept for a uniform parser API)
    """
    # Making a set from the curated files, the set will contain the proteins
    # that does map to a unique value (has 1 ->* mapping)
    not_valid_node_set = set()
    valid_node_set = set()

    # Getting all nodes that contain a * character, that means that the node
    # represents more than one molecule. Making a list from these not unique proteins
    with open(CURATED_PROTEIN_LIST_FILE_LOCATION) as curated_protein_list_file:
        for line in curated_protein_list_file:
            line = line.strip()
            cells = line.split('\t')
            # NOTE(review): rows with more than 4 cells are treated as
            # ambiguous (1->* mapping) -- confirm against the file format
            if len(cells) > 4:
                not_valid_node_set.add(cells[0])
            else:
                # Collecting protein nodes
                valid_node_set.add(cells[0])

    # Collecting pathways from the pathway files
    PATHWAY_FILE = PATHWAY_FILE_LOCATION
    pathways = get_pathways(open(PATHWAY_FILE))

    # Initialising a PsimiTOSQL object
    parser = PsimiSQL(SQL_SEED)

    # Generating a dictionary that holds unique node objects, in the same time
    # the node sql table is filled up
    nodes = {}
    edges = {}

    # Pass 2: collect candidate edges keyed by 'geneA@geneB'
    with open(CURATED_PROTEIN_LIST_FILE_LOCATION) as PPI_FILE:
        PPI_FILE.readline()
        # Although this is a SIF formatted files, it only contains two
        # interactors in a line (a SIF files can contain more than 2
        # interactors in a line)
        for line in PPI_FILE:
            # getting the names of interacting genes in HUGO format
            cells = line.strip().split('\t')
            try:
                inttype = cells[1]
                gene_a = cells[0]
                gene_b = cells[2]
                pubmed_ids = cells[3]
            except IndexError:
                # short / malformed line
                continue

            if (gene_a not in valid_node_set) or (gene_b not in valid_node_set):
                continue

            if pubmed_ids:
                pubmed_list = pubmed_ids.split(';')
                # presumably the dataset's own publication id -- TODO confirm
                pubmed_list.append("26192618")
                if 'N/A' in pubmed_list:
                    pubmed_list.remove('N/A')
                pubmed_ids = 'pubmed:' + '|pubmed:'.join(pubmed_list)

            edge_id = gene_a + '@' + gene_b

            for type in inttype.lower().split(';'):
                # NOTE(review): final_inttype is reset each iteration and only
                # the first type ever reaches the edges dict (later iterations
                # hit the 'else: continue') -- looks intentional, verify
                final_inttype = []
                if 'association' in type:
                    selected_type = 'MI:0914(association)'
                    final_inttype.append(selected_type)
                else:
                    selected_type = 'MI:0190(interaction type)'
                    final_inttype.append(selected_type)
                if edge_id not in edges:
                    edges[edge_id] = {
                        'inserted': False,
                        'is_complex': None,
                        'pubmed': pubmed_ids,
                        'effect': '|'.join(final_inttype)
                    }
                else:
                    continue

    # Pass 3: insert nodes and edges for the collected candidates
    with open(CURATED_PROTEIN_LIST_FILE_LOCATION) as PPI_FILE:
        PPI_FILE.readline()
        for line in PPI_FILE:
            # Resetting variables
            edge_id = None
            gene_a = None
            gene_b = None
            effect = None
            edge_id = None  # duplicate reset kept from original; harmless
            try:
                cells = line.split('\t')
                gene_a = cells[0]
                gene_b = cells[2]
            except IndexError:
                continue

            # gene names containing any of these characters are skipped
            not_accepted_characters = [" ", "?", "~", ","]
            characters_in_gene_a = [
                e for e in not_accepted_characters if e in gene_a
            ]
            if len(characters_in_gene_a) > 0:
                continue
            characters_in_gene_b = [
                e for e in not_accepted_characters if e in gene_b
            ]
            if len(characters_in_gene_b) > 0:
                continue

            if (gene_a not in valid_node_set) or (gene_b not in valid_node_set):
                continue

            edge_id = gene_a + '@' + gene_b
            if edge_id in edges:
                # skip already-inserted edges and effect types we do not export
                if edges[edge_id]['is_complex'] is True or edges[edge_id][
                        'inserted'] is True or "Reference" in edges[edge_id][
                            'effect'] or "neighbor-of" in edges[edge_id][
                                'effect']:
                    continue
                else:
                    pubmed_ids = edges[edge_id]['pubmed']
                    effect = edges[edge_id]['effect']
            else:
                continue
            """ creating and inserting edges to the db """
            gene_a_pathway_list = get_pathway_list(gene_a.replace('*', ''),
                                                   pathways)
            gene_b_pathway_list = get_pathway_list(gene_b.replace('*', ''),
                                                   pathways)

            # If the node is in the not_valid_node set, it is not inserted
            if gene_a not in not_valid_node_set:
                gene_a = gene_a.replace('*', '')
                if gene_a in nodes:
                    interactor_a = nodes[gene_a]
                else:
                    interactor_a = nodes[gene_a] = {
                        'name': 'HGNC:' + gene_a,
                        'alt_accession': 'HGNC:' + gene_a,
                        'tax_id': 'taxid:9606',
                        'pathways': '|'.join(gene_a_pathway_list),
                        'aliases': None
                    }
                    # NOTE(review): insert only on first sight of the gene;
                    # placement reconstructed from collapsed source -- verify
                    parser.insert_node(interactor_a)
            else:
                continue

            if gene_b not in not_valid_node_set:
                gene_b = gene_b.replace('*', '')
                if gene_b in nodes:
                    interactor_b = nodes[gene_b]
                else:
                    interactor_b = nodes[gene_b] = {
                        'name': 'HGNC:' + gene_b,
                        'alt_accession': 'HGNC:' + gene_b,
                        'tax_id': 'taxid:9606',
                        'pathways': '|'.join(gene_b_pathway_list),
                        'aliases': None
                    }
                    # NOTE(review): see matching note for interactor_a
                    parser.insert_node(interactor_b)
            else:
                continue

            interaction_types = "%s|is_directed:%s|is_direct:%s" \
                                % (effect, "true", "false")
            edge_dict = {
                'interaction_detection_method': None,
                'first_author': None,
                'publication_ids': pubmed_ids,
                'interaction_types': interaction_types,
                'source_db': DB_TYPE,
                'interaction_identifiers': None,
                'confidence_scores': None,
                'layer': "8"
            }
            parser.insert_edge(interactor_a, interactor_b, edge_dict)
            edges[edge_id]['inserted'] = True

    parser.save_db_to_file(EXPORT_DB_LOCATION)
def main(logger):
    """Parse the TarBase v7 files in DATA_FILE_LIST into a PsimiSQL database.

    Each accepted row yields a miRBase miRNA -> Entrez GeneID edge with its
    detection methods mapped onto PSI-MI controlled-vocabulary terms.

    PERF FIX: ``detmap`` (the ~150-entry detection-method -> MI-term table) is
    a constant; it was previously re-built as a dict literal inside the
    per-line loop for every single interaction. It is now created once.
    (Two exact-duplicate keys with identical values were dropped; Python kept
    only the last occurrence anyway, so lookups are unchanged.)

    :param logger: logger instance (unused here; kept for a uniform parser API)
    """
    # Initiating the parser
    db_api = PsimiSQL(SQL_SEED)
    node_names_to_id = {}

    # Detection-method label (as found in the TarBase dump) -> PSI-MI term(s).
    # Unknown labels fall back to MI:0045 in the loop below.
    detmap = {
        'qRT-PCR': 'MI:1196(quantitative reverse transcription pcr)',
        'Luciferase reporter assay':
            'MI:2285(miRNA interference luciferase reporter assay)',
        'Western blot': 'MI:0113(western blot)',
        'GFP reporter assay': 'MI:0045(experimental interaction detection)',
        'In situ hybridization': 'MI:0045(experimental interaction detection)',
        'Northern blot': 'MI:0929(northern blot)',
        'Reporter assay': 'MI:0045(experimental interaction detection)',
        'Other': 'MI:0045(experimental interaction detection)',
        'Microarray': 'MI:0008(array technology)',
        'Immunohistochemistry': 'MI:1198(immunohistochemistry)',
        'Immunocytochemistry': 'MI:1200(immunocytochemistry)',
        'Immunoblot': 'MI:0045(experimental interaction detection)',
        '5RACE': 'MI:0045(experimental interaction detection)',
        'phenotypic sensor assay':
            'MI:0045(experimental interaction detection)',
        'real-time RT-PCR': 'MI:1196(quantitative reverse transcription pcr)',
        'in situ hybridization':
            'MI:0045(experimental interaction detection)',
        'FACS': 'MI:0045(experimental interaction detection)',
        'ELISA': 'MI:0045(experimental interaction detection)',
        'Flow': 'MI:0045(experimental interaction detection)',
        'ChIP-seq': 'MI:0402(chromatin immunoprecipitation assay)',
        'Immunofluorescence': 'MI:0045(experimental interaction detection)',
        'GFP Reporter Assay': 'MI:0045(experimental interaction detection)',
        'HITS-CLIP': 'MI:2191(clip)',
        'PAR-CLIP': 'MI:2191(clip)',
        'intrarenal expression':
            'MI:0045(experimental interaction detection)',
        'Proteomics': 'MI:0045(experimental interaction detection)',
        'ChIP immunoprecipitation':
            'MI:0402(chromatin immunoprecipitation assay)',
        'Luciferase assay':
            'MI:2285(miRNA interference luciferase reporter assay)',
        'QRTPCR': 'MI:1196(quantitative reverse transcription pcr)',
        'Next Generation Sequencing (NGS)':
            'MI:0078(nucleotide sequence identification)',
        'RNA-binding protein immunoprecipitation':
            'MI:1017(rna immunoprecipitation)',
        'immunohistochemistry': 'MI:1198(immunohistochemistry)',
        'Sequencing': 'MI:0078(nucleotide sequence identification)',
        'CLASH': 'MI:2195(clash)',
        'immunoprecipitaion': 'MI:1017(rna immunoprecipitation)',
        'Quantitative proteomic approach':
            'MI:0045(experimental interaction detection)',
        'ChIP': 'MI:0402(chromatin immunoprecipitation assay)',
        'TRAP': 'MI:0045(experimental interaction detection)',
        'Immunoprecipitaion': 'MI:1017(rna immunoprecipitation)',
        'LacZ reporter assay': 'MI:0045(experimental interaction detection)',
        'flow': 'MI:0045(experimental interaction detection)',
        'EMSA': 'MI:0045(experimental interaction detection)',
        'Communoprecipitaion': 'MI:1017(rna immunoprecipitation)',
        'pSILAC': 'MI:0045(experimental interaction detection)',
        'RTPCR': 'MI:1196(quantitative reverse transcription pcr)',
        'proteomics analysis': 'MI:0045(experimental interaction detection)',
        'immunoblot': 'MI:0045(experimental interaction detection)',
        'ASO assay': 'MI:0045(experimental interaction detection)',
        'semi-qRT-PCR': 'MI:1196(quantitative reverse transcription pcr)',
        'mice xenograft': 'MI:0045(experimental interaction detection)',
        'Chip': 'MI:0402(chromatin immunoprecipitation assay)',
        'Flow cytometry': 'MI:0045(experimental interaction detection)',
        'Immuohistochemistry': 'MI:0045(experimental interaction detection)',
        'Chromatin immunoprecipitation':
            'MI:0402(chromatin immunoprecipitation assay)',
        'microarray': 'MI:0008(array technology)',
        'Western blotting': 'MI:0113(western blot)',
        'TaqMan miRNA assay/RT-PCR':
            'MI:0045(experimental interaction detection)|MI:1196(quantitative reverse transcription pcr)',
        'TaqMan miRNA assay': 'MI:0045(experimental interaction detection)',
        'QRTPCRWestern blot':
            'MI:1196(quantitative reverse transcription pcr)|MI:0113(western blot)',
        'Gluc assay': 'MI:0045(experimental interaction detection)',
        'Real time PCR': 'MI:0045(experimental interaction detection)',
        "3'LIFE": 'MI:0045(experimental interaction detection)',
        'Annexin V-FITC': 'MI:0045(experimental interaction detection)',
        "5\\'RACE": 'MI:0045(experimental interaction detection)',
        'Real time RT-PCR': 'MI:1196(quantitative reverse transcription pcr)',
        'Luciferase assay/RT-PCR':
            'MI:2285(miRNA interference luciferase reporter assay)|MI:1196(quantitative reverse transcription pcr)',
        'Westren blot': 'MI:0113(western blot)',
        '2DGE': 'MI:0045(experimental interaction detection)',
        'Mass spectrometry': 'MI:0943(detection by mass spectrometry)',
        'EGFP reporter assay': 'MI:0045(experimental interaction detection)',
        ' Western blot': 'MI:0113(western blot)',
        'AGO2 binding RNA immunoprecipitation qRT-PCR':
            'MI:1196(quantitative reverse transcription pcr)',
        'B-globin reporter assay':
            'MI:0045(experimental interaction detection)',
        'RISC-IP': 'MI:1017(rna immunoprecipitation)',
        'Western Blotting': 'MI:0113(western blot)',
        'Immunoprecipitation': 'MI:1017(rna immunoprecipitation)',
        'GFP reporter': 'MI:0045(experimental interaction detection)',
        'pMIR-REPORT': 'MI:0045(experimental interaction detection)',
        'LacZ assay': 'MI:0045(experimental interaction detection)',
        "5'RACE": 'MI:0045(experimental interaction detection)',
        'Western blog': 'MI:0113(western blot)',
        'Western blo': 'MI:0113(western blot)',
        'western blot': 'MI:0113(western blot)',
        'Reverse-phase protein array': 'MI:0008(array technology)',
        'Western Blot': 'MI:0113(western blot)',
        'MTT assay': 'MI:0045(experimental interaction detection)',
        'Immunofluorescence staining':
            'MI:0045(experimental interaction detection)',
        'Immunoblotting': 'MI:0045(experimental interaction detection)',
        'SILAC (Stable Isotope Labeling of Amino acids in Culture)':
            'MI:0045(experimental interaction detection)',
        'Western blot, luciferase assay':
            'MI:0113(western blot)|MI:2285(miRNA interference luciferase reporter assay)',
        'DNA methylation analysis':
            'MI:1189(methylation interference assay)',
        'Wetsern blot': 'MI:0113(western blot)',
        'Immunohistochemistry analysis': 'MI:1198(immunohistochemistry)',
        'ChIP-PCR': 'MI:0402(chromatin immunoprecipitation assay)',
        'luciferase reporter assays':
            'MI:2285(miRNA interference luciferase reporter assay)',
        'PCR array': 'MI:0008(array technology)',
        'Western': 'MI:0113(western blot)',
        'immunostaining': 'MI:0422(immunostaining)',
        'Caspase-Glo® 3/7 assay':
            'MI:0045(experimental interaction detection)',
        'Cell proliferation assay':
            'MI:0045(experimental interaction detection)',
        'safranin o staining/GAGs contents assay':
            'MI:0045(experimental interaction detection)',
        'wound healing assays': 'MI:0045(experimental interaction detection)',
        'transwell insert': 'MI:0045(experimental interaction detection)',
        'anoikis assay': 'MI:0045(experimental interaction detection)',
        'Gluc reporter assay': 'MI:0045(experimental interaction detection)',
        'GUS reporter assay': 'MI:0045(experimental interaction detection)',
        'Zymography': 'MI:0512(zymography)',
        'Motility assay': 'MI:0045(experimental interaction detection)',
        'CAM assay': 'MI:0045(experimental interaction detection)',
        'Colony formation assay':
            'MI:0045(experimental interaction detection)',
        'Alizarin red S staining':
            'MI:0045(experimental interaction detection)',
        'mRNA decay': 'MI:0045(experimental interaction detection)',
        'Cell proliferation': 'MI:0045(experimental interaction detection)',
        'apoptosis': 'MI:0045(experimental interaction detection)',
        'cell cycle assays': 'MI:0045(experimental interaction detection)',
        'colony formation': 'MI:0045(experimental interaction detection)',
        'Immunoflourescence': 'MI:0045(experimental interaction detection)',
        'Micorarray': 'MI:0008(array technology)',
        'Westren Blot': 'MI:0113(western blot)',
        'Luciferase reporter assay/Western blot':
            'MI:2285(miRNA interference luciferase reporter assay)|MI:0113(western blot)',
        'Immunohistochemical (IHC) staining': 'MI:1198(immunohistochemistry)',
        'Luciferase reporter assay/qRT-PCR':
            'MI:2285(miRNA interference luciferase reporter assay)|MI:1196(quantitative reverse transcription pcr)',
        '5"RACE': 'MI:0045(experimental interaction detection)',
        'Immunofluorescence analysis':
            'MI:0045(experimental interaction detection)',
        'luciferase reporter assay':
            'MI:2285(miRNA interference luciferase reporter assay)',
        'Wstern blot': 'MI:0113(western blot)',
        'Coimmunoprecipitation': 'MI:1017(rna immunoprecipitation)',
        'Immunofluorescence microscopy':
            'MI:0045(experimental interaction detection)',
        '/Western blot': 'MI:0113(western blot)',
        'Luciferase reporter assay/QRTPCR':
            'MI:2285(miRNA interference luciferase reporter assay)|MI:1196(quantitative reverse transcription pcr)',
        'MTT': 'MI:0045(experimental interaction detection)',
        'immunofluorescence assays':
            'MI:0045(experimental interaction detection)',
        'qRT_PCR': 'MI:1196(quantitative reverse transcription pcr)',
        '2-D Gel Electrophoresis (2DGE)':
            'MI:0982(electrophoretic mobility-based method)',
        'RISC analysis': 'MI:0045(experimental interaction detection)',
        'silico analysis': 'MI:0045(experimental interaction detection)',
        'Microarray/In situ hybridization': 'MI:0008(array technology)',
        'Western blot ': 'MI:0113(western blot)',
        'Genotyping': 'MI:0045(experimental interaction detection)',
        'Weastern blot': 'MI:0113(western blot)',
        'YFP expression': 'MI:0045(experimental interaction detection)',
        'To test if miR-141 directly targets the PR transcript, we analyzed four predicted miR-141-binding sites (Figure 4c)':
            'MI:0045(experimental interaction detection)',
        ' three within the 3′ untranslated region (UTR) as identified through Targetscan (http:':
            'MI:0045(experimental interaction detection)',
        'www.targetscan.org/) and one in the la':
            'MI:0045(experimental interaction detection)',
        'qRT-PCR/Luciferase reporter assay':
            'MI:1196(quantitative reverse transcription pcr)',
        'Luciferase reporter assay and western blot':
            'MI:2285(miRNA interference luciferase reporter assay)|MI:0113(western blot)',
        'TOPflash/FOPflash reporter assay':
            'MI:0045(experimental interaction detection)',
        'dual-luciferase reporter assay':
            'MI:0045(experimental interaction detection)',
        'RNA immunoprecipitation assay (RIP)':
            'MI:1017(rna immunoprecipitation)',
        'Chromogenic in situ hybridization':
            'MI:0045(experimental interaction detection)',
        'Luciferase reporter assa':
            'MI:2285(miRNA interference luciferase reporter assay)',
        'Immunoprecipitaionă„ĄLuciferase reporter assay':
            '|MI:2285(miRNA interference luciferase reporter assay)',
        'ImmunoprecipitaionㄥLuciferase reporter assay':
            '|MI:2285(miRNA interference luciferase reporter assay)',
        'Luciferase reporter assay/Microarray':
            'MI:2285(miRNA interference luciferase reporter assay)|MI:0008(array technology)',
        'q-PCR': 'MI:1196(quantitative reverse transcription pcr)',
        'AGO2 Immunoprecipitation': 'MI:1017(rna immunoprecipitation)',
        'Cell proliferation assays':
            'MI:0045(experimental interaction detection)',
        'LC-MS/MS': 'MI:0943(detection by mass spectrometry)',
        'Chromatin Immunoprecipitation':
            'MI:0402(chromatin immunoprecipitation assay)',
        'Co-immunoprecipitation': 'MI:1017(rna immunoprecipitation)',
        'IlluminaExpressionArrays': 'MI:0008(array technology)',
        'Protein Immunoblot Analyses':
            'MI:0045(experimental interaction detection)',
        'miR PCR array system': 'MI:0008(array technology)',
        'mtt': 'MI:0045(experimental interaction detection)',
        'RNA immunopercipitation': 'MI:1017(rna immunoprecipitation)',
        'TOP/FOP luciferase assay':
            'MI:2285(miRNA interference luciferase reporter assay)',
        'miRNA-masking antisense ODN (miR-Mask) assay':
            'MI:0045(experimental interaction detection)',
        'enzyme-linked immunosorbent assay':
            'MI:0045(experimental interaction detection)',
        'Ago2-IP/IgG-IP': 'MI:1017(rna immunoprecipitation)',
        'EGFR reporter assay': 'MI:0045(experimental interaction detection)',
        'immunoblot analysis': 'MI:0045(experimental interaction detection)',
        'Immunohistochemical analysis': 'MI:1198(immunohistochemistry)',
        'CC tissues and cells (C33A, HeLa, CaSki, SiHa, and ME-180)':
            'MI:0045(experimental interaction detection)',
        'Immuno-precipitation': 'MI:1017(rna immunoprecipitation)',
        'Luciferase reporter assayMTT':
            'MI:2285(miRNA interference luciferase reporter assay)',
        'Immunostaining': 'MI:0422(immunostaining)',
        'immunosorbent': 'MI:0411(enzyme linked immunosorbent assay)',
        'Immunofluorescent Assay':
            'MI:0045(experimental interaction detection)',
        'YFP reporter assay': 'MI:0045(experimental interaction detection)',
        'CLIP-seq': 'MI:2191(clip)',
        'RNAi': 'MI:0045(experimental interaction detection)',
        '': 'MI:0045(experimental interaction detection)',
    }

    for file in DATA_FILE_LIST:
        print("processing input file: " + file)
        with open(file) as data:
            lines = 0
            for line in data:
                columns = line.split('\t')
                # both species columns must be recognized
                if columns[2] in SPECIES_DICT and columns[5] in SPECIES_DICT:
                    lines += 1
                    if lines % 50000 == 0:
                        print("processed lines (TarBase): %d" % lines)

                    # there are a few malformed miRbase IDs, like:
                    # hsa-let-7c* --> hsa-let-7c; also during the mapping we
                    # dont care about the 3p/5p postfix
                    rna_id = columns[1]
                    rna_id = rna_id.replace("*", "")
                    if rna_id.endswith("-3p") or rna_id.endswith("-5p"):
                        rna_id = rna_id[:-3]

                    # Entrez Gene ID in the input file: 3091.0
                    gene_id = columns[4].split('.')[0].strip()

                    source_dict = insert_or_get_node_dict(
                        rna_id, 'miRBase',
                        'taxid:' + SPECIES_DICT[columns[2]]['tax_id'],
                        node_names_to_id, db_api)
                    target_dict = insert_or_get_node_dict(
                        gene_id, 'GeneID',
                        'taxid:' + SPECIES_DICT[columns[5]]['tax_id'],
                        node_names_to_id, db_api)

                    interaction_types = "is_directed:true|is_direct:true|MI:0571(mrna cleavage)"

                    # pubmed id example: 16921378.0
                    pubmed_id = columns[8].split('.')[0].strip()
                    pubmed_ids = ['25416803']  # TarBase v7 publication
                    if len(pubmed_id) > 0:
                        pubmed_ids.append(pubmed_id)
                    pubmed_ids = set(map(lambda x: 'pubmed:' + x, pubmed_ids))

                    # detection methods: '//'-separated groups of ';'-separated
                    # labels; unknown labels fall back to the generic MI:0045
                    detlist = []
                    for method in columns[6].split('//'):
                        for real_method in method.split(';'):
                            if real_method not in detmap:
                                print(
                                    "WARNING: detection method not recognised: "
                                    + real_method)
                                detlist.append(
                                    'MI:0045(experimental interaction detection)'
                                )
                            else:
                                detlist.append(detmap[real_method])

                    edge_dict = {
                        'publication_ids': '|'.join(pubmed_ids),
                        'layer': '5',
                        'source_db': 'TarBase',
                        'interaction_identifiers': None,
                        'confidence_scores': None,
                        'interaction_detection_method': '|'.join(detlist),
                        'interaction_types': interaction_types,
                        'first_author': None
                    }
                    db_api.insert_edge(source_dict, target_dict, edge_dict)
        print("processed lines (TarBase): %d" % lines)

    # Saving the to a DB_TYPE.db files
    db_api.save_db_to_file(DB_DESTINATION)
def __init__(self, db, layer, PROT_DBname, LNCRNAMAP_DBname):
    """
    :param db: name of the parsed source database
    :param layer: network layer of the source db (e.g. 'lncRNA', 'miRNA');
        lncRNA/miRNA layers additionally open the lncRNA mapping db
    :param PROT_DBname: location of the protein mapping db; if not None the
        db is copied into an in-memory sqlite database (faster lookups)
    :param LNCRNAMAP_DBname: location of the lncRNA mapping db; if not None
        it too is copied into an in-memory sqlite database
    :argument DICTIONARY_DB_LOCATION: location of the mapping db, output of create_mapping_db
    :argument SQL_SEED_LOCATION
    :argument SOURCE_DB_LOCATION: location of the parsed source database
    :argument DESTINATION_DB_LOCATION: location where the mapped db will be saved
    """
    # Declaring, and assigning constants
    self.DICTIONARY_DB_LOCATION = PROT_DBname
    self.SQL_SEED_LOCATION = '../../SLKlib/SQLiteDBApi/network-db-seed.sql'
    self.SOURCE_DB_TYPE = db
    self.layer = layer
    # The db we want to map
    self.SOURCE_DB_LOCATION = 'all_output/' + db + '.db'
    # Saving location
    self.DESTINATION_DB_LOCATION = '../../SLKlib/mapper/protein/output/' + db + '_mapped.db'

    # Protein map db (on-disk connection; used directly via its cursor)
    self.DICTIONARY_DB = sqlite3.connect(self.DICTIONARY_DB_LOCATION)
    self.DICTIONARY_DB_CURSOR = self.DICTIONARY_DB.cursor()

    # lncRNA map db (only for the lncRNA / miRNA layers)
    if self.layer == 'lncRNA' or self.layer == 'miRNA':
        self.LNCRNAMAP_DB_LOCATION = LNCRNAMAP_DBname
        self.LNCRNAMAP_DB = sqlite3.connect(self.LNCRNAMAP_DB_LOCATION)

    self.PROT_DBname = PROT_DBname
    if self.PROT_DBname is not None:
        # Read database to tempfile (SQL dump of the on-disk db)
        self.con = sqlite3.connect(self.PROT_DBname)
        tempfile = io.StringIO()
        for line in self.con.iterdump():
            tempfile.write('%s\n' % line)
        self.con.close()
        tempfile.seek(0)

        # Create a database in memory and import from tempfile, then index
        # the columns used by the mapping queries
        self.PROT_DB = sqlite3.connect(":memory:")
        with self.PROT_DB:
            self.PROT_DB.cursor().executescript(tempfile.read())
            self.PROT_DB.cursor().execute(
                "CREATE INDEX map_uniprot ON MAPP(uniprot_ac);")
            self.PROT_DB.cursor().execute(
                "CREATE INDEX uniprotac_id ON UNIPROT_AC(id);")
            self.PROT_DB.cursor().execute(
                "CREATE INDEX taxid ON SPECIES(tax_id);")
            self.PROT_DB.cursor().execute(
                "CREATE INDEX map_foreign ON MAPP(foreign_id);")
    else:
        # fall back to the on-disk dictionary db
        self.PROT_DB = sqlite3.connect(self.DICTIONARY_DB_LOCATION)
        # NOTE(review): fails if the index already exists (no IF NOT EXISTS);
        # presumably only run on freshly built dbs -- verify
        self.PROT_DB.cursor().execute(
            "CREATE INDEX index_name ON mapp (foreign_id);")

    # For lncRNA and miRNA: same in-memory copy treatment for the lncRNA map
    if self.layer == 'lncRNA' or self.layer == 'miRNA':
        self.LNCRNAMAP_DBname = LNCRNAMAP_DBname
        if self.LNCRNAMAP_DBname is not None:
            # Read database to tempfile
            self.con = sqlite3.connect(self.LNCRNAMAP_DBname)
            tempfile = io.StringIO()
            for line in self.con.iterdump():
                tempfile.write('%s\n' % line)
            self.con.close()
            tempfile.seek(0)

            # Create a database in memory and import from tempfile
            self.LNCRNAMAP_DB = sqlite3.connect(":memory:")
            with self.LNCRNAMAP_DB:
                self.LNCRNAMAP_DB.cursor().executescript(tempfile.read())
                self.LNCRNAMAP_DB.cursor().execute(
                    "CREATE INDEX index_name ON mapper (orig_ac);")
        else:
            self.LNCRNAMAP_DB = sqlite3.connect(self.LNCRNAMAP_DB_LOCATION)

    # destination db (filled by the mapping methods)
    self.new_db = PsimiSQL(self.SQL_SEED_LOCATION)

    # iterating through the old_db's nodes
    self.source_db = sqlite3.connect(self.SOURCE_DB_LOCATION)
    self.source_db.row_factory = sqlite3.Row
    self.cur = self.source_db.cursor()
def main(logger):
    """Parse the manually curated T-cell receptor interaction file into a
    PsimiSQL database.

    BUGFIX: the data file was opened and never closed; it is now managed by a
    ``with`` block. Also removed the unused local that captured the header
    line under the misleading name ``lines``.

    :param logger: logger instance (unused here; kept for a uniform parser API)
    """
    # Initiating a PsimiSQL object
    parser = PsimiSQL(SQL_SEED)

    with open(TCR_DATA_LOC, encoding="ISO-8859-1") as tcr_data_file:
        # Skipping the header line
        tcr_data_file.readline()

        for line in tcr_data_file:
            cells = line.split('\t')

            # Storing the needed properties in variables
            name_a = cells[1].strip()
            name_b = cells[3].strip()
            alt_accession_a = cells[0]
            alt_accession_b = cells[2]

            # lines without a source protein name are skipped
            if name_a == '':
                continue

            # Building the node dictionaries, and inserting them to the db
            # with the parser
            node_a_dict = {
                'name': "Uniprot:" + name_a,
                'alt_accession': "entrez gene/locuslink:" + alt_accession_a,
                'tax_id': "taxid:9606",
                'pathways': "T-cell receptor",
                'aliases': None
            }
            parser.insert_node(node_a_dict)

            # lines without a target protein name are skipped (source node is
            # still inserted, matching the original behavior)
            if name_b == '':
                continue
            node_b_dict = {
                'name': "Uniprot:" + name_b,
                'alt_accession': "entrez gene/locuslink:" + alt_accession_b,
                'tax_id': "taxid:9606",
                'pathways': "T-cell receptor",
                'aliases': None
            }
            parser.insert_node(node_b_dict)

            # Gathering the edge's properies, and inserting the edge to the db
            interaction_direction = IS_DIRECT_MAP[cells[5].lower()]
            interaction_effect = EFFECT_MAP[cells[6].lower().strip()]
            pubmed_ids = cells[8]
            interaction_types = "%s|is_directed:%s|is_direct:%s" % (
                interaction_effect, "true", interaction_direction)
            edge_dict = {
                'interaction_detection_method': None,
                'first_author': None,
                'publication_ids': get_mitab_publication_list_string(pubmed_ids),
                'interaction_types': interaction_types,
                'source_db': "TCRcuration",
                'interaction_identifiers': None,
                'confidence_scores': None,
                'layer': '8'
            }
            parser.insert_edge(node_a_dict, node_b_dict, edge_dict)

    # Saving the db to a files
    parser.save_db_to_file(DESTINATION)
def main(logger):
    """Parse the PTMCode2 association file into a PsimiSQL database.

    BUGFIX: the species comparison used the corrupted literal
    ``'H**o sapiens'`` (masked text), which can never match the
    ``'Homo sapiens'`` value in the data, so every human row was silently
    dropped. The literal is restored to ``'Homo sapiens'``.

    :param logger: logger used to report malformed (non-14-column) lines
    """
    # Initiating the parser
    db_api = PsimiSQL(SQL_SEED)
    node_names_to_id = {}

    with open(DATA_FILE, encoding='ISO-8859-1') as data:
        # Skipping the 4 header lines
        data.readline()
        data.readline()
        data.readline()
        data.readline()

        skipped_lines = 0
        lines = 0
        for line in data:
            lines += 1
            if lines % 50000 == 0:
                print("processed lines (PTMCode2): %d" % lines)

            columns = line.split('\t')
            if len(columns) != 14:
                logger.debug("number of colums not 14: %s" % line)
                continue

            # keep only the four supported species
            if columns[2] == 'Homo sapiens' or columns[2] == 'Drosophila melanogaster' or columns[2] == 'Danio rerio' \
                    or columns[2] == 'Caenorhabditis elegans':
                taxid = ORGANISM_NAME_TO_MITAB_ID_MAP[columns[2]]

                # Getting rid of beta'Cop because it can not be mapped due to
                # syntax error
                if columns[0] != "beta'Cop" and columns[1] != "beta'Cop":
                    # Creating the node dicts, if the node is already in the db
                    # assigning that to the node dict
                    source_dict = insert_or_get_node_dict(
                        columns[0].strip().replace(" ", ""), "GeneCards",
                        taxid, node_names_to_id, db_api)
                    target_dict = insert_or_get_node_dict(
                        columns[1].strip().replace(" ", ""), "GeneCards",
                        taxid, node_names_to_id, db_api)

                    # unmappable gene names are counted and skipped
                    if not source_dict or not target_dict:
                        skipped_lines += 1
                        continue

                    interaction_types = "%s|is_directed:%s|is_direct:%s" \
                                        % ('MI:0190(interaction type)', "true", 'false')
                    edge_dict = {
                        'publication_ids': 'pubmed:25361965',
                        'layer': '2',
                        'source_db': DB_TYPE,
                        'interaction_identifiers': None,
                        'confidence_scores': None,  # if available
                        'interaction_detection_method': None,  # probably exp type
                        'interaction_types': interaction_types,
                        'first_author': None
                    }
                    db_api.insert_edge(source_dict, target_dict, edge_dict)

    print("processed lines (PTMCode2): %d" % lines)
    print("skipped lines (malformed IDs): %d" % skipped_lines)

    # Saving the to a DB_TYPE.db files
    db_api.save_db_to_file(EXPORT_DB_LOCATION)
def main(logger):
    """Parse the manually curated autophagy TSV files into a PsimiSQL database.

    BUGFIXES:
    - ``taxid_source`` / ``taxid_target`` were only assigned for 'human' rows
      (the ``else`` branches were ``pass``), so a non-human row raised
      NameError on the first line or silently reused the previous line's
      taxid; such rows are now skipped explicitly.
    - ``stimulation`` had the same stale/unbound-variable problem for unknown
      effect codes; those rows are now skipped with a warning instead of
      inheriting the previous row's effect.
    - removed the dead locals ``directed``/``direct`` (computed from line[5]
      and line[7] but never used: the output string hardcodes
      is_direct/is_directed true, which is preserved).
    - the constant ``molec_map`` dict is built once instead of per line.

    :param logger: logger instance (unused here; kept for a uniform parser API)
    """
    # Initiating the parser
    db_api = PsimiSQL(SQL_SEED)

    # Molecular background label (curation vocabulary, partly Hungarian)
    # -> PSI-MI term. Loop-invariant, so built once.
    molec_map = {
        'P': 'MI:0217(phosphorylation reaction)',
        'Acetylation': 'MI:0192(acetylation reaction)',
        'degradation (ubiquitinilation)': 'MI:0220(ubiquitination reaction)',
        'autoP': 'MI:0217(phosphorylation reaction)',
        'csak beköt': 'MI:0462(bind)',
        'proteolízis': 'MI:0414(enzymatic reaction)',
        'proteolízis ("delipidálás")': 'MI:0414(enzymatic reaction)',
        '"proteolízis (""delipidálás"")"': 'MI:0414(enzymatic reaction)',
        'E2 - kovalens tioészter kötés': 'MI:0195(covalent binding)',
        'kovalens': 'MI:0195(covalent binding)',
        'kovalens tioészter kötés': 'MI:0195(covalent binding)',
        'E1 - kovalens tioészter kötés': 'MI:0195(covalent binding)',
        'E1-E2 komplex': 'MI:0195(covalent binding)',
        '': ''
    }

    # Parsing data file
    for file in DATA_FILE_LIST:
        print(file)
        with open(file, encoding="ISO-8859-1") as data:
            # Skipping the 6 header lines
            for _ in range(6):
                data.readline()

            for line in data:
                line = line.strip().split('\t')
                # Probably because of conversion from xlsx to tsv
                if len(line) < 4:
                    continue

                # Mapping species to taxid; only human rows are supported,
                # anything else is skipped (previously: stale/unbound variable)
                if line[0] == 'human':
                    taxid_source = 'taxid:9606'
                else:
                    continue
                if line[2] == 'human':
                    taxid_target = 'taxid:9606'
                else:
                    continue

                # Creating the node dicts, if the node is already in the db
                # assigning that to the node dict
                source_dict = get_node_a('Uniprot:' + line[1], taxid_source,
                                         db_api)
                target_dict = get_node_b('Uniprot:' + line[3], taxid_target,
                                         db_api)

                # Nodes are inserted to the db if they are not in it yet
                if 'id' not in source_dict:
                    db_api.insert_node(source_dict)
                if 'id' not in target_dict:
                    db_api.insert_node(target_dict)

                # Stimulation/inhibition; unknown codes are skipped instead of
                # silently reusing the previous row's value
                if line[8] == 'S' or line[8] == 's':
                    stimulation = 'MI:0840(stimulator)'
                elif line[8] == 'I':
                    stimulation = 'MI:0586(inhibitor)'
                else:
                    print("WARNING: skipping line with unknown effect: " +
                          line[8])
                    continue

                # Constructing interaction data line; is_direct/is_directed are
                # hardcoded true, matching the original output format
                int_types = '|'.join([
                    stimulation, molec_map[line[9]], 'is_direct:' + 'true',
                    'is_directed:' + 'true'
                ])

                edge_dict = {
                    'publication_ids': 'pubmed:' + line[4],
                    'layer': '1',
                    'source_db': 'manual curation',
                    'interaction_identifiers': None,
                    'confidence_scores': None,
                    'interaction_detection_method': None,
                    'interaction_types': int_types,
                    'first_author': None
                }
                db_api.insert_edge(source_dict, target_dict, edge_dict)

    # Saving the to a DB_TYPE.db file
    db_api.save_db_to_file(DB_DESTINATION)
def main(logger):
    """Parse an IntAct PSI-MITAB file (DATA_FILE) into a PsimiSQL database
    and save it to EXPORT_DB_LOCATION.

    Keeps only links where both interactors belong to one of the four
    supported species and both have a well-formed UniProt accession.
    """
    # Initiating the parser
    db_api = PsimiSQL(SQL_SEED)
    # Parsing files
    with open(DATA_FILE) as data:
        # Skipping the header line
        data.readline()
        missing_alt_id = 0        # links dropped for malformed UniProt IDs
        edges_in_known_taxes = 0  # links where both taxids are supported
        node_names_to_id = {}     # cache shared by insert_or_get_node_dict
        lines = 0
        for line in data:
            columns = line.split('\t')
            if len(columns) < 2:
                continue
            lines += 1
            if lines % 50000 == 0:
                print("processed lines (IntAct): %d" % lines)
            # tax id like: taxid:9606(human), taxid:83333(ecoli), taxid:-1(in vitro)
            # the first 10 characters are enough to compare against the
            # supported 'taxid:NNNN' values below
            tax_id_a = columns[9][:10]
            tax_id_b = columns[10][:10]
            # keep only human / fly / worm / zebrafish interactions
            if tax_id_a not in ('taxid:9606', 'taxid:7227', 'taxid:6239', 'taxid:7955') or \
                    tax_id_b not in ('taxid:9606', 'taxid:7227', 'taxid:6239', 'taxid:7955'):
                continue
            edges_in_known_taxes += 1
            if is_well_formed_uniprot_id(
                    columns[2]) and is_well_formed_uniprot_id(columns[3]):
                # strip the 'uniprotkb:' prefix (10 characters)
                uniprot_id_a = columns[2][10:].strip()
                uniprot_id_b = columns[3][10:].strip()
                # Creating the node dicts, if the node is already in the db assigning that to the node dict
                source_dict = insert_or_get_node_dict(uniprot_id_a, 'Uniprot',
                                                      tax_id_a,
                                                      node_names_to_id, db_api)
                target_dict = insert_or_get_node_dict(uniprot_id_b, 'Uniprot',
                                                      tax_id_b,
                                                      node_names_to_id, db_api)
                # interaction detection methods: psi-mi:"MI:0096"(pull down)|psi-mi:"MI:0018"(two hybrid)
                # -> deduplicated set of 'MI:NNNN(label)' terms
                detection_methods = columns[6].split("|")
                detection_methods = filter(
                    lambda x: x.strip().lower().startswith('psi-mi:'),
                    detection_methods)
                detection_methods = map(
                    lambda x: x.strip()[7:].replace("\"", "").strip(),
                    detection_methods)
                detection_methods = set(detection_methods)
                # pubmed ids: pubmed:10887206|mint:MINT-5212759
                # -> keep numeric pubmed ids only, always add the IntAct paper
                pubmed_ids = columns[8].split("|")
                pubmed_ids = filter(
                    lambda x: x.strip().lower().startswith('pubmed:'),
                    pubmed_ids)
                pubmed_ids = map(lambda x: x.strip()[7:], pubmed_ids)
                pubmed_ids = filter(lambda x: re.search("^\\d+$", x),
                                    pubmed_ids)
                pubmed_ids = set(pubmed_ids)
                pubmed_ids.add("24234451")  # intact publication
                pubmed_ids = map(lambda x: "pubmed:" + x, pubmed_ids)
                # interaction type: psi-mi:"MI:0407"(direct interaction)|psi-mi:"MI:0915"(physical association)
                interaction_type_terms = columns[11].split("|")
                interaction_type_terms = filter(
                    lambda x: x.strip().lower().startswith('psi-mi:'),
                    interaction_type_terms)
                interaction_type_terms = map(
                    lambda x: x.strip()[7:].replace("\"", "").strip(),
                    interaction_type_terms)
                interaction_type_terms = set(interaction_type_terms)
                # we remove 'MI:0407(direct interaction)' term, as it is redundant with the is_direct attribute
                interaction_type_terms.discard("MI:0407(direct interaction)")
                interaction_type = "is_directed:false|is_direct:true"
                if len(interaction_type_terms) > 0:
                    interaction_type += "|" + "|".join(interaction_type_terms)
                # interaction score examples in the IntAct input file:
                # - intact-miscore:0.558037
                # - author score:low
                # - author score:Retest score=6; Class=Core; confidence score set1/set2 =2
                # - author score:"Socio-affinity score: 6.11118"
                # - author-confidence:Z-score = 17.60
                # - replication-based confidence:4
                # we don't keep the author-type scores, as those are a mess and also contains several non-numeric scores
                confidence_scores = columns[14].split("|")
                confidence_scores = map(lambda x: x.strip(),
                                        confidence_scores)
                confidence_scores = filter(
                    lambda x: not x.startswith("author score:") and not x.
                    startswith("author-confidence:"), confidence_scores)
                # normalize the score label to 'intact miscore' / 'intact ...'
                confidence_scores = map(
                    lambda x: x.replace("intact-miscore", "intact miscore"),
                    confidence_scores)
                confidence_scores = map(
                    lambda x: x
                    if x.lower().startswith("intact") else "intact " + x,
                    confidence_scores)
                confidence_scores = set(confidence_scores)
                edge_dict = {
                    'publication_ids': "|".join(pubmed_ids),
                    'layer': '3',
                    'source_db': "IntAct",
                    'interaction_identifiers': None,
                    'confidence_scores': "|".join(confidence_scores),
                    'interaction_detection_method':
                    "|".join(detection_methods),
                    'interaction_types': interaction_type,
                    'first_author': None
                }
                db_api.insert_edge(source_dict, target_dict, edge_dict)
            else:
                missing_alt_id += 1
    print("processed lines (IntAct): %d" % lines)
    print("number of links in the known species: %d" % edges_in_known_taxes)
    print("links with missing uniprot ID: %d" % missing_alt_id)
    # Saving to a DB_TYPE.db file
    db_api.save_db_to_file(EXPORT_DB_LOCATION)
def main(logger):
    """Parse the Reactome PSI-MITAB file (DATA_FILE) into a PsimiSQL
    database, keeping only human links whose both endpoints map to a
    SignaLink pathway, and save it to EXPORT_DB_LOCATION.
    """
    # Initiating the parser
    db_api = PsimiSQL(SQL_SEED)
    path_dict = {}
    # Uncomment if using run_all_auto
    # data = DATA_FILE.split('\n')
    # Getting only human data
    # path_map = UNI_TO_PATHWAY.split('\n')
    with open(DATA_FILE) as data, open(UNI_TO_PATHWAY) as path_map:
        # UniProt -> Reactome pathway id map, human entries only.
        # BUGFIX: the guard was 'len(line) > 4', which still allowed an
        # IndexError on line[5]; and the species literal was corrupted
        # ('H**o sapiens'), so no entry ever matched.
        for line in path_map:
            line = line.strip().split('\t')
            if len(line) > 5 and line[5] == 'Homo sapiens':
                path_dict[line[0]] = line[1]

        # Skipping the data file header
        data.readline()

        # Reactome pathway id -> SignaLink pathway name map.
        # BUGFIX: the file handle was never closed; use a context manager.
        reactome_to_signalink_pathway_map = {}
        with open(PATHWAY_FILE_LOCATION) as pathway_file:
            next(pathway_file)  # header
            for line in pathway_file:
                reactome_pathway_id, signalink_pathway = \
                    line.strip().split('\t')
                reactome_to_signalink_pathway_map[
                    reactome_pathway_id] = signalink_pathway

        node_names_to_id = {}
        for line in data:
            columns = line.strip().split('\t')
            if len(columns) > 1:
                # columns[0]/[1] look like 'uniprotkb:P12345'
                id_a = columns[0].strip().split(":")[1]
                id_type_a = columns[0].strip().split(":")[0]
                id_b = columns[1].strip().split(":")[1]
                id_type_b = columns[1].strip().split(":")[0]

                # Keep only links where both nodes have a known pathway that
                # maps to a SignaLink pathway
                if id_a not in path_dict or id_b not in path_dict:
                    continue
                if path_dict[id_a] not in reactome_to_signalink_pathway_map \
                        or path_dict[id_b] not in reactome_to_signalink_pathway_map:
                    continue

                # taxid column looks like 'taxid:9606(human)'
                interactor_a_tax_id = columns[9].split("(")[0]
                interactor_b_tax_id = columns[10].split("(")[0]
                if (interactor_a_tax_id != "taxid:9606") \
                        or (interactor_b_tax_id != "taxid:9606"):
                    continue

                # Creating the node dicts, if the node is already in the db
                # assigning that to the node dict
                source_dict = insert_or_get_node_dict(
                    id_a, id_type_a, columns[2], columns[4],
                    interactor_a_tax_id,
                    reactome_to_signalink_pathway_map[path_dict[id_a]],
                    node_names_to_id, db_api)
                target_dict = insert_or_get_node_dict(
                    id_b, id_type_b, columns[3], columns[5],
                    interactor_b_tax_id,
                    reactome_to_signalink_pathway_map[path_dict[id_b]],
                    node_names_to_id, db_api)

                # Setting up the interaction type
                effect = columns[11].replace('psi-mi:', '').replace('"', '')
                interaction_types = "%s|is_directed:%s|is_direct:%s" \
                                    % (effect, 'true', 'false')

                # Publications: always include the Reactome paper
                if columns[8] != '-':
                    pubmed = columns[8].split("|")
                    pubmed.append("pubmed:29145629")
                    pubmed_ids = "|".join(pubmed)
                else:
                    pubmed_ids = "pubmed:29145629"

                edge_dict = {
                    'publication_ids': pubmed_ids,
                    'layer': '8',
                    'source_db': 'Reactome',
                    'interaction_identifiers': None,
                    'confidence_scores': columns[14].split("(")[0],
                    'interaction_detection_method':
                    columns[6].replace('psi-mi:', '').replace('"', ''),
                    'interaction_types': interaction_types,
                    'first_author': columns[7]
                }
                db_api.insert_edge(source_dict, target_dict, edge_dict)

    # Saving to a DB_TYPE.db file
    db_api.save_db_to_file(EXPORT_DB_LOCATION)
def main(logger):
    """Parse the OmniPath TSV export (DATA_FILE) into a PsimiSQL database
    of human links and save it to EXPORT_DB_LOCATION.

    Columns used (0-based): 0/1 UniProt accessions, 2 is_directed flag,
    3 is_stimulation flag, 4 is_inhibition flag, 7 ';'-separated pubmed ids.
    """
    # Initiating the parser
    db_api = PsimiSQL(SQL_SEED)
    with open(DATA_FILE) as data:
        # Skipping the header
        data.readline()
        node_names_to_id = {}
        lines = 0
        for line in data:
            columns = line.strip().split('\t')
            lines += 1
            if lines % 50000 == 0:
                print("processed lines (OmniPath): %d" % lines)

            # Direction flag first, so malformed lines are skipped before
            # anything is inserted.
            # BUGFIX: previously an unknown flag only printed a warning and
            # then used an undefined/stale 'directed' value for the edge.
            direct = 'false'
            if columns[2] == '1':
                directed = 'true'
                direct = 'true'
            elif columns[2] == '0':
                directed = 'false'
            else:
                print("WARNING: unknown direction flag in line: " + line)
                continue

            # Creating the node dicts, if the node is already in the db
            # assigning that to the node dict
            source_dict = insert_or_get_node_dict(columns[0], "Uniprot",
                                                  'taxid:9606',
                                                  node_names_to_id, db_api)
            target_dict = insert_or_get_node_dict(columns[1], "Uniprot",
                                                  'taxid:9606',
                                                  node_names_to_id, db_api)

            # the link is indirect, unless it is directed or if it has a role
            # as inhibitor or stimulator
            interaction_type_terms = []
            if columns[3] == '1':
                interaction_type_terms.append('MI:0624(stimulant)')
                direct = 'true'
            if columns[4] == '1':
                interaction_type_terms.append('MI:0623(inhibition)')
                direct = 'true'

            interaction_types = "is_directed:%s|is_direct:%s" % (directed,
                                                                 direct)
            if len(interaction_type_terms) > 0:
                interaction_types += "|" + "|".join(interaction_type_terms)

            # keep only numeric pubmed ids, always add the OmniPath paper
            pubmed_ids = {
                x.strip()
                for x in columns[7].split(';')
                if re.search(r"^\d+$", x.strip())
            }
            pubmed_ids.add("27898060")  # OmniPath publication

            edge_dict = {
                'publication_ids':
                "|".join("pubmed:" + x for x in pubmed_ids),
                'layer': '3',
                'source_db': 'OmniPath',
                'interaction_identifiers': None,
                'confidence_scores': None,
                'interaction_detection_method': None,
                'interaction_types': interaction_types,
                'first_author': None
            }
            db_api.insert_edge(source_dict, target_dict, edge_dict)
        print("processed lines (OmniPath): %d" % lines)

    # Saving to a DB_TYPE.db file
    db_api.save_db_to_file(EXPORT_DB_LOCATION)
def main(logger):
    """Load the StarBase miRNA-target files listed in DATA_FILE_LIST into a
    PsimiSQL database and save it to DB_DESTINATION.

    Per-file parsing details (column indices, id types, taxid, scores,
    detection method) come from FILE_DICT, keyed by the file's basename.
    """
    db_api = PsimiSQL(SQL_SEED)
    known_nodes = {}

    for data_file in DATA_FILE_LIST:
        print("processing data file: " + data_file)
        with open(data_file) as data:
            # The first four lines are header material
            for _ in range(4):
                data.readline()

            metainfo = FILE_DICT[data_file.split("/")[-1]]

            for raw_line in data:
                columns = raw_line.split('\t')
                if len(columns) < 2:
                    continue

                rna_id = columns[metainfo['rna_id_column']].strip()
                if metainfo['rna_id_type'] == 'miRBase' \
                        and rna_id.endswith(("-3p", "-5p")):
                    # the rna mapping does not care about the 3p/5p postfix
                    rna_id = rna_id[:-3]
                if metainfo['rna_id_type'] == 'HGNC':
                    rna_id = rna_id.lower()

                gene_id = columns[metainfo['gene_id_column']].strip()
                if metainfo['gene_id_type'] == 'WormBase':
                    # the WormBase mapping DB contains only uppercase IDs
                    gene_id = gene_id.upper()

                # Fetch-or-create both endpoints
                source_dict = insert_or_get_node_dict(
                    rna_id, metainfo['rna_id_type'], metainfo['tax_id'],
                    known_nodes, db_api)
                target_dict = insert_or_get_node_dict(
                    gene_id, metainfo['gene_id_type'], metainfo['tax_id'],
                    known_nodes, db_api)

                # Collect the configured confidence scores for this file
                scores = [
                    "%s:%s" % (definition['score_name'].strip(),
                               columns[definition['column']].strip())
                    for definition in metainfo['scores']
                ]

                # Inserting the edge
                db_api.insert_edge(
                    source_dict, target_dict, {
                        'publication_ids':
                        'pubmed:24297251',  # StarBase v2.0 publication
                        'layer': '5',
                        'source_db': 'StarBase',
                        'interaction_identifiers': None,
                        'confidence_scores': "|".join(scores),
                        'interaction_detection_method':
                        metainfo['detection_method'],
                        'interaction_types':
                        "is_directed:true|is_direct:true|MI:0571(mrna cleavage)",
                        'first_author': None
                    })

    # Saving to a DB_TYPE.db file
    db_api.save_db_to_file(DB_DESTINATION)
    print("StarBase finished " + DB_DESTINATION)
def main(logger):
    """Parse the lncRInter TSV file (DATA_FILE) into a PsimiSQL database
    and save it to DB_DESTINATION.

    Columns used (0-based): 0/1 HGNC symbols, 4 organism name,
    8 free-text detection method, 9 pubmed id.
    """
    # Initiating the parser
    db_api = PsimiSQL(SQL_SEED)
    node_names_to_id = {}

    # Constant for every edge of this source
    interaction_types = "is_directed:true|is_direct:true|MI:0407(direct interaction)"

    # Free-text detection method -> PSI-MI term(s).
    # Hoisted out of the per-line loop (it was rebuilt on every iteration);
    # a duplicated 'luciferase reporter assay' key (same value) was removed.
    detmap = {
        'pull-down assay': 'MI:0096(pull down)',
        'qPCR, Western blot, RIP.': 'MI:1195(quantitative pcr)|MI:0113(western blot)|MI:1017(rna immunoprecipitation)',
        'qRT-PCR, RNAi': 'MI:1196(quantitative reverse transcription pcr)',
        'in vitro': 'MI:0045(experimental interaction detection)',
        'In vitro': 'MI:0045(experimental interaction detection)',
        'Luciferase reporter assay, Pulldown assay ': 'MI:2285(miRNA interference luciferase reporter assay)|MI:0096(pull down)',
        'luciferase reporter assays and pull-down assay': 'MI:2285(miRNA interference luciferase reporter assay)|MI:0096(pull down)',
        'RIP': 'MI:1017(rna immunoprecipitation)',
        'luciferase reporter constructs': 'MI:2285(miRNA interference luciferase reporter assay)',
        'in vitro and vivo': 'MI:0045(experimental interaction detection)',
        'dual luciferase reporter assay?': 'MI:2285(miRNA interference luciferase reporter assay)',
        'dual luciferase reporter assays': 'MI:2285(miRNA interference luciferase reporter assay)',
        'qPCR, RNAi etc.': 'MI:1195(quantitative pcr)',
        'ISH and Luciferase Assay': 'MI:2285(miRNA interference luciferase reporter assay)',
        'In vitro RNA/dsDNA binding assay utilizing biotin tagged RNA oligos as bait': 'MI:0045(experimental interaction detection)',
        'In vitro RNA/dsDNA binding assay': 'MI:0045(experimental interaction detection)',
        'biotin-avidin pull-down system': 'MI:0096(pull down)',
        'microRNA crosslinking and immunoprecipitation (miR-CLIP)': 'MI:2191(clip)',
        'Luciferase reporter assay and qPCR': 'MI:2285(miRNA interference luciferase reporter assay)',
        'RNA immunoprecipitation;luciferase reporter assays': 'MI:2285(miRNA interference luciferase reporter assay)|MI:1017(rna immunoprecipitation)',
        'in vivo': 'MI:0045(experimental interaction detection)',
        'luciferase reporter assays': 'MI:2285(miRNA interference luciferase reporter assay)',
        'RNA immunoprecipitation and luciferase reporter assays': 'MI:2285(miRNA interference luciferase reporter assay)|MI:1017(rna immunoprecipitation)',
        'EMSA': 'MI:0413(electrophoretic mobility shift assay)',
        'luciferase reporter assay': 'MI:2285(miRNA interference luciferase reporter assay)',
        'Luciferase assays': 'MI:2285(miRNA interference luciferase reporter assay)',
        '-': 'MI:0045(experimental interaction detection)',
        'RNA immunoprecipitation': 'MI:1017(rna immunoprecipitation)',
        'RIP, Biotin-RNA Pull-Down Assay,qRT-PCR,EMSA': 'MI:1017(rna immunoprecipitation)|MI:0096(pull down)|MI:0413(electrophoretic mobility shift assay)|MI:1196(quantitative reverse transcription pcr)',
        'Luciferase reporter assay, RIP assay and RNA pull-down assay': 'MI:2285(miRNA interference luciferase reporter assay)|MI:1017(rna immunoprecipitation)|MI:0096(pull down)',
        'qPCR, Western blot and RNAi': 'MI:1195(quantitative pcr)|MI:0113(western blot)',
        'CLIP': 'MI:2191(clip)',
        'RIP and ChIP assay ': 'MI:1017(rna immunoprecipitation)',
        'in vitro or vivo': 'MI:0045(experimental interaction detection)',
        'RNA pull-down assay': 'MI:0096(pull down)',
        'immunoprecipitation (RIP) assay and RNA pull-down assay': 'MI:1017(rna immunoprecipitation)|MI:0096(pull down)',
        'luciferase reporter': 'MI:2285(miRNA interference luciferase reporter assay)',
        'in vitro and in vivo': 'MI:0045(experimental interaction detection)',
        'in viro': 'MI:0045(experimental interaction detection)',
        'co-RNA-FISH assays': 'MI:0045(experimental interaction detection)',
        'luciferase reporter ': 'MI:2285(miRNA interference luciferase reporter assay)',
        'microarray, qPCR': 'MI:1195(quantitative pcr)',
        'In vitro and in vivo': 'MI:0045(experimental interaction detection)',
        'Luciferase reporter assays': 'MI:2285(miRNA interference luciferase reporter assay)',
        'RIP and Luciferase assays': 'MI:2285(miRNA interference luciferase reporter assay)|MI:1017(rna immunoprecipitation)',
        'RNA-FISH': 'MI:0045(experimental interaction detection)',
        'RNA FISH': 'MI:0045(experimental interaction detection)',
        'FISH, Allele-specific RT-PCR': 'MI:1196(quantitative reverse transcription pcr)',
        'RIP and RNA pull-down': 'MI:1017(rna immunoprecipitation)',
        'RIP and ChIP assay': 'MI:0019(coimmunoprecipitation)'
    }

    with open(DATA_FILE) as data:
        # Skipping the header
        data.readline()
        for line in data:
            columns = line.split('\t')
            if columns[4].strip().lower() in ORGANISM_TO_TAXID:
                taxid = ORGANISM_TO_TAXID[columns[4].strip().lower()]
                # Creating the node dicts, if the node is already in the db
                # assigning that to the node dict
                source_dict = insert_or_get_node_dict(
                    columns[0], "HGNC", taxid, node_names_to_id, db_api)
                target_dict = insert_or_get_node_dict(
                    columns[1], "HGNC", taxid, node_names_to_id, db_api)

                detmethod = None
                if columns[8].strip() in detmap:
                    detmethod = detmap[columns[8].strip()]
                else:
                    print("WARNING: unknown detection method: " +
                          columns[8].strip())

                pubmed_ids = ['28529080']  # lncRInter publication
                pubmed_id = columns[9].strip()
                if len(pubmed_id) > 0:
                    pubmed_ids.append(pubmed_id)
                pubmed_ids = set(map(lambda x: 'pubmed:' + x, pubmed_ids))

                # Inserting edges
                edge_dict = {
                    'publication_ids': "|".join(pubmed_ids),
                    'layer': '7',
                    'source_db': 'lncRInter',
                    'interaction_identifiers': None,
                    'confidence_scores': None,
                    'interaction_detection_method': detmethod,
                    'interaction_types': interaction_types,
                    'first_author': None
                }
                db_api.insert_edge(source_dict, target_dict, edge_dict)

    # Saving to a DB_TYPE.db file
    db_api.save_db_to_file(DB_DESTINATION)
    print("lncRInter finished")
def main(logger):
    """Parse the ';'-separated curated autophagy interaction file
    (DATA_FILE) into a PsimiSQL database and save it to DB_DESTINATION.
    """

    def _get_node(id, taxid, pathway, alias, topology, psi_mi_to_sql_object):
        """
        Set up a node dict and return it. If the node is already in the
        SQLite database, fetch that node from the db so it won't be inserted
        multiple times.
        (Replaces the two byte-identical helpers get_node_a/get_node_b.)
        """
        # Testing if the node is already in the database
        node_dict = psi_mi_to_sql_object.get_node(id, node_tax_id=taxid)
        if not node_dict:
            node_dict = {
                "name": id,
                "tax_id": taxid,
                "alt_accession": None,
                'pathways': pathway,
                "aliases": alias,
                "topology": topology
            }
        return node_dict

    # Initiating the parser
    db_api = PsimiSQL(SQL_SEED)

    # Constant lookup tables, hoisted out of the per-line loop
    # (they were rebuilt on every iteration).
    # Mapping layer descriptions to abbreviations
    layer_dict = {
        'Post-translational regulation': '2',
        'Interaction between autophagy proteins': '0',
        'Autophagy regulators': '1'
    }
    # Is directed
    directed_map = {'PPI directed': 'true', 'PPI undirected': 'false'}
    # Is direct
    direct_map = {'direct': 'true'}
    # Effect
    effect_map = {'stimulation': 'MI:0624(stimulation)'}
    # Source db mapping
    sourcedb_map = {
        'BioGRID': 'TheBiogrid',
        'Behrends et Al. 2010': 'Behrends',
        'direction is predicted': 'Behrends predicted'
    }

    # Parsing data file
    with open(DATA_FILE) as data:
        # Skipping the header
        data.readline()
        for line in data:
            line = line.strip().split(';')

            # Taxid: only human is converted to a 'taxid:' value, any other
            # species code is passed through unchanged (as in the original)
            taxid_source = 'taxid:9606' if line[2] == '9606' else line[2]
            taxid_target = 'taxid:9606' if line[10] == '9606' else line[10]

            # Pathways: 'name(detail)' entries, keep only the name part
            source_ptw = '|'.join(
                ptw.strip().split('(')[0] for ptw in line[7].split(','))
            target_ptw = '|'.join(
                ptw.strip().split('(')[0] for ptw in line[15].split(','))

            # Topology
            source_topol = '|'.join(line[4].strip().split(','))
            target_topol = '|'.join(line[12].strip().split(','))

            # Creating the node dicts, if the node is already in the db
            # assigning that to the node dict
            source_dict = _get_node('Uniprot:' + line[1], taxid_source,
                                    source_ptw, line[0], source_topol, db_api)
            target_dict = _get_node('Uniprot:' + line[9], taxid_target,
                                    target_ptw, line[8], target_topol, db_api)

            # Nodes are inserted to the db if they are not in it yet
            if 'id' not in source_dict:
                db_api.insert_node(source_dict)
            if 'id' not in target_dict:
                db_api.insert_node(target_dict)

            # NOTE(review): direct_map/effect_map raise KeyError for any
            # value other than 'direct' / 'stimulation' / 'unknown' —
            # presumably the curated file guarantees this; confirm.
            is_direct = direct_map[line[18]]
            if line[19] != 'unknown':
                effect = effect_map[line[19]]
                # Constructing interaction data line
                int_types = '|'.join([
                    effect, 'is_directed:' + directed_map[line[17]],
                    'is_direct:' + is_direct
                ])
            else:
                # Constructing interaction data line
                int_types = '|'.join([
                    'is_directed:' + directed_map[line[17]],
                    'is_direct:' + is_direct
                ])

            # Publications: the leading 'pubmed:' is added in edge_dict below
            pubs = '|pubmed:'.join(line[20].split('|'))

            # Source db entries look like 'name(detail)'; pmid entries are
            # dropped, known names are normalized via sourcedb_map
            dblist = []
            for db in line[21].split(','):
                sourcedb = db.strip().split('(')[0]
                if 'pmid' not in sourcedb:
                    dblist.append(sourcedb_map.get(sourcedb, sourcedb))
            final_source = '|'.join(dblist)

            edge_dict = {
                'publication_ids': 'pubmed:' + pubs,
                'layer': layer_dict[line[16]],
                'source_db': final_source,
                'interaction_identifiers': None,
                'confidence_scores': None,
                'interaction_detection_method': None,
                'interaction_types': int_types,
                'first_author': None
            }
            db_api.insert_edge(source_dict, target_dict, edge_dict)

    # Saving to a DB_TYPE.db file
    db_api.save_db_to_file(DB_DESTINATION)
def main(logger):
    """Parse the StarBase v3 miRNA-lncRNA files in DATA_FILE_LIST into a
    PsimiSQL database and save it to DB_DESTINATION.
    """
    # Initiating the parser
    db_api = PsimiSQL(SQL_SEED)

    # Known input files -> (miRNA name column, lncRNA name column)
    column_layout = {
        'lncRNA/databases/starbase/files/starbase_v3_miRNAlncRNA.txt': (1, 3),
        'lncRNA/databases/starbase/files/starbase_v3_ncRNA_degradome_human.txt': (1, 2),
        'lncRNA/databases/starbase/files/starbase_v3_ncRNA_degradome_worm.txt': (1, 2),
        'lncRNA/databases/starbase/files/starbase_v3_lncRNA_valid.txt': (1, 4),
    }

    for file in DATA_FILE_LIST:
        with open(file) as data:
            # Skipping the 4 header lines
            for _ in range(4):
                data.readline()
            taxid = FILE_TO_TAXID[file]
            for line in data:
                columns = line.split('\t')
                if len(columns) == 1:
                    continue
                if file not in column_layout:
                    # BUGFIX: unrecognized files used to fall through with
                    # mirna_name = lncrna_name = None and still insert bogus
                    # None-named nodes and edges; they are now skipped.
                    continue
                mirna_col, lncrna_col = column_layout[file]
                mirna_name = columns[mirna_col]
                lncrna_name = columns[lncrna_col]

                # Creating the node dicts, if the node is already in the db
                # assigning that to the node dict.
                # NOTE(review): the miRNA name is passed to get_node_lncrna
                # and the lncRNA name to get_node_mirna — this looks swapped
                # but is kept exactly as in the original; confirm the
                # intended source/target roles.
                source_dict = get_node_lncrna(mirna_name, taxid, db_api)
                target_dict = get_node_mirna(lncrna_name, taxid, db_api)

                # Nodes are inserted to the db if they are not in it yet
                if 'id' not in source_dict:
                    db_api.insert_node(source_dict)
                if 'id' not in target_dict:
                    db_api.insert_node(target_dict)

                interaction_types = "effect:%s|is_directed:%s|is_direct:%s" \
                    % ('MI:0256(rna interference)', 'directed', 'unknown')

                # Inserting edges
                edge_dict = {
                    'publication_ids': 'pubmed:24297251',
                    'layer': '7',
                    'source_db': 'starbase',
                    'interaction_identifiers': None,
                    'confidence_scores': None,
                    'interaction_detection_method': file_to_detmet[file],
                    'interaction_types': interaction_types,
                    'first_author': None
                }
                db_api.insert_edge(source_dict, target_dict, edge_dict)

    # Saving to a DB_TYPE.db file
    db_api.save_db_to_file(DB_DESTINATION)
def main(logger):
    """Split the raw BioGRID PSI-MITAB file (RAW_FILE) into ~10 MB pieces
    and parse each piece into its own PsimiSQL database file, saved as
    DB_DESTINATION + 'db_piece_<n>'.
    """
    file_ = open(RAW_FILE)
    # Measure the file size by seeking to the end
    file_.seek(0, os.SEEK_END)
    filesize = file_.tell()
    filesize_mb = filesize / (1024 * 1024)
    # Resetting the iterator to the beginning of the file and skipping header
    file_.seek(0)
    file_.readline()
    # Creating a psimi-to-sql db for each piece of the raw BioGRID file
    # Setting the size of the piece
    mb = 1024 * 1024
    piece_size = 10 * mb
    # The number of the little files created so far
    file_counter = 0
    while file_.tell() < filesize:
        starting_position = file_.tell()
        # One fresh parser/database per piece
        parser = PsimiSQL(SQL_SEED)
        while file_.tell() < starting_position + piece_size:
            # Progress indicator (overwrites the same console line via \r)
            sys.stdout.write(
                "Parsing piece: %d Mb / %d Mb Total: %d Mb / %d Mb \r" %
                ((file_.tell() - starting_position) / (1024 * 1024),
                 piece_size / (1024 * 1024), file_.tell() / (1024 * 1024),
                 filesize_mb))
            # Dealing with the data
            line = file_.readline()
            cells = line.split("\t")
            # At EOF readline() returns '' -> cells has one element and the
            # indexing below raises IndexError, which ends this piece
            try:
                # Extracting node a's properties
                node_a_dict = {
                    'name': extract_property("biogrid", cells[2]),
                    'alt_accession': extract_property("locuslink", cells[2]),
                    'tax_id': cells[9],
                    'pathways': None,
                    'aliases': None
                }
                # Extracting node b's properties
                node_b_dict = {
                    'name': extract_property("biogrid", cells[3]),
                    'alt_accession': extract_property("locuslink", cells[3]),
                    'tax_id': cells[10],
                    'pathways': None,
                    'aliases': None
                }
                # Interaction types: strip the psi-mi:"..." wrapping
                inttype = cells[11].replace('psi-mi:', '').replace('"', '')
                if inttype == 'MI:0407(direct interaction)':
                    # the direct-interaction term is moved to is_direct and a
                    # generic effect term is substituted
                    is_direct = inttype
                    effect = 'MI:0190(interaction type)'
                else:
                    is_direct = 'unknown'
                    effect = inttype
                interaction_types = "effect:%s|is_directed:%s|is_direct:%s" \
                    % (effect, "directed", is_direct)
                # Extracting the edge's properties
                edge_dict = {
                    'interaction_detection_method':
                    cells[6].replace('psi-mi:', '').replace('"', ''),
                    'first_author': cells[7],
                    'publication_ids': cells[8],
                    'interaction_types': interaction_types,
                    'source_db': 'biogrid',
                    'interaction_identifiers': None,
                    'confidence_scores': cells[14],
                    'layer': "1"
                }
                # Inserting interactor a to the node table
                parser.insert_node(node_a_dict)
                # Inserting interactor b to the node table
                parser.insert_node(node_b_dict)
                # After insertion the node dictionaries will contain a lastrowid property
                # Inserting edge
                parser.insert_edge(node_a_dict, node_b_dict, edge_dict)
                # Inserting aliases
                #aliases_a = cells[4]
                #aliases_b = cells[5]
                #parser.insert_aliases(node_a_dict,aliases_a)
                #parser.insert_aliases(node_b_dict,aliases_b)
            except IndexError:
                break
        # Persist this piece and start a new one
        parser.save_db_to_file(DB_DESTINATION + "db_piece_%d" % file_counter)
        parser.db.close()
        sum_files = filesize / piece_size
        #sys.stdout.write('%d / %d SQLite db pieces created\r' % (file_counter, sum_files))
        file_counter += 1
    print("Data insertion is completed")