def main():
    """Map the nodes of a source PSI-MI SQLite db onto UniProt accessions
    and copy the edges into a fresh db, keeping only edges whose endpoints
    both mapped.

    Reads module-level configuration: ``SOURCE_DB_LOCATION``,
    ``SOURCE_DB_TYPE``, ``DESTINATION_DB_LOCATION`` and the mapping db
    handles ``DICTIONARY_DB`` / ``DICTIONARY_DB_CURSOR``.
    """
    # Open the old db whose nodes will be mapped.
    old_db = PsimiSQL()
    old_db.import_from_db_file(SOURCE_DB_LOCATION)

    # Progress bookkeeping for the node pass.
    counter = 0
    old_db.cursor.execute("SELECT count(*) FROM node")
    number_of_nodes = old_db.cursor.fetchone()[0]

    # Iterate through the old db's nodes.
    old_db.cursor.execute("SELECT * FROM node")

    # Maps each old node id -> list of new node ids it was mapped to.
    old_node_ids_dict = {}

    # Empty db that receives the mapped nodes.
    new_db = PsimiSQL()

    no_match_counter = 0       # nodes with no entry in the mapping dictionary
    invalid_node_counter = 0   # edges dropped because an endpoint did not map

    # --- node pass -------------------------------------------------------
    while True:
        row = old_db.cursor.fetchone()

        # Keep the user informed about progress.
        sys.stdout.write("Querying %d. node from dictionary out of %d\r"
                         % (counter, number_of_nodes))
        counter += 1

        # fetchone() returns None after the last row (use `is`, not `==`).
        if row is None:
            break

        row_id, mitab_name, alt_accession, mitab_tax_id, pathways, aliases, topology = row
        # MITAB fields look like "prefix:value"; keep only the value part.
        tax_id = str(mitab_tax_id.split(':')[1])
        name = str(mitab_name.split(':')[1])

        old_node_dict = {
            "id": row_id,
            "name": mitab_name,
            "alt_accession": alt_accession,
            "tax_id": mitab_tax_id,
            "pathways": pathways,
            "aliases": aliases,
            "topology": topology
        }

        # Look up the foreign accession in the mapping dictionary, restricted
        # to primary UniProt entries of the same taxon.
        query = """
            SELECT DISTINCT foreign_ids.accession, uniprot.accession,
                            uniprot.is_swissprot, uniprot.is_primary
            FROM foreign_ids
            JOIN uniprot ON foreign_ids.uniprot_id = uniprot.id
            WHERE foreign_ids.accession = ?
              AND uniprot.tax_id = ?
              AND uniprot.is_primary = 1
        """
        # Read-only query: no commit() needed on the dictionary db.
        DICTIONARY_DB_CURSOR.execute(query, (name, tax_id))
        result = DICTIONARY_DB_CURSOR.fetchall()

        if not result:
            # No match in the mapping dictionary for the current node.
            no_match_counter += 1
            continue

        swiss_nodes = get_swiss_arr(result)     # SwissProt rows of the result
        trembl_nodes = get_trembl_arr(result)   # TrEMBL rows of the result
        # New aliases are built from the TrEMBL matches.
        aliases = get_aliases_string(trembl_nodes)

        if swiss_nodes:
            # Insert every SwissProt match (a single match is the common
            # 1 -> 1 case; several matches all get added).
            for node in swiss_nodes:
                add_node(old_node_dict, old_node_ids_dict,
                         "uniprot:" + node[1], new_db, aliases)
        else:
            # No SwissProt match at all: fall back to the TrEMBL accessions.
            for node in trembl_nodes:
                add_node(old_node_dict, old_node_ids_dict,
                         "trembl:" + node[1], new_db, aliases)

    print("Inserting to %s nodes done" % SOURCE_DB_TYPE)

    # --- edge pass -------------------------------------------------------
    # Counters so the user gets progress information.
    old_db.cursor.execute("SELECT count(*) FROM edge")
    number_of_edges = old_db.cursor.fetchone()[0]
    edge_counter = 0

    old_db.cursor.execute("SELECT * from edge")
    while True:
        sys.stdout.write("Parsing edge # %d out of %d\r"
                         % (edge_counter, number_of_edges))
        row = old_db.cursor.fetchone()
        if row is None:
            break
        edge_counter += 1

        # Deconstruct the row; _edge_id and _source_db are unused.
        (_edge_id, old_interactor_a_node_id, old_interactor_b_node_id,
         interactor_a_node_name, interactor_b_node_name,
         interaction_detection_method, first_author, publication_ids,
         interaction_types, _source_db, interaction_identifiers,
         confidence_scores, layer) = row

        # Both endpoints must have mapped; otherwise the edge is skipped.
        # (dict.has_key() was removed in Python 3 — use the `in` operator.)
        if (old_interactor_a_node_id in old_node_ids_dict
                and old_interactor_b_node_id in old_node_ids_dict):
            # Every new 'A' node is paired with every new 'B' node.
            for new_node_id_a in old_node_ids_dict[old_interactor_a_node_id]:
                new_node_a_dict = new_db.get_node_by_id(new_node_id_a)
                for new_node_id_b in old_node_ids_dict[old_interactor_b_node_id]:
                    new_node_b_dict = new_db.get_node_by_id(new_node_id_b)
                    new_edge_dict = {
                        'interactor_a_node_id': new_node_id_a,
                        'interactor_b_node_id': new_node_id_b,
                        'interactor_a_node_name': interactor_a_node_name,
                        'interactor_b_node_name': interactor_b_node_name,
                        'interaction_detection_method': interaction_detection_method,
                        'first_author': first_author,
                        'publication_ids': publication_ids,
                        'source_db': "source database:" + SOURCE_DB_TYPE,
                        'interaction_types': interaction_types,
                        'interaction_identifiers': interaction_identifiers,
                        'confidence_scores': confidence_scores,
                        'layer': layer
                    }
                    new_db.insert_edge(new_node_a_dict, new_node_b_dict,
                                       new_edge_dict)
        else:
            # Count edges that cannot be inserted because one of their
            # endpoints did not map.
            invalid_node_counter += 1

    print("Inserting edges to %s.db finished!" % SOURCE_DB_TYPE)
    new_db.save_db_to_file(DESTINATION_DB_LOCATION)
def main():
    """Map the nodes of a source PSI-MI SQLite db onto UniProt accessions
    and copy the edges into a fresh db, keeping only edges whose endpoints
    both mapped.

    Reads module-level configuration: ``SOURCE_DB_LOCATION``,
    ``SOURCE_DB_TYPE``, ``DESTINATION_DB_LOCATION`` and the mapping db
    handles ``DICTIONARY_DB`` / ``DICTIONARY_DB_CURSOR``.

    NOTE(review): this file defines ``main()`` twice with near-identical
    bodies; only the later definition is effective at runtime. The
    duplication should be removed — verify which copy callers expect.
    """
    # Open the old db whose nodes will be mapped.
    old_db = PsimiSQL()
    old_db.import_from_db_file(SOURCE_DB_LOCATION)

    # Progress bookkeeping for the node pass.
    counter = 0
    old_db.cursor.execute("SELECT count(*) FROM node")
    number_of_nodes = old_db.cursor.fetchone()[0]

    # Iterate through the old db's nodes.
    old_db.cursor.execute("SELECT * FROM node")

    # Maps each old node id -> list of new node ids it was mapped to.
    old_node_ids_dict = {}

    # Empty db that receives the mapped nodes.
    new_db = PsimiSQL()

    no_match_counter = 0       # nodes with no entry in the mapping dictionary
    invalid_node_counter = 0   # edges dropped because an endpoint did not map

    # --- node pass -------------------------------------------------------
    while True:
        row = old_db.cursor.fetchone()

        # Keep the user informed about progress.
        sys.stdout.write("Querying %d. node from dictionary out of %d\r"
                         % (counter, number_of_nodes))
        counter += 1

        # fetchone() returns None after the last row (use `is`, not `==`).
        if row is None:
            break

        row_id, mitab_name, alt_accession, mitab_tax_id, pathways, aliases, topology = row
        # MITAB fields look like "prefix:value"; keep only the value part.
        tax_id = str(mitab_tax_id.split(':')[1])
        name = str(mitab_name.split(':')[1])

        old_node_dict = {
            "id": row_id,
            "name": mitab_name,
            "alt_accession": alt_accession,
            "tax_id": mitab_tax_id,
            "pathways": pathways,
            "aliases": aliases,
            "topology": topology
        }

        # Look up the foreign accession in the mapping dictionary, restricted
        # to primary UniProt entries of the same taxon.
        query = """
            SELECT DISTINCT foreign_ids.accession, uniprot.accession,
                            uniprot.is_swissprot, uniprot.is_primary
            FROM foreign_ids
            JOIN uniprot ON foreign_ids.uniprot_id = uniprot.id
            WHERE foreign_ids.accession = ?
              AND uniprot.tax_id = ?
              AND uniprot.is_primary = 1
        """
        # Read-only query: no commit() needed on the dictionary db.
        DICTIONARY_DB_CURSOR.execute(query, (name, tax_id))
        result = DICTIONARY_DB_CURSOR.fetchall()

        if not result:
            # No match in the mapping dictionary for the current node.
            no_match_counter += 1
            continue

        swiss_nodes = get_swiss_arr(result)     # SwissProt rows of the result
        trembl_nodes = get_trembl_arr(result)   # TrEMBL rows of the result
        # New aliases are built from the TrEMBL matches.
        aliases = get_aliases_string(trembl_nodes)

        if swiss_nodes:
            # Insert every SwissProt match (a single match is the common
            # 1 -> 1 case; several matches all get added).
            for node in swiss_nodes:
                add_node(old_node_dict, old_node_ids_dict,
                         "uniprot:" + node[1], new_db, aliases)
        else:
            # No SwissProt match at all: fall back to the TrEMBL accessions.
            for node in trembl_nodes:
                add_node(old_node_dict, old_node_ids_dict,
                         "trembl:" + node[1], new_db, aliases)

    print("Inserting to %s nodes done" % SOURCE_DB_TYPE)

    # --- edge pass -------------------------------------------------------
    # Counters so the user gets progress information.
    old_db.cursor.execute("SELECT count(*) FROM edge")
    number_of_edges = old_db.cursor.fetchone()[0]
    edge_counter = 0

    old_db.cursor.execute("SELECT * from edge")
    while True:
        sys.stdout.write("Parsing edge # %d out of %d\r"
                         % (edge_counter, number_of_edges))
        row = old_db.cursor.fetchone()
        if row is None:
            break
        edge_counter += 1

        # Deconstruct the row; _edge_id and _source_db are unused.
        (_edge_id, old_interactor_a_node_id, old_interactor_b_node_id,
         interactor_a_node_name, interactor_b_node_name,
         interaction_detection_method, first_author, publication_ids,
         interaction_types, _source_db, interaction_identifiers,
         confidence_scores, layer) = row

        # Both endpoints must have mapped; otherwise the edge is skipped.
        # (dict.has_key() was removed in Python 3 — use the `in` operator.)
        if (old_interactor_a_node_id in old_node_ids_dict
                and old_interactor_b_node_id in old_node_ids_dict):
            # Every new 'A' node is paired with every new 'B' node.
            for new_node_id_a in old_node_ids_dict[old_interactor_a_node_id]:
                new_node_a_dict = new_db.get_node_by_id(new_node_id_a)
                for new_node_id_b in old_node_ids_dict[old_interactor_b_node_id]:
                    new_node_b_dict = new_db.get_node_by_id(new_node_id_b)
                    new_edge_dict = {
                        'interactor_a_node_id': new_node_id_a,
                        'interactor_b_node_id': new_node_id_b,
                        'interactor_a_node_name': interactor_a_node_name,
                        'interactor_b_node_name': interactor_b_node_name,
                        'interaction_detection_method': interaction_detection_method,
                        'first_author': first_author,
                        'publication_ids': publication_ids,
                        'source_db': "source database:" + SOURCE_DB_TYPE,
                        'interaction_types': interaction_types,
                        'interaction_identifiers': interaction_identifiers,
                        'confidence_scores': confidence_scores,
                        'layer': layer
                    }
                    new_db.insert_edge(new_node_a_dict, new_node_b_dict,
                                       new_edge_dict)
        else:
            # Count edges that cannot be inserted because one of their
            # endpoints did not map.
            invalid_node_counter += 1

    print("Inserting edges to %s.db finished!" % SOURCE_DB_TYPE)
    new_db.save_db_to_file(DESTINATION_DB_LOCATION)