nodes[name] = node else: nodes[name].update(node) #closing the current db db.close() #logging out some info current_file = PIECE_LIST.index(filename) sys.stdout.write( "Building the node dictionary: Processing %d files out of %d\r" % (current_file, sum_files)) #making a memory database and inserting the unique nodes from the nodes dictionary print('Inserting nodes to database') parser = PsimiSQL(SQL_SEED) for node in nodes: parser.insert_unique_node(nodes[node]) #now that we have the nodes in the final db, the edges can be inserted #there is no need for a edges dictionary, because reading it from the files costs less memory #iterating through the .db piece files again print('Inserting edges to database') for filename in PIECE_LIST: db = sqlite3.connect(filename) query = "SELECT * FROM edge" cursor = db.cursor() cursor.execute(query) #iterating trough the current piece .db files while True: edge_row = cursor.fetchone() if edge_row == None:
def main():
    """Merge the SQLite piece databases in SOURCE_DB_FILE_LIST into a
    single database saved to DESTINATION.

    Nodes are deduplicated by name (duplicates merged with
    get_union_of_nodes); edges are deduplicated by an "A@B"
    interactor-name key and their annotation fields merged with
    merge_strings.  Only human (taxid:9606) and Salmonella
    (taxid:99284) nodes are kept, and only edges between kept nodes.
    """
    # nodes: name -> node dict; collected_edges: "A@B" -> merged edge dict
    nodes = {}
    collected_edges = {}

    # the number of pieces (.db files)
    sum_files = len(SOURCE_DB_FILE_LIST)

    # Phase 1: fill the nodes dictionary from every piece file.
    for current_file, db_file in enumerate(SOURCE_DB_FILE_LIST, start=1):
        db = sqlite3.connect(db_file)
        try:
            cursor = db.cursor()
            cursor.execute(
                "SELECT * FROM node WHERE tax_id = 'taxid:9606' OR tax_id = 'taxid:99284'"
            )
            # iterating through the db row by row, until the last row
            while True:
                row = cursor.fetchone()
                if row is None:
                    break
                # first column is the piece-local row id; deliberately unused
                _, name, alt_accession, tax_id, pathways, aliases, topology = row
                node = {
                    "name": name,
                    "alt_accession": alt_accession,
                    "tax_id": tax_id,
                    "pathways": pathways,
                    "aliases": aliases,
                    "topology": topology,
                }
                if name not in nodes:
                    nodes[name] = node
                else:
                    # duplicate node name: merge annotations with the stored one
                    nodes[name] = get_union_of_nodes(nodes[name], node)
        finally:
            # always close the current piece db, even on error
            db.close()

        # logging out some info
        sys.stdout.write(
            "Building the node dictionary: Processing %d file out of %d\r"
            % (current_file, sum_files)
        )

    # Phase 2: insert the unique nodes into the (memory) database and
    # remember the row id each node received, for edge insertion later.
    print("Inserting nodes to database")
    parser = PsimiSQL()
    for node_name in nodes:
        parser.insert_unique_node(nodes[node_name])
        nodes[node_name]["id"] = parser.cursor.lastrowid

    # Phase 3: loop through the files again to build the edge dict.
    print("Started building edge dict")
    for file_counter, db_file in enumerate(SOURCE_DB_FILE_LIST, start=1):
        sys.stdout.write(
            "Inserting edges to edge dict from '%s' (%d/%d)\r"
            % (db_file, file_counter, sum_files)
        )
        db = sqlite3.connect(db_file)
        try:
            cursor = db.cursor()
            cursor.execute("SELECT * FROM edge")
            while True:
                row = cursor.fetchone()
                # if there aren't any more rows break out of the loop
                if not row:
                    break
                # deconstructing the row (list); ids of the piece db are unused
                (
                    _edge_row_id,
                    _old_interactor_a_node_id,
                    _old_interactor_b_node_id,
                    interactor_a_node_name,
                    interactor_b_node_name,
                    interaction_detection_method,
                    first_author,
                    publication_ids,
                    interaction_types,
                    source_db,
                    interaction_identifiers,
                    confidence_scores,
                    layer,
                ) = row

                # Because the node query only asked for human and salmonella
                # nodes, skip edges whose endpoints are not in the nodes dict
                # (= a node of another organism).
                if (
                    interactor_a_node_name not in nodes
                    or interactor_b_node_name not in nodes
                ):
                    continue

                # edge id that will be the key in the collected_edges dict
                edge_id = interactor_a_node_name + "@" + interactor_b_node_name
                current_edge = {
                    "interaction_detection_method": interaction_detection_method,
                    "first_author": first_author,
                    "publication_ids": publication_ids,
                    "interaction_types": interaction_types,
                    "source_db": source_db,
                    "interaction_identifiers": interaction_identifiers,
                    "confidence_scores": confidence_scores,
                    "layer": layer,
                }

                if edge_id not in collected_edges:
                    # first time this edge is seen: store it as-is
                    collected_edges[edge_id] = current_edge
                else:
                    # edge already collected: merge its annotation fields
                    # (publication_ids and layer keep the stored values,
                    # matching the original behavior)
                    collected_edge = collected_edges[edge_id]
                    for field in (
                        "interaction_types",
                        "first_author",
                        "source_db",
                        "interaction_identifiers",
                        "interaction_detection_method",
                        "confidence_scores",
                    ):
                        collected_edge[field] = merge_strings(
                            collected_edge[field], current_edge[field]
                        )
        finally:
            db.close()

    print("Building edge dict done!")
    print("Started inserting edges to the db")

    # Phase 4: insert the merged edges, then save the db to a file.
    for collected_edge_id, edge_to_insert in collected_edges.items():
        # getting the nodes back from their "A@B" key
        node_a, node_b = collected_edge_id.split("@")
        parser.insert_edge(nodes[node_a], nodes[node_b], edge_to_insert)

    print("Saving db")
    parser.save_db_to_file(DESTINATION)
def _load_nodes_from_pieces():
    """Read every piece .db in SOURCE_DB_FILE_LIST and return a dict
    mapping node name -> node dict.

    Only human (taxid:9606) and Salmonella (taxid:99284) nodes are
    kept; duplicate names are merged with get_union_of_nodes.
    """
    nodes = {}
    total = len(SOURCE_DB_FILE_LIST)
    for position, db_file in enumerate(SOURCE_DB_FILE_LIST, start=1):
        db = sqlite3.connect(db_file)
        try:
            cursor = db.cursor()
            cursor.execute(
                "SELECT * FROM node WHERE tax_id = 'taxid:9606' OR tax_id = 'taxid:99284'"
            )
            for row in cursor:
                # first column is the piece-local row id; deliberately unused
                _, name, alt_accession, tax_id, pathways, aliases, topology = row
                node = {
                    "name": name,
                    "alt_accession": alt_accession,
                    "tax_id": tax_id,
                    "pathways": pathways,
                    "aliases": aliases,
                    "topology": topology,
                }
                if name in nodes:
                    # duplicate node name: merge with the stored annotations
                    nodes[name] = get_union_of_nodes(nodes[name], node)
                else:
                    nodes[name] = node
        finally:
            # always release the piece db, even on error
            db.close()
        sys.stdout.write(
            "Building the node dictionary: Processing %d file out of %d\r"
            % (position, total)
        )
    return nodes


def _collect_edges(nodes):
    """Read every piece .db and return {"A@B": edge_dict}.

    Edges with an endpoint missing from *nodes* (i.e. nodes of other
    organisms, filtered out earlier) are skipped.  Annotation fields of
    duplicate edges are merged with merge_strings; publication_ids and
    layer keep the first-seen values.
    """
    merged_fields = (
        "interaction_types",
        "first_author",
        "source_db",
        "interaction_identifiers",
        "interaction_detection_method",
        "confidence_scores",
    )
    collected_edges = {}
    total = len(SOURCE_DB_FILE_LIST)
    for position, db_file in enumerate(SOURCE_DB_FILE_LIST, start=1):
        sys.stdout.write(
            "Inserting edges to edge dict from '%s' (%d/%d)\r"
            % (db_file, position, total)
        )
        db = sqlite3.connect(db_file)
        try:
            cursor = db.cursor()
            cursor.execute("SELECT * FROM edge")
            for row in cursor:
                # deconstructing the row; the piece-local ids are unused
                (
                    _row_id,
                    _old_a_id,
                    _old_b_id,
                    name_a,
                    name_b,
                    detection_method,
                    first_author,
                    publication_ids,
                    interaction_types,
                    source_db,
                    identifiers,
                    confidence_scores,
                    layer,
                ) = row
                # keep only edges whose both endpoints survived the
                # organism filter used while building the node dict
                if name_a not in nodes or name_b not in nodes:
                    continue
                edge_id = name_a + "@" + name_b
                current_edge = {
                    "interaction_detection_method": detection_method,
                    "first_author": first_author,
                    "publication_ids": publication_ids,
                    "interaction_types": interaction_types,
                    "source_db": source_db,
                    "interaction_identifiers": identifiers,
                    "confidence_scores": confidence_scores,
                    "layer": layer,
                }
                if edge_id in collected_edges:
                    # edge already collected: merge its annotation fields
                    stored = collected_edges[edge_id]
                    for field in merged_fields:
                        stored[field] = merge_strings(
                            stored[field], current_edge[field]
                        )
                else:
                    collected_edges[edge_id] = current_edge
        finally:
            db.close()
    return collected_edges


def main():
    """Merge the piece databases listed in SOURCE_DB_FILE_LIST into one
    SQLite database written to DESTINATION: deduplicated nodes first,
    then the merged edges between them.
    """
    nodes = _load_nodes_from_pieces()

    # insert the unique nodes into the database, remembering the row id
    # each node received so edges can reference it
    print("Inserting nodes to database")
    parser = PsimiSQL()
    for name in nodes:
        parser.insert_unique_node(nodes[name])
        nodes[name]["id"] = parser.cursor.lastrowid

    print("Started building edge dict")
    collected_edges = _collect_edges(nodes)
    print("Building edge dict done!")

    # iterating through the edges dictionary and inserting them into the db
    print("Started inserting edges to the db")
    for edge_id, edge in collected_edges.items():
        name_a, name_b = edge_id.split("@")
        parser.insert_edge(nodes[name_a], nodes[name_b], edge)

    print("Saving db")
    parser.save_db_to_file(DESTINATION)