Exemplos de PsimiSQL.save_db_to_file em Python, exemplos de sqlite_db_api.PsimiSQL.save_db_to_file em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: single_prediction_parser.py Projeto: blaisehorvath/host-pathogen-networks

                'alt_accession': "gene symbol:" + human_node_acc,
                'tax_id': "taxid:9606",
                'pathways': "",
                'aliases': "",
                'topology': ""
            }

            human_nodes[human_node_acc]['id'] = db_api.insert_node(human_node)

        edges.append([salm_node_acc, human_node_acc])

# Store every collected [salmonella accession, human accession] pair as an
# edge, then write the database to disk.
for pair in edges:

    # pairs whose first accession is not a known Salmonella node are ignored
    if pair[0] not in salmonella_nodes:
        continue

    # a fresh attribute dict is created for each inserted edge; only the
    # source database and the layer carry a value for predictions
    attributes = {
        'interaction_detection_method': "",
        'first_author': "",
        'publication_ids': "",
        'interaction_types': "",
        'source_db': args.prediction_name,
        'interaction_identifiers': "",
        'confidence_scores': "",
        'layer': "0"
    }

    source_node = salmonella_nodes[pair[0]]
    target_node = human_nodes[pair[1]]
    db_api.insert_edge(source_node, target_node, attributes)

db_api.save_db_to_file(args.outfile)

# the reported file name is the requested output name plus the ".db" suffix
saved_as = args.outfile + ".db"
print("The database was saved to %s" % (saved_as))

Exemplo n.º 2

0

Exibir arquivo

def main():
    """Map the nodes of the source database to UniProt and rebuild its edges.

    The database at SOURCE_DB_LOCATION is loaded and each of its nodes is
    looked up in the dictionary database (through DICTIONARY_DB_CURSOR),
    using the parts after the ':' of the node's name and tax_id.  Matching
    primary UniProt accessions are added to a new PsimiSQL database via
    add_node().  Edges whose two interactors were both mapped are then
    re-inserted for every combination of their new node ids, and the new
    database is saved to DESTINATION_DB_LOCATION.
    """
    # opening the old_db for mapping
    old_db = PsimiSQL()
    old_db.import_from_db_file(SOURCE_DB_LOCATION)

    # total number of nodes, only used for the progress message
    old_db.cursor.execute("SELECT count(*) FROM node")
    number_of_nodes = old_db.cursor.fetchone()[0]

    # old node id -> iterable of new node ids
    # NOTE(review): presumably filled in by add_node(); confirm in its source
    old_node_ids_dict = {}

    # the empty database that receives the mapped nodes and edges
    new_db = PsimiSQL()

    # nodes without any dictionary hit / edges with an unmapped interactor
    no_match_counter = 0
    invalid_node_counter = 0

    # the lookup statement never changes, so it is built once
    mapping_query = """
        SELECT DISTINCT foreign_ids.accession, uniprot.accession, uniprot.is_swissprot, uniprot.is_primary
        FROM foreign_ids JOIN uniprot ON foreign_ids.uniprot_id = uniprot.id
        WHERE foreign_ids.accession = ? AND uniprot.tax_id = ? AND uniprot.is_primary = 1
    """

    # looping through the old database's nodes
    old_db.cursor.execute("SELECT * FROM node")
    counter = 0

    while True:
        row = old_db.cursor.fetchone()

        # until the last row
        if row is None:
            break

        # informing the user (only for rows that really exist, 1-based)
        counter += 1
        sys.stdout.write("Querying %d. node from dictionary out of %d\r" %
                         (counter, number_of_nodes))

        row_id, mitab_name, alt_accession, mitab_tax_id, pathways, aliases, topology = row

        # the dictionary is queried with the bare values, without the
        # "<prefix>:" part
        tax_id = str(mitab_tax_id.split(':')[1])
        name = str(mitab_name.split(':')[1])

        old_node_dict = {
            "id": row_id,
            "name": mitab_name,
            "alt_accession": alt_accession,
            "tax_id": mitab_tax_id,
            "pathways": pathways,
            "aliases": aliases,
            "topology": topology
        }

        DICTIONARY_DB_CURSOR.execute(mapping_query, (name, tax_id))
        DICTIONARY_DB.commit()

        result = DICTIONARY_DB_CURSOR.fetchall()

        if not result:
            # there is no match in the map for the current node
            no_match_counter += 1
            continue

        # splitting the hits into swissprot and trembl entries
        swiss_nodes = get_swiss_arr(result)
        trembl_nodes = get_trembl_arr(result)

        # the new aliases are generated from the trembl hits
        new_aliases = get_aliases_string(trembl_nodes)

        if len(swiss_nodes) > 0:
            # every swissprot hit becomes a node (one or many)
            for node in swiss_nodes:
                swiss_accession = "uniprot:" + node[1]
                add_node(old_node_dict, old_node_ids_dict, swiss_accession,
                         new_db, new_aliases)
        else:
            # trembl nodes are only added if there is no swissprot hit
            for node in trembl_nodes:
                trembl_accession = "trembl:" + node[1]
                add_node(old_node_dict, old_node_ids_dict, trembl_accession,
                         new_db, new_aliases)

    print("Inserting to %s nodes done" % SOURCE_DB_TYPE)

    # total number of edges, only used for the progress message
    old_db.cursor.execute("SELECT count(*) FROM edge")
    number_of_edges = old_db.cursor.fetchone()[0]
    edge_counter = 0

    old_db.cursor.execute("SELECT * from edge")

    while True:
        # informing the user
        sys.stdout.write("Parsing edge # %d out of %d\r" %
                         (edge_counter, number_of_edges))
        row = old_db.cursor.fetchone()

        if row is None:
            break

        edge_counter += 1

        # deconstructing the row
        (edge_id, old_interactor_a_node_id, old_interactor_b_node_id,
         interactor_a_node_name, interactor_b_node_name,
         interaction_detection_method, first_author, publication_ids,
         interaction_types, source_db, interaction_identifiers,
         confidence_scores, layer) = row

        # an edge can only be inserted if both of its old nodes were mapped;
        # `in` replaces dict.has_key(), which no longer exists in Python 3
        if (old_interactor_a_node_id not in old_node_ids_dict
                or old_interactor_b_node_id not in old_node_ids_dict):
            invalid_node_counter += 1
            continue

        # every new 'A' node is paired with every new 'B' node
        for new_node_id_a in old_node_ids_dict[old_interactor_a_node_id]:

            new_node_a_dict = new_db.get_node_by_id(new_node_id_a)

            for new_node_id_b in old_node_ids_dict[old_interactor_b_node_id]:

                new_node_b_dict = new_db.get_node_by_id(new_node_id_b)

                # generating the new edge dict
                new_edge_dict = {
                    'interactor_a_node_id': new_node_id_a,
                    'interactor_b_node_id': new_node_id_b,
                    'interactor_a_node_name': interactor_a_node_name,
                    'interactor_b_node_name': interactor_b_node_name,
                    'interaction_detection_method':
                    interaction_detection_method,
                    'first_author': first_author,
                    'publication_ids': publication_ids,
                    'source_db': "source database:" + SOURCE_DB_TYPE,
                    'interaction_types': interaction_types,
                    'interaction_identifiers': interaction_identifiers,
                    'confidence_scores': confidence_scores,
                    'layer': layer
                }

                # inserting the new edge
                new_db.insert_edge(new_node_a_dict, new_node_b_dict,
                                   new_edge_dict)

    print("Inserting edges to %s.db finished!" % SOURCE_DB_TYPE)

    # reporting what could not be mapped
    print("Nodes without a match in the dictionary: %d" % no_match_counter)
    print("Edges skipped because an interactor was not mapped: %d" %
          invalid_node_counter)

    new_db.save_db_to_file(DESTINATION_DB_LOCATION)

Exemplo n.º 3

0

Exibir arquivo

Arquivo: merge_layer.py Projeto: blaisehorvath/host-pathogen-networks

def main():
    """Merge several SQLite database pieces into one PsimiSQL database.

    Every file in SOURCE_DB_FILE_LIST is read twice.  The first pass
    collects the nodes whose tax_id is 'taxid:9606' or 'taxid:99284',
    keyed by name; nodes seen more than once are combined with
    get_union_of_nodes().  The second pass collects the edges whose two
    interactor names are both among those nodes, keyed by
    "<name a>@<name b>"; edges seen more than once are combined field by
    field with merge_strings().  Nodes and edges are inserted into a new
    PsimiSQL database, which is saved to DESTINATION.
    """
    # name -> node dict and "<a>@<b>" -> edge dict
    nodes = {}
    collected_edges = {}

    # the number of pieces (.db files)
    sum_files = len(SOURCE_DB_FILE_LIST)

    # first pass: filling up the nodes dictionary
    for file_number, db_file in enumerate(SOURCE_DB_FILE_LIST, 1):

        db = sqlite3.connect(db_file)
        try:
            cursor = db.cursor()
            cursor.execute("SELECT * FROM node WHERE tax_id = 'taxid:9606' OR tax_id = 'taxid:99284'")

            # iterating through the result row by row
            for row in cursor:
                # the row id of the piece is not kept
                node_id, name, alt_accession, tax_id, pathways, aliases, topology = row
                node = {
                    "name": name,
                    "alt_accession": alt_accession,
                    "tax_id": tax_id,
                    "pathways": pathways,
                    "aliases": aliases,
                    "topology": topology,
                }
                # a node that was already seen is merged with the stored one
                if name in nodes:
                    nodes[name] = get_union_of_nodes(nodes[name], node)
                else:
                    nodes[name] = node
        finally:
            # closing the current db even if reading it failed
            db.close()

        # logging out some info
        sys.stdout.write("Building the node dictionary: Processing %d file out of %d\r" % (file_number, sum_files))

    # inserting the unique nodes from the nodes dictionary into a new database
    print("Inserting nodes to database")
    parser = PsimiSQL()
    for node in nodes.values():
        parser.insert_unique_node(node)
        # NOTE(review): relies on insert_unique_node() always performing an
        # INSERT on parser.cursor; confirm, otherwise lastrowid may be stale
        node["id"] = parser.cursor.lastrowid

    # the edge fields that are merged when an edge occurs more than once
    # NOTE(review): publication_ids and layer keep the value of the first
    # occurrence, as in the original code; confirm this is intended
    merged_fields = (
        "interaction_types",
        "first_author",
        "source_db",
        "interaction_identifiers",
        "interaction_detection_method",
        "confidence_scores",
    )

    # second pass: looping through the files again to build the edge dict
    print("Started building edge dict")
    for file_counter, db_file in enumerate(SOURCE_DB_FILE_LIST, 1):

        sys.stdout.write("Inserting edges to edge dict from '%s' (%d/%d)\r" % (db_file, file_counter, sum_files))

        db = sqlite3.connect(db_file)
        try:
            cursor = db.cursor()
            cursor.execute("SELECT * FROM edge")

            for row in cursor:
                # deconstructing the row
                (edge_row_id, old_interactor_a_node_id, old_interactor_b_node_id,
                 interactor_a_node_name, interactor_b_node_name,
                 interaction_detection_method, first_author, publication_ids,
                 interaction_types, source_db, interaction_identifiers,
                 confidence_scores, layer) = row

                # only human and salmonella nodes were collected, so edges
                # that touch a node of any other organism are skipped
                if interactor_a_node_name not in nodes or interactor_b_node_name not in nodes:
                    continue

                # the key of the edge in the collected_edges dict
                edge_id = interactor_a_node_name + "@" + interactor_b_node_name

                current_edge = {
                    "interaction_detection_method": interaction_detection_method,
                    "first_author": first_author,
                    "publication_ids": publication_ids,
                    "interaction_types": interaction_types,
                    "source_db": source_db,
                    "interaction_identifiers": interaction_identifiers,
                    "confidence_scores": confidence_scores,
                    "layer": layer,
                }

                if edge_id not in collected_edges:
                    # first occurrence: the edge is stored as it is
                    collected_edges[edge_id] = current_edge
                else:
                    # later occurrences are merged into the stored edge
                    collected_edge = collected_edges[edge_id]
                    for field in merged_fields:
                        collected_edge[field] = merge_strings(collected_edge[field], current_edge[field])
        finally:
            # the original never closed the connections of the second pass
            db.close()

    print("Building edge dict done!")
    print("Started inserting edges to the db")
    # iterating through the edges dictionary and inserting the edges
    for collected_edge_id, edge_to_insert in collected_edges.items():
        # the key holds the names of the two nodes
        node_a, node_b = collected_edge_id.split("@")

        parser.insert_edge(nodes[node_a], nodes[node_b], edge_to_insert)

    print("Saving db")
    parser.save_db_to_file(DESTINATION)

Exemplo n.º 4

0

Exibir arquivo

Arquivo: database_parser.py Projeto: blaisehorvath/host-pathogen-networks

def main():
    """Parse a tab separated MITAB-like file into a PsimiSQL database.

    The command line arguments (parse_arguments()) supply the directory of
    the sqlite_db_api module, the source file, whether its first line is a
    header, and the output file.  Every data line must have exactly 18 tab
    separated columns.  The nodes of all lines are collected with
    add_to_nodes(), the edges are keyed by "<source>@<target>" (a later
    line with the same pair replaces the earlier one), then nodes and
    edges are inserted into a new PsimiSQL database that is saved to the
    output file.
    """
    # parsing the arguments
    arguments = parse_arguments()

    # importing the PsimiSQL class from the directory given by the user
    sys.path.append(arguments.sqlite_db_api)
    from sqlite_db_api import PsimiSQL

    # the nodes and the edges will be stored in a dict
    nodes = {}
    edges = {}

    # Parsing the nodes first and collecting them in the nodes dict
    # (querying and updating each node with SQL statements would be slow)
    with open(arguments.source_file) as source_file:
        # counting the lines for the progress message, then rewinding
        num_lines = float(sum(1 for _ in source_file))
        source_file.seek(0)

        # the header has to be skipped AFTER rewinding, otherwise seek(0)
        # brings it back and it is parsed as a data line
        if arguments.skip_header:
            next(source_file, None)
            num_lines -= 1

        # looping through the file
        for line_counter, line in enumerate(source_file, 1):
            # informing the user
            if line_counter % 50 == 0:
                done = (line_counter / num_lines) * 100
                sys.stdout.write("Parsing mitab file (%d%%)\r" % (done))

            # deconstructing the line
            (source_acc, target_acc, source_alt_acc, target_alt_acc,
             source_alias, target_alias, int_det_method, author, pubmed_ids,
             source_tax_id, target_tax_id, int_type, source_db, confidence,
             pathway_ids, layer, source_topology,
             target_topology) = line.strip().split("\t")

            source_dict = {
                "name": source_acc,
                "alt_accession": source_alt_acc,
                "tax_id": source_tax_id,
                "pathways": pathway_ids,
                "aliases": source_alias,
                "topology": source_topology
            }

            add_to_nodes(source_dict, nodes)

            target_dict = {
                "name": target_acc,
                "alt_accession": target_alt_acc,
                "tax_id": target_tax_id,
                "pathways": pathway_ids,
                "aliases": target_alias,
                "topology": target_topology
            }

            add_to_nodes(target_dict, nodes)

            # adding the edge to the edges dict
            edges["%s@%s" % (source_acc, target_acc)] = {
                'interaction_detection_method': int_det_method,
                'first_author': author,
                'publication_ids': pubmed_ids,
                'interaction_types': int_type,
                'source_db': source_db,
                'interaction_identifiers': '-',
                'confidence_scores': confidence,
                'layer': layer
            }

    # informing the user
    print("Parsing MiTAB file: Finished")

    # now that we have the unique nodes we can add them to the database
    db_api = PsimiSQL()

    num_nodes = float(len(nodes))

    # inserting the nodes to the db
    for node_counter, node_dict in enumerate(nodes.values(), 1):

        # informing the user
        if node_counter % 50 == 0:
            done = (node_counter / num_nodes) * 100
            sys.stdout.write("Inserting nodes to NetMiTabSQL (%s%%)\r" % (done))

        db_api.insert_node(node_dict)

        # storing the SQL row id in the node dict itself so it can be used
        # when the edges are inserted (the original wrote it to
        # nodes["id"], which never reached the node and added a bogus key)
        # NOTE(review): last_row_id is taken over from the original code;
        # confirm that PsimiSQL exposes it
        node_dict["id"] = db_api.last_row_id

    print("Inserting nodes to NetMiTabSQL: Done")

    # the progress of the edges is measured against the number of edges
    num_edges = float(len(edges))

    # inserting the edges to the db
    for edge_counter, (edge_id, edge_dict) in enumerate(edges.items(), 1):

        # informing the user
        if edge_counter % 50 == 0:
            done = (edge_counter / num_edges) * 100
            sys.stdout.write("Inserting edges to NetMiTabSQL (%s%%)\r" % (done))

        source_name, target_name = edge_id.split('@')

        db_api.insert_edge(nodes[source_name], nodes[target_name], edge_dict)

    print("Inserting edges to NetMiTabSQL: Done")

    print("Saving the database to filesystem")
    # the database is finished, saving
    db_api.save_db_to_file(arguments.output_file)

    print("Database saved")

Exemplo n.º 5

0

Exibir arquivo

# Now that the nodes are in the final db, the edges can be inserted.
# No edges dictionary is built: the edges are read straight from the piece
# files again and inserted one by one.
print('Inserting edges to database')
for filename in PIECE_LIST:
    db = sqlite3.connect(filename)
    try:
        query = "SELECT * FROM edge"
        cursor = db.cursor()
        cursor.execute(query)

        # iterating through the edges of the current piece .db file
        for edge_row in cursor:
            # columns 5-11 hold the edge attributes; the layer is not taken
            # from the piece but set to "3" for every edge
            edge_dict = {
                'interaction_detection_method': edge_row[5],
                'first_author': edge_row[6],
                'publication_ids': edge_row[7],
                'interaction_types': edge_row[8],
                'source_db': edge_row[9],
                'interaction_identifiers': edge_row[10],
                'confidence_scores': edge_row[11],
                'layer': "3"
            }
            # columns 3 and 4 are the keys of the two interactors in nodes
            parser.insert_edge(nodes[edge_row[3]], nodes[edge_row[4]], edge_dict)
    finally:
        # every piece is closed once it was read (the original leaked them)
        db.close()

print('Saving database as biogrid_merged.db')
parser.save_db_to_file('biogrid_merged')

Exemplo n.º 6

0

Exibir arquivo

def main():
    """Merge several SQLite database pieces into one PsimiSQL database.

    Every file in SOURCE_DB_FILE_LIST is read twice.  The first pass
    collects the nodes whose tax_id is 'taxid:9606' or 'taxid:99284',
    keyed by name; nodes seen more than once are combined with
    get_union_of_nodes().  The second pass collects the edges whose two
    interactor names are both among those nodes, keyed by
    "<name a>@<name b>"; edges seen more than once are combined field by
    field with merge_strings().  Nodes and edges are inserted into a new
    PsimiSQL database, which is saved to DESTINATION.
    """
    # name -> node dict and "<a>@<b>" -> edge dict
    nodes = {}
    collected_edges = {}

    # the number of pieces (.db files)
    sum_files = len(SOURCE_DB_FILE_LIST)

    # first pass: filling up the nodes dictionary
    for file_number, db_file in enumerate(SOURCE_DB_FILE_LIST, 1):

        db = sqlite3.connect(db_file)
        try:
            cursor = db.cursor()
            cursor.execute(
                "SELECT * FROM node WHERE tax_id = 'taxid:9606' OR tax_id = 'taxid:99284'"
            )

            # iterating through the result row by row
            for row in cursor:
                # the row id of the piece is not kept
                node_id, name, alt_accession, tax_id, pathways, aliases, topology = row
                node = {
                    "name": name,
                    'alt_accession': alt_accession,
                    'tax_id': tax_id,
                    'pathways': pathways,
                    'aliases': aliases,
                    'topology': topology
                }
                # a node that was already seen is merged with the stored one
                if name in nodes:
                    nodes[name] = get_union_of_nodes(nodes[name], node)
                else:
                    nodes[name] = node
        finally:
            # closing the current db even if reading it failed
            db.close()

        # logging out some info
        sys.stdout.write(
            "Building the node dictionary: Processing %d file out of %d\r" %
            (file_number, sum_files))

    # inserting the unique nodes from the nodes dictionary into a new database
    print('Inserting nodes to database')
    parser = PsimiSQL()
    for node in nodes.values():
        parser.insert_unique_node(node)
        # NOTE(review): relies on insert_unique_node() always performing an
        # INSERT on parser.cursor; confirm, otherwise lastrowid may be stale
        node['id'] = parser.cursor.lastrowid

    # the edge fields that are merged when an edge occurs more than once
    # NOTE(review): publication_ids and layer keep the value of the first
    # occurrence, as in the original code; confirm this is intended
    merged_fields = (
        'interaction_types',
        'first_author',
        'source_db',
        'interaction_identifiers',
        'interaction_detection_method',
        'confidence_scores',
    )

    # second pass: looping through the files again to build the edge dict
    print("Started building edge dict")
    for file_counter, db_file in enumerate(SOURCE_DB_FILE_LIST, 1):

        sys.stdout.write("Inserting edges to edge dict from '%s' (%d/%d)\r" %
                         (db_file, file_counter, sum_files))

        db = sqlite3.connect(db_file)
        try:
            cursor = db.cursor()
            cursor.execute("SELECT * FROM edge")

            for row in cursor:
                # deconstructing the row
                (edge_row_id, old_interactor_a_node_id,
                 old_interactor_b_node_id, interactor_a_node_name,
                 interactor_b_node_name, interaction_detection_method,
                 first_author, publication_ids, interaction_types, source_db,
                 interaction_identifiers, confidence_scores, layer) = row

                # only human and salmonella nodes were collected, so edges
                # that touch a node of any other organism are skipped
                if (interactor_a_node_name not in nodes
                        or interactor_b_node_name not in nodes):
                    continue

                # the key of the edge in the collected_edges dict
                edge_id = interactor_a_node_name + "@" + interactor_b_node_name

                current_edge = {
                    'interaction_detection_method':
                    interaction_detection_method,
                    'first_author': first_author,
                    'publication_ids': publication_ids,
                    'interaction_types': interaction_types,
                    'source_db': source_db,
                    'interaction_identifiers': interaction_identifiers,
                    'confidence_scores': confidence_scores,
                    'layer': layer
                }

                if edge_id not in collected_edges:
                    # first occurrence: the edge is stored as it is
                    collected_edges[edge_id] = current_edge
                else:
                    # later occurrences are merged into the stored edge
                    collected_edge = collected_edges[edge_id]
                    for field in merged_fields:
                        collected_edge[field] = merge_strings(
                            collected_edge[field], current_edge[field])
        finally:
            # the original never closed the connections of the second pass
            db.close()

    print("Building edge dict done!")
    print("Started inserting edges to the db")
    # iterating through the edges dictionary and inserting the edges
    for collected_edge_id, edge_to_insert in collected_edges.items():
        # the key holds the names of the two nodes
        node_a, node_b = collected_edge_id.split('@')

        parser.insert_edge(nodes[node_a], nodes[node_b], edge_to_insert)

    print("Saving db")
    parser.save_db_to_file(DESTINATION)

Exemplo n.º 7

0

Exibir arquivo

            human_node = {
                'name': "uniprot:" + human_uniprot,
                'alt_accession': "gene symbol:" + human_gene_symbol,
                'tax_id': "taxid:9606",
                'pathways': "",
                'aliases': "",
                'topology': ""
            }

            nodes[human_uniprot] = human_node
            nodes[human_uniprot]["id"] = db_api.insert_node(human_node)

        edge_dict = {
            'interaction_detection_method': "",
            'first_author': "",
            'publication_ids': "",
            'interaction_types': "",
            'source_db': "prediction-DD",
            'interaction_identifiers': "",
            'confidence_scores': "",
            'layer': "0"
        }

        db_api.insert_edge(nodes[salmonella_uniprot], nodes[human_uniprot],
                           edge_dict)

    # saving database

    db_api.save_db_to_file(args.outfile)

    print("The database was saved to %s" % (args.outfile))

Exemplo n.º 8

0

Exibir arquivo

Arquivo: map_to_uniprot.py Projeto: blaisehorvath/host-pathogen-networks

def main():
    """Map the nodes of the source database to UniProt and rebuild its edges.

    The database at SOURCE_DB_LOCATION is loaded and each of its nodes is
    looked up in the dictionary database (through DICTIONARY_DB_CURSOR),
    using the parts after the ':' of the node's name and tax_id.  Matching
    primary UniProt accessions are added to a new PsimiSQL database via
    add_node().  Edges whose two interactors were both mapped are then
    re-inserted for every combination of their new node ids, and the new
    database is saved to DESTINATION_DB_LOCATION.
    """
    # opening the old_db for mapping
    old_db = PsimiSQL()
    old_db.import_from_db_file(SOURCE_DB_LOCATION)

    # total number of nodes, only used for the progress message
    old_db.cursor.execute("SELECT count(*) FROM node")
    number_of_nodes = old_db.cursor.fetchone()[0]

    # old node id -> iterable of new node ids
    # NOTE(review): presumably filled in by add_node(); confirm in its source
    old_node_ids_dict = {}

    # the empty database that receives the mapped nodes and edges
    new_db = PsimiSQL()

    # nodes without any dictionary hit / edges with an unmapped interactor
    no_match_counter = 0
    invalid_node_counter = 0

    # the lookup statement never changes, so it is built once
    mapping_query = """
        SELECT DISTINCT foreign_ids.accession, uniprot.accession, uniprot.is_swissprot, uniprot.is_primary
        FROM foreign_ids JOIN uniprot ON foreign_ids.uniprot_id = uniprot.id
        WHERE foreign_ids.accession = ? AND uniprot.tax_id = ? AND uniprot.is_primary = 1
    """

    # looping through the old database's nodes
    old_db.cursor.execute("SELECT * FROM node")
    counter = 0

    while True:
        row = old_db.cursor.fetchone()

        # until the last row
        if row is None:
            break

        # informing the user (only for rows that really exist, 1-based)
        counter += 1
        sys.stdout.write("Querying %d. node from dictionary out of %d\r" % (counter, number_of_nodes))

        row_id, mitab_name, alt_accession, mitab_tax_id, pathways, aliases, topology = row

        # the dictionary is queried with the bare values, without the
        # "<prefix>:" part
        tax_id = str(mitab_tax_id.split(':')[1])
        name = str(mitab_name.split(':')[1])

        old_node_dict = {
            "id": row_id,
            "name": mitab_name,
            "alt_accession": alt_accession,
            "tax_id": mitab_tax_id,
            "pathways": pathways,
            "aliases": aliases,
            "topology": topology
        }

        DICTIONARY_DB_CURSOR.execute(mapping_query, (name, tax_id))
        DICTIONARY_DB.commit()

        result = DICTIONARY_DB_CURSOR.fetchall()

        if not result:
            # there is no match in the map for the current node
            no_match_counter += 1
            continue

        # splitting the hits into swissprot and trembl entries
        swiss_nodes = get_swiss_arr(result)
        trembl_nodes = get_trembl_arr(result)

        # the new aliases are generated from the trembl hits
        new_aliases = get_aliases_string(trembl_nodes)

        if len(swiss_nodes) > 0:
            # every swissprot hit becomes a node (one or many)
            for node in swiss_nodes:
                swiss_accession = "uniprot:" + node[1]
                add_node(old_node_dict, old_node_ids_dict, swiss_accession, new_db, new_aliases)
        else:
            # trembl nodes are only added if there is no swissprot hit
            for node in trembl_nodes:
                trembl_accession = "trembl:" + node[1]
                add_node(old_node_dict, old_node_ids_dict, trembl_accession, new_db, new_aliases)

    print("Inserting to %s nodes done" % SOURCE_DB_TYPE)

    # total number of edges, only used for the progress message
    old_db.cursor.execute("SELECT count(*) FROM edge")
    number_of_edges = old_db.cursor.fetchone()[0]
    edge_counter = 0

    old_db.cursor.execute("SELECT * from edge")

    while True:
        # informing the user
        sys.stdout.write("Parsing edge # %d out of %d\r" % (edge_counter, number_of_edges))
        row = old_db.cursor.fetchone()

        if row is None:
            break

        edge_counter += 1

        # deconstructing the row
        (edge_id, old_interactor_a_node_id, old_interactor_b_node_id,
         interactor_a_node_name, interactor_b_node_name,
         interaction_detection_method, first_author, publication_ids,
         interaction_types, source_db, interaction_identifiers,
         confidence_scores, layer) = row

        # an edge can only be inserted if both of its old nodes were mapped;
        # `in` replaces dict.has_key(), which no longer exists in Python 3
        if (old_interactor_a_node_id not in old_node_ids_dict
                or old_interactor_b_node_id not in old_node_ids_dict):
            invalid_node_counter += 1
            continue

        # every new 'A' node is paired with every new 'B' node
        for new_node_id_a in old_node_ids_dict[old_interactor_a_node_id]:

            new_node_a_dict = new_db.get_node_by_id(new_node_id_a)

            for new_node_id_b in old_node_ids_dict[old_interactor_b_node_id]:

                new_node_b_dict = new_db.get_node_by_id(new_node_id_b)

                # generating the new edge dict
                new_edge_dict = {
                    'interactor_a_node_id': new_node_id_a,
                    'interactor_b_node_id': new_node_id_b,
                    'interactor_a_node_name': interactor_a_node_name,
                    'interactor_b_node_name': interactor_b_node_name,
                    'interaction_detection_method': interaction_detection_method,
                    'first_author': first_author,
                    'publication_ids': publication_ids,
                    'source_db': "source database:" + SOURCE_DB_TYPE,
                    'interaction_types': interaction_types,
                    'interaction_identifiers': interaction_identifiers,
                    'confidence_scores': confidence_scores,
                    'layer': layer
                }

                # inserting the new edge
                new_db.insert_edge(new_node_a_dict, new_node_b_dict, new_edge_dict)

    print("Inserting edges to %s.db finished!" % SOURCE_DB_TYPE)

    # reporting what could not be mapped
    print("Nodes without a match in the dictionary: %d" % no_match_counter)
    print("Edges skipped because an interactor was not mapped: %d" % invalid_node_counter)

    new_db.save_db_to_file(DESTINATION_DB_LOCATION)

Exemplo n.º 9

0

Exibir arquivo

Arquivo: map_trembl2swiss.py Projeto: blaisehorvath/host-pathogen-networks

def main():
    """Re-key every node and edge of each source database to new UniProt ids.

    For every SQLite file listed in ``args.source_files`` the ``node`` and
    ``edge`` tables are read, each node name is translated with
    ``get_primary`` (presumably a TrEMBL -> primary/SwissProt accession
    lookup -- confirm against its definition), and the translated nodes and
    edges are inserted into a fresh ``PsimiSQL`` database that is saved as
    ``<args.outdir>/mapped<original file name>``.

    Edges whose source or target node name is not present in the node table
    are counted and skipped.
    """

    for db in args.source_files:

        # informing the user
        print("Parsing %s" % db)

        # Read both tables up front and close the connection right away, so
        # that no SQLite connection is leaked per processed source file.
        connection = sqlite3.connect(db)
        try:
            cursor = connection.cursor()

            cursor.execute("SELECT * FROM node")
            node_rows = cursor.fetchall()

            cursor.execute("SELECT * FROM edge")
            edge_rows = cursor.fetchall()
        finally:
            connection.close()

        # new uniprot id -> node dict (including the id assigned by new_db)
        mapped_nodes = {}
        # old node name -> new uniprot id
        nodemap = {}

        new_db = PsimiSQL()

        # mapping nodes

        print("Mapping nodes")
        length = len(node_rows)

        for current, line in enumerate(node_rows, start=1):

            # informing user
            if current % 50 == 0:
                print("Mapping nodes %d/%d" % (current, length))

            row_id, name, alt_accession, tax_id, pathways, aliases, topology = line

            old_uniprot = name

            # node names look like "<prefix>:<accession>"; only the accession
            # part is handed to get_primary
            new_uniprot = "uniprot:" + get_primary(old_uniprot.split(':')[1])

            # storing the new uniprot id for every old id
            nodemap[old_uniprot] = new_uniprot

            mapped_node = {
                'name': new_uniprot,
                'alt_accession': alt_accession,
                'tax_id': tax_id,
                'pathways': pathways,
                'aliases': aliases,
                'topology': topology
            }

            mapped_node['id'] = new_db.insert_node(mapped_node)

            mapped_nodes[new_uniprot] = mapped_node

        # fewer distinct old names than rows means the node table contained
        # duplicated names
        if len(nodemap) != length:
            print("Gebasz")

        # mapping edges

        print("Mapping edges")
        length = len(edge_rows)
        skipped_edges = 0

        for current, row in enumerate(edge_rows, start=1):

            if current % 10 == 0:
                print("Parsing edge %d/%d" % (current, length))

            old_source_uniprot = row[3]
            old_target_uniprot = row[4]

            # NOTE: the layer is always written as "0", whatever the source
            # row holds.
            edge_dict = {
                'interaction_detection_method': row[5],
                'first_author': row[6],
                'publication_ids': row[7],
                'interaction_types': row[8],
                'source_db': row[9],
                'interaction_identifiers': row[10],
                'confidence_scores': row[11],
                'layer': "0"
            }

            # The edge rows still carry the OLD node names, while mapped_nodes
            # is keyed by the NEW uniprot ids, so translate through nodemap
            # first. Looking the old name up directly would drop every edge
            # whose node id actually changed.
            new_source_uniprot = nodemap.get(old_source_uniprot)
            new_target_uniprot = nodemap.get(old_target_uniprot)

            if new_source_uniprot is None or new_target_uniprot is None:
                skipped_edges += 1
            else:
                new_db.insert_edge(mapped_nodes[new_source_uniprot],
                                   mapped_nodes[new_target_uniprot],
                                   edge_dict)

        # saving the mapped db and informing user

        db_name = os.path.split(db)[1]
        destination = os.path.join(args.outdir, "mapped" + db_name)

        print("Saving db to %s " % destination)
        print("SHITCOUNTER %d" % skipped_edges)

        new_db.save_db_to_file(destination)