def insert_new_proteins_n_alignments(session, target_sequence, record):
    """Store the hits of one BLAST record in the knowledge graph.

    For the record as a whole, the source database entity is ensured to
    exist. For every alignment the matched protein is ensured to exist, and
    for every HSP of that alignment the matched sequence, its
    sourcing-of-information relation to the database and its
    sequence-sequence-alignment relation to ``target_sequence`` are ensured
    to exist. When the hit definition names a species in square brackets,
    the species and its protein-ownership relation are ensured as well.

    Every write goes through ``insert_if_non_existent`` (a "get" query plus
    an "insert" query); its return value is used as the concept id in the
    follow-up queries.

    Args:
        session: session object handed through to ``insert_if_non_existent``.
        target_sequence: the queried sequence the hits are aligned against.
        record: BLAST record exposing ``database`` and ``alignments``; each
            alignment exposes ``hit_def``, ``hit_id``, ``length`` and
            ``hsps``; each HSP exposes ``sbjct``, ``positives``,
            ``identities``, ``gaps`` and ``match``
            (presumably a Biopython NCBIXML record - confirm with caller).

    NOTE(review): values are interpolated into the query text without
    escaping, so a name containing a double quote will break the query.
    """
    database = record.database

    # insert the database entity (if it doesn't already exist)
    q_get_database = 'match $db isa database, has name "' + database + '"; get $db;'
    q_insert_database = 'insert $db isa database, has name "' + database + '";'
    database_id = insert_if_non_existent(session, q_get_database, q_insert_database, "$db")

    for alignment in record.alignments:
        # insert the protein entity (if it doesn't already exist);
        # the name is the hit definition up to the first " >" or ";"
        protein_name = alignment.hit_def.split(" >")[0].split(";")[0]
        q_get_protein = 'match $pr isa protein, has name "' + protein_name + '"; get $pr;'
        q_insert_protein = 'insert $pr isa protein, has name "' + protein_name + '";'
        protein_id = insert_if_non_existent(session, q_get_protein, q_insert_protein, "$pr")

        for hsp in alignment.hsps:
            sequence = hsp.sbjct

            # insert the sequence attribute for the protein entity
            # (if it doesn't already exist)
            q_get_protein_sequence = (
                'match $pr id ' + protein_id + ', has sequence $seq; $seq "' + sequence + '"; get $seq;')
            q_insert_protein_sequence = (
                'match $pr id ' + protein_id + '; insert $seq isa sequence; $seq "' + sequence
                + '"; $pr has sequence $seq;')
            insert_if_non_existent(session, q_get_protein_sequence, q_insert_protein_sequence, "$seq")

            # insert the sourcing-of-information relationship
            # (if it doesn't already exist)
            q_get_sourcing = (
                'match $seq isa sequence; $seq "' + sequence + '"; $db id ' + database_id + '; ' +
                '$sourcing (information-source: $db, sourced-information: $seq) isa sourcing-of-information; ' +
                'get $sourcing;')
            q_insert_sourcing = (
                'match $seq isa sequence; $seq "' + sequence + '"; $db id ' + database_id + ';' +
                'insert $sourcing (information-source: $db, sourced-information: $seq) isa sourcing-of-information;'
            )
            insert_if_non_existent(session, q_get_sourcing, q_insert_sourcing, "$sourcing")

            # insert the alignment relationship (if it doesn't already exist);
            # scores are stored as fractions of the alignment length
            sequence_positivity = round(hsp.positives / alignment.length, 3)
            sequence_identicality = round(hsp.identities / alignment.length, 3)
            sequence_gaps = round(hsp.gaps / alignment.length, 5)
            sequence_midline = hsp.match
            # fourth "|"-separated field of the hit id
            alignment_identifier = alignment.hit_id.split("|", 4)[3]

            q_get_alignment = (
                'match $target-seq isa sequence; $target-seq "' + target_sequence + '"; ' +
                '$matched-seq isa sequence; $matched-seq "' + sequence + '"; ' +
                '$alignment (target-sequence: $target-seq, matched-sequence: $matched-seq) isa sequence-sequence-alignment; ' +
                'get $alignment;')
            q_insert_alignment = (
                'match $target-seq isa sequence; $target-seq "' + target_sequence + '"; ' +
                '$matched-seq isa sequence; $matched-seq "' + sequence + '"; ' +
                'insert $alignment (target-sequence: $target-seq, matched-sequence: $matched-seq) isa sequence-sequence-alignment; ' +
                '$alignment has sequence-positivity ' + str(sequence_positivity) +
                ', has sequence-identicality ' + str(sequence_identicality) +
                ', has sequence-gaps ' + str(sequence_gaps) +
                ', has sequence-midline "' + sequence_midline + '"' +
                ', has identifier "' + alignment_identifier + '";')
            insert_if_non_existent(session, q_get_alignment, q_insert_alignment, "$alignment")

        # insert the species entity (if it doesn't already exist).
        # The species is read from the alignment's hit definition only
        # (text between the first "[" and the following "]"), so it is
        # handled once per alignment rather than once per HSP.
        if "[" in alignment.hit_def:
            species = alignment.hit_def.split("[")[1].split("]")[0]
            q_get_species = 'match $species isa species, has name "' + species + '"; get $species;'
            q_insert_species = 'insert $species isa species, has name "' + species + '"; '
            species_id = insert_if_non_existent(session, q_get_species, q_insert_species, "$species")

            # insert protein-ownership relationship (if it doesn't already exist).
            # The species id is written unquoted, exactly like every other
            # `id` reference in these queries.
            q_get_protein_ownership = (
                'match $sp id ' + species_id + '; ' +
                '$pr id ' + protein_id + '; ' +
                '$pr-ownership (species-owner: $sp, owned-protein: $pr) isa protein-ownership;' +
                'get $pr-ownership;')
            q_insert_protein_ownership = (
                'match $sp id ' + species_id + '; ' +
                '$pr id ' + protein_id + '; ' +
                'insert $pr-ownership (species-owner: $sp, owned-protein: $pr) isa protein-ownership;'
            )
            insert_if_non_existent(session, q_get_protein_ownership, q_insert_protein_ownership, "$pr-ownership")
species = protein_details[2] # insert the protein entity q_insert_protein = ("insert $pr isa protein " + 'has identifier "' + identifier + '" ' + 'has name "' + name + '" ' + 'has sequence "' + sequence + '";') protein_id = insert_anyway(session, q_insert_protein) # insert the sourcing-of-information relationship q_insert_sourcing_of_information = ( 'match $pr id ' + protein_id + '; ' + '$db id ' + db_id + '; ' + "insert (information-source: $db, sourced-information: $pr) isa sourcing-of-information;" ) insert_anyway(session, q_insert_sourcing_of_information) # insert the species entity (if it doesn't already exist) q_insert_species = 'insert $species isa species has name "' + species + '"; ' species_id = insert_if_non_existent(session, q_insert_species, "$species") # insert protein-ownership relationship (protein <> species) q_insert_protein_ownership = ( 'match $species id ' + species_id + '; ' + '$protein id ' + protein_id + '; ' + "insert (species-owner: $species, owned-protein: $protein) isa protein-ownership;" ) insert_anyway(session, q_insert_protein_ownership) print("- - - - - - - - - - - - - - - - -")
def init(data_path, uri="localhost:48555", keyspace="blast"):
    """Load the proteins of a FASTA file into the knowledge graph.

    1. opens a Grakn session on ``keyspace`` at ``uri``
    2. ensures the database entity named "uniprot" exists
    3. for each record of the FASTA file at ``data_path``, ensures the:
        - protein entity (identifier, name, sequence)
        - protein <> database relationship (sourcing-of-information)
        - species entity
        - species <> protein relationship (protein-ownership)

    Every write goes through ``insert_if_non_existent``; its return value
    is used as the concept id in the follow-up queries.

    Args:
        data_path: path of the FASTA file to read.
        uri: address of the Grakn server (defaults to the original
            hard-coded "localhost:48555").
        keyspace: keyspace to write to (defaults to the original
            hard-coded "blast").

    Raises:
        IndexError: if a header has no "|"-separated identifier or no
            " OS=" species field.

    NOTE(review): values are interpolated into the query text without
    escaping, so a name containing a double quote will break the query.
    """
    with GraknClient(uri=uri) as client:
        with client.session(keyspace=keyspace) as session:
            # insert the database entity
            q_get_database = 'match $db isa database, has name "uniprot"; get $db;'
            q_insert_database = 'insert $db isa database, has name "uniprot";'
            database_id = insert_if_non_existent(session, q_get_database, q_insert_database, "$db")

            with open(data_path) as data:
                for first_line, sequence in SimpleFastaParser(data):
                    # extract relevant data from the header of each fasta record.
                    # Only the first space separates the accession block from
                    # the description; splitting on commas would cut protein
                    # names such as "..., mitochondrial" in two and shift the
                    # species field.
                    accession, _, description = first_line.partition(" ")
                    identifier = accession.split("|")[1]
                    description_fields = re.split(" OS=| OX=", description)
                    name = description_fields[0]
                    species = description_fields[1]

                    # insert the protein entity
                    q_get_protein = ('match $pr isa protein ' +
                                     ', has identifier "' + identifier + '" ' +
                                     ', has name "' + name + '" ' +
                                     ', has sequence "' + sequence + '"; ' +
                                     'get $pr;')
                    q_insert_protein = ('insert $pr isa protein ' +
                                        ', has identifier "' + identifier + '" ' +
                                        ', has name "' + name + '" ' +
                                        ', has sequence "' + sequence + '";')
                    protein_id = insert_if_non_existent(
                        session, q_get_protein, q_insert_protein, "$pr")

                    # insert the sourcing-of-information relationship
                    q_get_sourcing_of_information = (
                        'match $pr id ' + protein_id + '; ' +
                        '$db id ' + database_id + '; ' +
                        '$sourcing (information-source: $db, sourced-information: $pr) isa sourcing-of-information; ' +
                        'get $sourcing;')
                    q_insert_sourcing_of_information = (
                        'match $pr id ' + protein_id + '; ' +
                        '$db id ' + database_id + '; ' +
                        'insert $sourcing (information-source: $db, sourced-information: $pr) isa sourcing-of-information;'
                    )
                    insert_if_non_existent(session, q_get_sourcing_of_information,
                                           q_insert_sourcing_of_information, "$sourcing")

                    # insert the species entity
                    q_get_species = 'match $species isa species, has name "' + species + '"; get $species;'
                    q_insert_species = 'insert $species isa species, has name "' + species + '";'
                    species_id = insert_if_non_existent(
                        session, q_get_species, q_insert_species, "$species")

                    # insert protein-ownership relationship
                    q_get_protein_ownership = (
                        'match $species id ' + species_id + '; ' +
                        '$protein id ' + protein_id + '; ' +
                        '$pr-ownership (species-owner: $species, owned-protein: $protein) isa protein-ownership; ' +
                        'get $pr-ownership;')
                    q_insert_protein_ownership = (
                        'match $species id ' + species_id + '; ' +
                        '$protein id ' + protein_id + '; ' +
                        'insert $pr-ownership (species-owner: $species, owned-protein: $protein) isa protein-ownership;'
                    )
                    insert_if_non_existent(session, q_get_protein_ownership,
                                           q_insert_protein_ownership, "$pr-ownership")