def hash_me(file_dict):
    '''
    Build the isolate / genome URIs for a single file.

    Args:
        file_dict(dict): must provide the keys 'basename', 'count' and
            'withpath'.

    Returns:
        dict: a one-entry mapping keyed by file_dict['basename'] whose value
            holds 'uriIsolate' (gu of ':spfy' + the count) and 'uriGenome'
            (gu of ':' + generate_hash of the 'withpath' entry).
    '''
    name = file_dict['basename']
    isolate_uri = gu(':spfy' + str(file_dict['count']))
    genome_uri = gu(':' + generate_hash(file_dict['withpath']))
    return {name: {'uriIsolate': isolate_uri, 'uriGenome': genome_uri}}
def spfyids_single(args_dict):
    '''
    Attach the isolate and genome URIs to args_dict.

    Args:
        args_dict(dict): must provide the key 'i', which is passed to
            generate_hash(); it is modified in place.

    Returns:
        dict: the same args_dict, now carrying 'uriIsolate' and 'uriGenome'.
    '''
    # The isolate number is read from settings for now.
    # This is temporary, TODO: include a SPARQL query to the db
    from settings import database

    isolate_uri = gu(':spfy' + str(database['count']))
    genome_uri = gu(':' + generate_hash(args_dict['i']))

    args_dict['uriIsolate'] = isolate_uri
    args_dict['uriGenome'] = genome_uri
    return args_dict
def parse_serotype(graph, serotyper_dict, uriIsolate):
    '''
    Add the serotype triples for an isolate to the graph.

    For each of the keys 'O type', 'H type' and 'K type' that is present in
    serotyper_dict, one triple (uriIsolate, predicate, Literal(value)) is
    added. Keys that are absent are skipped.

    Args:
        graph: the running graph; only its add() method is used
        serotyper_dict(dict): serotyping results keyed by antigen type
        uriIsolate: subject of every triple that gets added

    Returns:
        graph: the same graph object that was passed in
    '''
    # antigen key in serotyper_dict -> predicate handed to gu()
    antigen_predicates = (
        ('O type', 'ge:0001076'),
        ('H type', 'ge:0001077'),
        ('K type', 'ge:0001684'),
    )
    for antigen, predicate in antigen_predicates:
        if antigen not in serotyper_dict:
            continue
        graph.add(
            (uriIsolate, gu(predicate), Literal(serotyper_dict[antigen])))
    return graph
def generate_turtle_skeleton(graph, fasta_file, uriIsolate, uriGenome):
    '''
    Handles the main generation of a turtle object.

    NAMING CONVENTIONS:
    uriIsolate: this is the top-most entry; a unique id per file is
        allocated by checking our DB for the greatest entry (not done in
        this function)
        ex. :spfy234
    uriGenome: aka. the assembly / genome ID; according to the original
        notes this is a sha1 hash of the file contents (computed by the
        caller)
        ex. :4eb02f5676bc808f86c0f014bbce15775adf06ba
    uriContig: individual contig ids, built here as gu(':' + record.id)
        where record.id comes from SeqIO; this should be unique to a contig
        (at least within a given file)
        NOTE(review): the original notes gave the example
        :4eb02f5676bc808f86c0f014bbce15775adf06ba/contigs/FLOF01006689.1
        but the code does not prefix the genome hash - confirm which one is
        intended.
        note: the record.id is what RGI uses as a prefix for ORF_ID (ORF_ID
        has additional _314 or w/e #s)

    Args:
        graph(rdflib.Graph): the graph instance that is 1:1 with a .fasta
            file
        fasta_file(str): path to the .fasta file (this should already
            include the directory)
        uriIsolate: URI of the isolate, subject of the top-level triples
        uriGenome: URI of the genome linked to the isolate

    Returns:
        graph: the graph with all the triples generated from the .fasta file
    '''
    # ex. :spfy234
    graph.add((uriIsolate, gu('rdf:type'), gu('ncbi:562')))
    graph.add((uriIsolate, gu('ge:0001567'), Literal("bacterium")))
    graph.add((uriIsolate, gu('dc:description'),
               Literal(uri_to_basename(uriIsolate))))

    # ex. :4eb02f5676bc808f86c0f014bbce15775adf06ba
    # associating isolate URI with assembly URI
    graph.add((uriIsolate, gu('g:Genome'), uriGenome))
    # this is used as the human readable display of Genome
    graph.add((uriGenome, gu('dc:description'),
               Literal(basename(fasta_file))))

    # uri for bag of contigs
    # ex. :4eb02f5676bc808f86c0f014bbce15775adf06ba/contigs
    uriContigs = gu(uriGenome, "/contigs")
    graph.add((uriGenome, gu('so:0001462'), uriContigs))

    # The handle is opened in a with-block so it is closed even if parsing
    # raises; previously the file object was never closed.
    with open(fasta_file) as fasta_handle:
        for record in SeqIO.parse(fasta_handle, "fasta"):
            uriContig = gu(':' + record.id)
            # linking the specific contig and the bag of contigs
            graph.add((uriContigs, gu('g:Contig'), uriContig))
            graph.add((uriContig, gu('g:DNASequence'), Literal(record.seq)))
            # the description is stored under both predicates
            graph.add((uriContig, gu('g:Description'),
                       Literal(record.description)))
            graph.add((uriContig, gu('dc:description'),
                       Literal(record.description)))

    return graph
def parse_gene_dict(graph, gene_dict, uriGenome):
    '''
    Add the gene-location triples shared by AMR and VF results.

    My intention is to eventually use ECTyper for all of the calls it was
    meant for. Just need to update the ECTyper dict format to reference
    AMR / VF by contig, as opposed to the genome directly.

    Note: we assume that the calling functions (generate_amr() and
    generate_vf()) have already transformed gene_dict's keys into contig
    ids, as those differ in return value between VF & AMR from ECTyper.

    TODO: offshore rgi calls to ectyper and make it return a dict in the
    format we need
        - currently, we'll handle the ORF_ID to contig id transform in
          generate_amr()
    TODO: merge common components with generate_amr()

    Args:
        graph(rdflib.Graph): the running graph with all our triples
        gene_dict({{}}): a dictionary of genes with associated info
            ex. {'Some_Contig_ID':[{'START','STOP','ORIENTATION','GENE_NAME'}]}
            A record may additionally carry 'CUT_OFF' (AMR results only).
        uriGenome(rdflib.URIRef): the base uri of the genome
            ex. :4eb02f5676bc808f86c0f014bbce15775adf06ba
            (currently unused; kept so existing callers keep working)

    Returns:
        graph: the same graph, with the gene triples added
    '''
    for contig_id, gene_records in gene_dict.items():
        # recreating the contig uri once per contig
        uriContig = gu(':' + contig_id)

        for gene_record in gene_records:
            # after this point we switch perspective to the gene and build
            # down to relink the gene with the contig
            bnode_occurrence = BNode()
            bnode_start = BNode()
            bnode_end = BNode()

            # some gene names, esp. those which are effectively a
            # description, have spaces
            gene_name = gene_record['GENE_NAME'].replace(' ', '_')

            graph.add(
                (gu(':' + gene_name), gu('faldo:Region'), bnode_occurrence))
            graph.add((bnode_occurrence, gu('faldo:Begin'), bnode_start))
            graph.add((bnode_occurrence, gu('faldo:End'), bnode_end))

            # this is a special case for amr results
            # NOTE(review): this branch used to read amr_results['CUT_OFF'][i],
            # but neither name exists in this function, so it could only
            # raise NameError. The value is now taken from the gene record
            # itself - confirm that generate_amr() stores it there.
            if 'CUT_OFF' in gene_record:
                cut_off = Literal(gene_record['CUT_OFF'])
                graph.add((bnode_start, gu('dc:Description'), cut_off))
                graph.add((bnode_end, gu('dc:Description'), cut_off))

            for bnode in (bnode_start, bnode_end):
                graph.add((bnode, gu('rdf:type'), gu('faldo:Position')))
                graph.add((bnode, gu('rdf:type'), gu('faldo:ExactPosition')))

            # compare by value: `is` on a string literal tests identity and
            # only works by accident of interning
            if gene_record['ORIENTATION'] == '+':
                strand = 'faldo:ForwardStrandPosition'
            else:
                strand = 'faldo:ReverseStrandPosition'
            graph.add((bnode_start, gu('rdf:type'), gu(strand)))
            graph.add((bnode_end, gu('rdf:type'), gu(strand)))

            graph.add((bnode_start, gu('faldo:Position'),
                       Literal(gene_record['START'])))
            graph.add((bnode_start, gu('faldo:Reference'), uriContig))

            graph.add((bnode_end, gu('faldo:Position'),
                       Literal(gene_record['STOP'])))
            graph.add((bnode_end, gu('faldo:Reference'), uriContig))

    return graph
# Flatten `args` (defined outside this excerpt) into a plain dict; mainly
# used when a function needs a lot of the arguments at once.
args_dict = vars(args)

# Logging is currently switched off. The disabled setup was:
# logging.basicConfig(
#     filename='outputs/' + __name__ +
#     args_dict['i'].split('/')[-1] + '.log',
#     level=logging.INFO
# )

# Isolate URI: use the supplied id when there is one, otherwise fall back to
# a hash of the input's file name.
# This is temporary, TODO: include a SPARQL query to the db
if args_dict['uriIsolate'] is not None:
    uriIsolate = gu(':spfy' + args_dict['uriIsolate'])
else:
    uriIsolate = gu(':spfy' + str(hash(args_dict['i'].split('/')[-1])))

# Genome URI: batch scripts should precompute the fasta file hash; when they
# have not, it is computed here.
if args_dict['uriGenome'] is not None:
    uriGenome = gu(':' + args_dict['uriGenome'])
else:
    uriGenome = gu(':' + generate_hash(args_dict['i']))

args_dict['uriIsolate'] = uriIsolate
args_dict['uriGenome'] = uriGenome

savvy(args_dict)