Пример #1
0
def hash_me(file_dict):
    '''
    Builds the isolate / genome URI pair for a single file.

    Args:
        file_dict(dict): must provide the keys 'basename' (used as the key of
            the returned mapping), 'count' (appended to ':spfy' to form the
            isolate URI) and 'withpath' (passed to generate_hash() to form
            the genome URI)
    Returns:
        dict: {basename: {'uriIsolate': <uri>, 'uriGenome': <uri>}}
    '''
    name = file_dict['basename']
    isolate_uri = gu(':spfy' + str(file_dict['count']))
    genome_uri = gu(':' + generate_hash(file_dict['withpath']))

    return {
        name: {
            'uriIsolate': isolate_uri,
            'uriGenome': genome_uri,
        },
    }
Пример #2
0
def spfyids_single(args_dict):
    '''
    Attaches the isolate and genome URIs to args_dict.

    The isolate URI is ':spfy' followed by the 'count' entry of
    settings.database; the genome URI is ':' followed by
    generate_hash(args_dict['i']).

    Args:
        args_dict(dict): must provide the key 'i'; it is updated in place
            with the keys 'uriIsolate' and 'uriGenome'
    Returns:
        dict: the same args_dict object that was passed in
    '''
    from settings import database

    # this is temporary, TODO: include a SPARQL query to the db
    isolate_uri = gu(':spfy' + str(database['count']))
    genome_uri = gu(':' + generate_hash(args_dict['i']))

    # both URIs are computed before args_dict is touched, so a failure in
    # either helper leaves the dict unmodified
    args_dict.update(uriIsolate=isolate_uri, uriGenome=genome_uri)
    return args_dict
Пример #3
0
def parse_serotype(graph, serotyper_dict, uriIsolate):
    '''
    Adds one triple per antigen type present in serotyper_dict.

    Args:
        graph: the running graph; only its add() method is used
        serotyper_dict(dict): may contain any of the keys 'O type',
            'H type' and 'K type'; missing keys are skipped
        uriIsolate: subject of every triple added here
    Returns:
        graph: the same graph object, after the additions
    '''
    # serotyper key -> predicate term, in the order the triples are added
    antigen_predicates = (
        ('O type', 'ge:0001076'),
        ('H type', 'ge:0001077'),
        ('K type', 'ge:0001684'),
    )

    for antigen, predicate in antigen_predicates:
        if antigen not in serotyper_dict:
            continue
        graph.add(
            (uriIsolate, gu(predicate), Literal(serotyper_dict[antigen])))

    return graph
Пример #4
0
def generate_turtle_skeleton(graph, fasta_file, uriIsolate, uriGenome):
    '''
    Handles the main generation of a turtle object.

    NAMING CONVENTIONS:
    uriIsolate: this is the top-most entry, a uniq. id per file is allocated by checking our DB for the greatest most entry (not in this file)
        ex. :spfy234
    uriGenome: aka. the assembly / genome ID, this is a sha1 hash of the file contents (computed by the caller, not in this function)
        ex. :4eb02f5676bc808f86c0f014bbce15775adf06ba
    uriContigs: the bag of contigs, built here as gu(uriGenome, "/contigs")
        ex. :4eb02f5676bc808f86c0f014bbce15775adf06ba/contigs/
    uriContig: indiv contig ids; built here as gu(':' + record.id), where record.id comes from SeqIO - this should be uniq to a contig (at least within a given file)
        note: the record.id is what RGI uses as a prefix for ORF_ID (ORF_ID has additional _314 or w/e #s)

    Args:
        graph(rdflib.Graph): the graph instance that is 1:1 with a .fasta file
        fasta_file(str): path to the .fasta file (this should already incl the directory)
        uriIsolate: uri of the isolate, used as the subject of the top-level triples
        uriGenome: uri of the genome, linked to the isolate and to the bag of contigs
    Returns:
        graph: the graph with all the triples generated from the .fasta file
    '''

    # ex. :spfy234
    graph.add((uriIsolate, gu('rdf:type'), gu('ncbi:562')))
    graph.add((uriIsolate, gu('ge:0001567'), Literal("bacterium")))
    graph.add((uriIsolate, gu('dc:description'),
               Literal(uri_to_basename(uriIsolate))))

    # ex. :4eb02f5676bc808f86c0f014bbce15775adf06ba
    # associating isolate URI with assembly URI
    graph.add((uriIsolate, gu('g:Genome'), uriGenome))

    # this is used as the human readable display of Genome
    graph.add((uriGenome, gu('dc:description'), Literal(basename(fasta_file))))

    # uri for bag of contigs
    # ex. :4eb02f5676bc808f86c0f014bbce15775adf06ba/contigs/
    uriContigs = gu(uriGenome, "/contigs")
    graph.add((uriGenome, gu('so:0001462'), uriContigs))

    # the context manager guarantees the file handle is closed, even when
    # parsing or a graph insertion raises part-way through the file
    with open(fasta_file) as fasta_handle:
        for record in SeqIO.parse(fasta_handle, "fasta"):
            # the contig uri is derived from record.id alone
            uriContig = gu(':' + record.id)

            # linking the spec contig and the bag of contigs
            graph.add((uriContigs, gu('g:Contig'), uriContig))
            graph.add((uriContig, gu('g:DNASequence'), Literal(record.seq)))

            # the record description is stored under both predicates
            graph.add(
                (uriContig, gu('g:Description'), Literal(record.description)))
            graph.add(
                (uriContig, gu('dc:description'), Literal(record.description)))

    return graph
Пример #5
0
def parse_gene_dict(graph, gene_dict, uriGenome):
    '''
    My intention is to eventually use ECTyper for all of the calls it was meant for.
    Just need to update ECTyper dict format to ref. AMR / VF by contig. as opposed to genome directly.

    These are the common gene related triples to both AMR / VF.
    Note: we assume that the calling functions (generate_amr() and
    generate_vf()) are doing the transformations to the gene_dict.keys so
    that they are contig ids (as they differ in return value between VF & AMR
    from ECTyper).

    For every gene record this adds a faldo:Region occurrence with a
    faldo:Begin and a faldo:End position node, both referencing the contig.

    TODO: offshore rgi calls to ectyper and make it return a dict in the format we need
    -currently, we'll handle ORF_ID to contig id transform in generate_amr()

    Args:
        graph(rdflib.Graph): the running graph with all our triples
        gene_dict({{}}): a dictionary of genes with a assoc info
            ex. {'Some_Contig_ID':[{'START','STOP','ORIENTATION','GENE_NAME'}]}
            a record may additionally carry 'CUT_OFF' (amr results only)
        uriGenome(rdflib.URIRef): the base uri of the genome
            ex. :4eb02f5676bc808f86c0f014bbce15775adf06ba
            (currently unused; kept so existing callers keep working)
    Returns:
        graph: the same graph object, after the additions

    TODO: merge common components with generate_amr()
    '''

    for contig_id, gene_records in gene_dict.items():
        # recreating the contig uri; it only depends on the contig id, so it
        # is built once per contig rather than once per gene record
        uriContig = gu(':' + contig_id)

        for gene_record in gene_records:
            # after this point we switch perspective to the gene and build
            # down to relink the gene with the contig
            bnode_occurrence = BNode()
            bnode_start = BNode()
            bnode_end = BNode()

            # some gene names, esp those which are effectively a description,
            # have spaces
            gene_name = gene_record['GENE_NAME'].replace(' ', '_')

            graph.add(
                (gu(':' + gene_name), gu('faldo:Region'), bnode_occurrence))

            graph.add((bnode_occurrence, gu('faldo:Begin'), bnode_start))
            graph.add((bnode_occurrence, gu('faldo:End'), bnode_end))

            # this is a special case for amr results: the cut-off is read
            # from the gene record itself (the previous check looked at the
            # top-level dict and then used names that do not exist here)
            if 'CUT_OFF' in gene_record:
                cut_off = gene_record['CUT_OFF']
                graph.add((bnode_start, gu('dc:Description'),
                           Literal(cut_off)))
                graph.add((bnode_end, gu('dc:Description'),
                           Literal(cut_off)))

            graph.add((bnode_start, gu('rdf:type'), gu('faldo:Position')))
            graph.add((bnode_start, gu('rdf:type'), gu('faldo:ExactPosition')))
            graph.add((bnode_end, gu('rdf:type'), gu('faldo:Position')))
            graph.add((bnode_end, gu('rdf:type'), gu('faldo:ExactPosition')))

            # compare by value: identity ("is") against a string literal is
            # not guaranteed to hold for equal strings
            if gene_record['ORIENTATION'] == '+':
                strand_type = 'faldo:ForwardStrandPosition'
            else:
                strand_type = 'faldo:ReverseStrandPosition'
            graph.add((bnode_start, gu('rdf:type'), gu(strand_type)))
            graph.add((bnode_end, gu('rdf:type'), gu(strand_type)))

            graph.add((bnode_start, gu('faldo:Position'),
                       Literal(gene_record['START'])))
            graph.add((bnode_start, gu('faldo:Reference'), uriContig))

            graph.add((bnode_end, gu('faldo:Position'),
                       Literal(gene_record['STOP'])))
            graph.add((bnode_end, gu('faldo:Reference'), uriContig))

    return graph
Пример #6
0
    # vars() exposes the attributes of `args` as a dict;
    # mainly used for when a func needs a lot of the args
    args_dict = vars(args)

    # logging setup is currently disabled: the block below is a bare string
    # literal, so it is evaluated and discarded without configuring anything
    '''
    logging.basicConfig(
        filename='outputs/' + __name__ +
        args_dict['i'].split('/')[-1] + '.log',
        level=logging.INFO
    )
    '''

    # check if an isolate uri isn't set yet
    if args_dict['uriIsolate'] is None:
        # this is temporary, TODO: include a SPARQL query to the db
        # fallback id: built-in hash() of the last '/'-separated part of 'i'
        # NOTE(review): str hashes are salted per process on Python 3 unless
        # PYTHONHASHSEED is fixed, so this id may differ between runs - confirm
        uriIsolate = gu(':spfy' + str(hash(args_dict['i'].split('/')[-1])))
    else:
        # a supplied value gets the ':spfy' prefix added here
        uriIsolate = gu(':spfy' + args_dict['uriIsolate'])

    # if the fasta_file hash was not precomputed (batch scripts should
    # precompute it), we compute that now
    if args_dict['uriGenome'] is None:
        uriGenome = gu(':' + generate_hash(args_dict['i']))
    else:
        # a supplied value gets the ':' prefix added here
        uriGenome = gu(':' + args_dict['uriGenome'])

    # overwrite the raw values with the uris returned by gu() before
    # handing everything to savvy()
    args_dict['uriIsolate'] = uriIsolate
    args_dict['uriGenome'] = uriGenome

    savvy(args_dict)