Exemplo n.º 1
0
def check_phage_functions(sample, blastfile, outputfile, clusterdb):
    """
    Count how many proteins have hypothetical functions

    Every line of the tab separated blast file is read, the value in its
    second column is converted to a function with proteinid_to_function, and
    that function is classified with is_hypothetical. One summary line is
    written to the output file.

    :param sample: the sample name, written as the first column of the output
    :param blastfile: tab separated blast output file
    :param outputfile: the file to write. It is always created (or truncated)
    :param clusterdb: the cluster database file. If it does not exist the output file is left empty
    :return: nothing
    """

    # the context manager closes the output file on every exit path,
    # including the early return and any exception raised by the lookups
    with open(outputfile, 'w') as out:
        if not os.path.exists(clusterdb):
            return

        phage_cluster_db = connect_to_db(clusterdb)
        hypo = 0
        nonhypo = 0
        with open(blastfile, 'r') as f:
            for l in f:
                p = l.strip().split("\t")
                if len(p) < 2:
                    # blank or truncated line: there is no second column to look up
                    continue
                fn = proteinid_to_function(p[1], phage_cluster_db)
                if is_hypothetical(fn):
                    hypo += 1
                else:
                    nonhypo += 1

        total = hypo + nonhypo
        # a blast file without any usable line would otherwise divide by zero
        fraction = hypo / total if total else 0.0
        out.write(f"{sample}\tHypothetical proteins\t")
        out.write("[Hypothetical, Non-hypothetical, Fraction hypothetical]\t")
        out.write(f"{hypo}\t{nonhypo}\t{fraction}\n")
Exemplo n.º 2
0
def print_all_proteins():
    """
    Print all the proteins to stdout

    Connects to the database named by the global `phagedb` and hands the
    connection to protein_to_fasta, which does the actual output.
    :return:  nothing
    """

    con = connect_to_db(phagedb)
    try:
        protein_to_fasta(con)
    finally:
        # release the connection even if writing the proteins fails
        disconnect(con)
Exemplo n.º 3
0
def list_all_genomes():
    """
    Print the description of every genome to stdout, one per line

    The descriptions are read from the `genome` table of the database named
    by the global `phagedb`.
    :return:  nothing
    """

    con = connect_to_db(phagedb)
    try:
        exc = con.cursor().execute("select description from genome")
        for d in exc.fetchall():
            print(f"{d[0]}")
    finally:
        # release the connection even if the query or the printing fails
        disconnect(con)
Exemplo n.º 4
0
def enrich_a_cluster(clid, mems, phagedb, exout=None, verbose=False):
    """
    Extract some information about the proteins that are members of a cluster

    The members are looked up in the `protein` table in batches, and the
    shortest and longest protein, the frequency of each product, and the mean
    length are collected from the rows that are found.

    :param clid: cluster id
    :param mems: the members of the cluster (a sliceable sequence of protein accessions)
    :param phagedb: the phage genome sqlite file
    :param exout: extended output file. If given, one line of [cluster id, accession, length, product]
                  is appended to it for every protein that is found
    :param verbose: more output
    :return: a tuple of (shortest accession, shortest length, longest accession, longest length,
             dict of product -> count, mean length). If none of the members is found in the
             database this is (None, 0, None, 0, {}, 0)
    """

    # NOTE(review): this connection is never closed here - confirm whether connect_to_db
    # hands out a connection that the caller is expected to release
    conn = connect_to_db(phagedb, verbose)
    cur = conn.cursor()

    lens = []
    # no assumed maximum length: None means that no protein has been seen yet
    shortest = [None, None]
    longest = [None, None]
    functions = {}

    # members are queried in batches, presumably to keep the number of bound
    # parameters in a single statement small
    maxn = 500
    eout = open(exout, 'a') if exout else None
    try:
        # range() stops before len(mems), so no empty batch is queried when the
        # number of members is zero or an exact multiple of maxn
        for c in range(0, len(mems), maxn):
            e = c + maxn
            if c > 0:
                sys.stderr.write(
                    f"{color.PINK}Retrieving clusters {c}:{e} for {clid}{color.ENDC}\n"
                )
            tm = mems[c:e]
            protein_query = f"select accession, length, product from protein where accession in ({','.join(['?']*len(tm))})"
            cur.execute(protein_query, tm)

            for row in cur.fetchall():
                if eout:
                    r = "\t".join(map(str, row))
                    eout.write(f'{clid}\t{r}\n')
                lens.append(row[1])
                if longest[1] is None or row[1] > longest[1]:
                    longest = [row[0], row[1]]
                if shortest[1] is None or row[1] < shortest[1]:
                    shortest = [row[0], row[1]]
                functions[row[2]] = functions.get(row[2], 0) + 1
    finally:
        # close the extended output even if a query fails part way through
        if eout:
            eout.close()

    if not lens:
        # nothing was found, so there is no mean to calculate
        return None, 0, None, 0, functions, 0

    return shortest[0], shortest[1], longest[0], longest[1], functions, sum(
        lens) / len(lens)
Exemplo n.º 5
0
def lookup_word(word):
    """
    Count the proteins whose product field matches `word`

    The count comes from a `match` query against the protein_fts table of
    the database named by the global `phagedb`.
    :param word: the word to search for
    :return : int the number of occurrences of word
    """
    # NOTE(review): the connection opened here is never closed - confirm whether that is intended
    connection = connect_to_db(phagedb)
    query = "select count(1) from protein_fts where product match ?"
    result = connection.cursor().execute(query, [word])
    # count(1) always produces exactly one row with one column
    first_row = result.fetchone()
    return first_row[0]
Exemplo n.º 6
0
import os
import sys
import argparse

from pppf_accessories import color
from pppf_databases import connect_to_db, disconnect

# Command line entry point: compare the phage database with the cluster database
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Sequences in the phage database not in a cluster")
    parser.add_argument('-p', help='phage database', required=True)
    parser.add_argument('-c', help='cluster database', required=True)
    parser.add_argument('-v', help='verbose output', action='store_true')
    args = parser.parse_args()

    # one connection and one cursor for each of the two databases
    pbc = connect_to_db(args.p, args.v)
    pcur = pbc.cursor()

    dbc = connect_to_db(args.c, args.v)
    ccur = dbc.cursor()

    # collect the protein_md5sum of every row in md5cluster.
    # The cluster column is selected too, but its value (c) is not used in this loop
    cl = set()
    ex = ccur.execute("select protein_md5sum, cluster from md5cluster")
    for (m, c) in ex.fetchall():
        cl.add(m)

    # progress report on stderr; len(cl) is the number of distinct md5 sums
    if args.v:
        sys.stderr.write(
            f"{color.GREEN}Loaded {len(cl)} proteins{color.ENDC}\n")

    ex = pcur.execute(
Exemplo n.º 7
0
    parser = argparse.ArgumentParser(description='Find new accesssions')
    parser.add_argument('-f',
                        help='input file of [gi, accession number]',
                        required=True)
    parser.add_argument('-p', help='phage database', required=True)
    parser.add_argument('-o',
                        help='file to write needed IDs to',
                        required=True)
    parser.add_argument('-v', help='verbose output', action='store_true')
    args = parser.parse_args()

    # note that identifier has the version.
    # Thus identifier = AF068845.1 and accession = AF068845
    # We probably want identifier!

    # build the lookup in both directions: identifier -> accession and accession -> identifier
    con = connect_to_db(args.p, args.v)
    exc = con.cursor().execute("select identifier, accession from genome")
    ids = {}
    accs = {}
    for r in exc.fetchall():
        ids[r[0]] = r[1]
        accs[r[1]] = r[0]

    # The two dicts have different sizes when an identifier or an accession
    # occurs on more than one row. This is an explicit test rather than an
    # assert, because assert statements are removed when python runs with -O
    if len(ids) != len(accs):
        sys.stderr.write(
            f"{color.RED}FATAL: We found {len(ids)} identifiers and {len(accs)} accessions{color.ENDC}\n"
        )
        sys.exit(1)
Exemplo n.º 8
0
field lengths to define the database!
"""

import os
import sys
import argparse
import pppf_db
from pppf_databases import connect_to_db, disconnect

__author__ = 'Rob Edwards'
__copyright__ = 'Copyright 2020, Rob Edwards'
__credits__ = ['Rob Edwards']
__license__ = 'MIT'
__maintainer__ = 'Rob Edwards'
__email__ = '*****@*****.**'

# For every table in both databases, print the longest value stored in each field
for db in pppf_db.phagedb, pppf_db.clustersdb:
    print(f"Fields in {db}")
    con = connect_to_db(db)
    try:
        cursor = con.cursor()
        exc = cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        # fetchall() reads every table name before the cursor is reused below
        for tbltpl in exc.fetchall():
            tbl = tbltpl[0]
            print(f"\tTable: {tbl}")
            # a one row select is only used to read the column names from the description
            cd = con.execute(f"select * from {tbl} limit 1")
            names = [column[0] for column in cd.description]
            for fld in names:
                lfsql = f"select length({fld}) from {tbl} order by length({fld}) DESC limit 1;"
                lfexc = cursor.execute(lfsql)
                longest = lfexc.fetchone()
                # an empty table returns no row at all, so there is nothing to index into
                maxlen = longest[0] if longest else None
                print(f"{tbl} :: {fld} :: {maxlen}")
    finally:
        # release each database before moving on to the next one
        disconnect(con)
Exemplo n.º 9
0
    Convert a protein ID to a dict object of all function
    :param proteinid: The protein md5 sum
    :param clusterdb_cursor: the cursor to the cluster database
    :param verbose: more output
    :return: dict: the functions of the protein and their frequency
    """

    global protein_functions

    if proteinid not in protein_functions:
        protein_functions[proteinid] = get_functions(proteinid, clusterdb_cursor, verbose)

    return json.loads(protein_functions[proteinid][1])


# Command line entry point: report the function(s) of a single protein
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=" ")
    parser.add_argument('-i', help='protein id', required=True)
    parser.add_argument('-c', help='cluster database', required=True)
    parser.add_argument('-v', help='verbose output', action='store_true')
    args = parser.parse_args()

    c = connect_to_db(args.c, args.v)
    # each lookup gets its own cursor on the shared connection
    fn = proteinid_to_function(args.i, c.cursor(), args.v)
    fns = proteinid_to_all_functions(args.i, c.cursor(), args.v)

    # most frequent function first
    ranked = sorted(fns.items(), key=lambda item: item[1], reverse=True)
    report_lines = []
    for function_name, frequency in ranked:
        report_lines.append(f"{function_name} -> {str(frequency)}")
    fnstr = "\n".join(report_lines)
    disconnect(c, args.v)

    print(f"The function of {args.i} is\n'{fn}'")
    print(f'All the functions are:\n{fnstr}')
Exemplo n.º 10
0
# Command line entry point: define the tables in the phage and/or cluster databases
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Create a database and load it with GenBank data")
    parser.add_argument('-p', help='Phage SQL output database')
    parser.add_argument('-c', help='clusters SQLite database')
    parser.add_argument('-v', help='verbose output', action='store_true')
    args = parser.parse_args()

    # both options are optional: only the databases that were named are set up

    if args.p:
        phage_banner = f"{color.BOLD}{color.BLUE}Defining Phage Tables{color.ENDC}\n"
        sys.stderr.write(phage_banner)
        phage_file_missing = not os.path.exists(args.p)
        if phage_file_missing:
            # create an empty file before connecting to it
            open(args.p, 'w').close()
        phageconn = connect_to_db(args.p, args.v)
        define_phage_tables(phageconn, args.v)
        # final commit to make sure everything saved!
        phageconn.commit()
        disconnect(phageconn, args.v)

    if args.c:
        cluster_banner = f"{color.BOLD}{color.BLUE}Defining Cluster Tables{color.ENDC}\n"
        sys.stderr.write(cluster_banner)
        cluster_file_missing = not os.path.exists(args.c)
        if cluster_file_missing:
            # create an empty file before connecting to it
            open(args.c, 'w').close()
        clconn = connect_to_db(args.c, args.v)
        define_cluster_tables(clconn, args.v)
        clconn.commit()
        disconnect(clconn, args.v)
Exemplo n.º 11
0
from pppf_accessories import color

__author__ = 'Rob Edwards'
__copyright__ = 'Copyright 2020, Rob Edwards'
__credits__ = ['Rob Edwards']
__license__ = 'MIT'
__maintainer__ = 'Rob Edwards'
__email__ = '*****@*****.**'

# Command line entry point: find the first gene on each contig
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=" ")
    parser.add_argument('-d', help='phage database', required=True)
    parser.add_argument('-v', help='verbose output', action='store_true')
    args = parser.parse_args()

    dbcon = connect_to_db(args.d, args.v)
    cur = dbcon.cursor()

    gene_query = "select gene_rowid, contig, start, end, protein from gene"
    exc = cur.execute(gene_query)

    if args.v:
        # the imported name is `color`; the original `colour` raised a NameError with -v
        sys.stderr.write(
            f"{color.GREEN}Reading gene locations{color.ENDC}\n")
    # contig -> the row of the gene currently considered to be the first one on that contig
    firstgene = {}
    for tple in exc.fetchall():
        contig = tple[1]
        if contig not in firstgene:
            firstgene[contig] = tple
        # leftmost coordinate of this gene, whichever way round start and end are stored
        l = min(tple[2], tple[3])
        # NOTE(review): the comparison that follows tests firstgene[contig][2] twice;
        # the second test was presumably meant to use index 3 (end) - confirm
        if l < firstgene[contig][2] or l < firstgene[contig][2]:
Exemplo n.º 12
0
"""
Load the cluster information into the databases
"""

import os
import sys
import argparse

from pppf_databases import connect_to_db, disconnect
from pppf_clusters import read_mmseqs_clusters, add_functions_to_clusters, insert_cluster_metadata, insert_into_database

# Command line entry point: read an mmseqs cluster file and load it into the cluster database
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Load the cluster information into the databases')
    parser.add_argument('-p', '--phage', help='Phage SQL database', required=True)
    parser.add_argument('-c', '--clusters', help='Clusters SQL database', required=True)
    parser.add_argument('-t', '--tsv', help='Cluster tsv file', required=True)
    parser.add_argument('-n', '--name', help='Cluster name (short text)', required=True)
    parser.add_argument('-d', '--description', help='Cluster description (human readable text)', required=True)
    # -c already belongs to --clusters. Registering it a second time makes argparse
    # raise a conflicting option error before any argument is parsed, so the
    # command line option uses -l instead. The long option --cli is unchanged.
    parser.add_argument('-l', '--cli', help='Cluster command line (bash)', required=True)
    parser.add_argument('-v', '--verbose', help='verbose output', action='store_true')
    args = parser.parse_args()

    phageconn = connect_to_db(args.phage, args.verbose)
    clconn = connect_to_db(args.clusters, args.verbose)
    # read the clusters, annotate them from the phage database, then store them
    # in the cluster database under a new metadata record
    clusters = read_mmseqs_clusters(args.tsv, args.verbose)
    (clusters, protein_info) = add_functions_to_clusters(clusters, phageconn, args.verbose)
    metadata_id = insert_cluster_metadata(clconn, args.name, args.description, args.cli, args.verbose)
    insert_into_database(clusters, clconn, phageconn, metadata_id, protein_info, args.verbose)
    disconnect(phageconn, args.verbose)
    disconnect(clconn, args.verbose)