Example #1
def parse_refs(bibtexf, verbose=False):
    """
    Parse the references and build a lookup from lower-case titles to entry keys
    :param bibtexf: the bibtex file
    :param verbose: more output
    :return: the BibliographyData object and a dictionary linking lower case titles to entry keys
    """

    if verbose:
        message(f"Parsing {bibtexf}", "GREEN")
    bib = parse_file(bibtexf, 'bibtex')
    titles = {}
    for e in bib.entries:
        try:
            if 'title' in bib.entries[e].fields:
                # strip the braces bibtex uses to protect capitalisation
                t = bib.entries[e].fields['title'].lower()
                t = t.replace('{', '').replace('}', '')
                titles[t] = e
        except Exception as ex:
            sys.stderr.write(f"Error parsing entry {e}: {ex}\n")

    if verbose:
        message(f"Found {len(titles)} references", "BLUE")
    return bib, titles
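A hypothetical usage sketch (the file name and title string below are invented), assuming parse_refs and the pybtex/roblib imports it relies on are already in scope:

# Hypothetical usage: look up an entry key by its lower-case, brace-stripped title.
bib, titles = parse_refs("library.bib", verbose=True)
key = titles.get("a made-up paper title")
if key is not None:
    print(f"{key}: {bib.entries[key].fields['title']}")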
Example #2
def check_dups(bibtexf, verbose=False):
    """
    Check for all duplicate entries at once.
    :param bibtexf: the bibtex file
    :param verbose: more output
    :return: nothing; exits with an error if duplicate entries are found
    """

    if verbose:
        message(f"Checking for duplicate entries: {bibtexf}", "PINK")
    entries = set()
    dupentries = False
    with open(bibtexf, 'r') as fin:
        for l in fin:
            if l.startswith('@'):
                # drop the entry type so "@misc{X," and "@article{X," compare equal
                l = l.replace('@misc', '')
                l = l.replace('@article', '')
                l = l.replace('@inproceedings', '')
                if l in entries:
                    sys.stderr.write("Duplicate entry " +
                                     l.replace('{', '').replace(',', ''))
                    dupentries = True
                entries.add(l)

    if dupentries:
        sys.stderr.write(
            "FATAL: The bibtex file has duplicate entries in it. Please remove them before trying to continue\n"
        )
        sys.stderr.write(
            "(It is an issue with Google Scholar, but pybtex breaks with duplicate entries. Sorry)\n"
        )
        sys.exit(-1)
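A hedged usage sketch (the file name is illustrative): since pybtex breaks on duplicate keys, the duplicate check is run before parsing.

# Hypothetical usage: refuse to continue if library.bib has duplicate keys,
# then parse it with parse_refs() from the previous example.
check_dups("library.bib", verbose=True)
bib, titles = parse_refs("library.bib", verbose=True)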
Example #3
def count_feats(gbkf, verbose=False):
    """
    Count the number of features of each type in a genbank file.
    :param gbkf: the genbank file
    :param verbose: more output
    :return: a dict of feature type -> count
    """
    if verbose:
        message(f"Reading {gbkf}", "BLUE")

    count = {}
    for seq in genbank_seqio(gbkf):
        for feat in seq.features:
            count[feat.type] = count.get(feat.type, 0) + 1
    return count
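A hypothetical call (the genbank file name is made up), assuming genbank_seqio and message are imported from roblib as in the surrounding examples:

# Hypothetical usage: print a tab-separated table of feature type counts.
for ftype, n in sorted(count_feats("phage.gbk", verbose=True).items()):
    print(f"{ftype}\t{n}")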
Example #4
def crassphage_coverage(f, verbose=False):
    """
    Get the crassphage coverage from coverage.txt
    :param f: coverage.txt, with a tab-separated name and count on each line
    :param verbose: more output
    :return: a dict of name -> normalised coverage
    """

    coverage = {}

    if verbose:
        message(f"Reading {f}", "GREEN")

    with open(f, 'r') as fin:
        for l in fin:
            p = l.strip().split("\t")
            # normalise the count by 97,092 (presumably the crAssphage genome length in bp)
            coverage[p[0]] = int(p[1]) / 97092

    return coverage
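A hypothetical call, assuming coverage.txt holds one tab-separated name and count per line as the loop above expects:

# Hypothetical usage: print samples ordered by decreasing coverage.
cov = crassphage_coverage("coverage.txt", verbose=True)
for name in sorted(cov, key=cov.get, reverse=True):
    print(f"{name}\t{cov[name]:.3f}")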
Example #5
def write_file(definition, samples, counts, allkeys, file, verbose=False):
    """ Write the output table: one row per measure, one column per (sorted) sample """

    allmeasures = sorted(allkeys)
    sortedsamples = sorted(samples)

    if verbose:
        message(f"Writing to {file}", "GREEN")

    with open(file, 'w') as out:
        out.write("Definition\tMeasure\t")
        out.write("\t".join(sortedsamples))
        out.write("\n")
        for m in allmeasures:
            out.write(f"{definition}\t{m}")
            for k in sortedsamples:
                if k in counts and m in counts[k]:
                    out.write(f"\t{counts[k][m]}")
                else:
                    out.write("\t0")
            out.write("\n")
Example #6
def focus_counts(data_directory, taxlevel, verbose=False):
    """ find the focus output and read it"""
    count = {}
    allfocus = set()
    for sample in os.listdir(data_directory):
        if verbose:
            message(f"Focus: {sample}", "BLUE")
        count[sample] = {}
        if os.path.exists(
                os.path.join(data_directory, sample, "focus",
                             "output_All_levels.csv")):
            with open(
                    os.path.join(data_directory, sample, "focus",
                                 "output_All_levels.csv"), 'r') as fin:
                lastcol = -1
                for l in fin:
                    # the header row tells us how many trailing read-count columns
                    # to trim from the taxonomy (single reads, paired, or both)
                    if l.startswith('Kingdom'):
                        if '_pass.fasta' in l and '_pass_1.fasta' in l and '_pass_2.fasta' in l:
                            lastcol = -3
                        elif '_pass_1.fasta' in l and '_pass_2.fasta' in l:
                            lastcol = -2
                        continue
                    l = l.strip()
                    taxparts = l.split(",")[0:lastcol]
                    if len(taxparts) != 8:
                        message(
                            f"Error parsing {sample} when lastcol was {lastcol}",
                            "RED")
                        message(f"{l}", "BLUE")
                        message(f"{taxparts}", "PINK")
                        message(f"{l.split(',')}", "GREEN")
                        sys.exit()
                    # note: even when we trim the taxonomy at an earlier column,
                    # we still take the counts from the last column (the R2 reads),
                    # so the numbers are consistent with the superfocus output :)

                    tax = ":".join(taxparts[0:taxlevel])
                    count[sample][tax] = count[sample].get(tax, 0) + float(
                        l.split(",")[-1])
                    allfocus.add(tax)
    return count, allfocus
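A hedged sketch of how this could be chained with write_file() from the earlier example (the directory and output names are placeholders); taxlevel=2 keeps Kingdom:Phylum.

# Hypothetical usage: tally Kingdom:Phylum counts under data/ and write one table.
focus, allfocus = focus_counts("data", 2, verbose=True)
write_file("focus", focus.keys(), focus, allfocus, "focus.tsv", verbose=True)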
Example #7
def superfocus_counts(data_directory, level=3, verbose=False):
    """
    find the superfocus output and read it. The file name looks like
    data/DRR042358/sf/DRR042358all_levels_and_function.xls
    :param data_directory: data/
    :param level: the subsystem level. Only 1, 2, and 3 are supported; level 2 is reported as level1:level2
    :param verbose: more output
    :return: a dict of sample -> subsystem -> count, and the set of all subsystems seen
    """

    count = {}
    allsslvl = set()

    for sample in os.listdir(data_directory):
        if verbose:
            message(f"Super focus: {sample}", "YELLOW")
        count[sample] = {}
        sffile = os.path.join(data_directory, sample, "sf",
                              f"{sample}all_levels_and_function.xls")
        if os.path.exists(sffile):
            keep = False
            with open(sffile, 'r') as fin:
                for l in fin:
                    if l.startswith('Subsystem Level 1'):
                        keep = True
                        continue
                    if not keep:
                        continue
                    p = l.strip().split("\t")
                    if level == 1:
                        sslvl = p[0]
                    elif level == 2:
                        sslvl = ":".join([p[0], p[1]])
                    else:
                        sslvl = p[2]
                    count[sample][sslvl] = count[sample].get(sslvl, 0) + float(
                        p[-1])
                    allsslvl.add(sslvl)
    return count, allsslvl
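A hypothetical call; with level=2 the keys are "level1:level2" strings, with level=1 or 3 they are single subsystem names. The output file name is illustrative.

# Hypothetical usage: subsystem level 2 counts, written with write_file() from above.
sf, allsf = superfocus_counts("data", level=2, verbose=True)
write_file("superfocus", sf.keys(), sf, allsf, "superfocus.tsv", verbose=True)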
Example #8
def abricate_counts(data_directory, verbose=False):
    """ find the abricate folders and read them """

    count = {}
    allabr = set()
    for sample in os.listdir(data_directory):
        if verbose:
            message(f"Abricate: {sample}", "PINK")
        count[sample] = {}
        if os.path.exists(os.path.join(data_directory, sample, "abricate")):
            for f in os.listdir(
                    os.path.join(data_directory, sample, "abricate")):
                if f.endswith('.tab'):
                    with open(
                            os.path.join(data_directory, sample, "abricate",
                                         f), 'r') as fin:
                        for l in fin:
                            p = l.strip().split("\t")
                            abr = f"{p[11]}:{p[5]}"
                            count[sample][abr] = count[sample].get(abr, 0) + 1
                            allabr.add(abr)
    return count, allabr
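A hypothetical call; each hit is keyed on columns 11 and 5 of abricate's .tab rows, exactly as the loop above builds the keys.

# Hypothetical usage: summarise how many distinct annotations were seen.
abricate, allabricate = abricate_counts("data", verbose=True)
print(f"{len(allabricate)} distinct annotations across {len(abricate)} samples")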
Example #9
def run_phage_boost(genecalls, model_file, verbose):
    """
    Run phage boost
    :param model_file: The model file that is probably something like model_delta_std_hacked.pickled.silent.gz
    :param genecalls: The pandas data frame of gene calls
    :param verbose: more output
    :return: the predicted phage regions as a pandas data frame
    """
    # rolling params
    period = 20
    win_type = 'parzen'
    min_periods = 1

    # region finding params
    threshold = 0.9
    length = 10
    gaps = 5
    neighbouring = 0
    alpha = 0.001

    # calculate features from gene calls
    if verbose:
        message("Calculating features", "GREEN")

    df = calculate_features(genecalls)
    # load model
    model, feats, feats_, limit = read_model_from_file(model_file)
    # transform data
    df = get_predictions.get_deltas(df[feats_])
    if verbose:
        message("Transforming gene predictions to regions", "GREEN")
    # transform single gene predictions to regions
    newgenecalls, nphages, res = predict(model, genecalls, df, feats, period,
                                         win_type, min_periods, limit,
                                         threshold, length, gaps, neighbouring,
                                         alpha)
    return res
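A hedged usage sketch that mirrors the command-line driver shown in a later example; the file names are placeholders and genbank_to_pandas is assumed to be importable as it is there.

# Hypothetical usage: gene calls from a genbank file (contigs >= 1000 bp), then
# prophage region prediction; mirrors the later driver example.
genecalls = genbank_to_pandas("contigs.gbk", 1000, True, True, False)
regions = run_phage_boost(genecalls, "model_delta_std_hacked.pickled.silent.gz", verbose=True)
print(regions)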
Example #10
        type=int,
        default=7)
    parser.add_argument('-s',
                        help='subsystem level (1,2, or 3) (default = 3)',
                        type=int,
                        default=3)
    parser.add_argument('-v', help='verbose output', action='store_true')
    parser.add_argument(
        '-a',
        help=
        'Run all focus and superfocus levels. This is not coded efficiently, so use sparingly!',
        action='store_true')
    args = parser.parse_args()

    if args.s < 1 or args.s > 3:
        message(f"Error: No subsystem level {args.s}. Defaulting to 3", "RED")
        args.s = 3

    if args.f not in focustax:
        message(
            f"{args.f} is not a valid focus taxonomy level. It must be an integer between 1 and 8 (one of {focustax})",
            "RED")
        sys.exit()

    coverage = crassphage_coverage(args.c, args.v)
    focus, allfocus = focus_counts(args.d, args.f, args.v)

    abricate, allabricate = abricate_counts(args.d, args.v)
    sf, allsf = superfocus_counts(args.d, args.s, args.v)

    # now get all the samples that are in abricate, focus, or sf
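A minimal sketch, not necessarily the author's code, of the step the closing comment describes: collecting every sample name seen by the three counters.

# Sketch only: union of the sample names from abricate, focus, and superfocus.
allsamples = set(focus) | set(abricate) | set(sf)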
Example #11
    parser.add_argument(
        '--maxrev',
        type=int,
        default=1e6,
        help=
        'do not trim more than these bp from the end (does not include primer length)'
    )
    parser.add_argument('--listall',
                        help='list all sequences that were trimmed',
                        action='store_true')
    parser.add_argument('-v', help='verbose output', action='store_true')
    args = parser.parse_args()

    if not args.forward and not args.reverse:
        message(
            "Either --forward or --reverse primer must be specified, otherwise nothing will be removed"
        )
        sys.exit(-1)

    fwd = None
    rev = None
    if args.forward:
        fwd = args.forward.upper()
    if args.reverse:
        rev = args.reverse.upper()

    with open(args.o, 'w') as out:
        for sid, seqid, seq, qual in stream_fastq(args.f):
            original = [seq, qual]
            trimmed = False
            if fwd and fwd in seq.upper():
Example #12
"""
Test a directory of genbank files and note whether they have the is_phage qualifier for their genomes
"""

import os
import sys
import argparse

from roblib import genbank_seqio, message

__author__ = 'Rob Edwards'
__copyright__ = 'Copyright 2020, Rob Edwards'
__credits__ = ['Rob Edwards']
__license__ = 'MIT'
__maintainer__ = 'Rob Edwards'
__email__ = '*****@*****.**'

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Note whether genbank files in a directory have the is_phage qualifier")
    parser.add_argument('-d', help='directory of genbank files', required=True)
    parser.add_argument('-v', help='verbose output', action='store_true')
    args = parser.parse_args()

    for f in os.listdir(args.d):
        if args.v:
            message(f"Reading {f}", "GREEN")
        pc = 0
        for s in genbank_seqio(os.path.join(args.d, f)):
            for feat in s.features:
                if 'is_phage' in feat.qualifiers:
                    pc += 1
        print(f"{f}\t{pc}")
Example #13
__license__ = 'MIT'
__maintainer__ = 'Rob Edwards'
__email__ = '*****@*****.**'

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Randomly sample a single fastq file")
    parser.add_argument('-f', help='fastq file to sample', required=True)
    parser.add_argument('-o', help='output file name', required=True)
    parser.add_argument('-p',
                        help='percent of the file to sample',
                        required=True,
                        type=int)
    parser.add_argument('-v', help='verbose output', action='store_true')
    args = parser.parse_args()

    sequences = []
    for seqid, header, seq, qualscores in stream_fastq(args.f):
        sequences.append([header, seq, qualscores])

    n = int(args.p / 100 * len(sequences))

    if args.v:
        message(
            f"There are {len(sequences)} sequences, so we will sample {n} of them",
            "GREEN")

    with open(args.o, 'w') as out:
        for s in sample(sequences, n):
            out.write(f"@{s[0]}\n{s[1]}\n+\n{s[2]}\n")
Example #14
import os
import sys
import argparse

from roblib import read_fasta, write_fastq, message

__author__ = 'Rob Edwards'
__copyright__ = 'Copyright 2020, Rob Edwards'
__credits__ = ['Rob Edwards']
__license__ = 'MIT'
__maintainer__ = 'Rob Edwards'
__email__ = '*****@*****.**'

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=" ")
    parser.add_argument('-f', help='fasta file', required=True)
    parser.add_argument('-q', help='quality file', required=True)
    parser.add_argument('-o', help='output fastq file', required=True)
    parser.add_argument('-v', help='verbose output', action='store_true')
    args = parser.parse_args()

    if not os.path.exists(args.f) or not os.path.exists(args.q):
        message(f"FATAL: either {args.f} or {args.q} not found", "RED")
        sys.exit(-1)

    fa = read_fasta(args.f, True, False)
    qu = read_fasta(args.q, True, True)

    write_fastq(fa, qu, args.o, args.v)
Example #15
    os.makedirs(args.o, exist_ok=True)
    dna = {}
    qual = {}
    header = {}

    # initially didn't plan to keep all these :)
    for seqid, hd, seq, qualscores in stream_fastq(args.f):
        dna[seqid] = seq.upper()
        qual[seqid] = qualscores
        header[seqid] = hd

    changed = set()
    deleted = set()
    for step in range(1, 10):
        if args.v:
            message(f"Working on step {step}", "GREEN")
        fqf = os.path.join(args.q, f"step_{step}",
                           f"{args.n}.s{step}.out.fastq")
        if not os.path.exists(fqf):
            message(f"FQ File {fqf} not found", "RED")
            continue
        seqs = []
        with open(os.path.join(args.o, f"step_{step}.text"), 'w') as out, \
                open(os.path.join(args.o, f"step_{step}_input.fq"), 'w') as fqinput,\
                open(os.path.join(args.o, f"step_{step}_output.fq"), 'w') as fqout:
            seen = set()
            for seqid, hd, seq, qualscores in stream_fastq(fqf):
                seen.add(seqid)
                if seqid not in dna:
                    message(f"{seqid} is not in the original fastq file", "PINK")
                    continue
Example #16
    parser.add_argument('-f', help='genbank file')
    parser.add_argument('-d', help='directory of genbank files')
    parser.add_argument('-t',
                        help='feature type(s) (at least one must be provided)',
                        nargs="+")
    parser.add_argument('-v', help='verbose output', action='store_true')
    args = parser.parse_args()

    files = []
    if args.f:
        files.append(args.f)
    if args.d:
        for f in os.listdir(args.d):
            files.append(os.path.join(args.d, f))
    if len(files) == 0:
        message("Fatal. Either -d or -f is required", "RED")
        sys.exit(-1)

    if not args.t:
        message("Fatal. Please provide at least one feature type to count",
                "RED")
        sys.exit(-1)

    print("File", end="")
    for t in args.t:
        print(f"\t{t}", end="")
    print()
    for f in files:
        c = count_feats(f, args.v)
        print(f, end="")
        for t in args.t:
            if t in c:
                print(f"\t{c[t]}", end="")
            else:
                print("\t0", end="")
Example #17
        required=True,
        help=
        "Model file. Probably something like model_delta_std_hacked.pickled.silent.gz"
    )
    parser.add_argument('-o',
                        '--outputfile',
                        help='output file for phage regions')
    parser.add_argument('-c',
                        '--mincontiglen',
                        default=1000,
                        type=int,
                        help='minimum contig length  [Default: %(default)d]')
    parser.add_argument('-v',
                        '--verbose',
                        help='verbose output',
                        action='store_true')
    args = parser.parse_args()

    if args.verbose:
        message("Reading genbank file", "GREEN")
    genecalls = genbank_to_pandas(args.genbankfile, args.mincontiglen, True,
                                  True, args.verbose)
    if args.verbose:
        message("Phage Boosting", "GREEN")
    res = run_phage_boost(genecalls, args.modelfile, args.verbose)
    if args.outputfile:
        with open(args.outputfile, 'w') as out:
            res.to_csv(out, sep="\t", header=True)
    else:
        print(res)