def split_by_rank(faf, ranks, outdir, verbose=False):
    """
    Split the fasta file into one file per taxonomic rank.

    Every sequence ID is searched for a ``TaxID=<digits>`` tag. The captured
    tax id is looked up in ``ranks`` and the sequence is appended to
    ``<outdir>/<rank>.fasta``. Sequences whose tax id is not in ``ranks``, and
    sequences with no TaxID tag at all (these are also reported on stderr),
    are written to ``root.fasta``.

    :param faf: fasta file
    :param ranks: dict of taxid and rank. The tax id captured from the header
        is a str, so the keys must be strings for the lookup to succeed
    :param outdir: output directory (it is not created here)
    :param verbose: more output
    :return: None
    """

    # raw string so that \d is a regex class, not an invalid string escape
    taxid_re = re.compile(r'TaxID=(\d+)')

    # honour the verbose parameter rather than a global args object
    if verbose:
        sys.stderr.write(f"{colours.GREEN}Splitting {faf}{colours.ENDC}\n")

    fhs = {}
    try:
        for seqid, seq in stream_fasta(faf):
            rnk = "root"
            m = taxid_re.search(seqid)
            if m:
                rnk = ranks.get(m.group(1), "root")
            else:
                sys.stderr.write(
                    f"{colours.RED}ERROR: No taxonomy in {seqid}{colours.ENDC}\n")

            # one output handle per rank, opened the first time it is needed
            if rnk not in fhs:
                fhs[rnk] = open(os.path.join(outdir, rnk + ".fasta"), 'w')
            fhs[rnk].write(f">{seqid}\n{seq}\n")
    finally:
        # close everything we opened, even if reading or writing failed
        for fh in fhs.values():
            fh.close()
Пример #2
0
def write_fasta_files(faf, odir, bins, maxb, verbose=False):
    """
    Read the sequences from faf and write them into a set of files in odir.

    One file, ``bin_<i>.fna``, is created for every bin number from 0 to maxb.
    Each sequence whose ID (the header up to the first space) is in ``bins``
    is written to the file of its bin. Files that received no sequence are
    removed again at the end.

    :param faf: The source fasta file
    :param odir: the output directory (created if it does not exist)
    :param bins: the hash of contigs -> bin. The bin is used as a list index,
        so it is expected to be an int between 0 and maxb
    :param maxb: the maximum bin number
    :param verbose: more output
    :return: nada
    """

    # makedirs copes with nested paths and with the directory already existing
    os.makedirs(odir, exist_ok=True)

    paths = [os.path.join(odir, f"bin_{i}.fna") for i in range(maxb + 1)]
    outputfiles = []
    written_to = set()

    try:
        for path in paths:
            outputfiles.append(open(path, 'w'))

        for fa, seq in stream_fasta(faf, True):
            faid = fa.split(" ")[0]
            if faid not in bins:
                if verbose:
                    sys.stderr.write(f"Sequence {faid} not found in a bin\n")
                continue
            binno = bins[faid]
            outputfiles[binno].write(">{}\n{}\n".format(fa, seq))
            written_to.add(binno)
    finally:
        # never leave handles open, even when reading or writing fails
        for o in outputfiles:
            o.close()

    # drop the files for bins that turned out to be empty
    for i, path in enumerate(paths):
        if i not in written_to:
            os.remove(path)
Пример #3
0
def parse_contigs(locations, gdir, odir):
    """
    Parse the contigs file and write the non-prophage regions.

    The output is ``<odir>/contigs_no_pp.fasta``. A contig that is not in
    ``locations`` is copied unchanged. For a contig with prophages, every
    stretch of sequence outside the prophages is written as its own record
    called ``<tag>.<n>``, where tag is the contig ID up to the first
    whitespace and n counts the stretches from 1.

    :param locations: the locations hash from the phispy directory. Each
        contig ID maps to an iterable of (start, end) pairs
    :param gdir: the genome directory that contains the contigs file
    :param odir: the output directory
    :return: None
    """

    p = re.compile(r'>?(\S+)')

    with open(os.path.join(odir, "contigs_no_pp.fasta"), 'w') as out:
        for contig, seq in stream_fasta(os.path.join(gdir, "contigs")):
            if contig not in locations:
                out.write(">{}\n{}\n".format(contig, seq))
                continue

            m = p.match(contig)
            # fall back to the whole ID if it has no leading non-space token
            tag = m.group(1) if m else contig

            c = 0
            posn = 0
            for start, end in sorted(locations[contig], key=itemgetter(0)):
                c += 1
                out.write(">{}.{}\n{}\n".format(tag, c, seq[posn:start]))
                posn = end + 1

            # the sequence after the last prophage is a non-prophage region
            # too, so it must not be dropped
            if posn < len(seq):
                c += 1
                out.write(">{}.{}\n{}\n".format(tag, c, seq[posn:]))
Пример #4
0
def fasta2tax(faf, outdir, trank, tdb, verbose=False):
    """
    Split the fasta file by taxonomy.

    Every sequence ID is searched for a ``TaxID=<digits>`` tag. The tax id is
    passed to ``find_rank`` and the sequence is appended to
    ``<outdir>/<value returned by find_rank>.fasta``. Sequences with no TaxID
    tag are reported on stderr and not written anywhere.

    :param faf: fasta file to split
    :param outdir: output directory to write to (it is not created here)
    :param trank: taxonomic rank to choose
    :param tdb: The taxonomy database
    :param verbose: more output
    :return: None
    """

    # raw string so that \d is a regex class, not an invalid string escape
    taxid_re = re.compile(r'TaxID=(\d+)')

    fhs = {}
    try:
        for seqid, seq in stream_fasta(faf):
            m = taxid_re.search(seqid)
            if not m:
                sys.stderr.write(
                    f"{colours.RED}ERROR: No taxonomy in {seqid}{colours.ENDC}\n")
                continue

            # NOTE(review): find_rank is assumed to return a str usable as a
            # file name stem -- confirm against its definition
            rnk = find_rank(m.group(1), trank, tdb, verbose)
            if rnk not in fhs:
                fhs[rnk] = open(os.path.join(outdir, rnk + ".fasta"), 'w')
            fhs[rnk].write(f">{seqid}\n{seq}\n")
    finally:
        # close everything we opened, even if a lookup or a write failed
        for fh in fhs.values():
            fh.close()
Пример #5
0
def print_trim(faf, start=0, end=None, seqid=None, verbose=False):
    """

    We don't actually trim the sequence, we print the first and last 20bp of
    the trimmed sequence ("Sequence 1") and of whatever follows the trim
    ("Sequence 2").
    If start is not provided we use the first base.
    If end is not provided we trim to the end, so nothing follows the trim.
    If sequence ID is provided only sequences with that ID (the header up to
    the first space) are reported, otherwise all the sequences are reported.

    :param faf: fasta file
    :param start: optional start position
    :param end:  optional end position
    :param seqid: optional sequence ID to trim
    :param verbose: more output
    :return: Nothing
    """

    for sid, seq in stream_fasta(faf):
        seqname = sid.split(" ")[0]
        if seqid and seqname != seqid:
            continue

        trimmed_seq = seq[start:end]
        # seq[None:] is the WHOLE sequence, but with no end position the trim
        # runs to the last base and there is nothing left after it
        aftertr_seq = "" if end is None else seq[end:]

        print("Sequence 1: {} ... {}\nSequence 2: {} ... {}".format(
            trimmed_seq[0:20], trimmed_seq[-20:],
            aftertr_seq[0:20], aftertr_seq[-20:]
        ))
Пример #6
0
def parse_contigs(locations, gdir, odir):
    """
    Parse the contigs file and write the non-prophage regions.

    The output is ``<odir>/contigs_no_pp.fasta``. A contig that is not in
    ``locations`` is copied unchanged. For a contig with prophages, every
    stretch of sequence outside the prophages is written as its own record
    called ``<tag>.<n>``, where tag is the contig ID up to the first
    whitespace and n counts the stretches from 1.

    :param locations: the locations hash from the phispy directory. Each
        contig ID maps to an iterable of (start, end) pairs
    :param gdir: the genome directory that contains the contigs file
    :param odir: the output directory
    :return: None
    """

    p = re.compile(r'>?(\S+)')

    with open(os.path.join(odir, "contigs_no_pp.fasta"), 'w') as out:
        for contig, seq in stream_fasta(os.path.join(gdir, "contigs")):
            if contig not in locations:
                out.write(">{}\n{}\n".format(contig, seq))
                continue

            m = p.match(contig)
            # fall back to the whole ID if it has no leading non-space token
            tag = m.group(1) if m else contig

            c = 0
            posn = 0
            for start, end in sorted(locations[contig], key=itemgetter(0)):
                c += 1
                out.write(">{}.{}\n{}\n".format(tag, c, seq[posn:start]))
                posn = end + 1

            # the sequence after the last prophage is a non-prophage region
            # too, so it must not be dropped
            if posn < len(seq):
                c += 1
                out.write(">{}.{}\n{}\n".format(tag, c, seq[posn:]))
Пример #7
0
def write_fasta_files(faf, odir, bins, maxb, verbose=False):
    """
    Read the sequences from faf and write them into a set of files in odir.

    One file, ``bin_<i>.fna``, is created for every bin number from 0 to maxb.
    Each sequence whose ID (the header up to the first space) is in ``bins``
    is written to the file of its bin. Files that received no sequence are
    removed again at the end.

    :param faf: The source fasta file
    :param odir: the output directory (created if it does not exist)
    :param bins: the hash of contigs -> bin. The bin is used as a list index,
        so it is expected to be an int between 0 and maxb
    :param maxb: the maximum bin number
    :param verbose: more output
    :return: nada
    """

    # makedirs copes with nested paths and with the directory already existing
    os.makedirs(odir, exist_ok=True)

    paths = [os.path.join(odir, f"bin_{i}.fna") for i in range(maxb + 1)]
    outputfiles = []
    written_to = set()

    try:
        for path in paths:
            outputfiles.append(open(path, 'w'))

        for fa, seq in stream_fasta(faf, True):
            faid = fa.split(" ")[0]
            if faid not in bins:
                if verbose:
                    sys.stderr.write(f"Sequence {faid} not found in a bin\n")
                continue
            binno = bins[faid]
            outputfiles[binno].write(">{}\n{}\n".format(fa, seq))
            written_to.add(binno)
    finally:
        # never leave handles open, even when reading or writing fails
        for o in outputfiles:
            o.close()

    # drop the files for bins that turned out to be empty
    for i, path in enumerate(paths):
        if i not in written_to:
            os.remove(path)
Пример #8
0
def trim_seq(faf, start=0, end=None, seqid=None, verbose=False):
    """

    Trim a sequence from start to end and print it in fasta format.
    If start is not provided we use the first base.
    If end is not provided we trim to the end.
    If sequence ID is provided only sequences with that ID (the header up to
    the first space) are trimmed, otherwise all the sequences are trimmed.

    :param faf: fasta file
    :param start: optional start position
    :param end:  optional end position
    :param seqid: optional sequence ID to trim
    :param verbose: more output (reports up to 20 bases that follow the trim)
    :return: Nothing
    """

    for sid, seq in stream_fasta(faf):
        seqname = sid.split(" ")[0]
        if seqid and seqname != seqid:
            continue

        print(">{}\n{}".format(sid, seq[start:end]))
        if verbose:
            # With no end position nothing follows the trim, and None + 20
            # would raise a TypeError. Slicing twice also keeps a negative
            # end position correct.
            next_stretch = "" if end is None else seq[end:][:20]
            sys.stderr.write("Trimmed {}. Next stretch is {}\n".format(
                sid, next_stretch))
Пример #9
0
def print_trim(faf, start=0, end=None, seqid=None, verbose=False):
    """

    We don't actually trim the sequence, we print the first and last 20bp of
    the trimmed sequence ("Sequence 1") and of whatever follows the trim
    ("Sequence 2").
    If start is not provided we use the first base.
    If end is not provided we trim to the end, so nothing follows the trim.
    If sequence ID is provided only sequences with that ID (the header up to
    the first space) are reported, otherwise all the sequences are reported.

    :param faf: fasta file
    :param start: optional start position
    :param end:  optional end position
    :param seqid: optional sequence ID to trim
    :param verbose: more output
    :return: Nothing
    """

    for sid, seq in stream_fasta(faf):
        seqname = sid.split(" ")[0]
        if seqid and seqname != seqid:
            continue

        trimmed_seq = seq[start:end]
        # seq[None:] is the WHOLE sequence, but with no end position the trim
        # runs to the last base and there is nothing left after it
        aftertr_seq = "" if end is None else seq[end:]

        print("Sequence 1: {} ... {}\nSequence 2: {} ... {}".format(
            trimmed_seq[0:20], trimmed_seq[-20:], aftertr_seq[0:20],
            aftertr_seq[-20:]))
Пример #10
0
def read_fasta(fafile, idmapfile, minlen=100, verbose=False):
    """
    Read a fasta file and index its proteins by the md5 sum of their sequence.

    Sequences that contain a ``*`` or that are shorter than minlen are
    skipped. For every sequence that is kept a ``md5<TAB>sequence id`` line is
    appended to idmapfile.

    :param fafile: fasta file
    :param idmapfile: file that the md5 / sequence id pairs are appended to
    :param minlen: minimum protein sequence length (in amino acids) to be included
    :param verbose: more output
    :return: a dict of md5 sum (of the upper-cased sequence) and sequence
    """

    if verbose:
        sys.stderr.write(f"{bcolors.GREEN}Reading {fafile}{bcolors.ENDC}\n")

    by_digest = {}
    with open(idmapfile, 'a') as idmap:
        for header, protein in stream_fasta(fafile):
            # skip anything with a stop codon, and anything too short
            if '*' in protein or len(protein) < minlen:
                continue
            digest = hashlib.md5(protein.upper().encode('utf-8')).hexdigest()
            by_digest[digest] = protein
            idmap.write(f"{digest}\t{header}\n")

    return by_digest
Пример #11
0
def read_sequence(conf, verbose=False):
    """
    Load every record of the contigs file for this genome into memory.

    If an ID occurs more than once the last sequence with that ID wins.

    :param conf: the contigs file
    :param verbose: unused
    :return: a dict of contig/sequence
    """

    return {
        contig: sequence
        for contig, sequence in stream_fasta(conf, whole_id=False)
    }
Пример #12
0
def coding_versus_noncoding(sample, contigs, orfs, outputfile):
    """
    Count the number of coding vs. noncoding bases

    The summed sequence length of the orfs file is compared with the summed
    sequence length of the contigs file, and one tab-separated line is
    written to outputfile.

    :param sample: the sample name, written as the first column
    :param contigs: fasta file of the DNA sequences
    :param orfs: fasta file of the ORFs
    :param outputfile: the file to write (it is overwritten)
    :return: None
    """
    sys.stderr.write(
        f"{bcolors.GREEN}Counting coding vs. non coding bases{bcolors.ENDC}\n")

    # total length of the DNA sequences
    dnalen = sum(len(seq) for _, seq in stream_fasta(contigs))

    # total length of the ORF sequences
    # NOTE(review): the column is labelled "coding bp"; if orfs holds amino
    # acid sequences this length is in residues, not bases -- confirm
    protlen = sum(len(seq) for _, seq in stream_fasta(orfs))

    # An empty contigs file must not raise ZeroDivisionError; compute the
    # fraction before the output file is opened so it is never left half
    # written.
    fraction = protlen / dnalen if dnalen else 0.0

    with open(outputfile, 'w') as out:
        out.write(f"{sample}\tCoding vs non coding\t")
        out.write("[coding bp, total bp, fraction coding]\t")
        out.write(f"{protlen}\t{dnalen}\t{fraction}\n")
def count_len(fastaf, verbose=False):
    """
    Tally how many sequences there are of each length.

    :param fastaf: fasta file
    :param verbose: unused
    :return: a dict of len:count
    """

    tally = {}
    for _, sequence in stream_fasta(fastaf):
        size = len(sequence)
        if size in tally:
            tally[size] += 1
        else:
            tally[size] = 1

    return tally
Пример #14
0
def seq_lengths(fafile, verbose=False):
    """
    Measure every sequence in a fasta file.

    If an ID occurs more than once the length of the last one is kept.

    :param fafile: the fasta file to read
    :param verbose: unused
    :return: a dict of sequence id and length
    """

    return {seqid: len(sequence) for seqid, sequence in stream_fasta(fafile)}
Пример #15
0
def length_filter(f, l, verbose=False):
    """
    Print, in fasta format, the sequences that are at least l long.

    :param f: fasta file
    :param l: minimum sequence length
    :param verbose: unused
    :return: None
    """

    for header, sequence in stream_fasta(f, True):
        if len(sequence) >= l:
            print(">{}\n{}".format(header, sequence))
Пример #16
0
def count_len(fastaf, verbose=False):
    """
    Build a histogram of sequence lengths for a fasta file.

    :param fastaf: fasta file
    :param verbose: unused
    :return: a dict of len:count
    """

    histogram = {}
    for _, sequence in stream_fasta(fastaf):
        size = len(sequence)
        try:
            histogram[size] += 1
        except KeyError:
            # first sequence of this length
            histogram[size] = 1

    return histogram
Пример #17
0
def introduce_break(fastaf, lsgenes, ssgenes, numbp, verbose=False):
    """
    Rotate each contig so that it starts numbp upstream of the large subunit
    (or of the small subunit when that lies even further upstream), and print
    the result in fasta format.

    Only contigs that appear in lsgenes are printed. A contig is left as it
    is when the computed break point is not positive. Contigs whose large
    subunit is on the -ve strand are reverse complemented after rotation.

    :param fastaf: fasta file to break
    :param lsgenes: dict of ls genes: contig id -> {'+ve' or '-ve': positions},
        where index 0 and index 1 of the positions are read
    :param ssgenes: dict of ss genes, same layout as lsgenes
    :param numbp: how far upstream to break the DNA
    :param verbose: more output
    :return: None
    """

    for header, sequence in stream_fasta(fastaf):
        contig = header.split(" ")[0]
        if verbose:
            sys.stderr.write(f"Looking for {contig}\n")

        # contigs without a large subunit are not printed at all
        if contig not in lsgenes:
            continue

        large = lsgenes[contig]
        cut = 0
        revcomp = False

        if '+ve' in large:
            # + strand: break numbp before the first position
            cut = large['+ve'][0] - numbp
            if contig in ssgenes and '+ve' in ssgenes[contig]:
                small_first = ssgenes[contig]['+ve'][0]
                if small_first - numbp < cut:
                    # the small subunit is further upstream, break before it
                    cut = small_first - numbp
                    if verbose:
                        sys.stderr.write(
                            "Because we have a small subunit at {} we moved the start\n"
                            .format(small_first))
        elif '-ve' in large:
            # - strand: break numbp after the second position
            revcomp = True
            cut = large['-ve'][1] + numbp
            if contig in ssgenes and '-ve' in ssgenes[contig]:
                small = ssgenes[contig]['-ve']
                if small[1] + numbp > cut:
                    cut = small[1] + numbp
                    if verbose:
                        sys.stderr.write(
                            "Because we have a small subunit at {} we moved the start\n"
                            .format(small[0]))
        else:
            sys.stderr.write("No terminase found in {}\n".format(contig))

        rotated = sequence
        if cut > 0:
            sys.stderr.write("Reformating the sequence {}\n".format(contig))
            rotated = sequence[cut:] + sequence[:cut]
            if revcomp:
                rotated = rc(rotated)
        print(">{}\n{}".format(header, rotated))
Пример #18
0
def get_reps(faf, outf, verbose=False):
    """
    Find the repeats in every sequence and write their coordinates.

    One tab-separated line is written per repeat: first start, first end,
    second start, second end. The sequence ID is not written.

    :param faf: fasta file
    :param outf: output file (it is overwritten)
    :param verbose: unused
    :return: None
    """

    with open(outf, 'w') as out:
        for _, sequence in stream_fasta(faf):
            for rep in RobRepeatFinder.repeatFinder(sequence, 3):
                first_start = rep['first_start']
                first_end = rep['first_end']
                second_start = rep['second_start']
                second_end = rep['second_end']
                out.write(
                    f"{first_start}\t{first_end}\t{second_start}\t{second_end}\n"
                )
Пример #19
0
def count_kmers(faf, type, k, jsonout=None, verbose=False):
    """
    Count the kmers

    Every kmer of every sequence is counted, together with the kmer at the
    same offset of the reverse complement of that sequence.

    :param faf: fasta file
    :param type: str either fasta or fastq. Any other value reads nothing and
        gives an empty result
    :param k: kmer size
    :param jsonout: optional file to write the counts to, as {faf: kmers}
    :param verbose: more output
    :return: a dict of kmers
    """

    if verbose:
        sys.stderr.write(f"{bcolors.GREEN}Counting kmers (k={k}) in {faf}\n")

    # both formats are reduced to a stream of sequences so that the counting
    # code exists only once
    if type == "fasta":
        seqs = (seq for _, seq in stream_fasta(faf))
    elif type == "fastq":
        seqs = (seq for _, _, seq, _ in stream_fastq(faf))
    else:
        seqs = ()

    kmers = {}
    for seq in seqs:
        rcseq = rc(seq)
        # a sequence of length n has n - k + 1 kmers; the previous bound of
        # n - k - 1 silently skipped the last two of them
        for posn in range(len(seq) - k + 1):
            fwd = seq[posn:posn + k]
            rev = rcseq[posn:posn + k]
            kmers[fwd] = kmers.get(fwd, 0) + 1
            kmers[rev] = kmers.get(rev, 0) + 1

    if jsonout:
        if verbose:
            sys.stderr.write(f"{bcolors.BLUE}\tWriting to {jsonout}\n")
        with open(jsonout, 'w') as out:
            json.dump({faf: kmers}, out)

    if verbose:
        sys.stderr.write(
            f"{bcolors.BLUE}\tDone counting kmers (k={k}) in {faf}\n")

    return kmers
Пример #20
0
def introduce_break(fastaf, lsgenes, ssgenes, numbp, verbose=False):
    """
    Print every contig that has a large subunit, rotated so that it begins
    numbp upstream of the large subunit, or of the small subunit when that
    one is further upstream.

    Contigs that are not in lsgenes are skipped. The sequence is only rotated
    when the break point is greater than zero, and it is reverse complemented
    after rotation when the large subunit is on the -ve strand.

    :param fastaf: fasta file to break
    :param lsgenes: dict of ls genes: contig id -> {'+ve' or '-ve': positions},
        where index 0 and index 1 of the positions are read
    :param ssgenes: dict of ss genes, same layout as lsgenes
    :param numbp: how far upstream to break the DNA
    :param verbose: more output
    :return: None
    """

    moved = "Because we have a small subunit at {} we moved the start\n"

    for seqdef, seq in stream_fasta(fastaf):
        name = seqdef.split(" ")[0]
        if verbose:
            sys.stderr.write(f"Looking for {name}\n")

        if name not in lsgenes:
            # nothing is printed for contigs without a large subunit
            continue

        ls = lsgenes[name]
        has_ss = name in ssgenes
        offset = 0
        flip = False

        if '+ve' in ls:
            # forward strand: go numbp before the first position
            offset = ls['+ve'][0] - numbp
            if has_ss and '+ve' in ssgenes[name]:
                ss_first = ssgenes[name]['+ve'][0]
                if ss_first - numbp < offset:
                    offset = ss_first - numbp
                    if verbose:
                        sys.stderr.write(moved.format(ss_first))
        elif '-ve' in ls:
            # reverse strand: go numbp past the second position
            flip = True
            offset = ls['-ve'][1] + numbp
            if has_ss and '-ve' in ssgenes[name]:
                ss = ssgenes[name]['-ve']
                if ss[1] + numbp > offset:
                    offset = ss[1] + numbp
                    if verbose:
                        sys.stderr.write(moved.format(ss[0]))
        else:
            sys.stderr.write("No terminase found in {}\n".format(name))

        result = seq
        if offset > 0:
            sys.stderr.write("Reformating the sequence {}\n".format(name))
            result = seq[offset:] + seq[0:offset]
            if flip:
                result = rc(result)
        print(">{}\n{}".format(seqdef, result))
Пример #21
0
def parse_contigs(locations, gdir, odir):
    """
    Parse the contigs file and write the prophage regions.

    For every contig that is in ``locations`` each region ``seq[start:end]``
    is written to ``<odir>/pp.fasta`` with the header
    ``><ppid> <contig>_<start>_<end>``. Contigs without locations are skipped.

    :param locations: the locations hash from the phispy directory. Each
        contig ID maps to an iterable of (start, end, ppid) triples
    :param gdir: the genome directory that contains the contigs file
    :param odir: the output directory
    :return: None
    """

    # the with block closes the file even if reading or writing fails
    with open(os.path.join(odir, "pp.fasta"), 'w') as out:
        for contig, seq in stream_fasta(os.path.join(gdir, "contigs")):
            if contig not in locations:
                continue
            for start, end, ppid in sorted(locations[contig], key=itemgetter(0)):
                out.write(">{} {}_{}_{}\n{}\n".format(ppid, contig, start, end, seq[start:end]))
Пример #22
0
def parse_contigs(locations, gdir, odir):
    """
    Parse the contigs file and write the prophage regions.

    For every contig that is in ``locations`` each region ``seq[start:end]``
    is written to ``<odir>/pp.fasta`` with the header
    ``><ppid> <contig>_<start>_<end>``. Contigs without locations are skipped.

    :param locations: the locations hash from the phispy directory. Each
        contig ID maps to an iterable of (start, end, ppid) triples
    :param gdir: the genome directory that contains the contigs file
    :param odir: the output directory
    :return: None
    """

    # the with block closes the file even if reading or writing fails
    with open(os.path.join(odir, "pp.fasta"), 'w') as out:
        for contig, seq in stream_fasta(os.path.join(gdir, "contigs")):
            if contig not in locations:
                continue
            for start, end, ppid in sorted(locations[contig],
                                           key=itemgetter(0)):
                out.write(">{} {}_{}_{}\n{}\n".format(ppid, contig, start, end,
                                                      seq[start:end]))
Пример #23
0
def count_pairwise_no_fn(faf, kmer, verbose=True):
    """
    Count every amino acid kmer in every protein, keyed by the whole
    sequence ID, and divide each count by the length of the protein.

    :param faf: fasta file
    :param kmer: kmer size
    :param verbose: more output
    :return: a dict of sequence id -> {kmer: count / protein length}
    """

    if verbose:
        sys.stderr.write(f"{bcolors.GREEN}Reading {faf}{bcolors.ENDC}\n")

    count = {}
    for sid, seq in stream_fasta(faf, whole_id=True):
        seqlen = len(seq)
        tally = {}
        count[sid] = tally
        # one window for every start position that leaves room for a kmer
        for begin in range(seqlen - (kmer - 1)):
            window = seq[begin:begin + kmer]
            tally[window] = tally.get(window, 0) + 1
        # normalize by protein length
        for window in tally:
            tally[window] /= seqlen
    return count
Пример #24
0
def split_contigs(inf, outf, length, minlen, verbose=False):
    """
    Split the contigs into consecutive pieces of at most ``length`` bases.

    The pieces of a contig are named ``<id>_<n>``, where id is the header up
    to the first space and n counts from 1. Any description that followed the
    ID in the original header is kept after the new ID. The last piece of a
    contig is only written if it is at least minlen long.

    :param inf: input fasta file
    :param outf: output fasta file
    :param length: length to split into (must be at least 1)
    :param minlen: minimum length that a contig must be to be printed
    :param verbose: more output
    :return: None
    :raises ValueError: if length is less than 1
    """

    if length < 1:
        # a non-positive length would never advance through the sequence
        raise ValueError(f"length must be at least 1, not {length}")

    def header(name, counter, description):
        # The description is only appended when there is one. The old test
        # was inverted, which dropped every description and left a trailing
        # space after bare IDs.
        if description:
            return ">{}_{} {}\n".format(name, counter, description)
        return ">{}_{}\n".format(name, counter)

    with open(outf, 'w') as out:
        for seqid, seq in stream_fasta(inf, True):
            if verbose:
                sys.stderr.write("{}\n".format(seqid))
            name, _, description = seqid.partition(" ")
            posn = 0
            seqcounter = 0
            while posn < len(seq) - length:
                seqcounter += 1
                out.write(header(name, seqcounter, description))
                out.write("{}\n".format(seq[posn:posn + length]))
                posn += length

            # whatever is left over is the last piece
            seqcounter += 1
            if len(seq) - posn < minlen:
                continue
            out.write(header(name, seqcounter, description))
            out.write("{}\n".format(seq[posn:]))
Пример #25
0
def count_pairwise(faf, kmer, verbose=True):
    """
    Count all amino acid kmers in every protein, normalised by the length of
    the protein, and collect the function of every protein from its header.

    The header is split at the first space into the sequence ID and the
    function. Anything from the first ``[`` of the function onwards (the
    organism name) is removed. A header without a space gives an empty
    function.

    :param faf: fasta file
    :param kmer: kmer size
    :param verbose: more output
    :return: a tuple of two dicts: sequence id -> {kmer: count / protein
        length}, and sequence id -> function
    """

    if verbose:
        sys.stderr.write(f"{bcolors.GREEN}Reading {faf}{bcolors.ENDC}\n")

    count = {}
    fns = {}
    for sidf, seq in stream_fasta(faf, whole_id=True):
        # split the sequence id into sequence and function
        # may need to provide an alternate way to do this
        # partition never raises, so headers without a space are fine
        sid, _, function = sidf.partition(" ")

        # note we also strip out [organism name]
        bracket = function.find("[")
        if bracket != -1:
            function = function[:bracket].rstrip()
        fns[sid] = function

        tally = {}
        count[sid] = tally
        for posn in range(len(seq) - (kmer - 1)):
            piece = seq[posn:posn + kmer]
            tally[piece] = tally.get(piece, 0) + 1
        # normalize by protein length
        for aa in tally:
            tally[aa] /= len(seq)
    if verbose:
        sys.stderr.write(f"{bcolors.GREEN}Read {faf}{bcolors.ENDC}\n")

    return count, fns
Пример #26
0
def trim_seq(faf, start=0, end=None, seqid=None, verbose=False):
    """

    Trim a sequence from start to end and print it in fasta format.
    If start is not provided we use the first base.
    If end is not provided we trim to the end.
    If sequence ID is provided only sequences with that ID (the header up to
    the first space) are trimmed, otherwise all the sequences are trimmed.

    :param faf: fasta file
    :param start: optional start position
    :param end:  optional end position
    :param seqid: optional sequence ID to trim
    :param verbose: more output (reports up to 20 bases that follow the trim)
    :return: Nothing
    """

    for sid, seq in stream_fasta(faf):
        seqname = sid.split(" ")[0]
        if seqid and seqname != seqid:
            continue

        print(">{}\n{}".format(sid, seq[start:end]))
        if verbose:
            # With no end position nothing follows the trim, and None + 20
            # would raise a TypeError. Slicing twice also keeps a negative
            # end position correct.
            next_stretch = "" if end is None else seq[end:][:20]
            sys.stderr.write("Trimmed {}. Next stretch is {}\n".format(sid, next_stretch))
    # read the directories
    for phispydir in os.listdir(args.p):
        if not os.path.exists(os.path.join(args.s, phispydir)):
            if args.v:
                sys.stderr.write(
                    'A seed directory matching the phispy directory {} was not found. Skipped\n'
                    .format(phispydir))
            continue

        # read the phages
        phageseqs = []
        for phagefile in [
                x for x in os.listdir(os.path.join(args.p, phispydir))
                if x.endswith('.fasta')
        ]:
            for pid, phagecontig in roblib.stream_fasta(
                    os.path.join(args.p, phispydir, phagefile)):
                phageseqs.append(phagecontig)

        # sort the phages longest to smallest
        phageseqs = sorted(phageseqs, key=len, reverse=True)

        # now read the sequences and split out on phageseqs
        if not os.path.exists(os.path.join(args.s, phispydir, "contigs")):
            sys.stderr.write(
                "Error: no contigs file was found at {}. Skipped\n".format(
                    os.path.join(args.s, phispydir, "contigs")))

        os.mkdir(os.path.join(args.o, phispydir))
        out = open(os.path.join(args.o, phispydir, "contigs"), 'w')

        for gid, genomecontig in roblib.stream_fasta(
def count_kmers(dir, kmersize, outdir, verbose=False):
    """
    Count the kmers in all the fasta files, work out which sequences should
    be reverse complemented so that they agree with the others, and write
    every file again to outdir with those sequences reverse complemented.

    :param dir: the directory of fasta files (.fasta, .fna or .fa)
    :param kmersize: the kmer size
    :param outdir: the output directory (created if it does not exist)
    :param verbose: more output
    :return: None
    """

    bases = {'A', 'C', 'T', 'G'}
    fcount = {}
    rcount = {}

    kmers = ["".join(s) for s in product(bases, repeat=kmersize)]
    # reverse complement each kmer once, not once per sequence
    rckmers = [rc(k) for k in kmers]
    # the dots are escaped so that only real extensions match: an unescaped
    # "." also accepted names such as "datafa"
    fare = re.compile(r'\.fasta$|\.fna$|\.fa$')
    for f in filter(fare.search, os.listdir(dir)):
        # honour the verbose parameter rather than a global args object
        if verbose:
            sys.stderr.write(f"{bcolors.GREEN}Reading {f}{bcolors.ENDC}\n")
        for seqid, seq in stream_fasta(os.path.join(dir, f)):
            # NOTE: str.count() counts non-overlapping occurrences
            fcount[seqid] = [seq.count(k) for k in kmers]
            rcount[seqid] = [seq.count(k) for k in rckmers]

    # now we have a hash with all the fwd counts and fwd->rev counts. Which is closer
    dists = {}
    for s in fcount:
        dists[s] = {}
        for t in fcount:
            distf = distance.euclidean(fcount[s], fcount[t])
            distr = distance.euclidean(fcount[s], rcount[t])
            deltad = distf / (distr + 0.000001)  # add epsilon in case distr = 0

            # To print the results as a list instead:
            # print("\t".join(map(str, [s, t, distf, distr, deltad])))

            # deltad is the ratio of the distance between the fwd kmer
            # counts to the distance between the fwd and the reverse kmer
            # counts. When deltad == 0, the sequences are the same.
            # When deltad is 1 the sequences are random.
            # When deltad is very high (>2 but sometimes much bigger) the
            # sequences are the reverse complement.
            #
            # We set:
            # -1 -- they are the same
            #  0 -- no relationship
            #  1 -- need reverse complementing
            if deltad < 0.5:
                dists[s][t] = -1
            elif deltad > 2:
                dists[s][t] = 1
            else:
                dists[s][t] = 0

    # now we just need to figure out which ones minimize the score by reverse complementing them
    # we are going to do this with pandas. I have a jupyter notebook (calculate_which_to_rc) about
    # this
    df = pd.DataFrame.from_dict(dists)

    # There are three ways to control this loop:
    #  - run it for n (100) times
    #  - run it until we see the same thing again
    #  - run it while we are always improving.
    #
    # It probably doesn't matter which one, we just run it 100 times.
    # We also check whether we are just flipping the same thing.
    seen = {}
    maxiters = 100
    lastflip = None
    for _ in range(maxiters):
        ## what is the top one to change
        idx = df.sum(axis=1).sort_values(ascending=False).head(2).index
        nm = idx[0]
        if nm == lastflip:
            # do not flip the same sequence twice in a row
            nm = idx[1]
        lastflip = nm

        seen[nm] = seen.get(nm, 0) + 1
        # flipping a sequence negates both its row and its column
        df.loc[nm] *= -1
        df[nm] *= -1

    # finally, we need to know which sequences to reverse complement. Only those where seen is an odd number!
    torc = {s for s in seen if seen[s] % 2}

    if verbose:
        sys.stderr.write(
            f"{bcolors.BLUE}Will reverse complement: {torc}{bcolors.ENDC}\n")

    # and now we read all the fasta files again and put them in output directory
    os.makedirs(outdir, exist_ok=True)

    for f in filter(fare.search, os.listdir(dir)):
        if verbose:
            sys.stderr.write(
                f"{bcolors.GREEN}Re-reading to check for rc {f}{bcolors.ENDC}\n"
            )
        with open(os.path.join(outdir, f), 'w') as out:
            for seqid, seq in stream_fasta(os.path.join(dir, f)):
                if seqid in torc:
                    out.write(f">{seqid}_rc\n{rc(seq)}\n")
                else:
                    out.write(f">{seqid}\n{seq}\n")
Пример #29
0
"""
Print the IDs and lengths of sequences in a fasta file
"""

import os
import sys
import argparse
from roblib import stream_fasta

# Command line: -f is the fasta file to read, -w keeps the whole header as
# the sequence ID, -v is accepted for verbose output.
parser = argparse.ArgumentParser(
    description="Print the lengths of sequences in a fasta file")
parser.add_argument('-f', required=True, help='fasta file')
parser.add_argument(
    '-w',
    action="store_true",
    default=False,
    help='whole sequence ID. Default is to use ID upto whitespace')
parser.add_argument('-v', action="store_true", help='verbose output')
args = parser.parse_args()

# one "ID<TAB>length" line per sequence
for seqid, seq in stream_fasta(args.f, args.w):
    print(seqid, len(seq), sep="\t")
Пример #30
0
"""

import os
import sys
import argparse
from roblib import stream_fasta, colours

__author__ = 'Rob Edwards'
__copyright__ = 'Copyright 2020, Rob Edwards'
__credits__ = ['Rob Edwards']
__license__ = 'MIT'
__maintainer__ = 'Rob Edwards'
__email__ = '*****@*****.**'

if __name__ == '__main__':
    # Write every sequence of the fasta file to its own file in the output
    # directory, named after the sequence ID up to the first space.
    parser = argparse.ArgumentParser(description=" ")
    parser.add_argument('-f', required=True, help='fasta file')
    parser.add_argument('-d', required=True, help='output directory')
    parser.add_argument('-v', action='store_true', help='verbose output')
    args = parser.parse_args()

    outdir = args.d
    if not os.path.exists(outdir):
        os.makedirs(outdir, exist_ok=True)

    for header, sequence in stream_fasta(args.f, True):
        sname = header.split(" ", 1)[0]
        if args.v:
            sys.stderr.write(f"{colours.GREEN}Writing {sname}{colours.ENDC}\n")
        target = os.path.join(outdir, f"{sname}.fasta")
        with open(target, 'w') as out:
            # the full header is kept inside the file
            out.write(f">{header}\n{sequence}\n")
Пример #31
0
    parser.add_argument('-f', help='fasta DNA sequence file', required=True)
    parser.add_argument('-o', help='ORFs file', required=True)
    parser.add_argument(
        '-s',
        help=
        'sources file that has the source of the ORF calls (default = sources.txt)',
        default="sources.txt")
    parser.add_argument('-v', help='verbose output', action='store_true')
    args = parser.parse_args()

    sources = read_sources(args.s)

    # generate the six frame translations
    seqs = {}
    lengths = {}
    for seqid, seq in stream_fasta(args.f, False):
        seq = seq.upper()
        lengths[seqid] = len(seq)
        seqs[seqid] = {}
        seqs[seqid]["f1"] = translate_dna(seq, args.v)
        seqs[seqid]["f2"] = translate_dna(seq[1:], args.v)
        seqs[seqid]["f3"] = translate_dna(seq[2:], args.v)
        rcseq = rc(seq)
        seqs[seqid]["r1"] = translate_dna(rcseq, args.v)
        seqs[seqid]["r2"] = translate_dna(rcseq[1:], args.v)
        seqs[seqid]["r3"] = translate_dna(rcseq[2:], args.v)

    for orfid, orf in stream_fasta(args.o):
        for s in seqs:
            for fr in ["f1", "f2", "f3", "r1", "r2", "r3"]:
                if orf in seqs[s][fr]:
Пример #32
0
NOTE: This is faster (not really fast), but only allows you to run one fasta
"""

import os
import sys
import argparse
import hashlib
from roblib import stream_fasta, bcolors

if __name__ == '__main__':
    # De-duplicate a fasta file on the md5 sum of the upper-cased sequence.
    # The output file holds one record per distinct sequence, named by its
    # md5 sum. The id map file holds an "md5<TAB>sequence id" line for every
    # distinct sequence ID that was read.
    parser = argparse.ArgumentParser(description=" ")
    # -f is required: without it None was handed to stream_fasta
    parser.add_argument('-f', help='fasta file', required=True)
    parser.add_argument('-i', help='id map file to write', required=True)
    parser.add_argument('-o', help='output file', required=True)
    args = parser.parse_args()

    md5seen = set()
    idseen = set()

    with open(args.o, 'w') as out, open(args.i, 'w') as idout:
        for seqid, seq in stream_fasta(args.f):
            # a sequence ID is only processed the first time it is seen
            if seqid in idseen:
                continue
            idseen.add(seqid)
            md5 = hashlib.md5(seq.upper().encode('utf-8')).hexdigest()
            idout.write(f"{md5}\t{seqid}\n")
            # each distinct sequence is written once
            if md5 not in md5seen:
                md5seen.add(md5)
                out.write(f">{md5}\n{seq}\n")
    if not os.path.exists(args.o):
        os.mkdir(args.o)

    # read the directories
    for phispydir in os.listdir(args.p):
        if not os.path.exists(os.path.join(args.s, phispydir)):
            if args.v:
                sys.stderr.write(
                    'A seed directory matching the phispy directory {} was not found. Skipped\n'.format(phispydir))
            continue

        # read the phages
        phageseqs = []
        for phagefile in [x for x in os.listdir(os.path.join(args.p, phispydir)) if x.endswith('.fasta')]:
            for pid, phagecontig in roblib.stream_fasta(os.path.join(args.p, phispydir, phagefile)):
                phageseqs.append(phagecontig)

        # sort the phages longest to smallest
        phageseqs = sorted(phageseqs, key=len, reverse=True)

        # now read the sequences and split out on phageseqs
        if not os.path.exists(os.path.join(args.s, phispydir, "contigs")):
            sys.stderr.write(
                "Error: no contigs file was found at {}. Skipped\n".format(os.path.join(args.s, phispydir, "contigs")))

        os.mkdir(os.path.join(args.o, phispydir))
        out = open(os.path.join(args.o, phispydir, "contigs"), 'w')

        for gid, genomecontig in roblib.stream_fasta(os.path.join(args.s, phispydir, "contigs")):
            contigcount = 0