예제 #1
0
def rm_degenerates(inputfile):
    """Trim non-standard nucleotides from both ends of every sequence.

    Each record of ``FastaList(inputfile).seq_list`` is split into its first
    line (the id) and second line (the sequence).  The sequence is
    upper-cased and leading/trailing characters that are not one of
    A, C, G, T or U are removed.  The trimmed records are written to the
    module-level ``tmpfile``.

    Records whose sequence holds no standard nucleotide at all are skipped;
    the earlier index-based trimming raised ``IndexError`` on such records.

    Args:
        inputfile: passed unchanged to ``FastaList``.

    Returns:
        ``tmpfile.name`` of the module-level temporary file.
    """
    standard_nts = frozenset('ACGTU')
    fl1 = FastaList(inputfile)
    for item in fl1.seq_list:
        fields = item.split('\n')
        seq_id = fields[0]
        seq_seq = fields[1].upper()
        # Move two cursors inwards instead of re-slicing the string for every
        # removed character; the bounds checks make empty results safe.
        start = 0
        end = len(seq_seq)
        while start < end and seq_seq[start] not in standard_nts:
            start += 1
        while end > start and seq_seq[end - 1] not in standard_nts:
            end -= 1
        if start == end:
            # Nothing but degenerate characters (or an empty sequence).
            continue
        tmpfile.write(seq_id + '\n' + seq_seq[start:end] + '\n')
    # Rewind so the file is positioned at its start for the next user.
    tmpfile.seek(0)
    return tmpfile.name
예제 #2
0
def rm_degenerates(inputfile):
    """Trim sequence ends and keep sequences with few degenerate characters.

    Each record of ``FastaList(inputfile).seq_list`` is split into its first
    line (the id) and second line (the sequence).  The sequence is
    upper-cased and leading/trailing characters that are not one of
    A, C, G, T or U are removed.  A trimmed record is written to the
    module-level ``tmpfile`` only when the fraction of its characters that
    are not A, C, G, T or U is at most ``ARGS.q``.

    Records whose sequence holds no standard nucleotide at all are skipped;
    the earlier index-based trimming raised ``IndexError`` on such records.

    Args:
        inputfile: passed unchanged to ``FastaList``.

    Returns:
        ``tmpfile.name`` of the module-level temporary file.
    """
    standard_nts = frozenset('ACGTU')
    fl1 = FastaList(inputfile)
    for item in fl1.seq_list:
        fields = item.split('\n')
        seq_id = fields[0]
        seq_seq = fields[1].upper()
        # Move two cursors inwards instead of re-slicing the string for every
        # removed character; the bounds checks make empty results safe.
        start = 0
        end = len(seq_seq)
        while start < end and seq_seq[start] not in standard_nts:
            start += 1
        while end > start and seq_seq[end - 1] not in standard_nts:
            end -= 1
        if start == end:
            # Nothing but degenerate characters (or an empty sequence);
            # skipping also avoids a division by zero below.
            continue
        seq_seq = seq_seq[start:end]
        nr_standard = (seq_seq.count('A') + seq_seq.count('T') +
                       seq_seq.count('G') + seq_seq.count('C') +
                       seq_seq.count('U'))
        # Share of characters that are NOT standard nucleotides.
        fraction_deg = 1 - nr_standard / len(seq_seq)
        if fraction_deg <= ARGS.q:
            tmpfile.write(seq_id + '\n' + seq_seq + '\n')
    # Rewind so the file is positioned at its start for the next user.
    tmpfile.seek(0)
    return tmpfile.name
예제 #3
0
if __name__ == "__main__":
    # Command-line interface: number of workers (-p), target (-t) and source
    # (-s) fasta files and the length of the extracted sequence (-l).
    PARSER = argparse.ArgumentParser(description='Finds target sequences from\
    a target (-t) fasta file in a source fasta file (-s)')
    PARSER.add_argument('-p',
                        type=int,
                        help='nr of processor threads',
                        default=1)
    PARSER.add_argument('-t', type=str, help='target fastafile', required=True)
    PARSER.add_argument('-s', type=str, help='source fastafile', required=True)
    PARSER.add_argument('-l',
                        type=int,
                        help='the length of the extracted seq',
                        default=150)
    ARGS = PARSER.parse_args()
    # Source records plus whatever FA_S.rev_comp() returns
    # (presumably the reverse complements -- confirm in FastaList).
    FA_S = FastaList(ARGS.s)
    ALL_S = FA_S.seq_list + FA_S.rev_comp()
    FA_T = FastaList(ARGS.t)
    # NOTE: 'aivcs.fa' is a hard-coded name resolved against the current
    # working directory.
    FA_CS = FastaList('aivcs.fa')
    # One entry of fa_t_div is handed to each worker below, so divide() is
    # expected to return at least ARGS.p parts -- confirm in FastaList.
    fa_t_div = FA_T.divide(ARGS.p)
    # Second line of every record in FA_CS, with surrounding whitespace
    # removed.
    aivcs = []
    for seq in FA_CS.seq_list:
        aivcs.append(seq.split('\n')[1].strip())
    # NOTE(review): the output file is not closed within the visible part of
    # this script.
    FA_OUT = open('sources.fa', 'w')
    manager = Manager()  # Multiprocessing manager
    # List shared between the worker processes; passed as their first
    # argument.
    res_lst = manager.list()
    processes = []
    # Although -p is described as "threads", one Process is started per
    # requested worker, each with its own part of the target sequences.
    for i in range(ARGS.p):
        p = Process(target=process_work, args=(res_lst, fa_t_div[i]))
        processes.append(p)
        p.start()
예제 #4
0
def main():
    """Split the sequences of every input file into per-primer-pair files.

    For each accepted file in ``ARGS.id`` the sequences are loaded with
    ``FastaList`` and passed through ``reduce_fa``.  A record is appended to
    ``ARGS.od + <primer id without its last three characters> + '.fa'`` when
    it contains both members of a primer pair (one primer as given and the
    other one taken from ``primer_fa.seq_list_revc()``, in either
    combination).  When ``ARGS.f`` is greater than zero, a record whose count
    is less than ``ARGS.f`` times the count of the first record matched for
    that marker is not written.

    Reads the module-level ``ARGS`` and ``primer_fa`` and writes a log file
    into ``ARGS.od``.
    """
    # Writes log-file (named after this script, '.py' replaced by '.log')
    # NOTE(review): logfile is not closed within the visible part of main().
    logfile = open(ARGS.od + os.path.basename(__file__).
                   replace('.py', '.log'), 'w')
    logfile.write('Log for: {} at {}\nUser: {}\n\n'.format(os.path.basename(
        __file__), str(datetime.datetime.now()).split('.')[0],
        getpass.getuser()))
    logfile.write('Minimum sequence length = {}\n'.format(ARGS.m))
    logfile.write('Minimum nr of sequences = {}\n'.format(ARGS.c))
    logfile.write('Minimum fraction of most abundant sequence = {}\n\n'.format(
        ARGS.f))

    # Loop over regular files in the input directory (ARGS.id) whose name
    # ends with .fa, .fastq or .gz; all other entries are skipped
    filelst = [name for name in os.listdir(ARGS.id) if
               os.path.isfile(ARGS.id + name) and (name.endswith('.fa') or
                                                   name.endswith('.fastq') or
                                                   name.endswith('.gz'))]
    nr_of_files = len(filelst)
    file_nr = 1
    for seqfile in filelst:
        # '\r' rewrites the same terminal line as a progress indicator
        print('\rprocessing file {}/{}'.format(file_nr, nr_of_files), end=" ")
        inp_seq = FastaList(ARGS.id + seqfile)
        init_seq = inp_seq.nr_seq  # The initial number of seqs in fasta-file
        seq_fa = reduce_fa(inp_seq)
        logfile.write('{}: Read {} sequences. '.format(seqfile, init_seq))
        nr_seq_demult = 0  # Number of records written for this input file
        primerlist = primer_fa.seq_list
        primerlist_rc = primer_fa.seq_list_revc()
        marker_maxcount = dict()
        fraction = 0
        if ARGS.f > 0:
            # Assumes three characters at the end of the primer id indicating
            # forward or reverse. Initialize the dict containing the counts
            # for the sequence for each primer pair with the highest count
            # TODO: Try to remove dependence on specific primer names in
            #  primer-file
            for primerid in primer_fa.id_list[::2]:
                marker_maxcount[primerid[:-3]] = 0

        for seq in range(seq_fa.nr_seq):
            seqname = seq_fa.seq_list[seq].split('\n')[0]
            # The count is the integer found after the first ':' of the
            # record name, up to the following '_'
            seqcount = int(seqname.split(':')[1].split('_')[0])
            seqseq = seq_fa.seq_list[seq].split('\n')[1]
            # Primers are stored pairwise: index primer and primer + 1
            for primer in range(0, primer_fa.nr_seq, 2):
                test_primers1 = [primerlist[primer+1].split('\n')[1],
                                 primerlist_rc[primer].split('\n')[1]]
                test_primers2 = [primerlist[primer].split('\n')[1],
                                 primerlist_rc[primer + 1].split('\n')[1]]
                # NOTE: the substring test runs on the whole record (name
                # line included), not only on the sequence line
                if all(x in seq_fa.seq_list[seq] for x in test_primers1) or\
                        all(x in seq_fa.seq_list[seq] for x in test_primers2):
                    if ARGS.f > 0:
                        # Since seq_fa.seq_list is ordered with respect to count
                        # the first occurrence in the list of a marker sequence
                        # will have the highest count
                        # NOTE(review): the key here is built with [1:-3] from
                        # seq_list while the dict was initialized with [:-3]
                        # from id_list -- confirm both yield the same key
                        if marker_maxcount[primerlist[primer].split('\n')[0][1:-3]] == 0:
                            marker_maxcount[primerlist[primer].split('\n')[0][1:-3]] =\
                                seqcount
                        fraction = seqcount / marker_maxcount[
                            primerlist[primer].split('\n')[0][1:-3]]
                        # Filters on fraction of most abundant sequence for the
                        # marker; break also skips the remaining primer pairs
                        # for this record
                        if fraction < ARGS.f:
                            break
                    # Append to the per-marker output file
                    with open(ARGS.od + primer_fa.id_list[primer][:-3] + '.fa', 'a')\
                            as fi:
                        if ARGS.f > 0:
                            newseq = '>{}_fraction:{}\n{}\n'.format(
                                seqname, round(fraction, 3), seqseq)
                            fi.write(newseq)
                            nr_seq_demult += 1
                        else:
                            fi.write('>' + seq_fa.seq_list[seq])
                            nr_seq_demult += 1
                    # NOTE: redundant, the with-statement already closed fi
                    fi.close()
        # NOTE(review): nr_seq is forced to 1 for empty inputs; its use is not
        # visible in this part of the function
        if seq_fa.nr_seq == 0:
            seq_fa.nr_seq = 1
        logfile.write("Reduced to {} sequences.\n".format(nr_seq_demult))
        file_nr += 1
예제 #5
0
                                             'sequence-list',
                        required=True)
    PARSER.add_argument('-m', type=int, help='minimum seq length',
                        default=250, required=False)
    PARSER.add_argument('-c', type=int, help='minimum nr of seqs', default=1,
                        required=False)
    PARSER.add_argument('-f', type=float, help='minimum fraction of most '
                                               'abundant seq', default=0,
                        required=False)
    ARGS = PARSER.parse_args()
    # Validate every input BEFORE touching the file system: the output
    # directory is deleted and recreated below, so a bad parameter value must
    # not be able to abort the run after that destructive step.
    if not os.path.isfile(ARGS.t):
        sys.exit('No PCR primer file. Exits.')
    if not ((ARGS.id.endswith(sep(os.name)) and
             ARGS.od.endswith(sep(os.name)))):
        sys.exit('Invalid directory name. Exits.')
    # ARGS.od ends with a separator (checked above); strip it to test whether
    # a regular file of that name is in the way.
    if os.path.isfile(ARGS.od[:-1]):
        print('{} is a file'.format(ARGS.od[:-1]))
        sys.exit('Exits')
    if not 0 <= ARGS.f <= 1:
        sys.exit('Fraction (-f) out of range. Exits.')
    if ARGS.m < 0:
        sys.exit('Minimum sequence length (-m) out of range. Exits.')
    if ARGS.c < 1:
        sys.exit('Count (-c) out of range. Exits.')
    # Read the primer file before the output directory is wiped, so a primer
    # file located inside that directory is still readable.
    primer_fa = FastaList(ARGS.t)  # Make fastaList of primerfile
    # Start from an empty output directory.
    if os.path.isdir(ARGS.od):
        shutil.rmtree(ARGS.od)
    os.mkdir(ARGS.od)
    main()
예제 #6
0
#!/usr/bin/python3
"""Print the reverse complement of the DNA strand given with -s."""


import argparse
from fasta import FastaList

# Command-line interface: a single required option, -s.
PARSER = argparse.ArgumentParser(description='Reverse complement a DNA strand')
PARSER.add_argument('-s', required=True, type=str, help='oligonucleotide')
ARGS = PARSER.parse_args()

# The -s value is handed to FastaList as is; the result of rev_comp() on its
# sequence list is printed unchanged.
STRAND = FastaList(ARGS.s)
REVERSE_COMPLEMENT = STRAND.rev_comp(STRAND.seq_list)
print(REVERSE_COMPLEMENT)
예제 #7
0
        # print('{:03} % completed'.format(int((100*counter/len(fasta_div)))),
        #       end='\r', flush=True)
    return tmp_lst


if __name__ == "__main__":
    # Command-line interface: number of workers (-p), target (-t) and source
    # (-s) fasta files and the length of the extracted sequence (-l).
    PARSER = argparse.ArgumentParser(description='Finds target sequences from\
    a target (-t) fasta file in a source fasta file (-s)')
    PARSER.add_argument('-p', type=int, help='nr of processor threads',
                        default=1)
    PARSER.add_argument('-t', type=str, help='target fastafile', required=True)
    PARSER.add_argument('-s', type=str, help='source fastafile', required=True)
    PARSER.add_argument('-l', type=int, help='the length of the extracted seq',
                        default=150)
    ARGS = PARSER.parse_args()
    FA_S = FastaList(ARGS.s)
    manager = Manager()
    # Manager-backed list of (first line, second line) tuples, one per source
    # record.
    ALL_S = manager.list([(fasta_s.split('\n')[0], fasta_s.split('\n')[1])
                          for fasta_s in FA_S.seq_list])
    FA_T = FastaList(ARGS.t)
    # presumably one part of the target list per worker -- confirm in
    # FastaList.divide
    fa_t_div = FA_T.divide(ARGS.p)
    nr_s_seq = len(ALL_S)
    nr_t_seq = len(FA_T.seq_list)
    print('searching {} subsequences in {} sequences'.format(
     nr_t_seq, nr_s_seq))
    # One (target part, shared source list) tuple per entry of fa_t_div; every
    # tuple refers to the same ALL_S object.
    argument = [(x, y) for x in fa_t_div for y in [ALL_S]]
    aivcs = []
    re_lst = []
    try:
        with open('cs.js') as cs_file:
            aivcs = json.load(cs_file)
예제 #8
0
        # Split the path (minus its last three characters) at the last '/'
        # into (directory, separator, file name).
        name_par = fa_hits[:-3].rpartition('/')
        # Write "<directory>/nopar_<file name>".  The separator was read from
        # the undefined name ``name_part`` before, which raised NameError.
        with open(name_par[0] + name_par[1] + 'nopar_' + name_par[2],
                  'w') as fipa:
            fipa.write('No parameters, not contigs fasta file created by '
                       'spades.')


# Main
# Command-line interface: -p switches parameter file output on, -f and -b
# name the fasta file and the blast table file.
PARSER = argparse.ArgumentParser(description='Split input fasta file based on\
                                              existence of blast hits in input\
                                              blast table file')
PARSER.add_argument('-p',
                    action='store_true',
                    help='switch for parameter file\
                                                     output')
PARSER.add_argument('-f', type=str, help='fastafile', required=True)
PARSER.add_argument('-b', type=str, help='blastfile', required=True)
ARGS = PARSER.parse_args()
# Only the blast table is opened here; a failure to open it ends the script.
try:
    BL_IN = open(ARGS.b)
except IOError:
    sys.exit('input file error')
FA_LST = FastaList(ARGS.f)
# fa_file is closed below, so it is presumably the open file object held by
# FastaList -- confirm in FastaList.
FA_IN = FA_LST.fa_file
SPAFA_LST = SpaFaLst(FA_IN)
BL_LST = BlastTbl(BL_IN)
# wr_files() takes no arguments; it presumably works on the module-level
# objects created above.
wr_files()
FA_IN.close()
# If the input name ends with 'gz' and a file named like the input without
# its last three characters exists, remove it (presumably a decompressed copy
# created while reading -- confirm in FastaList).
if ARGS.f[-2:] == 'gz' and os.path.isfile(ARGS.f[:-3]):
    os.remove(ARGS.f[:-3])
BL_IN.close()
예제 #9
0
                        action='store_true',
                        help='switch for removal of subseqs')
    # NOTE(review): the help text below is identical to the one of the
    # preceding switch although this option takes an integer; it looks
    # copy-pasted and should describe the subsample size -- confirm.
    PARSER.add_argument('--subsample',
                        type=int,
                        help='switch for removal of subseqs',
                        required=False)

    ARGS = PARSER.parse_args()
    # random.sample is only imported when --subsample was given (and is
    # non-zero).
    if ARGS.subsample:
        from random import sample
    org_seqs = list()
    unique_seqs = set()
    #  Create a set of unique seqs without flanking Ns or polyA-tail and store
    #  the original seqs in the list org_seqs. The seqs must be at least of
    #  MIN_LEN length and contain at most MAX_DEG number of deg nucs
    for item in FastaList(ARGS.i):
        org_seqs.append(item.split('\n')[1])
        # strip('N') removes Ns at both ends, rstrip('A') then removes
        # trailing As only
        sequence = item.split('\n')[1].strip('N').rstrip('A')
        if len(sequence) >= MIN_LEN and cnt_nt_deg(sequence) <= MAX_DEG:
            unique_seqs.add(sequence)
    if ARGS.d:
        deg_summary(org_seqs)
    print('Nr of input sequences: {:38}'.format(len(org_seqs)))
    print('Nr of unique sequences (including subsequences): {:12}'.format(
        len(unique_seqs)))
    # Converted to a list so it can be sorted in place below
    unique_seqs = list(unique_seqs)
    if ARGS.rm_subseqs:
        print('\n***Removing subsequences***')
        # Longest sequences first
        unique_seqs.sort(key=len, reverse=True)
        start = datetime.now()
        final_seqs = red_uniq_seq(unique_seqs)
예제 #10
0
                    help='output file directory',
                    required=True)
PARSER.add_argument('-m', type=str, help='muscle path', required=True)
ARGS = PARSER.parse_args()

# Some controls of input data: both directory arguments must end with the
# value returned by sep(os.name), and the output directory name must not
# belong to an existing regular file
if not ((ARGS.id.endswith(sep(os.name)) and ARGS.od.endswith(sep(os.name)))):
    sys.exit('Invalid directory name. Exits.')
if os.path.isfile(ARGS.od[:-1]):
    print('{} is a file'.format(ARGS.od[:-1]))
    sys.exit('Exits')

# WARNING: an existing output directory is deleted with all its content and
# recreated empty
if os.path.isdir(ARGS.od):
    shutil.rmtree(ARGS.od)
os.mkdir(ARGS.od)
refs = FastaList(ARGS.r)
for seqfile in os.listdir(ARGS.id):
    # Skip every entry with 'log' anywhere in its name
    if 'log' in seqfile:
        continue
    print('Cleaning: {}'.format(seqfile))
    input_name = ARGS.id + seqfile
    output_name = ARGS.od + seqfile
    fa_in = FastaList(input_name)

    # Remove primers if primers exist in seq. rmprimers returns an empty list
    # if no primers are found. In that case nothing is written and fa_in
    # keeps referring to the input file
    if fa_in.rmprimers(ARGS.p):
        fa_in.wr_fasta_file(output_name, ARGS.p)
        fa_in = FastaList(output_name)
    for item in refs.seq_list:
        if seqfile.split('.')[0] in item.split('\n')[0]:
예제 #11
0
    proc2.wait()
    if ARGS.l:
        fi.write(proc1.stderr.read())
        fi.write(proc2.stderr.read())


if __name__ == "__main__":
    import argparse

    # Command-line interface: input fasta (-f), output fasta (-o), minimum
    # sequence length (-m, kept as a string) and a log-file switch (-l).
    PARSER = argparse.ArgumentParser(description='TBD')
    PARSER.add_argument('-f', type=str, help='fasta filename', required=True)
    PARSER.add_argument('-o',
                        type=str,
                        help='output fasta filename',
                        required=True)
    PARSER.add_argument('-m',
                        type=str,
                        help='minimum sequence length',
                        default='200')
    PARSER.add_argument('-l',
                        action='store_true',
                        help='switch for log file output')
    ARGS = PARSER.parse_args()
    # Module-level temporary file used by rm_degenerates().
    tmpfile = NamedTemporaryFile(mode='w+')
    if ARGS.l:
        # Log file named after the output file, up to its first '.'.
        # NOTE(review): fi is not closed within the visible part of the
        # script.
        fi = open(ARGS.o.split('.')[0] + '.log', 'w')
    # rm_degenerates() is called exactly once; an extra call whose result was
    # discarded used to precede this one and only repeated the same work.
    run_vsearch(rm_degenerates(ARGS.f))
    fl2 = FastaList(ARGS.o)
    derep()
예제 #12
0
#!/usr/bin/python3

from fasta import FastaList
import argparse
import subprocess as sub

# Command-line interface: input fasta (-f), primer fasta (-p), output file
# (-o) and the path of the muscle executable (-m).
PARSER = argparse.ArgumentParser(description='Test of primerdelete')
PARSER.add_argument('-f', type=str, help='input fasta file', required=True)
PARSER.add_argument('-p',
                    type=str,
                    help='input primer fasta'
                    ' file',
                    required=True)
PARSER.add_argument('-o', type=str, help='output file', required=True)
PARSER.add_argument('-m', type=str, help='muscle path', required=True)
ARGS = PARSER.parse_args()
fa = FastaList(ARGS.f)
fa.wr_fasta_file(ARGS.o, ARGS.p)
# Alignment output: the output name up to its first '.', plus '.afa'.
outfi = ARGS.o.split('.')[0] + '.afa'
# Pass the command as an argument list.  A single command string without
# shell=True is taken as the program name on POSIX and cannot be executed;
# a list also keeps paths containing spaces intact.
muscle = sub.Popen([ARGS.m, '-in', ARGS.o, '-out', outfi, '-quiet'])
muscle.wait()
예제 #13
0
                                  'sequence')
 PARSER.add_argument('-id',
                     type=str,
                     help='Directory for input data',
                     required=True)
 PARSER.add_argument('-od',
                     type=str,
                     help='Directory for output data',
                     required=True)
 ARGS = PARSER.parse_args()
 # Some control of input file/directory names and parameter values
 if not ((ARGS.id.endswith(sep(os.name))
          and ARGS.od.endswith(sep(os.name)))):
     sys.exit('Invalid directory name. Exits.')
 if os.path.isfile(ARGS.od[:-1]):
     print('{} is a file'.format(ARGS.od[:-1]))
     sys.exit('Exits')
 # WARNING: an existing output directory is deleted with all its content and
 # recreated empty
 if os.path.isdir(ARGS.od):
     shutil.rmtree(ARGS.od)
 os.mkdir(ARGS.od)
 for seqfile in os.listdir(ARGS.id):
     infa = FastaList(ARGS.id + seqfile)
     # The first record of each file serves as the reference.
     # NOTE(review): ref, refid and seqid are not used within the visible
     # part of this loop
     ref = infa.seq_list[0]
     refid = infa.seq_list[0].split('\n')[0]
     refseq = infa.seq_list[0].split('\n')[1]
     # The slice [1:2] selects the second record only (or nothing when the
     # file holds a single record)
     for seq in infa.seq_list[1:2]:
         seqid = seq.split('\n')[0]
         seqseq = seq.split('\n')[1]
         # Position-wise comparison; raises IndexError when seqseq is longer
         # than refseq
         for nt in range(0, len(seqseq)):
             if refseq[nt] != seqseq[nt]:
                 print(seqseq[nt])
예제 #14
0
#!/usr/bin/python3
"""Find sequences from a target fasta file (-t) in a source fasta file (-s)."""

import argparse
from fasta import FastaList

PARSER = argparse.ArgumentParser(description='Finds target sequences from a\
           target (-t) fasta file in a source fasta file (-s)')
PARSER.add_argument('-t', type=str, help='target fastafile', required=True)
PARSER.add_argument('-s', type=str, help='source fastafile', required=True)
# The length is a number: declare it as int so that a value given on the
# command line has the same type as the integer default (it used to be parsed
# as str while the default stayed an int).
PARSER.add_argument('-l',
                    type=int,
                    help='the length of the extracted seq',
                    default=150)
ARGS = PARSER.parse_args()
FA_S = FastaList(ARGS.s)
FA_T = FastaList(ARGS.t)
# NOTE: 'aivcs.fa' is a hard-coded name resolved against the current working
# directory.
FA_CS = FastaList('aivcs.fa')
# Second line of every record in FA_CS, with surrounding whitespace removed.
aivcs = []
for seq in FA_CS.seq_list:
    aivcs.append(seq.split('\n')[1].strip())
# Source records plus whatever FA_S.rev_comp() returns
# (presumably the reverse complements -- confirm in FastaList).
ALL_S = FA_S.seq_list + FA_S.rev_comp()
# NOTE(review): the output file is not closed within the visible part of the
# script.
FA_OUT = open('sources.fa', 'w')
# One hundredth of the number of target records (integer division) and the
# counters initialised for the loop that follows.
count = len(FA_T.seq_list) // 100
percent = 0
incr1 = 0
incr2 = 0
seq_found = 0
for fasta_t in FA_T.seq_list:
    incr1 += 1
    if incr1 > incr2:
예제 #15
0
import argparse
from fasta import FastaList
if __name__ == "__main__":
    import os

    # Command-line interface: input fasta (-f) and the slice bounds used to
    # cut the first sequence (--start inclusive, --end exclusive, as in a
    # Python slice).
    PARSER = argparse.ArgumentParser(description='TBD')
    PARSER.add_argument('-f', type=str, help='input fasta file', required=True)
    PARSER.add_argument('--start', type=int, help='start cut', required=True)
    PARSER.add_argument('--end', type=int, help='end cut', required=True)
    ARGS = PARSER.parse_args()
    cut = ':' + str(ARGS.start) + '-' + str(ARGS.end)
    # Parse the input once instead of once per field.
    first_record = FastaList(ARGS.f).seq_list[0]
    # Title: first whitespace-separated token of the record plus the range.
    title = first_record.split()[0] + cut + '\n'
    # Take the sequence from the second LINE of the record; splitting on any
    # whitespace picked a word of the header when the header contained
    # spaces.
    seq = first_record.split('\n')[1].strip()[ARGS.start:ARGS.end]
    # splitext() only separates the final extension, so names with more than
    # one dot ('a.b.fa', './x.fa') keep their full stem and real extension.
    root, ext = os.path.splitext(ARGS.f)
    outfile = root + cut.replace(':', '_') + ext
    with open(outfile, 'w') as f:
        f.write(title)
        f.write(seq + '\n')
예제 #16
0
#!/usr/local/miniconda3/bin/python

import argparse as ap
from fasta import FastaList

# Command-line interface: the fasta file to check (-f).
PARSER = ap.ArgumentParser(description='Check fastafile')
PARSER.add_argument('-f', required=True, type=str, help='fasta file')
ARGS = PARSER.parse_args()

# Characters accepted in a sequence line; anything else is reported.
ACCEPTED_CODES = frozenset('ACGTRWKYSNMVDHB')

fl = FastaList(ARGS.f)
for record in fl.seq_list:
    fields = record.split('\n')
    # The first line of the record is printed once for EVERY character of its
    # second line that is not an accepted code.
    for symbol in fields[1]:
        if symbol in ACCEPTED_CODES:
            continue
        print(fields[0])
예제 #17
0
#!/usr/bin/python3
# Removes columns containing degenerate nucleotides from alignment and writes
# a new alignment as output file.

import argparse
from fasta import FastaList

# Text shown by --help; assembled from adjacent literals.
DESCRIPTION = ('Removes columns with degenerate nucleotides from alignment '
               'and writes a new alignment as output file')

# Command-line interface: required input (-i) and optional output (-o).
PARSER = argparse.ArgumentParser(description=DESCRIPTION)
PARSER.add_argument('-i', required=True, type=str,
                    help='fastq input filename')
PARSER.add_argument('-o', default='out.fa', type=str,
                    help='fastq output filename')
ARGS = PARSER.parse_args()

# Load the alignment, drop the columns, write the result.  The return value
# of rm_non_agct_columns() is ignored, so it presumably changes the object in
# place -- confirm in FastaList.
alignment = FastaList(ARGS.i)
alignment.rm_non_agct_columns()
alignment.wr_fasta_file(ARGS.o)