Exemplo n.º 1
0
def design(inp, out, args):
    """Design a k-mer library from the sequences in a fasta file.

    Reads the records in ``inp``, runs either the gap-spanning or the plain
    library designer over them (selected by ``args.gap_span``) and writes the
    resulting k-mers to ``out`` ordered by name.

    Args:
        inp: Path of the input fasta file.
        out: Path of the fasta file to write.
        args: Namespace providing ``quiet``, ``gap_span``, ``window_size``
            and ``step_size``.

    Returns:
        The number of distinct k-mer names written.
    """
    headers, residues = ft.read_fasta_lists(inp)
    records = [
        Sequence(name=header, sequence=residue)
        for header, residue in zip(headers, residues)
    ]

    verbose = not args.quiet
    if verbose:
        print("Number of input sequences: ", len(records))

    designer_cls = (GapSpanningLibraryDesigner
                    if args.gap_span else LibraryDesigner)
    designer = designer_cls(window_size=args.window_size,
                            step_size=args.step_size)
    kmers = designer.design(records)

    if verbose:
        print("Number of output Kmers: ", len(kmers))

    # Key by name so that entries sharing a name collapse to a single record
    # (the last one seen wins).
    by_name = {}
    for entry in kmers:
        by_name[entry.name] = entry.sequence

    ordered = sorted(by_name)
    ft.write_fasta(ordered, [by_name[key] for key in ordered], out)

    return len(ordered)
Exemplo n.º 2
0
def main():
    """Write randomly downsampled copies of a fasta file.

    For every requested dataset size, ``--reps`` replicate files are written
    into the output directory. Each file is named
    ``<input basename>_n<size>-<replicate>.fasta``.

    Records are drawn with ``random.choices``, i.e. *with replacement*, so a
    record can appear more than once in a single output file.
    """
    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    arg_parser.add_argument(
        '-r',
        '--reps',
        help=
        "Number of replicate datasets to generate for each level of divergence.",
        default=1,
        type=int)

    reqArgs = arg_parser.add_argument_group('Required Arguments')
    reqArgs.add_argument(
        '-i',
        '--input',
        help="Fasta file contianing the protein sequence to downsample.",
        required=True)
    reqArgs.add_argument(
        '-n',
        '--num',
        help=
        "Size(s) of downsampled datasets. Can be a comma-delimited list of integers",
        required=True)
    reqArgs.add_argument(
        '-o',
        '--output',
        help=
        "Directory name for output files. Will be created, if it doesn't already exist",
        required=True)

    args = arg_parser.parse_args()

    # Generate output directory (makedirs also creates missing parents, so a
    # nested path such as "results/run1" works).
    if os.path.isdir(args.output):
        print("Warning: %s already exists!" % (args.output))
    else:
        os.makedirs(args.output)

    # Read in fasta file to downsample
    names, seqs = ft.read_fasta_lists(args.input)

    # Extract file basename. splitext drops only the final extension and
    # leaves extension-less names intact (splitting on "." and discarding the
    # last piece turned "proteins" into an empty string).
    bName = os.path.splitext(os.path.basename(args.input))[0]

    # Step through each dataset size
    sizes = [int(x) for x in args.num.split(",")]

    for size in sizes:
        for rep in range(args.reps):
            picked = random.choices(range(len(names)), k=size)
            out_path = os.path.join(
                args.output, "%s_n%04d-%03d.fasta" % (bName, size, rep))
            ft.write_fasta([names[i] for i in picked],
                           [seqs[i] for i in picked], out_path)
Exemplo n.º 3
0
def gene_translator(genes_filename, output_filename):
    """Translate every record of a nucleotide fasta file codon by codon.

    Each sequence is read in frame 0; a trailing fragment shorter than three
    bases is ignored. Every full codon is looked up in ``genecode`` and
    trailing ``'_'`` characters are stripped from the translation before it is
    written to ``output_filename``.

    Args:
        genes_filename: Path of the fasta file to translate.
        output_filename: Path of the fasta file to write.
    """
    with open(genes_filename, 'r') as f:
        records = ft.fasta_list(f)
        with open(output_filename, 'w') as fw:
            for record in records:
                dna = record[1]
                # Stopping at len - 2 visits only complete codons.
                protein = ''.join(genecode[dna[pos:pos + 3]]
                                  for pos in range(0, len(dna) - 2, 3))
                ft.write_fasta(fw, record[0], protein.rstrip('_'))
Exemplo n.º 4
0
def gene_translator_frame(genes_filename, output_filename):
    """Translate only the start-to-stop stretches of each fasta record.

    Every sequence is read in frame 0. Translation is switched on at the
    first ``ATG`` or ``GTG`` codon and switched off at ``TAA``, ``TAG`` or
    ``TGA``; the stop codon itself is not translated. The codon that opens a
    stretch is always translated as ``ATG``, while a ``GTG`` met inside an
    already open stretch is translated as-is.

    The on/off state is reset for every record, so a stretch left open at the
    end of one sequence does not leak into the next one.

    Args:
        genes_filename: Path of the fasta file to translate.
        output_filename: Path of the fasta file to write.
    """
    start_codons = ('ATG', 'GTG')
    stop_codons = ('TAA', 'TAG', 'TGA')

    with open(genes_filename, 'r') as f:
        list_seq = ft.fasta_list(f)
        with open(output_filename, 'w') as fw:
            for seq_tuple in list_seq:
                dna = seq_tuple[1]
                translating = False  # each record starts outside any ORF
                residues = []
                for i in range(0, len(dna), 3):
                    code = dna[i:i + 3]
                    if code in start_codons:
                        if not translating:
                            # An opening GTG is read as the initiator codon.
                            code = 'ATG'
                        translating = True
                    elif code in stop_codons:
                        translating = False
                    # The length check skips a trailing partial codon.
                    if translating and len(code) == 3:
                        residues.append(genecode[code])
                ft.write_fasta(fw, seq_tuple[0], ''.join(residues))
Exemplo n.º 5
0
def orf(infile, outfile):
    """Scan fasta records for ATG-to-stop stretches in six frames and write them.

    For every record of ``infile`` the sequence is walked in steps of three and
    the codons at offsets 0, 1 and 2 of both the sequence and of
    ``cdna(v)[::-1]`` are fed to ``check_codon``. Completed stretches are
    written to ``outfile`` through ``bio.write_fasta``; the header is the
    record header without its last ``|``-separated field, followed by
    ``|:<start>-<stop>``.

    Args:
        infile: Path of the fasta file to scan.
        outfile: Path of the fasta file to write.
    """
    def check_codon(index, codon):
        """Advance the state of frame ``index`` with ``codon``.

        Reads ``i`` and ``v`` and mutates ``seq`` and ``semaphores`` from the
        enclosing scope, so it relies on the values they hold at call time.
        """
        if codon == 'ATG' and not semaphores[index]:
            # Frame is closed and a start codon appears: emit a '*' separator
            # plus a '+'-delimited start marker, then open the frame.
            start = ''
            # NOTE(review): frames 0-3 take this branch although frame 3 is fed
            # from the second strand (see the calls below) — confirm whether
            # `index < 3` was intended.
            if index < 4:
                start = '+' + str(i + 1 + index) + '+'
            else:
                # Marker for the remaining frames is prefixed with 'c'.
                start = '+c' + str(len(v) - i - index + 3) + '+'
            seq[index] += '*' + start
            semaphores[index] = True
        elif codon in ['TAA', 'TAG', 'TGA'] and semaphores[index]:
            # Frame is open and a stop codon appears: append the stop codon,
            # a '+'-delimited stop marker and a closing '*', then close it.
            stop = ''
            if index < 4:
                stop = '+' + str(i + 3 + index) + '+'
            else:
                stop = '+' + str(len(v) - i + 1 - index) + '+'
            seq[index] += codon + stop + '*'
            semaphores[index] = False

        # While the frame is open every codon (including the opening ATG) is
        # appended; the stop codon was already added above.
        if semaphores[index]:
            seq[index] += codon

    with open(outfile, 'w') as fw:
        with open(infile, 'r') as f:
            # One open/closed flag and one accumulated string per frame.
            # NOTE(review): both are created once, before the record loop, and
            # are never reset, so state and text from earlier records carry
            # over into later ones — confirm this is intended.
            semaphores = [False for _ in range(6)]
            seq = ['' for _ in range(6)]
            for head, v in bio.fasta_list2(f):
                # cdna is defined elsewhere; presumably it returns the
                # complementary strand, reversed here — TODO confirm.
                cdna_codon = cdna(v)[::-1]
                for i in range(0, len(v), 3):
                    check_codon(0, v[i:i + 3])
                    check_codon(1, v[i + 1:i + 4])
                    check_codon(2, v[i + 2:i + 5])
                    check_codon(3, cdna_codon[i:i + 3])
                    check_codon(4, cdna_codon[i + 1:i + 4])
                    check_codon(5, cdna_codon[i + 2:i + 5])
                # Split the concatenated frame strings on '*'. The set removes
                # duplicate pieces and makes the output order arbitrary. Note
                # that `i` is rebound here from an offset to a text piece.
                for i in set("".join(seq).split('*')):
                    # Keep only pieces that end with a stop marker, i.e.
                    # stretches that were closed by a stop codon.
                    if len(i) > 6 and i[-1:] == '+':
                        # A closed piece looks like '+start+codons+stop+', so
                        # after the split: [1] = start, [2] = codons, [3] = stop.
                        i = i.split('+')
                        bio.write_fasta(
                            fw, "|".join(head.split('|')[:-1]) + '|:' + i[1] +
                            '-' + i[3], i[2])
Exemplo n.º 6
0
def main():
    """Copy a fasta file, keeping only the first record of each distinct sequence.

    Records are compared by sequence only; the name of the first record that
    carries a given sequence is the one kept. Input order is preserved.
    """
    # The text must be passed as `description`: the first positional
    # parameter of ArgumentParser is `prog`, which would turn this sentence
    # into the program name shown in usage and error messages.
    argparser = argparse.ArgumentParser(
        description="Output a FASTA containing only unique sequences.")
    argparser.add_argument('-i', '--input', help="Name of input file.")
    argparser.add_argument(
        '-o',
        '--output',
        help="Name of output file This file will contain the "
        "same sequences as the input file, but duplicates will not be included."
    )

    args = argparser.parse_args()

    in_names, in_seqs = fastatools.read_fasta_lists(args.input)

    out_names, out_seqs = [], []
    seen_seqs = set()

    for name, seq in zip(in_names, in_seqs):
        if seq in seen_seqs:
            continue
        seen_seqs.add(seq)
        out_names.append(name)
        out_seqs.append(seq)

    fastatools.write_fasta(out_names, out_seqs, args.output)
Exemplo n.º 7
0
def main():
    """Collapse fasta records that share both name and sequence.

    For each input file, records are grouped by name. A name whose records
    all carry the same sequence is written once. A name that maps to
    different sequences is printed to stdout and all of its records are kept
    unchanged.

    The result is written next to the input file, with ``--prepend`` and an
    underscore added in front of the file name.
    """
    import os

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("fastas", help="Fasta files to dedup", nargs='+')
    parser.add_argument(
        "--prepend",
        default="dedup",
        help="String to add to the beginning of deduped files.")

    args = parser.parse_args()

    for each in args.fastas:

        # Read in fasta file
        names, seqs = ft.read_fasta_lists(each)

        # Convert to dictionary with keys = names, and values = lists of seqs
        fD = defaultdict(list)
        for n, s in zip(names, seqs):
            fD[n].append(s)

        newN = []
        newS = []

        for n, sL in fD.items():
            if len(set(sL)) == 1:
                newN.append(n)
                newS.append(sL[0])
            else:
                # Conflicting sequences under one name: report and keep all.
                print(n)
                for s in sL:
                    newN.append(n)
                    newS.append(s)

        # Prefix the file name, not the whole path: "data/x.fasta" must become
        # "data/dedup_x.fasta" rather than "dedup_data/x.fasta". For a bare
        # file name the result is the same as before.
        directory, filename = os.path.split(each)
        out_path = os.path.join(directory,
                                "%s_%s" % (args.prepend, filename))
        ft.write_fasta(newN, newS, out_path)
Exemplo n.º 8
0
def main():
    """Generate mutated variants of the sequences in a fasta file.

    For every divergence level in ``--diverg`` and every replicate, one fasta
    file named ``<output>_d<divergence>_r<replicate>.fasta`` is written. It
    holds ``--num`` variants of each input sequence, named
    ``<name>_d<divergence>_<variant number>``.

    Each variant receives ``int(divergence * len(sequence))`` substitutions;
    a substitution replaces the residue at a site with a different one of the
    20 standard amino acids. Residues outside that alphabet (e.g. ``X``) can
    be replaced by any of the 20.
    """
    arg_parser = argparse.ArgumentParser(
        description="Mutate input sequences to generate diverse datasets.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    arg_parser.add_argument(
        '-n',
        '--num',
        help="Number of mutated sequences to output per input sequence.",
        default=30,
        type=int)
    arg_parser.add_argument(
        '-r',
        '--reps',
        help=
        "Number of replicate datasets to generate for each level of divergence.",
        default=1,
        type=int)

    reqArgs = arg_parser.add_argument_group('Required Arguments')
    reqArgs.add_argument(
        '-i',
        '--input',
        help="Fasta file contianing the protein sequence(s) to mutate.",
        required=True)
    reqArgs.add_argument('-o',
                         '--output',
                         help="Base name for output fasta files.",
                         required=True)
    reqArgs.add_argument(
        '-d',
        '--diverg',
        help=
        "Level of divergence from the input sequence. Should be between 0 and 1. Can include multiple comma-delimited values.",
        required=True)

    args = arg_parser.parse_args()

    # Possible amino acids
    AAs = [
        'A', 'C', "D", "E", 'F', "G", "H", 'I', "K", 'L', 'M', 'N', "P", "Q",
        "R", "S", "T", 'V', 'W', 'Y'
    ]

    # Parse target divergences
    divergs = [float(d) for d in args.diverg.split(",")]

    # Read in input seqs
    iD = ft.read_fasta_dict_upper(args.input)

    for d in divergs:
        for r in range(1, args.reps + 1):

            outN = []
            outS = []

            for n, s in iD.items():
                # NOTE(review): newS is carried over from one variant to the
                # next, so mutations accumulate across the --num variants of a
                # sequence — confirm this is intended.
                newS = s
                muts = int(d * len(s))
                for c in range(1, args.num + 1):

                    # Sites are drawn with replacement, so one site may be
                    # hit more than once within a variant.
                    sites = random.choices(range(len(s)), k=muts)
                    for site in sites:
                        # Filtering (rather than list.remove) keeps this from
                        # raising ValueError on non-standard residues.
                        subAAs = [aa for aa in AAs if aa != newS[site]]
                        newS = newS[:site] + random.choice(
                            subAAs) + newS[site + 1:]

                    outS.append(newS)
                    outN.append("%s_d%.3f_%03d" % (n, d, c))

            ft.write_fasta(outN, outS,
                           "%s_d%.3f_r%03d.fasta" % (args.output, d, r))
Exemplo n.º 9
0
def design(inp, out, args):
    """Design peptides from one representative target plus a greedy top-up.

    Steps:
      1. Count every xmer of the targets that contains no character from
         ``args.exSet``.
      2. Pick the target whose xmers have the highest summed count and tile
         it with a sliding window (``LibraryDesigner``).
      3. Repeatedly add the ymer returned by ``choosePep`` until the fraction
         of covered xmers reaches ``args.target``.

    Args:
        inp: Path of the fasta file holding the target sequences.
        out: Path of the fasta file to write.
        args: Namespace providing ``xMerSize``, ``yMerSize``, ``step_size``,
            ``exSet`` and ``target``.

    Returns:
        Total number of peptides written (sliding-window plus greedy).

    Note:
        Raises ZeroDivisionError when the targets yield no countable xmer.
    """
    # Generate dict with xmer counts
    xmer_counts = {}

    tN, tS = ft.read_fasta_lists(inp)
    for target in tS:
        for xmer in kt.kmerList(target, args.xMerSize):
            if set(xmer).isdisjoint(args.exSet):
                xmer_counts[xmer] = xmer_counts.get(xmer, 0) + 1

    # Save count of total xmers in targets
    totalX = len(xmer_counts)

    # Score each target sequence by summing contained xmer scores; the first
    # target with the strictly highest score becomes the representative.
    best_score = 0
    repS = ""
    repN = ""
    for idx, target in enumerate(tS):
        score = sum(xmer_counts[xmer]
                    for xmer in kt.kmerList(target, args.xMerSize)
                    if xmer in xmer_counts)
        if score > best_score:
            best_score = score
            repS = target
            repN = tN[idx]

    # Generate peptides using a sliding window across the chosen
    # representative sequence
    designer = LibraryDesigner(window_size=args.yMerSize,
                               step_size=args.step_size)
    library = designer.design([Sequence(name=repN, sequence=repS)])

    rep_by_name = {entry.name: entry.sequence for entry in library}
    repNames = sorted(rep_by_name)
    repSeqs = [rep_by_name[name] for name in repNames]

    # Remove xmers covered by the sliding window peptides
    for peptide in repSeqs:
        for xmer in kt.kmerList(peptide, args.xMerSize):
            xmer_counts.pop(xmer, None)

    # Read in all yMers in targets
    ymer_scores = {}
    ymer_names = {}
    for idx, target in enumerate(tS):
        for offset, ymer in enumerate(kt.kmerList(target, args.yMerSize)):
            if set(ymer).isdisjoint(args.exSet):
                ymer_scores[ymer] = 0
                ymer_names[ymer] = "%s_%04d" % (tN[idx], offset)

    # Design peptides
    newSeqs = []
    newNames = []

    while 1 - len(xmer_counts) / totalX < args.target:

        chosen = choosePep(ymer_scores, xmer_counts, args)
        newSeqs.append(chosen)
        newNames.append(ymer_names[chosen])

        # Remove selected peptide from the candidate pool
        del ymer_scores[chosen]

        # Remove covered xMers from the remaining-xmer table
        for xmer in kt.kmerList(chosen, args.xMerSize):
            xmer_counts.pop(xmer, None)

    # Write out peptides
    ft.write_fasta(repNames + newNames, repSeqs + newSeqs, out)

    return len(repSeqs) + len(newSeqs)
Exemplo n.º 10
0
def cdna_writer(input_file, output_file):
    """Write each (header, sequence) pair from ``cdna_list(input_file)`` as fasta.

    Args:
        input_file: Value handed unchanged to ``cdna_list``.
        output_file: Path of the fasta file to write.
    """
    with open(output_file, 'w') as handle:
        for record in cdna_list(input_file):
            header, sequence = record
            bio.write_fasta(handle, header, sequence)
Exemplo n.º 11
0
def design(inp, out, args):
    """Greedily design peptides until enough target xmers are covered.

    Every xmer of the targets that contains no character from ``args.exSet``
    is counted. Xmers already present in pre-designed peptides
    (``args.pre``, comma-separated fasta paths) are treated as covered. Then
    the ymer returned by ``choosePep`` is added repeatedly until the covered
    fraction of xmers reaches ``args.target``.

    Args:
        inp: Path of the fasta file holding the target sequences.
        out: Path of the fasta file to write.
        args: Namespace providing ``xMerSize``, ``yMerSize``, ``exSet``,
            ``pre`` and ``target``.

    Returns:
        Number of peptides written.

    Note:
        Raises ZeroDivisionError when the targets yield no countable xmer.
    """
    # Generate dict with xmer counts
    xmer_counts = defaultdict(int)

    tN, tS = ft.read_fasta_lists(inp)
    for target in tS:
        for xmer in kt.kmerList(target, args.xMerSize):
            if set(xmer).isdisjoint(args.exSet):
                xmer_counts[xmer] += 1

    # Optional xmer-count table output (args.outputXmerTables / writeXmerDict)
    # is currently disabled, both here and after the pre-designed removal.

    # Save count of total xmers in targets
    totalX = len(xmer_counts)

    # If pre-designed peptides are provided, remove any contained xmers
    if args.pre:
        for pre_path in args.pre.split(","):
            _, pre_seqs = ft.read_fasta_lists(pre_path)
            for pre_seq in pre_seqs:
                for xmer in kt.kmerList(pre_seq, args.xMerSize):
                    xmer_counts.pop(xmer, None)

    # Read in all yMers in targets
    ymer_scores = {}
    ymer_names = {}
    for idx, target in enumerate(tS):
        for offset, ymer in enumerate(kt.kmerList(target, args.yMerSize)):
            if set(ymer).isdisjoint(args.exSet):
                ymer_scores[ymer] = 0
                ymer_names[ymer] = "%s_%04d" % (tN[idx], offset)

    # Design peptides
    newPeps = []
    newNames = []

    while 1 - len(xmer_counts) / totalX < args.target:

        chosen = choosePep(ymer_scores, xmer_counts, args)
        newPeps.append(chosen)
        newNames.append(ymer_names[chosen])

        # Remove selected peptide from the candidate pool
        del ymer_scores[chosen]

        # Remove covered xMers from the remaining-xmer table
        for xmer in kt.kmerList(chosen, args.xMerSize):
            xmer_counts.pop(xmer, None)

    # Write out peptides
    ft.write_fasta(newNames, newPeps, out)

    return len(newPeps)