Example #1
def design(inp, out, args):

    # Read target sequences and wrap them in Sequence objects
    names, sequences = ft.read_fasta_lists(inp)
    seqs = list()

    for name, sequence in zip(names, sequences):
        seqs.append(Sequence(name=name, sequence=sequence))

    if not args.quiet:
        print("Number of input sequences: ", len(seqs))

    # Use the gap-spanning designer if requested; otherwise the standard sliding-window designer
    if args.gap_span:
        designer = GapSpanningLibraryDesigner(window_size=args.window_size,
                                              step_size=args.step_size)
    else:
        designer = LibraryDesigner(window_size=args.window_size,
                                   step_size=args.step_size)

    library = designer.design(seqs)

    if not args.quiet:
        print("Number of output Kmers: ", len(library))

    # Write out the designed kmers, sorted by name
    outD = {e.name: e.sequence for e in library}
    namesSorted = sorted(list(outD.keys()))
    ft.write_fasta(namesSorted, [outD[n] for n in namesSorted], out)

    return len(namesSorted)
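The LibraryDesigner implementation is not part of this listing. The sketch below is a self-contained illustration of the sliding-window design the snippet configures through window_size and step_size; whether the real designer also anchors a final window at the C-terminus is an assumption, not something the listing confirms.

# Illustrative only: a plain sliding-window peptide generator. How LibraryDesigner
# handles the final partial window is an assumption, not taken from the library.
def sliding_windows(sequence, window_size, step_size):
    windows = []
    if len(sequence) < window_size:
        return windows
    last_start = len(sequence) - window_size
    for start in range(0, last_start + 1, step_size):
        windows.append(sequence[start:start + window_size])
    if last_start % step_size != 0:
        # Anchor one extra window at the end so the C-terminus is covered
        windows.append(sequence[last_start:])
    return windows

print(sliding_windows("MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ", window_size=10, step_size=5))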
Example #2
def main():

    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("inputs", help="One or more input target fasta files (unaligned).", nargs="+")
    parser.add_argument("-e", "--exclude", default="X-", help="Any Xmers or yMers containing these chaarcters will be excluded.")

    reqArgs = parser.add_argument_group('required arguments')
    reqArgs.add_argument('-k', '--kmer_size', help="kmer size to use for comparing sequences.", type=int, required=True)
    reqArgs.add_argument("-o", "--out", help="Output file name.", required=True)

    args = parser.parse_args()
    
    #Create set of characters to exclude
    exSet = set(args.exclude)
    
    with open(args.out, "w") as fout:
        fout.write("File\tAvgPropShared\tMedianPropShared\tMinPropShared\tMaxPropShared\n")
        
        #Step through input files
        for eachF in args.inputs:
            
            #Read in seqs in file
            names, seqs = ft.read_fasta_lists(eachF)
            
            propIDs = []

            # Compute the proportion of shared kmers for every pair of sequences
            for s1, s2 in it.combinations(seqs, 2):
                propIDs.append(kt.compSeqs(s1, s2, args.kmer_size, filter=exSet))
        
            fout.write("%s\t%.3f\t%.3f\t%.3f\t%.3f\n" % (eachF, np.mean(propIDs), np.median(propIDs), min(propIDs), max(propIDs)))
Example #3
def main():

    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    arg_parser.add_argument(
        '-r',
        '--reps',
        help=
        "Number of replicate datasets to generate for each level of divergence.",
        default=1,
        type=int)

    reqArgs = arg_parser.add_argument_group('Required Arguments')
    reqArgs.add_argument(
        '-i',
        '--input',
        help="Fasta file contianing the protein sequence to downsample.",
        required=True)
    reqArgs.add_argument(
        '-n',
        '--num',
        help=
        "Size(s) of downsampled datasets. Can be a comma-delimited list of integers",
        required=True)
    reqArgs.add_argument(
        '-o',
        '--output',
        help=
        "Directory name for output files. Will be created, if it doesn't already exist",
        required=True)

    args = arg_parser.parse_args()

    # Generate output directory
    if not os.path.isdir(args.output):
        os.mkdir(args.output)
    else:
        print("Warning: %s already exists!" % (args.output))

    # Read in fasta file to downsample
    names, seqs = ft.read_fasta_lists(args.input)

    # Extract file basename
    bName = ".".join(os.path.basename(args.input).split(".")[:-1])

    # Step through each dataset size
    sizes = [int(x) for x in args.num.split(",")]

    for s in sizes:
        sCount = 0
        while sCount < args.reps:
            indexes = random.choices(range(len(names)), k=s)
            ft.write_fasta(
                [names[i] for i in indexes], [seqs[i] for i in indexes],
                "%s/%s_n%04d-%03d.fasta" % (args.output, bName, s, sCount))
            sCount += 1
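One behaviour worth noting: random.choices samples with replacement, so a downsampled dataset can contain the same record more than once. If each replicate is meant to hold s distinct sequences, random.sample is the without-replacement alternative, shown on stand-in data below.

import random

names = ["seq_a", "seq_b", "seq_c", "seq_d"]   # stand-in data for illustration
s = 2
indexes = random.sample(range(len(names)), k=s)   # no index can be drawn twice
print([names[i] for i in indexes])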
Example #4
def main():

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "inputs",
        help="One or more input target fasta files (unaligned).",
        nargs="+")
    parser.add_argument(
        "-e",
        "--exclude",
        default="X-",
        help="Any Xmers or yMers containing these chaarcters will be excluded."
    )

    reqArgs = parser.add_argument_group('required arguments')
    reqArgs.add_argument(
        '-k',
        '--kmer_size',
        help=
        "Comma-delimited list of kmer sizes to use for comparing sequences.",
        required=True)
    reqArgs.add_argument("-o",
                         "--out",
                         help="Output file name. ",
                         required=True)

    args = parser.parse_args()

    #Create set of characters to exclude
    exSet = set(args.exclude)

    #Parse kmers
    kmers = [int(k) for k in args.kmer_size.split(",")]

    with open(args.out, "w") as fout:
        fout.write("File\t%s\t%s\n" % ("\t".join(
            ["Avg%dmerProp" % k
             for k in kmers]), "\t".join(["Avg%dmers" % k for k in kmers])))

        #Step through input files
        for eachF in args.inputs:

            fNames, fSeqs = ft.read_fasta_lists(eachF)

            avgProps = []

            #Step through each kmer size
            for k in kmers:
                cD = kt.kmerDictCountFasta(eachF, k, filter=exSet)
                avgProps.append(np.mean(list(cD.values())))

            fout.write("%s\t%s\t%s\n" % (eachF, "\t".join([
                "%.3f" % (ap / len(fNames)) for ap in avgProps
            ]), "\t".join(["%.3f" % (ap) for ap in avgProps])))
Example #5
def main():

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("inputs",
                        help="One or more input target fasta files (aligned).",
                        nargs="+")
    parser.add_argument(
        "-e",
        "--exclude",
        default="X-",
        help="Any Xmers or yMers containing these chaarcters will be excluded."
    )

    reqArgs = parser.add_argument_group('required arguments')
    reqArgs.add_argument("-o",
                         "--out",
                         help="Output file name. ",
                         required=True)

    args = parser.parse_args()

    #Create set of characters to exclude
    exSet = set(args.exclude)

    with open(args.out, "w") as fout:
        fout.write(
            "File\tAvgIdentity\tMinIdentity\tMaxIdentity\tAvgDivergence\tMinDivergence\tMaxDivergence\n"
        )

        #Step through input files
        for eachF in args.inputs:

            fNames, fSeqs = ft.read_fasta_lists(eachF)

            ids = []

            #Step through each sequence pair
            for s1, s2 in it.combinations(fSeqs, 2):
                ids.append(compSeqs(s1, s2, exSet))

            avg = np.mean(ids)
            mn = min(ids)
            mx = max(ids)

            # Divergence = 1 - identity, so the minimum divergence comes from the maximum identity
            fout.write("%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n" %
                       (eachF, avg, mn, mx, 1 - avg, 1 - mx, 1 - mn))
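compSeqs here (no kt. prefix, so presumably a helper defined elsewhere in the same script) is expected to return a pairwise identity for two aligned sequences. A minimal sketch of that calculation, assuming columns containing an excluded character are simply skipped:

def pairwise_identity(s1, s2, exclude=set("X-")):
    # Fraction of compared alignment columns at which the two sequences match
    same = compared = 0
    for a, b in zip(s1, s2):
        if a in exclude or b in exclude:
            continue
        compared += 1
        if a == b:
            same += 1
    return same / compared if compared else 0.0

print(pairwise_identity("MKTA-IAKQR", "MKTAYIAKQL"))   # 8 matches over 9 compared columns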
Example #6
def main():
    argparser = argparse.ArgumentParser( description = "Output a FASTA containing only unique sequences." )
    argparser.add_argument( '-i', '--input', help = "Name of input file." )
    argparser.add_argument( '-o', '--output', help = "Name of output file. This file will contain the "
                                                     "same sequences as the input file, but duplicates will not be included."
                          )

    args = argparser.parse_args()

    in_names, in_seqs = fastatools.read_fasta_lists( args.input )

    out_names, out_seqs = list(), list()
    seen_seqs = set()

    for name, seq in zip( in_names, in_seqs ):
        if seq not in seen_seqs:
            seen_seqs.add( seq )
            out_names.append( name )
            out_seqs.append( seq )

    fastatools.write_fasta( out_names, out_seqs, args.output )
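Because seen_seqs is checked in input order, the name kept for each distinct sequence is the first one encountered. The same one-pass behaviour can be written with a dict keyed on sequence, as in this equivalent sketch:

def dedup_first_seen(names, seqs):
    # dict insertion order (Python 3.7+) preserves the first-seen name per sequence
    kept = {}
    for name, seq in zip(names, seqs):
        kept.setdefault(seq, name)
    return list(kept.values()), list(kept.keys())

print(dedup_first_seen(["a", "b", "c"], ["MKT", "MKT", "AYI"]))   # (['a', 'c'], ['MKT', 'AYI'])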
Example #7
def main():

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("fastas", help="Fasta files to dedup", nargs='+')
    parser.add_argument(
        "--prepend",
        default="dedup",
        help="String to add to the beginning of deduped files.")

    args = parser.parse_args()

    for each in args.fastas:

        # Read in fasta file
        names, seqs = ft.read_fasta_lists(each)

        # Convert to dictionary with keys = names, and values = lists of seqs
        fD = defaultdict(list)
        for i, n in enumerate(names):
            fD[n].append(seqs[i])

        newN = []
        newS = []

        for n, sL in fD.items():
            if len(set(sL)) == 1:
                # All sequences with this name are identical; keep a single copy
                newN.append(n)
                newS.append(sL[0])
            else:
                # Name is shared by differing sequences; report it and keep every copy
                print(n)
                for s in sL:
                    newN.append(n)
                    newS.append(s)

        ft.write_fasta(newN, newS, "%s_%s" % (args.prepend, each))
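"%s_%s" % (args.prepend, each) places the prefix in front of the whole input path, so an input of data/targets.fasta would be written as dedup_data/targets.fasta (which only works if a dedup_data directory exists). If the intent is instead to prefix only the filename and keep the output beside the input, which is an assumption not stated in the snippet, a path-aware variant looks like this:

import os

def prepend_basename(path, prefix="dedup"):
    # Prefix just the filename, keeping the original directory component
    head, tail = os.path.split(path)
    return os.path.join(head, "%s_%s" % (prefix, tail))

print(prepend_basename("data/targets.fasta"))   # data/dedup_targets.fasta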
Example #8
def design(inp, out, args):

    # Generate dict with xmer counts
    xcD = {}

    tN, tS = ft.read_fasta_lists(inp)
    for s in tS:
        xL = kt.kmerList(s, args.xMerSize)
        for x in xL:
            if len(set(x).intersection(args.exSet)) == 0:
                xcD[x] = xcD.get(x, 0) + 1

    # Save count of distinct xMers in targets
    totalX = len(xcD)

    # Score each target sequence by summing contained xmer scores
    maxScore = 0
    repS = ""
    repN = ""
    for i, s in enumerate(tS):
        theseXs = kt.kmerList(s, args.xMerSize)
        thisScore = sum([xcD[x] for x in theseXs if x in xcD])
        if thisScore > maxScore:
            maxScore = thisScore
            repS = s
            repN = tN[i]

    # Generate peptides using a sliding window across the chosen representative sequence
    rep = [Sequence(name=repN, sequence=repS)]
    designer = LibraryDesigner(window_size=args.yMerSize,
                               step_size=args.step_size)
    library = designer.design(rep)

    repD = {e.name: e.sequence for e in library}
    repNames = sorted(list(repD.keys()))
    repSeqs = [repD[n] for n in repNames]

    # Remove xmers covered by the sliding window peptides
    for s in repSeqs:
        xL = kt.kmerList(s, args.xMerSize)
        for x in xL:
            if x in xcD:
                del xcD[x]

    # Read in all yMers in targets
    ysD = {}
    yNameD = {}
    for i, s in enumerate(tS):
        yL = kt.kmerList(s, args.yMerSize)
        for j, y in enumerate(yL):
            if len(set(y).intersection(args.exSet)) == 0:
                ysD[y] = 0
                yNameD[y] = "%s_%04d" % (tN[i], j)

    # Design peptides
    newSeqs = []
    newNames = []

    while (1 - (len(xcD) / totalX)) < args.target:

        thisPep = choosePep(ysD, xcD, args)
        thisName = yNameD[thisPep]
        newSeqs.append(thisPep)
        newNames.append(thisName)

        #Remove selected peptide from ysD
        del ysD[thisPep]

        #Remove covered xMers from xcD
        for eachX in kt.kmerList(thisPep, args.xMerSize):
            if eachX in xcD:
                del xcD[eachX]

    # Write out peptides
    ft.write_fasta(repNames + newNames, repSeqs + newSeqs, out)

    return len(repSeqs + newSeqs)
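choosePep is not part of this listing. Given how it is used, a natural (assumed) implementation is a greedy set-cover step: pick the candidate yMer whose xMers hit the most entries still present in xcD. A self-contained sketch of that choice:

def greedy_choose_pep(candidates, uncovered_xmers, x):
    # Return the candidate peptide covering the most still-uncovered xMers
    def xmers(seq):
        return {seq[i:i + x] for i in range(len(seq) - x + 1)}
    return max(candidates, key=lambda pep: len(xmers(pep) & uncovered_xmers))

print(greedy_choose_pep(["MKTAY", "AYIAK"], {"MKT", "KTA", "TAY"}, x=3))   # MKTAY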
Example #9
def design(inp, out, args):

    # Generate dict with xmer counts
    xcD = defaultdict(int)

    tN, tS = ft.read_fasta_lists(inp)
    for s in tS:
        xL = kt.kmerList(s, args.xMerSize)
        for x in xL:
            if len(set(x).intersection(args.exSet)) == 0:
                xcD[x] += 1

    # Write out tsv with xmer counts, if requested
    # if args.outputXmerTables:
    #     writeXmerDict(xcD, "initialXmerCounts.tsv")

    # Save count of distinct xMers in targets
    totalX = len(xcD)

    # If pre-designed peptides are provided, remove any contained xmers from xcD
    if args.pre:
        for each in args.pre.split(","):
            pN, pS = ft.read_fasta_lists(each)
            for s in pS:
                xL = kt.kmerList(s, args.xMerSize)
                for x in xL:
                    if x in xcD:
                        del xcD[x]

        # Write out tsv with xmer counts, if requested
        # if args.outputXmerTables:
        #     writeXmerDict(xcD, "preRemovedXmerCounts.tsv")

    # Read in all yMers in targets
    ysD = {}
    yNameD = {}
    for i, s in enumerate(tS):
        yL = kt.kmerList(s, args.yMerSize)
        for j, y in enumerate(yL):
            if len(set(y).intersection(args.exSet)) == 0:
                ysD[y] = 0
                yNameD[y] = "%s_%04d" % (tN[i], j)

    # Design peptides
    newPeps = []
    newNames = []

    while (1 - (len(xcD) / totalX)) < args.target:

        thisPep = choosePep(ysD, xcD, args)
        thisName = yNameD[thisPep]
        newPeps.append(thisPep)
        newNames.append(thisName)

        #Remove selected peptide from ysD
        del ysD[thisPep]

        #Remove covered xMers from xcD
        for eachX in kt.kmerList(thisPep, args.xMerSize):
            if eachX in xcD:
                del xcD[eachX]

    # Write out peptides
    ft.write_fasta(newNames, newPeps, out)

    return len(newPeps)
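The --pre handling above subtracts xMers already covered by previously designed peptide libraries before the greedy loop starts, so new peptides are only chosen for what those libraries miss. The core of that subtraction, restated on a plain dict:

def remove_covered(xmer_counts, pre_peptides, x):
    # Drop every xMer that occurs in any pre-designed peptide
    for pep in pre_peptides:
        for i in range(len(pep) - x + 1):
            xmer_counts.pop(pep[i:i + x], None)
    return xmer_counts

counts = {"MKT": 2, "KTA": 1, "TAY": 1}
print(remove_covered(counts, ["MKTA"], x=3))   # {'TAY': 1}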