def design(inp, out, args):
    """Design a kmer library from the input fasta *inp* and write it to *out*.

    Uses a gap-spanning designer when args.gap_span is set, otherwise the
    plain sliding-window designer. Returns the number of peptides written.
    """
    in_names, in_seqs = ft.read_fasta_lists(inp)
    records = [Sequence(name=n, sequence=s) for n, s in zip(in_names, in_seqs)]
    if not args.quiet:
        print("Number of input sequences: ", len(records))

    # Select the designer implementation based on the gap_span flag.
    designer_cls = GapSpanningLibraryDesigner if args.gap_span else LibraryDesigner
    designer = designer_cls(window_size=args.window_size, step_size=args.step_size)

    library = designer.design(records)
    if not args.quiet:
        print("Number of output Kmers: ", len(library))

    # Write the peptides sorted by name so output order is deterministic.
    by_name = {entry.name: entry.sequence for entry in library}
    sorted_names = sorted(by_name)
    ft.write_fasta(sorted_names, [by_name[n] for n in sorted_names], out)
    return len(sorted_names)
def main():
    """CLI: compute pairwise kmer-sharing statistics for each input fasta.

    Writes one TSV row per input file with mean/median/min/max proportion of
    shared kmers across all sequence pairs.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "inputs",
        help="One or more input target fasta files (unaligned).",
        nargs="+")
    parser.add_argument(
        "-e", "--exclude", default="X-",
        help="Any Xmers or yMers containing these characters will be excluded.")
    reqArgs = parser.add_argument_group('required arguments')
    # FIX: the original declared this option both required=True and default=9,
    # which is contradictory (the default could never be used, yet the help
    # formatter advertised it). Keep the default and drop required so
    # existing invocations that pass -k still behave identically.
    reqArgs.add_argument(
        '-k', '--kmer_size',
        help="kmer size to use for comparing sequences.",
        default=9, type=int)
    reqArgs.add_argument("-o", "--out", help="Output file name. ", required=True)
    args = parser.parse_args()

    # Characters that disqualify a kmer from comparison.
    exSet = set(args.exclude)

    with open(args.out, "w") as fout:
        fout.write("File\tAvgPropShared\tMedianPropShared\tMinPropShared\tMaxPropShared\n")
        # Step through input files
        for eachF in args.inputs:
            # Read in seqs in file
            names, seqs = ft.read_fasta_lists(eachF)
            propIDs = [
                kt.compSeqs(s1, s2, args.kmer_size, filter=exSet)
                for s1, s2 in it.combinations(seqs, 2)
            ]
            # FIX: a file with fewer than two sequences yields no pairs and
            # min()/max() would raise ValueError on the empty list; report
            # and skip rather than crashing mid-run.
            if not propIDs:
                print("Warning: %s contains fewer than two sequences; skipping." % (eachF))
                continue
            fout.write("%s\t%.3f\t%.3f\t%.3f\t%.3f\n" %
                       (eachF, np.mean(propIDs), np.median(propIDs),
                        min(propIDs), max(propIDs)))
def main():
    """CLI: generate replicate downsampled fasta datasets at requested sizes.

    For each size in --num and each replicate, writes a fasta of randomly
    chosen records into the --output directory.
    """
    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument(
        '-r', '--reps',
        help="Number of replicate datasets to generate for each level of divergence.",
        default=1, type=int)
    reqArgs = arg_parser.add_argument_group('Required Arguments')
    reqArgs.add_argument(
        '-i', '--input',
        help="Fasta file containing the protein sequence to downsample.",
        required=True)
    reqArgs.add_argument(
        '-n', '--num',
        help="Size(s) of downsampled datasets. Can be a comma-delimited list of integers",
        required=True)
    reqArgs.add_argument(
        '-o', '--output',
        help="Directory name for output files. Will be created, if it doesn't already exist",
        required=True)
    args = arg_parser.parse_args()

    # Generate output directory. FIX: use makedirs so nested paths
    # (e.g. results/run1) are created too; plain mkdir fails on them.
    if not os.path.isdir(args.output):
        os.makedirs(args.output)
    else:
        print("Warning: %s already exists!" % (args.output))

    # Read in fasta file to downsample
    names, seqs = ft.read_fasta_lists(args.input)

    # Extract file basename (input filename with its final extension stripped)
    bName = ".".join(os.path.basename(args.input).split(".")[:-1])

    # Step through each dataset size, writing args.reps replicates of each
    sizes = [int(x) for x in args.num.split(",")]
    for s in sizes:
        for sCount in range(args.reps):
            # NOTE(review): random.choices samples WITH replacement, so a
            # replicate can contain duplicate records — confirm this is
            # intended (random.sample would sample without replacement).
            indexes = random.choices(range(len(names)), k=s)
            ft.write_fasta(
                [names[i] for i in indexes], [seqs[i] for i in indexes],
                "%s/%s_n%04d-%03d.fasta" % (args.output, bName, s, sCount))
def main():
    """CLI: report per-file average kmer counts and proportions for each kmer size.

    For every input fasta and every requested kmer size, writes the average
    kmer count per sequence and the total average kmer count.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "inputs",
        help="One or more input target fasta files (unaligned).",
        nargs="+")
    # FIX: corrected "chaarcters" typo in the user-visible help text.
    parser.add_argument(
        "-e", "--exclude", default="X-",
        help="Any Xmers or yMers containing these characters will be excluded.")
    reqArgs = parser.add_argument_group('required arguments')
    reqArgs.add_argument(
        '-k', '--kmer_size',
        help="Comma-delimited list of kmer sizes to use for comparing sequences.",
        required=True)
    reqArgs.add_argument("-o", "--out", help="Output file name. ", required=True)
    args = parser.parse_args()

    # Characters that disqualify a kmer.
    exSet = set(args.exclude)

    # Parse kmer sizes
    kmers = [int(k) for k in args.kmer_size.split(",")]

    with open(args.out, "w") as fout:
        fout.write("File\t%s\t%s\n" % (
            "\t".join(["Avg%dmerProp" % k for k in kmers]),
            "\t".join(["Avg%dmers" % k for k in kmers])))
        # Step through input files
        for eachF in args.inputs:
            # Only the sequence count is used here; kmer counting re-reads
            # the file inside kmerDictCountFasta.
            fNames, fSeqs = ft.read_fasta_lists(eachF)
            avgProps = []
            # Step through each kmer size
            for k in kmers:
                cD = kt.kmerDictCountFasta(eachF, k, filter=exSet)
                avgProps.append(np.mean(list(cD.values())))
            fout.write("%s\t%s\t%s\n" % (
                eachF,
                "\t".join(["%.3f" % (ap / len(fNames)) for ap in avgProps]),
                "\t".join(["%.3f" % (ap) for ap in avgProps])))
def main():
    """CLI: report pairwise identity and divergence statistics per aligned fasta.

    Writes one TSV row per input file: average/min/max identity followed by
    average/min/max divergence across all sequence pairs.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("inputs",
                        help="One or more input target fasta files (aligned).",
                        nargs="+")
    # FIX: corrected "chaarcters" typo in the user-visible help text.
    parser.add_argument(
        "-e", "--exclude", default="X-",
        help="Any Xmers or yMers containing these characters will be excluded.")
    reqArgs = parser.add_argument_group('required arguments')
    reqArgs.add_argument("-o", "--out", help="Output file name. ", required=True)
    args = parser.parse_args()

    # Characters that disqualify a position from comparison.
    exSet = set(args.exclude)

    with open(args.out, "w") as fout:
        fout.write(
            "File\tAvgIdentity\tMinIdentity\tMaxIdentity\tAvgDivergence\tMinDivergence\tMaxDivergence\n"
        )
        # Step through input files
        for eachF in args.inputs:
            fNames, fSeqs = ft.read_fasta_lists(eachF)
            ids = []
            # Step through each sequence pair
            for s1, s2 in it.combinations(fSeqs, 2):
                ids.append(compSeqs(s1, s2, exSet))
            avg = np.mean(ids)
            mn = min(ids)
            mx = max(ids)
            # BUG FIX: divergence = 1 - identity, so the minimum divergence
            # corresponds to the MAXIMUM identity and vice versa. The original
            # wrote 1-mn under "MinDivergence" and 1-mx under "MaxDivergence",
            # swapping the two columns.
            fout.write("%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\n" %
                       (eachF, avg, mn, mx, 1 - avg, 1 - mx, 1 - mn))
def main():
    """Read a fasta file and write out only the first occurrence of each sequence."""
    argparser = argparse.ArgumentParser(
        "Output a FASTA containing only unique sequences.")
    argparser.add_argument('-i', '--input', help="Name of input file.")
    argparser.add_argument(
        '-o', '--output',
        help="Name of output file This file will contain the "
        "same sequences as the input file, but duplicates will not be included.")
    args = argparser.parse_args()

    names, sequences = fastatools.read_fasta_lists(args.input)

    # Keep only the first record carrying each distinct sequence.
    kept_names = []
    kept_seqs = []
    observed = set()
    for label, sequence in zip(names, sequences):
        if sequence in observed:
            continue
        observed.add(sequence)
        kept_names.append(label)
        kept_seqs.append(sequence)

    fastatools.write_fasta(kept_names, kept_seqs, args.output)
def main():
    """Dedup fasta records that share a name.

    Records under the same name collapse to one copy when all of their
    sequences are identical; otherwise the name is printed and every copy
    is kept. Output is written to "<prepend>_<input filename>".
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("fastas", help="Fasta files to dedup", nargs='+')
    parser.add_argument(
        "--prepend", default="dedup",
        help="String to add to the beginning of deduped files.")
    args = parser.parse_args()

    for fasta_path in args.fastas:
        # Read in fasta file
        names, seqs = ft.read_fasta_lists(fasta_path)

        # Group sequences under their record name.
        grouped = defaultdict(list)
        for name, seq in zip(names, seqs):
            grouped[name].append(seq)

        out_names = []
        out_seqs = []
        for name, seq_list in grouped.items():
            if len(set(seq_list)) == 1:
                # All copies identical: keep a single representative.
                out_names.append(name)
                out_seqs.append(seq_list[0])
            else:
                # Conflicting sequences under one name: report and keep all.
                print(name)
                for seq in seq_list:
                    out_names.append(name)
                    out_seqs.append(seq)

        ft.write_fasta(out_names, out_seqs, "%s_%s" % (args.prepend, fasta_path))
def design(inp, out, args):
    """Design a peptide library for the targets in *inp*, writing to *out*.

    First covers the best-scoring representative target with sliding-window
    peptides, then greedily selects additional yMer peptides until the xmer
    coverage fraction reaches args.target. Returns the number of peptides
    written.
    """
    tN, tS = ft.read_fasta_lists(inp)

    # Count every xmer (free of excluded characters) across all targets.
    xcD = {}
    for seq in tS:
        for xmer in kt.kmerList(seq, args.xMerSize):
            if not set(xmer).intersection(args.exSet):
                xcD[xmer] = xcD.get(xmer, 0) + 1

    # Total distinct xmers in the targets: denominator for coverage.
    totalX = len(xcD)

    # Pick the representative: the target whose xmers sum to the highest count.
    maxScore = 0
    repS = ""
    repN = ""
    for name, seq in zip(tN, tS):
        score = sum(xcD[x] for x in kt.kmerList(seq, args.xMerSize) if x in xcD)
        if score > maxScore:
            maxScore = score
            repS = seq
            repN = name

    # Sliding-window peptides across the chosen representative sequence.
    designer = LibraryDesigner(window_size=args.yMerSize,
                               step_size=args.step_size)
    library = designer.design([Sequence(name=repN, sequence=repS)])
    repD = {entry.name: entry.sequence for entry in library}
    repNames = sorted(repD)
    repSeqs = [repD[n] for n in repNames]

    # Drop xmers already covered by the sliding-window peptides.
    for pep in repSeqs:
        for xmer in kt.kmerList(pep, args.xMerSize):
            if xmer in xcD:
                del xcD[xmer]

    # Collect candidate yMers (and a name for each) from all targets.
    ysD = {}
    yNameD = {}
    for idx, seq in enumerate(tS):
        for pos, ymer in enumerate(kt.kmerList(seq, args.yMerSize)):
            if not set(ymer).intersection(args.exSet):
                ysD[ymer] = 0
                yNameD[ymer] = "%s_%04d" % (tN[idx], pos)

    # Greedily choose peptides until the coverage target is met.
    newSeqs = []
    newNames = []
    while (1 - (len(xcD) / totalX)) < args.target:
        pep = choosePep(ysD, xcD, args)
        newSeqs.append(pep)
        newNames.append(yNameD[pep])
        # Remove the selected peptide from the candidate pool.
        del ysD[pep]
        # Remove the xmers it covers.
        for xmer in kt.kmerList(pep, args.xMerSize):
            if xmer in xcD:
                del xcD[xmer]

    # Write out peptides
    ft.write_fasta(repNames + newNames, repSeqs + newSeqs, out)
    return len(repSeqs + newSeqs)
def design(inp, out, args):
    """Greedily design yMer peptides for the targets in *inp*, writing to *out*.

    Xmers already covered by pre-designed peptide files (args.pre, a
    comma-delimited list of fastas) are credited before selection begins.
    Selection continues until the xmer coverage fraction reaches args.target.
    Returns the number of new peptides written.
    """
    tN, tS = ft.read_fasta_lists(inp)

    # Count every xmer (free of excluded characters) across all targets.
    xcD = defaultdict(int)
    for seq in tS:
        for xmer in kt.kmerList(seq, args.xMerSize):
            if not set(xmer).intersection(args.exSet):
                xcD[xmer] += 1

    # Total distinct xmers in the targets: denominator for coverage.
    totalX = len(xcD)

    # Credit pre-designed peptides by removing the xmers they already cover.
    if args.pre:
        for pre_file in args.pre.split(","):
            pN, pS = ft.read_fasta_lists(pre_file)
            for seq in pS:
                for xmer in kt.kmerList(seq, args.xMerSize):
                    if xmer in xcD:
                        del xcD[xmer]

    # Collect candidate yMers (and a name for each) from all targets.
    ysD = {}
    yNameD = {}
    for idx, seq in enumerate(tS):
        for pos, ymer in enumerate(kt.kmerList(seq, args.yMerSize)):
            if not set(ymer).intersection(args.exSet):
                ysD[ymer] = 0
                yNameD[ymer] = "%s_%04d" % (tN[idx], pos)

    # Greedily choose peptides until the coverage target is met.
    newPeps = []
    newNames = []
    while (1 - (len(xcD) / totalX)) < args.target:
        pep = choosePep(ysD, xcD, args)
        newPeps.append(pep)
        newNames.append(yNameD[pep])
        # Remove the selected peptide from the candidate pool.
        del ysD[pep]
        # Remove the xmers it covers.
        for xmer in kt.kmerList(pep, args.xMerSize):
            if xmer in xcD:
                del xcD[xmer]

    # Write out peptides
    ft.write_fasta(newNames, newPeps, out)
    return len(newPeps)