# After selection: run k-mer counting on the selection directory. The
# available memory is split evenly across the requested threads.
# NOTE(review): aaf_kmercount is defined outside this fragment; its return
# value is iterated below as a sequence of sample names -- confirm.
samples = aaf_kmercount(selection_dir, kl, n, options.nThreads,
                        memSize / options.nThreads)

### Merge output wc files
# Concatenate every per-sample '<sample>.wc' file into '<selection_dir>.wc'
# and delete the per-sample file afterwards. This is done in-process (binary
# copy) instead of shelling out to `cat`, so sample paths containing spaces or
# shell metacharacters are handled and a missing file fails loudly.
divFile = selection_dir + '.wc'
with open(divFile, 'wb') as merged:
    for sample in samples:
        countfile = sample + '.wc'
        with open(countfile, 'rb') as part:
            merged.write(part.read())
        os.remove(countfile)

### Run kmer_merge
# Write the '#'-prefixed header (k, n, and one line per sample) first; the
# shell pipeline assembled below appends its gzip output to the same file.
outFile = selection_dir + '.dat.gz'
handle = smartopen(outFile, 'w')
try:
    handle.write(('#-k {}\n'.format(kl)).encode())
    handle.write(('#-n {}\n'.format(n)).encode())
    for i, sample in enumerate(samples):
        handle.write(('#sample{}: {}\n'.format(i + 1, sample)).encode())
finally:
    # Close even when a write fails so the header file is not left open.
    handle.close()

# Build the merge pipeline: one '<sample>.pkdat.gz' argument per sample, then
# `cut` keeps the even-numbered columns 2, 4, ..., 2 * len(samples).
command = "{} -k s -c -d '0' -a 'T,M,F'".format(filt)
cut = []
for i, sample in enumerate(samples):
    command += " '{}.pkdat.gz'".format(sample)
    cut.append(str((i + 1) * 2))
command += ' | cut -f {} | gzip >> {}'.format(','.join(cut), outFile)
print('\n', time.strftime('%c'))
print(command)
# MA 02110-1301, USA.
#
from AAF import smartopen, present
from optparse import OptionParser
import sys

# Convert a shared k-mer table into a tab-separated matrix file named
# '<input basename>_kmerMatrix.tsv'. Every count column of a data row is
# passed through present(count, n), where n is the -n filtering threshold.
Usage = "%prog [options] shared_kmer_table"
version = '%prog 20170118.1'
parser = OptionParser(Usage, version=version)
parser.add_option("-n", dest="filter", type=int, default=1,
                  help="k-mer filtering threshold, default = 1")
(options, args) = parser.parse_args()

# Take the table path from the parsed positionals, not sys.argv[1]: with
# `script -n 2 table`, sys.argv[1] is '-n' rather than the table.
if not args:
    parser.error('shared_kmer_table is required')
table_path = args[0]

n = options.filter
kmer_table = smartopen(table_path)
# NOTE(review): split('.')[0] truncates at the first dot anywhere in the
# path (e.g. './dir/x.dat' yields ''); kept as-is to preserve output naming.
outfile = open(table_path.split('.')[0] + '_kmerMatrix.tsv', 'w')
try:
    line1 = kmer_table.readline()
    if line1.startswith('#'):  # with header
        outfile.write('kmers')
        for line in kmer_table:
            if line.startswith('#sample'):
                # Sample name is the text after the colon in '#sampleN: name'.
                outfile.write('\t' + line.split(":")[1].strip() + '\n')
            else:
                # NOTE(review): any other '#' header line also lands here and
                # is written as if it were a k-mer row -- confirm intent.
                outfile.write(line.split()[0] + '\t')
                outfile.write(
                    '\t'.join([present(i, n) for i in line.split()[1:]]) + '\n')
    else:
        # NOTE(review): without a header only this first row is converted in
        # the code visible here -- confirm whether the rest should follow.
        outfile.write(line1.split()[0] + '\t')
        outfile.write(
            '\t'.join([present(i, n) for i in line1.split()[1:]]) + '\n')
finally:
    # Release both handles even if a malformed row raises.
    outfile.close()
    kmer_table.close()
if flag[-3] == '1': #read unmapped if flag[-1] == '1': #read paired if flag[-4] =='1': #mate unmapped dic[line.split()[0]] = 'both' else: if flag[5] == '1': dic[line.split()[0]] = 'R1' else: dic[line.split()[0]] = 'R2' else: if flag[5] == '1': dic[line.split()[0]] = 'R1' else: dic[line.split()[0]] = 'R2' file1 = smartopen(sys.argv[2]) out1 = gzip.open(sys.argv[1].split('.')[0] + '_pair1.fq.gz','w') out3 = gzip.open(sys.argv[1].split('.')[0] + '_singleton.fq.gz','w') for record1 in SeqIO.parse(file1,'fastq'): if record1.id in dic: if dic[record1.id] == 'both': SeqIO.write(record1,out1,"fastq") elif dic[record1.id] == 'R1': SeqIO.write(record1,out3,"fastq") file1.close() out1.close() file2 = smartopen(sys.argv[3]) out2 = gzip.open(sys.argv[1].split('.')[0] + '_pair2.fq.gz','w') for record2 in SeqIO.parse(file2,'fastq'): if record2.id in dic:
pattern[kmer] = line_pattern return pattern Usage = "%prog [options] shared_kmer_table kmer_file" version = '%prog 20161117.1' parser = OptionParser(Usage, version = version) parser.add_option("-n", dest = "filter", type = int, default = 1, help = "k-mer filtering threshold, default = 1") parser.add_option("-t", dest = "nThreads", type = int, default = 1, help = "number of threads to use, default = 1") parser.add_option("-G", dest = "memsize", type = float, default = 1, help = "max memory to use (in GB), default = 1") (options, args) = parser.parse_args() kmer_table = smartopen(sys.argv[1]) input = smartopen(sys.argv[2]) n = options.filter nThreads = options.nThreads memory = options.memsize line = input.readline() line = input.readline() if line.startswith(tuple('ATCG')): Type = 'kmer' output = open(os.path.basename(sys.argv[2]).split('.')[0]+'.pattern','w') elif line.startswith(tuple('01')): Type = 'pattern' print(os.path.basename(sys.argv[2]).split('.')[0]+'.pattern') output = open(os.path.basename(sys.argv[2]).split('.')[0]+'.kmer','w') else:
if flag[-3] == '1': #read unmapped if flag[-1] == '1': #read paired if flag[-4] == '1': #mate unmapped dic[line.split()[0]] = 'both' else: if flag[5] == '1': dic[line.split()[0]] = 'R1' else: dic[line.split()[0]] = 'R2' else: if flag[5] == '1': dic[line.split()[0]] = 'R1' else: dic[line.split()[0]] = 'R2' file1 = smartopen(sys.argv[2]) out1 = gzip.open(sys.argv[1].split('.')[0] + '_pair1.fq.gz', 'w') out3 = gzip.open(sys.argv[1].split('.')[0] + '_singleton.fq.gz', 'w') for record1 in SeqIO.parse(file1, 'fastq'): if record1.id in dic: if dic[record1.id] == 'both': SeqIO.write(record1, out1, "fastq") elif dic[record1.id] == 'R1': SeqIO.write(record1, out3, "fastq") file1.close() out1.close() file2 = smartopen(sys.argv[3]) out2 = gzip.open(sys.argv[1].split('.')[0] + '_pair2.fq.gz', 'w') for record2 in SeqIO.parse(file2, 'fastq'): if record2.id in dic:
# Report C/G/N counts, GC fraction, mean sequence length and total length
# for one sequence file, or for every file in a directory.
Usage = "%prog [options] <data directory> <sequence format, fasta or fastq>"
version = '%prog 20161212.1'
parser = OptionParser(Usage, version=version)
(options, args) = parser.parse_args()

# Fail with a usage message instead of an IndexError on missing arguments.
if len(sys.argv) < 3:
    parser.error('expected <data directory> <sequence format>')

if os.path.isdir(sys.argv[1]):
    file_list_1 = os.listdir(sys.argv[1])
    file_list = [os.path.join(sys.argv[1], x) for x in file_list_1]
else:
    file_list = [sys.argv[1]]
seq_form = sys.argv[2]

dic = {}
c = 0
g = 0
n = 0
total = 0
length = []
for seq_file in file_list:
    # Skip files whose name ends in '~'.
    if seq_file.endswith("~"):
        continue
    fh = smartopen(seq_file)
    try:
        for seq_record in SeqIO.parse(fh, seq_form):
            # Only upper-case 'C', 'G' and 'N' are counted.
            c += seq_record.seq.count('C')
            g += seq_record.seq.count('G')
            n += seq_record.seq.count('N')
            total += len(seq_record.seq)
            length.append(len(seq_record.seq))
    finally:
        # Close each input so a large directory does not leak handles.
        fh.close()
    # Running (cumulative) totals after each file.
    print(seq_file, c, g, n, total)

# GC fraction is computed over non-N bases; report nan rather than raising
# ZeroDivisionError when nothing was read or every base is N.
non_n = total - n
print('GC:', float((c + g)) / non_n if non_n else float('nan'))
print('mean:', total / len(length) if length else float('nan'))
print('total:', total)
fitch = './fitch_kmerX_long' else: fitch = './fitch_kmerX' if not is_exe(fitch): print(fitch+' not found. Make sure it is in your PATH or the') print('current directory, and that it is executable') sys.exit() else: if options.long: fitch = 'fitch_kmerX_long' else: fitch = 'fitch_kmerX' #check input files try: kmerTable = smartopen(options.iptf) except IOError: print('Cannot open file', options.iptf) sys.exit() try: singleton = open(options.countfs) except IOError: print('Cannot open file', options.countf) sys.exit() ###Read header and get sample list samples = [] #species list line = kmerTable.readline() ll = line.split() kl = int(ll[1]) #kmer length
# Report C/G/N counts, GC fraction, mean sequence length and total length
# for one sequence file, or for every file in a directory.
Usage = "%prog [options] <data directory> <sequence format, fasta or fastq>"
version = '%prog 20161212.1'
parser = OptionParser(Usage, version=version)
(options, args) = parser.parse_args()

# Fail with a usage message instead of an IndexError on missing arguments.
if len(sys.argv) < 3:
    parser.error('expected <data directory> <sequence format>')

if os.path.isdir(sys.argv[1]):
    file_list_1 = os.listdir(sys.argv[1])
    file_list = [os.path.join(sys.argv[1], x) for x in file_list_1]
else:
    file_list = [sys.argv[1]]
seq_form = sys.argv[2]

dic = {}
c = 0
g = 0
n = 0
total = 0
length = []
for seq_file in file_list:
    # Skip files whose name ends in '~'.
    if seq_file.endswith("~"):
        continue
    fh = smartopen(seq_file)
    try:
        for seq_record in SeqIO.parse(fh, seq_form):
            # Only upper-case 'C', 'G' and 'N' are counted.
            c += seq_record.seq.count('C')
            g += seq_record.seq.count('G')
            n += seq_record.seq.count('N')
            total += len(seq_record.seq)
            length.append(len(seq_record.seq))
    finally:
        # Close each input so a large directory does not leak handles.
        fh.close()
    # Running (cumulative) totals after each file.
    print(seq_file, c, g, n, total)

# GC fraction is computed over non-N bases; report nan rather than raising
# ZeroDivisionError when nothing was read or every base is N.
non_n = total - n
print('GC:', float((c + g)) / non_n if non_n else float('nan'))
print('mean:', total / len(length) if length else float('nan'))
print('total:', total)
type=int, default=1, help="k-mer filtering threshold, default = 1") parser.add_option("-t", dest="nThreads", type=int, default=1, help="number of threads to use, default = 1") parser.add_option("-G", dest="memsize", type=float, default=1, help="max memory to use (in GB), default = 1") (options, args) = parser.parse_args() kmer_table = smartopen(sys.argv[1]) input = smartopen(sys.argv[2]) n = options.filter nThreads = options.nThreads memory = options.memsize line = input.readline() line = input.readline() if line.startswith(tuple('ATCG')): Type = 'kmer' output = open( os.path.basename(sys.argv[2]).split('.')[0] + '.pattern', 'w') elif line.startswith(tuple('01')): Type = 'pattern' print(os.path.basename(sys.argv[2]).split('.')[0] + '.pattern') output = open(os.path.basename(sys.argv[2]).split('.')[0] + '.kmer', 'w')
# After selection: run k-mer counting on the selection directory. The
# available memory is split evenly across the requested threads.
# NOTE(review): aaf_kmercount is defined outside this fragment; its return
# value is iterated below as a sequence of sample names -- confirm.
samples = aaf_kmercount(selection_dir, kl, n, options.nThreads,
                        memSize / options.nThreads)

### Merge output wc files
# Concatenate every per-sample '<sample>.wc' file into '<selection_dir>.wc'
# and delete the per-sample file afterwards. This is done in-process (binary
# copy) instead of shelling out to `cat`, so sample paths containing spaces or
# shell metacharacters are handled and a missing file fails loudly.
divFile = selection_dir + '.wc'
with open(divFile, 'wb') as merged:
    for sample in samples:
        countfile = sample + '.wc'
        with open(countfile, 'rb') as part:
            merged.write(part.read())
        os.remove(countfile)

### Run kmer_merge
# Write the '#'-prefixed header (k, n, and one line per sample) first; the
# shell pipeline assembled below appends its gzip output to the same file.
# Header text is encoded as latin-1 before being written.
outFile = selection_dir + '.dat.gz'
handle = smartopen(outFile, 'w')
try:
    handle.write(('#-k {}\n'.format(kl)).encode('latin-1'))
    handle.write(('#-n {}\n'.format(n)).encode('latin-1'))
    for i, sample in enumerate(samples):
        handle.write(
            ('#sample{}: {}\n'.format(i + 1, sample)).encode('latin-1'))
finally:
    # Close even when a write fails so the header file is not left open.
    handle.close()

# Build the merge pipeline: one '<sample>.pkdat.gz' argument per sample, then
# `cut` keeps the even-numbered columns 2, 4, ..., 2 * len(samples).
command = "{} -k s -c -d '0' -a 'T,M,F'".format(filt)
cut = []
for i, sample in enumerate(samples):
    command += " '{}.pkdat.gz'".format(sample)
    cut.append(str((i + 1) * 2))
command += ' | cut -f {} | gzip >> {}'.format(','.join(cut), outFile)
print('\n', time.strftime('%c'))
print(command)