def main():
    """Drop reads whose ID occurs exactly twice in the ICW rpoB FASTA file.

    Steps:
      1. Scan the FASTA file and collect every header ID that matches the
         hard-coded run prefix.
      2. Count occurrences per ID and write them to ``read_counts.csv``.
      3. Re-read the counts table, keep the IDs with a count of 2 and append
         ``reads_tag`` to each.
      4. Hand the FASTA file, that ID list and the output handle to
         ``Subsampling.exclude``.

    If the FASTA file cannot be opened, "no such file!" is printed and the
    function returns without creating or truncating any output.
    """
    fasta_path = "/Users/Xin/Desktop/IC_project/output/rpoB_output/ICW_rpoB.fasta"
    output0 = "read_counts.csv"
    # NOTE(review): the counts table is written relative to the working
    # directory (output0) but read back from this absolute path; the two only
    # coincide when the script runs inside rpoB_output -- confirm.
    counts_table = "/Users/Xin/Desktop/IC_project/output/rpoB_output/read_counts.csv"
    nopaired_path = "ICW_rpoB_nopaired.fasta"
    reads_tag = '_2:N:0:5'
    # Raw string: same pattern as before, without invalid-escape warnings.
    header_re = re.compile(r"^\>(M00704:49:000000000-AFW6D[\d\:]+)")

    try:
        f0 = open(fasta_path, 'r')
    except IOError:
        print("no such file!")
        # Nothing to process; continuing would fail on the unopened handle.
        return
    print("Open success!")

    with f0:
        readslist = [hit.group(1) for hit in map(header_re.search, f0) if hit]
    print("Number of the reads in the list is ", len(readslist))

    df_reads = pd.DataFrame(readslist, columns=['read'])
    read_counts = df_reads['read'].value_counts()
    # The table is parsed below with explicit fieldnames (no header row), so
    # suppress the header explicitly instead of relying on the pandas default,
    # which differs between versions.
    read_counts.to_csv(output0, header=False)

    with open(counts_table, 'r') as csvfile:
        rows = csv.DictReader(csvfile, delimiter=",", fieldnames=['read', 'count'])
        readslist2 = [row['read'] + reads_tag for row in rows if int(row['count']) == 2]

    # The output is opened only once every input has been read successfully.
    with open(fasta_path, 'r') as f, open(nopaired_path, 'w') as output:
        subset = Subsampling()
        subset.exclude(f, readslist2, output)
def main():
    """Interactively filter a reads file against a list of read IDs.

    Prompts for three paths: the reads collection, a CSV filter file whose
    first column holds read IDs (second column: genus), and the output file.
    The reads handle, the ID list and the output handle are then passed to
    ``Subsampling.exclude``.

    If the reads collection cannot be opened, "no such file!" is printed and
    the function returns without touching the output path. A missing filter
    file still raises, as before.
    """
    input1dir = input('Reads collections: ')
    input2dir = input('Reads filter: ')
    outputdir = input('Output file: ')

    try:
        all_reads = open(input1dir, 'r')
    except IOError:
        print("no such file!")
        # Stop here; there is no reads handle to work with.
        return

    with all_reads:
        with open(input2dir, 'r') as inputfile:
            rows = csv.DictReader(inputfile, delimiter=",", fieldnames=['reads', 'genus'])
            # all the reads named in the filter file
            readslist = [row['reads'] for row in rows]
        print("total reads in filter list:", len(readslist))

        # Open (and truncate) the output only after both inputs were readable.
        with open(outputdir, 'w') as output:
            subset = Subsampling()
            subset.exclude(all_reads, readslist, output)
def main():
    """Select the unmapped reads whose taxon appears in the genus-count table.

    Reads two header-less CSV tables: the genus counts (columns ``taxa``,
    ``count``) and the per-read taxa (columns ``num``, ``read``, ``id``,
    ``taxa``). Every read whose ``taxa`` value occurs in the genus table is
    collected, in table order, and passed together with the unmapped FASTA
    handle and the output handle to ``Subsampling.include``.
    """
    inputfiledir1 = "/Users/Xin/Desktop/IC_project/output/Jan222016/resource_tables/ICC_DS2_2_unmapped_genus_count.csv"
    inputfiledir2 = "/Users/Xin/Desktop/IC_project/output/Jan222016/resource_tables/ICC_DS2_2_unmapped_taxa.csv"
    inputfiledir3 = "/Users/Xin/Desktop/IC_project/output/ICC_DS2_CLC_mapping_output/ICC_DS2_2_CLC_unmapped.fa"
    outputfiledir = "/Users/Xin/Desktop/IC_project/output/Feb032016/ICC_DS2_2_unmapped_genus.fa"

    genus = pd.read_csv(inputfiledir1, names=["taxa", "count"], header=None)
    read_taxa = pd.read_csv(inputfiledir2, names=["num", "read", "id", "taxa"], header=None)

    # A set gives constant-time membership tests; the original scanned a list
    # once per read.
    wanted_taxa = set(genus["taxa"])
    reads = read_taxa["read"]
    readlist = [
        reads[i]
        for i, taxon in enumerate(read_taxa["taxa"])
        if taxon in wanted_taxa
    ]

    # Open the output last so a failure while loading the tables does not
    # leave a truncated file behind; both handles are closed on exit.
    with open(inputfiledir3, 'r') as seq, open(outputfiledir, 'w') as outputfile:
        subset = Subsampling()
        subset.include(seq, readlist, outputfile)
def main(argv):
    """Filter a reads file against a list of read IDs given on the command line.

    Args:
        argv: Argument vector in ``sys.argv`` layout: program name, reads
            collection path, CSV filter path (first column: read IDs, second
            column: genus), output path.

    The reads handle, the ID list and the output handle are passed to
    ``Subsampling.exclude``.

    When the argument count is wrong, or the reads collection cannot be
    opened, a message is printed and the function returns without touching
    the output path. A missing filter file still raises, as before.
    """
    if len(argv[1:]) != 3:
        print("Three arguements are needed!!")
        # Without the paths there is nothing to do; the original fell through
        # and failed on the unassigned variables.
        return
    input1dir, input2dir, outputdir = argv[1:]

    try:
        all_reads = open(input1dir, 'r')
    except IOError:
        print("no such file!")
        return

    with all_reads:
        with open(input2dir, 'r') as inputfile:
            rows = csv.DictReader(inputfile, delimiter=",", fieldnames=['reads', 'genus'])
            # all the reads named in the filter file
            readslist = [row['reads'] for row in rows]
        print("total reads in filter list:", len(readslist))

        # Open (and truncate) the output only after both inputs were readable.
        with open(outputdir, 'w') as output:
            subset = Subsampling()
            subset.exclude(all_reads, readslist, output)