def clean_single_read_data(fastq1_file, fastq1ok_file, fastq_bad_file, verbose=False, adapters_file=None, cutoff=None, polyG_cutoff=23): ''' Remove reads containing N, # quality, polyG/polyC tracks and adapters. ''' wh1 = open(fastq1ok_file, "w") bad = open(fastq_bad_file, "w") statistics = { "pe": 0, "se": 0, "N": 0, "zeroQ": 0, "polyC%s" % polyG_cutoff: 0, "polyG%s" % polyG_cutoff: 0, "adapters": 0, } if cutoff: cutoff = int(cutoff) cutoff_key = "length%s" % cutoff statistics[cutoff_key] = 0 print "Load adapters file" adapters = [] if adapters_file: with open(adapters_file) as fh: for line in fh: adap = line.strip().split()[0] rev_adap = get_revcomp(adap) if not adap in adapters: adapters.append(adap) if not rev_adap in adapters: adapters.append(rev_adap) else: print "Adapter file missing" print "Number of adapters:", len(adapters) for i, read1 in enumerate(fastq_reader(fastq1_file)): error1 = None if verbose: print i, round(100 * statistics["se"]/float(i+1), 2), "% of good", statistics, "\r", if cutoff: if read1.length < cutoff: error1 = cutoff_key if not error1: error1 = is_bad_read(read1, adapters, polyG_cutoff) if error1: bad.write(read1.fastq_with_error(error1)) statistics[error1] += 1 else: wh1.write(read1.fastq) statistics["se"] += 1 wh1.close() bad.close() if i > 0: statistics["fraction"] = statistics["pe"]/float(i) print print statistics return statistics
def kmer_to_repbase_with_mongo(kmer_file): client = MongoClient() client = MongoClient('mongodb://localhost:27017/') db = client.Repbase index = db.MainIndex name_index = db.NameIndex name_hash = {} print "Iter over kmers" for d in sc_iter_simple_tab_file(kmer_file): (kmer, tf) = d kmer = kmer.lower() print kmer, tf data = index.find_one({'kmer':kmer}) if not data: rkmer = get_revcomp(kmer) data = index.find_one({'kmer':rkmer}) if data: matches = data["index"] for rid, tf in matches: if rid in name_hash: name = name_hash[rid] else: name = name_index.find_one({"kid":rid}) name = name["name"] name_hash[rid] = name print "\t", name, tf else: print "\t???"
def clean_se_run(settings): ''' ''' # 1. Reads filter illumina kmers # 2. Filter data # 3. Save print "Load library for key=", settings["k"] with open(settings["pickle_libraries_file"]) as fh: library = cPickle.load(fh) library = library[settings["k"]] kmers =set(library.keys()) for kmer in library.keys(): kmers.add(get_revcomp(kmer)) with open(settings["dat_libraries_file"], "w") as fh: for kmer in kmers: fh.write("%s\t-\n" % kmer) prefix = settings["prefix"] verbose = settings["verbose"] adapters_file = settings["dat_libraries_file"] fastq1_file = "%s.fastq" % prefix fastq1ok_file = "%s.ok.fastq" % prefix fastq_bad_file = "%s.bad.fastq" % prefix clean_single_read_data(fastq1_file, fastq1ok_file, fastq_bad_file, verbose=verbose, adapters_file=adapters_file, cutoff=settings["cutoff"], polyG_cutoff=settings["polyGcutoff"] )
def check_adapters(settings): ''' ''' print "Load library for key=", settings["k"] with open(settings["pickle_libraries_file"]) as fh: library = cPickle.load(fh) library = library[settings["k"]] assert len(library.keys()[0]) == settings["k"] print "Library size:", len(library.keys()) contaminated_kmers = {} print "Iter over kmers" for i, d in enumerate(sc_read_simple_tab_file(settings["fastq_file"])): (kmer, tf) = d tf = int(tf) kmer = kmer.lower() print i, kmer, tf, "\r", if settings["cutoff"] and tf < settings["cutoff"]: break rkmer = get_revcomp(kmer) if kmer in library or rkmer in library: print print kmer, tf, library[kmer] contaminated_kmers[kmer] = (tf, library[kmer]) all_kmers = set(contaminated_kmers.keys()) contaminated_kmers = contaminated_kmers.items() contaminated_kmers.sort(key=lambda x: x[1], reverse=True) print "Save data" with open(settings["output_file"], "w") as fh: for (k, v) in contaminated_kmers: rkey = get_revcomp(k) s = "%s\t%s\n" % (k, v) fh.write(s) if not rkey in all_kmers: s = "%s\t%s\n" % (rkey, v) fh.write(s) return contaminated_kmers
def clean_pair_reads_data(fastq1_file, fastq2_file, fastq1ok_file, fastq2ok_file, fastq_se_file, fastq_bad_file, verbose=False, adapters_file=None, cutoff=None, polyG_cutoff=23): ''' Remove reads containing N, # quality, polyG/polyC tracks and adapters. ''' wh1 = open(fastq1ok_file, "w") wh2 = open(fastq2ok_file, "w") se = open(fastq_se_file, "w") bad = open(fastq_bad_file, "w") statistics = { "pe": 0, "se": 0, "N": 0, "zeroQ": 0, "polyC%s" % polyG_cutoff: 0, "polyG%s" % polyG_cutoff: 0, "adapters": 0, } if cutoff: cutoff = int(cutoff) cutoff_key = "length%s" % cutoff statistics[cutoff_key] = 0 adapters = [] if adapters_file: with open(adapters_file) as fh: for line in fh.readlines(): adap = line.strip().split()[0] rev_adap = get_revcomp(adap) if not adap in adapters: adapters.append(adap) if not rev_adap in adapters: adapters.append(rev_adap) print "Number of adapters:", len(adapters) for i, (read1, read2) in enumerate(iter_pe_data(fastq1_file, fastq2_file)): error1 = None error2 = None if verbose: print i, round(100 * statistics["pe"]/float(i+1), 2), "% of good", statistics, "\r", if cutoff: if read1.length < cutoff: error1 = cutoff_key if read2.length < cutoff: error2 = cutoff_key if not (error1 or error1): error1 = is_bad_read(read1, adapters, polyG_cutoff) error2 = is_bad_read(read2, adapters, polyG_cutoff) if not error1 and not error2: wh1.write(read1.fastq) wh2.write(read2.fastq) statistics["pe"] += 1 continue if error1: bad.write(read1.fastq_with_error(error1)) statistics[error1] += 1 else: se.write(read1.fastq) statistics["se"] += 1 if error2: bad.write(read2.fastq_with_error(error2)) statistics[error2] += 1 else: se.write(read2.fastq) statistics["se"] += 1 wh1.close() wh2.close() se.close() bad.close() if i > 0: statistics["fraction"] = statistics["pe"]/float(i) print print statistics return statistics
def kmer_to_cegma_with_mongo(kmer_file, verbose=False): client = MongoClient() client = MongoClient('mongodb://localhost:27017/') db = client.Repbase index = db.CegmaMainIndex name_index = db.CegmaNameIndex repbase_index = db.MainIndex name_hash = {} print "Iter over kmers" match = { "repbase": 0, "cegma": 0, "repbase_cegma": 0, "other": 0, } match_distr = { "repbase": defaultdict(int), "cegma": defaultdict(int), "repbase_cegma": defaultdict(int), "other": defaultdict(int), } for d in sc_iter_simple_tab_file(kmer_file): (kmer, tf) = d repbase_hit = False cegma_hit = False # print tf, kmer, "\r", tf = int(tf) print match, tf, "\r", kmer = kmer.lower() # if verbose: # print tf, kmer, "\r", data = repbase_index.find_one({'kmer':kmer}) if not data: rkmer = get_revcomp(kmer) data = repbase_index.find_one({'kmer':rkmer}) if data: repbase_hit = True data = index.find_one({'kmer':kmer}) if not data: rkmer = get_revcomp(kmer) data = index.find_one({'kmer':rkmer}) if data: matches = data["index"] cegma_hit = True # print # print kmer, tf # for rid, tf in matches: # if rid in name_hash: # name = name_hash[rid] # else: # name = name_index.find_one({"kid":rid}) # name = name["name"].strip() # name_hash[rid] = name # print "\t", name, tf # else: # # print "\t???" # pass if repbase_hit and cegma_hit: match["repbase_cegma"] += 1 match_distr["repbase_cegma"][tf] += 1 continue elif repbase_hit: match["repbase"] += 1 match_distr["repbase"][tf] += 1 continue elif cegma_hit: match["cegma"] += 1 match_distr["cegma"][tf] += 1 continue else: match["other"] += 1 print print match_distr print match