def getRandomSequences(rands, in_f, out_f, t, length, amount, seq_lst, seq_count):
    """Write fixed-length subsequences of the records in ``in_f`` to ``out_f``.

    Every input record whose sequence has at least ``length`` characters
    contributes ``len(sequence) - length + 1`` consecutive window offsets to a
    running, zero-based total; shorter records contribute none.  Each value in
    ``rands`` is interpreted as one such global offset, and the matching
    ``length``-character slice of the sequence (plus the same slice of record
    field 2 for "fastq") is written with an identifier produced by ``getID``.

    Args:
        rands: Window offsets to extract.  The scan only moves forward, so
            the values are expected in non-decreasing order.
        in_f: Path of the input sequence file.
        out_f: Path of the output file; it is created or truncated.
        t: File type, either "fastq" or "fasta".
        length: Length of every extracted subsequence.
        amount: Unused; kept for interface compatibility.
        seq_lst: Unused; kept for interface compatibility.
        seq_count: Unused; kept for interface compatibility.

    Raises:
        ValueError: If a subsequence must be written and ``t`` is neither
            "fastq" nor "fasta".
    """
    my_nt_count = 0  # number of window offsets covered by earlier records
    my_rand_count = 0  # index of the next unconsumed entry of rands
    total_rands = len(rands)
    seqs_gen = SeqIterator.SeqIterator(in_f, file_type=t)
    fd_seq_write = open(out_f, 'w')
    try:
        seq_writer = SeqIterator.SeqWriter(fd_seq_write, file_type=t)
        # With no offsets requested there is nothing to scan for; this also
        # avoids indexing into an empty rands list.
        if total_rands:
            for rec in seqs_gen:
                seq_len = len(rec[1])
                if seq_len < length:
                    continue
                windows = seq_len - length + 1
                if my_nt_count + windows < rands[my_rand_count]:
                    # The next requested offset lies beyond this record.
                    my_nt_count += windows
                    continue
                num = 0  # number of subsequences taken from this record
                while (my_rand_count < total_rands and
                       my_nt_count + seq_len - length >= rands[my_rand_count]):
                    b = rands[my_rand_count] - my_nt_count
                    e = b + length
                    if t == "fastq":
                        nextSeq = (getID(rec[0], num, b, e), rec[1][b:e],
                                   rec[2][b:e])
                    elif t == "fasta":
                        nextSeq = (getID(rec[0], num, b, e), rec[1][b:e])
                    else:
                        raise ValueError("Unsupported file type: %s" % str(t))
                    seq_writer.write(nextSeq)
                    my_rand_count += 1
                    num += 1
                if my_rand_count >= total_rands:
                    break
                my_nt_count += windows
        seq_writer.flush()
    finally:
        # Always release the output file, including on errors.
        fd_seq_write.close()
def outputFasta(input_file, output_file, lineLength):
    """Rewrite the records of ``input_file`` as line-wrapped FASTA.

    Args:
        input_file: Path of the input sequence file, read with the default
            ``SeqIterator`` settings.
        output_file: Path of the FASTA file to create or truncate.
        lineLength: Passed to the writer as ``line_length``.

    The number of records processed is reported on standard error.
    """
    input_iterator = SeqIterator.SeqIterator(input_file)
    output_handle = open(output_file, 'w')
    try:
        output_iterator = SeqIterator.SeqWriter(output_handle,
                                                file_type=Constants.FASTA,
                                                line_toggle=True,
                                                line_length=lineLength)
        for record in input_iterator:
            output_iterator.write(record)
        output_iterator.flush()
    finally:
        # The original never closed the handle it opened.
        output_handle.close()
    sys.stderr.write("%sNumber of records processed:\t%s\n" %
                     (logstr, str(input_iterator.records_processed())))
def __init__(self, file_name1, file_name2, file_type=Constants.FASTQ, gzip_switch=False):
    """Open one ``SeqIterator`` per input file with identical settings.

    Args:
        file_name1: Path handed to the first iterator (``SeqIterator1``).
        file_name2: Path handed to the second iterator (``SeqIterator2``).
        file_type: File type forwarded to both iterators.
        gzip_switch: Forwarded to both iterators as ``gzip_switch``.
    """
    reader_options = {"file_type": file_type, "gzip_switch": gzip_switch}
    self.SeqIterator1 = SeqIterator.SeqIterator(file_name1, **reader_options)
    self.SeqIterator2 = SeqIterator.SeqIterator(file_name2, **reader_options)
def remove_wildcards(fastq_file):
    """Copy the wildcard-free reads of a FASTQ file to standard output.

    A read is dropped when its sequence (record field 1) contains "N" or "n".

    Returns:
        A tuple ``(total, dropped)``: the number of reads seen and the number
        of reads that contained a wildcard.
    """
    reader = SeqIterator.SeqIterator(fastq_file, file_type='fastq')
    writer = SeqIterator.SeqWriter(sys.stdout, file_type='fastq')
    total = 0
    dropped = 0
    for total, read in enumerate(reader, 1):
        sequence = read[1]
        if any(wildcard in sequence for wildcard in ("N", "n")):
            dropped += 1
            continue
        writer.write(read)
    return total, dropped
def sampler(the_dir, output, num, replicates, file_type):
    """Sample sequences with replacement from all files of a directory.

    The regular files directly inside ``the_dir`` are processed in sorted
    name order and treated as one long list of sequences numbered from 1.
    For each replicate, random positions are drawn uniformly from that list,
    and the sequences at those positions are written to
    ``<output>.<replicate index>.<file_type>``.

    Args:
        the_dir: Directory containing the input sequence files.
        output: Prefix of the output file names.
        num: Number of sequences per replicate.  A value below 1 is treated
            as a fraction of the total number of sequences (rounded up).
        replicates: Number of output files to create.
        file_type: Sequence file type, also used as output file extension.

    Raises:
        ValueError: If ``num`` is negative.
    """
    if num < 0:
        raise ValueError("num must be non-negative, got %s" % str(num))
    onlyfiles = sorted(f for f in listdir(the_dir) if isfile(join(the_dir, f)))
    print("Counting the sequences...")
    sys.stdout.flush()
    counts = count_sequences(the_dir, onlyfiles, file_type)
    total_seqs = sum(counts)
    if num < 1:
        num_to_get = int(math.ceil(num * total_seqs))
    else:
        num_to_get = num
    print("From a total of %s sequences in %s files, %s sequences will be sampled uniformly at random with replacement for each of %s files." % (
        str(total_seqs), str(len(onlyfiles)), str(num_to_get),
        str(replicates)))
    sys.stdout.flush()
    # One (sequence number, replicate index) pair per draw.  The draw order
    # (replicate by replicate) is kept so seeded runs stay reproducible.
    locations = []
    for i in range(replicates):
        for _ in range(int(num_to_get)):
            locations.append((random.randint(1, total_seqs), i))
    locations.sort(key=lambda tup: tup[0])
    total_locations = len(locations)
    print("Creating %s files with prefix %s with a total of %s random sequences with replacement to be drawn." % (
        str(replicates), output, str(total_locations)))
    handles = []
    files = []
    try:
        for i in range(replicates):
            handle = open(output + "." + str(i) + "." + file_type, 'w')
            handles.append(handle)
            files.append(SeqIterator.SeqWriter(handle, file_type=file_type))
        seq_counter = 0  # 1-based number of the current sequence overall
        list_counter = 0  # index of the next unserved entry of locations
        flushed_at = 0  # value of list_counter at the last periodic flush
        finished = False
        for file_name, file_count in zip(onlyfiles, counts):
            f = join(the_dir, file_name)
            seq_g = SeqIterator.SeqIterator(f, file_type=file_type)
            print("Processing %s with %s sequences" % (f, str(file_count)))
            sys.stdout.flush()
            for seq in seq_g:
                seq_counter += 1
                # The same sequence may have been drawn several times.
                while (list_counter < total_locations and
                       seq_counter == locations[list_counter][0]):
                    files[locations[list_counter][1]].write(seq)
                    list_counter += 1
                if list_counter >= total_locations:
                    finished = True
                    break
                # Flush roughly every 10000 written sequences rather than on
                # every record while the counter sits on a multiple of 10000.
                if list_counter - flushed_at >= 10000:
                    for writer in files:
                        writer.flush()
                    flushed_at = list_counter
            if finished:
                break
        for writer in files:
            writer.flush()
    finally:
        # The original returned without flushing or closing its outputs.
        for handle in handles:
            handle.close()
def selectRecords(readsfile, begin, end, file_type):
    """Write the records whose sequence length is in ``[begin, end]`` to stdout.

    For "fastq" and "fasta" the sequence is record field 1; for any other
    file type it is looked up under the "SEQ" key.

    Returns:
        The number of records written.
    """
    reader = SeqIterator.SeqIterator(readsfile, file_type = file_type)
    writer = SeqIterator.SeqWriter(sys.stdout, file_type = file_type)
    positional_record = file_type in ('fastq', 'fasta')
    kept = 0
    for entry in reader:
        sequence = entry[1] if positional_record else entry["SEQ"]
        if begin <= len(sequence) <= end:
            writer.write(entry)
            kept += 1
    writer.flush()
    writer.close()
    return kept
def simpleExtract(FASTQ_file):
    """Print simple complexity statistics for every wildcard-free FASTQ read.

    For each read whose sequence contains no "N", one tab-separated line is
    printed with: the count-based entropy value, the ``dust`` score, and the
    first two values returned by ``longest_run`` (labelled run mean and run
    variance by the original code).  Reads containing "N" are skipped and
    their number is reported on standard error.
    """

    def entropy(alpha_vector):
        # Negated sum of count * log2(count) over the distinct symbols.
        # NOTE: raw counts are used here, not relative frequencies.
        weighted = 0
        for occurrences in Counter(alpha_vector).values():
            weighted += occurrences * math.log(occurrences, 2)
        return -1 * weighted

    reads = SeqIterator.SeqIterator(FASTQ_file, file_type='fastq')
    total = 0
    wildcards = 0
    for read in reads:
        total += 1
        sequence = read[1]
        if "N" in sequence:
            wildcards += 1
            continue
        ent = entropy(sequence)
        dust_score = dust(sequence)
        run_stats = longest_run(sequence)
        print("{}\t{}\t{}\t{}".format(ent, dust_score, run_stats[0],
                                      run_stats[1]))
    message = "There were {} / {} reads excluded because of wildcards.\n"
    sys.stderr.write(message.format(wildcards, total))
def examine_mapping(sam_file, predict_file):
    """Compute average alignment score and edit distance.

    Only records whose prediction (first value on the read's line in
    ``predict_file``) is 0 or 1 and whose FLAG is not 4 are included.  The
    edit distance ("NM:i") is additionally restricted to records with a FLAG
    below 512 that actually carry the tag.

    Args:
        sam_file: Path of the SAM file to read.
        predict_file: Optional path of a whitespace-separated file whose
            first column is the read name (single quotes are stripped) and
            whose remaining columns are the prediction values.  Reads that
            are not listed get the prediction ``[0, 0, 0]``.

    Returns:
        A tuple ``(count_as, count_ed, total_sam, avg_align_score,
        avg_edit_distance)``.  An average is 0.0 when no record contributed
        to it (the original raised ZeroDivisionError in that case).
    """
    sam_iter = SeqIterator.SeqIterator(sam_file, file_type='sam')
    default_pred = [0] * 3
    # A plain dict with .get() avoids inserting an entry for every read
    # name that is looked up, as the former defaultdict did.
    pred_dict = {}
    if predict_file:
        with open(predict_file) as predict_fd:
            for line in predict_fd:
                fields = line.split()
                if not fields:
                    continue  # tolerate blank lines
                pred_dict[fields[0].replace("'", "")] = fields[1:]
    total_sam = 0
    count_as = 0
    count_ed = 0
    total_align_score = 0.0
    total_edit_distance = 0
    for record in sam_iter:
        total_sam += 1
        pred = pred_dict.get(record["QNAME"], default_pred)
        align_score = float(record[Constants.SAM_KEY_ALIGNMENT_SCORE])
        try:
            edit_distance = int(record["NM:i"])
        except KeyError:
            edit_distance = None
        flag = int(record["FLAG"])
        if int(pred[0]) in (0, 1) and flag != 4:
            count_as += 1
            total_align_score += align_score
            # Records without an NM tag cannot contribute an edit distance;
            # adding None used to raise TypeError.
            if flag < 512 and edit_distance is not None:
                total_edit_distance += edit_distance
                count_ed += 1
    avg_align_score = total_align_score / count_as if count_as else 0.0
    # float() keeps Python 2 from truncating the mean to an integer.
    if count_ed:
        avg_edit_distance = float(total_edit_distance) / count_ed
    else:
        avg_edit_distance = 0.0
    return count_as, count_ed, total_sam, avg_align_score, avg_edit_distance
def samMatchFASTQ(sam_file, fasta_file):
    """Write the FASTQ records whose identifier occurs as a QNAME in a SAM file.

    Args:
        sam_file: Path of the SAM file supplying the read names.
        fasta_file: Path of the FASTQ file to filter (despite the name, it
            is read and written as "fastq").

    Returns:
        The number of records written to standard output.
    """
    mapped_names = set()
    for sam_record in SeqIterator.SeqIterator(sam_file, file_type='SAM'):
        mapped_names.add(sam_record["QNAME"])
    reads = SeqIterator.SeqIterator(fasta_file, file_type='fastq')
    writer = SeqIterator.SeqWriter(sys.stdout, file_type='fastq')
    matched = 0
    for read in reads:
        if read[0] not in mapped_names:
            continue
        matched += 1
        writer.write(read)
    return matched
def calculateHistogram(readsfile, file_type):
    """Print a read-length histogram with mean, median and standard deviation.

    The sorted distinct lengths, their frequencies, the average length, the
    median length and the sample standard deviation are written to standard
    output.

    Args:
        readsfile: Path of the sequence file.
        file_type: File type passed to ``SeqIterator``; the sequence is
            taken from record field 1.

    Returns:
        The number of records read.
    """
    reads_iterator = SeqIterator.SeqIterator(readsfile, file_type = file_type)
    histogram = {}
    counter = 0
    for record in reads_iterator:
        counter += 1
        record_length = len(record[1])
        histogram[record_length] = histogram.get(record_length, 0) + 1
    keys = sorted(histogram)
    items = [histogram[k] for k in keys]
    total_length = 0
    sum_of_squares = 0
    number_so_far = 0
    median = -1  # printed as -1 when there are no records
    for k in keys:
        frequency = histogram[k]
        total_length += frequency * k
        sum_of_squares += k * k * frequency
        number_so_far += frequency
        if median == -1 and number_so_far >= (counter / 2.0):
            median = k
    if counter > 1:
        # float() prevents Python 2 from flooring the variance before the
        # square root is taken.
        variance = (float(counter * sum_of_squares -
                          total_length * total_length) /
                    (counter * (counter - 1)))
        std_dev = math.sqrt(variance)
    else:
        # The sample standard deviation is undefined for fewer than two
        # records; report 0.0 instead of dividing by zero.
        std_dev = 0.0
    average = total_length / float(counter) if counter else 0.0
    sys.stdout.write(str(keys) + "\n")
    sys.stdout.write(str(items) + "\n")
    sys.stdout.write("Average:\t%s\n" % (str(average)))
    sys.stdout.write("Median:\t%s\n" % (str(median)))
    sys.stdout.write("Standard Deviation:\t%s\n" % (str(std_dev)))
    return counter
def removeN(reads, file_type=Constants.FASTQ):
    """Write the records without an "N" (either case) to standard output.

    The sequences of the first ten records are also echoed to standard
    error, and the writer is flushed after each of them.

    Returns:
        A tuple ``(records_seen, records_with_N)``.
    """
    reader = SeqIterator.SeqIterator(reads, file_type=file_type)
    writer = SeqIterator.SeqWriter(sys.stdout, file_type=file_type)
    seen = 0
    with_wildcard = 0
    for seen, read in enumerate(reader, 1):
        if 'N' in read[1].upper():
            with_wildcard += 1
        else:
            writer.write(read)
        if seen <= 10:
            # Debug echo of the first few sequences.
            sys.stderr.write(read[1] + "\n")
            sys.stderr.flush()
            writer.flush()
    writer.flush()
    return (seen, with_wildcard)
def count_sequences(my_dir, onlyfiles, file_type):
    """Count the sequences in each listed file of a directory.

    Each file name is printed before it is counted.

    Args:
        my_dir: Directory containing the files.
        onlyfiles: File names relative to ``my_dir``.
        file_type: Passed positionally as the second ``SeqIterator`` argument.

    Returns:
        A tuple with one count per entry of ``onlyfiles``, in the same order.
    """
    totals = []
    for name in onlyfiles:
        print(name)
        sys.stdout.flush()
        reader = SeqIterator.SeqIterator(join(my_dir, name), file_type)
        totals.append(reader.count())
    return tuple(totals)
def simulate(norescore_ambig_dict, rescore_ambig_dict):
    """Write one randomly chosen SAM record per eligible read to stdout.

    A read name from ``norescore_ambig_dict`` is eligible when
    ``rescore_ambig_dict`` is None or does not contain that name.  For each
    eligible read, one record is picked uniformly at random from its list.

    Args:
        norescore_ambig_dict: Mapping of read name to a list of SAM records.
        rescore_ambig_dict: Mapping of read names to exclude, or None.
    """
    writer = SeqIterator.SeqWriter(sys.stdout, file_type=Constants.SAM)
    for read_name in norescore_ambig_dict:
        if rescore_ambig_dict is not None and read_name in rescore_ambig_dict:
            continue
        candidates = norescore_ambig_dict[read_name]
        pick = random.randint(1, len(candidates))
        writer.write(candidates[pick - 1])
    writer.flush()
def extract(sam_file):
    """Group consecutive SAM records by QNAME and pass each group to ``writeTo``.

    Records whose FLAG equals ``Constants.SAM_VALUE_UNMAPPED`` are skipped.
    A group ends when a record with a different QNAME is read; the final
    group (possibly empty) is handed to ``writeTo`` after the input ends.

    Returns:
        The sum of the values returned by ``writeTo``.
    """
    reader = SeqIterator.SeqIterator(sam_file, file_type=Constants.SAM)
    writer = SeqIterator.SeqWriter(sys.stdout, file_type=Constants.SAM)
    name_key = Constants.SAM_KEY_QNAME
    group = []
    rescored_count = 0
    for alignment in reader:
        if alignment[Constants.SAM_KEY_FLAG] == Constants.SAM_VALUE_UNMAPPED:
            continue
        if group and alignment[name_key] != group[0][name_key]:
            rescored_count += writeTo(group, writer)
            group = [alignment]
            continue
        group.append(alignment)
    rescored_count += writeTo(group, writer)
    writer.flush()
    return rescored_count
def removeDuplicates(reads, file_type = Constants.FASTQ):
    """Write only the first record seen for each identifier to stdout.

    Identifiers are taken from record field 0.  The sequences of the first
    ten records are also echoed to standard error, and the writer is flushed
    after each of them.

    Returns:
        A tuple ``(records_seen, duplicates_dropped)``.
    """
    reader = SeqIterator.SeqIterator(reads, file_type = file_type)
    writer = SeqIterator.SeqWriter(sys.stdout, file_type = file_type)
    seen_ids = set()
    seen = 0
    duplicates = 0
    for seen, read in enumerate(reader, 1):
        if read[0] in seen_ids:
            duplicates += 1
        else:
            seen_ids.add(read[0])
            writer.write(read)
        if seen <= 10:
            # Debug echo of the first few sequences.
            sys.stderr.write(read[1] + "\n")
            sys.stderr.flush()
            writer.flush()
    writer.flush()
    return (seen, duplicates)
def processSAM(SAM_file, gzip_switch, paired_end):
    """Load a SAM file and tally it with the single- or paired-end handler.

    Args:
        SAM_file: Path of the SAM file.
        gzip_switch: Forwarded to ``SeqIterator`` as ``gzip_switch``.
        paired_end: When true ``processPaired`` is used, otherwise
            ``processSingle``.

    Returns:
        The ``Counts`` object filled in by the chosen handler.
    """
    reader = SeqIterator.SeqIterator(SAM_file, file_type = Constants.SAM, gzip_switch = gzip_switch)
    my_counter = Counts()
    records_by_name = reader.convertToDict("R1", "R2")
    # NOTE(review): assigned from outside the class, so no name mangling
    # applies if this is a module-level function; confirm that Counts reads
    # this exact attribute name.
    my_counter.__sam = reader.records_processed()
    handler = processPaired if paired_end else processSingle
    handler(records_by_name, my_counter)
    return my_counter
def hairpinValidate(hairpin_recovered_filename, test_filename, bound, gzip_switch):
    """Compare the alignments of a test SAM file with hairpin-recovered ones.

    Only hairpin reads with exactly one record, no "XA:Z" entry, and a FLAG
    that is neither unmapped nor filtered are considered.  Each such read is
    classified by its entry in the test file.

    Args:
        hairpin_recovered_filename: SAM file with the reference alignments.
        test_filename: SAM file with the alignments being validated.
        bound: Maximum allowed distance between the two positions.
        gzip_switch: Forwarded to the iterator of the test file only.

    Returns:
        A dict with the counters "Hairpin_Hits", "Test_NotFound",
        "Test_Ambig", "Test_UnmapFilt", "Test_Correct" and "Test_Incorrect".
    """
    counter = {
        "Hairpin_Hits": 0,
        "Test_NotFound": 0,
        "Test_Ambig": 0,
        "Test_UnmapFilt": 0,
        "Test_Correct": 0,
        "Test_Incorrect": 0
    }
    hairpin_dict = SeqIterator.SeqIterator(
        hairpin_recovered_filename,
        file_type=Constants.SAM).convertToDict()
    test_dict = SeqIterator.SeqIterator(
        test_filename, file_type=Constants.SAM,
        gzip_switch=gzip_switch).convertToDict()
    flag_key = Constants.SAM_KEY_FLAG
    pos_key = Constants.SAM_KEY_POS
    rname_key = Constants.SAM_KEY_RNAME
    for read_name in hairpin_dict:
        hairpin_records = hairpin_dict[read_name]
        if len(hairpin_records) != 1 or "XA:Z" in hairpin_records[0]:
            continue
        hairpin_record = hairpin_records[0]
        hairpin_flag = hairpin_record[flag_key]
        if isUnmapped(hairpin_flag) or isFiltered(hairpin_flag):
            continue
        counter["Hairpin_Hits"] += 1
        hairpin_pos = int(hairpin_record[pos_key])
        test_records = test_dict.get(read_name, None)
        if test_records is None:
            counter["Test_NotFound"] += 1
            continue
        if len(test_records) != 1 or "XA:Z" in test_records[0]:
            counter["Test_Ambig"] += 1
            continue
        test_record = test_records[0]
        test_pos = int(test_record[pos_key])
        test_flag = test_record[flag_key]
        if isUnmapped(test_flag) or isFiltered(test_flag):
            outcome = "Test_UnmapFilt"
        elif (test_record[rname_key] == hairpin_record[rname_key] and
              hairpin_pos - bound <= test_pos <= hairpin_pos + bound):
            outcome = "Test_Correct"
        else:
            outcome = "Test_Incorrect"
        counter[outcome] += 1
    return counter
def findASDistribution(sam_file, bucket_count=bucket_count_default):
    """Summarise the alignment scores of the mapped records in a SAM file.

    Records whose RNAME starts with ``Constants.SAM_VALUE_STAR`` are counted
    as having no hit and are otherwise ignored.  The remaining alignment
    scores are placed into ``bucket_count`` equal-width buckets between the
    minimum and maximum score (the maximum itself lands in one extra, final
    bucket).

    Args:
        sam_file: Path of the SAM file.
        bucket_count: Number of equal-width buckets.

    Returns:
        A tuple ``(freq, score_bounds_list, total_records, as_max, as_min,
        buckets, avg_score, median_score, no_hits)``.  ``median_score`` is
        the element at index ``total_records // 2`` of the sorted scores.
        With no mapped records the frequencies and buckets are all zero,
        the bounds list is empty, the average is 0.0, the median is 0, and
        ``as_max``/``as_min`` keep their initial sentinel values.
    """
    sam_input = SeqIterator.SeqIterator(sam_file, file_type=Constants.SAM)
    scores = []
    no_hits = 0
    for record in sam_input:
        # Check the mapping status first so that an unmapped record lacking
        # an alignment score is skipped instead of being parsed.
        if record[Constants.SAM_KEY_RNAME].startswith(
                Constants.SAM_VALUE_STAR):
            no_hits += 1
            continue
        scores.append(float(record[Constants.SAM_KEY_ALIGNMENT_SCORE]))
    total_records = len(scores)
    buckets = [0] * (bucket_count + 1)
    if total_records == 0:
        # Nothing to summarise; the original divided by zero here.
        return ([0.0] * len(buckets), [], 0, -sys.maxint - 1, sys.maxint,
                buckets, 0.0, 0, no_hits)
    as_max = max(scores)
    as_min = min(scores)
    bucket_length = (as_max - as_min) / bucket_count
    score_bounds = as_min
    score_bounds_list = []
    for _ in range(bucket_count):
        score_bounds += bucket_length
        score_bounds_list.append(score_bounds)
    for alignment_score in scores:
        if bucket_length == 0:
            # All scores are identical: a single bucket holds everything.
            bucket_index = 0
        else:
            bucket_index = int(
                math.floor((alignment_score - as_min) / bucket_length))
        buckets[bucket_index] += 1
    avg_score = sum(scores) / (total_records + 0.0)
    # The median must come from the sorted scores; the previous two-pass
    # version reported the score at the middle position of the file.
    median_score = sorted(scores)[int(total_records / 2)]
    freq = [count / (total_records + 0.0) for count in buckets]
    return (freq, score_bounds_list, total_records, as_max, as_min, buckets,
            avg_score, median_score, no_hits)
def processSAM(SAM_file, gzip_switch, bound, paired_end):
    """Score simulated-read alignments against the position in the read name.

    The QNAME is split on "_", ":" and "-"; field 1 is compared with the
    reference name and field 2 is used as the true position.  An alignment
    counts as correct when the reference matches and the reported position
    is within ``bound`` of the true position.  In paired-end mode either
    mate may satisfy this.

    The first ten record lists are echoed to standard error.

    Args:
        SAM_file: Path of the SAM file.
        gzip_switch: Forwarded to ``SeqIterator`` as ``gzip_switch``.
        bound: Allowed distance between reported and true position.
        paired_end: Whether each read name is expected to have two records.

    Returns:
        A dict with the counters "Total_SAM_Records", "Records_Analyzed",
        "Correct", "Incorrect", "Unmap/Filter" and "Improper".
    """
    sam_iterator = SeqIterator.SeqIterator(SAM_file, file_type = Constants.SAM, gzip_switch = gzip_switch)
    counter_dict = {"Total_SAM_Records" : 0, "Records_Analyzed" : 0, "Correct" : 0, "Incorrect" : 0, "Unmap/Filter" : 0, "Improper" : 0}
    record_dict = sam_iterator.convertToDict("R1", "R2")
    counter_dict["Total_SAM_Records"] = sam_iterator.records_processed()
    flag_key = Constants.SAM_KEY_FLAG
    for position, key in enumerate(record_dict.keys()):
        alignments = record_dict[key]
        if position < 10:
            sys.stderr.write("%s\n" % str(alignments))
        # Single-end: skip reads with several records or alternative hits.
        if not paired_end and (len(alignments) > 1 or "XA:Z" in alignments[0]):
            continue
        first_flag = alignments[0][flag_key]
        if isUnmapped(first_flag) or isFiltered(first_flag):
            counter_dict["Unmap/Filter"] += 1
            continue
        if paired_end:
            if len(alignments) != 2:
                continue
            if "XA:Z" in alignments[0] or "XA:Z" in alignments[1]:
                continue
            if not (isProper(first_flag) and isProper(alignments[1][flag_key])):
                counter_dict["Improper"] += 1
                continue
        counter_dict["Records_Analyzed"] += 1
        primary = alignments[0]
        name_fields = re.split('_|:|-', primary[Constants.SAM_KEY_QNAME])
        true_pos = int(name_fields[2])
        candidates = [(primary[Constants.SAM_KEY_RNAME],
                       int(primary[Constants.SAM_KEY_POS]))]
        lowest = true_pos - bound
        highest = true_pos + bound
        if paired_end:
            mate = alignments[1]
            candidates.append((mate[Constants.SAM_KEY_RNAME],
                               int(mate[Constants.SAM_KEY_POS])))
        is_correct = any(
            name_fields[1] == reference and lowest <= reported <= highest
            for reference, reported in candidates)
        counter_dict["Correct" if is_correct else "Incorrect"] += 1
    return counter_dict
def getSeqCount(file_str, t, length):
    """Count the sequences of a file, optionally requiring a minimum length.

    Args:
        file_str: Path of the sequence file.
        t: File type passed to ``SeqIterator``.
        length: Minimum sequence length; a negative value counts every
            record using the iterator's own ``count`` method.

    Returns:
        The number of (sufficiently long) sequences.
    """
    reader = SeqIterator.SeqIterator(file_str, file_type=t)
    if length < 0:
        return reader.count()
    return sum(1 for entry in reader if len(entry[1]) >= length)
def findASDistribution(sam_file, use_edit_distance, bucket_count=bucket_count_default):
    """Summarise length-normalised scores of the mapped records in a SAM file.

    Records whose RNAME starts with ``Constants.SAM_VALUE_STAR`` or whose
    FLAG is unmapped are counted as having no hit.  For the others the score
    is the alignment score (or, with ``use_edit_distance``, the value under
    ``Constants.SAM_KEY_DISTANCE``) divided by the sequence length.

    Args:
        sam_file: Path of the SAM file.
        use_edit_distance: Use the edit distance instead of the alignment
            score.
        bucket_count: Number of equal-width buckets; a value of 0 or less
            returns the sorted raw scores instead of a histogram.

    Returns:
        With ``bucket_count > 0``: ``(freq, score_bounds_list,
        total_records, as_max, as_min, buckets, avg_score, median_score,
        no_hits)``.  Otherwise: ``(scores, "", total_records, as_max,
        as_min, "", avg_score, median_score, no_hits)``.  With no mapped
        records the average and median are 0.0, the histogram is all zero
        with an empty bounds list, and ``as_max``/``as_min`` keep their
        initial sentinel values.
    """
    sam_input = SeqIterator.SeqIterator(sam_file, file_type=Constants.SAM)
    if use_edit_distance:
        score_key = Constants.SAM_KEY_DISTANCE
    else:
        score_key = Constants.SAM_KEY_ALIGNMENT_SCORE
    as_max = -sys.maxint - 1
    as_min = sys.maxint
    no_hits = 0
    scores = []
    for record in sam_input:
        if record[Constants.SAM_KEY_RNAME].startswith(
                Constants.SAM_VALUE_STAR) or isUnmapped(
                    record[Constants.SAM_KEY_FLAG]):
            no_hits += 1
            continue
        scores.append(
            float(record[score_key]) /
            float(len(record[Constants.SAM_KEY_SEQ])))
    total_records = len(scores)
    if total_records == 0:
        # The original indexed into the empty score list and divided by
        # zero; return an explicit empty summary instead.
        if bucket_count > 0:
            empty = [0] * (bucket_count + 1)
            return ([0.0] * len(empty), [], 0, as_max, as_min, empty, 0.0,
                    0.0, no_hits)
        return (scores, "", 0, as_max, as_min, "", 0.0, 0.0, no_hits)
    scores.sort()
    as_min = scores[0]
    as_max = scores[-1]
    avg_score = sum(scores) / (total_records + 0.0)
    median_score = scores[int(total_records / 2)]
    if bucket_count <= 0:
        return (scores, "", total_records, as_max, as_min, "", avg_score,
                median_score, no_hits)
    buckets = [0] * (bucket_count + 1)
    bucket_length = (as_max - as_min) / bucket_count
    score_bounds = as_min
    score_bounds_list = []
    for _ in range(bucket_count):
        score_bounds += bucket_length
        score_bounds_list.append(score_bounds)
    for alignment_score in scores:
        if bucket_length == 0:
            # All scores are identical: a single bucket holds everything.
            bucket_index = 0
        else:
            bucket_index = int(
                math.floor((alignment_score - as_min) / bucket_length))
        buckets[bucket_index] += 1
    freq = [count / (total_records + 0.0) for count in buckets]
    return (freq, score_bounds_list, total_records, as_max, as_min, buckets,
            avg_score, median_score, no_hits)
def extract(sam_file):
    """Group consecutive SAM records by QNAME and collect them via ``writeTo``.

    Records whose FLAG equals ``Constants.SAM_VALUE_UNMAPPED`` are skipped.
    Each finished group, and finally the last (possibly empty) group, is
    passed to ``writeTo`` together with the result dictionary.

    Returns:
        The dictionary populated by ``writeTo``.
    """
    reader = SeqIterator.SeqIterator(sam_file, file_type=Constants.SAM)
    name_key = Constants.SAM_KEY_QNAME
    ambig_dictionary = {}
    group = []
    for alignment in reader:
        if alignment[Constants.SAM_KEY_FLAG] == Constants.SAM_VALUE_UNMAPPED:
            continue
        if group and alignment[name_key] != group[0][name_key]:
            writeTo(group, ambig_dictionary)
            group = [alignment]
            continue
        group.append(alignment)
    writeTo(group, ambig_dictionary)
    return ambig_dictionary
def processSAM(SAM_file, gzip_switch, paired_end, filter_value):
    """Load a single-end, uncompressed SAM file and tally it with a filter.

    Args:
        SAM_file: Path of the SAM file.
        gzip_switch: Must be false; compressed input is not implemented.
        paired_end: Must be false; paired-end input is not implemented.
        filter_value: Threshold converted to float and passed to
            ``processSingle``.

    Returns:
        The ``Counts`` object filled in by ``processSingle``.

    Raises:
        NotImplementedError: If ``paired_end`` or ``gzip_switch`` is true.
    """
    for unsupported_option in (paired_end, gzip_switch):
        if unsupported_option:
            raise NotImplementedError
    threshold = float(filter_value)
    reader = SeqIterator.SeqIterator(SAM_file,
                                     file_type=Constants.SAM,
                                     gzip_switch=gzip_switch)
    my_counter = Counts()
    records_by_name = reader.convertToDict("R1", "R2")
    # NOTE(review): assigned from outside the class, so no name mangling
    # applies if this is a module-level function; confirm that Counts reads
    # this exact attribute name.
    my_counter.__sam = reader.records_processed()
    # paired_end is always false here, so only the single-end path remains.
    processSingle(records_by_name, my_counter, threshold)
    return my_counter
def extractFeatures(FASTQ_file):
    """Print a feature header and one feature row per usable FASTQ record.

    The header is the list of 28 whole-read labels followed by 15 labels for
    each of three partitions (suffixes "_1", "_2" and "_3").  A row is
    printed for every record for which ``extractFeatureRow`` returns a
    truthy value; the number of printed rows and the total number of
    records are reported on standard error.
    """
    reads = SeqIterator.SeqIterator(FASTQ_file, file_type='fastq')
    # 28 features including the seq_id
    feature_labels = [
        "seq_id", "entropy", "dust", "bz2", "lzma", "length", "freq_A",
        "freq_C", "freq_G", "freq_T", "run_mean", "run_variance",
        "run_max_length", "dkg_2", "rkg_2", "dkg_3", "rkg_3", "dkg_4",
        "rkg_4", "dkg_5", "rkg_5", "qual_mean", "qual_max", "qual_min",
        "qual_variance", "qual_skewness", "qual_kurt", "qual_mean_diff"
    ]
    # 15 features for each partition
    partition_features = [
        "entropy", "dust", "bz2", "lzma", "freq_A", "freq_C", "freq_G",
        "freq_T", "qual_mean", "qual_max", "qual_min", "qual_variance",
        "qual_skewness", "qual_kurt", "qual_mean_diff"
    ]
    header = list(feature_labels)
    for partition in (1, 2, 3):
        header.extend("%s_%d" % (label, partition)
                      for label in partition_features)
    print(header)
    usable = 0
    total = 0
    for read in reads:
        row = extractFeatureRow(read)
        total += 1
        if not row:
            continue
        print(row)
        usable += 1
    sys.stderr.write("The number without Ns, and the total "
                     "in the file:\t%d\t%d\n" % (usable, total))
def processSAM(SAM_file, gzip_switch, bound, paired_end, dwgsim, print_value=10):
    """Score simulated-read alignments against positions encoded in the QNAME.

    The QNAME (with "@" removed) is split on "_", ":" and "-".  For DWGSIM
    names the fields are used as they are; otherwise the first field is
    dropped.  After that, field 0 is compared with the reference name and
    fields 1 and 2 are read as two true positions.

    Args:
        SAM_file: Path of the SAM file.
        gzip_switch: Forwarded to ``SeqIterator`` as ``gzip_switch``.
        bound: Allowed distance between a reported and a true position.
        paired_end: Whether each read name is expected to have two records.
        dwgsim: Whether read names follow the DWGSIM layout; read names
            starting with "rand_" are then skipped.
        print_value: Number of record lists echoed to standard error.

    Returns:
        A dict with the counters "Total_SAM_Records", "Records_Analyzed",
        "Correct", "Incorrect", "Unmap/Filter" and "Improper".
    """
    sam_iterator = SeqIterator.SeqIterator(SAM_file,
                                           file_type=Constants.SAM,
                                           gzip_switch=gzip_switch)
    counter_dict = {
        "Total_SAM_Records": 0,
        "Records_Analyzed": 0,
        "Correct": 0,
        "Incorrect": 0,
        "Unmap/Filter": 0,
        "Improper": 0
    }
    record_dict = sam_iterator.convertToDict("R1", "R2")
    counter_dict["Total_SAM_Records"] = sam_iterator.records_processed()
    counter = 0
    for key in record_dict.keys():
        # Echo the first print_value record lists for debugging.
        if counter < print_value:
            sys.stderr.write("%s\n" % str(record_dict[key]))
        counter += 1
        if dwgsim and key.startswith("rand_"):
            continue
        # Single-end: skip reads with several records or an "XA:Z" entry.
        if not paired_end and ((len(record_dict[key]) > 1
                                or "XA:Z" in record_dict[key][0])):
            continue
        if isUnmapped(
                record_dict[key][0][Constants.SAM_KEY_FLAG]) or isFiltered(
                    record_dict[key][0][Constants.SAM_KEY_FLAG]):
            counter_dict["Unmap/Filter"] += 1
            continue
        # Paired-end: require exactly two records, no "XA:Z" entry in
        # either of them, and both flagged as proper.
        if paired_end and (len(record_dict[key])) != 2:
            continue
        if paired_end and (("XA:Z" in record_dict[key][0]
                            or "XA:Z" in record_dict[key][1])):
            continue
        if paired_end and (
                not isProper(record_dict[key][0][Constants.SAM_KEY_FLAG])
                or
                not isProper(record_dict[key][1][Constants.SAM_KEY_FLAG])):
            counter_dict["Improper"] += 1
            continue
        counter_dict["Records_Analyzed"] += 1
        SAM_record = record_dict[key][0]
        QNAME = re.split('_|:|-',
                         SAM_record[Constants.SAM_KEY_QNAME].replace('@', ''))
        if not dwgsim:
            # Drop the leading field so the indices below line up with the
            # DWGSIM layout.
            QNAME = QNAME[1:len(QNAME)]
        REAL_POS = int(QNAME[1])  # For DWGSIM, the real position is at index 1
        REAL_POS2 = int(QNAME[2])
        RNAME = SAM_record[Constants.SAM_KEY_RNAME]
        TEST_POS = int(SAM_record[Constants.SAM_KEY_POS])
        # Last position covered by the read sequence (single-end use).
        TEST_POS2 = TEST_POS + len(SAM_record[Constants.SAM_KEY_SEQ]) - 1
        REAL_POS_u = REAL_POS + bound
        REAL_POS_l = REAL_POS - bound
        REAL_POS2_u = REAL_POS2 + bound
        REAL_POS2_l = REAL_POS2 - bound
        if paired_end:
            SAM_record2 = record_dict[key][1]
            RNAME2 = SAM_record2[Constants.SAM_KEY_RNAME]
            # In paired-end mode TEST_POS2 is replaced by the position of
            # the second record.
            TEST_POS2 = int(SAM_record2[Constants.SAM_KEY_POS])
        # Single-end: the reference must match and either the start lies
        # near the first true position, or (non-DWGSIM) the read end or
        # (DWGSIM) the start lies near the second true position.
        if not paired_end and QNAME[0] == RNAME and (
            (not dwgsim and
             ((TEST_POS <= REAL_POS_u and TEST_POS >= REAL_POS_l) or
              (TEST_POS2 <= REAL_POS2_u and TEST_POS2 >= REAL_POS2_l))) or
            (dwgsim and
             ((TEST_POS <= REAL_POS_u and TEST_POS >= REAL_POS_l) or
              (TEST_POS <= REAL_POS2_u and TEST_POS >= REAL_POS2_l)))):
            counter_dict["Correct"] += 1
        elif not paired_end:
            counter_dict["Incorrect"] += 1
        # Paired-end: either record may match the reference and lie near
        # the first true position; the second true position is not used.
        if paired_end and (
            (QNAME[0] == RNAME and TEST_POS <= REAL_POS_u
             and TEST_POS >= REAL_POS_l) or
            (QNAME[0] == RNAME2 and TEST_POS2 <= REAL_POS_u
             and TEST_POS2 >= REAL_POS_l)):
            counter_dict["Correct"] += 1
        elif paired_end:
            counter_dict["Incorrect"] += 1
    return counter_dict
def extractRef(in_file, match_string, out_file_d):
    """Copy the FASTA records whose identifier contains ``match_string``.

    Args:
        in_file: Path of the input FASTA file.
        match_string: Substring searched for in record field 0.
        out_file_d: Output destination handed to ``SeqWriter``.
    """
    reader = SeqIterator.SeqIterator(in_file, file_type=Constants.FASTA)
    writer = SeqIterator.SeqWriter(out_file_d, file_type=Constants.FASTA)
    for entry in reader:
        if match_string not in entry[0]:
            continue
        writer.write(entry)