Пример #1
0
def getRandomSequences(rands, in_f, out_f, t, length, amount, seq_lst,
                       seq_count):
    """Cut fixed-length windows out of the input at pre-drawn positions.

    Each record whose sequence has at least ``length`` characters contributes
    ``len(sequence) - length + 1`` window start positions to a running global
    count.  Every entry of ``rands`` is a position in that global numbering;
    when it falls inside the current record the window
    ``[b, b + length)`` is sliced out (sequence, plus qualities for FASTQ)
    and written to ``out_f`` under an ID built by ``getID``.

    Args:
        rands: Indexable collection of global window positions.  The scan only
            moves forward, so it is presumably expected in ascending order --
            TODO confirm against the caller.
        in_f: Path of the input sequence file.
        out_f: Path of the output file (opened for writing, so overwritten).
        t: File type, either ``"fastq"`` or ``"fasta"``.
        length: Length of each extracted window.
        amount: Unused; kept for interface compatibility.
        seq_lst: Unused; kept for interface compatibility.
        seq_count: Unused; kept for interface compatibility.

    Raises:
        ValueError: If a window must be written and ``t`` is neither
            ``"fastq"`` nor ``"fasta"``.
    """
    my_nt_count = 0
    my_rand_count = 0
    seqs_gen = SeqIterator.SeqIterator(in_f, file_type=t)
    # The context manager guarantees the output handle is closed on every
    # exit path, including the early return once all positions are used.
    with open(out_f, 'w') as fd_seq_write:
        seq_writer = SeqIterator.SeqWriter(fd_seq_write, file_type=t)
        try:
            # Nothing to extract: avoid indexing into an empty collection.
            if len(rands) == 0:
                return
            for rec in seqs_gen:
                seq_len = len(rec[1])
                if seq_len < length:
                    continue
                window_count = seq_len - length + 1
                # The next requested position lies beyond this record.
                if my_nt_count + window_count < rands[my_rand_count]:
                    my_nt_count += window_count
                    continue
                num = 0
                while my_nt_count + window_count - 1 >= rands[my_rand_count]:
                    b = rands[my_rand_count] - my_nt_count
                    e = b + length
                    if t == "fastq":
                        nextSeq = (getID(rec[0], num, b, e), rec[1][b:e],
                                   rec[2][b:e])
                    elif t == "fasta":
                        nextSeq = (getID(rec[0], num, b, e), rec[1][b:e])
                    else:
                        raise ValueError("unsupported file type: %r" % (t, ))
                    seq_writer.write(nextSeq)
                    my_rand_count += 1
                    num += 1
                    if my_rand_count >= len(rands):
                        return
                my_nt_count += window_count
        finally:
            # Push anything the writer still holds before the file closes.
            seq_writer.flush()
Пример #2
0
def outputFasta(input_file, output_file, lineLength):
    """Rewrite the records of ``input_file`` as line-wrapped FASTA.

    Args:
        input_file: Path handed to ``SeqIterator.SeqIterator`` with its
            default file type.
        output_file: Path of the FASTA file to create (overwritten).
        lineLength: Value passed to the writer as ``line_length``.

    The number of records processed is reported on stderr, prefixed with the
    module-level ``logstr``.
    """
    input_iterator = SeqIterator.SeqIterator(input_file)
    # Own the file handle explicitly so it is closed even if a write fails.
    with open(output_file, 'w') as output_fd:
        output_iterator = SeqIterator.SeqWriter(output_fd,
                                                file_type=Constants.FASTA,
                                                line_toggle=True,
                                                line_length=lineLength)
        for record in input_iterator:
            output_iterator.write(record)
        output_iterator.flush()
    sys.stderr.write("%sNumber of records processed:\t%s\n" %
                     (logstr, str(input_iterator.records_processed())))
Пример #3
0
 def __init__(self,
              file_name1,
              file_name2,
              file_type=Constants.FASTQ,
              gzip_switch=False):
     """Open one SeqIterator per input file, both with the same settings.

     Args:
         file_name1: Path of the first input file.
         file_name2: Path of the second input file.
         file_type: File type forwarded to both iterators.
         gzip_switch: Gzip flag forwarded to both iterators.
     """
     shared_options = {"file_type": file_type, "gzip_switch": gzip_switch}
     self.SeqIterator1 = SeqIterator.SeqIterator(file_name1, **shared_options)
     self.SeqIterator2 = SeqIterator.SeqIterator(file_name2, **shared_options)
Пример #4
0
def remove_wildcards(fastq_file):
    """Copy the reads of a FASTQ file to stdout, dropping reads that contain N.

    A read is dropped when its sequence contains ``N`` or ``n``.

    Returns:
        A ``(total_reads, wildcard_reads)`` tuple: the number of reads seen
        and the number dropped.
    """
    reader = SeqIterator.SeqIterator(fastq_file, file_type='fastq')
    writer = SeqIterator.SeqWriter(sys.stdout, file_type='fastq')
    total_reads = 0
    wildcard_reads = 0
    for read in reader:
        total_reads += 1
        sequence = read[1]
        if "N" not in sequence and "n" not in sequence:
            writer.write(read)
            continue
        wildcard_reads += 1
    return total_reads, wildcard_reads
Пример #5
0
def sampler(the_dir, output, num, replicates, file_type):
    """Draw random sequences (with replacement) from all files in a directory.

    The regular files in ``the_dir`` are treated, in sorted name order, as one
    long list of sequences.  For every replicate, sequence numbers are drawn
    uniformly at random with replacement; the chosen sequences are written to
    ``<output>.<replicate index>.<file_type>``.

    Args:
        the_dir: Directory holding the input sequence files.
        output: Prefix of the replicate output files.
        num: Number of sequences per replicate.  A value below 1 is read as a
            fraction of the total number of sequences (rounded up).
        replicates: Number of output files to create.
        file_type: File type of the inputs and outputs; also used as the
            output file extension.

    Raises:
        ValueError: If ``num`` is negative.
    """
    if num < 0:
        raise ValueError("num must be non-negative, got %s" % str(num))
    onlyfiles = sorted(f for f in listdir(the_dir) if isfile(join(the_dir, f)))
    print("Counting the sequences...")
    sys.stdout.flush()
    counts = count_sequences(the_dir, onlyfiles, file_type)
    total_seqs = sum(counts)
    if num < 1:
        num_to_get = int(math.ceil(num * total_seqs))
    else:
        num_to_get = num
    print("From a total of %s sequences in %s files, %s sequences will be sampled uniformly at random with replacement for each of %s files." % (
        str(total_seqs), str(len(onlyfiles)), str(num_to_get), str(replicates)))
    sys.stdout.flush()
    # Each entry is (1-based global sequence number, replicate index).
    locations = [(random.randint(1, total_seqs), i)
                 for i in range(replicates)
                 for _ in range(int(num_to_get))]
    locations.sort(key=lambda tup: tup[0])
    print("Creating %s files with prefix %s with a total of %s random sequences with replacement to be drawn." % (
        str(replicates), output, str(len(locations))))
    handles = []
    writers = []
    try:
        for i in range(replicates):
            handle = open(output + "." + str(i) + "." + file_type, 'w')
            handles.append(handle)
            writers.append(SeqIterator.SeqWriter(handle, file_type=file_type))
        seq_counter = 0
        list_counter = 0
        flushed_at = 0
        for file_name, file_count in zip(onlyfiles, counts):
            f = join(the_dir, file_name)
            seq_g = SeqIterator.SeqIterator(f, file_type=file_type)
            print("Processing %s with %s sequences" % (f, str(file_count)))
            sys.stdout.flush()
            for seq in seq_g:
                seq_counter += 1
                # The same sequence number may have been drawn several times.
                while list_counter < len(
                        locations
                ) and seq_counter == locations[list_counter][0]:
                    writers[locations[list_counter][1]].write(seq)
                    list_counter += 1
                if list_counter >= len(locations):
                    return
                # Flush roughly every 10000 written sequences rather than on
                # every input sequence while the counter sits on a multiple.
                if list_counter - flushed_at >= 10000:
                    for writer in writers:
                        writer.flush()
                    flushed_at = list_counter
    finally:
        # Runs on the early return too, so no buffered output is lost and
        # every replicate file is closed.
        try:
            for writer in writers:
                writer.flush()
        finally:
            for handle in handles:
                handle.close()
Пример #6
0
def selectRecords(readsfile, begin, end, file_type):
    """Write to stdout the records whose sequence length is in [begin, end].

    For ``'fastq'`` and ``'fasta'`` the sequence is read from index 1 of the
    record; for any other file type it is read from the ``"SEQ"`` key.

    Returns:
        The number of records written.
    """
    reader = SeqIterator.SeqIterator(readsfile, file_type=file_type)
    writer = SeqIterator.SeqWriter(sys.stdout, file_type=file_type)
    tuple_formats = ('fastq', 'fasta')
    selected = 0
    for record in reader:
        if file_type in tuple_formats:
            sequence = record[1]
        else:
            sequence = record["SEQ"]
        if not (begin <= len(sequence) <= end):
            continue
        writer.write(record)
        selected += 1
    writer.flush()
    writer.close()
    return selected
Пример #7
0
def simpleExtract(FASTQ_file):
    """Print per-read complexity statistics for a FASTQ file.

    For every read without an ``N`` in its sequence, one tab-separated line is
    printed with: the entropy-style score, the ``dust`` score, and the first
    two values returned by ``longest_run`` (used as run mean and run
    variance).  Reads containing ``N`` are skipped; their count is reported on
    stderr at the end.
    """

    def entropy(alpha_vector):
        # NOTE(review): this uses raw symbol counts rather than relative
        # frequencies, so it is not the normalized Shannon entropy -- confirm
        # this is intended before relying on the absolute values.
        symbol_counts = Counter(alpha_vector)
        weighted = [
            occurrences * math.log(occurrences, 2)
            for occurrences in symbol_counts.values()
        ]
        return -1 * sum(weighted)

    reader = SeqIterator.SeqIterator(FASTQ_file, file_type='fastq')
    reads_seen = 0
    reads_with_n = 0
    for read in reader:
        reads_seen += 1
        sequence = read[1]
        if "N" in sequence:
            reads_with_n += 1
            continue
        entropy_score = entropy(sequence)
        dust_value = dust(sequence)
        runs = longest_run(sequence)
        print("{}\t{}\t{}\t{}".format(entropy_score, dust_value, runs[0],
                                      runs[1]))
    summary = "There were {} / {} reads excluded because of wildcards.\n"
    sys.stderr.write(summary.format(reads_with_n, reads_seen))
Пример #8
0
def examine_mapping(sam_file, predict_file):
    """Compute average alignment score and edit distance.

    Only records whose prediction (first column after the read name in
    ``predict_file``) is 0 or 1 and whose FLAG is not 4 are counted.  Reads
    missing from the prediction file are treated as prediction 0.

    Args:
        sam_file: Path of the SAM file to scan.
        predict_file: Optional path of a whitespace-separated prediction file
            whose first column is the read name (single quotes are stripped).
            A falsy value means no predictions are loaded.

    Returns:
        A tuple ``(count_as, count_ed, total_sam, mean_alignment_score,
        mean_edit_distance)``.  ``count_ed`` counts only records with FLAG
        below 512 that carry an ``NM:i`` value.  Each mean is ``0.0`` when its
        count is zero.
    """
    sam_iter = SeqIterator.SeqIterator(sam_file, file_type='sam')
    total_sam = 0
    count_as = 0
    count_ed = 0
    total_align_score = 0.0
    total_edit_distance = 0
    default_pred = [0] * 3
    # A plain dict with .get() avoids inserting an entry for every read that
    # has no prediction.
    pred_dict = {}
    if predict_file:
        with open(predict_file) as predict_fd:
            for line in predict_fd:
                fields = line.split()
                if not fields:
                    continue  # tolerate blank lines
                pred_dict[fields[0].replace("'", "")] = fields[1:]
    for record in sam_iter:
        total_sam += 1
        pred = pred_dict.get(record["QNAME"], default_pred)
        flag = int(record["FLAG"])
        if int(pred[0]) not in (0, 1) or flag == 4:
            continue
        count_as += 1
        total_align_score += float(record[Constants.SAM_KEY_ALIGNMENT_SCORE])
        if flag < 512:
            try:
                edit_distance = int(record["NM:i"])
            except KeyError:
                # No edit distance on this record: leave it out of the mean
                # instead of failing on None arithmetic.
                continue
            total_edit_distance += edit_distance
            count_ed += 1
    mean_align_score = total_align_score / count_as if count_as else 0.0
    # float() keeps the mean from being truncated by integer division.
    if count_ed:
        mean_edit_distance = float(total_edit_distance) / count_ed
    else:
        mean_edit_distance = 0.0
    return count_as, count_ed, total_sam, mean_align_score, mean_edit_distance
Пример #9
0
def samMatchFASTQ(sam_file, fasta_file):
    """Write to stdout the FASTQ records whose ID appears as a SAM QNAME.

    Args:
        sam_file: Path of the SAM file supplying the read names.
        fasta_file: Path of the reads file; despite the parameter name it is
            read and written as FASTQ.

    Returns:
        The number of records written.
    """
    mapped_names = {
        sam_record["QNAME"]
        for sam_record in SeqIterator.SeqIterator(sam_file, file_type='SAM')
    }
    reads = SeqIterator.SeqIterator(fasta_file, file_type='fastq')
    writer = SeqIterator.SeqWriter(sys.stdout, file_type='fastq')
    matched = 0
    for read in reads:
        if read[0] not in mapped_names:
            continue
        matched += 1
        writer.write(read)
    return matched
Пример #10
0
def calculateHistogram(readsfile, file_type):
    """Print a read-length histogram with mean, median and standard deviation.

    Writes to stdout, one item per line: the sorted list of observed lengths,
    the matching list of counts, the average length, the median length and the
    sample standard deviation (n - 1 denominator).

    Args:
        readsfile: Path of the reads file.
        file_type: File type handed to ``SeqIterator.SeqIterator``; records
            are indexed with ``record[1]`` to get the sequence.

    Returns:
        The number of records read.  With no records the average and standard
        deviation are reported as ``0.0`` and the median as ``-1``; with one
        record the standard deviation is ``0.0``.
    """
    reads_iterator = SeqIterator.SeqIterator(readsfile, file_type=file_type)
    histogram = {}
    counter = 0
    for record in reads_iterator:
        counter += 1
        record_length = len(record[1])
        histogram[record_length] = histogram.get(record_length, 0) + 1
    keys = sorted(histogram)
    items = [histogram[k] for k in keys]
    total_length = 0
    sum_of_squares = 0
    median = -1
    number_so_far = 0
    for k in keys:
        total_length += histogram[k] * k
        sum_of_squares += k * k * histogram[k]
        number_so_far += histogram[k]
        # The first length at which half of the reads have been seen.
        if median == -1 and number_so_far >= (counter / 2.0):
            median = k
    if counter > 1:
        # float() prevents integer division from truncating the variance.
        variance = float(counter * sum_of_squares - total_length *
                         total_length) / (counter * (counter - 1))
        std_dev = math.sqrt(variance)
    else:
        std_dev = 0.0
    average = total_length / float(counter) if counter else 0.0
    sys.stdout.write(str(keys) + "\n")
    sys.stdout.write(str(items) + "\n")
    sys.stdout.write("Average:\t%s\n" % (str(average)))
    sys.stdout.write("Median:\t%s\n" % (str(median)))
    sys.stdout.write("Standard Deviation:\t%s\n" % (str(std_dev)))
    return counter
Пример #11
0
def removeN(reads, file_type=Constants.FASTQ):
    """Copy reads to stdout, dropping any whose sequence contains N or n.

    While the running record count is at most 10, each kept sequence is also
    echoed to stderr and the writer is flushed immediately.

    Returns:
        A ``(records_seen, records_with_n)`` tuple.
    """
    reader = SeqIterator.SeqIterator(reads, file_type=file_type)
    writer = SeqIterator.SeqWriter(sys.stdout, file_type=file_type)
    records_seen = 0
    records_with_n = 0
    for record in reader:
        records_seen += 1
        if 'N' in record[1].upper():
            records_with_n += 1
            continue
        writer.write(record)
        if records_seen > 10:
            continue
        sys.stderr.write(record[1] + "\n")
        sys.stderr.flush()
        writer.flush()
    writer.flush()
    return (records_seen, records_with_n)
Пример #12
0
def count_sequences(my_dir, onlyfiles, file_type):
    """Count the sequences in each listed file of a directory.

    Each file name is printed (and stdout flushed) before it is counted.

    Returns:
        A tuple of counts in the same order as ``onlyfiles``.
    """
    totals = []
    for file_name in onlyfiles:
        print(file_name)
        sys.stdout.flush()
        reader = SeqIterator.SeqIterator(join(my_dir, file_name), file_type)
        totals.append(reader.count())
    return tuple(totals)
Пример #13
0
def simulate(norescore_ambig_dict, rescore_ambig_dict):
    """Write one randomly chosen SAM record per read name to stdout.

    Read names that also appear in ``rescore_ambig_dict`` are skipped; pass
    ``None`` to skip nothing.  For every remaining name, one record is picked
    uniformly at random from its list in ``norescore_ambig_dict``.
    """
    writer = SeqIterator.SeqWriter(sys.stdout, file_type=Constants.SAM)
    for QNAME in norescore_ambig_dict:
        if rescore_ambig_dict is not None and QNAME in rescore_ambig_dict:
            continue
        candidates = norescore_ambig_dict[QNAME]
        # randint is inclusive on both ends, hence the shift to a 0-based index.
        chosen_index = random.randint(1, len(candidates)) - 1
        writer.write(candidates[chosen_index])
    writer.flush()
def extract(sam_file):
    """Group consecutive SAM records by QNAME and hand each group to writeTo.

    Records whose FLAG equals ``Constants.SAM_VALUE_UNMAPPED`` are ignored.
    Each run of consecutive records sharing a QNAME is passed to ``writeTo``
    together with a SAM writer on stdout.

    Returns:
        The sum of the values returned by ``writeTo``.
    """
    reader = SeqIterator.SeqIterator(sam_file, file_type=Constants.SAM)
    writer = SeqIterator.SeqWriter(sys.stdout, file_type=Constants.SAM)
    qname_key = Constants.SAM_KEY_QNAME
    group = []
    rescored_total = 0
    for record in reader:
        if record[Constants.SAM_KEY_FLAG] == Constants.SAM_VALUE_UNMAPPED:
            continue
        if group and record[qname_key] != group[0][qname_key]:
            # A new read name starts: emit the finished group first.
            rescored_total += writeTo(group, writer)
            group = [record]
        else:
            group.append(record)
    # Emit whatever group is still pending at end of input.
    rescored_total += writeTo(group, writer)
    writer.flush()
    return rescored_total
Пример #15
0
def removeDuplicates(reads, file_type = Constants.FASTQ):
    """Copy reads to stdout, keeping only the first record for each ID.

    While the running record count is at most 10, each kept sequence is also
    echoed to stderr and the writer is flushed immediately.

    Returns:
        A ``(records_seen, duplicates_dropped)`` tuple.
    """
    reader = SeqIterator.SeqIterator(reads, file_type = file_type)
    writer = SeqIterator.SeqWriter(sys.stdout, file_type = file_type)
    records_seen = 0
    duplicates_dropped = 0
    seen_ids = set()
    for record in reader:
        records_seen += 1
        read_id = record[0]
        if read_id in seen_ids:
            duplicates_dropped += 1
            continue
        seen_ids.add(read_id)
        writer.write(record)
        if records_seen > 10:
            continue
        sys.stderr.write(record[1] + "\n")
        sys.stderr.flush()
        writer.flush()
    writer.flush()
    return (records_seen, duplicates_dropped)
Пример #16
0
def processSAM(SAM_file, gzip_switch, paired_end):
    """Load a SAM file into a dictionary and tally it into a Counts object.

    The records are grouped with ``convertToDict("R1", "R2")`` and then passed
    to ``processPaired`` or ``processSingle`` depending on ``paired_end``.

    Returns:
        The populated ``Counts`` instance; the number of records processed is
        stored on its ``__sam`` attribute.
    """
    reader = SeqIterator.SeqIterator(SAM_file,
                                     file_type=Constants.SAM,
                                     gzip_switch=gzip_switch)
    tally = Counts()
    grouped_records = reader.convertToDict("R1", "R2")
    tally.__sam = reader.records_processed()
    if paired_end:
        processPaired(grouped_records, tally)
    else:
        processSingle(grouped_records, tally)
    return tally
Пример #17
0
def hairpinValidate(hairpin_recovered_filename, test_filename, bound,
                    gzip_switch):
    """Compare test alignments against unique hairpin-recovered alignments.

    A hairpin read is used only if it has exactly one record, no ``XA:Z``
    field, and a flag that is neither unmapped nor filtered.  Each such read
    is looked up by name in the test SAM file and classified.

    Args:
        hairpin_recovered_filename: SAM file with the reference alignments.
        test_filename: SAM file with the alignments under test.
        bound: Maximum allowed position difference for a correct call.
        gzip_switch: Gzip flag applied to the test file only.

    Returns:
        A dict with the counts ``Hairpin_Hits``, ``Test_NotFound``,
        ``Test_Ambig``, ``Test_UnmapFilt``, ``Test_Correct`` and
        ``Test_Incorrect``.
    """
    counter = {
        "Hairpin_Hits": 0,
        "Test_NotFound": 0,
        "Test_Ambig": 0,
        "Test_UnmapFilt": 0,
        "Test_Correct": 0,
        "Test_Incorrect": 0
    }
    hairpin_reader = SeqIterator.SeqIterator(hairpin_recovered_filename,
                                             file_type=Constants.SAM)
    hairpin_dict = hairpin_reader.convertToDict()
    test_reader = SeqIterator.SeqIterator(test_filename,
                                          file_type=Constants.SAM,
                                          gzip_switch=gzip_switch)
    test_dict = test_reader.convertToDict()
    for H_QNAME in hairpin_dict:
        hairpin_hits = hairpin_dict[H_QNAME]
        if len(hairpin_hits) != 1 or "XA:Z" in hairpin_hits[0]:
            continue
        hairpin_SAM = hairpin_hits[0]
        hairpin_flag = hairpin_SAM[Constants.SAM_KEY_FLAG]
        if isUnmapped(hairpin_flag) or isFiltered(hairpin_flag):
            continue
        counter["Hairpin_Hits"] += 1
        hairpin_pos = int(hairpin_SAM[Constants.SAM_KEY_POS])
        test_hits = test_dict.get(H_QNAME)
        if test_hits is None:
            counter["Test_NotFound"] += 1
            continue
        if len(test_hits) != 1 or "XA:Z" in test_hits[0]:
            counter["Test_Ambig"] += 1
            continue
        test_SAM = test_hits[0]
        test_pos = int(test_SAM[Constants.SAM_KEY_POS])
        test_flag = test_SAM[Constants.SAM_KEY_FLAG]
        if isUnmapped(test_flag) or isFiltered(test_flag):
            outcome = "Test_UnmapFilt"
        elif (test_SAM[Constants.SAM_KEY_RNAME]
              == hairpin_SAM[Constants.SAM_KEY_RNAME]
              and hairpin_pos - bound <= test_pos <= hairpin_pos + bound):
            # Same reference and within ``bound`` of the hairpin position.
            outcome = "Test_Correct"
        else:
            outcome = "Test_Incorrect"
        counter[outcome] += 1
    return counter
Пример #18
0
def findASDistribution(sam_file, bucket_count=bucket_count_default):
    """Bucket the alignment scores of a SAM file into a histogram.

    The file is read twice: the first pass finds the score range and the
    number of aligned records, the second fills the buckets.  Records whose
    RNAME starts with ``Constants.SAM_VALUE_STAR`` are counted as no-hits and
    otherwise ignored; their alignment score is never parsed.

    Args:
        sam_file: Path of the SAM file.
        bucket_count: Number of equal-width buckets spanning the score range.

    Returns:
        A tuple ``(freq, score_bounds_list, total_records, as_max, as_min,
        buckets, avg_score, median_score, no_hits)``.  ``buckets`` has
        ``bucket_count + 1`` slots; the last one receives the maximum score.
        ``median_score`` is the score of the record at position
        ``total_records // 2`` in file order -- the scores are not sorted, so
        it is a true median only for score-sorted input.  When no record is
        aligned, the range, average and frequencies are all zero.
    """
    sam_input = SeqIterator.SeqIterator(sam_file, file_type=Constants.SAM)
    as_max = float("-inf")
    as_min = float("inf")
    no_hits = 0
    total_records = 0
    for record in sam_input:
        if record[Constants.SAM_KEY_RNAME].startswith(
                Constants.SAM_VALUE_STAR):
            no_hits += 1
            continue
        # Parse the score only for aligned records.
        alignment_score = float(record[Constants.SAM_KEY_ALIGNMENT_SCORE])
        if alignment_score > as_max:
            as_max = alignment_score
        if alignment_score < as_min:
            as_min = alignment_score
        total_records += 1
    if total_records == 0:
        # No aligned record: collapse the infinite sentinels to an empty range.
        as_max = as_min = 0.0
    sam_input.reset()
    buckets = [0] * (bucket_count + 1)
    as_range = as_max - as_min
    bucket_length = as_range / bucket_count
    score_bounds = as_min
    score_bounds_list = []
    for _ in range(bucket_count):
        score_bounds += bucket_length
        score_bounds_list.append(score_bounds)
    seen = 0
    total_score = 0
    median_position = int(total_records / 2)
    median_score = 0
    for record in sam_input:
        if record[Constants.SAM_KEY_RNAME].startswith(
                Constants.SAM_VALUE_STAR):
            continue
        alignment_score = float(record[Constants.SAM_KEY_ALIGNMENT_SCORE])
        if seen == median_position:
            median_score = alignment_score
        total_score += alignment_score
        if bucket_length:
            bucket_index = int(
                math.floor((alignment_score - as_min) / bucket_length))
            # Guard against rounding pushing the index past the last slot.
            bucket_index = min(bucket_index, bucket_count)
        else:
            # All scores are equal: everything falls into the first bucket.
            bucket_index = 0
        buckets[bucket_index] += 1
        seen += 1
    assert total_records == seen, "the two passes saw different record counts"
    avg_score = total_score / float(seen) if seen else 0.0
    if total_records:
        freq = [x / float(total_records) for x in buckets]
    else:
        freq = [0.0] * len(buckets)
    return (freq, score_bounds_list, total_records, as_max, as_min, buckets,
            avg_score, median_score, no_hits)
def processSAM(SAM_file, gzip_switch, bound, paired_end):
    """Score aligned positions against the positions encoded in read names.

    The QNAME of each analyzed read is split on ``_``, ``:`` and ``-``; field
    1 is compared with the reference name and field 2 is taken as the true
    position.  An alignment is correct when the reference matches and the
    aligned position is within ``bound`` of the true position.  In paired-end
    mode either mate may satisfy this.

    Skipped without analysis: single-end reads with several records or an
    ``XA:Z`` field; reads whose first record is unmapped or filtered
    (counted); paired-end groups that do not have exactly two records, have an
    ``XA:Z`` field, or are not properly paired (counted as improper).

    The first 10 record groups are echoed to stderr.

    Returns:
        A dict with the counts ``Total_SAM_Records``, ``Records_Analyzed``,
        ``Correct``, ``Incorrect``, ``Unmap/Filter`` and ``Improper``.
    """
    sam_iterator = SeqIterator.SeqIterator(SAM_file,
                                           file_type=Constants.SAM,
                                           gzip_switch=gzip_switch)
    counter_dict = {"Total_SAM_Records" : 0, "Records_Analyzed" : 0, "Correct" : 0, "Incorrect" : 0, "Unmap/Filter" : 0, "Improper" : 0}
    record_dict = sam_iterator.convertToDict("R1", "R2")
    counter_dict["Total_SAM_Records"] = sam_iterator.records_processed()
    echoed = 0
    for key in record_dict.keys():
        records = record_dict[key]
        if echoed < 10:
            sys.stderr.write("%s\n" % str(records))
            echoed += 1
        if not paired_end and (len(records) > 1 or "XA:Z" in records[0]):
            continue
        first = records[0]
        first_flag = first[Constants.SAM_KEY_FLAG]
        if isUnmapped(first_flag) or isFiltered(first_flag):
            counter_dict["Unmap/Filter"] += 1
            continue
        if paired_end:
            if len(records) != 2:
                continue
            second = records[1]
            if "XA:Z" in first or "XA:Z" in second:
                continue
            if not (isProper(first[Constants.SAM_KEY_FLAG])
                    and isProper(second[Constants.SAM_KEY_FLAG])):
                counter_dict["Improper"] += 1
                continue
        counter_dict["Records_Analyzed"] += 1
        name_fields = re.split('_|:|-', first[Constants.SAM_KEY_QNAME])
        true_pos = int(name_fields[2])
        RNAME = first[Constants.SAM_KEY_RNAME]
        mapped_pos = int(first[Constants.SAM_KEY_POS])
        upper = true_pos + bound
        lower = true_pos - bound
        is_correct = (name_fields[1] == RNAME
                      and lower <= mapped_pos <= upper)
        if paired_end:
            mate_rname = second[Constants.SAM_KEY_RNAME]
            mate_pos = int(second[Constants.SAM_KEY_POS])
            # Either mate landing near the true position counts as correct.
            is_correct = is_correct or (name_fields[1] == mate_rname
                                        and lower <= mate_pos <= upper)
        if is_correct:
            counter_dict["Correct"] += 1
        else:
            counter_dict["Incorrect"] += 1
    return counter_dict
Пример #20
0
def getSeqCount(file_str, t, length):
    """Count the sequences in a file, optionally with a minimum length.

    Args:
        file_str: Path of the sequence file.
        t: File type handed to ``SeqIterator.SeqIterator``.
        length: Minimum sequence length to count.  A negative value counts
            every record using the iterator's own ``count()``.

    Returns:
        The number of qualifying sequences.
    """
    reader = SeqIterator.SeqIterator(file_str, file_type=t)
    if length < 0:
        return reader.count()
    return sum(1 for rec in reader if len(rec[1]) >= length)
def findASDistribution(sam_file,
                       use_edit_distance,
                       bucket_count=bucket_count_default):
    """Summarize length-normalized alignment scores (or edit distances).

    Each aligned record contributes its alignment score, or its edit distance
    when ``use_edit_distance`` is true, divided by the length of its SEQ
    field.  Records whose RNAME starts with ``Constants.SAM_VALUE_STAR`` or
    whose flag is unmapped are counted as no-hits and skipped.

    Args:
        sam_file: Path of the SAM file.
        use_edit_distance: Use ``Constants.SAM_KEY_DISTANCE`` instead of
            ``Constants.SAM_KEY_ALIGNMENT_SCORE``.
        bucket_count: Number of equal-width histogram buckets; a value of 0 or
            less skips the histogram.

    Returns:
        With buckets: ``(freq, score_bounds_list, total_records, as_max,
        as_min, buckets, avg_score, median_score, no_hits)``.
        Without buckets: ``(scores, "", total_records, as_max, as_min, "",
        avg_score, median_score, no_hits)`` where ``scores`` is sorted.
        ``median_score`` is the element at index ``total_records // 2`` of the
        sorted scores.  When no record is aligned, the extrema, average and
        median are ``0.0`` and all frequencies are zero.
    """
    sam_input = SeqIterator.SeqIterator(sam_file, file_type=Constants.SAM)
    no_hits = 0
    scores = []
    for record in sam_input:
        if record[Constants.SAM_KEY_RNAME].startswith(
                Constants.SAM_VALUE_STAR) or isUnmapped(
                    record[Constants.SAM_KEY_FLAG]):
            no_hits += 1
            continue
        if not use_edit_distance:
            raw_score = record[Constants.SAM_KEY_ALIGNMENT_SCORE]
        else:
            raw_score = record[Constants.SAM_KEY_DISTANCE]
        scores.append(
            float(raw_score) / float(len(record[Constants.SAM_KEY_SEQ])))
    scores.sort()
    total_records = len(scores)
    if scores:
        # The list is sorted, so the extrema are its two ends.
        as_min = scores[0]
        as_max = scores[-1]
        avg_score = sum(scores) / float(total_records)
        median_score = scores[total_records // 2]
    else:
        as_min = as_max = avg_score = median_score = 0.0
    if bucket_count > 0:
        buckets = [0] * (bucket_count + 1)
        as_range = as_max - as_min
        bucket_length = as_range / bucket_count
        score_bounds = as_min
        score_bounds_list = []
        for _ in range(bucket_count):
            score_bounds += bucket_length
            score_bounds_list.append(score_bounds)
        for alignment_score in scores:
            if bucket_length:
                bucket_index = int(
                    math.floor((alignment_score - as_min) / bucket_length))
                # Guard against rounding pushing the index past the last slot.
                bucket_index = min(bucket_index, bucket_count)
            else:
                # All scores are equal: everything falls into the first bucket.
                bucket_index = 0
            buckets[bucket_index] += 1
        if total_records:
            freq = [x / float(total_records) for x in buckets]
        else:
            freq = [0.0] * len(buckets)
        return (freq, score_bounds_list, total_records, as_max, as_min,
                buckets, avg_score, median_score, no_hits)
    else:
        return (scores, "", total_records, as_max, as_min, "", avg_score,
                median_score, no_hits)
Пример #22
0
def extract(sam_file):
    """Group consecutive SAM records by QNAME and collect them via writeTo.

    Records whose FLAG equals ``Constants.SAM_VALUE_UNMAPPED`` are ignored.
    Each run of consecutive records sharing a QNAME is passed to ``writeTo``
    together with the result dictionary.

    Returns:
        The dictionary that ``writeTo`` was given to fill.
    """
    reader = SeqIterator.SeqIterator(sam_file, file_type=Constants.SAM)
    qname_key = Constants.SAM_KEY_QNAME
    group = []
    ambig_dictionary = {}
    for record in reader:
        if record[Constants.SAM_KEY_FLAG] == Constants.SAM_VALUE_UNMAPPED:
            continue
        if group and record[qname_key] != group[0][qname_key]:
            # A new read name starts: hand over the finished group first.
            writeTo(group, ambig_dictionary)
            group = [record]
        else:
            group.append(record)
    # Hand over whatever group is still pending at end of input.
    writeTo(group, ambig_dictionary)
    return ambig_dictionary
Пример #23
0
def processSAM(SAM_file, gzip_switch, paired_end, filter_value):
    """Tally a single-end, uncompressed SAM file into a Counts object.

    Args:
        SAM_file: Path of the SAM file.
        gzip_switch: Must be falsy; gzip input is not implemented.
        paired_end: Must be falsy; paired-end input is not implemented.
        filter_value: Threshold, converted to float and passed on to
            ``processSingle``.

    Returns:
        The populated ``Counts`` instance; the number of records processed is
        stored on its ``__sam`` attribute.

    Raises:
        NotImplementedError: If ``paired_end`` or ``gzip_switch`` is truthy.
    """
    for unsupported_option in (paired_end, gzip_switch):
        if unsupported_option:
            raise NotImplementedError
    threshold = float(filter_value)
    reader = SeqIterator.SeqIterator(SAM_file,
                                     file_type=Constants.SAM,
                                     gzip_switch=gzip_switch)
    tally = Counts()
    grouped_records = reader.convertToDict("R1", "R2")
    tally.__sam = reader.records_processed()
    # Paired-end input was rejected above, so only the single-end path runs.
    processSingle(grouped_records, tally, threshold)
    return tally
Пример #24
0
def extractFeatures(FASTQ_file):
    """Print a feature header and one feature row per usable FASTQ read.

    The header is the list of 28 whole-read labels followed by 15 labels for
    each of three partitions (suffixes ``_1``, ``_2``, ``_3``).  Each read is
    passed to ``extractFeatureRow``; truthy results are printed.  The number
    of printed rows and the total number of reads are reported on stderr.
    """
    fastq_iter = SeqIterator.SeqIterator(FASTQ_file, file_type='fastq')
    # 28 features including the seq_id
    whole_read_labels = ["seq_id", "entropy", "dust", "bz2", "lzma", "length"]
    whole_read_labels += ["freq_" + base for base in "ACGT"]
    whole_read_labels += ["run_mean", "run_variance", "run_max_length"]
    for k in (2, 3, 4, 5):
        whole_read_labels += ["dkg_%d" % k, "rkg_%d" % k]
    quality_labels = [
        "qual_mean", "qual_max", "qual_min", "qual_variance", "qual_skewness",
        "qual_kurt", "qual_mean_diff"
    ]
    whole_read_labels += quality_labels
    # 15 features for each partition
    per_partition = ["entropy", "dust", "bz2", "lzma"]
    per_partition += ["freq_" + base for base in "ACGT"]
    per_partition += quality_labels
    header = list(whole_read_labels)
    for partition in (1, 2, 3):
        header += ["%s_%d" % (label, partition) for label in per_partition]
    print(header)
    rows_printed = 0
    reads_seen = 0
    for fastq_record in fastq_iter:
        row = extractFeatureRow(fastq_record)
        reads_seen += 1
        if not row:
            continue
        print(row)
        rows_printed += 1
    sys.stderr.write("The number without Ns, and the total "
                     "in the file:\t%d\t%d\n" % (rows_printed, reads_seen))
0
def processSAM(SAM_file,
               gzip_switch,
               bound,
               paired_end,
               dwgsim,
               print_value=10):
    """Score aligned positions against the positions encoded in read names.

    The QNAME (with ``@`` removed) is split on ``_``, ``:`` and ``-``; unless
    ``dwgsim`` is set the first field is dropped.  Field 0 is then compared
    with the reference name, and fields 1 and 2 are taken as two true
    positions.

    Single-end: the alignment is correct when the reference matches and
    either the aligned position is within ``bound`` of the first true
    position, or -- for dwgsim -- the aligned position is within ``bound`` of
    the second true position, or -- otherwise -- the aligned end
    (``POS + len(SEQ) - 1``) is within ``bound`` of the second true position.

    Paired-end: the alignment is correct when either mate matches the
    reference and lies within ``bound`` of the first true position.

    Skipped without analysis: dwgsim reads whose key starts with ``rand_``;
    single-end reads with several records or an ``XA:Z`` field; reads whose
    first record is unmapped or filtered (counted); paired-end groups that do
    not have exactly two records, have an ``XA:Z`` field, or are not properly
    paired (counted as improper).

    The first ``print_value`` record groups are echoed to stderr.

    Returns:
        A dict with the counts ``Total_SAM_Records``, ``Records_Analyzed``,
        ``Correct``, ``Incorrect``, ``Unmap/Filter`` and ``Improper``.
    """
    sam_iterator = SeqIterator.SeqIterator(SAM_file,
                                           file_type=Constants.SAM,
                                           gzip_switch=gzip_switch)
    counter_dict = {
        "Total_SAM_Records": 0,
        "Records_Analyzed": 0,
        "Correct": 0,
        "Incorrect": 0,
        "Unmap/Filter": 0,
        "Improper": 0
    }
    record_dict = sam_iterator.convertToDict("R1", "R2")
    counter_dict["Total_SAM_Records"] = sam_iterator.records_processed()
    echoed = 0
    for key in record_dict.keys():
        records = record_dict[key]
        if echoed < print_value:
            sys.stderr.write("%s\n" % str(records))
            echoed += 1
        if dwgsim and key.startswith("rand_"):
            continue
        if not paired_end and (len(records) > 1 or "XA:Z" in records[0]):
            continue
        first = records[0]
        first_flag = first[Constants.SAM_KEY_FLAG]
        if isUnmapped(first_flag) or isFiltered(first_flag):
            counter_dict["Unmap/Filter"] += 1
            continue
        if paired_end:
            if len(records) != 2:
                continue
            second = records[1]
            if "XA:Z" in first or "XA:Z" in second:
                continue
            if not (isProper(first[Constants.SAM_KEY_FLAG])
                    and isProper(second[Constants.SAM_KEY_FLAG])):
                counter_dict["Improper"] += 1
                continue
        counter_dict["Records_Analyzed"] += 1
        name_fields = re.split(
            '_|:|-', first[Constants.SAM_KEY_QNAME].replace('@', ''))
        if not dwgsim:
            name_fields = name_fields[1:]
        # For DWGSIM, the real position is at index 1
        true_pos = int(name_fields[1])
        true_pos2 = int(name_fields[2])
        RNAME = first[Constants.SAM_KEY_RNAME]
        mapped_pos = int(first[Constants.SAM_KEY_POS])
        mapped_end = mapped_pos + len(first[Constants.SAM_KEY_SEQ]) - 1
        upper = true_pos + bound
        lower = true_pos - bound
        upper2 = true_pos2 + bound
        lower2 = true_pos2 - bound
        near_first = lower <= mapped_pos <= upper
        if paired_end:
            mate_rname = second[Constants.SAM_KEY_RNAME]
            mate_pos = int(second[Constants.SAM_KEY_POS])
            is_correct = ((name_fields[0] == RNAME and near_first)
                          or (name_fields[0] == mate_rname
                              and lower <= mate_pos <= upper))
        else:
            if dwgsim:
                near_second = lower2 <= mapped_pos <= upper2
            else:
                near_second = lower2 <= mapped_end <= upper2
            is_correct = (name_fields[0] == RNAME
                          and (near_first or near_second))
        if is_correct:
            counter_dict["Correct"] += 1
        else:
            counter_dict["Incorrect"] += 1
    return counter_dict
Пример #26
0
def extractRef(in_file, match_string, out_file_d):
    """Copy the FASTA records whose ID contains ``match_string``.

    Args:
        in_file: Path of the input FASTA file.
        match_string: Substring that a record ID must contain to be copied.
        out_file_d: Open output handle given to the FASTA writer.
    """
    reader = SeqIterator.SeqIterator(in_file, file_type=Constants.FASTA)
    writer = SeqIterator.SeqWriter(out_file_d, file_type=Constants.FASTA)
    for record in reader:
        if match_string not in record[0]:
            continue
        writer.write(record)