Example #1
def detect(data, args):
    in_file = data['r2_path']
    out_prefix = data['sample_id']
    out_file = out_prefix + "_polyA.dat.gz"
    out_name_false = out_prefix + "_none.dat.gz"
    counts = Counter()
    num_line = 0
    logger.my_logger.info("reading file %s" % in_file)
    logger.my_logger.info("creating files %s %s" % (out_file, out_name_false))
    data['detect'] = out_file
    if os.path.exists(out_file):
        return data
    with file_transaction(out_file) as tx_out_file:
        with open_fastq(in_file) as handle, gzip.open(tx_out_file, 'w') as out, gzip.open(out_name_false, 'w') as out_false:
            for line in handle:
                num_line += 1
                if num_line % 1000000 == 0:
                    logger.my_logger.info("read %s lines:" % num_line)
                if line.startswith("@HISEQ"):
                    name = line.strip()
                    seq = handle.next().strip()
                    handle.next()  # skip the FASTQ '+' separator line
                    qual = handle.next().strip()
                    find = _adapter(seq, qual)
                    #print "%s %s" % (seq, find)
                    if find:
                        seq, qual = find
                        ns = poly_A_percentage(seq)
                        if ns:
                            if ns[1]-ns[0] >= 6:
                                #print "positions are" + str(ns[0]) + ".." + str(ns[1])
                                mod = seq[:ns[0]]
                                seq_polyA = seq[ns[0]:ns[1]]
                                seq_gene = seq[ns[1]:]
                                qual_polyA = qual[ns[0]:ns[1]]
                                qual_gene = qual[ns[1]:]
                                #print "%s\t%s\t%s\t%s\t%s\t%s\n" % (name,mod,sf,qf)
                                out.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (name, ns[0], ns[1], mod, seq_polyA, qual_polyA, seq_gene, qual_gene))
                                counts['polyA'] += 1
                                if len(mod) > 0:
                                    counts['mod'] += 1
                            else:
                                counts['shortA'] += 1
                                out_false.write("%s\t%s\t%s\t%s\n" % ("shortA", name, seq, qual))
                        else:
                            counts['noA'] += 1
                            out_false.write("%s\t%s\t%s\t%s\n" % ("None", name, seq, qual))
                    else:
                        out_false.write("%s\t%s\t%s\t%s\n" % ("No_tag", name, seq, qual))
                        counts['notag'] += 1
        with file_transaction(out_prefix + ".stat") as tx_stat_file:
            df = Series(counts)
            df.to_csv(tx_stat_file, sep="\t")
        logger.my_logger.info("%s" % counts)
    return data
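The poly_A_percentage helper is not part of this listing. As a rough sketch of what such a detector might look like (a regex scan for the first run of A's; the 6-nt minimum matches the check above, but this implementation is an assumption, not the project's code):

import re

def find_polyA(seq, min_len=6):
    # hypothetical stand-in for poly_A_percentage: locate the first run of
    # at least min_len consecutive A's and return its (start, end) or None
    match = re.search("A{%d,}" % min_len, seq)
    return (match.start(), match.end()) if match else None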
Example #2
def prep_r2_with_barcode(fq1, fq2, out_file):

    safe_makedir(os.path.dirname(out_file))
    if file_exists(out_file):
        print ("%s and %s have already been barcode-prepped, skipping."
               % (fq1, fq2))
        return out_file

    with open_fastq(fq1) as r1_file, open_fastq(fq2) as r2_file:
        with file_transaction(out_file) as tx_out_file:
            out_handle = open(tx_out_file, "w")
            r1_r2 = itertools.izip(r1_file, r2_file)
            # each FASTQ record is four lines: the loop header pulls the name
            # lines and the three next() calls pull seq, '+', and quality
            for header1, header2 in r1_r2:
                seq1, seq2 = r1_r2.next()
                plus1, plus2 = r1_r2.next()
                qual1, qual2 = r1_r2.next()

                read_name1, read_name2 = header1.split()[0][1:], header2.split()[0][1:]
                assert read_name1 == read_name2, "FASTQ files may be out of order."
                seq2, qual2 = seq2.rstrip(), qual2.rstrip()
                barcode = mask(seq1[0:6], qual1[0:6], min_qual=10) + \
                          mask(seq1[6:], qual1[6:])
                seq, qual = seq2, qual2
                barcoded_name = ":".join([read_name2, barcode])

                print(format_fastq([barcoded_name, seq, qual]), file=out_handle)
            out_handle.close()
    return out_file
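mask is also defined elsewhere in the project. A minimal sketch with the same call shape (replace bases whose quality falls below min_qual with N; the phred+33 encoding is an assumption):

def mask(seq, qual, min_qual=10):
    # hypothetical version: N out any base below the phred+33 quality cutoff
    return "".join(base if ord(q) - 33 >= min_qual else "N"
                   for base, q in zip(seq, qual))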
Example #3
def _summarize(in_file, align_r2, count_file, out_file):
    log_file = out_file + ".log"
    logger.my_logger.info("summarize results")
    read_gene, counts_gene = _get_first_read(count_file)
    logger.my_logger.info("load read 1")
    read_gene = _get_second_read(align_r2, read_gene)
    logger.my_logger.info("load read 2")
    stats = defaultdict(Counter)
    if not os.path.exists(out_file):
        with gzip.open(in_file) as handle_polya, \
                open(log_file, 'w') as log_handle:
            for line in handle_polya:
                cols = line.strip().split("\t")
                read = cols[0].split(" ")[0].replace("@", "")
                if read in read_gene:
                    find = tune(cols[3], cols[4])
                    if len(cols[3] + cols[4] + cols[6]) > 135:
                        continue
                    if find:
                        log_handle.write("%s %s %s ---> %s %s\n" % (read, cols[3], cols[4], find, read_gene[read]))
                        if read_gene[read][1]:
                            #print "is polya"
                            gene = read_gene[read][0]
                            stats[gene]["polyA"] += 1
                            if find[0] != "":
                                stats[gene][find[0]] += 1
        with file_transaction(out_file) as tx_out_file:
            with open(tx_out_file, 'w') as out:
                for gene in counts_gene:
                    out.write("%s counts %s\n" % (gene, counts_gene[gene]))
                    if gene in stats:
                        for mod, c in stats[gene].iteritems():
                            out.write("%s %s %s\n" % (gene, mod, c))
Example #4
def rmdup(align_file, out_file):
    cmd = ("samtools view -bh {align_file} | samtools sort -o -n - {tmp} | bammarkduplicates rmdup=1   O={tx_out_file}")
    tmp = align_file + "_tmp"
    if not os.path.exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            do.run(cmd.format(**locals()))
    return out_file
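file_transaction throughout these examples is the bcbio-style transactional output: work happens against a temporary path that is only moved into place when the block finishes cleanly, so a crashed command never leaves a partial out_file. A minimal sketch of such a context manager (not bcbio's implementation):

import contextlib
import os
import shutil
import tempfile

@contextlib.contextmanager
def simple_file_transaction(out_file):
    # hypothetical stand-in: yield a temp path, rename into place on success
    tmp_dir = tempfile.mkdtemp()
    tx_out_file = os.path.join(tmp_dir, os.path.basename(out_file))
    try:
        yield tx_out_file
        if os.path.exists(tx_out_file):
            shutil.move(tx_out_file, out_file)
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)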
Example #5
def _bowtie_align(fastq_file, control_index, out_file):
    cmd = ("bowtie2 -p 4 --no-unal -x {control_index} -U {fastq_file} | samtools view -Shb /dev/stdin > {tx_out_file} ")
    stat_file = out_file + ".flagstat"
    if not os.path.exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            do.run(cmd.format(**locals()), "bowtie2 %s" % fastq_file)
    do.run("samtools flagstat {out_file} > {stat_file}".format(**locals()), "stats control sequences")
    return stat_file
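A hypothetical call, assuming a bowtie2 index was built for the control sequences (paths are made up):

stat_file = _bowtie_align("sample_R2.fastq.gz", "indexes/ercc_control",
                          "sample_control.bam")
# stat_file -> "sample_control.bam.flagstat"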
Example #6
def count_umi(sam_file, gtf_file, barcode_to_well, multimappers=False):
    """
    stripped down implementation of the HTSeq algorithm for counting
    """
    base, _ = os.path.splitext(sam_file)
    out_file = base + ".counts"
    out_umi_file = base + ".counts_umi.gz"
    out_umi_pos_file = base + ".counts_umi_pos.gz"
    if file_exists(out_file):
        return out_file
    wells = sorted(barcode_to_well.values())
    seen_umi = defaultdict(set)
    seen_umi_list = defaultdict(Counter)
    seen_umi_pos_list = defaultdict(Counter)
    exons = HTSeq.GenomicArrayOfSets("auto", stranded=False)
    gtf_handle = HTSeq.GFF_Reader(gtf_file)
    for feature in gtf_handle:
        if feature.type == "exon":
            exons[feature.iv] += feature.attr["gene_id"]

    sam_handle = HTSeq.SAM_Reader(sam_file)
    for read in sam_handle:
        if not read.aligned:
            continue
        if not multimappers:
            try:
                if read.optional_field("NH") > 1:
                    continue
            except KeyError:
                pass
        # intersect the gene sets overlapping each aligned (M) segment;
        # the read is only counted when exactly one gene remains
        iv_seq = (co.ref_iv for co in read.cigar if co.type == "M" and co.size > 0)
        fs = set()
        for iv in iv_seq:
            for iv2, fs2 in exons[iv].steps():
                if not fs:
                    fs = fs2.copy()
                else:
                    fs = fs.intersection(fs2)
        if len(fs) == 1:
            fields = read.original_sam_line.split("\t")
            position = "%s:%s" % (fields[2], fields[3])
            barcode, umi = get_barcode_and_umi(read)
            if barcode not in barcode_to_well:
                continue
            seen_umi[(list(fs)[0], barcode_to_well[barcode])].add(umi)
            seen_umi_list[(list(fs)[0], barcode_to_well[barcode])][umi] += 1
            seen_umi_pos_list[(position, barcode_to_well[barcode], list(fs)[0])][umi] += 1
    write_extensive_summary(seen_umi_list, out_umi_file)
    write_extensive_summary_by_pos(seen_umi_pos_list, out_umi_pos_file)
    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            print("\t".join(["feature"] + wells), file=out_handle)
            for feature in get_feature_names(gtf_file):
                counts = [len(seen_umi[(feature, well)]) for well in wells]
                print("\t".join([feature] + map(str, counts)), file=out_handle)
    return out_file
Example #7
def _assign_gene(in_file, prefix):
    """read featureCounts output and assign each read a gene"""
    out_file = prefix + "assign.dat"
    if not os.path.exists(out_file):
        with open(in_file) as handle, file_transaction(out_file) as tx_out_file:
            with open(tx_out_file, 'w') as out:
                for line in handle:
                    cols = line.strip().split("\t")
                    if cols[1] == "Assigned":
                        out.write("%s\t%s\n" % (cols[0], cols[2]))
    return out_file
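The input is the per-read assignment report featureCounts can write (its -R option); here the second column is the assignment status and the third is taken as the assigned feature, and only "Assigned" lines survive. A hypothetical call (filenames are made up):

assign_file = _assign_gene("sample.sam.featureCounts", "sample_")
# writes sample_assign.dat with lines like: READ_ID<TAB>GENE_ID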
Example #8
def _summarize(in_file, align_r2, count_file, align_r1,  out_file):
    if not os.path.exists(out_file):
        log_file = out_file + ".log"
        logger.my_logger.info("summarize results")
        read_gene, counts_gene = _get_first_read(count_file)
        read_position = _get_read1_position(align_r1, read_gene)
        logger.my_logger.info("load read 1 done")
        read_gene = _get_second_read(align_r2, read_gene)
        logger.my_logger.info("load read 2 done")
        stats = defaultdict(Counter)
        duplicate = {}
        find_dup = 0
        with gzip.open(in_file) as handle_polya, \
                open(log_file, 'w') as log_handle:
            for line in handle_polya:
                cols = line.strip().split("\t")
                read = cols[0].split(" ")[1].replace("@", "")
                primer = cols[0].split(" ")[0]
                if read in read_gene:
                    log_handle.write("found %s %s %s %s ---> %s\n" % (read, primer, cols[3], cols[4], read_gene[read]))
                    if read_gene[read][1]:
                        if len(cols[3] + cols[4] + cols[6]) > 135:
                            continue
                        find = tune(cols[3], cols[4])
                        pos = read_position[read][0]
                        log_handle.write("log %s %s %s %s\n" % (read, primer, pos, read_gene[read]))
                        if (pos, primer) in duplicate:
                            find_dup += 1
                            continue
                        if find:  # (pos, primer) duplicates were skipped above
                            duplicate[(pos, primer)] = 0
                            log_handle.write("corrected %s %s %s --->%s %s\n" % (read, cols[3], cols[4], find, read_gene[read]))
                            #print "is polya"
                            gene = read_gene[read][0]
                            polya_size = _get_bin(len(find[1]))
                            stats[gene][polya_size] += 1
                            if find[0] != "":
                                stats[gene][(polya_size, find[0])] += 1
                        else:
                            log_handle.write("removed %s %s %s ---> %s %s\n" % (read, cols[3], cols[4], find, read_gene[read]))
        with file_transaction(out_file) as tx_out_file:
            with open(tx_out_file, 'w') as out:
                for gene in counts_gene:
                    out.write("%s total counts %s 0\n" % (gene, counts_gene[gene]))
                    if gene in stats:
                        for mod, c in stats[gene].iteritems():
                            if isinstance(mod, tuple):
                                u_times = _get_u_times(mod[1])
                                out.write("%s %s %s %s %s\n" % (gene, mod[0], mod[1], c, u_times))
                            else:
                                out.write("%s polyA %s %s 0\n" % (gene, mod, c))
        logger.my_logger.info("Found %s exact duplicates\n" % find_dup)
Example #9
def bwa_align(fastq_path, reference_prefix, out_file, cores=1):
    edit_distance = MAX_EDIT_DISTANCE
    if file_exists(out_file):
        print ("%s has already been aligned, skipping." % (fastq_path))
        return out_file

    with file_transaction(out_file) as tx_out_file:

        cmd = ("bwa aln -n {edit_distance} -l 24 {reference_prefix} "
               "{fastq_path} -t {cores} | bwa samse {reference_prefix} - {fastq_path} "
               "> {tx_out_file}").format(**locals())
        subprocess.check_call(cmd, shell=True)
    return out_file
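A hypothetical call (index prefix and paths are made up; MAX_EDIT_DISTANCE is a module-level constant in the source):

sam_file = bwa_align("sample_R1.fastq", "indexes/hg19", "sample.sam", cores=4)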
Example #10
def _get_counts_stats(count_file, out_file):
    seen = set()
    stats = Counter()
    if os.path.exists(out_file):
        return out_file
    with file_transaction(out_file) as tx_out_file:
        with open(count_file) as in_handle:
            for line in in_handle:
                read, label = line.strip().split("\t")[:2]
                if read not in seen:
                    stats[label] += 1
                seen.add(read)
        with open(tx_out_file, "w") as out_handle:
            for label in stats:
                out_handle.write("%s %s\n" % (label, stats[label]))
    return out_file
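Because seen tracks read names, a read that appears on several lines counts toward its label only once. A hypothetical call (filenames are made up):

stats_file = _get_counts_stats("sample_assign.dat", "sample_assign.stat")
# writes lines like: GENE_ID read_count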
Example #11
def clean_align(align_file, out_file):
    if file_exists(out_file):
        logger.my_logger.info("%s has already been cleaned, skipping." % (align_file))
        return out_file

    count_total_reads = 0
    count_assigned_reads = 0
    count_assigned_aligned_reads = 0
    with pysam.Samfile(align_file, "r") as in_handle, file_transaction(out_file) as tx_out_file:
        out_handle = pysam.Samfile(tx_out_file, "wh", template=in_handle)
        for read in in_handle:
            count_total_reads += 1
            count_assigned_reads += 1
            if poorly_mapped_read(read):
                continue
            count_assigned_aligned_reads += 1
            out_handle.write(read)
        out_handle.close()
        logger.my_logger.info("%s: %s reads seen, %s kept after cleaning" %
                              (align_file, count_total_reads,
                               count_assigned_aligned_reads))

    return out_file
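poorly_mapped_read is defined elsewhere; a minimal sketch of such a filter using pysam's read flags (the mapping-quality cutoff is an assumption):

def poorly_mapped_read(read, min_mapq=1):
    # hypothetical filter: drop unmapped, secondary, or low-MAPQ alignments
    return read.is_unmapped or read.is_secondary or read.mapq < min_mapq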
Example #12
def _summarize(in_file, align_r2, count_file, out_file):
    log_file = out_file + ".log"
    logger.my_logger.info("summarize results")
    read_gene, counts_gene = _get_first_read(count_file)
    logger.my_logger.info("load read 1 done")
    read_gene = _get_second_read(align_r2, read_gene)
    logger.my_logger.info("load read 2 done")
    stats = defaultdict(Counter)
    if not os.path.exists(out_file):
        with gzip.open(in_file) as handle_polya, \
                open(log_file, 'w') as log_handle:
            for line in handle_polya:
                cols = line.strip().split("\t")
                read = cols[0].split(" ")[0].replace("@", "")
                if read in read_gene:
                    log_handle.write("found %s %s %s ---> %s\n" % (read, cols[3], cols[4], read_gene[read]))
                    if read_gene[read][1]:
                        if len(cols[3] + cols[4] + cols[6]) > 135:
                            continue
                        find = tune(cols[3], cols[4])
                        if find:
                            log_handle.write("corrected %s %s %s --->%s %s\n" % (read, cols[3], cols[4], find, read_gene[read]))
                            #print "is polya"
                            gene = read_gene[read][0]
                            polya_size = _get_bin(len(find[1]))
                            stats[gene][polya_size] += 1
                            if find[0] != "":
                                stats[gene][(polya_size, find[0])] += 1
                        else:
                            log_handle.write("removed %s %s %s ---> %s %s\n" % (read, cols[3], cols[4], find, read_gene[read]))
        with file_transaction(out_file) as tx_out_file:
            with open(tx_out_file, 'w') as out:
                for gene in counts_gene:
                    out.write("%s total counts %s 0\n" % (gene, counts_gene[gene]))
                    if gene in stats:
                        for mod, c in stats[gene].iteritems():
                            if isinstance(mod, tuple):
                                u_times = _get_u_times(mod[1])
                                out.write("%s %s %s %s %s\n" % (gene, mod[0], mod[1], c, u_times))
                            else:
                                out.write("%s polyA %s %s 0\n" % (gene, mod, c))
Example #13
def write_extensive_summary_by_pos(well_umi_gen, out_file):
    with file_transaction(out_file) as tx_out_file:
        with gzip.open(tx_out_file, 'wb') as out_handle:
            # one line per UMI: umi, position, well, gene, count
            for gen_well, umis in well_umi_gen.items():
                key = "\t%s\t%s\t%s\t" % gen_well
                for umi, count in umis.items():
                    out_handle.write("%s%s%s\n" % (umi, key, count))