Example #1
def scrub_sample_GFFs(sample_dirs, gff_filename, count_filename,
                      group_filename, fastq_filename, output_prefix, tree):
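    # For each sample directory: scrub the splice junctions of every multi-exon
    # record against the supplied junction tree, write the result to
    # <output_prefix>.gff.tmp, then collapse redundant scrubbed records together
    # with the matching group/count/fastq files.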

    for sample_name, d in sample_dirs.items():
        outf = open(os.path.join(d, output_prefix + '.gff.tmp'), 'w')
        for r in GFF.collapseGFFReader(os.path.join(d, gff_filename)):
            n = len(r.ref_exons)
            if n == 1:
                GFF.write_collapseGFF_format(outf, r)
                continue

            new_ref_exons = scrub_ref_exons(r, tree)
            if new_ref_exons is None:
                print("No changes made due to error:",
                      r.seqid,
                      file=sys.stderr)
            else:
                #print "before:", r.ref_exons
                #print "after :", new_ref_exons
                r.ref_exons = new_ref_exons
            GFF.write_collapseGFF_format(outf, r)
        outf.close()
        cleanup_scrubbed_files_redundancy(
            outf.name,
            os.path.join(d, group_filename),
            os.path.join(d, count_filename),
            os.path.join(d, fastq_filename) if fastq_filename is not None else None,
            os.path.join(d, output_prefix))
Example #2
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("input_prefix",
                        help="Input prefix (ex: test.collapsed.min_fl_2)")
    parser.add_argument("--fuzzy_junction",
                        type=int,
                        default=5,
                        help="Fuzzy junction max dist (default: 5bp)")

    args = parser.parse_args()
    output_prefix = args.input_prefix + '.filtered'

    count_filename, gff_filename, rep_filename, rep_type = sanity_check_collapse_input(
        args.input_prefix)

    recs = defaultdict(lambda: [])
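    # recs: PB gene index --> list of records, so filter_out_subsets() only
    # compares isoforms that belong to the same PB gene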
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith('PB.')
        recs[int(r.seqid.split('.')[1])].append(r)

    good = []
    f = open(output_prefix + '.gff', 'w')
    keys = sorted(recs.keys())
    for k in keys:
        xxx = recs[k]
        filter_out_subsets(xxx, args.fuzzy_junction)
        for r in xxx:
            GFF.write_collapseGFF_format(f, r)
            good.append(r.seqid)
    f.close()

    # read abundance first
    d, count_header = read_count_file(count_filename)

    # write output rep.fq
    f = open(output_prefix + '.rep.' + ('fq' if rep_type == 'fastq' else 'fa'),
             'w')
    for r in SeqIO.parse(open(rep_filename), rep_type):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, rep_type)
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'], \
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print("Output written to:", output_prefix + '.gff', file=sys.stderr)
    print("Output written to:", output_prefix + '.rep.' + ('fq' if rep_type == 'fastq' else 'fa'), file=sys.stderr)
    print("Output written to:", output_prefix + '.abundance.txt', file=sys.stderr)
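A typical invocation, assuming main() above is the entry point of a standalone filtering script (the script name is illustrative):

    python filter_away_subset.py test.collapsed.min_fl_2 --fuzzy_junction 5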
Example #3
def chain_split_file(ref_gff, ref_group, ref_name, addon_gff, addon_group,
                     addon_name, fuzzy_junction, allow_5merge, max_3_diff,
                     n_chunks):
    addon_group_info = sp.MegaPBTree.read_group(addon_group, None)
    recs = []
    tree = OrderedDict()
    i = 0
    for r in GFF.collapseGFFReader(addon_gff):
        if r.chr not in tree:
            tree[r.chr] = {'+': ClusterTree(0, 0), '-': ClusterTree(0, 0)}
        tree[r.chr][r.strand].insert(r.start, r.end, i)
        recs.append(r)
        i += 1

    n = len(recs)
    chunk_size = (n // n_chunks) + (n % n_chunks > 0)
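    # ceiling division: every split file receives at most chunk_size records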

    split_files = []
    i = 0
    counter = 0
    f_gff = open(addon_gff + '.split' + str(i), 'w')
    f_group = open(addon_group + '.split' + str(i), 'w')
    for v1 in tree.values():
        for strand in ('+', '-'):
            v2 = v1[strand]
            for _start, _end, _indices in v2.getregions():
                for cur in _indices:
                    GFF.write_collapseGFF_format(f_gff, recs[cur])
                    f_group.write("{0}\t{1}\n".format(
                        recs[cur].seqid,
                        ",".join(addon_group_info[recs[cur].seqid])))
                    counter += 1
            if counter >= (i + 1) * chunk_size:
                i += 1
                f_gff.close()
                f_group.close()
                split_files.append((f_gff.name, f_group.name))
                f_gff = open(addon_gff + '.split' + str(i), 'w')
                f_group = open(addon_group + '.split' + str(i), 'w')
    if not f_gff.closed:
        f_gff.close()
        f_group.close()
        split_files.append((f_gff.name, f_group.name))

    result_prefixes = []
    pools = []
    for i, (split_gff, split_group) in enumerate(split_files):
        p = Process(target=chain_helper,
                    args=(ref_gff, ref_group, split_gff, split_group, ref_name,
                          addon_name + '.' + str(i), fuzzy_junction,
                          allow_5merge, max_3_diff))
        p.start()
        pools.append(p)
        result_prefixes.append((ref_name, addon_name + '.' + str(i)))
    for p in pools:
        p.join()
    return result_prefixes, split_files
Example #4
def write_reclist_to_gff_n_info(rec_list, final_prefix, ref_name, addon_name, use_fq=False):
    # now go through the rec list and figure out in what order we are outputting the total records
    tree = defaultdict(lambda: {'+':ClusterTree(0,0), '-':ClusterTree(0,0)})
    tree_keys_numeric = set()
    tree_keys_alpha = set()
    for i,match_rec in enumerate(rec_list):
        tree[match_rec.rec.chr][match_rec.rec.strand].insert(match_rec.rec.start, match_rec.rec.end, i)

    for chrom in tree:
        try:
            k = int(chrom)
            tree_keys_numeric.add(k)
        except ValueError:
            tree_keys_alpha.add(chrom)
    tree_keys = sorted(list(tree_keys_numeric)) + sorted(list(tree_keys_alpha))
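    # numeric chromosome names sort numerically and come first, followed by the
    # remaining names sorted alphabetically (e.g. 1, 2, 10, ..., chrX)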

    f_gff = open(final_prefix+'.gff', 'w')
    f_info = open(final_prefix+'.mega_info.txt', 'w')
    writer_info = DictWriter(f_info, fieldnames=['superPBID', ref_name, addon_name], delimiter='\t')
    writer_info.writeheader()
    f_group = open(final_prefix+'.group.txt', 'w')
    if use_fq:
        f_fq = open(final_prefix+'.rep.fq', 'w')
    # sort the combined gff (tree) by chromosome and strand ('+' strand first)

    new_group_info = {}

    pb_i = 0

    for _chr in tree_keys:
        # remember to convert potentially-integer chromosome keys back to strings now that we have sorted them!
        _chr = str(_chr)
        for _strand in ('+', '-'):
            for _start,_end,_indices in tree[_chr][_strand].getregions():
                # further sort these records by (start, end, num_exons)
                _indices.sort(key=lambda i: (rec_list[i].rec.start, rec_list[i].rec.end, len(rec_list[i].rec.ref_exons)))
                pb_i += 1
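                # each ClusterTree region becomes one PB "gene" (PB.<pb_i>); the records
                # inside it are numbered PB.<pb_i>.1, PB.<pb_i>.2, ... in sorted order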
                for pb_j, recs_index in enumerate(_indices):
                    pbgene = "PB.{0}".format(pb_i)
                    pbid = "PB.{0}.{1}".format(pb_i, pb_j + 1)
                    match_rec = rec_list[recs_index]
                    new_group_info[pbid] = match_rec.members
                    match_rec.rec.seqid = pbid
                    match_rec.rec.geneid = pbgene
                    GFF.write_collapseGFF_format(f_gff, match_rec.rec)
                    writer_info.writerow({'superPBID': pbid, ref_name: match_rec.ref_id, addon_name: match_rec.addon_id})
                    f_group.write("{0}\t{1}\n".format(pbid, ",".join(match_rec.members)))
                    if use_fq:
                        match_rec.seqrec.id = pbid
                        match_rec.seqrec.description = ''
                        SeqIO.write(match_rec.seqrec, f_fq, 'fastq')
    f_gff.close()
    f_info.close()
    f_group.close()
    if use_fq:
        f_fq.close()
    return new_group_info
Example #5
def regroup_sam_to_gff(pooled_sam, demux_count_file, output_prefix, out_group_dict, in_fafq=None):
    """
    :param pooled_sam: SAM file
    :param demux_count_file: comma-delimited per-barcode count file
    :param output_prefix: output prefix for GFF
    :param out_group_dict: dict of barcode name --> group it belongs to (ex: {'EM1':'EM', 'EM2':'EM'})
    :param in_fafq: optional fasta/fastq that was input to SAM
    """
    if in_fafq is not None: type_fafq = get_type_fafq(in_fafq)
    in_tissue = defaultdict(lambda: set()) # pbid --> list of tissue it is in (EM, END, R)

    for r in DictReader(open(demux_count_file), delimiter=','):
        for k, v in r.items():
            if k == 'id': continue
            if int(v) > 0: in_tissue[r['id']].add(k)

    in_tissue = dict(in_tissue)
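    # in_tissue now maps each pbid to the set of barcode columns with a nonzero count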

    handles = {}
    handles_fafq = {}
    for g in out_group_dict.values():
        handles[g] = open("{o}_{g}_only.gff".format(o=output_prefix, g=g), 'w')
        if in_fafq is not None: handles_fafq[g] = open("{o}_{g}_only.{t}".format(o=output_prefix, g=g, t=type_fafq), 'w')

    if in_fafq is not None:
        fafq_dict = SeqIO.to_dict(SeqIO.parse(open(in_fafq), type_fafq))
        fafq_dict_keys = list(fafq_dict.keys())
        for k in fafq_dict_keys:
            m = rex_pbid.match(k)
            if m is not None: fafq_dict[m.group(1)] = fafq_dict[k]
    reader = GMAPSAMReader(pooled_sam, True)
    for r in reader:
        if r.sID == '*':
            print("Ignore {0} because unmapped.".format(r.qID), file=sys.stderr)
            continue
        m = rex_pbid.match(r.qID)
        if m is not None: pbid = m.group(1)
        else: pbid = r.qID
        # convert SAM record to GFF record type
        r.seqid = pbid
        r.chr = r.sID
        r.start, r.end = r.sStart, r.sEnd
        r.strand = r.flag.strand
        r.ref_exons = r.segments
        r.cds_exons = None

        groups_to_write_in = set()
        if pbid not in in_tissue:
            print("WARNING: {0} does not belong to any group indicated by out_group_dict".format(pbid), file=sys.stderr)
            continue
        for tissue in in_tissue[pbid]:
            groups_to_write_in.add(out_group_dict[tissue])

        for g in groups_to_write_in:
            GFF.write_collapseGFF_format(handles[g], r)
            if in_fafq is not None:
                SeqIO.write(fafq_dict[pbid], handles_fafq[g], type_fafq)
Example #6
def regroup_sam_to_gff(pooled_sam, demux_count_file, output_prefix, out_group_dict, in_fafq=None):
    """
    :param pooled_sam: SAM file
    :param demux_count_file: comma-delimited per-barcode count file
    :param output_prefix: output prefix for GFF
    :param out_group_dict: dict of barcode name --> group it belongs to (ex: {'EM1':'EM', 'EM2':'EM'})
    :param in_fafq: optional fasta/fastq that was input to SAM
    """
    if in_fafq is not None: type_fafq = get_type_fafq(in_fafq)
    in_tissue = defaultdict(lambda: set()) # pbid --> list of tissue it is in (EM, END, R)

    for r in DictReader(open(demux_count_file), delimiter=','):
        for k, v in r.items():
            if k == 'id': continue
            if int(v) > 0: in_tissue[r['id']].add(k)

    in_tissue = dict(in_tissue)

    handles = {}
    handles_fafq = {}
    for g in out_group_dict.values():
        handles[g] = open("{o}_{g}_only.gff".format(o=output_prefix, g=g), 'w')
        if in_fafq is not None: handles_fafq[g] = open("{o}_{g}_only.{t}".format(o=output_prefix, g=g, t=type_fafq), 'w')

    if in_fafq is not None:
        fafq_dict = SeqIO.to_dict(SeqIO.parse(open(in_fafq), type_fafq))
        fafq_dict_keys = list(fafq_dict.keys())
        for k in fafq_dict_keys:
            m = rex_pbid.match(k)
            if m is not None: fafq_dict[m.group(1)] = fafq_dict[k]
    reader = GMAPSAMReader(pooled_sam, True)
    for r in reader:
        if r.sID == '*':
            print("Ignore {0} because unmapped.".format(r.qID), file=sys.stderr)
            continue
        m = rex_pbid.match(r.qID)
        if m is not None: pbid = m.group(1)
        else: pbid = r.qID
        # convert SAM record to GFF record type
        r.seqid = pbid
        r.chr = r.sID
        r.start, r.end = r.sStart, r.sEnd
        r.strand = r.flag.strand
        r.ref_exons = r.segments
        r.cds_exons = None

        groups_to_write_in = set()
        if pbid not in in_tissue:
            print("WARNING: {0} does not belong to any group indicated by out_group_dict".format(pbid), file=sys.stderr)
            continue
        for tissue in in_tissue[pbid]:
            groups_to_write_in.add(out_group_dict[tissue])

        for g in groups_to_write_in:
            GFF.write_collapseGFF_format(handles[g], r)
            if in_fafq is not None:
                SeqIO.write(fafq_dict[pbid], handles_fafq[g], type_fafq)
Example #7
def regroup_gff(pooled_gff,
                demux_count_file,
                output_prefix,
                out_group_dict,
                in_fafq=None):
    """
    :param pooled_gff: pooled GFF file (read with GFF.collapseGFFReader)
    :param demux_count_file: comma-delimited per-barcode count file
    :param output_prefix: output prefix for GFF
    :param out_group_dict: dict of barcode name --> group it belongs to (ex: {'EM1':'EM', 'EM2':'EM'})
    :param in_fafq: optional fasta/fastq that was input to SAM
    """
    if in_fafq is not None: type_fafq = get_type_fafq(in_fafq)
    in_tissue = defaultdict(
        lambda: set())  # pbid --> list of tissue it is in (EM, END, R)

    for r in DictReader(open(demux_count_file), delimiter=','):
        for k, v in r.items():
            if k == 'id': continue
            if int(v) > 0: in_tissue[r['id']].add(k)

    in_tissue = dict(in_tissue)

    handles = {}
    handles_fafq = {}
    for g in out_group_dict.values():
        handles[g] = open("{o}_{g}_only.gff".format(o=output_prefix, g=g), 'w')
        if in_fafq is not None:
            handles_fafq[g] = open(
                "{o}_{g}_only.{t}".format(o=output_prefix, g=g, t=type_fafq),
                'w')

    if in_fafq is not None:
        fafq_dict = SeqIO.to_dict(SeqIO.parse(open(in_fafq), type_fafq))
        fafq_dict_keys = list(fafq_dict.keys())
        for k in fafq_dict_keys:
            m = rex_pbid.match(k)
            if m is not None: fafq_dict[m.group(1)] = fafq_dict[k]
    reader = GFF.collapseGFFReader(pooled_gff)
    for r in reader:
        groups_to_write_in = set()
        pbid = r.seqid
        if pbid not in in_tissue:
            print(
                "WARNING: {0} does not belong to any group indicated by out_group_dict"
                .format(pbid),
                file=sys.stderr)
            continue
        for tissue in in_tissue[pbid]:
            groups_to_write_in.add(out_group_dict[tissue])

        for g in groups_to_write_in:
            GFF.write_collapseGFF_format(handles[g], r)
            if in_fafq is not None:
                SeqIO.write(fafq_dict[pbid], handles_fafq[g], type_fafq)
Example #8
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("input_prefix", help="Input prefix (ex: test.collapsed.min_fl_2)")
    parser.add_argument("--fuzzy_junction", type=int, default=5, help="Fuzzy junction max dist (default: 5bp)")

    args = parser.parse_args()
    output_prefix = args.input_prefix + '.filtered'

    count_filename, gff_filename, rep_filename = sanity_check_collapse_input(args.input_prefix)

    recs = defaultdict(lambda: [])
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith('PB.')
        recs[int(r.seqid.split('.')[1])].append(r)

    good = []
    f = open(output_prefix + '.gff', 'w')
    keys = sorted(recs.keys())
    for k in keys:
        xxx = recs[k]
        filter_out_subsets(xxx, args.fuzzy_junction)
        for r in xxx:
            GFF.write_collapseGFF_format(f, r)
            good.append(r.seqid)
    f.close()

    # read abundance first
    d, count_header = read_count_file(count_filename)

    # write output rep.fq
    f = open(output_prefix + '.rep.fq', 'w')
    for r in SeqIO.parse(open(rep_filename), 'fastq'):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, 'fastq')
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'], \
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print("Output written to:", output_prefix + '.gff', file=sys.stderr)
    print("Output written to:", output_prefix + '.rep.fq', file=sys.stderr)
    print("Output written to:", output_prefix + '.abundance.txt', file=sys.stderr)
Example #9
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("input_prefix",
                        help="Input prefix (ex: test.collapsed.min_fl_2)")

    args = parser.parse_args()
    output_prefix = args.input_prefix + '.nomono'

    count_filename, gff_filename, rep_filename = sanity_check_collapse_input(
        args.input_prefix)

    good = []
    f = open(output_prefix + '.gff', 'w')
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith('PB.')
        if len(r.ref_exons) > 1:
            good.append(r.seqid)
            GFF.write_collapseGFF_format(f, r)
    f.close()

    # read abundance first
    d, count_header = read_count_file(count_filename)

    # write output rep.fq
    f = open(output_prefix + '.rep.fq', 'w')
    for r in SeqIO.parse(open(rep_filename), 'fastq'):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, 'fastq')
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'], \
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print("Output written to:", output_prefix + '.gff', file=sys.stderr)
    print("Output written to:", output_prefix + '.rep.fq', file=sys.stderr)
    print("Output written to:", output_prefix + '.abundance.txt', file=sys.stderr)
Example #10
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("input_prefix", help="Input prefix (ex: test.collapsed.min_fl_2)")

    args = parser.parse_args()
    output_prefix = args.input_prefix + '.nomono'

    count_filename, gff_filename, rep_filename = sanity_check_collapse_input(args.input_prefix)

    good = []
    f = open(output_prefix + '.gff', 'w')
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith('PB.')
        if len(r.ref_exons) > 1:
            good.append(r.seqid)
            GFF.write_collapseGFF_format(f, r)
    f.close()

    # read abundance first
    d, count_header = read_count_file(count_filename)

    # write output rep.fq
    f = open(output_prefix + '.rep.fq', 'w')
    for r in SeqIO.parse(open(rep_filename), 'fastq'):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, 'fastq')
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'], \
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print("Output written to:", output_prefix + '.gff', file=sys.stderr)
    print("Output written to:", output_prefix + '.rep.fq', file=sys.stderr)
    print("Output written to:", output_prefix + '.abundance.txt', file=sys.stderr)
Example #11
def collapse_fuzzy_junctions(gff_filename, group_filename, allow_extra_5exon, internal_fuzzy_max_dist):
    def get_fl_from_id(members):
        try:
            # ex: 13cycle_1Mag1Diff|i0HQ_SIRV_1d1m|c139597/f1p0/178
            return sum(int(_id.split('/')[1].split('p')[0][1:]) for _id in members)
        except ValueError:
            return 0

    def can_merge(m, r1, r2):
        if m == 'exact':
            return True
        else:
            if not allow_extra_5exon:
                return False
        # below is continued only if (a) is 'subset' or 'super' AND (b) allow_extra_5exon is True
        if m == 'subset':
            r1, r2 = r2, r1 #  rotate so r1 is always the longer one
        if m == 'super' or m == 'subset':
            n2 = len(r2.ref_exons)
            # check that (a) r1 and r2 end on same 3' exon, that is the last acceptor site agrees
            # AND (b) the 5' start of r2 is sandwiched between the matching r1 exon coordinates
            if r1.strand == '+':
                return abs(r1.ref_exons[-1].start - r2.ref_exons[-1].start) <= internal_fuzzy_max_dist and \
                    r1.ref_exons[-n2].start <= r2.ref_exons[0].start < r1.ref_exons[-n2].end
            else:
                return abs(r1.ref_exons[0].end - r2.ref_exons[0].end) <= internal_fuzzy_max_dist and \
                    r1.ref_exons[n2-1].start <= r2.ref_exons[-1].end < r1.ref_exons[n2].end
        return False

    d = {}
    recs = defaultdict(lambda: {'+':IntervalTree(), '-':IntervalTree()}) # chr --> strand --> tree
    fuzzy_match = defaultdict(lambda: [])
    for r in GFF.collapseGFFReader(gff_filename):
        d[r.seqid] = r
        has_match = False
        r.segments = r.ref_exons
        for r2 in recs[r.chr][r.strand].find(r.start, r.end):
            r2.segments = r2.ref_exons
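            # note: max_5_diff / max_3_diff are read from a module-level args namespace defined in the original script, not in this snippet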
            m = compare_junctions.compare_junctions(r, r2, internal_fuzzy_max_dist=internal_fuzzy_max_dist, max_5_diff=args.max_5_diff, max_3_diff=args.max_3_diff)
            if can_merge(m, r, r2):
                fuzzy_match[r2.seqid].append(r.seqid)
                has_match = True
                break
        if not has_match:
            recs[r.chr][r.strand].insert(r.start, r.end, r)
            fuzzy_match[r.seqid] = [r.seqid]

    group_info = {}
    with open(group_filename) as f:
        for line in f:
            pbid, members = line.strip().split('\t')
            group_info[pbid] = [x for x in members.split(',')]

    # pick for each fuzzy group the one that has the most exons (if tie, then most FL)
    keys = sorted(fuzzy_match.keys(), key=lambda x: [int(p) for p in x.split('.')[1:]])
    f_gff = open(gff_filename+'.fuzzy', 'w')
    f_group = open(group_filename+'.fuzzy', 'w')
    for k in keys:
        all_members = []
        best_pbid, best_size, best_num_exons = fuzzy_match[k][0], len(group_info[fuzzy_match[k][0]]), len(d[fuzzy_match[k][0]].ref_exons)
        all_members += group_info[fuzzy_match[k][0]]
        for pbid in fuzzy_match[k][1:]:
            # note: get_fl_from_id only works on IsoSeq1 and 2 ID formats, will return 0 if IsoSeq3 format or other
            _size = get_fl_from_id(group_info[pbid])
            _num_exons = len(d[pbid].ref_exons)
            all_members += group_info[pbid]
            if _num_exons > best_num_exons or (_num_exons == best_num_exons and _size > best_size):
                best_pbid, best_size, best_num_exons = pbid, _size, _num_exons
        GFF.write_collapseGFF_format(f_gff, d[best_pbid])
        f_group.write("{0}\t{1}\n".format(best_pbid, ",".join(all_members)))
    f_gff.close()
    f_group.close()

    return fuzzy_match
Example #12
def collapse_fuzzy_junctions(gff_filename, group_filename, allow_extra_5exon,
                             internal_fuzzy_max_dist):
    def get_fl_from_id(members):
        # ex: 13cycle_1Mag1Diff|i0HQ_SIRV_1d1m|c139597/f1p0/178
        return sum(int(_id.split('/')[1].split('p')[0][1:]) for _id in members)

    def can_merge(m, r1, r2):
        if m == 'exact':
            return True
        else:
            if not allow_extra_5exon:
                return False
        # below is continued only if (a) is 'subset' or 'super' AND (b) allow_extra_5exon is True
        if m == 'subset':
            r1, r2 = r2, r1  #  rotate so r1 is always the longer one
        if m == 'super' or m == 'subset':
            n2 = len(r2.ref_exons)
            # check that (a) r1 and r2 end on same 3' exon, that is the last acceptor site agrees
            # AND (b) the 5' start of r2 is sandwiched between the matching r1 exon coordinates
            if r1.strand == '+':
                return abs(r1.ref_exons[-1].start - r2.ref_exons[-1].start) <= internal_fuzzy_max_dist and \
                    r1.ref_exons[-n2].start <= r2.ref_exons[0].start < r1.ref_exons[-n2].end
            else:
                return abs(r1.ref_exons[0].end - r2.ref_exons[0].end) <= internal_fuzzy_max_dist and \
                    r1.ref_exons[n2-1].start <= r2.ref_exons[-1].end < r1.ref_exons[n2].end
        return False

    d = {}
    recs = defaultdict(lambda: {
        '+': IntervalTree(),
        '-': IntervalTree()
    })  # chr --> strand --> tree
    fuzzy_match = defaultdict(lambda: [])
    for r in GFF.collapseGFFReader(gff_filename):
        d[r.seqid] = r
        has_match = False
        r.segments = r.ref_exons
        for r2 in recs[r.chr][r.strand].find(r.start, r.end):
            r2.segments = r2.ref_exons
            m = compare_junctions.compare_junctions(
                r, r2, internal_fuzzy_max_dist=internal_fuzzy_max_dist)
            if can_merge(m, r, r2):
                fuzzy_match[r2.seqid].append(r.seqid)
                has_match = True
                break
        if not has_match:
            recs[r.chr][r.strand].insert(r.start, r.end, r)
            fuzzy_match[r.seqid] = [r.seqid]

    group_info = {}
    with open(group_filename) as f:
        for line in f:
            pbid, members = line.strip().split('\t')
            group_info[pbid] = [x for x in members.split(',')]

    # pick for each fuzzy group the one that has the most exons (if tie, then most FL)
    keys = sorted(fuzzy_match.keys(), key=lambda x: [int(p) for p in x.split('.')[1:]])
    f_gff = open(gff_filename + '.fuzzy', 'w')
    f_group = open(group_filename + '.fuzzy', 'w')
    for k in keys:
        all_members = []
        best_pbid, best_size, best_num_exons = fuzzy_match[k][0], len(
            group_info[fuzzy_match[k][0]]), len(d[fuzzy_match[k][0]].ref_exons)
        all_members += group_info[fuzzy_match[k][0]]
        for pbid in fuzzy_match[k][1:]:
            _size = get_fl_from_id(group_info[pbid])
            _num_exons = len(d[pbid].ref_exons)
            all_members += group_info[pbid]
            if _num_exons > best_num_exons or (_num_exons == best_num_exons
                                               and _size > best_size):
                best_pbid, best_size, best_num_exons = pbid, _size, _num_exons
        GFF.write_collapseGFF_format(f_gff, d[best_pbid])
        f_group.write("{0}\t{1}\n".format(best_pbid, ",".join(all_members)))
    f_gff.close()
    f_group.close()

    return fuzzy_match
Example #13
def chain_split_file(ref_gff, ref_group, ref_name, addon_gff, addon_group,
                     addon_name, fuzzy_junction, allow_5merge, max_3_diff,
                     n_chunks):
    addon_group_info = sp.MegaPBTree.read_group(addon_group, None)
    recs = []
    tree = OrderedDict()
    i = 0
    for r in GFF.collapseGFFReader(addon_gff):
        if r.chr not in tree:
            tree[r.chr] = {'+': ClusterTree(0, 0), '-': ClusterTree(0, 0)}
        tree[r.chr][r.strand].insert(r.start, r.end, i)
        recs.append(r)
        i += 1

    n = len(recs)
    chunk_size = (n // n_chunks) + (n % n_chunks > 0)
    #print("# of recs: {0}, cpus: {1}, chunk_size: {2}".format(n, n_chunks, chunk_size))

    split_files = []
    i = 0
    counter = 0
    f_gff = open(addon_gff + '.split' + str(i), 'w')
    f_group = open(addon_group + '.split' + str(i), 'w')
    for v1 in tree.values():
        for strand in ('+', '-'):
            v2 = v1[strand]
            for _start, _end, _indices in v2.getregions():
                for cur in _indices:
                    GFF.write_collapseGFF_format(f_gff, recs[cur])
                    f_group.write("{0}\t{1}\n".format(
                        recs[cur].seqid,
                        ",".join(addon_group_info[recs[cur].seqid])))
                    counter += 1
            # note: because the records are organized by (chrom, strand), we may not end up using all the chunks,
            # ex: if all records are on the same locus, we end up writing everything to one split file
            if counter >= (i + 1) * chunk_size:
                i += 1
                f_gff.close()
                f_group.close()
                split_files.append((f_gff.name, f_group.name))
                if i >= n_chunks or counter >= len(recs):
                    break
                f_gff = open(addon_gff + '.split' + str(i), 'w')
                f_group = open(addon_group + '.split' + str(i), 'w')
    if not f_gff.closed:
        f_gff.close()
        f_group.close()
        split_files.append((f_gff.name, f_group.name))

    result_prefixes = []
    pools = []
    for i, (split_gff, split_group) in enumerate(split_files):
        p = Process(target=chain_helper,
                    args=(ref_gff, ref_group, split_gff, split_group, ref_name,
                          addon_name + '.' + str(i), fuzzy_junction,
                          allow_5merge, max_3_diff))
        p.start()
        pools.append(p)
        result_prefixes.append((ref_name, addon_name + '.' + str(i)))
    for p in pools:
        p.join()
    #print("split files: {0}, result_prefix: {1}".format(split_files, result_prefixes))
    return result_prefixes, split_files
Example #14
def get_gff_from_list(gff_filename, listfile, partial_ok=False):
    seqs = [line.strip() for line in open(listfile)]
    for r in GFF.collapseGFFReader(gff_filename):
        if r.seqid in seqs or r.seqid.split('|')[0] in seqs or (partial_ok and any(r.seqid.startswith(x) for x in seqs)):
            GFF.write_collapseGFF_format(sys.stdout, r)
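A minimal calling sketch (file names are illustrative); matching records go to stdout:

    get_gff_from_list('all_samples.collapsed.gff', 'wanted_ids.txt', partial_ok=False)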
Example #15
def cleanup_scrubbed_files_redundancy(gff_filename, group_filename,
                                      count_filename, fastq_filename,
                                      output_prefix):

    junction_seen = defaultdict(lambda: defaultdict(lambda: []))  # (chr, strand) --> junction-chain string --> list of records
    for r in GFF.collapseGFFReader(gff_filename):
        n = len(r.ref_exons)
        if n == 1:
            junc_str = str(r.start) + ',' + str(r.end)
            junction_seen[r.chr, r.strand][junc_str] = [r]
        else:
            junc_str = ",".join(
                str(r.ref_exons[i].end) + ',' + str(r.ref_exons[i + 1].start)
                for i in range(n - 1))
            junction_seen[r.chr, r.strand][junc_str].append(r)

    # write out cleaned GFF
    outf = open(output_prefix + '.gff', 'w')
    outf2 = open(output_prefix + '.merged_ids.txt', 'w')
    merged = {}
    keys = list(junction_seen.keys())
    keys.sort()
    for k in keys:
        for bunch in junction_seen[k].values():
            if len(bunch) == 1:  # just one record, write it out
                r = bunch[0]
                GFF.write_collapseGFF_format(outf, r)
                merged[r.seqid] = [r.seqid]
            else:
                # find the representative
                r = bunch[0]
                for r2 in bunch[1:]:
                    if r2.end - r2.start > r.end - r.start:
                        r = r2
                GFF.write_collapseGFF_format(outf, r)
                merged[r.seqid] = [x.seqid for x in bunch]
            outf2.write("{0}\t{1}\n".format(r.seqid,
                                            ",".join(merged[r.seqid])))
    outf.close()
    outf2.close()

    count_d, count_header = read_count_file(count_filename)
    # write out count file
    outf = open(output_prefix + '.abundance.txt', 'w')
    outf.write(count_header)
    writer = DictWriter(outf, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'], \
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for pbid, bunch in merged.items():
        # combine the counts
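        # fields_to_add (the numeric count/norm columns to sum) is assumed to be defined at module level in the original script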
        r = count_d[bunch[0]]
        r['pbid'] = pbid
        for field in fields_to_add:
            r[field] = float(r[field])
        for _id in bunch[1:]:
            for field in fields_to_add:
                r[field] += float(count_d[_id][field])
        writer.writerow(r)
    outf.close()

    group_info = read_group_file(group_filename)
    # write out group file
    outf = open(output_prefix + '.group.txt', 'w')
    for pbid, bunch in merged.items():
        # combine the groups
        g = [group_info[bunch[0]]]
        for _id in bunch[1:]:
            g.append(group_info[_id])
        outf.write("{0}\t{1}\n".format(pbid, ",".join(g)))
    outf.close()

    # write out fastq file if present
    if fastq_filename is not None:
        outf = open(output_prefix + '.rep.fq', 'w')
        for r in SeqIO.parse(open(fastq_filename), 'fastq'):
            if r.id.split('|')[0] in merged or r.id in merged:
                SeqIO.write(r, outf, 'fastq')
        outf.close()

    print(
        "scrubbed files written: {0}.gff, {0}.group.txt, {0}.abundance.txt, {0}.merged_ids.txt"
        .format(output_prefix),
        file=sys.stderr)
Example #16
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("input_prefix",
                        help="Input prefix (ex: filtered.microexon)")
    parser.add_argument(
        "--micro_exon_size",
        type=int,
        default=12,
        help="Filter away microexons < micro_exon_size (default: 12bp)")

    args = parser.parse_args()
    output_prefix = args.input_prefix + '.filtered.microexon'

    count_filename, gff_filename, rep_filename = sanity_check_collapse_input(
        args.input_prefix)

    recs = defaultdict(lambda: [])
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith('PB.')
        recs[int(r.seqid.split('.')[1])].append(r)

    good = []
    f = open(output_prefix + '.gff', 'w')
    keys = sorted(recs.keys())
    for k in keys:
        xxx = recs[k]
        for r in xxx:
            min_exon_size = min(e.end - e.start for e in r.ref_exons)
            if min_exon_size > args.micro_exon_size:  # smallest exon must exceed the --micro_exon_size cutoff
                GFF.write_collapseGFF_format(f, r)
                good.append(r.seqid)

    f.close()

    # read abundance first
    d, count_header = read_count_file(count_filename)

    # write output rep.fq
    f = open(output_prefix + '.rep.fq', 'w')
    for r in SeqIO.parse(open(rep_filename), 'fastq'):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, 'fastq')
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'], \
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print("Output written to:", output_prefix + '.gff', file=sys.stderr)
    print("Output written to:", output_prefix + '.rep.fq', file=sys.stderr)
    print("Output written to:", output_prefix + '.abundance.txt', file=sys.stderr)
Example #17
def filter_by_count(input_prefix,
                    output_prefix,
                    min_count,
                    dun_use_group_count=False):

    group_filename = input_prefix + '.group.txt'
    count_filename = input_prefix + '.abundance.txt'
    gff_filename = input_prefix + '.gff'
    rep_filename = input_prefix + '.rep.fq'

    if not dun_use_group_count:
        # read group
        group_max_count_fl = {}
        group_max_count_p = {}
        f = open(group_filename)
        for line in f:
            #ex: PB.1.1  i0HQ_54b0ca|c58773/f30p16/700
            pbid, members = line.strip().split('\t')
            group_max_count_fl[pbid] = 0
            group_max_count_p[pbid] = 0
            members = members.split(',')
            for m in members:
                i = m.find('|')
                if i > 0:
                    tmp = m.split('|')[1].split('/')[1]  #ex: tmp = f30p16
                else:
                    tmp = m.split('/')[1]
                fl_count, p_count = tmp.split('p')
                fl_count = int(fl_count[1:])
                p_count = int(p_count)
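                # ex: tmp == 'f30p16' --> fl_count = 30 (full-length), p_count = 16 (partial)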
                group_max_count_fl[pbid] = max(group_max_count_fl[pbid],
                                               fl_count)
                group_max_count_p[pbid] = max(group_max_count_p[pbid], p_count)
        f.close()

    # read abundance first
    f = open(count_filename)
    count_header = ''
    while True:
        cur_pos = f.tell()
        line = f.readline()
        if not line.startswith('#'):
            f.seek(cur_pos)
            break
        else:
            count_header += line
    d = dict((r['pbid'], r) for r in DictReader(f, delimiter='\t'))
    for k, v in d.items():
        print(k, v)
    f.close()

    # group_max_count_p NOT used for now
    good = [
        x for x in d if int(d[x]['count_fl']) >= min_count and (
            dun_use_group_count or group_max_count_fl[x] >= min_count)
    ]
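    # keep isoforms with at least min_count FL reads; unless dun_use_group_count is set,
    # also require that at least one member of its group reached min_count FL reads on its own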

    # write output GFF
    f = open(output_prefix + '.gff', 'w')
    for r in GFF.collapseGFFReader(gff_filename):
        if r.seqid in good: GFF.write_collapseGFF_format(f, r)
    f.close()

    # write output rep.fq
    f = open(output_prefix + '.rep.fq', 'w')
    for r in SeqIO.parse(open(rep_filename), 'fastq'):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, 'fastq')
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'], \
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print("Output written to:", output_prefix + '.gff', file=sys.stderr)
    print("Output written to:", output_prefix + '.rep.fq', file=sys.stderr)
    print("Output written to:",
          output_prefix + '.abundance.txt',
          file=sys.stderr)
Example #18
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("input_prefix",
                        help="Input prefix (ex: test.collapsed.min_fl_2)")
    parser.add_argument("--fuzzy_junction",
                        type=int,
                        default=5,
                        help="Fuzzy junction max dist (default: 5bp)")

    args = parser.parse_args()
    output_prefix = args.input_prefix + '.filtered'

    #group_filename = args.input_prefix + '.group.txt'
    count_filename = args.input_prefix + '.abundance.txt'
    gff_filename = args.input_prefix + '.gff'
    rep_filename = args.input_prefix + '.rep.fq'

    if not os.path.exists(count_filename):
        print("File {0} does not exist. Abort!".format(count_filename), file=sys.stderr)
        sys.exit(-1)
    if not os.path.exists(gff_filename):
        print("File {0} does not exist. Abort!".format(gff_filename), file=sys.stderr)
        sys.exit(-1)
    if not os.path.exists(rep_filename):
        print("File {0} does not exist. Abort!".format(rep_filename), file=sys.stderr)
        sys.exit(-1)

    recs = defaultdict(lambda: [])
    reader = GFF.collapseGFFReader(gff_filename)
    for r in reader:
        assert r.seqid.startswith('PB.')
        recs[int(r.seqid.split('.')[1])].append(r)

    good = []
    f = open(output_prefix + '.gff', 'w')
    keys = sorted(recs.keys())
    for k in keys:
        xxx = recs[k]
        filter_out_subsets(xxx, args.fuzzy_junction)
        for r in xxx:
            GFF.write_collapseGFF_format(f, r)
            good.append(r.seqid)
    f.close()

    # read abundance first
    f = open(count_filename)
    count_header = ''
    while True:
        cur_pos = f.tell()
        line = f.readline()
        if not line.startswith('#'):
            f.seek(cur_pos)
            break
        else:
            count_header += line
    d = dict((r['pbid'], r) for r in DictReader(f, delimiter='\t'))
    for k, v in d.items():
        print(k, v)
    f.close()

    # write output rep.fq
    f = open(output_prefix + '.rep.fq', 'w')
    for r in SeqIO.parse(open(rep_filename), 'fastq'):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, 'fastq')
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'], \
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print("Output written to:", output_prefix + '.gff', file=sys.stderr)
    print("Output written to:", output_prefix + '.rep.fq', file=sys.stderr)
    print("Output written to:", output_prefix + '.abundance.txt', file=sys.stderr)
Example #19
def filter_by_count(input_prefix, output_prefix, min_count, dun_use_group_count=False):

    group_filename = input_prefix + '.group.txt'
    count_filename = input_prefix + '.abundance.txt'
    gff_filename = input_prefix + '.gff'
    rep_filename = input_prefix + '.rep.fq'

    if not dun_use_group_count:
        # read group
        group_max_count_fl = {}
        group_max_count_p = {}
        f = open(group_filename)
        for line in f:
            #ex: PB.1.1  i0HQ_54b0ca|c58773/f30p16/700
            pbid, members = line.strip().split('\t')
            group_max_count_fl[pbid] = 0
            group_max_count_p[pbid] = 0
            members = members.split(',')
            for m in members:
                i = m.find('|')
                if i > 0:
                    tmp = m.split('|')[1].split('/')[1] #ex: tmp = f30p16
                else:
                    tmp = m.split('/')[1]
                fl_count, p_count = tmp.split('p')
                fl_count = int(fl_count[1:])
                p_count = int(p_count)
                group_max_count_fl[pbid] = max(group_max_count_fl[pbid], fl_count)
                group_max_count_p[pbid] = max(group_max_count_p[pbid], p_count)
        f.close()

    # read abundance first
    f = open(count_filename)
    count_header = ''
    while True:
        cur_pos = f.tell()
        line = f.readline()
        if not line.startswith('#'):
            f.seek(cur_pos)
            break
        else:
            count_header += line
    d = dict((r['pbid'], r) for r in DictReader(f, delimiter='\t'))
    for k, v in d.items():
        print(k, v)
    f.close()

    # group_max_count_p NOT used for now
    good = [x for x in d if int(d[x]['count_fl']) >= min_count and (dun_use_group_count or group_max_count_fl[x] >= min_count)]

    # write output GFF
    f = open(output_prefix + '.gff', 'w')
    for r in GFF.collapseGFFReader(gff_filename):
        if r.seqid in good: GFF.write_collapseGFF_format(f, r)
    f.close()


    # write output rep.fq
    f = open(output_prefix + '.rep.fq', 'w')
    for r in SeqIO.parse(open(rep_filename), 'fastq'):
        if r.name.split('|')[0] in good:
            SeqIO.write(r, f, 'fastq')
    f.close()

    # write output to .abundance.txt
    f = open(output_prefix + '.abundance.txt', 'w')
    f.write(count_header)
    writer = DictWriter(f, fieldnames=['pbid','count_fl','count_nfl','count_nfl_amb','norm_fl','norm_nfl','norm_nfl_amb'], \
                        delimiter='\t', lineterminator='\n')
    writer.writeheader()
    for k in good:
        r = d[k]
        writer.writerow(r)
    f.close()

    print("Output written to:", output_prefix + '.gff', file=sys.stderr)
    print("Output written to:", output_prefix + '.rep.fq', file=sys.stderr)
    print("Output written to:", output_prefix + '.abundance.txt', file=sys.stderr)