Пример #1
0
def gtf_add_isoform(gtf, iso, out=sys.stdout, quiet=False):
    """Append an ``isoform_id`` attribute to matching GTF records.

    The isoform table is read first: every line that does not start with
    ``#`` is split on tabs, and its second column becomes the lookup key
    for the value in its first column.  Each GTF record whose
    ``transcript_id`` is one of those keys gets ``isoform_id "<value>";``
    appended to its attribute column.  Other records are written through
    unchanged apart from a terminating ``;`` being added when missing.

    Args:
        gtf: GTF input, handed to ``gzip_reader``.
        iso: Isoform table, handed to ``gzip_reader``.
        out: Writable text stream receiving the annotated GTF.
        quiet: When true, suppress the progress messages on stderr.

    Any error raised while handling a GTF line is reported on stderr with
    a traceback and the process exits with status 1.
    """
    isoforms = {}

    if not quiet:
        sys.stderr.write('Reading isoforms...\n')

    for line in gzip_reader(iso):
        if line.startswith('#'):
            continue
        cols = line.rstrip().split('\t')
        isoforms[cols[1]] = cols[0]

    if not quiet:
        sys.stderr.write('Reading/Writing GTF...\n')

    for line in gzip_reader(gtf):
        try:
            comment = None
            idx = line.find('#')
            if idx == 0:
                # Whole-line comment: pass it through to the same stream
                # as the records so headers are not lost when `out` is a
                # file rather than stdout.
                out.write(line)
                continue
            if idx > 0:
                # Trailing comment: keep it (without its line ending, the
                # newline is written separately below) and cut the record
                # at the '#', not `idx` characters from the end.
                comment = line[idx:].rstrip()
                line = line[:idx]

            (chrom, source, feature, start, end, score, strand, frame,
             attrs) = line.rstrip().split('\t')

            transcript_id = None
            for attr in attrs.split(';'):
                attr = attr.strip()
                if not attr:
                    continue
                # Split on the first space only so that quoted values
                # containing spaces do not break the key/value unpacking.
                key, _, val = attr.partition(' ')
                val = val.strip()
                if val.startswith('"') and val.endswith('"'):
                    val = val[1:-1]
                if key == 'transcript_id':
                    transcript_id = val

            if attrs[-1] != ';':
                attrs = '%s;' % attrs

            if transcript_id in isoforms:
                attrs = '%s isoform_id "%s";' % (attrs,
                                                 isoforms[transcript_id])

            out.write('\t'.join([
                chrom, source, feature, start, end, score, strand, frame, attrs
            ]))
            if comment:
                out.write('\t%s' % comment)
            out.write('\n')
        except Exception:
            # Only real errors are reported as parse failures;
            # KeyboardInterrupt / SystemExit propagate untouched.
            import traceback
            sys.stderr.write('Error parsing line:\n%s\n' % line)
            traceback.print_exc()
            sys.exit(1)
Пример #2
0
def gtf_addreflink(gtf, reflink, out=sys.stdout, quiet=False, replace=False):
    """Annotate GTF records with values looked up in a refLink table.

    The refLink table is read first: every line is split on tabs and its
    third column becomes the lookup key for the pair
    ``(first column, seventh column)``.  For each GTF record whose
    ``transcript_id`` is one of those keys:

    * with ``replace`` false, ``gene_name "<first>"; isoform_id
      "<seventh>";`` is appended to the existing attributes;
    * with ``replace`` true, the attribute column is rebuilt as
      ``gene_id "<seventh>"; transcript_id "<id>"; gene_name "<first>";
      orig_gene_id "<previous gene_id>";``.

    Records without a match are written through unchanged apart from a
    terminating ``;`` being added when missing.

    Args:
        gtf: GTF input, handed to ``gzip_reader``.
        reflink: refLink table, handed to ``gzip_reader``.
        out: Writable text stream receiving the annotated GTF.
        quiet: When true, suppress the progress messages on stderr.
        replace: Rebuild the attribute column instead of appending to it.

    Any error raised while handling a GTF line is reported on stderr with
    a traceback and the process exits with status 1.
    """
    link_values = {}

    if not quiet:
        sys.stderr.write('Reading refLink...\n')

    for line in gzip_reader(reflink):
        cols = line.rstrip().split('\t')
        link_values[cols[2]] = (cols[0], cols[6])

    if not quiet:
        sys.stderr.write('Reading GTF...\n')

    for line in gzip_reader(gtf):
        try:
            comment = None
            idx = line.find('#')
            if idx == 0:
                # Whole-line comment: send it to `out` like every record,
                # not unconditionally to stdout.
                out.write(line)
                continue
            if idx > 0:
                # Trailing comment: keep it without its line ending (the
                # newline is written separately below) and cut the record
                # at the '#', not `idx` characters from the end.
                comment = line[idx:].rstrip()
                line = line[:idx]

            (chrom, source, feature, start, end, score, strand, frame,
             attrs) = line.rstrip().split('\t')

            transcript_id = None
            gene_id = None
            for attr in attrs.split(';'):
                attr = attr.strip()
                if not attr:
                    continue
                # Split on the first space only so quoted values that
                # contain spaces do not break the key/value unpacking.
                key, _, val = attr.partition(' ')
                val = val.strip()
                if val.startswith('"') and val.endswith('"'):
                    val = val[1:-1]
                if key == 'transcript_id':
                    transcript_id = val
                elif key == 'gene_id':
                    gene_id = val

            if attrs[-1] != ';':
                attrs = '%s;' % attrs

            if transcript_id in link_values:
                first_col, seventh_col = link_values[transcript_id]
                if replace:
                    attrs = ('gene_id "%s"; transcript_id "%s"; '
                             'gene_name "%s"; orig_gene_id "%s";' % (
                                 seventh_col, transcript_id, first_col,
                                 gene_id))
                else:
                    extra = 'gene_name "%s"; isoform_id "%s";' % (
                        first_col, seventh_col)
                    attrs = '%s %s' % (attrs, extra)

            out.write('\t'.join([chrom, source, feature, start, end, score,
                                 strand, frame, attrs]))
            if comment:
                out.write('\t%s' % comment)
            out.write('\n')
        except Exception:
            # Only real errors are reported as parse failures;
            # KeyboardInterrupt / SystemExit propagate untouched.
            import traceback
            sys.stderr.write('Error parsing line:\n%s\n' % line)
            traceback.print_exc()
            sys.exit(1)
Пример #3
0
def gtf_add_isoform(gtf, iso, out=sys.stdout, quiet=False):
    """Append an ``isoform_id`` attribute to matching GTF records.

    The isoform table is read first: every line that does not start with
    ``#`` is split on tabs, and its second column becomes the lookup key
    for the value in its first column.  Each GTF record whose
    ``transcript_id`` is one of those keys gets ``isoform_id "<value>";``
    appended to its attribute column.  Other records are written through
    unchanged apart from a terminating ``;`` being added when missing.

    Args:
        gtf: GTF input, handed to ``gzip_reader``.
        iso: Isoform table, handed to ``gzip_reader``.
        out: Writable text stream receiving the annotated GTF.
        quiet: When true, suppress the progress messages on stderr.

    Any error raised while handling a GTF line is reported on stderr with
    a traceback and the process exits with status 1.
    """
    isoforms = {}

    if not quiet:
        sys.stderr.write('Reading isoforms...\n')

    for line in gzip_reader(iso):
        if line.startswith('#'):
            continue
        cols = line.rstrip().split('\t')
        isoforms[cols[1]] = cols[0]

    if not quiet:
        sys.stderr.write('Reading/Writing GTF...\n')

    for line in gzip_reader(gtf):
        try:
            comment = None
            idx = line.find('#')
            if idx == 0:
                # Whole-line comment: send it to `out` like every record,
                # not unconditionally to stdout.
                out.write(line)
                continue
            if idx > 0:
                # Trailing comment: keep it without its line ending (the
                # newline is written separately below) and cut the record
                # at the '#', not `idx` characters from the end.
                comment = line[idx:].rstrip()
                line = line[:idx]

            (chrom, source, feature, start, end, score, strand, frame,
             attrs) = line.rstrip().split('\t')

            transcript_id = None
            for attr in attrs.split(';'):
                attr = attr.strip()
                if not attr:
                    continue
                # Split on the first space only so quoted values that
                # contain spaces do not break the key/value unpacking.
                key, _, val = attr.partition(' ')
                val = val.strip()
                if val.startswith('"') and val.endswith('"'):
                    val = val[1:-1]
                if key == 'transcript_id':
                    transcript_id = val

            if attrs[-1] != ';':
                attrs = '%s;' % attrs

            if transcript_id in isoforms:
                attrs = '%s isoform_id "%s";' % (attrs, isoforms[transcript_id])

            out.write('\t'.join([chrom, source, feature, start, end, score,
                                 strand, frame, attrs]))
            if comment:
                out.write('\t%s' % comment)
            out.write('\n')
        except Exception:
            # Only real errors are reported as parse failures;
            # KeyboardInterrupt / SystemExit propagate untouched.
            import traceback
            sys.stderr.write('Error parsing line:\n%s\n' % line)
            traceback.print_exc()
            sys.exit(1)
Пример #4
0
def bedgraph_clean(bedgraph, chrom_sizes, out=sys.stdout):
    """Drop or clip bedGraph records that fall outside the reference sizes.

    ``chrom_sizes`` is a tab-delimited file whose first two columns are a
    reference name and its integer length.  The first line yielded for
    ``bedgraph`` is copied to ``out`` untouched.  Every later record is:

    * skipped when its reference is not in ``chrom_sizes``;
    * skipped when its start is at or past the reference length;
    * written with its end column replaced by the reference length when
      the end runs past it;
    * written as-is otherwise.

    Args:
        bedgraph: bedGraph input, handed to ``gzip_reader``.
        chrom_sizes: Path of the tab-delimited sizes file.
        out: Writable text stream receiving the cleaned records.
    """
    with open(chrom_sizes) as handle:
        sizes = {
            parts[0]: int(parts[1])
            for parts in (entry.strip().split("\t") for entry in handle)
        }

    # Most recently parsed region; the reader's callback closes over this
    # name, so it has to be rebound (not shadowed) inside the loop.
    position = ""

    for lineno, raw in enumerate(gzip_reader(bedgraph, callback=lambda: position)):
        if lineno == 0:
            # The first line is treated as the header and copied verbatim.
            out.write(raw)
            continue

        fields = raw.strip().split("\t")
        chrom = fields[0]
        begin = int(fields[1])
        stop = int(fields[2])

        position = "%s:%s-%s" % (chrom, begin, stop)

        if chrom not in sizes:
            continue

        limit = sizes[chrom]
        if begin >= limit:
            # Starts beyond the end of the reference: nothing to keep.
            continue
        if stop > limit:
            # Runs off the end of the reference: clip to its length.
            fields[2] = limit

        out.write("\t".join(str(value) for value in fields) + "\n")
Пример #5
0
def gtf_add_xref(gtf, xref, column=4, out=sys.stdout, quiet=False):
    """Append a ``gene_name`` attribute to matching GTF records.

    The xref table is read first: every line that does not start with
    ``#`` is split on tabs, and its first column becomes the lookup key
    for the value at zero-based index ``column``.  Each GTF record whose
    ``transcript_id`` is one of those keys gets ``gene_name "<value>";``
    appended to its attribute column.  Other records are written through
    unchanged apart from a terminating ``;`` being added when missing.

    Args:
        gtf: GTF input, handed to ``gzip_reader``.
        xref: Cross-reference table, handed to ``gzip_reader``.
        column: Zero-based index of the xref column holding the gene name.
        out: Writable text stream receiving the annotated GTF.
        quiet: When true, suppress the progress messages on stderr.

    Any error raised while handling a GTF line is reported on stderr with
    a traceback and the process exits with status 1.
    """
    gene_names = {}

    if not quiet:
        sys.stderr.write("Reading xref...\n")
    for line in gzip_reader(xref):
        if line.startswith("#"):
            continue
        cols = line.rstrip().split("\t")
        gene_names[cols[0]] = cols[column]

    if not quiet:
        sys.stderr.write("Reading/writing GTF...\n")
    for line in gzip_reader(gtf):
        try:
            comment = None
            idx = line.find("#")
            if idx == 0:
                # Whole-line comment: send it to `out` like every record,
                # not unconditionally to stdout.
                out.write(line)
                continue
            if idx > 0:
                # Trailing comment: keep it without its line ending (the
                # newline is written separately below) and cut the record
                # at the '#', not `idx` characters from the end.
                comment = line[idx:].rstrip()
                line = line[:idx]

            (chrom, source, feature, start, end, score, strand, frame,
             attrs) = line.rstrip().split("\t")

            transcript_id = None
            for attr in attrs.split(";"):
                attr = attr.strip()
                if not attr:
                    continue
                # Split on the first space only so quoted values that
                # contain spaces do not break the key/value unpacking.
                key, _, val = attr.partition(" ")
                val = val.strip()
                if val.startswith('"') and val.endswith('"'):
                    val = val[1:-1]
                if key == "transcript_id":
                    transcript_id = val

            if attrs[-1] != ";":
                attrs = "%s;" % attrs

            if transcript_id in gene_names:
                attrs = '%s gene_name "%s";' % (attrs, gene_names[transcript_id])

            out.write("\t".join([chrom, source, feature, start, end, score,
                                 strand, frame, attrs]))
            if comment:
                out.write("\t%s" % comment)
            out.write("\n")
        except Exception:
            # Only real errors are reported as parse failures;
            # KeyboardInterrupt / SystemExit propagate untouched.
            import traceback

            sys.stderr.write("Error parsing line:\n%s\n" % line)
            traceback.print_exc()
            sys.exit(1)
Пример #6
0
def qseq_reader(fname=None, fileobj=None, quiet=False):
    """Yield one ``QseqRecord`` per line of a qseq source.

    Lines come from ``fileobj`` when it is truthy, otherwise from
    ``gzip_reader(fname, quiet=quiet)``.  Each line is stripped, split on
    tabs, and its first eleven fields are passed positionally to
    ``QseqRecord``.

    Raises:
        ValueError: When neither ``fileobj`` nor ``fname`` is given.
            Because this is a generator, the error surfaces on the first
            iteration rather than at call time.
    """
    if fileobj:
        lines = fileobj
    elif fname:
        lines = gzip_reader(fname, quiet=quiet)
    else:
        raise ValueError('Must pass fname or fileobj!')

    for raw in lines:
        fields = raw.strip().split('\t')
        yield QseqRecord(*fields[:11])
Пример #7
0
def gtf_filter(fname, filters, out=sys.stdout):
    """Write the GTF lines accepted by every filter.

    Each line from ``gzip_reader(fname)`` is stripped of newlines and split
    on tabs.  The column list is handed to ``filt.process`` for each filter
    in order, stopping at the first falsy result.  Lines that pass every
    filter are re-joined with tabs and written to ``out``.

    Args:
        fname: GTF input, handed to ``gzip_reader``.
        filters: Iterable of objects exposing ``process(cols)``.
        out: Writable text stream receiving the accepted lines.
    """
    for raw in gzip_reader(fname):
        fields = raw.strip('\n').split('\t')
        # all() stops at the first rejecting filter, like an explicit break.
        if all(filt.process(fields) for filt in filters):
            out.write('\t'.join(str(value) for value in fields) + '\n')
Пример #8
0
def gtf_filter(fname, filters, out=sys.stdout):
    """Run GTF lines through a chain of transforming filters.

    Each line from ``gzip_reader(fname)`` is stripped of newlines and split
    on tabs.  Every filter's ``process`` receives the current column list
    and its return value replaces that list for the next filter.  A falsy
    return value drops the line; otherwise the final columns are re-joined
    with tabs and written to ``out``.

    Args:
        fname: GTF input, handed to ``gzip_reader``.
        filters: Iterable of objects exposing ``process(cols)``.
        out: Writable text stream receiving the surviving lines.
    """
    for raw in gzip_reader(fname):
        fields = raw.strip('\n').split('\t')
        for filt in filters:
            fields = filt.process(fields)
            if not fields:
                break
        else:
            # Reached only when no filter rejected the line.
            out.write('\t'.join(str(value) for value in fields) + '\n')
Пример #9
0
def gtf_remove_dup(fname, out=sys.stdout, quiet=False):
    """Copy a GTF file, dropping records whose transcript id contains ``_dup``.

    Lines starting with ``#`` are copied through and not counted.  Every
    other line must have nine tab-separated columns; its ``transcript_id``
    attribute is extracted and the line is dropped when that id contains
    the substring ``_dup``.  Records that carry no ``transcript_id`` cannot
    be duplicates and are kept.

    Args:
        fname: GTF input, handed to ``gzip_reader``.
        out: Writable text stream receiving the retained lines.
        quiet: When true, suppress progress and summary output on stderr.

    Returns:
        A ``(good_count, dup_count)`` tuple: records written and records
        removed.

    Any error raised while handling a line is reported on stderr with a
    traceback and the process exits with status 1.
    """
    if not quiet:
        sys.stderr.write('Reading GTF...\n')
    dup_count = 0
    good_count = 0
    for line in gzip_reader(fname, quiet=quiet):
        try:
            if line.startswith('#'):
                out.write(line)
                continue

            # Unpacking doubles as validation of the nine-column layout;
            # only the attribute column is used afterwards.
            (chrom, source, feature, start, end, score, strand, frame,
             attrs) = line.rstrip().split('\t')

            transcript_id = None
            for attr in attrs.split(';'):
                attr = attr.strip()
                if not attr:
                    continue
                # Split on the first space only so quoted values that
                # contain spaces do not break the key/value unpacking.
                key, _, val = attr.partition(' ')
                val = val.strip()
                if val.startswith('"') and val.endswith('"'):
                    val = val[1:-1]
                if key == 'transcript_id':
                    transcript_id = val

            # Guard against records without a transcript_id: testing
            # `'_dup' in None` would raise and abort the whole run.
            if transcript_id is not None and '_dup' in transcript_id:
                dup_count += 1
                continue

            good_count += 1
            out.write(line)

        except Exception:
            # Only real errors are reported as parse failures;
            # KeyboardInterrupt / SystemExit propagate untouched.
            import traceback
            sys.stderr.write('Error parsing line:\n%s\n' % line)
            traceback.print_exc()
            sys.exit(1)

    if not quiet:
        sys.stderr.write('Kept %s transcript/exon annotations\n' % good_count)
        sys.stderr.write('Removed %s duplicate transcript/exon annotations\n' % dup_count)

    return (good_count, dup_count)
Пример #10
0
def gtf_addreflink(gtf, reflink, out=sys.stdout, quiet=False, replace=False):
    """Annotate GTF records with values looked up in a refLink table.

    The refLink table is read first: every line is split on tabs and its
    third column becomes the lookup key for the pair
    ``(first column, seventh column)``.  For each GTF record whose
    ``transcript_id`` is one of those keys:

    * with ``replace`` false, ``gene_name "<first>"; isoform_id
      "<seventh>";`` is appended to the existing attributes;
    * with ``replace`` true, the attribute column is rebuilt as
      ``gene_id "<seventh>"; transcript_id "<id>"; gene_name "<first>";
      orig_gene_id "<previous gene_id>";``.

    Records without a match are written through unchanged apart from a
    terminating ``;`` being added when missing.

    Args:
        gtf: GTF input, handed to ``gzip_reader``.
        reflink: refLink table, handed to ``gzip_reader``.
        out: Writable text stream receiving the annotated GTF.
        quiet: When true, suppress the progress messages on stderr.
        replace: Rebuild the attribute column instead of appending to it.

    Any error raised while handling a GTF line is reported on stderr with
    a traceback and the process exits with status 1.
    """
    link_values = {}

    if not quiet:
        sys.stderr.write('Reading refLink...\n')

    for line in gzip_reader(reflink):
        cols = line.rstrip().split('\t')
        link_values[cols[2]] = (cols[0], cols[6])

    if not quiet:
        sys.stderr.write('Reading GTF...\n')

    for line in gzip_reader(gtf):
        try:
            comment = None
            idx = line.find('#')
            if idx == 0:
                # Whole-line comment: send it to `out` like every record,
                # not unconditionally to stdout.
                out.write(line)
                continue
            if idx > 0:
                # Trailing comment: keep it without its line ending (the
                # newline is written separately below) and cut the record
                # at the '#', not `idx` characters from the end.
                comment = line[idx:].rstrip()
                line = line[:idx]

            (chrom, source, feature, start, end, score, strand, frame,
             attrs) = line.rstrip().split('\t')

            transcript_id = None
            gene_id = None
            for attr in attrs.split(';'):
                attr = attr.strip()
                if not attr:
                    continue
                # Split on the first space only so quoted values that
                # contain spaces do not break the key/value unpacking.
                key, _, val = attr.partition(' ')
                val = val.strip()
                if val.startswith('"') and val.endswith('"'):
                    val = val[1:-1]
                if key == 'transcript_id':
                    transcript_id = val
                elif key == 'gene_id':
                    gene_id = val

            if attrs[-1] != ';':
                attrs = '%s;' % attrs

            if transcript_id in link_values:
                first_col, seventh_col = link_values[transcript_id]
                if replace:
                    attrs = 'gene_id "%s"; transcript_id "%s"; gene_name "%s"; orig_gene_id "%s";' % (
                        seventh_col, transcript_id, first_col, gene_id)
                else:
                    extra = 'gene_name "%s"; isoform_id "%s";' % (
                        first_col, seventh_col)
                    attrs = '%s %s' % (attrs, extra)

            out.write('\t'.join([
                chrom, source, feature, start, end, score, strand, frame, attrs
            ]))
            if comment:
                out.write('\t%s' % comment)
            out.write('\n')
        except Exception:
            # Only real errors are reported as parse failures;
            # KeyboardInterrupt / SystemExit propagate untouched.
            import traceback
            sys.stderr.write('Error parsing line:\n%s\n' % line)
            traceback.print_exc()
            sys.exit(1)