def split_gtf_by_category(infiles, outfiles, catname):
    """Split a GTF file into one gzipped GTF per category.

    `infiles` is a pair (catfile, gtffile), where `catfile` is a
    tab-separated table mapping transcript or gene ids to category
    names. GTF lines whose transcript_id or gene_id is not in the
    mapping are skipped. Output files are named after `catname`;
    `outfiles` is unused (pipeline-task style signature).
    """

    catfile, gtffile = infiles
    categories = pd.read_csv(catfile, index_col=0, sep="\t").squeeze("columns")

    # create output filepool
    outpool = iotools.FilePool("{}_%s.gtf.gz".format(catname), force=True)

    gtffile = iotools.open_file(gtffile)

    for gtfline in gtf.iterator(gtffile):

        # attribute access raises AttributeError when a GTF line lacks the
        # field (e.g. gene records have no transcript_id)
        transcript_id = getattr(gtfline, "transcript_id", None)
        gene_id = getattr(gtfline, "gene_id", None)

        if transcript_id in categories.index:
            outpool.write(categories[transcript_id], str(gtfline) + "\n")
        elif gene_id in categories.index:
            outpool.write(categories[gene_id], str(gtfline) + "\n")

    outpool.close()
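
# A minimal usage sketch (filenames and category labels are hypothetical;
# assumes `catfile` is a two-column tab-separated table mapping transcript
# or gene ids to a category label):
#
#   split_gtf_by_category(("categories.tsv", "ensembl.gtf.gz"),
#                         None, "biotype")
#
# With categories "coding" and "lncRNA" this would produce
# biotype_coding.gtf.gz and biotype_lncRNA.gtf.gz.
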
def chunk_iterator_column(infile, args, prefix, use_header=False):
    """split at column.

    The table need not be sorted by this column.
    If num_files is given, files will randomly created
    and tags according to column randomly assigned.


    """

    column, max_files = args
    files = iotools.FilePool()
    header = False

    if max_files:
        map_tag2file = {}

    for line in infile:
        if line[0] == "#":
            continue

        if not header and use_header:
            files.setHeader(line)
            header = True
            continue

        key = line[:-1].split("\t")[column]
        if max_files:
            # cap the number of output files: each previously unseen key is
            # mapped to one of at most max_files zero-padded tags, assigned
            # in round-robin order
            if key in map_tag2file:
                key = map_tag2file[key]
            else:
                n = "%010i" % (len(map_tag2file) % max_files)
                map_tag2file[key] = n
                key = n

        files.write("%s/%s.in" % (prefix, key), line)

    # close the pool so that all chunks are flushed to disk before they
    # are consumed downstream
    files.close()

    for filename, count in list(files.items()):
        E.info("created file %s with %i items" % (filename, count))
        yield filename
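
# A minimal usage sketch (hypothetical filenames; assumes a tab-separated
# table whose third column, index 2, carries the grouping key, and that
# the output directory `chunks` already exists):
#
#   with iotools.open_file("table.tsv") as infile:
#       for chunk in chunk_iterator_column(infile, args=(2, None),
#                                          prefix="chunks", use_header=True):
#           print("created", chunk)   # chunks/<key>.in
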
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-o",
                        "--min-overlap",
                        dest="min_overlap",
                        type=int,
                        help="minimum overlap")

    parser.add_argument(
        "-w",
        "--pattern-window",
        dest="pattern_window",
        type=str,
        help="regular expression to extract window coordinates from "
        "test id ")

    parser.add_argument("-i",
                        "--invert",
                        dest="invert",
                        action="store_true",
                        help="invert direction of fold change ")

    parser.set_defaults(min_overlap=10,
                        invert=False,
                        pattern_window=r"(\S+):(\d+)-(\d+)")

    # add common options (-h/--help, ...) and parse command line
    args = E.start(parser, argv=argv, add_output_options=True)

    outfiles = iotools.FilePool(args.output_filename_pattern)

    if args.invert:
        test_f = lambda l2fold: l2fold < 0
    else:
        test_f = lambda l2fold: l2fold > 0

    def read():

        rx_window = re.compile(args.pattern_window)
        # filter out any DESeq/EdgeR messages that end up at the top of
        # the output file

        for data in iotools.iterate(args.stdin):

            contig, start, end = rx_window.match(data.test_id).groups()
            start, end = list(map(int, (start, end)))

            yield DATA._make(
                (data.test_id, contig, start, end, data.treatment_name,
                 float(data.treatment_mean),
                 float(data.treatment_std), data.control_name,
                 float(data.control_mean), float(data.control_std),
                 float(data.pvalue), float(data.qvalue), float(data.l2fold),
                 float(data.fold), int(data.significant), data.status, 0))

    def grouper(data, distance=10):

        last = next(data, None)
        if last is None:
            return
        entries = [last]

        while True:
            # next(data, None) avoids StopIteration leaking out of this
            # generator (a RuntimeError under PEP 479)
            d = next(data, None)
            if d is None:
                break
            if d.contig == last.contig and d.start < last.start:
                raise ValueError("input is not sorted by start coordinate")

            # start a new group on a change of contig, a gap larger than
            # `distance`, a change of status or significance, or a sign
            # flip of the fold change
            if ((d.contig != last.contig) or (d.start - last.end > distance)
                    or (d.status != last.status)
                    or (d.significant != last.significant)
                    or (d.l2fold * last.l2fold < 0)):
                yield entries
                entries = []

            entries.append(d)
            last = d

        yield entries
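
    # With distance=10, for example, the sorted windows
    #   chr1:100-200, chr1:205-300, chr1:500-600   (same sign/status)
    # form two groups: [100-200, 205-300] (gap of 5 <= 10) and [500-600]
    # (gap of 200 > 10).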

    counter = E.Counter()

    args.stdout.write("\t".join(DATA._fields) + "\n")

    # set of all sample names - used to create empty files
    samples = set()

    # need to sort by coordinate
    all_data = list(read())
    all_data.sort(key=lambda x: (x.contig, x.start))

    group_id = 0

    for group in grouper(iter(all_data), distance=args.min_overlap):
        group_id += 1

        start, end = group[0].start, group[-1].end
        assert start < end, 'start >= end: %s' % str(group)
        n = float(len(group))
        counter.input += n

        g = group[0]

        # take the most conservative fold change in the group: the value
        # closest to zero (closest to 1 for the linear fold) given the
        # sign of the first window
        if g.l2fold < 0:
            l2fold = max([x.l2fold for x in group])
            fold = max([x.fold for x in group])
        else:
            l2fold = min([x.l2fold for x in group])
            fold = min([x.fold for x in group])

        outdata = DATA._make(
            (str(group_id), g.contig, start, end, g.treatment_name,
             sum([x.treatment_mean for x in group]) / n,
             max([x.treatment_std for x in group]), g.control_name,
             sum([x.control_mean
                  for x in group]) / n, max([x.control_std for x in group]),
             max([x.pvalue for x in group]), max([x.qvalue for x in group]),
             l2fold, fold, g.significant, g.status, int(n)))

        samples.add(g.treatment_name)
        samples.add(g.control_name)
        if g.significant:
            if test_f(g.l2fold):
                # treatment lower methylation than control
                outfiles.write(
                    g.treatment_name, "%s\t%i\t%i\t%i\t%f\n" %
                    (g.contig, g.start, g.end, group_id,
                     sum([x.treatment_mean for x in group]) / n))

            else:
                outfiles.write(
                    g.control_name, "%s\t%i\t%i\t%i\t%f\n" %
                    (g.contig, g.start, g.end, group_id,
                     sum([x.control_mean for x in group]) / n))

        args.stdout.write("\t".join(map(str, outdata)) + "\n")

        counter.output += 1

    # create empty files
    for sample in samples:
        outfiles.write(sample, "")

    outfiles.close()
    E.info("%s" % counter)

    # write footer and output benchmark information.
    E.stop()
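
# main() above assumes a module-level DATA namedtuple. A plausible
# reconstruction from the fields referenced in read() and the output code
# (only the name of the final count field is guessed; the others are read
# off the attribute accesses):
#
#   import collections
#
#   DATA = collections.namedtuple(
#       "DATA",
#       ("test_id", "contig", "start", "end",
#        "treatment_name", "treatment_mean", "treatment_std",
#        "control_name", "control_mean", "control_std",
#        "pvalue", "qvalue", "l2fold", "fold",
#        "significant", "status", "nwindows"))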