def merge_transcripts(results):
    '''merge per-library classification results into a classification
    report plus coordinate-sorted expressed and background GTF files'''
    # read library category statistics
    stats_list = list(CategoryStats.from_file(results.category_stats_file))
    library_ids = []
    expressed_gtf_files = []
    background_gtf_files = []
    for statsobj in stats_list:
        library_id = statsobj.library_id
        library_ids.append(library_id)
        prefix = os.path.join(results.classify_dir, library_id)
        expressed_gtf_files.append(prefix + ".expr.gtf")
        background_gtf_files.append(prefix + ".bkgd.gtf")
    # map library ids to human-readable library names
    library_id_map = {}
    map_fileh = open(results.library_id_map)
    for line in map_fileh:
        fields = line.strip().split('\t')
        library_id_map[fields[0]] = fields[1]
    map_fileh.close()
    # make a classification report
    logging.info("Writing classification report")
    fileh = open(results.classify_report_file, 'w')
    header_fields = [
        "library_id", "library_name", "category", "train.auc", "test.auc",
        "train.cutoff", "train.tp", "train.fp", "train.fn", "train.tn",
        "train.sens", "train.spec", "train.balacc", "test.tp", "test.fp",
        "test.fn", "test.tn", "test.sens", "test.spec", "test.balacc"
    ]
    print >> fileh, '\t'.join(header_fields)
    for library_id in library_ids:
        prefix = os.path.join(results.classify_dir, library_id)
        library_name = library_id_map[library_id]
        # append performance rows for each category, skipping the header
        # line of each input file
        for category in ('intergenic', 'intronic'):
            perf_file = prefix + ".%s.perf.txt" % (category)
            input_fileh = open(perf_file)
            input_fileh.next()  # skip header line
            for line in input_fileh:
                fields = ([library_id, library_name, category] +
                          line.strip().split('\t'))
                print >> fileh, '\t'.join(fields)
            input_fileh.close()
    fileh.close()
    # add reference gtf file
    expressed_gtf_files.append(results.ref_gtf_file)
    background_gtf_files.append(results.ref_gtf_file)
    # merge sort gtf files
    logging.info("Merging and sorting expressed GTF files")
    merge_sort_gtf_files(expressed_gtf_files,
                         results.expressed_gtf_file,
                         tmp_dir=results.tmp_dir)
    logging.info("Merging and sorting background GTF files")
    merge_sort_gtf_files(background_gtf_files,
                         results.background_gtf_file,
                         tmp_dir=results.tmp_dir)
    return 0
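
# merge_sort_gtf_files is imported from the project's GTF utilities and
# its implementation is not part of this module. The sketch below is a
# hypothetical illustration of the assumed behavior (concatenate the
# inputs, then sort by chromosome and start coordinate); the '_sketch'
# name and its body are assumptions, not the project's implementation.
def _merge_sort_gtf_files_sketch(gtf_files, output_file, tmp_dir=None):
    import shutil
    import subprocess
    import tempfile
    # concatenate all input GTF files into one temporary file
    fd, tmp_file = tempfile.mkstemp(suffix='.gtf', dir=tmp_dir)
    os.close(fd)
    tmp_fileh = open(tmp_file, 'w')
    for gtf_file in gtf_files:
        in_fileh = open(gtf_file)
        shutil.copyfileobj(in_fileh, tmp_fileh)
        in_fileh.close()
    tmp_fileh.close()
    # sort by chromosome (field 1), then start coordinate (field 4)
    args = ['sort', '-k1,1', '-k4,4n', tmp_file]
    if tmp_dir is not None:
        args.extend(['-T', tmp_dir])
    out_fileh = open(output_file, 'w')
    retcode = subprocess.call(args, stdout=out_fileh)
    out_fileh.close()
    os.remove(tmp_file)
    return retcode
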
def split_gtf_file(gtf_file,
                   split_dir,
                   ref_gtf_file,
                   category_stats_file,
                   bufsize=(1 << 30)):
    '''split the input GTF by library, route reference features to a
    separate file, and tally per-library category statistics'''
    # each library's features are written to <split_dir>/<library_id>.gtf
    keyfunc = lambda myid: os.path.join(split_dir, "%s.gtf" % (myid))
    bufobj = BufferedFileSplitter(keyfunc, bufsize)
    ref_fileh = open(ref_gtf_file, 'w')
    stats_dict = collections.defaultdict(lambda: CategoryStats())
    logging.info("Splitting transcripts by library")
    for line in open(gtf_file):
        f = GTFFeature.from_string(line)
        is_ref = bool(int(f.attrs[GTFAttr.REF]))
        if is_ref:
            print >> ref_fileh, str(f)
            continue
        library_id = f.attrs[GTFAttr.LIBRARY_ID]
        # keep statistics
        if f.feature_type == 'transcript':
            is_test = bool(int(f.attrs[GTFAttr.TEST]))
            if is_test:
                category = Category.SAME_STRAND
            else:
                category = int(f.attrs[GTFAttr.CATEGORY])
            score = float(f.attrs[GTFAttr.SCORE])
            statsobj = stats_dict[library_id]
            statsobj.library_id = library_id
            statsobj.counts[category] += 1
            statsobj.signal[category] += score
        # write features from each library to separate files
        bufobj.write(library_id, line)
    # close open file handles
    ref_fileh.close()
    bufobj.close()
    logging.debug("Buffer flushes: %d" % (bufobj.flushes))
    # write library category statistics
    logging.info("Writing category statistics")
    fh = open(category_stats_file, "w")
    print >> fh, '\t'.join(CategoryStats.header_fields())
    for statsobj in stats_dict.itervalues():
        fields = statsobj.to_fields()
        print >> fh, '\t'.join(map(str, fields))
    fh.close()
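
# BufferedFileSplitter is imported from elsewhere in the package. The
# class below is a hypothetical minimal sketch of the interface this
# module relies on: write() buffers lines per key in memory, and the
# buffered lines are appended to the file named by keyfunc(key) once the
# total buffered size exceeds bufsize. Only the methods and the
# 'flushes' attribute used above are sketched; the real implementation
# may differ.
class _BufferedFileSplitterSketch(object):
    def __init__(self, keyfunc, bufsize):
        self.keyfunc = keyfunc
        self.bufsize = bufsize
        self.buffers = collections.defaultdict(list)
        self.cursize = 0
        self.flushes = 0

    def _flush(self):
        # append each key's buffered lines to its own output file
        for key, lines in self.buffers.items():
            fileh = open(self.keyfunc(key), 'a')
            fileh.writelines(lines)
            fileh.close()
        self.buffers.clear()
        self.cursize = 0
        self.flushes += 1

    def write(self, key, line):
        self.buffers[key].append(line)
        self.cursize += len(line)
        if self.cursize >= self.bufsize:
            self._flush()

    def close(self):
        # flush any remaining buffered lines
        if self.cursize > 0:
            self._flush()
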
def classify_transcripts(results, num_processors):
    '''run per-library classification in parallel; returns 1 if any
    library failed and 0 otherwise'''
    # read library category statistics
    stats_list = list(CategoryStats.from_file(results.category_stats_file))
    # get tasks
    tasks = []
    for statsobj in stats_list:
        library_id = statsobj.library_id
        tasks.append((library_id, results.classify_dir))
    # use multiprocessing to parallelize
    pool = multiprocessing.Pool(processes=num_processors)
    result_iter = pool.imap_unordered(classify_library_transcripts, tasks)
    errors = False
    library_ids = []
    for retcode, library_id in result_iter:
        if retcode == 0:
            library_ids.append(library_id)
        else:
            logging.error("Classification of library '%s' failed" %
                          (library_id))
            errors = True
    pool.close()
    pool.join()
    if errors:
        logging.error("Errors occurred during classification")
    return int(errors)
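
# classify_library_transcripts is the worker invoked by the pool above;
# its body is defined elsewhere in the module. The stub below is a
# hypothetical sketch of the calling convention that imap_unordered
# relies on: a single (library_id, classify_dir) tuple in, a
# (retcode, library_id) pair out, so the parent loop can attribute
# failures to specific libraries.
def _classify_library_transcripts_sketch(args):
    library_id, classify_dir = args
    try:
        # ... train and apply the classifier for this library ...
        retcode = 0
    except Exception:
        logging.exception("Library '%s' raised an error" % (library_id))
        retcode = 1
    return retcode, library_id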