import logging
import multiprocessing
import os

# CategoryStats, merge_sort_gtf_files, and classify_library_transcripts
# are assumed to be defined elsewhere in this package.


def merge_transcripts(results):
    # read library category statistics
    stats_list = list(CategoryStats.from_file(results.category_stats_file))
    # collect the per-library expressed/background GTF files written by
    # the classification step
    library_ids = []
    expressed_gtf_files = []
    background_gtf_files = []
    for statsobj in stats_list:
        library_id = statsobj.library_id
        library_ids.append(library_id)
        prefix = os.path.join(results.classify_dir, library_id)
        expressed_gtf_files.append(prefix + ".expr.gtf")
        background_gtf_files.append(prefix + ".bkgd.gtf")
    # map library IDs to library names (two-column tab-delimited file)
    library_id_map = {}
    with open(results.library_id_map) as map_fileh:
        for line in map_fileh:
            fields = line.strip().split('\t')
            library_id_map[fields[0]] = fields[1]
    # make a classification report
    logging.info("Writing classification report")
    header_fields = [
        "library_id", "library_name", "category", "train.auc", "test.auc",
        "train.cutoff", "train.tp", "train.fp", "train.fn", "train.tn",
        "train.sens", "train.spec", "train.balacc", "test.tp", "test.fp",
        "test.fn", "test.tn", "test.sens", "test.spec", "test.balacc"
    ]
    with open(results.classify_report_file, 'w') as fileh:
        print('\t'.join(header_fields), file=fileh)
        for library_id in library_ids:
            prefix = os.path.join(results.classify_dir, library_id)
            library_name = library_id_map[library_id]
            # append each per-library performance table, prefixing every
            # row with the library and category
            for category in ("intergenic", "intronic"):
                perf_file = prefix + "." + category + ".perf.txt"
                with open(perf_file) as input_fileh:
                    next(input_fileh)  # skip header line
                    for line in input_fileh:
                        fields = ([library_id, library_name, category] +
                                  line.strip().split('\t'))
                        print('\t'.join(fields), file=fileh)
    # add reference gtf file
    expressed_gtf_files.append(results.ref_gtf_file)
    background_gtf_files.append(results.ref_gtf_file)
    # merge sort gtf files
    logging.info("Merging and sorting expressed GTF files")
    merge_sort_gtf_files(expressed_gtf_files,
                         results.expressed_gtf_file,
                         tmp_dir=results.tmp_dir)
    logging.info("Merging and sorting background GTF files")
    merge_sort_gtf_files(background_gtf_files,
                         results.background_gtf_file,
                         tmp_dir=results.tmp_dir)
    return 0
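

# Illustrative only: merge_sort_gtf_files is defined elsewhere in this
# package. A minimal sketch of the merge-and-sort step it performs might
# look like the following, assuming records are ordered by (seqname, start)
# and ignoring the external-sort/tmp_dir handling a real implementation
# would need for large inputs. The name below is hypothetical.
def _sketch_merge_sort_gtf_files(input_files, output_file):
    records = []
    for path in input_files:
        with open(path) as fileh:
            for line in fileh:
                if line.startswith('#'):
                    continue
                fields = line.split('\t')
                # GTF columns: seqname, source, feature, start, end, ...
                records.append((fields[0], int(fields[3]), line))
    records.sort()
    with open(output_file, 'w') as out_fileh:
        for _seqname, _start, line in records:
            out_fileh.write(line)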


def classify_transcripts(results, num_processors):
    # read library category statistics
    stats_list = list(CategoryStats.from_file(results.category_stats_file))
    # build one classification task per library
    tasks = []
    for statsobj in stats_list:
        library_id = statsobj.library_id
        tasks.append((library_id, results.classify_dir))
    # use multiprocessing to parallelize across libraries
    pool = multiprocessing.Pool(processes=num_processors)
    result_iter = pool.imap_unordered(classify_library_transcripts, tasks)
    errors = False
    library_ids = []
    for retcode, library_id in result_iter:
        if retcode == 0:
            library_ids.append(library_id)
        else:
            logging.error("Classification failed for library '%s'", library_id)
            errors = True
    pool.close()
    pool.join()
    if errors:
        logging.error("Errors occurred during classification")
    return int(errors)
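

# Illustrative only: classify_library_transcripts is the real worker defined
# elsewhere in this package. classify_transcripts above relies only on the
# contract sketched here: the worker takes a (library_id, classify_dir) task
# tuple and returns a (retcode, library_id) pair, with retcode 0 on success.
# The name and body below are assumptions, not the package's actual code.
def _sketch_classify_worker(task):
    library_id, classify_dir = task
    try:
        # ... train/apply the classifier for this library here, writing
        # <classify_dir>/<library_id>.expr.gtf, .bkgd.gtf, and the
        # .intergenic/.intronic .perf.txt files consumed by
        # merge_transcripts ...
        return 0, library_id
    except Exception:
        logging.exception("Classification failed for library '%s'", library_id)
        return 1, library_id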
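

# Hypothetical usage: the two steps above are intended to run in order,
# classifying each library in parallel and then merging the per-library
# GTF files. 'results' is assumed to be the pipeline's results object
# carrying the file and directory attributes referenced above.
def _sketch_run_classification(results, num_processors):
    retcode = classify_transcripts(results, num_processors)
    if retcode != 0:
        return retcode
    return merge_transcripts(results)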