def merge_transcripts(results):
    """Merge per-library classification results into final outputs.

    Reads per-library category statistics, writes a tab-delimited
    classification report combining the intergenic and intronic
    performance files of every library, then merge-sorts the expressed
    and background GTF files (plus the reference GTF) into the final
    output GTF files.

    results: run configuration object providing the file/dir attributes
        used below (category_stats_file, classify_dir, library_id_map,
        classify_report_file, ref_gtf_file, expressed_gtf_file,
        background_gtf_file, tmp_dir).

    Returns 0 on success.
    """
    # read library category statistics
    stats_list = list(CategoryStats.from_file(results.category_stats_file))
    library_ids = []
    expressed_gtf_files = []
    background_gtf_files = []
    for statsobj in stats_list:
        library_id = statsobj.library_id
        library_ids.append(library_id)
        prefix = os.path.join(results.classify_dir, library_id)
        expressed_gtf_files.append(prefix + ".expr.gtf")
        background_gtf_files.append(prefix + ".bkgd.gtf")
    # map library_id -> library_name from a two-column tab-delimited file
    # (with-block fixes a leak: the original never closed this file)
    library_id_map = {}
    with open(results.library_id_map) as map_fileh:
        for line in map_fileh:
            fields = line.strip().split('\t')
            library_id_map[fields[0]] = fields[1]
    # make a classification report
    logging.info("Writing classification report")
    header_fields = ["library_id", "library_name", "category",
                     "train.auc", "test.auc", "train.cutoff",
                     "train.tp", "train.fp", "train.fn", "train.tn",
                     "train.sens", "train.spec", "train.balacc",
                     "test.tp", "test.fp", "test.fn", "test.tn",
                     "test.sens", "test.spec", "test.balacc"]
    # fileh.write(... + '\n') replaces Py2-only 'print >>' (same bytes
    # written); with-blocks guarantee the handles close on error paths
    with open(results.classify_report_file, 'w') as fileh:
        fileh.write('\t'.join(header_fields) + '\n')
        for library_id in library_ids:
            prefix = os.path.join(results.classify_dir, library_id)
            library_name = library_id_map[library_id]
            # the intergenic and intronic performance files share one
            # format; handle both categories with a single loop instead
            # of the original duplicated copy-paste blocks
            for category in ("intergenic", "intronic"):
                perf_file = prefix + ".%s.perf.txt" % category
                with open(perf_file) as input_fileh:
                    next(input_fileh)  # skip the per-file header line
                    for line in input_fileh:
                        fields = ([library_id, library_name, category] +
                                  line.strip().split('\t'))
                        fileh.write('\t'.join(fields) + '\n')
    # add reference gtf file so it participates in both merges
    expressed_gtf_files.append(results.ref_gtf_file)
    background_gtf_files.append(results.ref_gtf_file)
    # merge sort gtf files
    logging.info("Merging and sorting expressed GTF files")
    merge_sort_gtf_files(expressed_gtf_files, results.expressed_gtf_file,
                         tmp_dir=results.tmp_dir)
    logging.info("Merging and sorting background GTF files")
    merge_sort_gtf_files(background_gtf_files, results.background_gtf_file,
                         tmp_dir=results.tmp_dir)
    return 0
def classify_transcripts(results, num_processors):
    """Classify each library's transcripts in parallel.

    Builds one (library_id, classify_dir) task per library listed in the
    category statistics file and runs classify_library_transcripts over
    the tasks with a multiprocessing pool.

    results: run configuration object providing category_stats_file and
        classify_dir.
    num_processors: number of worker processes for the pool.

    Returns 0 when every library classified successfully, 1 otherwise.
    """
    # read library category statistics and build one task per library
    stats_list = list(CategoryStats.from_file(results.category_stats_file))
    tasks = [(statsobj.library_id, results.classify_dir)
             for statsobj in stats_list]
    # use multiprocessing to parallelize classification across libraries
    pool = multiprocessing.Pool(processes=num_processors)
    result_iter = pool.imap_unordered(classify_library_transcripts, tasks)
    errors = False
    # drain the whole iterator so every worker result is consumed even
    # after a failure is seen (original also collected the successful
    # library ids, but never used them — dead local removed)
    for retcode, library_id in result_iter:
        if retcode != 0:
            errors = True
    pool.close()
    pool.join()
    if errors:
        logging.error("Errors occurred during classification")
    return int(errors)
# NOTE(review): this is a duplicate definition of merge_transcripts —
# whichever copy appears later in the file silently shadows the other.
# Both copies are behaviorally identical; one of them should be deleted.
def merge_transcripts(results):
    """Merge per-library classification results into final outputs.

    Reads per-library category statistics, writes a tab-delimited
    classification report combining the intergenic and intronic
    performance files of every library, then merge-sorts the expressed
    and background GTF files (plus the reference GTF) into the final
    output GTF files.

    results: run configuration object providing the file/dir attributes
        used below (category_stats_file, classify_dir, library_id_map,
        classify_report_file, ref_gtf_file, expressed_gtf_file,
        background_gtf_file, tmp_dir).

    Returns 0 on success.
    """
    # read library category statistics
    stats_list = list(CategoryStats.from_file(results.category_stats_file))
    library_ids = []
    expressed_gtf_files = []
    background_gtf_files = []
    for statsobj in stats_list:
        library_id = statsobj.library_id
        library_ids.append(library_id)
        prefix = os.path.join(results.classify_dir, library_id)
        expressed_gtf_files.append(prefix + ".expr.gtf")
        background_gtf_files.append(prefix + ".bkgd.gtf")
    # map library_id -> library_name from a two-column tab-delimited file
    # (with-block fixes a leak: the original never closed this file)
    library_id_map = {}
    with open(results.library_id_map) as map_fileh:
        for line in map_fileh:
            fields = line.strip().split("\t")
            library_id_map[fields[0]] = fields[1]
    # make a classification report
    logging.info("Writing classification report")
    header_fields = ["library_id", "library_name", "category",
                     "train.auc", "test.auc", "train.cutoff",
                     "train.tp", "train.fp", "train.fn", "train.tn",
                     "train.sens", "train.spec", "train.balacc",
                     "test.tp", "test.fp", "test.fn", "test.tn",
                     "test.sens", "test.spec", "test.balacc"]
    # fileh.write(... + '\n') replaces Py2-only 'print >>' (same bytes
    # written); with-blocks guarantee the handles close on error paths
    with open(results.classify_report_file, "w") as fileh:
        fileh.write("\t".join(header_fields) + "\n")
        for library_id in library_ids:
            prefix = os.path.join(results.classify_dir, library_id)
            library_name = library_id_map[library_id]
            # the intergenic and intronic performance files share one
            # format; handle both categories with a single loop instead
            # of the original duplicated copy-paste blocks
            for category in ("intergenic", "intronic"):
                perf_file = prefix + ".%s.perf.txt" % category
                with open(perf_file) as input_fileh:
                    next(input_fileh)  # skip the per-file header line
                    for line in input_fileh:
                        fields = ([library_id, library_name, category] +
                                  line.strip().split("\t"))
                        fileh.write("\t".join(fields) + "\n")
    # add reference gtf file so it participates in both merges
    expressed_gtf_files.append(results.ref_gtf_file)
    background_gtf_files.append(results.ref_gtf_file)
    # merge sort gtf files
    logging.info("Merging and sorting expressed GTF files")
    merge_sort_gtf_files(expressed_gtf_files, results.expressed_gtf_file,
                         tmp_dir=results.tmp_dir)
    logging.info("Merging and sorting background GTF files")
    merge_sort_gtf_files(background_gtf_files, results.background_gtf_file,
                         tmp_dir=results.tmp_dir)
    return 0