def main():
    args = parse_args()
    con, cur = sql_lib.attach_databases(args.comparativeAnnotationDir, mode="transMap")
    highest_cov_dict = sql_lib.get_highest_cov_alns(cur, args.genomes, args.filterChroms)
    for biotype in sql_lib.get_all_biotypes(cur, args.refGenome, gene_level=False):
        biotype_ids = sql_lib.get_biotype_ids(cur, args.refGenome, biotype, filter_chroms=args.filterChroms)
        transcript_gene_map = sql_lib.get_transcript_gene_map(cur, args.refGenome, biotype,
                                                              filter_chroms=args.filterChroms)
        if len(biotype_ids) > 50:  # hardcoded cutoff to avoid issues where this biotype/gencode mix is nearly empty
            fail_pass_excel_dict = get_fail_pass_excel_dict(cur, args.refGenome, args.genomes, highest_cov_dict,
                                                            biotype, args.filterChroms)
            out_path = os.path.join(args.outDir, "transmap_analysis", biotype)
            mkdir_p(out_path)
            cov_ident_wrapper(highest_cov_dict, args.genomes, out_path, biotype, args.gencode, biotype_ids)
            cat_plot_wrapper(cur, highest_cov_dict, args.genomes, out_path, biotype, args.gencode, biotype_ids)
            paralogy_plot(cur, args.genomes, out_path, biotype, biotype_ids, args.gencode)
            num_pass_excel(fail_pass_excel_dict, cur, args.refGenome, out_path, biotype, args.gencode, biotype_ids)
            num_pass_excel_gene_level(fail_pass_excel_dict, cur, args.refGenome, out_path, biotype, args.gencode,
                                      transcript_gene_map)
def load_db(target, hints, db_path, genome, genome_fasta, timeout=30000, intervals=120):
    """
    Final database loading.
    NOTE: Once done on all genomes, you want to run load2sqlitedb --makeIdx --dbaccess ${db}
    """
    cmd = "load2sqlitedb --noIdx --species={} --dbaccess={} {}"
    fa_cmd = cmd.format(genome, db_path, genome_fasta)
    hints_cmd = cmd.format(genome, db_path, hints)

    def handle_concurrency(cmd, timeout, intervals, start_time=None):
        if start_time is None:
            start_time = time.time()
        elif time.time() - start_time >= timeout:
            raise RuntimeError("hints database still locked after {} seconds".format(timeout))
        p = subprocess.Popen(cmd, shell=True, bufsize=-1, stderr=subprocess.PIPE)
        _, stderr = p.communicate()
        if p.returncode == 0:
            return 1
        elif p.returncode == 1 and "locked" in stderr:
            # the database is locked by a concurrent loader; wait and retry until the timeout expires
            time.sleep(intervals)
            return handle_concurrency(cmd, timeout, intervals, start_time)
        else:
            raise RuntimeError(stderr)

    mkdir_p(os.path.dirname(db_path))
    for cmd in [fa_cmd, hints_cmd]:
        handle_concurrency(cmd, timeout, intervals)
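# As the docstring above notes, once every genome has been loaded you still need to build the index.
# A minimal sketch of that final step, following the same subprocess conventions as load_db;
# run_index_cmd is illustrative and not part of the pipeline itself:
def run_index_cmd(db_path):
    # load2sqlitedb --makeIdx takes no input file; it indexes the existing database in place
    cmd = "load2sqlitedb --makeIdx --dbaccess={}".format(db_path)
    p = subprocess.Popen(cmd, shell=True, stderr=subprocess.PIPE)
    _, stderr = p.communicate()
    if p.returncode != 0:
        raise RuntimeError(stderr)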
def main():
    args = parse_args()
    mkdir_p(args.outDir)
    biotype_tx_counter = DefaultOrderedDict(lambda: defaultdict(int))
    biotype_gene_counter = DefaultOrderedDict(lambda: defaultdict(int))
    # look up the biotype binning dict for this gencode set in etc.config
    gencode_biotype_bin_dict_str = "etc.config.{}".format(args.gencode)
    gencode_biotype_bin_dict = eval(gencode_biotype_bin_dict_str)
    con, cur = sql_lib.attach_databases(args.compAnnPath, mode="reference")
    biotypes = sql_lib.get_all_biotypes(cur, args.refGenome, gene_level=True)
    for biotype in biotypes:
        tx_evals, gene_evals, gene_fail_evals, tx_dup_rate, tx_counts, gene_counts = load_evaluations(args.workDir,
                                                                                                      args.genomes,
                                                                                                      biotype)
        if len(tx_evals) == 0:  # a biotype may have nothing
            continue
        if biotype in args.biotypes:
            for (evals, counts), mode in zip([[tx_evals, tx_counts], [gene_evals, gene_counts]],
                                             ["transcripts", "genes"]):
                transcript_gene_plot(evals, args.outDir, args.gencode, mode, biotype)
                size_plot(counts, args.outDir, args.gencode, mode, biotype)
            gene_fail_plot(gene_fail_evals, args.outDir, args.gencode, biotype)
            dup_rate_plot(tx_dup_rate, args.outDir, args.gencode, biotype)
        biotype_bin = gencode_biotype_bin_dict.get(biotype, "Other")
        tx_evals_collapsed = collapse_evals(tx_evals)
        gene_evals_collapsed = collapse_evals(gene_evals)
        for genome, tx_count in tx_evals_collapsed.iteritems():
            biotype_tx_counter[genome][biotype_bin] += tx_count
            biotype_gene_counter[genome][biotype_bin] += gene_evals_collapsed[genome]
    for mode, counter in zip(["transcript", "gene"], [biotype_tx_counter, biotype_gene_counter]):
        biotype_stacked_plot(counter, args.outDir, args.gencode, mode)
def get_bed_paths(out_dir, query_name, genome):
    out_bed_dir = os.path.join(out_dir, "bedfiles", query_name, genome)
    out_bed_path = os.path.join(out_bed_dir, "{}.bed".format(genome))
    out_big_bed_dir = os.path.join(out_dir, "bigBedfiles", query_name, genome)
    out_big_bed_path = os.path.join(out_big_bed_dir, "{}.bb".format(genome))
    mkdir_p(out_bed_dir)
    mkdir_p(out_big_bed_dir)
    return out_bed_path, out_big_bed_path
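# A sketch of how a caller might use the pair of paths above, assuming the standard UCSC bedToBigBed
# tool is on PATH and a chrom.sizes file exists for the genome; this usage is illustrative and not
# taken from the pipeline itself:
def make_big_bed(out_bed_path, out_big_bed_path, chrom_sizes):
    # bedToBigBed requires the input BED to be sorted by chromosome, then start position
    subprocess.check_call("sort -k1,1 -k2,2n -o {0} {0}".format(out_bed_path), shell=True)
    subprocess.check_call(["bedToBigBed", out_bed_path, chrom_sizes, out_big_bed_path])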
def write_tx_bed(out_dir, to_investigate):
    mkdir_p(out_dir)
    with open(os.path.join(out_dir, "not_ok_all_chaining.bed"), "w") as outf:
        outf.write('track name="Transcripts OK in simpleChain and not OK in allChain"\n')
        for t in to_investigate:
            outf.write("\t".join(map(str, t.get_bed())) + "\n")
def dump_attribute_results_to_disk(self, results_dict):
    """
    Dumps an attribute dict to disk for later merging.
    """
    db = "attributes"
    base_p = os.path.join(self.tmp_dir, db)
    mkdir_p(base_p)
    p = os.path.join(base_p, self.column)
    with open(p, "wb") as outf:
        pickle.dump(results_dict, outf)
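# The merge step reads these pickles back one column at a time; a minimal sketch of that load,
# assuming one pickled dict per classifier column as written above (the database() function later
# in this file performs the full merge into SQL):
def load_attribute_dicts(tmp_dir):
    data_dict = {}
    attr_dir = os.path.join(tmp_dir, "attributes")
    for col in os.listdir(attr_dir):
        with open(os.path.join(attr_dir, col)) as inf:
            data_dict[col] = pickle.load(inf)
    return data_dict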
def main():
    args = parse_args()
    mkdir_p(args.outDir)
    cgp_additions, cgp_replace, new_isoforms, cgp_missing, cgp_join_genes, consensus_stats = load_evaluations(
        args.workDir, args.genomes)
    addition_plot(cgp_additions, args.outDir, args.gencode)
    replace_plot(cgp_replace, args.outDir, args.gencode)
    new_isoforms_plot(new_isoforms, args.outDir, args.gencode)
    missing_plot(cgp_missing, args.outDir, args.gencode)
    join_genes_plot(cgp_join_genes, args.outDir, args.gencode)
    consensus_stats_plot(consensus_stats, args.outDir, args.gencode)
def write_browser_bed(out_dir, all_dir, formatted_names):
    a_details_con = sql.connect(os.path.join(all_dir, "details.db"))
    a_details_cur = a_details_con.cursor()
    formatted_classifiers = ", ".join(tm_coding_classifiers)
    cmd = "SELECT {} FROM C57B6NJ WHERE AlignmentId in ({})".format(formatted_classifiers, formatted_names)
    recs = a_details_cur.execute(cmd).fetchall()
    mkdir_p(out_dir)
    with open(os.path.join(out_dir, "failed_classifiers.bed"), "w") as outf:
        outf.write('track name="Classifiers failed in allChain transcripts that were ok in simpleChain"\n')
        for r in parse_details(recs):
            outf.write(r)
def wrapper(target, genomes, ref_fasta, out_dir, fa_dir):
    for g in genomes:
        out_path = os.path.join(out_dir, "tmp", g)
        mkdir_p(out_path)
        # clear out any stale results from a previous run
        for f in os.listdir(out_path):
            os.remove(os.path.join(out_path, f))
        target_fasta = os.path.join(fa_dir, g + ".fa")
        faidx = os.path.join(fa_dir, g + ".fa.fai")
        aug_aIds = [x.split()[0] for x in open(faidx)]
        for chunk in chunker(aug_aIds, 200):
            target.addChildTargetFn(align, args=(g, target_fasta, chunk, ref_fasta, out_path))
    target.setFollowOnTargetFn(wrapper2, args=(genomes, out_dir))
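# chunker is used above but not defined in this file; a minimal sketch of the assumed helper,
# which yields successive fixed-size slices of a sequence:
def chunker(seq, size):
    for i in xrange(0, len(seq), size):
        yield seq[i:i + size]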
def dump_results_to_disk(self):
    """
    Dumps a pair of classify/details dicts to disk in the globalTempDir for later merging.
    """
    details_dict = sql_lib.collapse_details_dict(self.details_dict)
    for db, this_dict in itertools.izip(["details", "classify"], [details_dict, self.classify_dict]):
        base_p = os.path.join(self.tmp_dir, db)
        mkdir_p(base_p)
        p = os.path.join(base_p, self.column)
        with open(p, "wb") as outf:
            pickle.dump(this_dict, outf)
def munge_files(files, target_path, rename=None):
    mkdir_p(os.path.dirname(target_path))
    records = set()
    for f in files:
        experiment = os.path.basename(f).replace("SJ.out.tab", "")
        if len(experiment) == 0:
            experiment = os.path.dirname(f).split("/")[-1]
        for line in open(f):
            records.add(splice_junction_line_to_bed(line, experiment, rename))
    records = sorted(records, key=lambda x: [x[0], x[1]])
    with open(target_path, "w") as outf:
        for line in records:
            outf.write("\t".join(line) + "\n")
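# splice_junction_line_to_bed is assumed above; a minimal sketch of what such a converter could
# look like, given the STAR SJ.out.tab format (chrom, 1-based intron start, 1-based intron end,
# strand code, motif, annotated flag, unique reads, multi-mapped reads, max overhang). The exact
# fields the real helper keeps may differ:
def splice_junction_line_to_bed(line, experiment, rename=None):
    chrom, start, end, strand_code = line.split()[:4]
    if rename is not None:
        chrom = rename.get(chrom, chrom)
    strand = {"1": "+", "2": "-"}.get(strand_code, ".")
    # BED is 0-based half-open, so shift the 1-based intron start down by one
    return (chrom, str(int(start) - 1), end, experiment, "0", strand)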
def split_ss_wrapper(target, args, ss_dict):
    """
    Wrapper for split_ss, which splits a sufficient statistics file by windows and blocks to make manageable parts.
    """
    if args.ref_fasta_path is None:
        args.ref_fasta_path = get_ref_genome_fasta(args.hal, args.ref_genome, target.getGlobalTempDir())
    for chromosome, ss_path in ss_dict.iteritems():
        out_dir = os.path.join(args.output_dir, chromosome, chromosome)
        mkdir_p(out_dir)
        target.addChildTargetFn(split_ss, args=(chromosome, ss_path, out_dir, args.ref_fasta_path,
                                                args.msa_split_options))
def main_ref_fn(target, comp_ann_path, gencode, ref_genome, base_out_path, filter_chroms):
    clust_title = "Hierarchical_clustering_of_transcript_classifiers"
    base_barplot_title = "Classifiers failed by {} transcripts in the reference set {}\n"
    out_path = os.path.join(base_out_path, "clustering", ref_genome)
    mkdir_p(out_path)
    con, cur = sql_lib.attach_databases(comp_ann_path, mode="reference")
    # biotype was referenced here without being defined; loop over all biotypes, and include the biotype in the
    # output file names so each iteration writes distinct plots
    for biotype in sql_lib.get_all_biotypes(cur, ref_genome, gene_level=False):
        biotype_ids = sql_lib.get_biotype_ids(cur, ref_genome, biotype, filter_chroms=filter_chroms)
        if len(biotype_ids) > 50:
            sql_data = sql_lib.load_data(con, ref_genome, etc.config.ref_classifiers, primary_key="TranscriptId")
            out_barplot_file = os.path.join(out_path, "reference_barplot_{}_{}".format(gencode, biotype))
            barplot_title = base_barplot_title.format(biotype.replace("_", " "), gencode)
            munged, stats = munge_data(sql_data, biotype_ids)
            plot_lib.barplot(stats, out_path, out_barplot_file, barplot_title)
            data_path = os.path.join(target.getGlobalTempDir(), getRandomAlphaNumericString())
            munged.to_csv(data_path)
            out_cluster_file = os.path.join(out_path, "reference_clustering_{}_{}".format(gencode, biotype))
            target.addChildTargetFn(r_wrapper, args=[data_path, clust_title, out_cluster_file])
def write_human_readable_classifiers(out_dir, to_investigate, a_con):
    formatted_names = ", ".join(['"' + x.name + '"' for x in to_investigate])
    formatted_classifiers = ", ".join(["AlignmentId"] + tm_coding_classifiers)
    cmd = "SELECT {} FROM C57B6NJ WHERE AlignmentId in ({})".format(formatted_classifiers, formatted_names)
    a_data = pd.read_sql(cmd, a_con)
    failures = defaultdict(list)
    for pos, row in a_data.iterrows():
        for classifier, value in row.iteritems():
            if classifier == "AlignmentId":
                name = value
            elif value == 1:
                failures[name].append(classifier)
    mkdir_p(out_dir)
    with open(os.path.join(out_dir, "failed_classifiers.tsv"), "w") as outf:
        for name, vals in failures.iteritems():
            vals = ",".join(sorted(vals))
            outf.write("\t".join([name, vals]) + "\n")
    return formatted_names
def main():
    args = parse_args()
    con, cur = sql_lib.attach_databases(args.compAnnPath, mode=args.mode)
    biotypes = sql_lib.get_all_biotypes(cur, args.refGenome, gene_level=True)
    transcript_gene_map = sql_lib.get_transcript_gene_map(cur, args.refGenome, biotype=None,
                                                          filter_chroms=args.filterChroms)
    # load all Augustus and transMap transcripts into one big dict
    gps = load_gps(args.gps)
    consensus_base_path = os.path.join(args.outDir, args.genome)
    stats = get_stats(cur, args.genome, args.mode)
    ref_gene_intervals = build_ref_intervals(cur, args.genome)
    tgt_intervals = build_tgt_intervals(gps)
    for biotype in biotypes:
        gene_transcript_map = sql_lib.get_gene_transcript_map(cur, args.refGenome, biotype=biotype,
                                                              filter_chroms=args.filterChroms)
        binned_transcripts, consensus = consensus_by_biotype(cur, args.refGenome, args.genome, biotype, gps,
                                                             transcript_gene_map, gene_transcript_map, stats,
                                                             args.mode, ref_gene_intervals, tgt_intervals)
        deduplicated_consensus, dup_count = deduplicate_consensus(consensus, gps, stats)
        if len(deduplicated_consensus) > 0:  # some biotypes may have nothing
            num_genes, num_txs = write_gps(deduplicated_consensus, gps, consensus_base_path, biotype,
                                           transcript_gene_map, args.mode)
            if biotype == "protein_coding":
                gene_transcript_evals = evaluate_coding_consensus(binned_transcripts, stats, ref_gene_intervals,
                                                                  gps, args.mode)
            else:
                gene_transcript_evals = evaluate_noncoding_consensus(binned_transcripts, stats, gps)
            p = os.path.join(args.workDir, "_".join([args.genome, biotype]))
            mkdir_p(os.path.dirname(p))
            gene_transcript_evals["duplication_rate"] = dup_count
            gene_transcript_evals["gene_counts"] = num_genes
            gene_transcript_evals["tx_counts"] = num_txs
            with open(p, "w") as outf:
                pickle.dump(gene_transcript_evals, outf)
def write_gps(consensus, gps, consensus_base_path, biotype, transcript_gene_map, mode):
    """
    Writes the final consensus gene set to a genePred, after fixing the names. Reports the number of genes and txs
    in the final set.
    """
    if mode == "transMap":
        p = os.path.join(consensus_base_path, biotype + ".transmap_gene_set.gp")
    else:
        p = os.path.join(consensus_base_path, biotype + ".augustus_consensus_gene_set.gp")
    mkdir_p(os.path.dirname(p))
    gp_recs = [gps[aln_id] for aln_id in consensus]
    num_genes, num_txs, fixed_gp_recs = fix_gene_pred(gp_recs, transcript_gene_map)
    with open(p, "w") as outf:
        for rec in fixed_gp_recs:
            outf.write(rec)
    return num_genes, num_txs
def database(genome, db, db_path, tmp_dir, mode):
    data_dict = {}
    mkdir_p(os.path.dirname(db_path))
    data_path = os.path.join(tmp_dir, db)
    for col in os.listdir(data_path):
        p = os.path.join(data_path, col)
        with open(p) as p_h:
            data_dict[col] = pickle.load(p_h)
    if mode == "reference":
        index_label = "TranscriptId"
    elif mode == "transMap":
        index_label = "AlignmentId"
    else:
        index_label = "AugustusAlignmentId"
        # Hack to add transMap alignment ID column to Augustus databases.
        aug_ids = data_dict.itervalues().next().viewkeys()
        data_dict["AlignmentId"] = {x: psl_lib.remove_augustus_alignment_number(x) for x in aug_ids}
    sql_lib.write_dict(data_dict, db_path, genome, index_label)
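# sql_lib.write_dict is assumed above; a plausible sketch of what it could do with pandas, turning
# the column -> {id: value} dicts into one table keyed on index_label. This illustrates the merge
# under that assumption and is not the pipeline's actual implementation:
def write_dict(data_dict, db_path, table, index_label):
    # outer keys become columns, inner keys become the row index
    df = pd.DataFrame.from_dict(data_dict)
    df.index.name = index_label
    con = sql.connect(db_path)
    df.to_sql(table, con, if_exists="replace", index=True, index_label=index_label)
    con.close()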
def main_augustus_fn(target, comp_ann_path, gencode, genome, base_out_path, filter_chroms):
    clust_title = "Hierarchical_clustering_of_augustus_classifiers"
    base_barplot_title = ("Augustus classifiers failed by {:,} transcripts derived from transMap\n"
                          "on the reference set {} with Augustus {}")
    out_path = os.path.join(base_out_path, "augustus_classifier_breakdown", genome)
    mkdir_p(out_path)
    con, cur = sql_lib.attach_databases(comp_ann_path, mode="augustus")
    highest_cov_dict = sql_lib.highest_cov_aln(cur, genome)
    highest_cov_ids = set(zip(*highest_cov_dict.itervalues())[0])
    sql_data = sql_lib.load_data(con, genome, etc.config.aug_classifiers, primary_key="AugustusAlignmentId",
                                 table="augustus")
    base_filter_set = {x for x in sql_data.index if psl_lib.remove_augustus_alignment_number(x) in highest_cov_ids}
    for mode in ["1", "2"]:
        i = "I{}".format(mode)
        aug_mode = "trusting RNAseq more" if mode == "2" else "trusting RNAseq less"
        filter_set = {x for x in base_filter_set if i in x}
        out_barplot_file = os.path.join(out_path, "augustus_barplot_{}_{}_{}".format(genome, gencode, i))
        barplot_title = base_barplot_title.format(len(filter_set), gencode, aug_mode)
        munged, stats = munge_data(sql_data, filter_set)
        plot_lib.barplot(stats, out_path, out_barplot_file, barplot_title)
        data_path = os.path.join(target.getGlobalTempDir(), getRandomAlphaNumericString())
        munged.to_csv(data_path)
        out_cluster_file = os.path.join(out_path, "augustus_clustering_{}_{}_{}".format(genome, gencode, i))
        target.addChildTargetFn(r_wrapper, args=[data_path, clust_title, out_cluster_file])
def main_fn(target, comp_ann_path, gencode, genome, ref_genome, base_out_path, filter_chroms):
    clust_title = "Hierarchical_clustering_of_transMap_classifiers"
    base_barplot_title = ("Classifiers failed by {} transcripts in the category {} in transMap analysis\n"
                          "Genome: {}. Gencode set: {}. {:,} ({:0.2f}%) of transcripts")
    out_path = os.path.join(base_out_path, "classifier_breakdown", genome)
    mkdir_p(out_path)
    con, cur = sql_lib.attach_databases(comp_ann_path, mode="transMap")
    # biotype was referenced below without being defined; analyze each biotype in turn
    for biotype in sql_lib.get_all_biotypes(cur, ref_genome, gene_level=False):
        fail_ids, passing_specific_ids, excellent_ids = sql_lib.get_fail_passing_excel_ids(cur, ref_genome, genome,
                                                                                           biotype)
        biotype_ids = sql_lib.get_biotype_ids(cur, ref_genome, biotype, filter_chroms=filter_chroms)
        if len(biotype_ids) > 50:
            sql_data = sql_lib.load_data(con, genome, etc.config.clustering_classifiers)
            num_original_introns = sql_lib.load_data(con, genome, ["NumberIntrons"], table="attributes")
            for mode, ids in zip(["Fail", "Pass/NotExcellent"], [fail_ids, passing_specific_ids]):
                mode_underscore = mode.replace("/", "_")
                out_barplot_file = os.path.join(out_path, "barplot_{}_{}_{}".format(genome, biotype, mode_underscore))
                percentage_of_set = 100.0 * len(ids) / len(biotype_ids)
                barplot_title = base_barplot_title.format(biotype.replace("_", " "), mode, genome, gencode, len(ids),
                                                          percentage_of_set)
                munged, stats = munge_intron_data(sql_data, num_original_introns, ids)
                plot_lib.barplot(stats, out_path, out_barplot_file, barplot_title)
                data_path = os.path.join(target.getGlobalTempDir(), getRandomAlphaNumericString())
                munged.to_csv(data_path)
                out_cluster_file = os.path.join(out_path, "clustering_{}_{}_{}".format(genome, biotype,
                                                                                       mode_underscore))
                target.addChildTargetFn(r_wrapper, args=[data_path, clust_title, out_cluster_file])
os.symlink(bai_path, os.path.join(target_folder, genome, bam + ".bai"))

# make symlinks for all alignments done against target assemblies by Sanger
file_map = {}
for base_path, dirs, files in os.walk("/hive/groups/recon/projs/mus_strain_cactus/data/rel-1509-rna-seq/ftp-mouse.sanger.ac.uk/REL-1509-Assembly-RNA-Seq"):
    if files:
        genome, institute, tissue = base_path.split("/")[-3:]
        # genome = name_map[genome]
        for f in files:
            if f.endswith(".bam"):
                experiment = f.split("Aligned")[0]
                bam_path = os.path.join(base_path, f)
                bai_path = bam_path + ".bai"
                if (genome, institute, tissue) not in file_map:
                    file_map[(genome, institute, tissue)] = []
                file_map[(genome, institute, tissue)].append([experiment, bam_path, bai_path])

target_folder = "/cluster/home/ifiddes/mus_strain_data/pipeline_data/rnaseq/munged_STAR_data/REL-1509-chromosomes"
for (genome, institute, tissue), files in file_map.iteritems():
    for experiment, bam_path, bai_path in files:
        bam = institute + "_" + tissue + "_" + experiment + ".sortedByCoord.bam"
        mkdir_p(os.path.join(target_folder, genome))
        tgt_bam = os.path.join(target_folder, genome, bam)
        tgt_bai = os.path.join(target_folder, genome, bam + ".bai")
        if not os.path.exists(tgt_bam):
            os.symlink(bam_path, tgt_bam)
            print "linked {}".format(tgt_bam)
        if not os.path.exists(tgt_bai):
            os.symlink(bai_path, tgt_bai)
    return phased_reads


bam_path = "/hive/users/ifiddes/longranger-1.2.0/{}/PHASER_SVCALLER_CS/PHASER_SVCALLER/ATTACH_PHASING/fork0/files/phased_possorted_bam.bam".format(genome)
# open the phased BAM once and reuse the handle for every region
bam_handle = pysam.Samfile(bam_path)
v_h = vcf.Reader(open("/hive/users/ifiddes/notch2nl_suns/Notch2NL_SUN_UniqueIndels.vcf.gz"))
phased_read_holder = {}
for chrom, start, stop, name in regions:
    phased_read_holder[name] = bin_phased_reads(chrom, start, stop, bam_handle)

out_dir = "/hive/users/ifiddes/longranger-1.2.0/notch2nl_10x/linked_bam_analysis/split_bams/{}".format(genome)
mkdir_p(out_dir)
for para in phased_read_holder:
    for tag in phased_read_holder[para]:
        with pysam.Samfile(os.path.join(out_dir, "{}.{}.bam".format(para, "_".join(map(str, tag)))), "wb",
                           template=bam_handle) as outf:
            for read in phased_read_holder[para][tag]:
                outf.write(read)


# validate these phasing results according to SUN positions
def build_vcf_intervals(reads, vcf_recs, bam_handle):
    """
    Find if any of these reads match a known SUN/indel by simple bedtools intersections
    """
    vcf_bed_recs = [ChromosomeInterval(x.CHROM, x.start, x.end, None) for x in vcf_recs]
        if 'PS' in tags:
            t = (tags['PS'], tags['HP'])
            phased_reads[t].append(rec)
    return phased_reads


bam_path = "/hive/users/ifiddes/longranger-1.2.0/{}/PHASER_SVCALLER_CS/PHASER_SVCALLER/ATTACH_PHASING/fork0/files/phased_possorted_bam.bam".format(genome)
# open the phased BAM once and reuse the handle for every region
bam_handle = pysam.Samfile(bam_path)
phased_read_holder = {}
for chrom, start, stop, name in regions:
    phased_read_holder[name] = bin_phased_reads(chrom, start, stop, bam_handle)

out_dir = "/hive/users/ifiddes/longranger-1.2.0/notch2nl_10x/linked_bam_analysis/split_bams/{}".format(genome)
mkdir_p(out_dir)
for para in phased_read_holder:
    for tag in phased_read_holder[para]:
        with pysam.Samfile(os.path.join(out_dir, "{}.{}.bam".format(para, "_".join(map(str, tag)))), "wb",
                           template=bam_handle) as outf:
            for read in phased_read_holder[para][tag]:
                outf.write(read)

fastq_dir = "/hive/users/ifiddes/longranger-1.2.0/notch2nl_10x/linked_bam_analysis/split_fastqs/{}".format(genome)
mkdir_p(fastq_dir)
fastqs = {}
for bam in [x for x in os.listdir(out_dir) if x.endswith("bam")]:
    outbase = os.path.join(fastq_dir, bam.replace(".bam", ""))
    bampath = os.path.join(out_dir, bam)
    outpaired = outbase + ".paired.fq"
    outsingle = outbase + ".single.fq"
    fastqs[tuple(bam.replace(".bam", "").split("."))] = [outpaired, outsingle]