def main():
    args = parse_args()
    con, cur = sql_lib.attach_databases(args.comparativeAnnotationDir,
                                        mode="transMap")
    highest_cov_dict = sql_lib.get_highest_cov_alns(cur, args.genomes,
                                                    args.filterChroms)
    for biotype in sql_lib.get_all_biotypes(cur,
                                            args.refGenome,
                                            gene_level=False):
        biotype_ids = sql_lib.get_biotype_ids(cur,
                                              args.refGenome,
                                              biotype,
                                              filter_chroms=args.filterChroms)
        transcript_gene_map = sql_lib.get_transcript_gene_map(
            cur, args.refGenome, biotype, filter_chroms=args.filterChroms)
        # hardcoded cutoff to avoid issues where this biotype/gencode mix is nearly empty
        if len(biotype_ids) > 50:
            fail_pass_excel_dict = get_fail_pass_excel_dict(
                cur, args.refGenome, args.genomes, highest_cov_dict, biotype,
                args.filterChroms)
            out_path = os.path.join(args.outDir, "transmap_analysis", biotype)
            mkdir_p(out_path)
            cov_ident_wrapper(highest_cov_dict, args.genomes, out_path,
                              biotype, args.gencode, biotype_ids)
            cat_plot_wrapper(cur, highest_cov_dict, args.genomes, out_path,
                             biotype, args.gencode, biotype_ids)
            paralogy_plot(cur, args.genomes, out_path, biotype, biotype_ids,
                          args.gencode)
            num_pass_excel(fail_pass_excel_dict, cur, args.refGenome, out_path,
                           biotype, args.gencode, biotype_ids)
            num_pass_excel_gene_level(fail_pass_excel_dict, cur,
                                      args.refGenome, out_path, biotype,
                                      args.gencode, transcript_gene_map)
# Example #2
def load_db(target, hints, db_path, genome, genome_fasta, timeout=30000, intervals=120):
    """
    Final database loading.
    NOTE: Once done on all genomes, you want to run load2sqlitedb --makeIdx --dbaccess ${db}
    """
    cmd = "load2sqlitedb --noIdx --species={} --dbaccess={} {}"
    fa_cmd = cmd.format(genome, db_path, genome_fasta)
    hints_cmd = cmd.format(genome, db_path, hints)
    def handle_concurrency(cmd, timeout, intervals, start_time=None):
        if start_time is None:
            start_time = time.time()
        elif time.time() - start_time >= timeout:
            raise RuntimeError("hints database still locked after {} seconds".format(timeout))
        p = subprocess.Popen(cmd, shell=True, bufsize=-1, stderr=subprocess.PIPE)
        _, ret = p.communicate()
        if p.returncode == 0:
            return 1
        elif p.returncode == 1 and "locked" in ret:
            # database locked by a concurrent loader: wait, then retry until the timeout expires
            time.sleep(intervals)
            return handle_concurrency(cmd, timeout, intervals, start_time)
        else:
            raise RuntimeError(ret)
    mkdir_p(os.path.dirname(db_path))
    for cmd in [fa_cmd, hints_cmd]:
        handle_concurrency(cmd, timeout, intervals)
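# The docstring's NOTE deserves a concrete example: every per-genome load above
# passes --noIdx, so once all genomes are done the shared database still needs
# its indices built exactly once. A minimal sketch of that final step, reusing
# only the flags named in the docstring (finalize_hints_db is a hypothetical
# wrapper name):
import subprocess

def finalize_hints_db(db_path):
    # run once, after load_db has completed for all genomes
    subprocess.check_call("load2sqlitedb --makeIdx --dbaccess={}".format(db_path),
                          shell=True)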
# Example #3
def main():
    args = parse_args()
    mkdir_p(args.outDir)
    biotype_tx_counter = DefaultOrderedDict(lambda: defaultdict(int))
    biotype_gene_counter = DefaultOrderedDict(lambda: defaultdict(int))
    # look up the per-GENCODE biotype bin mapping defined in etc.config
    gencode_biotype_bin_dict = getattr(etc.config, args.gencode)
    con, cur = sql_lib.attach_databases(args.compAnnPath, mode="reference")
    biotypes = sql_lib.get_all_biotypes(cur, args.refGenome, gene_level=True)
    for biotype in biotypes:
        tx_evals, gene_evals, gene_fail_evals, tx_dup_rate, tx_counts, gene_counts = load_evaluations(
            args.workDir, args.genomes, biotype)
        if len(tx_evals) == 0:  # a biotype may have nothing
            continue
        if biotype in args.biotypes:
            for (evals, counts), mode in zip([[tx_evals, tx_counts], [gene_evals, gene_counts]],
                                             ["transcripts", "genes"]):
                transcript_gene_plot(evals, args.outDir, args.gencode, mode,
                                     biotype)
                size_plot(counts, args.outDir, args.gencode, mode, biotype)
            gene_fail_plot(gene_fail_evals, args.outDir, args.gencode, biotype)
            dup_rate_plot(tx_dup_rate, args.outDir, args.gencode, biotype)
        biotype_bin = gencode_biotype_bin_dict.get(biotype, "Other")
        tx_evals_collapsed = collapse_evals(tx_evals)
        gene_evals_collapsed = collapse_evals(gene_evals)
        for genome, tx_count in tx_evals_collapsed.iteritems():
            biotype_tx_counter[genome][biotype_bin] += tx_count
            biotype_gene_counter[genome][biotype_bin] += gene_evals_collapsed[genome]
    for mode, counter in zip(["transcript", "gene"],
                             [biotype_tx_counter, biotype_gene_counter]):
        biotype_stacked_plot(counter, args.outDir, args.gencode, mode)
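# The counters above rely on a DefaultOrderedDict that this listing never
# defines. A minimal sketch of one plausible implementation (the name and
# behavior are inferred from the usage above; the original may differ),
# marrying defaultdict-style factories with insertion order:
from collections import OrderedDict

class DefaultOrderedDict(OrderedDict):
    """OrderedDict that, like defaultdict, builds missing values via a factory."""
    def __init__(self, default_factory=None, *args, **kwargs):
        OrderedDict.__init__(self, *args, **kwargs)
        self.default_factory = default_factory

    def __missing__(self, key):
        if self.default_factory is None:
            raise KeyError(key)
        self[key] = value = self.default_factory()
        return value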
# Example #4
def get_bed_paths(out_dir, query_name, genome):
    out_bed_dir = os.path.join(out_dir, "bedfiles", query_name, genome)
    out_bed_path = os.path.join(out_bed_dir, "{}.bed".format(genome))
    out_big_bed_dir = os.path.join(out_dir, "bigBedfiles", query_name, genome)
    out_big_bed_path = os.path.join(out_big_bed_dir, "{}.bb".format(genome))
    mkdir_p(out_bed_dir)
    mkdir_p(out_big_bed_dir)
    return out_bed_path, out_big_bed_path
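# Nearly every snippet in this listing calls mkdir_p before writing output, but
# the helper itself never appears. A minimal sketch in the spirit of
# `mkdir -p`, treating an already-existing directory as success:
import errno
import os

def mkdir_p(path):
    """Create path and any missing parents; do nothing if it already exists."""
    try:
        os.makedirs(path)
    except OSError as e:
        if e.errno != errno.EEXIST or not os.path.isdir(path):
            raise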
def write_tx_bed(out_dir, to_investigate):
    mkdir_p(out_dir)
    with open(os.path.join(out_dir, "not_ok_all_chaining.bed"), "w") as outf:
        outf.write(
            'track name="Transcripts OK in simpleChain and not OK in allChain"\n'
        )
        for t in to_investigate:
            outf.write("\t".join(map(str, t.get_bed())) + "\n")
# Example #6
def dump_attribute_results_to_disk(self, results_dict):
    """
    Dumps an attribute dict.
    """
    db = "attributes"
    base_p = os.path.join(self.tmp_dir, db)
    mkdir_p(base_p)
    p = os.path.join(base_p, self.column)
    with open(p, "wb") as outf:
        pickle.dump(results_dict, outf)
def main():
    args = parse_args()
    mkdir_p(args.outDir)
    cgp_additions, cgp_replace, new_isoforms, cgp_missing, cgp_join_genes, consensus_stats = load_evaluations(
        args.workDir, args.genomes)
    addition_plot(cgp_additions, args.outDir, args.gencode)
    replace_plot(cgp_replace, args.outDir, args.gencode)
    new_isoforms_plot(new_isoforms, args.outDir, args.gencode)
    missing_plot(cgp_missing, args.outDir, args.gencode)
    join_genes_plot(cgp_join_genes, args.outDir, args.gencode)
    consensus_stats_plot(consensus_stats, args.outDir, args.gencode)
def write_browser_bed(out_dir, all_dir, formatted_names):
    a_details_con = sql.connect(os.path.join(all_dir, "details.db"))
    a_details_cur = a_details_con.cursor()
    formatted_classifiers = ", ".join(tm_coding_classifiers)
    cmd = "SELECT {} FROM C57B6NJ WHERE AlignmentId in ({})".format(formatted_classifiers, formatted_names)
    recs = a_details_cur.execute(cmd).fetchall()
    mkdir_p(out_dir)
    with open(os.path.join(out_dir, "failed_classifiers.bed"), "w") as outf:
        outf.write('track name="Classifiers failed in allChain transcripts that were ok in simpleChain"\n')
        for r in parse_details(recs):
            outf.write(r)
# Example #9
def wrapper(target, genomes, ref_fasta, out_dir, fa_dir):
    for g in genomes:
        out_path = os.path.join(out_dir, "tmp", g)
        mkdir_p(out_path)
        for f in os.listdir(out_path):
            os.remove(os.path.join(out_path, f))
        target_fasta = os.path.join(fa_dir, g + ".fa")
        faidx = os.path.join(fa_dir, g + ".fa.fai")
        aug_aIds = [x.split()[0] for x in open(faidx)]
        for chunk in chunker(aug_aIds, 200):
            target.addChildTargetFn(align, args=(g, target_fasta, chunk, ref_fasta, out_path))
    target.setFollowOnTargetFn(wrapper2, args=(genomes, out_dir))
# Example #10
def dump_results_to_disk(self):
    """
    Dumps a pair of classify/details dicts to disk in the globalTempDir for later merging.
    """
    details_dict = sql_lib.collapse_details_dict(self.details_dict)
    for db, this_dict in itertools.izip(["details", "classify"],
                                        [details_dict, self.classify_dict]):
        base_p = os.path.join(self.tmp_dir, db)
        mkdir_p(base_p)
        p = os.path.join(base_p, self.column)
        with open(p, "wb") as outf:
            pickle.dump(this_dict, outf)
def munge_files(files, target_path, rename=None):
    mkdir_p(os.path.dirname(target_path))
    records = set()
    for f in files:
        experiment = os.path.basename(f).replace("SJ.out.tab", "")
        if len(experiment) == 0:
            experiment = os.path.dirname(f).split("/")[-1]
        for line in open(f):
            records.add(splice_junction_line_to_bed(line, experiment, rename))
    records = sorted(records, key=lambda x: [x[0], x[1]])
    with open(target_path, "w") as outf:
        for line in records:
            outf.write("\t".join(line) + "\n")
def split_ss_wrapper(target, args, ss_dict):
    """
    Wrapper for split_ss, which splits a sufficient statistics files by windows and blocks to make manageable parts.
    """
    if args.ref_fasta_path is None:
        args.ref_fasta_path = get_ref_genome_fasta(args.hal, args.ref_genome,
                                                   target.getGlobalTempDir())
    for chromosome, ss_path in ss_dict.iteritems():
        out_dir = os.path.join(args.output_dir, chromosome, chromosome)
        mkdir_p(out_dir)
        target.addChildTargetFn(split_ss,
                                args=(chromosome, ss_path, out_dir,
                                      args.ref_fasta_path,
                                      args.msa_split_options))
def write_browser_bed(out_dir, all_dir, formatted_names):
    a_details_con = sql.connect(os.path.join(all_dir, "details.db"))
    a_details_cur = a_details_con.cursor()
    formatted_classifiers = ", ".join(tm_coding_classifiers)
    cmd = "SELECT {} FROM C57B6NJ WHERE AlignmentId in ({})".format(
        formatted_classifiers, formatted_names)
    recs = a_details_cur.execute(cmd).fetchall()
    mkdir_p(out_dir)
    with open(os.path.join(out_dir, "failed_classifiers.bed"), "w") as outf:
        outf.write(
            'track name="Classifiers failed in allChain transcripts that were ok in simpleChain"\n'
        )
        for r in parse_details(recs):
            outf.write(r)
def main_ref_fn(target, comp_ann_path, gencode, biotype, ref_genome, base_out_path, filter_chroms):
    clust_title = "Hierarchical_clustering_of_transcript_classifiers"
    base_barplot_title = ("Classifiers failed by {} transcripts in the reference set {}\n")
    out_path = os.path.join(base_out_path, "clustering", ref_genome)
    mkdir_p(out_path)
    con, cur = sql_lib.attach_databases(comp_ann_path, mode="reference")
    biotype_ids = sql_lib.get_biotype_ids(cur, ref_genome, biotype, filter_chroms=filter_chroms)
    if len(biotype_ids) > 50:
        sql_data = sql_lib.load_data(con, ref_genome, etc.config.ref_classifiers, primary_key="TranscriptId")
        out_barplot_file = os.path.join(out_path, "reference_barplot_{}".format(gencode))
        barplot_title = base_barplot_title.format(biotype.replace("_", " "), gencode)
        munged, stats = munge_data(sql_data, biotype_ids)
        plot_lib.barplot(stats, out_path, out_barplot_file, barplot_title)
        data_path = os.path.join(target.getGlobalTempDir(), getRandomAlphaNumericString())
        munged.to_csv(data_path)
        out_cluster_file = os.path.join(out_path, "reference_clustering_{}".format(gencode))
        target.addChildTargetFn(r_wrapper, args=[data_path, clust_title, out_cluster_file])
def write_human_readable_classifiers(out_dir, to_investigate, a_con):
    formatted_names = ", ".join(['"' + x.name + '"' for x in to_investigate])
    formatted_classifiers = ", ".join(["AlignmentId"] + tm_coding_classifiers)
    cmd = "SELECT {} FROM C57B6NJ WHERE AlignmentId in ({})".format(formatted_classifiers, formatted_names)
    a_data = pd.read_sql(cmd, a_con)
    failures = defaultdict(list)
    for pos, row in a_data.iterrows():
        for classifier, value in row.iteritems():
            if classifier == "AlignmentId":
                name = value
            elif value == 1:
                failures[name].append(classifier)
    mkdir_p(out_dir)
    with open(os.path.join(out_dir, "failed_classifiers.tsv"), "w") as outf:
        for name, vals in failures.iteritems():
            vals = ",".join(sorted(vals))
            outf.write("\t".join([name, vals]) + "\n")
    return formatted_names
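# A side note on the query construction above: splicing pre-quoted names into
# the SQL string is workable for trusted internal IDs, but the same lookup can
# be done with sqlite3 placeholders, which bind values safely (column and table
# names still need interpolation). A sketch reusing pd and
# tm_coding_classifiers from this listing; fetch_failed_classifiers is a
# hypothetical name:
def fetch_failed_classifiers(a_con, to_investigate):
    names = [x.name for x in to_investigate]
    placeholders = ", ".join("?" for _ in names)
    cmd = "SELECT {} FROM C57B6NJ WHERE AlignmentId IN ({})".format(
        ", ".join(["AlignmentId"] + tm_coding_classifiers), placeholders)
    return pd.read_sql(cmd, a_con, params=names)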
def main():
    args = parse_args()
    con, cur = sql_lib.attach_databases(args.compAnnPath, mode=args.mode)
    biotypes = sql_lib.get_all_biotypes(cur, args.refGenome, gene_level=True)
    transcript_gene_map = sql_lib.get_transcript_gene_map(
        cur, args.refGenome, biotype=None, filter_chroms=args.filterChroms)
    # load all Augustus and transMap transcripts into one big dict
    gps = load_gps(args.gps)
    consensus_base_path = os.path.join(args.outDir, args.genome)
    stats = get_stats(cur, args.genome, args.mode)
    ref_gene_intervals = build_ref_intervals(cur, args.genome)
    tgt_intervals = build_tgt_intervals(gps)
    for biotype in biotypes:
        gene_transcript_map = sql_lib.get_gene_transcript_map(
            cur,
            args.refGenome,
            biotype=biotype,
            filter_chroms=args.filterChroms)
        binned_transcripts, consensus = consensus_by_biotype(
            cur, args.refGenome, args.genome, biotype, gps,
            transcript_gene_map, gene_transcript_map, stats, args.mode,
            ref_gene_intervals, tgt_intervals)
        deduplicated_consensus, dup_count = deduplicate_consensus(
            consensus, gps, stats)
        if len(deduplicated_consensus) > 0:  # some biotypes may have nothing
            num_genes, num_txs = write_gps(deduplicated_consensus, gps,
                                           consensus_base_path, biotype,
                                           transcript_gene_map, args.mode)
            if biotype == "protein_coding":
                gene_transcript_evals = evaluate_coding_consensus(
                    binned_transcripts, stats, ref_gene_intervals, gps,
                    args.mode)
            else:
                gene_transcript_evals = evaluate_noncoding_consensus(
                    binned_transcripts, stats, gps)
            p = os.path.join(args.workDir, "_".join([args.genome, biotype]))
            mkdir_p(os.path.dirname(p))
            gene_transcript_evals["duplication_rate"] = dup_count
            gene_transcript_evals["gene_counts"] = num_genes
            gene_transcript_evals["tx_counts"] = num_txs
            with open(p, "w") as outf:
                pickle.dump(gene_transcript_evals, outf)
def write_human_readable_classifiers(out_dir, to_investigate, a_con):
    formatted_names = ", ".join(['"' + x.name + '"' for x in to_investigate])
    formatted_classifiers = ", ".join(["AlignmentId"] + tm_coding_classifiers)
    cmd = "SELECT {} FROM C57B6NJ WHERE AlignmentId in ({})".format(
        formatted_classifiers, formatted_names)
    a_data = pd.read_sql(cmd, a_con)
    failures = defaultdict(list)
    for pos, row in a_data.iterrows():
        for classifier, value in row.iteritems():
            if classifier == "AlignmentId":
                name = value
            elif value == 1:
                failures[name].append(classifier)
    mkdir_p(out_dir)
    with open(os.path.join(out_dir, "failed_classifiers.tsv"), "w") as outf:
        for name, vals in failures.iteritems():
            vals = ",".join(sorted(vals))
            outf.write("\t".join([name, vals]) + "\n")
    return formatted_names
def write_gps(consensus, gps, consensus_base_path, biotype,
              transcript_gene_map, mode):
    """
    Writes the final consensus gene set to a genePred, after fixing the names. Reports the number of genes and txs
    in the final set
    """
    if mode == "transMap":
        p = os.path.join(consensus_base_path,
                         biotype + ".transmap_gene_set.gp")
    else:
        p = os.path.join(consensus_base_path,
                         biotype + ".augustus_consensus_gene_set.gp")
    mkdir_p(os.path.dirname(p))
    gp_recs = [gps[aln_id] for aln_id in consensus]
    num_genes, num_txs, fixed_gp_recs = fix_gene_pred(gp_recs,
                                                      transcript_gene_map)
    with open(p, "w") as outf:
        for rec in fixed_gp_recs:
            outf.write(rec)
    return num_genes, num_txs
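# write_gps delegates the renaming to fix_gene_pred, which this listing does
# not include. A speculative sketch of the shape such a helper could take,
# assuming the records are extended-genePred lines, that transcript names carry
# a strippable "-N" alignment suffix, and the (num_genes, num_txs,
# fixed_records) return contract seen above; the real implementation may differ:
def fix_gene_pred(gp_recs, transcript_gene_map):
    fixed_recs = []
    genes, txs = set(), set()
    for rec in gp_recs:
        fields = rec.rstrip("\n").split("\t")
        tx_id = fields[0].rsplit("-", 1)[0]  # assumed alignment-number suffix
        fields[0] = tx_id
        fields[11] = transcript_gene_map[tx_id]  # name2 column of genePredExt
        txs.add(tx_id)
        genes.add(fields[11])
        fixed_recs.append("\t".join(fields) + "\n")
    return len(genes), len(txs), fixed_recs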
# Example #19
def database(genome, db, db_path, tmp_dir, mode):
    data_dict = {}
    mkdir_p(os.path.dirname(db_path))
    data_path = os.path.join(tmp_dir, db)
    for col in os.listdir(data_path):
        p = os.path.join(data_path, col)
        with open(p) as p_h:
            data_dict[col] = pickle.load(p_h)
    if mode == "reference":
        index_label = "TranscriptId"
    elif mode == "transMap":
        index_label = "AlignmentId"
    else:
        index_label = "AugustusAlignmentId"
        # Hack to add transMap alignment ID column to Augustus databases.
        aug_ids = data_dict.itervalues().next().viewkeys()
        data_dict["AlignmentId"] = {
            x: psl_lib.remove_augustus_alignment_number(x)
            for x in aug_ids
        }
    sql_lib.write_dict(data_dict, db_path, genome, index_label)
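# database() hands the merged column dicts to sql_lib.write_dict. One plausible
# reading of that call, assuming it assembles the {column: {id: value}} dicts
# into a single table named after the genome (a sketch, not the project's
# actual implementation):
import sqlite3
import pandas as pd

def write_dict(data_dict, db_path, table, index_label):
    """Write {column: {id: value}} dicts as one table keyed by index_label."""
    df = pd.DataFrame.from_dict(data_dict)
    df.index.name = index_label
    con = sqlite3.connect(db_path)
    try:
        df.to_sql(table, con, if_exists="replace", index=True, index_label=index_label)
    finally:
        con.close()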
def main_augustus_fn(target, comp_ann_path, gencode, genome, base_out_path, filter_chroms):
    clust_title = "Hierarchical_clustering_of_augustus_classifiers"
    base_barplot_title = ("Augustus classifiers failed by {:,} transcripts derived from transMap\n"
                          "on the reference set {} with Augustus {}")
    out_path = os.path.join(base_out_path, "augustus_classifier_breakdown", genome)
    mkdir_p(out_path)
    con, cur = sql_lib.attach_databases(comp_ann_path, mode="augustus")
    highest_cov_dict = sql_lib.highest_cov_aln(cur, genome)
    highest_cov_ids = set(zip(*highest_cov_dict.itervalues())[0])
    sql_data = sql_lib.load_data(con, genome, etc.config.aug_classifiers, primary_key="AugustusAlignmentId", 
                                 table="augustus")
    base_filter_set = {x for x in sql_data.index if psl_lib.remove_augustus_alignment_number(x) in highest_cov_ids}
    for mode in ["1", "2"]:
        i = "I{}".format(mode)
        aug_mode = "trusting RNAseq more" if mode == "2" else "trusting RNAseq less"
        filter_set = {x for x in base_filter_set if i in x}
        out_barplot_file = os.path.join(out_path, "augustus_barplot_{}_{}_{}".format(genome, gencode, i))
        barplot_title = base_barplot_title.format(len(filter_set), gencode, aug_mode)
        munged, stats = munge_data(sql_data, filter_set)
        plot_lib.barplot(stats, out_path, out_barplot_file, barplot_title)
        data_path = os.path.join(target.getGlobalTempDir(), getRandomAlphaNumericString())
        munged.to_csv(data_path)
        out_cluster_file = os.path.join(out_path, "augustus_clustering_{}_{}_{}".format(genome, gencode, i))
        target.addChildTargetFn(r_wrapper, args=[data_path, clust_title, out_cluster_file])
def main_fn(target, comp_ann_path, gencode, biotype, genome, ref_genome, base_out_path, filter_chroms):
    clust_title = "Hierarchical_clustering_of_transMap_classifiers"
    base_barplot_title = ("Classifiers failed by {} transcripts in the category {} in transMap analysis\n"
                          "Genome: {}.  Gencode set: {}.  {:,} ({:0.2f}%) of transcripts")
    out_path = os.path.join(base_out_path, "classifier_breakdown", genome)
    mkdir_p(out_path)
    con, cur = sql_lib.attach_databases(comp_ann_path, mode="transMap")
    fail_ids, passing_specific_ids, excellent_ids = sql_lib.get_fail_passing_excel_ids(cur, ref_genome, genome, biotype)
    biotype_ids = sql_lib.get_biotype_ids(cur, ref_genome, biotype, filter_chroms=filter_chroms)
    if len(biotype_ids) > 50:
        sql_data = sql_lib.load_data(con, genome, etc.config.clustering_classifiers)
        num_original_introns = sql_lib.load_data(con, genome, ["NumberIntrons"], table="attributes")
        for mode, ids in zip(["Fail", "Pass/NotExcellent"], [fail_ids, passing_specific_ids]):
            mode_underscore = mode.replace("/", "_")
            out_barplot_file = os.path.join(out_path, "barplot_{}_{}_{}".format(genome, biotype, mode_underscore))
            percentage_of_set = 100.0 * len(ids) / len(biotype_ids)
            barplot_title = base_barplot_title.format(biotype.replace("_", " "), mode, genome, gencode,
                                                      len(ids), percentage_of_set)
            munged, stats = munge_intron_data(sql_data, num_original_introns, ids)
            plot_lib.barplot(stats, out_path, out_barplot_file, barplot_title)
            data_path = os.path.join(target.getGlobalTempDir(), getRandomAlphaNumericString())
            munged.to_csv(data_path)
            out_cluster_file = os.path.join(out_path, "clustering_{}_{}_{}".format(genome, biotype, mode_underscore))
            target.addChildTargetFn(r_wrapper, args=[data_path, clust_title, out_cluster_file])
def write_tx_bed(out_dir, to_investigate):
    mkdir_p(out_dir)
    with open(os.path.join(out_dir, "not_ok_all_chaining.bed"), "w") as outf:
        outf.write('track name="Transcripts OK in simpleChain and not OK in allChain"\n')
        for t in to_investigate:
            outf.write("\t".join(map(str, t.get_bed())) + "\n")

# make symlinks for all alignments done against target assemblies by Sanger

file_map = {}
for base_path, dirs, files in os.walk("/hive/groups/recon/projs/mus_strain_cactus/data/rel-1509-rna-seq/ftp-mouse.sanger.ac.uk/REL-1509-Assembly-RNA-Seq"):
    if files:
        genome, institute, tissue = base_path.split("/")[-3:]
        #genome = name_map[genome]
        for f in files:
            if f.endswith(".bam"):
                experiment = f.split("Aligned")[0]
                bam_path = os.path.join(base_path, f)
                bai_path = bam_path + ".bai"
                key = (genome, institute, tissue)
                file_map.setdefault(key, []).append([experiment, bam_path, bai_path])


target_folder = "/cluster/home/ifiddes/mus_strain_data/pipeline_data/rnaseq/munged_STAR_data/REL-1509-chromosomes"
for (genome, institute, tissue), files in file_map.iteritems():
    for experiment, bam_path, bai_path in files:
        bam = institute + "_" + tissue + "_" + experiment + ".sortedByCoord.bam"
        mkdir_p(os.path.join(target_folder, genome))
        tgt_bam = os.path.join(target_folder, genome, bam)
        tgt_bai = os.path.join(target_folder, genome, bam + ".bai")
        if not os.path.exists(tgt_bam):
            os.symlink(bam_path, tgt_bam)
            print "linked {}".format(tgt_bam)
        if not os.path.exists(tgt_bai):
            os.symlink(bai_path, tgt_bai)
# Example #24
    return phased_reads


bam_path = "/hive/users/ifiddes/longranger-1.2.0/{}/PHASER_SVCALLER_CS/PHASER_SVCALLER/ATTACH_PHASING/fork0/files/phased_possorted_bam.bam".format(genome)
bam_handle = pysam.Samfile(bam_path)
v_h = vcf.Reader(open("/hive/users/ifiddes/notch2nl_suns/Notch2NL_SUN_UniqueIndels.vcf.gz"))


phased_read_holder = {}
for chrom, start, stop, name in regions:
    phased_read_holder[name] = bin_phased_reads(chrom, start, stop, bam_handle)


out_dir = "/hive/users/ifiddes/longranger-1.2.0/notch2nl_10x/linked_bam_analysis/split_bams/{}".format(genome)
mkdir_p(out_dir)
for para in phased_read_holder:
    for tag in phased_read_holder[para]:
        out_bam = os.path.join(out_dir, "{}.{}.bam".format(para, "_".join(map(str, tag))))
        with pysam.Samfile(out_bam, "wb", template=bam_handle) as outf:
            for read in phased_read_holder[para][tag]:
                outf.write(read)


# validate these phasing results according to SUN positions


def build_vcf_intervals(reads, vcf_recs, bam_handle):
    """
    Find if any of these reads match a known SUN/indel by simple bedtools intersections
    """
    vcf_bed_recs = [ChromosomeInterval(x.CHROM, x.start, x.end, None) for x in vcf_recs]
# Example #25
        if 'PS' in tags:
            t = (tags['PS'], tags['HP'])
            phased_reads[t].append(rec)
    return phased_reads
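# Examples #24 and #25 only preserve the tail of bin_phased_reads. A
# reconstruction consistent with that tail, assuming it fetches the reads in a
# region and bins them by their 10x phase-set/haplotype (PS/HP) tags:
from collections import defaultdict

def bin_phased_reads(chrom, start, stop, bam_handle):
    """Group reads in a region by their (phase set, haplotype) tag pair."""
    phased_reads = defaultdict(list)
    for rec in bam_handle.fetch(chrom, start, stop):
        tags = dict(rec.tags)
        if 'PS' in tags:
            t = (tags['PS'], tags['HP'])
            phased_reads[t].append(rec)
    return phased_reads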


bam_path = "/hive/users/ifiddes/longranger-1.2.0/{}/PHASER_SVCALLER_CS/PHASER_SVCALLER/ATTACH_PHASING/fork0/files/phased_possorted_bam.bam".format(genome)
bam_handle = pysam.Samfile(bam_path)
phased_read_holder = {}
for chrom, start, stop, name in regions:
    phased_read_holder[name] = bin_phased_reads(chrom, start, stop, bam_handle)


out_dir = "/hive/users/ifiddes/longranger-1.2.0/notch2nl_10x/linked_bam_analysis/split_bams/{}".format(genome)
mkdir_p(out_dir)
for para in phased_read_holder:
    for tag in phased_read_holder[para]:
        out_bam = os.path.join(out_dir, "{}.{}.bam".format(para, "_".join(map(str, tag))))
        with pysam.Samfile(out_bam, "wb", template=bam_handle) as outf:
            for read in phased_read_holder[para][tag]:
                outf.write(read)

fastq_dir = "/hive/users/ifiddes/longranger-1.2.0/notch2nl_10x/linked_bam_analysis/split_fastqs/{}".format(genome)
mkdir_p(fastq_dir)
fastqs = {}
for bam in [x for x in os.listdir(out_dir) if x.endswith("bam")]:
    outbase = os.path.join(fastq_dir, bam.replace(".bam", ""))
    bampath = os.path.join(out_dir, bam)
    outpaired = outbase + ".paired.fq"
    outsingle = outbase + ".single.fq"
    fastqs[tuple(bam.replace(".bam", "").split("."))] = [outpaired, outsingle]
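# The snippet stops after laying out paired/singleton fastq paths for each
# split BAM; the conversion step itself is not shown. One way those paths could
# be filled, as a sketch assuming samtools >= 1.3 on PATH (the original
# pipeline may have used a different converter):
import subprocess

for key, (outpaired, outsingle) in fastqs.iteritems():
    bampath = os.path.join(out_dir, ".".join(key) + ".bam")
    # samtools fastq wants name-collated input: name-sort first, then write
    # properly paired reads (interleaved) to stdout and singletons to -s
    cmd = ("samtools sort -n -O bam {bam} | "
           "samtools fastq -s {single} - > {paired}").format(
               bam=bampath, single=outsingle, paired=outpaired)
    subprocess.check_call(cmd, shell=True)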