def read_aligner_output(rm_out, gtffile, is_stranded, tmp_dir, resume,
                        call_all):
    """
	Use bedtools to get transcripts/genes with multi-mapped reads.
	Returns a list of transcripts/genes.
	"""
    if not (resume and os.path.isfile(tmp_dir + '/gtf2multireads.bed')):
        rm_bed = pybedtools.BedTool(rm_out)
        gtf = pybedtools.BedTool(gtffile)
        gtf_bed_rm = gtf.intersect(
            rm_bed, s=True, u=True) if is_stranded else gtf.intersect(rm_bed,
                                                                      u=True)
        gtf_bed_rm.saveas(tmp_dir + '/gtf2multireads.bed')
        pybedtools.cleanup()

    tid_list = []
    if call_all:
        gtf_to_read = gtffile
    else:
        gtf_to_read = tmp_dir + '/gtf2multireads.bed'
    with open(gtf_to_read, 'r') as f:
        for line in f:
            ele = line.rstrip().split('\t')
            gene_id = ele[3]
            gene_chr, gene_start, gene_end = ele[0], int(ele[1]), int(ele[2])
            gene_strand = ele[5]
            tid_list.append(
                [gene_id, gene_chr, gene_strand, gene_start, gene_end])
    print_time_stamp('Read transcripts with multi-reads: ' +
                     str(len(tid_list)))
    return tid_list
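A quick illustration of the intersect flags used above may help: with pybedtools' bundled example files (a.bed/b.bed), u=True reports each annotation at most once if it has any overlap, and s=True additionally requires the overlap to be on the same strand. This is a minimal sketch, separate from the original function.

import pybedtools

a = pybedtools.example_bedtool('a.bed')
b = pybedtools.example_bedtool('b.bed')
hits = a.intersect(b, u=True)                    # each feature of a reported once if it overlaps anything in b
stranded_hits = a.intersect(b, s=True, u=True)   # overlap must also be on the same strand
print(len(hits), len(stranded_hits))
pybedtools.cleanup()                             # remove the temp files created by the intersections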
Example #2
def calculate_coverage(bamfile_name, output_dir):
    """Compute per-base read depth for a BAM file and return it as a dataframe."""
    os.makedirs(f'{output_dir}/tmp', exist_ok=True)
    pybedtools.set_tempdir(f'{output_dir}/tmp')
    bed = pybedtools.BedTool(bamfile_name)
    # dz=True reports the depth at every covered position (0-based coordinates).
    df = bed.genome_coverage(dz=True).to_dataframe(names=['contig', 'pos', 'depth'])
    pybedtools.cleanup()
    return df
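A minimal usage sketch of the function above (the BAM path and output directory are hypothetical); the returned dataframe has one row per covered position, which can then be aggregated per contig.

depth_df = calculate_coverage('reads.bam', 'coverage_out')    # hypothetical input BAM and output directory
mean_depth = depth_df.groupby('contig')['depth'].mean()       # mean depth over covered positions per contig
print(mean_depth.head())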
Example #3
def test_cleanup():
    """
    make sure the tempdir and cleanup work
    """
    assert os.path.abspath(pybedtools.get_tempdir()) == os.path.abspath('.')

    # make a fake tempfile, not created during this pybedtools session
    testfn = 'pybedtools.TESTING.tmp'
    os.system('touch %s' % testfn)
    assert os.path.exists(testfn)

    # make some temp files
    a = pybedtools.BedTool(os.path.join(testdir, 'data', 'a.bed'))
    b = pybedtools.BedTool(os.path.join(testdir, 'data', 'b.bed'))
    c = a.intersect(b)

    # after standard cleanup, c's fn should be gone but the fake one still
    # there...
    pybedtools.cleanup(verbose=True)
    assert os.path.exists(testfn)
    assert not os.path.exists(c.fn)

    # Unless we force the removal of all temp files.
    pybedtools.cleanup(remove_all=True)
    assert not os.path.exists(testfn)

    # a.fn and b.fn better be there still!
    assert os.path.exists(a.fn)
    assert os.path.exists(b.fn)
Example #4
def GetRatioGenome():
    pulldown = pysam.Samfile(args.pulldown)
    control = pysam.Samfile(args.control)
    if args.tot_pulldown is not None:
        tot_pulldown = args.tot_pulldown
        tot_control = args.tot_control
    else:
        tot_pulldown = pulldown.mapped + pulldown.unmapped
        tot_control = control.mapped + control.unmapped
    print >> sys.stderr, "Total number of reads in pulldown sample: %d" % (
        tot_pulldown)
    print >> sys.stderr, "Total number of reads in control sample: %d" % (
        tot_control)
    # get spike-in read within bed file
    pulldown_bam = pybedtools.BedTool(args.pulldown)
    control_bam = pybedtools.BedTool(args.control)
    spike_pulldown = pulldown_bam.intersect(args.pos, f=0.5).count()
    spike_control = control_bam.intersect(args.pos, f=0.5).count()
    print >> sys.stderr, "Total number of reads mapped in spike-in in pulldown sample: %d" % (
        spike_pulldown)
    print >> sys.stderr, "Total number of reads mapped in spike-in in control sample: %d" % (
        spike_control)
    ratio = float(spike_control) / float(spike_pulldown) * float(
        tot_pulldown) / float(tot_control)
    print >> sys.stderr, "Ratio is %.6f" % (ratio)
    pulldown.close()
    control.close()
    pybedtools.cleanup()
Example #5
def vcf_to_df_worker(arg):
    """Convert CANVAS vcf to a dict, single thread"""
    canvasvcf, exonbed, i = arg
    logging.debug("Working on job {}: {}".format(i, canvasvcf))
    samplekey = op.basename(canvasvcf).split(".")[0].rsplit("_", 1)[0]
    d = {"SampleKey": samplekey}

    exons = BedTool(exonbed)
    cn = parse_segments(canvasvcf)
    overlaps = exons.intersect(cn, wao=True)
    gcn_store = {}
    for ov in overlaps:
        # Example of ov.fields:
        # [u'chr1', u'11868', u'12227', u'ENSG00000223972.5',
        # u'ENST00000456328.2', u'transcribed_unprocessed_pseudogene',
        # u'DDX11L1', u'.', u'-1', u'-1', u'.', u'0']
        gene_name = "|".join((ov.fields[6], ov.fields[3], ov.fields[5]))
        if gene_name not in gcn_store:
            gcn_store[gene_name] = defaultdict(int)

        cn = ov.fields[-2]
        if cn == ".":
            continue
        cn = int(cn)
        if cn > 10:
            cn = 10
        amt = int(ov.fields[-1])
        gcn_store[gene_name][cn] += amt

    for k, v in sorted(gcn_store.items()):
        v_mean, v_median = counter_mean_and_median(v)
        d[k + ".avgcn"] = v_mean
        d[k + ".medcn"] = v_median
    cleanup()
    return d
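The docstring calls this a single-thread worker, so a natural driver maps it over a pool; the sketch below assumes hypothetical CANVAS VCF paths and an exon BED, and simply collects the per-sample dicts into a dataframe.

from multiprocessing import Pool

import pandas as pd

vcfs = ['sample1_canvas.vcf.gz', 'sample2_canvas.vcf.gz']      # hypothetical inputs
jobs = [(vcf, 'exons.bed', i) for i, vcf in enumerate(vcfs)]   # (canvasvcf, exonbed, i) as expected by the worker
with Pool(2) as pool:
    rows = pool.map(vcf_to_df_worker, jobs)
df = pd.DataFrame(rows)                                        # one row per sample: SampleKey plus per-gene .avgcn/.medcn
print(df.head())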
def main():
    """
    Third quick example from the documentation -- count reads in introns and
    exons, in parallel
    """
    ap = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
                                 usage=__doc__)
    ap.add_argument('--gff', required=True,
                    help='GFF or GTF file containing annotations')
    ap.add_argument('--bam', required=True,
                    help='BAM file containing reads to be counted')
    ap.add_argument('--stranded', action='store_true',
                    help='Use strand-specific merging and overlap. '
                         'Default is to ignore strand')
    ap.add_argument('--no-parallel', dest='noparallel', action='store_true',
                    help='Disables parallel computation')
    ap.add_argument('-o', '--output',
                    help='Optional file to which results will be written; '
                         'default is stdout')
    ap.add_argument('-v', '--verbose', action='store_true',
                    help='Verbose (goes to stderr)')
    args = ap.parse_args()

    gff = args.gff
    bam = args.bam
    stranded = args.stranded
    parallel = not args.noparallel

    # Some GFF files have invalid entries -- like chromosomes with negative
    # coords or features of length = 0.  This line removes them and saves the
    # result in a tempfile
    g = pybedtools.BedTool(gff).remove_invalid().saveas()

    # Create a pool of 3 worker processes; at most three jobs run at once in
    # the parallel steps below.
    pool = multiprocessing.Pool(processes=3)

    # Get separate files for introns and exons in parallel (if specified)
    featuretypes = ('intron', 'exon')
    introns, exons = pool.map(subset_featuretypes, featuretypes)

    # Perform some genome algebra to get unique and shared regions
    exon_only = exons.subtract(introns).merge().remove_invalid().saveas()
    intron_only = introns.subtract(exons).merge().remove_invalid().saveas()
    intron_and_exon = exons\
            .intersect(introns).merge().remove_invalid().saveas()

    # Do intersections with BAM file in parallel
    features = (exon_only, intron_only, intron_and_exon)
    results = pool.map(count_reads_in_features, features)

    labels = ('      exon only:',
              '    intron only:',
              'intron and exon:')

    for label, reads in zip(labels, results):
        print('%s %s' % (label, reads))

    pybedtools.cleanup(verbose=False)
Example #7
def run_spades_parallel(bam=None, spades=None, bed=None, work=None, pad=SPADES_PAD, nthreads=1, chrs=[],
                        max_interval_size=SPADES_MAX_INTERVAL_SIZE,
                        timeout=SPADES_TIMEOUT, isize_min=ISIZE_MIN, isize_max=ISIZE_MAX,
                        svs_to_assemble=SVS_ASSEMBLY_SUPPORTED,
                        stop_on_fail=False, max_read_pairs=EXTRACTION_MAX_READ_PAIRS):
    pybedtools.set_tempdir(work)

    logger.info("Running SPAdes on the intervals in %s" % bed)
    if not bed:
        logger.info("No BED file specified")
        return None, None

    bedtool = pybedtools.BedTool(bed)
    total = bedtool.count()

    chrs = set(chrs)
    all_intervals = [interval for interval in bedtool] if not chrs else [interval for interval in bedtool if
                                                                         interval.chrom in chrs]
    selected_intervals = filter(partial(should_be_assembled, max_interval_size=max_interval_size, svs_to_assemble=svs_to_assemble),
                                all_intervals)
    ignored_intervals = filter(partial(shouldnt_be_assembled, max_interval_size=max_interval_size, svs_to_assemble=svs_to_assemble),
                               all_intervals)

    pool = multiprocessing.Pool(nthreads)
    assembly_fastas = []
    for i in xrange(nthreads):
        intervals = [interval for (j, interval) in enumerate(selected_intervals) if (j % nthreads) == i]
        kwargs_dict = {"intervals": intervals, "bam": bam, "spades": spades, "work": "%s/%d" % (work, i), "pad": pad,
                       "timeout": timeout, "isize_min": isize_min, "isize_max": isize_max, "stop_on_fail": stop_on_fail,
                       "max_read_pairs": max_read_pairs}
        pool.apply_async(run_spades_single, kwds=kwargs_dict,
                         callback=partial(run_spades_single_callback, result_list=assembly_fastas))

    pool.close()
    pool.join()

    logger.info("Merging the contigs from %s" % (str(assembly_fastas)))
    assembled_fasta = os.path.join(work, "spades_assembled.fa")
    with open(assembled_fasta, "w") as assembled_fd:
        for line in fileinput.input(assembly_fastas):
            assembled_fd.write("%s\n" % (line.strip()))

    if os.path.getsize(assembled_fasta) > 0:
        logger.info("Indexing the assemblies")
        pysam.faidx(assembled_fasta)
    else:
        logger.error("No assembly generated")
        assembled_fasta = None

    ignored_bed = None
    if ignored_intervals:
        ignored_bed = os.path.join(work, "ignored.bed")
        pybedtools.BedTool(ignored_intervals).each(add_breakpoints).saveas(ignored_bed)

    pybedtools.cleanup(remove_all=True)

    return assembled_fasta, ignored_bed
def get_sequence(reference_fasta, coordinates, strand):
    """Takes coordinates and returns sequence
    bed_coor is space separated"""
    bed_coor = pybedtools.BedTool(coordinates, from_string=True)
    fasta = pybedtools.example_filename(reference_fasta)
    seq = bed_coor.sequence(fi=fasta)
    seq_str = open(seq.seqfn, 'r').read()
    pybedtools.cleanup(remove_all=True)
    return seq_str.replace('>', '').split('\n')[0:-1]
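A short sketch of the from_string coordinate format this function expects; the FASTA path is hypothetical, and s=True is shown only to illustrate strand-aware extraction (note that the strand argument of get_sequence above is not actually forwarded to bedtools).

import pybedtools

coords = 'chr1 100 110 site1 0 +'                 # whitespace-separated BED fields
bed = pybedtools.BedTool(coords, from_string=True)
seq = bed.sequence(fi='reference.fa', s=True)     # hypothetical FASTA; s=True reverse-complements minus-strand features
print(open(seq.seqfn).read())
pybedtools.cleanup(remove_all=True)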
Example #9
def getMatching(table, name, exonBed, intermediate_file_dir):
    table.to_csv(os.path.join(intermediate_file_dir, 'junc.bed'), sep='\t', index=None, header=False)
    bed = pybedtools.BedTool(os.path.join(intermediate_file_dir, 'junc.bed'))
    OverlapExon = bed.intersect(exonBed, wb=True)
    OverlapExon.saveas(os.path.join(intermediate_file_dir, 'OverlapExon.tsv'))
    matching = pd.read_csv(os.path.join(intermediate_file_dir, 'OverlapExon.tsv'), sep='\t',
                                  names=[name+'_chr', name+'_start', name+'_end', 'juncID', name+'_strand', name+'_fb',
                                         name+'gene_chromosome', name+'gene_start', name+'gene_end', name+'gene_strand', name+'_gene_id',name+'_gene', name+'gene_type'])
    pybedtools.cleanup(remove_all=True)
    return matching
def bootstrapRandom(num):
    #print(num)
    global file1, file2, genome_chrom_sizes
    shuffled_file1 = file1.shuffle(g=genome_chrom_sizes, chrom=True)
    shuffled_file2 = file2.shuffle(g=genome_chrom_sizes, chrom=True)
    f1_sorted = shuffled_file1.sort()
    f2_sorted = shuffled_file2.sort()
    shuffled_result = f1_sorted.jaccard(f2_sorted)

    pybedtools.cleanup()
    return shuffled_result['jaccard']
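bootstrapRandom reads the module-level globals file1, file2 and genome_chrom_sizes; a possible driver, sketched below with hypothetical inputs, compares the observed Jaccard statistic against the shuffled null distribution (assuming a fork-based multiprocessing start method so the children see the globals).

import multiprocessing

import pybedtools

file1 = pybedtools.BedTool('peaks_a.bed').sort()   # hypothetical peak sets
file2 = pybedtools.BedTool('peaks_b.bed').sort()
genome_chrom_sizes = 'hg38.chrom.sizes'            # bedtools genome file: chrom<TAB>size

observed = file1.jaccard(file2)['jaccard']
with multiprocessing.Pool(4) as pool:
    null = pool.map(bootstrapRandom, range(100))   # 100 shuffled replicates
empirical_p = sum(j >= observed for j in null) / len(null)
print(observed, empirical_p)
pybedtools.cleanup()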
Example #11
def main():
    args = argparser()
    forwardsites = findSites(args.input, args.reference, "+")
    revsites = findSites(args.input, args.reference, "-")
    try:
        with open(args.output,'wb') as of:
            for l in forwardsites + revsites:
                of.write(l + "\n")
    except TypeError:
        for l in forwardsites + revsites:
            args.output.write(l + "\n")
    pybedtools.cleanup(remove_all=True)
Example #13
 def on_epoch_end(self, epoch, logs=None):
     """ monitor PR """
     x_val, y_val = self.validation_data[0], self.validation_data[1]
     predictions = self.model.predict(x_val)
     au_prc = average_precision_score(y_val, predictions)
     print("\nau-PRC:", au_prc)
     self.val_auprc.append(au_prc)
     # Tmp bedfiles take up a huge amount of disk space.
     # Cleaning up after every 5 epochs.
     print(epoch)
     if (epoch+1) % 5 == 0:
         pybedtools.cleanup(verbose=0)
Example #14
def preprocess_bam_to_bed(bam, output):
    '''
    Given a local bam file, convert reads to a set of 101bp intervals and
    output as a bed file. Filter for reads that form valid intervals
    (see is_valid_interval).
    '''
    # convert bam to bed
    vprint("Converting bam to bed...")
    bam = BedTool(bam)
    bed = bam.bam_to_bed()

    # filter intervals
    vprint("Filter reads by size...")
    bed_chunk_iter = bed.to_dataframe(chunksize=10000000)  # chunk large file
    chunks = []
    for chunk in bed_chunk_iter:
        keep = (
            chunk[["start", "end"]]
            .swifter.progress_bar(enable=True, desc=bam)
            .apply(lambda row: is_valid_interval(row["start"], row["end"]), axis=1)
        )

        chunks.append(chunk[keep])

    bed_df = pd.concat(chunks)

    # 101bp interval for input
    vprint("Define 101bp intervals...")
    bed_df["end"] = (
        bed_df["start"].swifter.progress_bar(enable=True).apply(define_interval)
    )
    bed_df["name"] = "-"

    # remove duplicates
    vprint("Drop duplicate intervals...")
    bed_df.drop_duplicates(inplace=True)

    # TODO extraneous chromosomes?
    vprint("Remove extra chromosomes...")
    chromosomes = list(range(1, 23))
    chromosomes.append('X')
    chromosomes.append('Y')
    chromosomes = [f'chr{c}' for c in chromosomes]
    bed_df = bed_df.loc[bed_df['chrom'].isin(chromosomes)]

    # Save result
    vprint(f"Saving {bed_df.shape[0]} intervals...")
    BedTool.from_dataframe(bed_df).moveto(output)

    # cleanup tmp files
    pybedtools.cleanup(remove_all=True)

    vprint("Done.")
Example #15
def find_variant_fragments(query_fp, fragment_fp, output_fp, db_url, table):
    """ Create table of variant fragment IDs.
    """
    fragment_bed = pybedtools.BedTool(fragment_fp)
    desc = 'Finding variant fragments...'
    bar_format = '{desc}: {n_fmt} {unit}'
    t = tqdm(total=0,
             unit='variants',
             desc=desc,
             disable=False,
             bar_format=bar_format)
    chunksize = 200000
    cols = ['chr', 'frag_id', 'id', 'variant_id']
    pybedtools.set_tempdir(os.path.dirname(output_fp))
    db = create_engine(db_url, echo=False)
    idx = 1
    for query_df in pd.read_csv(query_fp,
                                sep='\t',
                                compression='gzip',
                                chunksize=chunksize,
                                usecols=['chr', 'variant_pos', 'variant_id']):
        query_df = query_df.rename(columns={'variant_pos': 'end'})
        query_df['id'] = range(idx, idx + len(query_df))
        query_df['start'] = query_df['end'] - 1
        query_bed = pybedtools.BedTool.from_dataframe(
            query_df[['chr', 'start', 'end', 'id', 'variant_id']])
        df = fragment_bed.intersect(query_bed, wb=True)
        df = df.to_dataframe(names=[
            'frag_chr', 'frag_start', 'frag_end', 'frag_id', 'chrom', 'start',
            'end', 'id', 'variant_id'
        ])
        cols = ['frag_id', 'chrom', 'id']
        mode = 'w' if t.total == 0 else 'a'
        header = t.total == 0
        df[cols].to_csv(output_fp, sep='\t', header=header, mode=mode)
        if table:
            if_exists = 'replace' if t.total == 0 else 'append'
            df[cols].to_sql(table, con=db, if_exists=if_exists, index=False)
        idx += len(query_df)
        t.total += len(query_df)
        t.update(len(query_df))
    t.close()
    pybedtools.cleanup(remove_all=True)
    if not table:
        return
    create_index(table, db)
Example #16
def find_gene_fragments(query_fp, fragment_fp, output_fp, db_url, table):
    """ Create table of gene fragment IDs.
    """
    fragment_bed = pybedtools.BedTool(fragment_fp)
    desc = 'Finding gene fragments...'
    bar_format = '{desc}: {n_fmt} {unit}'
    t = tqdm(total=0,
             unit='genes',
             desc=desc,
             disable=False,
             bar_format=bar_format)
    chunksize = 2000
    pybedtools.set_tempdir(os.path.dirname(output_fp))
    db = create_engine(db_url, echo=False)
    for query_df in pd.read_csv(query_fp,
                                sep='\t',
                                compression='infer',
                                chunksize=chunksize):
        #query_df.columns = ['id', 'chr', 'start', 'end', 'gene', 'gencode_id']
        query_bed = pybedtools.BedTool.from_dataframe(
            query_df[['chrom', 'start', 'end', 'id', 'gencode_id']])
        #    print(query_bed)
        df = fragment_bed.intersect(query_bed, wb=True)
        df = df.to_dataframe(names=[
            'frag_chr', 'frag_start', 'frag_end', 'frag_id', 'chrom', 'start',
            'end', 'id', 'gencode_id'
        ])
        cols = ['frag_id', 'chrom', 'id']
        mode = 'w' if t.total == 0 else 'a'
        header = t.total == 0
        df[cols].to_csv(output_fp,
                        sep='\t',
                        header=header,
                        mode=mode,
                        index=False)
        if table:
            if_exists = 'replace' if t.total == 0 else 'append'
            df[cols].to_sql(table, con=db, if_exists=if_exists, index=False)
        t.total += len(query_df)
        t.update(len(query_df))
    t.close()
    pybedtools.cleanup(remove_all=True)
    if not table:
        return
    create_index(table, db)
Example #17
def retrieve_peaks(peak_file, peak_kwd, group_name, seq_dict):
    peaks = pybedtools.BedTool(peak_file)
    num_files = len(seq_dict[peak_kwd][group_name].keys())
    print_time("{} batches to process".format(num_files), start_time)

    for file in seq_dict[peak_kwd][group_name].keys():
        if __check_exist(file):
            print_time("{} exists -- skip".format(file), start_time)
            continue

        # Initialize output signal.
        sample_index, start, stop = __file_attribute(
            seq_dict[peak_kwd][group_name][file])
        signal = np.empty((stop - start, args.sequence_length + 3))
        signal[:] = np.NaN
        signal[:, -3:-1] = sample_index

        # Find peaks that overlap with each sample sequence.
        for k in tqdm(range(start, stop)):
            ks = k - start
            signal[ks, -1] = k
            signal[ks, :args.sequence_length] = 0
            sample = Sample(seq_dict["input"][k])
            entry = "{} {} {}".format(sample.chrom, sample.start, sample.stop)
            a = pybedtools.BedTool(entry, from_string=True)
            apeaks = a.intersect(peaks)
            for p in apeaks:
                s = p.start - sample.start
                t = p.stop - sample.start
                signal[ks, s:t] = 1
            if (k + 1) % 1000 == 0:
                pybedtools.cleanup(remove_all=True)

        # Save batch data file to disk.
        np.savez_compressed(
            file,
            group_name=group_name,
            peak_type=peak_kwd,
            start=start,
            stop=stop,
            data=signal,
        )
        print_time("{} targets saved in {}".format(peak_kwd, file), start_time)
Example #18
def retrieve_signal(
    peak_file, bigWig_file, seq_dict, tmp_file, group_name, assay_type
):
    peaks = pybedtools.BedTool(peak_file)
    num_samples = len(seq_dict["input"])
    writer = FeatureWriter(
        args.batch_size,
        args.sequence_length,
        num_samples,
        group_name,
        assay_type,
    )

    for k in tqdm(seq_dict["input"].keys()):
        # Initialize signal track.
        signal = np.zeros(args.sequence_length)

        # Construct BedTool input from sample sequence location.
        sample = Sample(seq_dict["input"][k])
        entry = "{} {} {}".format(sample.chrom, sample.start, sample.stop)
        a = pybedtools.BedTool(entry, from_string=True)

        # Retrieve sample bigwig signal that fall within peak regions.
        apeaks = a.intersect(peaks)
        for p in apeaks:
            cmd = "bigWigToBedGraph -chrom={} -start={} -end={} {} {}".format(
                sample.chrom, p.start, p.stop, bigWig_file, tmp_file
            )
            check_call(cmd, shell=True)
            with open(tmp_file, "rb") as wigs:
                for line in wigs:
                    record = line.strip().decode("utf-8").split("\t")
                    s = int(record[1]) - sample.start
                    t = int(record[2]) - sample.start
                    signal[s:t] = float(record[3])
        # Write signal track to disk.
        writer.write_feature(
            signal, k, seq_dict["input"][k][assay_type][group_name]
        )
        # Clean up tmp files generated by pybedtools.
        if (k + 1) % 1000 == 0:
            pybedtools.cleanup(remove_all=True)
    return writer
Example #19
def run_spades_parallel(bam=None, spades=None, bed=None, work=None, pad=SPADES_PAD, nthreads=1, chrs=[], max_interval_size=50000,
                        timeout=SPADES_TIMEOUT, isize_min=ISIZE_MIN, isize_max=ISIZE_MAX, disable_deletion_assembly=False, stop_on_fail=False):
    pybedtools.set_tempdir(work)

    bedtool = pybedtools.BedTool(bed)
    total = bedtool.count()

    chrs = set(chrs)
    all_intervals = [interval for interval in bedtool] if not chrs else [interval for interval in bedtool if
                                                                         interval.chrom in chrs]
    selected_intervals = filter(partial(should_be_assembled, disable_deletion_assembly=disable_deletion_assembly), all_intervals)
    ignored_intervals = filter(partial(shouldnt_be_assembled, disable_deletion_assembly=disable_deletion_assembly), all_intervals)

    pool = multiprocessing.Pool(nthreads)
    assembly_fastas = []
    for i in xrange(nthreads):
        intervals = [interval for (j, interval) in enumerate(selected_intervals) if (j % nthreads) == i]
        kwargs_dict = {"intervals": intervals, "bam": bam, "spades": spades, "work": "%s/%d" % (work, i), "pad": pad,
                       "timeout": timeout, "isize_min": isize_min, "isize_max": isize_max, "stop_on_fail": stop_on_fail}
        pool.apply_async(run_spades_single, kwds=kwargs_dict,
                         callback=partial(run_spades_single_callback, result_list=assembly_fastas))

    pool.close()
    pool.join()

    logger.info("Merging the contigs from %s" % (str(assembly_fastas)))
    assembled_fasta = os.path.join(work, "spades_assembled.fa")
    with open(assembled_fasta, "w") as assembled_fd:
        for line in fileinput.input(assembly_fastas):
            assembled_fd.write("%s\n" % (line.strip()))

    logger.info("Indexing the assemblies")
    pysam.faidx(assembled_fasta)

    ignored_bed = None
    if ignored_intervals:
        ignored_bed = os.path.join(work, "ignored.bed")
        pybedtools.BedTool(ignored_intervals).each(add_breakpoints).saveas(ignored_bed)

    pybedtools.cleanup(remove_all=True)

    return assembled_fasta, ignored_bed
def GetRatioGenome():
    pulldown = pysam.Samfile(args.pulldown)
    control = pysam.Samfile(args.control)
    if args.tot_pulldown is not None:
        tot_pulldown = args.tot_pulldown
        tot_control = args.tot_control
    else:
        tot_pulldown = pulldown.mapped + pulldown.unmapped
        tot_control = control.mapped + control.unmapped
    print >>sys.stderr, "Total number of reads in pulldown sample: %d" % (tot_pulldown)
    print >>sys.stderr, "Total number of reads in control sample: %d" % (tot_control)
    # get spike-in read within bed file
    pulldown_bam = pybedtools.BedTool(args.pulldown)
    control_bam = pybedtools.BedTool(args.control)
    spike_pulldown = pulldown_bam.intersect(args.pos,f=0.5).count()
    spike_control = control_bam.intersect(args.pos,f=0.5).count()
    print >>sys.stderr, "Total number of reads mapped in spike-in in pulldown sample: %d" % (spike_pulldown)
    print >>sys.stderr, "Total number of reads mapped in spike-in in control sample: %d" % (spike_control)
    ratio = float(spike_control)/float(spike_pulldown)*float(tot_pulldown)/float(tot_control)
    print >>sys.stderr, "Ratio is %.6f" % (ratio)
    pulldown.close()
    control.close()
    pybedtools.cleanup()
Example #21
File: cnv.py  Project: xuanblo/jcvi
def vcf_to_df_worker(arg):
    """ Convert CANVAS vcf to a dict, single thread
    """
    canvasvcf, exonbed, i = arg
    logging.debug("Working on job {}: {}".format(i, canvasvcf))
    samplekey = op.basename(canvasvcf).split(".")[0].rsplit('_', 1)[0]
    d = {'SampleKey': samplekey}

    exons = BedTool(exonbed)
    cn = parse_segments(canvasvcf)
    overlaps = exons.intersect(cn, wao=True)
    gcn_store = {}
    for ov in overlaps:
        # Example of ov.fields:
        # [u'chr1', u'11868', u'12227', u'ENSG00000223972.5',
        # u'ENST00000456328.2', u'transcribed_unprocessed_pseudogene',
        # u'DDX11L1', u'.', u'-1', u'-1', u'.', u'0']
        gene_name = "|".join((ov.fields[6], ov.fields[3], ov.fields[5]))
        if gene_name not in gcn_store:
            gcn_store[gene_name] = defaultdict(int)

        cn = ov.fields[-2]
        if cn == ".":
            continue
        cn = int(cn)
        if cn > 10:
            cn = 10
        amt = int(ov.fields[-1])
        gcn_store[gene_name][cn] += amt

    for k, v in sorted(gcn_store.items()):
        v_mean, v_median = counter_mean_and_median(v)
        d[k + ".avgcn"] = v_mean
        d[k + ".medcn"] = v_median
    cleanup()
    return d
Example #22
File: bin.py  Project: wresch/4C
def bin_frag(outdir, bin_bed, all_frag_bed, fragcount_bed):
    logging.info("output directory: %s", outdir)
    logging.info("Bin file: %s", bin_bed)
    logging.info("All restriction fragments bed file: %s", all_frag_bed)
    logging.info("Number of fragdata files: %d", len(fragcount_bed))
    os.mkdir(outdir)

    # open bins file
    bins = pbt.BedTool(bin_bed)
    logging.info("read in %8d bins", len(bins))

    # open all frag file
    all_frag = pbt.BedTool(all_frag_bed)
    logging.info("read in %8d restriction fragments", len(all_frag))

    # match up bins with restriction fragments
    #TODO: stats on the result
    bins_with_any_frag = count_frags_per_bin(bins, all_frag)
    logging.info("bins that contained any fragments: %d", len(bins_with_any_frag))

    make_bedgraph_files(fragcount_bed, bins_with_any_frag, outdir)

    # cleanup
    pbt.cleanup(remove_all = True)
Example #23
def extract_target_genes_transcripts(dicoNiourk):
    if dicoNiourk["target"]!="":
        print("\x1b[0;38;2;"+dicoNiourk["color"]["light1"]+"m") ; sys.stdout.write("\033[F")
        dicoNiourk["spinner"].text = "    • Extract target genes"
        dicoNiourk["spinner"].start()
        # Find intersection between gff and target bed
        bed = BedTool(dicoNiourk["target"])
        genes = BedTool(dicoNiourk["refseq_gff"])
        dicoNiourk["target_gene"] = {} # id to name
        dicoNiourk["target_transcript"] = {} # id to name
        dico_intersect_transcript = {}
        # Search gff exon intersections
        for intersect_elem in genes+bed:
            if intersect_elem.fields[2]=="exon":
                exon = dicoNiourk["db_gff"][intersect_elem.attrs["ID"]]
                # retrieve corresponding transcript
                for rna in dicoNiourk["db_gff"].parents(exon, featuretype='mRNA', order_by='start'):
                    try: dico_intersect_transcript[rna]+=1
                    except: dico_intersect_transcript[rna] = 1
        #***** FILTER transcripts whose coding exons are all in target *****#
        for rna in dico_intersect_transcript:
            # retrieve parent gene
            gene = list(dicoNiourk["db_gff"].parents(rna, featuretype='gene', order_by='start'))[0]
            cds_start = list(dicoNiourk["db_gff"].children(rna, featuretype='CDS', order_by='start'))[0].start
            cds_end = list(dicoNiourk["db_gff"].children(rna, featuretype='CDS', order_by='start'))[-1].end
            # Count coding exon
            nb_coding_exons = 0
            for exon in dicoNiourk["db_gff"].children(rna, featuretype='exon', order_by='start'):
                if exon.end>cds_start and exon.start<cds_end: nb_coding_exons+=1
            # Filter transcripts and genes
            if dico_intersect_transcript[rna]>=nb_coding_exons:
                dicoNiourk["target_transcript"][rna.attributes["Name"][0]] = rna.id
                for gene in dicoNiourk["db_gff"].parents(rna, featuretype='gene', order_by='start'): dicoNiourk["target_gene"][gene.attributes["Name"][0]] = gene.id
        dicoNiourk["spinner"].stop()
        printcolor("    • "+str(len(dicoNiourk["target_gene"]))+" genes / "+str(len(dicoNiourk["target_transcript"]))+" rnas extracted\n","0",dicoNiourk["color"]["light1"],None,dicoNiourk["color"]["bool"])
        cleanup(remove_all=True) # delete created temp file
Example #24
def Jaccard_stats(bed_fname1, bed_fname2, genome):
    """Compute Jaccard index.

    Parameters
    ----------
    bed_fname1 : string
        Name of file with CNV calls from experiment 1 in BED format.
    bed_fname2 : string
        Name of file with CNV calls from experiment 2 in BED format.
    genome : string
        Name of genome file.

    Returns
    -------
    jacc_idx : float
        Jaccard index.
    """

    exp1 = pybedtools.BedTool(bed_fname1).merge()
    exp2 = pybedtools.BedTool(bed_fname2).merge()
    res = exp1.jaccard(exp2, sorted=True, g=genome)
    pybedtools.cleanup(remove_all=True)
    jacc_idx = res['jaccard']
    return jacc_idx
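A hedged usage sketch with hypothetical file names; since the call above passes sorted=True together with a genome file, and merge() also assumes position-sorted input, both BED files should be sorted consistently with that genome file.

jacc = Jaccard_stats('cnv_calls_run1.bed', 'cnv_calls_run2.bed', 'hg38.genome')
print('Jaccard index between the two call sets: %.4f' % jacc)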
Example #25
def MultiThreadRun(index, iboolDict, args, kwargs):
    if iboolDict['bam']:
        tempFile = tempfile.NamedTemporaryFile(suffix='.tmp',
                                               prefix='pybedtools.tempfile',
                                               dir=args.temp,
                                               delete=True)
    if iboolDict['both']:
        peakBed = pybedtools.BedTool(args.bed[index]).sort()
        bamFile = args.bam[index]
        inputBedDict = BamToBed(bamFile, peakBed, tempFile.name, args)
    elif iboolDict['bed']:
        inputBedDict = RebuildBed(args.bed[index], args.method, args.extend)
    else:
        peakBed = None
        bamFile = args.bam[index]
        inputBedDict = BamToBed(bamFile, peakBed, tempFile.name, args)
    ## retrieve bin-value relationships
    sampleName = args.name[index]
    binValDict = RunMetagene(inputBedDict, args, kwargs)
    ## Deletes all temp files from the current session
    pybedtools.cleanup(verbose=False, remove_all=False)
    if iboolDict['bam']:
        tempFile.close()
    return [sampleName, binValDict]
Example #26
def parallel_generate_sc_intervals(bams, chromosomes, skip_bed, workdir, num_threads=1,
                                   min_avg_base_qual=SC_MIN_AVG_BASE_QUAL,
                                   min_mapq=SC_MIN_MAPQ, min_soft_clip=SC_MIN_SOFT_CLIP,
                                   pad=SC_PAD,
                                   min_support_ins=MIN_SUPPORT_INS, min_support_frac_ins=MIN_SUPPORT_FRAC_INS, 
                                   max_intervals=MAX_INTERVALS, max_nm=SC_MAX_NM, min_matches=SC_MIN_MATCHES, 
                                   isize_mean=ISIZE_MEAN, isize_sd=ISIZE_SD,
                                   svs_to_softclip=SVS_SOFTCLIP_SUPPORTED,
                                   overlap_ratio=OVERLAP_RATIO, mean_read_length=MEAN_READ_LENGTH,
                                   mean_read_coverage=MEAN_READ_COVERAGE, min_ins_cov_frac=MIN_INS_COVERAGE_FRAC,
                                   max_ins_cov_frac=MAX_INS_COVERAGE_FRAC):
    func_logger = logging.getLogger(
        "%s-%s" % (parallel_generate_sc_intervals.__name__, multiprocessing.current_process()))

    if not os.path.isdir(workdir):
        func_logger.info("Creating directory %s" % workdir)
        os.makedirs(workdir)

    if not chromosomes:
        func_logger.info("Chromosome list unspecified. Inferring from the BAMs")
        for bam in bams:
            bamfile = pysam.Samfile(bam, "rb")
            chromosomes += list(bamfile.references)
            bamfile.close()
        chromosomes = sorted(list(set(chromosomes)))
        func_logger.info("Chromosome list inferred as %s" % (str(chromosomes)))

    if not chromosomes:
        func_logger.error("Chromosome list empty")
        return None


    merge_max_dist = -int(1 * pad)


    func_logger.info("SVs to soft-clip: %s" % (svs_to_softclip))

    pool = multiprocessing.Pool(num_threads)

    bed_files = []
    for index, (bam, chromosome) in enumerate(itertools.product(bams, chromosomes)):
        process_workdir = os.path.join(workdir, str(index))
        if not os.path.isdir(process_workdir):
            os.makedirs(process_workdir)

        args_list = [bam, chromosome, process_workdir]
        kwargs_dict = {"min_avg_base_qual": min_avg_base_qual, "min_mapq": min_mapq, "min_soft_clip": min_soft_clip,
                       "pad": pad, "min_support_ins": min_support_ins,
                       "min_support_frac_ins": min_support_frac_ins, "max_nm": max_nm, "min_matches": min_matches, 
                       "isize_mean": isize_mean, "isize_sd": isize_sd, "svs_to_softclip": svs_to_softclip, 
                       "merge_max_dist": merge_max_dist, "mean_read_length": mean_read_length,
                       "mean_read_coverage": mean_read_coverage, "min_ins_cov_frac": min_ins_cov_frac,
                       "max_ins_cov_frac": max_ins_cov_frac}
        pool.apply_async(generate_sc_intervals, args=args_list, kwds=kwargs_dict,
                         callback=partial(generate_sc_intervals_callback, result_list=bed_files))

    pool.close()
    pool.join()

    # Remove empty BED files, which can cause merging issues with pybedtools
    bed_files = [bed_file for bed_file in bed_files if os.path.exists(bed_file) and os.path.getsize(bed_file) > 0]

    func_logger.info("Following BED files will be merged: %s" % (str(bed_files)))

    if not bed_files:
        func_logger.warn("No intervals generated")
        return None

    pybedtools.set_tempdir(workdir)
    bedtool = pybedtools.BedTool(bed_files[0])

    for bed_file in bed_files[1:]:
        bedtool = bedtool.cat(pybedtools.BedTool(bed_file), postmerge=False)

    bedtool = bedtool.sort().moveto(os.path.join(workdir, "all_intervals.bed"))
    
    func_logger.info("Selecting the top %d intervals based on normalized read support" % max_intervals)
    top_intervals_all_cols_file = os.path.join(workdir, "top_intervals_all_cols.bed")
    if bedtool.count() <= max_intervals:
        bedtool = bedtool.saveas(top_intervals_all_cols_file)
    else:
        # Sample the top intervals
        top_fraction_cutoff = \
            sorted([find_coverage_frac(interval.fields[7], interval.fields[6]) for interval in bedtool], reverse=True)[
                max_intervals - 1]
        func_logger.info("Normalized read support threshold: %0.3f" % top_fraction_cutoff)
        bedtool = bedtool.filter(lambda x: find_coverage_frac(x.fields[7],x.fields[6]) >= top_fraction_cutoff).moveto(
            top_intervals_all_cols_file)

    # Filter out the extra column added to simplify life later on
    bedtool = bedtool.cut(xrange(6)).saveas(os.path.join(workdir, "top_intervals.bed"))

    interval_bed = os.path.join(workdir, "intervals.bed")
    if skip_bed:
        skip_bedtool = pybedtools.BedTool(skip_bed)
        sc_skip_bed = os.path.join(workdir, "sc_metasv.bed")
        if "INS" in svs_to_softclip:
            skip_bedtool = skip_bedtool.each(partial(add_INS_padding,pad=pad)).saveas(sc_skip_bed)
        nonsc_skip_bed = os.path.join(workdir, "non_sc_metasv.bed")
        func_logger.info(
            "Merging %d features with %d features from %s" % (bedtool.count(), skip_bedtool.count(), skip_bed))
        nonsc_skip_bedtool = skip_bedtool.filter(lambda x: x.name.split(',')[1] not in svs_to_softclip).saveas(nonsc_skip_bed)
        sc_skip_bedtool = skip_bedtool.filter(lambda x: x.name.split(',')[1] in svs_to_softclip).saveas(interval_bed)
        if len(sc_skip_bedtool) > 0:
            bedtool = bedtool.cat(sc_skip_bedtool, postmerge=False)
        bedtool = bedtool.sort()
        bedtool = merge_for_each_sv(bedtool,c="4",o="collapse",svs_to_softclip=svs_to_softclip,
                                  overlap_ratio=overlap_ratio, reciprocal_for_2bp=True, d=merge_max_dist)
        bedtool = bedtool.each(partial(fix_merged_fields,inter_tools=True)).sort().moveto(interval_bed)
        if len(nonsc_skip_bedtool) > 0:
            bedtool = bedtool.cat(nonsc_skip_bedtool, postmerge=False).sort().moveto(interval_bed)
        func_logger.info("After merging with %s %d features" % (skip_bed, bedtool.count()))
    else:
        bedtool = bedtool.saveas(interval_bed)

    pybedtools.cleanup(remove_all=True)

    return bedtool.fn
Example #27
def main():
    opt_parser = ArgumentParser(
        description="Annotate genomic intervals with RefSeq or Ensembl databases.",
        prog="region_analysis.py")
    opt_parser.add_argument('-i', '--input', action='store',
                            help='Input region file must assume the first 3 columns contain (chr, start, end)')
    opt_parser.add_argument('-d', '--database', action='store',
                            help='Choose database: refseq(default) or ensembl',
                            default='refseq')
    opt_parser.add_argument('-r', '--rhead', action='store_true',
                            help='Whether the input file contains column header', default=False)
    opt_parser.add_argument('-g', '--genome', action='store',
                            help='Choose genome: mm10(default)',
                            default='mm10')
    opt_parser.add_argument('-rv', '--RAver', action='store',
                            help='Version of Region Analysis databases, default is the newest',
                            default=None)
    opt_parser.add_argument('-v', '--version', action='store_true',
                            help='Version of Region_Analysis package')
    options = opt_parser.parse_args()
    if options.version:
        sys.stdout.write("Region_Analysis Version: %s\n" %
                         regionanalysis.packageinfo.__version__)
        opt_parser.print_help()
        return 0
    module_dir = os.path.dirname(os.path.realpath(regionanalysis.__file__))
    # db_path = os.path.join(module_dir, "database/")
    input_file_name = options.input
    anno_db = options.database
    rhead = options.rhead
    genome = options.genome
    rv = options.RAver
    if (input_file_name is None) or (len(input_file_name) == 0):
        opt_parser.error(
            "Please assign proper input file!\n--help will show the help information.")
    genome_info = regionanalysis.annotationdb.getAnnoDBPath(
        module_dir, genome, anno_db, rv)
    try:
        if genome_info is None:
            raise SystemExit
        db_path = genome_info["path"]
    except SystemExit:
        if rv is None:
            sys.stderr.write("%s not in the genome database!\n" % genome)
            return 1
        else:
            sys.stderr.write("%s, RAver %s not in the genome database!\n" %
                             (genome, rv))
            return 1

    # create a tmp bed file with index column.
    in_f = file(input_file_name)
    # filter the comment lines
    input_filtered = [
        line for line in in_f if not (line.lstrip().startswith("#") or len(line.strip())==0)]
    # if there is header, store it and remove it from the query BED.
    if rhead:
        headlineL = input_filtered[0].strip().split("\t")
        del input_filtered[0]
    # add index column to the bed lines
    input_indexed = ['%s\t%d\n' % (line.strip(), i)
                     for i, line in enumerate(input_filtered)]
    in_f.close()

    # read all annotations into a dictionary, for the further output.
    anno_bed = os.path.join(
        db_path, genome + "." + anno_db + ".biotype_region_ext.bed")
    try:
        if not os.path.exists(anno_bed):
            raise SystemExit
    except SystemExit:
        sys.stderr.write("%s genome not properly installed!\n" % genome)
        return 1

    # use saveas() to convert the BedTool objects to file-based objects,
    # so they could be used multiple times.
    # When debug, we may use saveas("tss.tmp"), and the output of bedtools
    # could be saved.
    pybedtools.set_tempdir("./")
    anno = pybedtools.BedTool(anno_bed).saveas()
    gd = pybedtools.BedTool(
        os.path.join(db_path, genome + "_geneDesert.bed")).saveas()
    pc = pybedtools.BedTool(
        os.path.join(db_path, genome + "_pericentromere.bed")).saveas()
    st = pybedtools.BedTool(
        os.path.join(db_path, genome + "_subtelomere.bed")).saveas()

    # load the input intervals to be annotated.
    try:
        input_bed = pybedtools.BedTool(
            "".join(input_indexed), from_string=True).saveas()
    except:
        sys.stderr.write("Error in input file! Please check the format!")
        return 1
    list_input = [x.fields[:] for x in input_bed]
    col_no_input = input_bed.field_count()
    # get the midpoint of the intervals.
    # there is a bug in midpoint function of pybedtools 0.6.3, so here an alternative function was used.
    # input_bed_mid = input_bed.each(pybedtools.featurefuncs.midpoint).saveas()
    input_bed_mid = pybedtools.BedTool(
        "".join([regionanalysis.analysis.midpoint(x) for x in input_indexed]), from_string=True).saveas()

    # intersectBed with annotations.
    input_GB = input_bed_mid.intersect(anno, wao=True).saveas()
    list_GB = [x.fields[:] for x in input_GB]
    input_gd = input_bed_mid.intersect(gd, c=True, f=0.5).saveas()
    list_gd = [x.fields[col_no_input + 0] for x in input_gd]
    input_pc = input_bed_mid.intersect(pc, c=True, f=0.5).saveas()
    list_pc = [x.fields[col_no_input + 0] for x in input_pc]
    input_st = input_bed_mid.intersect(st, c=True, f=0.5).saveas()
    list_st = [x.fields[col_no_input + 0] for x in input_st]

    # groupby the intersectBed results based on the index column.
    input_idx = key = lambda s: s[col_no_input - 1]
    GB_dict = {}
    for key, GB_hits in groupby(list_GB, key=input_idx):
        GB_dict[key] = list(v for v in GB_hits)

    output_file_best = file(input_file_name + ".annotated", "w")
    output_file = file(input_file_name + ".full.annotated", "w")
    output_file_json = file(input_file_name + ".full.annotated.json", "w")
    # Output the header.
    if rhead:
        output_file.write("\t".join(
            headlineL + ["GName", "TName", "Strand", "TSS", "TES", "Feature", "D2TSS", "Biotype", "GeneSymbol"]) + "\n")
        output_file_best.write("\t".join(
            headlineL + ["GName", "TName", "Strand", "TSS", "TES", "Feature", "D2TSS", "Biotype", "GeneSymbol"]) + "\n")
    # write to the output: input.bed.annotated, input.bed.full.annotated.
    json_dict = {}
    for i in range(0, len(input_bed)):
        output_lineL = list_input[i][:-1]  # original input line
        json_dict[str(i)] = {}
        json_dict[str(i)]["query_interval"] = output_lineL
        formatted, best_hit = regionanalysis.analysis.getBestHit(
            anno_db, col_no_input, GB_dict[str(i)], list_gd[i], list_st[i], list_pc[i])
        output_file_best.write("\t".join(output_lineL + best_hit) + "\n")
        json_dict[str(i)]["best_hit"] = best_hit
        for j in formatted:
            output_file.write("\t".join(output_lineL + j) + "\n")
        json_dict[str(i)]["all_hits"] = formatted
    output_file_best.close()
    output_file.close()
    json.dump(json_dict, output_file_json, sort_keys=True, indent=2)
    output_file_json.close()
    pybedtools.cleanup()
    return 0
Example #28
def teardown():
    pybedtools.cleanup(remove_all=True)
Example #29
def main():
    
    parser=OptionParser()
    
    parser.add_option('-i', '--inputBAM', dest='inputBAM', 
                      help='Aligned BAM from zUMI filtering+mapping steps with cell barcode and umi barcode correction.')
    
    parser.add_option('-c', '--config', dest='config', 
                      help='A configuration file for required files and parameters.')
    
    parser.add_option('-e', '--experiment', dest='experiment', 
                      help='Experiment name.')
    
    parser.add_option('-o', '--outputDir', dest='outputDir', default='ss3rnaseq',
                      help='The output directory for the experiment.')
    
    parser.add_option('-p', '--process', dest='process', default=8,
                      help='The number of processes for parallel computing.')
    
    parser.add_option('-s', '--species', dest='species', default='hg38',
                      help='The species under study.')
    
    parser.add_option("-P", "--Preprocess", action="store_true", dest='preprocess',
                      help="Preprocess the input BAM for downstream analysis.")
    
    parser.add_option("-Q", "--Quantification", action="store_true", dest='quantification',
                      help="Run isoform reconstruction and quantification.")


    (op, args) = parser.parse_args()
    inputBAM = op.inputBAM
    conf = op.config
    experiment = op.experiment
    outdir = op.outputDir
    nprocess = int(op.process)

    if op.species == 'hg38' or op.species == 'hg19': species = 'hsa'
    elif op.species == 'mm9' or op.species == 'mm10': species = 'mmu'
    
    config = configparser.ConfigParser()
    config.read(conf)
    conf_data = config._sections
    
    if not os.path.exists(outdir): os.makedirs(outdir)
    if not os.path.exists('%s/%s' %(outdir, species)): os.makedirs('%s/%s' %(outdir, species))
    if not os.path.exists('%s/%s/%s' %(outdir, species, experiment)): os.makedirs('%s/%s/%s' %(outdir, species, experiment))
    
    umi_file_prefix = 'UBfix.sort.bam'
    if op.preprocess:
        print('Preprocessing on input BAM ...')
        preDir = os.path.join(outdir, species, experiment, "preprocess")
        if not os.path.exists(preDir): os.makedirs(preDir)
        
        cmd = 'samtools sort -m %s -O bam -@ %s -o %s/%s %s' %(conf_data['preprocess']['memory'], nprocess, preDir, re.sub(umi_file_prefix,'UBfix.coordinateSorted.bam',os.path.basename(inputBAM)), inputBAM)
        os.system(cmd)
        
        cmd = 'samtools view -b -q 255 %s/%s > %s/%s' %(preDir, re.sub(umi_file_prefix,'UBfix.coordinateSorted.bam',os.path.basename(inputBAM)), preDir, re.sub(umi_file_prefix,'UBfix.coordinateSorted_unique.bam',os.path.basename(inputBAM)))
        os.system(cmd)
        
        cmd = 'samtools view -h %s/%s | awk \'$12 != "NH:i:1"\' | samtools view -bS - > %s/%s' %(preDir, re.sub(umi_file_prefix,'UBfix.coordinateSorted.bam',os.path.basename(inputBAM)), preDir, re.sub(umi_file_prefix,'UBfix.coordinateSorted_multi.bam',os.path.basename(inputBAM)))
        os.system(cmd)
        
        os.system('samtools index %s/%s' %(preDir, re.sub(umi_file_prefix,'UBfix.coordinateSorted_unique.bam',os.path.basename(inputBAM))))
        os.system('samtools index %s/%s' %(preDir, re.sub(umi_file_prefix,'UBfix.coordinateSorted_multi.bam',os.path.basename(inputBAM))))

    if op.quantification:
        
        print('Collect informative reads per gene...')
        in_bam_uniq = '%s/%s' %(os.path.join(outdir, species, experiment, "preprocess"), re.sub(umi_file_prefix,'UBfix.coordinateSorted_unique.bam',os.path.basename(inputBAM)))
        in_bam_multi = '%s/%s' %(os.path.join(outdir, species, experiment, "preprocess"), re.sub(umi_file_prefix,'UBfix.coordinateSorted_multi.bam',os.path.basename(inputBAM)))
    
        out_path = os.path.join(outdir, species, experiment, "expression_%s" %(conf_data['annotation']['gtf_source']))
        if not os.path.exists(out_path): os.makedirs(out_path)
        
        sys_tmp_dir = '%s/.tmp' %(out_path)
        if not os.path.exists(sys_tmp_dir): os.makedirs(sys_tmp_dir)
        pybedtools.set_tempdir(sys_tmp_dir)
        pybedtools.cleanup(remove_all=True)
        
        fetch_gene_reads(in_bam_uniq, in_bam_multi, conf_data, op.species, out_path)
        
        print('Build reference isoforms...')
        ref = build_reference(conf_data, out_path)
        
        print('Start isoform reconstruction and quantification...')
        get_isoforms(conf_data, out_path, ref)
def generate_sc_intervals(bam, chromosome, workdir, min_avg_base_qual=SC_MIN_AVG_BASE_QUAL, min_mapq=SC_MIN_MAPQ, min_soft_clip=SC_MIN_SOFT_CLIP,
                          max_soft_clip=SC_MAX_SOFT_CLIP, pad=SC_PAD, min_support=MIN_SUPPORT, max_isize=1000000000,
                          min_support_frac=MIN_SUPPORT_FRAC):
    func_logger = logging.getLogger("%s-%s" % (generate_sc_intervals.__name__, multiprocessing.current_process()))

    if not os.path.isdir(workdir):
        func_logger.error("Working directory %s doesn't exist" % workdir)
        return None

    func_logger.info("Generating candidate insertion intervals from %s for chromsome %s" % (bam, chromosome))
    pybedtools.set_tempdir(workdir)

    unmerged_intervals = []
    start_time = time.time()
    try:
        sam_file = pysam.Samfile(bam, "rb")
        for aln in sam_file.fetch(reference=chromosome):
            if abs(aln.tlen) > max_isize:
                continue
            if not is_good_candidate(aln, min_avg_base_qual=min_avg_base_qual, min_mapq=min_mapq,
                                     min_soft_clip=min_soft_clip, max_soft_clip=max_soft_clip): continue
            interval = get_interval(aln, pad=pad)
            soft_clip_location = sum(interval) / 2
            strand = "-" if aln.is_reverse else "+"
            name = "%d,%s" % (soft_clip_location, strand)
            unmerged_intervals.append(
                pybedtools.Interval(chromosome, interval[0], interval[1], name=name, score="1", strand=strand))

        if not unmerged_intervals:
            sam_file.close()
            func_logger.warn("No intervals generated")
            return None

        unmerged_bed = os.path.join(workdir, "unmerged.bed")
        bedtool = pybedtools.BedTool(unmerged_intervals).sort().moveto(unmerged_bed)
        func_logger.info("%d candidate reads" % (bedtool.count()))

        merged_bed = os.path.join(workdir, "merged.bed")
        bedtool = bedtool.merge(c="4,5", o="collapse,sum", d=-500).moveto(merged_bed)
        func_logger.info("%d merged intervals" % (bedtool.count()))

        filtered_bed = os.path.join(workdir, "filtered_merged.bed")
        bedtool = bedtool.filter(lambda x: int(x.score) >= min_support).each(partial(merged_interval_features, bam_handle=sam_file)).moveto(
            filtered_bed)
        func_logger.info("%d filtered intervals" % (bedtool.count()))

        # Now filter based on coverage
        coverage_filtered_bed = os.path.join(workdir, "coverage_filtered_merged.bed")
        bedtool = bedtool.filter(lambda x: float(x.fields[6]) * min_support_frac <= float(x.score)).moveto(coverage_filtered_bed)
        func_logger.info("%d coverage filtered intervals" % (bedtool.count()))

        sam_file.close()
    except Exception as e:
        func_logger.error('Caught exception in worker thread')

        # This prints the type, value, and stack trace of the
        # current exception being handled.
        traceback.print_exc()

        print()
        raise e

    pybedtools.cleanup(remove_all=True)
    func_logger.info("Generated intervals in %g seconds" % (time.time() - start_time))

    return coverage_filtered_bed
Example #31
import pybedtools

pybedtools.set_tempdir('.')
a = pybedtools.BedTool('a.bed')
a.intersect(a)
pybedtools.cleanup(verbose=True)
Example #32
def teardown_module():
    if os.path.exists(test_tempdir):
        os.system('rm -r %s' % test_tempdir)
    pybedtools.cleanup()
def main():
    # READING ARGUMENTS
    parser = argparse.ArgumentParser(
        description='Arguments for Activity from Contacts')

    parser.add_argument('--enhancers',
                        type=str,
                        help='Enhancer BedFile with the activity reported')
    parser.add_argument('--tss', type=str, help='TSS BedFile')
    parser.add_argument('--promoters',
                        type=str,
                        help='activity for promoter regions')

    parser.add_argument(
        '--hic',
        type=str,
        help='Hi-C regularized counts',
        default='../data/external/K562_filteredRegularized_contactCount.tsv')
    parser.add_argument('--bincoord',
                        type=str,
                        help='Coordinates for bins',
                        default='../data/external/K562_filteredBins.bed')
    parser.add_argument('--chain',
                        type=str,
                        help='Chain file for coordinate liftover',
                        default='../data/external/hg38ToHg19.over.chain')

    parser.add_argument('--chromap',
                        type=str,
                        help='Chromosome mapping file',
                        default='../data/external/GRCh37_UCSC2ensembl.txt')

    parser.add_argument('-p',
                        type=int,
                        help='Cores to use during processing',
                        default=1)
    parser.add_argument('--scaler',
                        type=int,
                        help='Values to multiply for re-scaling',
                        default=100)
    parser.add_argument('--closer',
                        type=int,
                        help='Cutoff for enhancer vicinity',
                        default=5_000_000)
    parser.add_argument('--gamma',
                        type=float,
                        help='Gamma powerlaw parameter',
                        default=-0.7764771175681618)
    parser.add_argument('--scaleparam',
                        type=float,
                        help='Scale powerlaw parameter',
                        default=10.787505424121239)
    parser.add_argument('--mindist',
                        type=int,
                        help='Minimum distance for powerlaw',
                        default=1_000_000)
    parser.add_argument('--promlength',
                        type=int,
                        help='Promoter length',
                        default=500)
    parser.add_argument('--cutoff',
                        type=int,
                        help='Cutoff probability',
                        default=0)
    parser.add_argument('--outfile', type=str, help='Output file name')

    args = parser.parse_args()

    # ASSIGNING ARGUMENTS TO VARIABLES
    enhancer_bedfile = args.enhancers
    tss_bedfile = args.tss
    promoters_bedfile = args.promoters
    hic_file = args.hic

    num_cores = args.p
    chromosome_mapping = args.chromap
    coord_conversion = args.chain
    filtered_coords = args.bincoord
    SCALER_VALUE = args.scaler
    CLOSER_CUTOFF = args.closer
    SCALE_PL = args.scaleparam
    GAMMA_PL = args.gamma
    DISTMIN_PL = args.mindist
    PROMOTER_LEN = args.promlength
    CUTOFF_POSITIVE = args.cutoff
    output = args.outfile
    '''
    Reading file from input
    '''
    print('Reading files...')
    tss_df = pd.read_csv(
        tss_bedfile,
        sep='\t',
        header=None,
        names=['chr', 'start', 'end', 'gene_id', 'score', 'strand'])
    promoters_df = pd.read_csv(promoters_bedfile, sep='\t')
    enhancer_df = pd.read_csv(enhancer_bedfile, sep='\t')

    # For some reason, the indexing is faster when read from csv.
    hic_counts = pd.read_csv(hic_file, sep='\t')

    filtered_bins = pybedtools.BedTool(filtered_coords)

    lift_file = LiftOver(coord_conversion)
    chromap_file = pd.read_csv(chromosome_mapping,
                               sep='\t',
                               header=None,
                               names=['chromosome', 'ensembl_chr'],
                               index_col='chromosome')
    try:
        enhancer_df = enhancer_process(enhancer_info=enhancer_df,
                                       filtered_bincoord=filtered_bins,
                                       coordinate_liftover=lift_file,
                                       chromosome_relation=chromap_file)

        tss_df = tss_process(tss_info=tss_df,
                             filtered_bincoord=filtered_bins,
                             coordinate_liftover=lift_file,
                             chromosome_relation=chromap_file)

        tss_enhancer_intersected = intersect_elements(
            tss_intersect=tss_df,
            enhancer_intersect=enhancer_df,
            closer_value=CLOSER_CUTOFF)

        rescaled_data = rescale_rows(processed_df=tss_df,
                                     s_value=SCALER_VALUE,
                                     regularized_counts=hic_counts,
                                     num_p=num_cores)

        denom_dict, tss_enhancer_newinf = calculate_denominator(
            enhancer_tss_info=tss_enhancer_intersected,
            promoter_info=promoters_df,
            scaled_counts=rescaled_data,
            gamma_powerlaw=GAMMA_PL,
            scale_powerlaw=SCALE_PL,
            s_value=SCALER_VALUE,
            distance_min=DISTMIN_PL,
            promoter_length=PROMOTER_LEN,
            num_p=num_cores)

        calculate_abc(enhancer_tss_info=tss_enhancer_newinf,
                      denominator_values=denom_dict,
                      num_p=num_cores,
                      positive_cutoff=CUTOFF_POSITIVE,
                      output_name=output)

    finally:
        pybedtools.cleanup()
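
# The three power-law arguments above (--gamma, --scaleparam, --mindist) are the
# ingredients of a distance-based contact estimate in ABC-style models. A minimal
# sketch of that estimate, assuming the common log-linear form; the function name
# and the clamping at the minimum distance are illustrative assumptions, not code
# from this script.
import math

def powerlaw_contact(distance_bp,
                     gamma=-0.7764771175681618,
                     scale=10.787505424121239,
                     min_dist=1_000_000):
    """Estimated contact frequency between two loci separated by distance_bp."""
    d = max(abs(distance_bp), min_dist)
    return math.exp(scale + gamma * math.log(d))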
示例#34
0
def teardown_module():
    if os.path.exists(tempdir):
        os.system("rm -r %s" % tempdir)
    pybedtools.cleanup()
示例#35
0
def main():
    """
    Third quick example from the documentation -- count reads in introns and
    exons, in parallel
    """
    ap = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
                                 usage=__doc__)
    ap.add_argument('--gff',
                    required=True,
                    help='GFF or GTF file containing annotations')
    ap.add_argument('--bam',
                    required=True,
                    help='BAM file containing reads to be counted')
    ap.add_argument('--stranded',
                    action='store_true',
                    help='Use strand-specific merging and overlap. '
                    'Default is to ignore strand')
    ap.add_argument('--no-parallel',
                    dest='noparallel',
                    action='store_true',
                    help='Disables parallel computation')
    ap.add_argument('-o',
                    '--output',
                    help='Optional file to which results will be written; '
                    'default is stdout')
    ap.add_argument('-v',
                    '--verbose',
                    action='store_true',
                    help='Verbose (goes to stderr)')
    args = ap.parse_args()

    gff = args.gff
    bam = args.bam
    stranded = args.stranded
    parallel = not args.noparallel

    # Some GFF files have invalid entries -- like chromosomes with negative
    # coords or features of length = 0.  This line removes them and saves the
    # result in a tempfile
    g = pybedtools.BedTool(gff).remove_invalid().saveas()

    # Decide which version of map to use.  If parallel, we only need 3
    # processes.
    pool = multiprocessing.Pool(processes=3)

    # Get separate files for introns and exons in parallel (if specified)
    featuretypes = ('intron', 'exon')
    introns, exons = pool.map(subset_featuretypes, featuretypes)

    # Perform some genome algebra to get unique and shared regions
    exon_only = exons.subtract(introns).merge().remove_invalid().saveas()
    intron_only = introns.subtract(exons).merge().remove_invalid().saveas()
    intron_and_exon = exons\
            .intersect(introns).merge().remove_invalid().saveas()

    # Do intersections with BAM file in parallel
    features = (exon_only, intron_only, intron_and_exon)
    results = pool.map(count_reads_in_features, features)

    labels = ('      exon only:', '    intron only:', 'intron and exon:')

    for label, reads in zip(labels, results):
        print('%s %s' % (label, reads))

    pybedtools.cleanup(verbose=False)
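
# The example above relies on helper functions defined elsewhere in the same
# pybedtools recipe. A minimal sketch of what they are assumed to look like,
# provided the annotation BedTool `g` and the BAM path `bam` are visible to the
# worker processes (e.g. as module-level globals); not the original code.
def featuretype_filter(feature, featuretype):
    """Keep only GFF features of the requested type (column 3)."""
    return feature[2] == featuretype

def subset_featuretypes(featuretype):
    """Subset the cleaned annotation `g` down to a single feature type."""
    result = g.filter(featuretype_filter, featuretype).saveas()
    return pybedtools.BedTool(result.fn)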
示例#36
0
def count_reads_in_features(features):
    """
    Callback function to count reads in features.
    """
    return pybedtools.BedTool(bam).intersect(features,
                                             bed=True,
                                             stream=True).count()


    # Decide which version of map to use.  If parallel, we only need 3
    # processes.
    pool = multiprocessing.Pool(processes=3)

    # Get separate files for introns and exons in parallel (if specified)
    featuretypes = ('intron', 'exon')
    introns, exons = pool.map(subset_featuretypes, featuretypes)

    # Perform some genome algebra to get unique and shared regions
    exon_only       = exons.subtract(introns).merge().remove_invalid().saveas()
    intron_only     = introns.subtract(exons).merge().remove_invalid().saveas()
    intron_and_exon = exons.intersect(introns).merge().remove_invalid().saveas()

    # Do intersections with BAM file in parallel
    features = (exon_only, intron_only, intron_and_exon)
    results = pool.map(count_reads_in_features, features)

    labels = ('      exon only:',
              '    intron only:',
              'intron and exon:')

    for label, reads in zip(labels, results):
        print '%s %s' % (label, reads)

    pybedtools.cleanup(verbose=False)

示例#37
0
def mergeVCFfiles(chrom, A, B, file1, file2):

    ## Write headers to a file. Needed later for creating a correct VCF of the intersected files. 
    header1 = gzip.open((file1+chrom+'.vcf.gz'), 'r')
    header2 = gzip.open((file2+chrom+'.vcf.gz'), 'r')

    headerFile = (path+'HEADER_'+A+'_'+B+'_.vcf.gz')
    f = gzip.open(headerFile, 'ab')

    for line in header1:
        if line[0:2] != '##':
            break
        f.write(line)
    header1.close()

    for line in header2:
        if line[0:2] != '##':
            break
        f.write(line)
    header2.close()

    f.write('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n')
    f.close()

    ## Intersects files -LOJ

    a = BedTool((file1+chrom+'.vcf.gz'))
    b = BedTool((file2+chrom+'.vcf.gz'))

    a_b = a.intersect(b, header=True,loj=True)
    ab  = a_b.saveas((path+'LOJ_'+A+'_'+B+'_chr'+chrom+'.vcf.gz'))

    print (ab.fn)
    cleanup(remove_all=True)

    ## Intersects files -V

    a = BedTool((file1+chrom+'.vcf.gz'))
    b = BedTool((file2+chrom+'.vcf.gz'))

    b_a = b.intersect(a, header=True, v=True)
    ba  = b_a.saveas((path+'V_'+A+'_'+B+'_chr'+chrom+'.vcf.gz'))

    print (ba.fn)
    cleanup(remove_all=True)


    ## CAT LOJ file and -v File

    LOJ = path+'LOJ_'+A+'_'+B+'_chr'+chrom+'.vcf.gz'
    V = path+'V_'+A+'_'+B+'_chr'+chrom+'.vcf.gz'
    out = path+'concated_'+A+'_'+B+'_chr'+chrom+'.vcf.gz'

    os.system(("vcf-concat "+LOJ+" "+V+" | gzip -c > "+out))
    #print "ok"



    ## correct to vcf, merge equal type samples
    out2 = 'stage2/unsorted_'+A+'_'+B+'_chr'+chrom+'.vcf.gz'

    import correctToVCF
    goVCF = correctToVCF.CorrectToVCF()
    goVCF.writeHeader(headerFile, out2)
    goVCF.correctFile(out, out2)

    ## sort the VCF file

    os.system(("vcf-sort "+out2+"| gzip -c > "+("germlineVCF/"+A+'_'+B+'_chr'+chrom+'.vcf.gz')))

    cleanup(remove_all=True)
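
# Hypothetical driver for the function above (sample labels, path prefixes and
# the chromosome list are made up for illustration): the file arguments are
# prefixes that mergeVCFfiles() completes with '<chrom>.vcf.gz'.
if __name__ == '__main__':
    for chrom in [str(c) for c in range(1, 23)] + ['X', 'Y']:
        mergeVCFfiles(chrom, 'tumor', 'normal', 'calls/tumor_chr', 'calls/normal_chr')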
示例#38
0
def teardown():
    pybedtools.cleanup(remove_all=True)
示例#39
0
# special case of deletions - if the deleted area ("ispan") is a TE, more likely a TE insertion in the reference
possible_te_reference_insertions = long_indel_intra_calls.pair_to_bed(tes, type="ispan", f=.75).saveas()
filtered_possible_te_reference_insertions = possible_te_reference_insertions.filter(bedpe_reciprocal_overlap_ispan_filter, 0.75).saveas()
print "POSSIBLE_TE_INSERTIONS_IN_REFERENCE\tALL\t" + str(len(filtered_possible_te_reference_insertions))
log.write("POSSIBLE_TE_INSERTIONS_IN_REFERENCE\tALL\t" + str(len(filtered_possible_te_reference_insertions)) + "\n")
save_output(master_out_bed, filtered_possible_te_reference_insertions, output_dir, "possible_te_reference_insertions", sample_name, "POSSIBLE_TE_INSERTIONS_IN_REFERENCE", seg_dups, cent_tel)

# deletions that match population variants
if common_deletions is not None:
    common_deletions = long_indel_intra_calls.pair_to_bed(common_deletions, type="ispan", f=common_deletion_overlap_pct).saveas()
    filtered_possible_common_deletions = common_deletions.filter(bedpe_reciprocal_overlap_ispan_filter, common_deletion_overlap_pct).saveas()
    print "COMMON_DELETIONS\tALL\t" + str(len(filtered_possible_common_deletions))
    log.write("COMMON_DELETIONS\tALL\t" + str(len(filtered_possible_common_deletions)) + "\n")
    save_output(master_out_bed, filtered_possible_common_deletions, output_dir, "possible_common_deletions", sample_name, "COMMON_DELETIONS", seg_dups, cent_tel)

# insertions are shorter than the fragment size
short_indel_intra_calls = expected_orientation.filter(bedpe_lt_length_filter, insert_size).saveas()
print "INSERTIONS\tALL\t" + str(len(short_indel_intra_calls))
log.write("INSERTIONS\tALL\t" + str(len(short_indel_intra_calls)) + "\n")
save_output(master_out_bed, short_indel_intra_calls, output_dir, "insertions", sample_name, "INSERTIONS", seg_dups, cent_tel)

# inversions are what's left
unexpected_orientation = intra_calls.filter(expected_orientation_filter, matches=False).saveas()
print "INVERSION\tALL\t" + str(len(unexpected_orientation))
log.write("INVERSION\tALL\t" + str(len(unexpected_orientation)) + "\n")
save_output(master_out_bed, unexpected_orientation, output_dir, "inversions", sample_name,  "INVERSIONS", seg_dups, cent_tel)

pybedtools.cleanup()
log.close()
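
# The BEDPE filter callbacks used above (bedpe_lt_length_filter and friends) are
# defined earlier in the original script. A rough sketch of the length filter, as
# an assumption about its behaviour: a record passes when the inner span between
# the two read anchors (end of block 1 to start of block 2) is shorter than the
# given length.
def bedpe_lt_length_filter(feature, max_length):
    inner_span = int(feature.fields[4]) - int(feature.fields[2])
    return inner_span < max_length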
示例#40
0
def generate_sc_intervals(bam, chromosome, workdir, min_avg_base_qual=SC_MIN_AVG_BASE_QUAL, min_mapq=SC_MIN_MAPQ,
                          min_soft_clip=SC_MIN_SOFT_CLIP,
                          pad=SC_PAD, min_support_ins=MIN_SUPPORT_INS, max_considered_isize=1000000000, 
                          min_support_frac_ins=MIN_SUPPORT_FRAC_INS, max_nm=SC_MAX_NM, min_matches=SC_MIN_MATCHES, 
                          isize_mean=ISIZE_MEAN, isize_sd=ISIZE_SD, svs_to_softclip=SVS_SOFTCLIP_SUPPORTED,
                          overlap_ratio=OVERLAP_RATIO,merge_max_dist=-int(1*SC_PAD), 
                          mean_read_length=MEAN_READ_LENGTH, mean_read_coverage=MEAN_READ_COVERAGE, 
                          min_ins_cov_frac=MIN_INS_COVERAGE_FRAC, max_ins_cov_frac=MAX_INS_COVERAGE_FRAC,
                          num_sd = 2):
    func_logger = logging.getLogger("%s-%s" % (generate_sc_intervals.__name__, multiprocessing.current_process()))

    if not os.path.isdir(workdir):
        func_logger.error("Working directory %s doesn't exist" % workdir)
        return None

    func_logger.info("Generating candidate intervals from %s for chromsome %s" % (bam, chromosome))
    pybedtools.set_tempdir(workdir)

    
    min_isize = isize_mean - num_sd * isize_sd
    max_isize = isize_mean + num_sd * isize_sd

    unmerged_intervals = []
    start_time = time.time()
    ignore_none = True
    try:
        sam_file = pysam.Samfile(bam, "rb")
        for aln in sam_file.fetch(reference=str(chromosome)):
            if abs(aln.tlen) > max_considered_isize:
                continue
            if not is_good_candidate(aln, min_avg_base_qual=min_avg_base_qual, min_mapq=min_mapq,
                                     min_soft_clip=min_soft_clip, max_nm=max_nm,
                                     min_matches=min_matches): continue
            interval = get_interval(aln, pad=pad)
            soft_clip_location = sum(interval) / 2
            strand = "-" if aln.is_reverse else "+"
            svtype = infer_svtype(aln, min_isize, max_isize)
            
            if svtype == "CTX;INS":
                # TODO : Should be fixed to handle CTX
                svtype = "INS"
            
            if svtype == "DUP;ITX":
                # TODO : Should be fixed to handle ITX
                svtype = "DUP"

            soft_clip_tuple = find_softclip(aln)
            if not soft_clip_tuple:    
                continue
            soft_clip, dist_L_end, dist_R_end = soft_clip_tuple 
            other_bp = find_other_bp(aln,isize_mean, svtype, soft_clip, dist_L_end,
                                                 dist_R_end, soft_clip_location)
            if other_bp is None: continue

            name = "%d,%d,%s" % (soft_clip_location, other_bp, strand)
            if ignore_none and svtype == "NONE":
                continue
            if svtype not in svs_to_softclip:
                continue

            unmerged_intervals.append(
                pybedtools.Interval(chromosome, interval[0], interval[1], name=name, score="1", strand=strand, otherfields=[svtype]))

        if not unmerged_intervals:
            sam_file.close()
            func_logger.warn("No intervals generated")
            return None

        unmerged_bed = os.path.join(workdir, "unmerged.bed")
        bedtool = pybedtools.BedTool(unmerged_intervals).sort().moveto(unmerged_bed)
        func_logger.info("%d candidate reads" % (bedtool.count()))

        bedtool_lr={"L":bedtool.filter(lambda x: int(x.name.split(",")[0])<=int(x.name.split(",")[1])).sort(),
                    "R":bedtool.filter(lambda x: int(x.name.split(",")[0])>int(x.name.split(",")[1])).sort()}
        bp_merged_intervals = []
        for k_bt,bt in bedtool_lr.iteritems():
            merged_bed = os.path.join(workdir, "merged_%s.bed"%k_bt)
            m_bt=merge_for_each_sv(bt,c="4,5,6,7",o="collapse,sum,collapse,collapse",
                                        svs_to_softclip=svs_to_softclip,d=merge_max_dist,
                                        reciprocal_for_2bp=False, sv_type_field = [6,0])
            m_bt = m_bt.moveto(merged_bed)
            func_logger.info("%d merged intervals with left bp support" % (m_bt.count()))

            # Check if the other break point also can be merged for the merged intervals (for 2bp SVs)
            for interval in m_bt:
                sv_type = interval.fields[6].split(',')[0]
                if len(set(interval.fields[6].split(',')))!=1:
                    func_logger.warn("More than one svtypes: %s",(str(interval)))
                if  sv_type == "INS":
                    bp_merged_intervals.append(interval)
                else:
                    name_fields_0 = interval.name.split(',')
                    other_bps = map(lambda x:int(name_fields_0[3*x+1]), range(len(name_fields_0)/3))
                    if (min(other_bps)+2*pad-max(other_bps))>(-merge_max_dist):
                        bp_merged_intervals.append(interval)
                        continue

                    other_bp_bedtool=bt.filter(lambda x: x.name in interval.name and x.fields[6]==sv_type).each(partial(generate_other_bp_interval,pad=pad)).sort().merge(c="4,5,6,7", o="collapse,sum,collapse,collapse", d=merge_max_dist)
                    if len(other_bp_bedtool)==1:
                        bp_merged_intervals.append(interval)
                    else:
                        for intvl in other_bp_bedtool:
                            bp_merged_intervals.extend(bt.filter(lambda x: x.name in intvl.name and x.fields[6]==sv_type).sort().merge(c="4,5,6,7", o="collapse,sum,collapse,collapse", d=merge_max_dist))
                    
        bp_merged_bed = os.path.join(workdir, "bp_merged.bed")
        bedtool=pybedtools.BedTool(bp_merged_intervals).each(partial(add_other_bp_fields,pad=pad)).sort().moveto(bp_merged_bed)       
        func_logger.info("%d BP merged intervals" % (bedtool.count()))

        filtered_bed = os.path.join(workdir, "filtered.bed")
        bedtool = bedtool.filter(lambda x: int(x.score) >= MIN_SUPPORT_SC_ONLY).each(
            partial(merged_interval_features, bam_handle=sam_file)).moveto(
            filtered_bed)
        func_logger.info("%d filtered intervals" % (bedtool.count()))
        
        # Now filter based on coverage
        coverage_filtered_bed = os.path.join(workdir, "coverage_filtered.bed")
        bedtool = bedtool.filter(lambda x: (x.fields[3].split(",")[1]!="INS" or 
                                           ((min_ins_cov_frac*mean_read_coverage)<=(float(x.fields[6])/abs(x.start-x.end+1)*mean_read_length)<=(max_ins_cov_frac*mean_read_coverage)))).moveto(coverage_filtered_bed)
        func_logger.info("%d coverage filtered intervals" % (bedtool.count()))


        thr_sv={"INS":min_support_frac_ins, "INV":MIN_SUPPORT_FRAC_INV, 
                "DEL":MIN_SUPPORT_FRAC_DEL, "DUP": MIN_SUPPORT_FRAC_DUP}

        # Add number of neighbouring reads that support SC
        bedtool=bedtool.each(partial(add_neighbour_support,bam_handle=sam_file, min_mapq=min_mapq, 
                                     min_soft_clip=min_soft_clip, max_nm=max_nm, min_matches=min_matches,
                                     skip_soft_clip=False, isize_mean=isize_mean, min_isize=min_isize, max_isize=max_isize)).sort().moveto(coverage_filtered_bed)

        neigh_coverage_filtered_bed = os.path.join(workdir, "neigh_filtered.bed")
        bedtool = bedtool.each(partial(filter_low_frac_support,thr_sv=thr_sv)).each(
                               partial(filter_low_neigh_read_support_INS,min_support_ins=min_support_ins)).sort().moveto(
                               neigh_coverage_filtered_bed)
        func_logger.info("%d neighbour support filtered intervals" % (bedtool.count()))

        # For 2bp SVs, the interval will be the cover of two intervals on the BP
        full_filtered_bed = os.path.join(workdir, "full_neigh_filtered.bed")
        bedtool = bedtool.each(partial(get_full_interval,pad=pad)).sort().moveto(full_filtered_bed)
        func_logger.info("%d full filtered intervals" % (bedtool.count()))

        # Now merge on full intervals
        merged_full_filtered_bed = os.path.join(workdir, "merged_full.bed")
        if bedtool.count()>0:
            bedtool=merge_for_each_sv(bedtool,c="4,5,6,7,9",o="collapse,collapse,collapse,collapse,collapse",
                                      svs_to_softclip=svs_to_softclip,
                                      overlap_ratio=overlap_ratio,
                                      reciprocal_for_2bp=True, 
                                      sv_type_field = [3,1], d=merge_max_dist)
        bedtool = bedtool.each(partial(fix_merged_fields,inter_tools=False)).each(partial(fine_tune_bps,pad=pad))
        bedtool = bedtool.filter(lambda x: x.score != "-1").sort().moveto(merged_full_filtered_bed)
        func_logger.info("%d merged full intervals" % (bedtool.count()))
        
        sam_file.close()
    except Exception as e:
        func_logger.error('Caught exception in worker thread')

        # This prints the type, value, and stack trace of the
        # current exception being handled.
        traceback.print_exc()

        print()
        raise e

    pybedtools.cleanup(remove_all=True)
    func_logger.info("Generated intervals in %g seconds for region %s" % ((time.time() - start_time), chromosome))

    return merged_full_filtered_bed
示例#41
0
chr1    1       1       Region_A        2       +
chr1    2       2       Region_B        1       +
chr1    3       3       Region_C        3       +
chr1    4       5
""".strip()

    b = pybedtools.BedTool(bed, from_string=True).intervals

    result = b.all_hits(Interval('chr1', 3, 3))
    assert len(result) == 1, len(result)

def test_missing_files():
    """
    previously this would crash the interpreter due to an exit(1) call in
    bedFile.cpp
    """
    a = pybedtools.BedTool('chrA 1 10', from_string=True).saveas("this_file_should_raise_BEDTools_Error")
    result = list(iter(a))
    os.unlink(a.fn)
    from pybedtools.cbedtools import BedToolsFileError
    def crashes():
        list(iter(a))

    assert_raises(BedToolsFileError, crashes)

if __name__ == "__main__":
    unittest.main()
    pybedtools.cleanup(remove_all=True)


def main():
    options = get_options()

    # pangenome dictionary
    rd = {}
    if options.roary is not None:
        roary = pd.read_table(options.roary, sep=',', low_memory=False)
        roary.set_index('Gene', inplace=True)
        # Drop the other info columns
        roary.drop(list(roary.columns[:13]), axis=1, inplace=True)
        roary.reset_index(inplace=True)
        for strain in roary.columns[1:]:
        for x, y in roary.set_index(
                    strain)['Gene'].dropna().to_dict().items():
                if str(x) == 'nan':
                    continue
                # be aware of paralogs
                for g in x.split('\t'):
                    rd[g] = y

    # tmp file locations
    remaining_tmp = options.tmp_prefix + "/remaining_kmers.txt"
    remaining_next_tmp = options.tmp_prefix + "/remaining_kmers_next.txt"
    remaining_fa_tmp = options.tmp_prefix + "/remaining_kmers.fa"
    remaining_fa_next_tmp = options.tmp_prefix + "/remaining_kmers_next.fa"
    pybedtools.helpers.set_tempdir(options.tmp_prefix)

    # read references and drafts into list
    references = []
    with open(options.references, 'r') as reference_files:
        for reference in reference_files:
            (fa, gff, ref) = reference.rstrip().split()
            references.append((fa, gff, ref))

    output_file = open(options.output, 'w')

    # Open seer results
    # seer_remaining = seer_results
    seer_remaining = open(options.kmers, 'r')
    header = seer_remaining.readline()

    # Write out kmer fasta file, keep track of count
    kmers_remaining = 0
    with open(remaining_fa_tmp, 'w') as kmer_fa:
        for kmer in seer_remaining:
            kmers_remaining += 1
            kmer_fa.write(">" + str(kmers_remaining) + "\n")
            kmer_fa.write(kmer.split("\t")[0] + "\n")

    seer_remaining.seek(0)
    seer_remaining.readline()

    # for each reference, then draft
    ref_id = 0
    for reference in references:
        (ref_fa, ref_gff, ref_type) = reference
        ref_id += 1

        # print number of kmers remaining. if zero, break
        if kmers_remaining == 0:
            break
        sys.stderr.write(str(kmers_remaining) + " kmers remain\n")
        if ref_type == "ref":
            sys.stderr.write("Reference " + str(ref_id) + "\n")
        else:
            sys.stderr.write("Draft reference " + str(ref_id) + "\n")

        # index reference sequence
        bwa_index(ref_fa)
        if ref_type == "ref":
            bwa_algorithms = ["mem", "fastmap"]
        elif ref_type == "draft":
            bwa_algorithms = ["fastmap"]
        else:
            bwa_algorithms = ["fastmap"]
            sys.stderr.write("Unknown reference type " + ref_type + " for " +
                             ref_fa + ". Assuming draft\n")

        # Fix ref annotation
        tmp_bed = tempfile.NamedTemporaryFile(prefix=options.tmp_prefix + "/")
        subprocess.run("gff2bed < " + ref_gff + " > " + tmp_bed.name,
                       shell=True,
                       check=True)
        ref_annotation = pybedtools.BedTool(tmp_bed.name)
        filtered_ref = ref_annotation.filter(
            lambda x: True if x[7] == "CDS" else False).saveas('tmp_bed')
        ref_annotation = pybedtools.BedTool('tmp_bed')

        for bwa_algorithm in bwa_algorithms:
            next_seer_remaining = open(remaining_next_tmp, 'w')
            next_fasta_remaining = open(remaining_fa_next_tmp, 'w')

            # run bwa mem -k 8 for ref, bwa fastmap for draft of remaining.fa
            new_idx = 0
            kmer_lines = []
            map_pos = {}

            mapped_kmers = bwa_iter(ref_fa, remaining_fa_tmp, bwa_algorithm)
            with tempfile.NamedTemporaryFile('w',
                                             prefix=options.tmp_prefix +
                                             "/") as query_bed:
                kmer_idx = 0
                for mapping, kmer_line in zip(mapped_kmers, seer_remaining):
                    if mapping.mapped:
                        kmers_remaining -= 1
                        kmer_lines.append(kmer_line.rstrip())
                        map_pos[kmer_idx] = []
                        for hit_idx, (contig, start, end,
                                      strand) in enumerate(mapping.positions):
                            map_pos[kmer_idx].append(contig + ":" +
                                                     str(start) + "-" +
                                                     str(end))
                            query_bed.write('\t'.join([
                                contig,
                                str(start),
                                str(end),
                                str(kmer_idx) + "_" + str(hit_idx), '0', strand
                            ]) + "\n")
                        kmer_idx += 1
                    else:
                        # if unmapped write to seer_remaining and remaining.fa
                        next_seer_remaining.write(kmer_line)

                        new_idx += 1
                        next_fasta_remaining.write(">" + str(new_idx) + "\n")
                        next_fasta_remaining.write(
                            kmer_line.split("\t")[0] + "\n")

                query_bed.flush()
                query_interval = pybedtools.BedTool(query_bed.name)
                sorted_query = query_interval.sort()

                in_genes = extract_genes(query_interval.intersect(
                    b=ref_annotation, s=False, stream=True, wb=True),
                                         rd,
                                         id=options.id)
                up_genes = extract_genes(sorted_query.closest(b=ref_annotation,
                                                              s=False,
                                                              D="ref",
                                                              iu=True,
                                                              stream=True),
                                         rd,
                                         id=options.id)
                down_genes = extract_genes(sorted_query.closest(
                    b=ref_annotation, s=False, D="ref", id=True, stream=True),
                                           rd,
                                           id=options.id)
                pybedtools.cleanup()  # delete the bed file

                for kmer_idx, kmer_line in enumerate(kmer_lines):
                    annotations = []
                    for hit_idx, hit in enumerate(map_pos[kmer_idx]):
                        annotation = hit + ";"
                        if kmer_idx in down_genes and hit_idx in down_genes[
                                kmer_idx]:
                            annotation += down_genes[kmer_idx][hit_idx]
                        annotation += ";"
                        if kmer_idx in in_genes and hit_idx in in_genes[
                                kmer_idx]:
                            annotation += in_genes[kmer_idx][hit_idx]
                        annotation += ";"
                        if kmer_idx in up_genes and hit_idx in up_genes[
                                kmer_idx]:
                            annotation += up_genes[kmer_idx][hit_idx]
                        annotations.append(annotation)

                    output_file.write(
                        "\t".join([kmer_line, ",".join(annotations)]) + "\n")

            # Clean up
            seer_remaining.close()
            next_seer_remaining.close()
            next_fasta_remaining.close()
            os.rename(remaining_next_tmp, remaining_tmp)
            os.rename(remaining_fa_next_tmp, remaining_fa_tmp)

            # Open next kmer file
            seer_remaining = open(remaining_tmp, 'r')

        tmp_bed.close()
        os.remove('tmp_bed')

    sys.stderr.write(str(kmers_remaining) + " kmers remain unannotated\n")
示例#43
0
def cleanBedTool(tempPath):
    # do best to erase temporary bedtool files if necessary
    # (tempPath argument must have been created with initBedTool())
    assert "TempBedTool_" in tempPath
    pybedtools.cleanup(remove_all=True)
    runShellCommand("rm -rf %s" % tempPath)
示例#44
0
def score_edits(annotated_edits_file, bg_edits_file, output_file, conf,
                gene_positions_dict, genome_fa, flank, chrom_sizes_file, rdd):
    """
    1. Reads and filters our (annotated) editing site (fg). The "name" (edit_frac) MUST contain the edited,totalcov for each site.
    2. Creates a bedtools interval (interval) containing gene coordinates. 
    3. Subsets our (annotated) editing list (fg) to get only the edits across one gene, for every gene. If a gene has no edit sites, pass. 
        A. For this step, we're relying on what's been annotated by annotator. So we are only counting edits that are unambiguously assigned
           (edits at a given position that overlaps multiple genes in the same region are not counted). 
    4. Filter out bg_edits_file from fg_edits_file. 
    5. Open a window centered around every edit site.
    6. Intersect with all edits from (3) to collect all edits that exist within the window.
    7. Add up all the edited-reads and total-reads across edited sites and calculate the "edit/editedc" fraction.
    8. Calculate the coverage across all C's in each window
    """

    chrom_sizes_dict = create_chrom_sizes_dict(chrom_sizes_file)
    # (1) Reads and filters our (annotated) editing site (fg). This file MUST have a conf value in the 4th column.
    fg = read_and_filter_editing_sites(annotated_edits_file, conf)
    progress = trange(len(set(fg['gene_id'])))
    all_scores_df = pd.DataFrame(columns=[
        'chrom', 'start', 'end', 'name', 'score', 'strand', 'edit_coverage',
        'editable_coverage', 'edited_over_edited_c', 'all_c_coverage',
        'edited_over_all_c'
    ])
    all_scores = []
    for gene_id in set(fg['gene_id']):
        try:
            # (2) Creates a bedtools interval (interval) containing gene coordinates.
            interval = pybedtools.create_interval_from_list([
                gene_positions_dict[gene_id]['chrom'],
                gene_positions_dict[gene_id]['start'],
                gene_positions_dict[gene_id]['end'],
                gene_id,
                '0',
                gene_positions_dict[gene_id]['strand'],
            ])
            # (3) Subsets our (annotated) editing list (fg) to get only the edits across one gene, for every gene.
            fg_sites_in_region = fg[fg['gene_id'] == gene_id]

            if fg_sites_in_region.shape[0] >= 1:

                # thickStart = edited #
                fg_sites_in_region.loc[:, 'thickStart'] = fg_sites_in_region[
                    'edit_frac'].apply(lambda x: int(x.split(',')[0]))
                # thickEnd = total coverage #
                fg_sites_in_region.loc[:, 'thickEnd'] = fg_sites_in_region[
                    'edit_frac'].apply(lambda x: int(x.split(',')[1]))
                fg_sites_in_region.loc[:,'name'] = fg_sites_in_region.loc[:,'gene_id'] + \
                    "|" + fg_sites_in_region.loc[:,'region']
                fg_sites_in_region = fg_sites_in_region[[
                    'chrom', 'start', 'end', 'name', 'conf', 'strand',
                    'thickStart', 'thickEnd'
                ]]
                # (4) Filter out bg_edits_file from fg_edits_file.
                fg_prefiltered_sites_bedtool = pybedtools.BedTool.from_dataframe(
                    fg_sites_in_region)
                if bg_edits_file is not None:
                    bg_sites_bedtool = pybedtools.BedTool(bg_edits_file)
                    fg_sites_bedtool = fg_prefiltered_sites_bedtool.sort(
                    ).intersect(bg_sites_bedtool.sort(), s=True, v=True)
                else:
                    fg_sites_bedtool = fg_prefiltered_sites_bedtool
                if len(
                        fg_sites_bedtool
                ) > 0:  # If the background file totally removes all edits from the foreground file, we might get an EmptyDataFrame
                    # (5) Open a window centered around every edit site.
                    fg_windows_bedtool = create_window_intervals(
                        fg_sites_bedtool, flank, chrom_sizes_dict)
                    # (6) Intersect with all edits from (3) to collect all edits that exist within the window.
                    intersected_edits = fg_windows_bedtool.intersect(
                        fg_sites_bedtool, s=True, wa=True,
                        loj=True).to_dataframe(names=[
                            'chrom', 'start', 'end', 'name', 'score', 'strand',
                            'edit_chrom', 'edit_start', 'edit_end',
                            'edit_name', 'edit_score', 'edit_strand',
                            'edit_coverage', 'editable_coverage'
                        ])
                    # (7) Add up all the edited-reads and total-reads across edited sites and calculate the "edit/editedc" fraction.
                    summed_confs = pd.DataFrame(
                        intersected_edits.groupby([
                            'chrom', 'start', 'end', 'name', 'score', 'strand'
                        ])['edit_score'].sum()).reset_index()

                    # blockCount is the "number of reads supporting an edit site"
                    summed_edits = pd.DataFrame(
                        intersected_edits.groupby([
                            'chrom', 'start', 'end', 'name', 'score', 'strand'
                        ])['edit_coverage'].sum()).reset_index()
                    # editable_coverage (blockSizes) is the "total number of reads at the edited site"
                    summed_total_coverage = pd.DataFrame(
                        intersected_edits.groupby([
                            'chrom', 'start', 'end', 'name', 'score', 'strand'
                        ])['editable_coverage'].sum()).reset_index()
                    df = pd.merge(summed_edits,
                                  summed_total_coverage,
                                  how='outer',
                                  left_on=[
                                      'chrom', 'start', 'end', 'name', 'score',
                                      'strand'
                                  ],
                                  right_on=[
                                      'chrom', 'start', 'end', 'name', 'score',
                                      'strand'
                                  ])
                    df['edited_over_edited_c'] = df['edit_coverage'] / df[
                        'editable_coverage']

                    # (8) Calculate the coverage across all C's in each window
                    df['all_c_coverage'] = df.apply(get_total_c_coverage,
                                                    args=(
                                                        rdd,
                                                        genome_fa,
                                                    ),
                                                    axis=1)
                    df['edited_over_all_c'] = df['edit_coverage'] / df[
                        'all_c_coverage']

                    # reorder columns to match
                    df = df[[
                        'chrom', 'start', 'end', 'name', 'score', 'strand',
                        'edit_coverage', 'editable_coverage',
                        'edited_over_edited_c', 'all_c_coverage',
                        'edited_over_all_c'
                    ]]
                    all_scores.append(df)
                    # all_scores = pd.concat([all_scores, df])
            pybedtools.cleanup()
        except KeyError as e:
            pass
        progress.update(1)

    for score_df in all_scores:
        all_scores_df = pd.concat([all_scores_df, score_df])
    all_scores_df.sort_values(by=['chrom', 'start', 'end', 'strand']).to_csv(
        output_file, sep='\t', index=False, header=True)
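
# score_edits() relies on create_window_intervals() for step 5 of its docstring:
# open a window of +/- `flank` bases around every edit site, clipped to the
# chromosome ends. A minimal sketch under that assumption (the real helper may
# differ); chrom_sizes_dict is taken to map chromosome name -> length.
import pybedtools

def create_window_intervals(sites_bedtool, flank, chrom_sizes_dict):
    windows = []
    for site in sites_bedtool:
        start = max(0, site.start - flank)
        end = min(chrom_sizes_dict[site.chrom], site.end + flank)
        # keep the original name/score/strand so downstream groupby keys survive
        windows.append(pybedtools.create_interval_from_list([
            site.chrom, str(start), str(end),
            site.name, site.score, site.strand
        ]))
    return pybedtools.BedTool(windows).sort()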
示例#45
0
def teardown():
    pybedtools.cleanup()
示例#46
0
def teardown():
    # always run this!
    pybedtools.cleanup(remove_all=True)
示例#47
0
def run_age_parallel(intervals_bed=None, reference=None, assembly=None, pad=AGE_PAD, age=None, age_workdir=None,
                     timeout=AGE_TIMEOUT, keep_temp=False, assembly_tool="spades", chrs=[], nthreads=1,
                     min_contig_len=AGE_MIN_CONTIG_LENGTH,
                     max_region_len=AGE_MAX_REGION_LENGTH, sv_types=[], 
                     min_del_subalign_len=MIN_DEL_SUBALIGN_LENGTH, min_inv_subalign_len=MIN_INV_SUBALIGN_LENGTH,
                     age_window = AGE_WINDOW_SIZE):
    func_logger = logging.getLogger("%s-%s" % (run_age_parallel.__name__, multiprocessing.current_process()))

    if not os.path.isdir(age_workdir):
        func_logger.info("Creating %s" % age_workdir)
        os.makedirs(age_workdir)

    if assembly:
        if not os.path.isfile("%s.fai" % assembly):
            func_logger.info("Assembly FASTA wasn't indexed. Will attempt to index now.")
            pysam.faidx(assembly)

        func_logger.info("Loading assembly contigs from %s" % assembly)
        with open(assembly) as assembly_fd:
            if assembly_tool == "spades":
                contigs = [SpadesContig(line[1:]) for line in assembly_fd if line[0] == '>']
            elif assembly_tool == "tigra":
                contigs = [TigraContig(line[1:]) for line in assembly_fd if line[0] == '>']
    else:
        contigs = []

    chrs = set(chrs)
    sv_types = set(sv_types)
    contig_dict = {contig.sv_region.to_tuple(): [] for contig in contigs if (len(
        chrs) == 0 or contig.sv_region.chrom1 in chrs) and contig.sequence_len >= min_contig_len and contig.sv_region.length() <= max_region_len and (
                       len(sv_types) == 0 or contig.sv_type in sv_types)}

    func_logger.info("Generating the contig dictionary for parallel execution")
    small_contigs_count = 0
    for contig in contigs:
        if contig.sv_region.length() > max_region_len: 
            func_logger.info("Too large SV region length: %d > %d" % (contig.sv_region.length(),max_region_len))
            continue
        if (len(chrs) == 0 or contig.sv_region.chrom1 in chrs) and (len(sv_types) == 0 or contig.sv_type in sv_types):
            if contig.sequence_len >= min_contig_len:
                contig_dict[contig.sv_region.to_tuple()].append(contig)
            else:
                small_contigs_count += 1

    region_list = sorted(contig_dict.keys())
    nthreads = min(nthreads, len(region_list))

    if nthreads == 0:
        func_logger.warning("AGE not run since no contigs found")
        return None

    func_logger.info("Will process %d regions with %d contigs (%d small contigs ignored) using %d threads" % (
        len(region_list), sum([len(value) for value in contig_dict.values()]), small_contigs_count, nthreads))

    pybedtools.set_tempdir(age_workdir)
    pool = multiprocessing.Pool(nthreads)

    breakpoints_beds = []
    for i in xrange(nthreads):
        region_sublist = [region for (j, region) in enumerate(region_list) if (j % nthreads) == i]
        kwargs_dict = {"intervals_bed": intervals_bed, "region_list": region_sublist, "contig_dict": contig_dict,
                       "reference": reference, "assembly": assembly, "pad": pad, "age": age, "age_workdir": age_workdir,
                       "timeout": timeout, "keep_temp": keep_temp, "myid": i, 
                       "min_del_subalign_len": min_del_subalign_len, "min_inv_subalign_len": min_inv_subalign_len,
                       "age_window" : age_window}
        pool.apply_async(run_age_single, args=[], kwds=kwargs_dict,
                         callback=partial(run_age_single_callback, result_list=breakpoints_beds))

    pool.close()
    pool.join()

    func_logger.info("Finished parallel execution")

    func_logger.info("Will merge the following breakpoints beds %s" % (str(breakpoints_beds)))

    pybedtools.cleanup(remove_all=True)

    if not breakpoints_beds:
        return None

    bedtool = pybedtools.BedTool(breakpoints_beds[0])
    for bed_file in breakpoints_beds[1:]:
        bedtool = bedtool.cat(pybedtools.BedTool(bed_file), postmerge=False)

    bedtool = bedtool.moveto(os.path.join(age_workdir, "breakpoints_unsorted.bed"))
    merged_bed = os.path.join(age_workdir, "breakpoints.bed")
    bedtool.sort().saveas(merged_bed)

    return merged_bed

def format_hartwig(mutation_file, cnvs_file, purity_file, outfile):

    # load files and preformat them
    df, cnv_bed, purity_score, gender = load_files(mutation_file, cnvs_file,
                                                   purity_file)

    # this is the sample column
    lastcol = list(df.columns)[-1]

    # get total reads
    df_reads = df.apply(get_reads, axis=1, args=([lastcol]))

    # select whether we have SNVs or others
    df_reads['len_alt'] = df_reads['ALT'].str.len()

    # number of characters in ref
    df_reads['len_ref'] = df_reads['REF'].str.len()

    # first classification between SNV and others
    df_reads['TYPE'] = df_reads.apply(lambda x: 'SNV' if (
        (x['len_alt'] == 1) and (x['len_ref'] == 1) and (x['ALT'] != '-') and
        (x['REF'] != '-')) else 'INDEL',
                                      axis=1)

    df_reads['pos-1'] = df_reads['POS'] - 1

    # get the triplet
    df_reads['TRIPLET'] = df_reads.apply(
        lambda x: hg19(x['CHROM'], x['pos-1'], 3), axis=1)
    df_reads['EXTENDED'] = df_reads.apply(
        lambda x: hg19(x['CHROM'],
                       int(x['POS']) - 2, 5), axis=1)

    snv_df = df_reads[df_reads['TYPE'] != 'INDEL']
    snv_df['CLASS'] = 'SNV'
    snv_df['VARIANT_CLASS'] = snv_df.apply(create_snv_class, axis=1)

    # classify indels
    indel_df = df_reads[df_reads['TYPE'] == 'INDEL']
    indels = indels_classification(indel_df)
    columns = indels.columns

    df_reads_merged = pd.concat([snv_df, indels], sort=True)
    df_reads_merged = df_reads_merged[columns]

    # assign the name of the sample
    df_reads_merged['sample'] = lastcol

    # create bed file
    mut_bed = BedTool.from_dataframe(df_reads_merged[[
        'CHROM', 'pos-1', 'POS', 'ref_reads', 'var_reads', 'VAF',
        'total_reads', 'REF', 'ALT', 'sample', 'TYPE', 'CLASS',
        'VARIANT_CLASS', 'TRIPLET', 'EXTENDED'
    ]])

    # Remove unmappable regions
    mapped = get_mappable_regions(mut_bed)

    # intersect with CN data
    out = mapped.intersect(cnv_bed, wao=True)

    # merge to dataframe
    merge = out.to_dataframe(names=[
        'CHROM', 'POS-1', 'POS', 'REF_COUNTS', 'VAR_COUNTS', 'VAF',
        'TOTAL_READS', 'REF', 'ALT', 'SAMPLE', 'TYPE', 'CLASS',
        'VARIANT_CLASS', 'TRIPLET', 'EXTENDED', 'c1', 'p1', 'p2',
        'MAJOR_CN_TEMP', 'actual_Baf', 'overlapp'
    ])

    # get the normal copy number values
    sex_chrom = ('Y', 'X')

    # get normal CN in the chromosome
    merge['NORMAL_CN'] = merge['CHROM'].apply(
        lambda x: 1 if x in sex_chrom and gender == "MALE" else 2)

    # add the purity score we got from PURPLE
    merge['PURITY'] = purity_score
    merge['GENDER'] = gender

    # get the copy number; if there is no overlap, use the normal count
    merge['TOTAL_CN'] = merge.apply(get_major_cn, axis=1)

    # formula of allele specific copy number according to hartwig's people
    merge['MAJOR_CN'] = round(merge['actual_Baf'] *
                              merge['TOTAL_CN']).astype(int)
    merge['MINOR_CN'] = round(
        (1 - merge['actual_Baf']) * merge['TOTAL_CN']).astype(int)

    merge['CHROM'] = merge['CHROM'].apply(lambda x: 'chr{}'.format(x))

    # save files
    merge.dropna()[[
        'CHROM', 'POS', 'REF', 'ALT', 'TRIPLET', 'EXTENDED', 'CLASS',
        'VARIANT_CLASS', 'SAMPLE', 'MAJOR_CN', 'MINOR_CN', 'TOTAL_CN',
        'NORMAL_CN', 'VAR_COUNTS', 'REF_COUNTS', 'GENDER', 'PURITY'
    ]].to_csv(outfile, sep='\t', index=False, header=True, compression='gzip')

    # clean BedTools temp files
    pybedtools.cleanup()
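
# A hypothetical version of the get_major_cn() helper used above, matching the
# comment about falling back to the normal copy number when a mutation overlaps
# no CNA segment. Column names follow the to_dataframe() call; this is an
# assumption, not the original implementation.
def get_major_cn(row):
    if row['overlapp'] == 0:
        return row['NORMAL_CN']
    return float(row['MAJOR_CN_TEMP'])
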
def parallel_generate_sc_intervals(bams, chromosomes, skip_bed, workdir, num_threads=1, min_avg_base_qual=SC_MIN_AVG_BASE_QUAL,
                                   min_mapq=SC_MIN_MAPQ, min_soft_clip=SC_MIN_SOFT_CLIP, max_soft_clip=SC_MAX_SOFT_CLIP, pad=SC_PAD,
                                   min_support=MIN_SUPPORT, min_support_frac=MIN_SUPPORT_FRAC, max_intervals=MAX_INTERVALS):
    func_logger = logging.getLogger(
        "%s-%s" % (parallel_generate_sc_intervals.__name__, multiprocessing.current_process()))

    if not os.path.isdir(workdir):
        func_logger.info("Creating directory %s" % workdir)
        os.makedirs(workdir)

    if not chromosomes:
        func_logger.info("Chromosome list unspecified. Inferring from the BAMs")
        for bam in bams:
            bamfile = pysam.Samfile(bam, "rb")
            chromosomes += list(bamfile.references)
            bamfile.close()
        chromosomes = sorted(list(set(chromosomes)))
        func_logger.info("Chromosome list inferred as %s" % (str(chromosomes)))

    if not chromosomes:
        func_logger.error("Chromosome list empty")
        return None

    pool = multiprocessing.Pool(num_threads)

    bed_files = []
    for index, (bam, chromosome) in enumerate(itertools.product(bams, chromosomes)):
        process_workdir = os.path.join(workdir, str(index))
        if not os.path.isdir(process_workdir):
            os.makedirs(process_workdir)

        args_list = [bam, chromosome, process_workdir]
        kwargs_dict = {"min_avg_base_qual": min_avg_base_qual, "min_mapq": min_mapq, "min_soft_clip": min_soft_clip,
                       "max_soft_clip": max_soft_clip, "pad": pad, "min_support": min_support,
                       "min_support_frac": min_support_frac}
        pool.apply_async(generate_sc_intervals, args=args_list, kwds=kwargs_dict,
                         callback=partial(generate_sc_intervals_callback, result_list=bed_files))

    pool.close()
    pool.join()

    func_logger.info("Following BED files will be merged: %s" % (str(bed_files)))

    if not bed_files:
        func_logger.warn("No intervals generated")
        return None

    pybedtools.set_tempdir(workdir)
    bedtool = pybedtools.BedTool(bed_files[0])

    for bed_file in bed_files[1:]:
        bedtool = bedtool.cat(pybedtools.BedTool(bed_file), postmerge=False)

    bedtool = bedtool.moveto(os.path.join(workdir, "all_intervals.bed"))

    func_logger.info("Selecting the top %d intervals based on normalized read support" % max_intervals)
    top_intervals_all_cols_file = os.path.join(workdir, "top_intervals_all_cols.bed")
    if bedtool.count() <= max_intervals:
        bedtool = bedtool.saveas(top_intervals_all_cols_file)
    else:
        # Sample the top intervals
        top_fraction_cutoff = sorted([float(interval.score) / float(interval.fields[6]) for interval in bedtool], reverse=True)[max_intervals-1]
        bedtool = bedtool.filter(lambda x: float(x.score) / float(x.fields[6]) >= top_fraction_cutoff).moveto(top_intervals_all_cols_file)

    # Filter out the extra column added to simplify life later on
    bedtool = bedtool.cut(xrange(6)).saveas(os.path.join(workdir, "top_intervals.bed"))

    if skip_bed:
        skip_bedtool = pybedtools.BedTool(skip_bed)
        func_logger.info(
            "Merging %d features with %d features from %s" % (bedtool.count(), skip_bedtool.count(), skip_bed))
        bedtool = skip_bedtool.cat(bedtool, postmerge=False).sort()
        func_logger.info("After merging with %s %d features" % (skip_bed, bedtool.count()))

    bedtool = bedtool.saveas(os.path.join(workdir, "intervals.bed"))

    pybedtools.cleanup(remove_all=True)

    return bedtool.fn
示例#50
0
def teardown():
    pybedtools.cleanup()
示例#51
0
def filter_by_covr(cov_filename, cov_site_min, cov_gene_min, gtffile,
                   genomic_regions, tmp_dir):
    """
	Wrapper for filtering nodes. Filter based on minimum unique read counts and minimum gene expression.
	"""
    node_list = [
        chr_strand + ':' + str(start) + ':' + str(end)
        for chr_strand in genomic_regions
        for start, end in genomic_regions[chr_strand]
    ]
    covfile = pysam.Samfile(cov_filename)
    gene_preserved = deque()
    site_preserved = deque()

    if cov_site_min > 0:
        k = 0
        for chr_strand in genomic_regions:
            chr, strand = chr_strand.split(':')
            print_time_stamp('filtering site: ' + chr_strand)
            for start, end in genomic_regions[chr_strand]:
                k += 1
                if not k % 10000:
                    print_time_stamp('filtering site count: ' + str(k) + '/' +
                                     str(len(node_list)))
                node = chr_strand + ':' + str(start) + ':' + str(end)
                num_reads = sum([
                    1 for x in covfile.fetch(chr, int(start), int(end))
                    if x.pos > start and x.pos < end
                ])
                if num_reads >= cov_site_min:
                    site_preserved.append(node)
    else:
        site_preserved = set(node_list)

    if cov_gene_min > 0:
        genomic_regions_list = [
            (chr_strand.split(':')[0], int(start), int(end),
             chr_strand + ':' + ':'.join([str(start), str(end)]), 'X',
             chr_strand.split(':')[1]) for chr_strand in genomic_regions
            for start, end in genomic_regions[chr_strand]
        ]
        genomic_regions_bed = pybedtools.BedTool(genomic_regions_list)
        gtf = pybedtools.BedTool(gtffile)
        overlap_transcripts = genomic_regions_bed.intersect(gtf,
                                                            wo=True,
                                                            s=True)
        overlap_transcripts.saveas(tmp_dir + '/genomic_regions.gtf.bed')
        total = len(overlap_transcripts)
        pybedtools.cleanup()
        del overlap_transcripts
        del gtf
        del genomic_regions_list

        cov_scale = sum([
            int(x.split('\t')[2])
            for x in pysam.idxstats(cov_filename).split('\n') if len(x) > 0
        ]) / 1000000.0

        #gene_fpkm=read_cufflinks('/u/home/f/frankwoe/scratch/Ule_RNAseq_hnRNPC/cufflinks_output_star/genes.fpkm_tracking')
        gene_rpkm = {}
        k = 0
        f = open(tmp_dir + '/genomic_regions.gtf.bed', 'r')
        for ele in f:
            line = ele.split()
            k += 1
            if not k % 10000:
                print_time_stamp('filtering gene RPKM: ' + str(k) + '/' +
                                 str(total))
            node = line[3]
            #if not node in site_preserved:
            #	continue
            gene_id = line[9]
            #RPKM = gene_fpkm[gene_id] if gene_id in gene_fpkm else 0
            if gene_id in gene_rpkm:
                RPKM = gene_rpkm[gene_id]
            else:
                chr, start, end = line[6], line[7], line[8]
                transcript_count = covfile.count(chr, int(start), int(end))
                block_sizes = [int(x) for x in line[16].split(',') if x != '']
                gene_len = sum(block_sizes) / 1000.0
                RPKM = transcript_count / cov_scale / gene_len
                gene_rpkm[gene_id] = RPKM

            if RPKM >= cov_gene_min:
                gene_preserved.append(node)
        gene_preserved = set(gene_preserved)
        f.close()
    else:
        gene_preserved = set(node_list)

    return gene_preserved.intersection(site_preserved)
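
# Worked example of the RPKM computation above (illustrative numbers only):
# 20,000,000 mapped reads give cov_scale = 20.0; exon blocks summing to 2,000 bp
# give gene_len = 2.0 (kb); 200 overlapping reads then yield
# RPKM = 200 / 20.0 / 2.0 = 5.0, which is compared against cov_gene_min.
assert abs(200 / (20_000_000 / 1_000_000.0) / (2000 / 1000.0) - 5.0) < 1e-9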
示例#52
0
            if options.snpformat == "VCFID":
                snpid = str(line_vcf[2])
            else:
                snpid = str(line_vcf[0].lstrip("chr")) + ":" + str(
                    line_vcf[1]) + ":" + str(line_vcf[3]) + ":" + str(
                        line_vcf[4])
            if snpid in allsnplist:
                counts = findcarriers(line_vcf, options.gtfield,
                                      options.snpformat, sampleindices,
                                      options.maxAC, options.maxAF,
                                      options.minAN)
                if counts[2] > 0:
                    count_table[snpid] = [
                        snpid, counts[0], counts[1], counts[2]
                    ]
pybedtools.cleanup()

#Generate output counts
outfile = open(options.outfilename, "w")
outfile.write(
    "#GENE\tCASE_COUNT_HET\tCASE_COUNT_CH\tCASE_COUNT_HOM\tCASE_TOTAL_AC\n")
snpfile = open(options.snpfilename, "r")
for line_s1 in snpfile:
    line_s = line_s1.rstrip('\n').split('\t')
    if line_s[0][0] != "#":
        genesnplist = list(set(line_s[1].split(',')))
        counts = calculatecount(genesnplist, count_table)
        outfile.write(line_s[0] + "\t" + str(counts[0]) + "\t" +
                      str(counts[1]) + "\t" + str(counts[2]) + "\t" +
                      str(counts[3]) + '\n')
outfile.close()
示例#53
0
    ## retrieve bin-value relationships
    sampleName = args.name[index]
    binValDict = RunMetagene(inputBedDict, args, kwargs)
    ## Deletes all temp files from the current session
    pybedtools.cleanup(verbose=False, remove_all=False)
    if iboolDict['bam']:
        tempFile.close()
    return [sampleName, binValDict]


# main program
if __name__ == '__main__':
    ## setting temporary dir for pybedtools
    pybedtools.set_tempdir(args.temp)
    if args.deltmp:
        pybedtools.cleanup(verbose=False, remove_all=True)
    ## validate arguments
    FileExist([args.anno], 'anno')
    if args.reverse and args.strand:
        sys.exit('--reverse and --strand should not be True at the same time!')
    if args.gene != 'protein_coding' and args.feature in [
            'coding', 'utr5', 'cds', 'utr3'
    ]:
        sys.exit(
            '--feature should not be ["coding", "utr5", "cds", "utr3"] when --gene is not "protein_coding"!'
        )
    ## bam and bed judgement
    iboolDict = defaultdict(bool)
    ibool = 0
    if bool(args.bed):
        FileExist(args.bed, 'bed')