Пример #1
0
def genes_to_bedtool(gene_collection, hgnc_ids=None, ensembl_ids=None, build="GRCh37"):
    """Create a BedTool object with the coordinates of the requested genes.

    Genes are looked up in the database either by HGNC ID or by Ensembl gene
    ID. When both lists are provided only the HGNC IDs are used in the query.

    Accepts:
        gene_collection(pymongo.collection.Collection)
        hgnc_ids(list): a list of hgnc genes ids
        ensembl_ids(list): a list of ensembl gene ids
        build(str): genome build, GRCh37 or GRCh38

    Returns:
        bt(pybedtools.bedtool.BedTool): a BedTool object containing gene intervals,
            or None when no gene ID was provided or no gene matched the query
    """
    if not (hgnc_ids or ensembl_ids):
        return None  # No gene was specified to filter VCF file with

    # Either HGNC or ENSEMBL IDs end up in the query, never both. The guard
    # above guarantees ensembl_ids is non-empty whenever hgnc_ids is empty.
    query = {"build": build}
    if hgnc_ids:
        query["hgnc_id"] = {"$in": hgnc_ids}
    else:
        query["ensembl_id"] = {"$in": ensembl_ids}

    # One tab-separated "chromosome start end" line per gene. Lines are
    # collected in a list and joined once instead of growing a string with
    # "+=" inside the loop; every field goes through str() so non-string
    # values do not break the join.
    bed_lines = [
        "\t".join((str(gene["chromosome"]), str(gene["start"]), str(gene["end"]))) + "\n"
        for gene in gene_collection.find(query)
    ]
    if not bed_lines:
        return None
    return BedTool("".join(bed_lines), from_string=True)
def aggregate_by_tad(all_TADs_by_celltype,
                     aggregations,
                     other,
                     extension=0.1,
                     n_windows=100):
    """Build one per-TAD, per-window table for every aggregation key.

    Each region in `all_TADs_by_celltype` is extended on both sides with
    `slop(l=extension, r=extension, pct=True)`, cut into a fixed number of
    windows, passed to `coverage_by_window` together with `other` and
    `aggregations`, and the result is pivoted into a table per key.

    Accepts:
        all_TADs_by_celltype(pandas.DataFrame): must contain the module-level
            `coords` columns and a string 'tad_uid' column
        aggregations: object whose `.keys()` name the value columns to pivot;
            it is also forwarded to `coverage_by_window`
        other: forwarded unchanged to `coverage_by_window`
        extension(float): slop amount, also used to size the flanking windows
        n_windows(int): number of windows attributed to the un-extended region

    Returns:
        tuple: (dict mapping each key to a pivot table indexed by 'tad_uid'
        with 'win_num' columns, index of the first in-region window, index
        of the window where the downstream flank starts)

    Note: underscores in 'tad_uid' are replaced by hyphens, so the returned
    tables are indexed by the hyphenated identifiers.
    """
    flank_windows = int(n_windows * extension)
    tot_windows = n_windows + flank_windows * 2
    tad_start_window = flank_windows
    tad_end_window = n_windows + flank_windows

    tad_regions = all_TADs_by_celltype[coords + ['tad_uid']].copy()
    # window_maker names each window "<source name>_<window number>"; removing
    # underscores from the source name keeps the split below at two columns.
    tad_regions['tad_uid'] = tad_regions.tad_uid.map(
        lambda uid: uid.replace("_", "-"))

    extended_regions = BedTool.from_dataframe(tad_regions).slop(
        l=extension, r=extension, pct=True, genome="hg19")
    window_bed = BedTool().window_maker(
        b=extended_regions, n=tot_windows, i='srcwinnum')
    windows = window_bed.to_dataframe(names=coords + ['window_uid'])

    uid_parts = windows.window_uid.str.split("_", expand=True)
    uid_parts.columns = ['tad_uid', 'win_num']
    windows = pd.concat((windows, uid_parts), axis=1)
    windows['win_num'] = windows['win_num'].astype(int)
    windows = windows.sort_values(coords).reset_index(drop=True)

    windows_with_ctcfs = coverage_by_window(windows, other, aggregations)

    aggregations_by_tad = {}
    for key in aggregations.keys():
        # Single-line progress display: blank the line, then print the key.
        print(" " * 100, end='\r')
        print(key, end="\r")
        table = windows_with_ctcfs.pivot_table(index='tad_uid',
                                               columns='win_num',
                                               values=key)
        aggregations_by_tad[key] = table.sort_index(axis=1)
    return aggregations_by_tad, tad_start_window, tad_end_window
Пример #3
0
def align_list_to_bed(*, align_list):
    """Merge overlapping alignment intervals into a single BedTool.

    Intervals are grouped by the pair (assembly_id, sequence_id): the two
    identifiers are joined with a comma to act as the BED "chrom" field,
    the intervals are sorted and merged, and the key is split back into its
    two parts afterwards.

    Accepts:
        align_list(pandas.DataFrame): must contain the columns 'assembly_id',
            'sequence_id', 'sequence_from' and 'sequence_to'

    Returns:
        bed(BedTool): merged intervals with the columns sequence_id, start,
            end, assembly_id; an empty BedTool when there is nothing to merge
    """
    keyed_list = align_list.assign(
        assembly_id_sequence_id=lambda df: df['assembly_id'] + ',' + df['sequence_id'])
    temp_bed = BedTool.from_dataframe(keyed_list[[
        'assembly_id_sequence_id', 'sequence_from', 'sequence_to'
    ]])

    # Every intermediate BedTool is file-backed; track them all so their
    # files are removed on every path (no merged intervals, or an exception),
    # not only on the successful non-empty one.
    intermediates = [temp_bed]
    try:
        sorted_bed = temp_bed.sort()
        intermediates.append(sorted_bed)
        merged_bed = sorted_bed.merge()
        intermediates.append(merged_bed)

        if merged_bed.count() > 0:
            merged_df = merged_bed.to_dataframe()
            # n=1: only the first comma separates assembly_id from sequence_id.
            id_columns = merged_df['chrom'].str.split(
                ',', n=1, expand=True).rename(columns={
                    0: 'assembly_id',
                    1: 'sequence_id'
                })
            merged_df = pandas.concat(
                [id_columns, merged_df[['start', 'end']]], axis=1)
            bed = BedTool.from_dataframe(
                merged_df[['sequence_id', 'start', 'end', 'assembly_id']])
        else:
            bed = BedTool('', from_string=True)
    finally:
        for intermediate in intermediates:
            os.remove(intermediate.fn)

    return bed
Пример #4
0
def windowing_by_number(all_TADs_by_celltype, n_windows):
    """Split every region into a fixed number of windows.

    Accepts:
        all_TADs_by_celltype(pandas.DataFrame): regions to split; the last
            column is used as the region identifier.
            NOTE(review): the output columns are named after the input
            columns, so this presumably expects exactly the coordinate
            columns followed by one identifier column -- confirm with callers.
        n_windows(int): number of windows per region (window_maker `n`)

    Returns:
        windows(pandas.DataFrame): one row per window with the input column
            names plus 'w_num', the window number parsed from the window name
            minus one; rows are sorted by the module-level `coords` columns
    """
    column_names = all_TADs_by_celltype.columns.tolist()
    id_column = column_names[-1]
    windows = BedTool().window_maker(b=BedTool.from_dataframe(all_TADs_by_celltype),
                                     n=n_windows, i='srcwinnum')\
                       .to_dataframe(names=column_names)
    # Window names look like "<region id>_<window number>". Split on the LAST
    # underscore only: splitting on every underscore with expand=True pads
    # shorter rows with None, so the window number would not reliably be in
    # the last column when region ids contain differing numbers of "_".
    name_parts = windows[id_column].str.rsplit("_", n=1, expand=True)
    windows[id_column] = name_parts[0]
    windows['w_num'] = name_parts[1].astype(int) - 1
    windows = windows.sort_values(coords).reset_index(drop=True)
    return windows
Пример #5
0
def windowing_by_size(centered_boundaries, window_size):
    """Split every region into windows of a fixed size.

    Accepts:
        centered_boundaries(pandas.DataFrame): regions to split; the last
            column is used as the region identifier.
            NOTE(review): the output columns are named after the input
            columns, so this presumably expects exactly the coordinate
            columns followed by one identifier column -- confirm with callers.
        window_size(int): size of each window (window_maker `w`)

    Returns:
        windows(pandas.DataFrame): one row per window with the input column
            names plus 'w_num', the window number parsed from the window name
            minus one; rows are sorted by the module-level `coords` columns
    """
    column_names = centered_boundaries.columns.tolist()
    id_column = column_names[-1]
    windows = BedTool().window_maker(b=BedTool.from_dataframe(centered_boundaries),
                                     w=window_size, i='srcwinnum')\
                       .to_dataframe(names=column_names)
    # Window names look like "<region id>_<window number>". Split on the LAST
    # underscore only: splitting on every underscore with expand=True pads
    # shorter rows with None, so the window number would not reliably be in
    # the last column when region ids contain differing numbers of "_".
    name_parts = windows[id_column].str.rsplit("_", n=1, expand=True)
    windows[id_column] = name_parts[0]
    windows['w_num'] = name_parts[1].astype(int) - 1
    windows = windows.sort_values(coords).reset_index(drop=True)
    return windows
Пример #6
0
def merge_intervals(panels):
    """Combine one or more panel BED files into a single BedTool.

    Accepts:
        panels(list) : path to one or more panel bed files

    Returns:
        merged_panels(BedTool): a BedTool built from the first panel when only
            one path is given; otherwise the result of `cat`-ing the remaining
            panels onto the first one

    """
    first_panel = BedTool(panels[0])
    if len(panels) <= 1:
        # A single panel is returned as loaded, without going through cat().
        return first_panel

    remaining_panels = panels[1:]
    return first_panel.cat(*remaining_panels)
Пример #7
0
def _compute_intersections(vcf_file, filter):
    """Intersect the entries of a VCF file with a set of filter intervals.

    Logs the number of filter intervals and VCF entries before intersecting,
    and the number of entries in the intersection afterwards.

    Accepts:
        vcf_file(str): path to the VCF file
        filter: intervals to intersect with; must provide `count()` and be
            accepted by `BedTool.intersect` (presumably a BedTool -- confirm
            against callers)

    Returns:
        intersections(BedTool): result of intersecting the VCF with `filter`,
            computed with `header=True`
    """
    variants = BedTool(vcf_file)

    interval_count = filter.count()
    entry_count = variants.count()
    LOG.info(
        "Extracting %s intervals from the %s total entries of the VCF file.",
        interval_count,
        entry_count,
    )

    intersections = variants.intersect(filter, header=True)
    LOG.info("Number of variants found in the intervals:%s", intersections.count())

    return intersections