示例#1
0
def main(mcoolfile,outdir,filename,window,cutoff,binsize):
    """Compute insulation scores for one resolution of an mcool file and save them as BigWig.

    Parameters
    ----------
    mcoolfile
        Path to the multi-resolution cooler file.
    outdir
        Directory the BigWig file is written into.
    filename
        Base name (without extension) of the output file; ``.bw`` is appended.
    window
        Window size, passed as ``window_bp`` to ``find_insulating_boundaries``
        and used to pick the ``log2_insulation_score_{window}`` column.
    cutoff
        Accepted for interface compatibility; not used by this function.
    binsize
        Bin size to use, or ``-1`` to select the smallest bin size in the file.

    The process exits with status 1 when ``binsize`` is not present in the
    file or when ``window`` is not a multiple of an explicitly given ``binsize``.
    """
    f = mcoolfile

    # Paths of every cooler stored inside the file
    cooler_list = cooler.io.ls(f)

    # binsize_list is built in the same order as cooler_list, so an index into
    # one is valid for the other.
    if any('/resolutions/' in res for res in cooler_list):
        # Layout where the last path component is the bin size itself
        binsize_list = [int(res.split('/')[-1]) for res in cooler_list]
    else:
        # Older layout: the path does not carry the bin size, so each cooler
        # has to be opened to read it.
        binsize_list = [int(cooler.Cooler(str(f) + '::' + res).binsize)
                        for res in cooler_list]

    # Check the input parameters
    if binsize == -1:
        binsize = min(binsize_list)
    elif binsize not in binsize_list:
        print("Error: This binsize is not available in this mcool file. This is the list of binsizes availables:")
        print(binsize_list)
        sys.exit(1)  # non-zero status so callers can detect the failure
    elif window % binsize != 0:
        print("Error: Window size must be multiple of binsize")
        sys.exit(1)

    # Open the cooler that matches the selected bin size. This lookup works for
    # both layouts; previously the old layout always opened the highest-numbered
    # cooler regardless of the requested bin size.
    cooler_path = str(f) + '::' + cooler_list[binsize_list.index(binsize)]
    c = cooler.Cooler(cooler_path)
    print(c)

    # Chromosome sizes as a Series indexed by chromosome name
    chroms = c.chroms()[:]
    chromsizes = pd.Series(chroms['length'].values, index=chroms['name'].values)

    # Getting insulating boundaries
    insul = find_insulating_boundaries(c,balance='weight',window_bp=window,min_dist_bad_bin=2)

    # Convert to BigWig. The path is relative to outdir as given; the previous
    # leading '/' forced a relative outdir to resolve against the filesystem root.
    bioframe.to_bigwig(insul, chromsizes,
                       f'{outdir}/{filename}.bw',
                       f'log2_insulation_score_{window}')
示例#2
0
def eigs_cis(
    cool_path,
    phasing_track,
    view,
    n_eigs,
    clr_weight_name,
    ignore_diags,
    verbose,
    out_prefix,
    bigwig,
):
    """
    Perform eigen value decomposition on a cooler matrix to calculate
    compartment signal by finding the eigenvector that correlates best with the
    phasing track.


    COOL_PATH : the paths to a .cool file with a balanced Hi-C map. Use the
    '::' syntax to specify a group path in a multicooler file.

    TRACK_PATH : the path to a BedGraph-like file that stores phasing track as
    track-name named column.

    BedGraph-like format assumes tab-separated columns chrom, start, stop and
    track-name.

    """
    # NOTE(review): the docstring above reads like CLI help text (presumably
    # consumed by click), so it is kept verbatim -- confirm before rewording.
    #
    # phasing_track, when given, is a (path, column) pair; column is either an
    # integer position or a column name. Results are written to
    # "<out_prefix>.cis.lam.txt", "<out_prefix>.cis.vecs.tsv" and, if bigwig
    # is truthy, "<out_prefix>.cis.bw" (column "E1").
    clr = cooler.Cooler(cool_path)

    if phasing_track is not None:

        # TODO: This all needs to be refactored into a more generic tabular file parser
        # Needs to handle stdin case too.
        track_path, col = phasing_track
        buf, names = sniff_for_header(track_path)
        col_is_position = isinstance(col, int)

        if names is None:
            # Headerless file: only a positional column reference can work.
            if not col_is_position:
                raise click.BadParameter(
                    "No header found. "
                    'Cannot find "{}" column without a header.'.format(col))

            track_name = "ref"
            read_opts = {
                "header": None,
                "usecols": [0, 1, 2, col],
                "names": ["chrom", "start", "end", track_name],
            }
        else:
            if col_is_position:
                # Translate the position into the header's column name.
                try:
                    col = names[col]
                except IndexError:
                    raise click.BadParameter(
                        'Column #{} not compatible with header "{}".'.format(
                            col, ",".join(names)))
            elif col not in names:
                raise click.BadParameter(
                    'Column "{}" not found in header "{}"'.format(
                        col, ",".join(names)))

            track_name = col
            read_opts = {
                "header": "infer",
                "usecols": ["chrom", "start", "end", track_name],
            }

        column_types = {
            "chrom": str,
            "start": np.int64,
            "end": np.int64,
            track_name: np.float64,
        }
        # From here on phasing_track is the parsed table rather than the
        # (path, column) pair.
        phasing_track = pd.read_table(
            buf,
            dtype=column_types,
            comment="#",
            verbose=verbose,
            **read_opts,
        )

    # define view for cis compartment-calling:
    # read the "view" file when one is given, otherwise build a view from the cooler
    if view is not None:
        view_df = read_viewframe_from_file(view, clr, check_sorting=True)
    else:
        view_df = make_cooler_view(clr)

    # TODO: Add check that view_df has the same bins as track
    eigvals, eigvec_table = eigdecomp.eigs_cis(
        clr=clr,
        phasing_track=phasing_track,
        view_df=view_df,
        n_eigs=n_eigs,
        clr_weight_name=clr_weight_name,
        ignore_diags=ignore_diags,
        clip_percentile=99.9,
        sort_metric=None,
    )

    # Output: all files share the "<out_prefix>.cis" stem
    cis_prefix = out_prefix + ".cis"
    eigvals.to_csv(cis_prefix + ".lam.txt", sep="\t", index=False)
    eigvec_table.to_csv(cis_prefix + ".vecs.tsv", sep="\t", index=False)
    if bigwig:
        bioframe.to_bigwig(
            eigvec_table,
            clr.chromsizes,
            cis_prefix + ".bw",
            value_field="E1",
        )
示例#3
0
def insulation(
    in_path,
    window,
    output,
    view,
    ignore_diags,
    clr_weight_name,
    min_frac_valid_pixels,
    min_dist_bad_bin,
    threshold,
    window_pixels,
    append_raw_scores,
    chunksize,
    verbose,
    bigwig,
):
    """
    Calculate the diamond insulation scores and call insulating boundaries.

    IN_PATH : The paths to a .cool file with a balanced Hi-C map.

    WINDOW : The window size for the insulation score calculations.
             Multiple space-separated values can be provided.
             By default, the window size must be provided in units of bp.
             When the flag --window-pixels is set, the window sizes must
             be provided in units of pixels instead.
    """
    # NOTE(review): the docstring above reads like CLI help text, so it is
    # kept verbatim -- confirm before rewording.
    #
    # window is iterated as a collection of window sizes.
    # output is the path of the tab-separated result table; when falsy the
    # table is printed to stdout instead.
    # view is an optional view file; when None a view is built from the cooler.
    # bigwig, when truthy, additionally writes "<output>.<window>.bw" per window.
    # The remaining arguments are forwarded to api.insulation.insulation.
    #
    # Raises ValueError when bigwig output is requested without an output path.

    # BigWig file names are derived from `output`. Fail before the computation
    # instead of hitting a TypeError on `None + "."` after all the work is done.
    if bigwig and output is None:
        raise ValueError(
            "bigwig output was requested but no output path was given; "
            "bigwig file names are derived from the output path")

    clr = cooler.Cooler(in_path)

    # Create view:
    if view is None:
        # full chromosomes:
        view_df = make_cooler_view(clr)
    else:
        # read view_df dataframe, and verify against cooler
        view_df = read_viewframe_from_file(view, clr, check_sorting=True)

    # Window sizes given in pixels are converted to bp using the cooler bin size:
    if window_pixels:
        window = [win * clr.info["bin-size"] for win in window]

    ins_table = api.insulation.insulation(
        clr,
        view_df=view_df,
        window_bp=window,
        ignore_diags=ignore_diags,
        # any falsy weight name (e.g. an empty string) is normalised to None
        clr_weight_name=clr_weight_name if clr_weight_name else None,
        min_frac_valid_pixels=min_frac_valid_pixels,
        min_dist_bad_bin=min_dist_bad_bin,
        threshold=threshold,
        append_raw_scores=append_raw_scores,
        chunksize=chunksize,
        verbose=verbose,
    )

    # output to file if specified:
    if output:
        ins_table.to_csv(output, sep="\t", index=False, na_rep="nan")
    # or print into stdout otherwise:
    else:
        print(ins_table.to_csv(sep="\t", index=False, na_rep="nan"))

    # Write the insulation track as a bigwig, one file per window size:
    if bigwig:
        for w in window:
            bioframe.to_bigwig(
                ins_table,
                clr.chromsizes,
                output + "." + str(w) + ".bw",
                value_field=f"log2_insulation_score_{w}",
            )
示例#4
0
def call_compartments(cool_path, reference_track, contact_type, n_eigs,
                      verbose, out_prefix, bigwig):
    """
    Perform eigen value decomposition on a cooler matrix to calculate
    compartment signal by finding the eigenvector that correlates best with the
    phasing track.


    COOL_PATH : the paths to a .cool file with a balanced Hi-C map.

    TRACK_PATH : the path to a BedGraph-like file that stores phasing track as
    track-name named column.

    BedGraph-like format assumes tab-separated columns chrom, start, stop and
    track-name.

    """
    # NOTE(review): the docstring above reads like CLI help text (presumably
    # consumed by click), so it is kept verbatim -- confirm before rewording.
    #
    # reference_track, when given, is a (path, column) pair; column is either
    # an integer position or a column name.
    # contact_type must be "cis" or "trans"; anything else raises ValueError.
    # Results are written to "<out_prefix>.<contact_type>.lam.txt",
    # "<out_prefix>.<contact_type>.vecs.tsv" and, if bigwig is truthy,
    # "<out_prefix>.<contact_type>.bw" (column "E1").
    # Raises ValueError when merging the track with the cooler bins yields
    # more rows than there are bins.

    # Reject an unknown contact type up front; previously it surfaced only as
    # a NameError on `eigvals` at output time.
    if contact_type not in ("cis", "trans"):
        raise ValueError(
            'contact_type must be "cis" or "trans", got "{}"'.format(
                contact_type))

    clr = cooler.Cooler(cool_path)

    if reference_track is not None:

        # TODO: This all needs to be refactored into a more generic tabular file parser
        # Needs to handle stdin case too.
        track_path, col = reference_track
        buf, names = sniff_for_header(track_path)

        if names is None:
            # Headerless file: only a positional column reference can work.
            if not isinstance(col, int):
                raise click.BadParameter(
                    "No header found. "
                    'Cannot find "{}" column without a header.'.format(col))

            track_name = "ref"
            kwargs = dict(
                header=None,
                usecols=[0, 1, 2, col],
                names=["chrom", "start", "end", track_name],
            )
        else:
            if isinstance(col, int):
                # Translate the position into the header's column name.
                try:
                    col = names[col]
                except IndexError:
                    raise click.BadParameter(
                        'Column #{} not compatible with header "{}".'.format(
                            col, ",".join(names)))
            elif col not in names:
                raise click.BadParameter(
                    'Column "{}" not found in header "{}"'.format(
                        col, ",".join(names)))

            track_name = col
            kwargs = dict(header="infer",
                          usecols=["chrom", "start", "end", track_name])

        track_df = pd.read_table(buf,
                                 dtype={
                                     "chrom": str,
                                     "start": np.int64,
                                     "end": np.int64,
                                     track_name: np.float64,
                                 },
                                 comment="#",
                                 verbose=verbose,
                                 **kwargs)

        # we need to merge phasing track DataFrame with the cooler bins to get
        # a DataFrame with phasing info aligned and validated against bins inside of
        # the cooler file.
        track = pd.merge(left=clr.bins()[:],
                         right=track_df,
                         how="left",
                         on=["chrom", "start", "end"])

        # A left merge can only grow past the number of bins when the track
        # does not line up one-to-one with the bin keys
        # ["chrom", "start", "end"]. The exception used to be constructed
        # without `raise`, so this check never fired.
        if len(track) > len(clr.bins()):
            raise ValueError(
                "There is something in the {} that ".format(track_path) +
                "couldn't be merged with cooler-bins {}".format(cool_path))
    else:
        # no reference track: use the bare bin coordinates from the cooler
        track = clr.bins()[["chrom", "start", "end"]][:]
        track_name = None

    # it's contact_type dependent (already validated to be "cis" or "trans"):
    if contact_type == "cis":
        eigvals, eigvec_table = eigdecomp.cooler_cis_eig(
            clr=clr,
            bins=track,
            regions=None,
            n_eigs=n_eigs,
            phasing_track_col=track_name,
            clip_percentile=99.9,
            sort_metric=None,
        )
    else:
        eigvals, eigvec_table = eigdecomp.cooler_trans_eig(
            clr=clr,
            bins=track,
            n_eigs=n_eigs,
            partition=None,
            phasing_track_col=track_name,
            sort_metric=None,
        )

    # Output
    eigvals.to_csv(out_prefix + "." + contact_type + ".lam.txt",
                   sep="\t",
                   index=False)
    eigvec_table.to_csv(out_prefix + "." + contact_type + ".vecs.tsv",
                        sep="\t",
                        index=False)
    if bigwig:
        bioframe.to_bigwig(
            eigvec_table,
            clr.chromsizes,
            out_prefix + "." + contact_type + ".bw",
            value_field="E1",
        )
示例#5
0
def save_bigwig(vectors, savepath, genome, columns=('E1', 'E2', 'E3')):
    """Write one BigWig file per requested column of ``vectors``.

    Parameters
    ----------
    vectors
        Table passed to ``to_bigwig``; it must contain every name in ``columns``.
    savepath
        Output path prefix; each file is written to ``<savepath>.<column>.bw``.
    genome
        Genome identifier passed to ``fetch_chromsizes``.
    columns
        Names of the value columns to export. The default is a tuple rather
        than a list so the default object cannot be mutated between calls.
    """
    chroms = fetch_chromsizes(genome)
    for item in columns:
        save = savepath + '.{}.bw'.format(item)
        # NOTE(review): presumably ensures the destination directory exists --
        # confirm against create_dir, which receives the full file path.
        create_dir(save)
        to_bigwig(vectors, chroms, save, value_field=item)
示例#6
0
def main(mcoolfile, outdir, filename, binsize, window, bweak, bstrong, cutoff,
         pixels_frac):
    """Compute insulation scores and boundaries for one resolution of an mcool file.

    Two files are written into ``outdir``: ``<filename>.bw`` with the
    ``log2_insulation_score_{window}`` track, and ``<filename>_boundaries.bed``
    with the classified boundaries (tab-separated, no header).

    Parameters
    ----------
    mcoolfile
        Path to the multi-resolution cooler file.
    outdir
        Directory the output files are written into.
    filename
        Base name of the output files.
    binsize
        Bin size to use, or ``-1`` to select the smallest bin size in the file.
    window
        Window size; passed to ``calculate_insulation_score`` and used to
        select the ``*_{window}`` columns of the insulation table.
    bweak
        Minimum boundary strength for a row to be reported as a boundary.
    bstrong
        Boundaries with strength >= ``bstrong`` are labelled ``'Strong'``,
        the remaining ones ``'Weak'``.
    cutoff
        Passed through to ``find_boundaries``.
    pixels_frac
        Passed to ``find_boundaries``; also the fraction of the
        ``(window / binsize) ** 2`` pixels that must be valid for a bin to be kept.

    The process exits with status 1 when ``binsize`` is not present in the
    file or when ``window`` is not a multiple of an explicitly given ``binsize``.
    """
    f = mcoolfile

    # Paths of every cooler stored inside the file
    cooler_list = cooler.fileops.list_coolers(f)

    # binsize_list is built in the same order as cooler_list, so an index into
    # one is valid for the other.
    if any('/resolutions/' in res for res in cooler_list):
        # Layout where the last path component is the bin size itself
        binsize_list = [int(res.split('/')[-1]) for res in cooler_list]
    else:
        # Older layout: the path does not carry the bin size, so each cooler
        # has to be opened to read it.
        binsize_list = [
            int(cooler.Cooler(str(f) + '::' + res).binsize)
            for res in cooler_list
        ]

    # Check the input parameters
    if binsize == -1:
        binsize = min(binsize_list)
    elif binsize not in binsize_list:
        print(
            "Error: This binsize is not available in this mcool file. This is the list of binsizes availables:"
        )
        print(binsize_list)
        sys.exit(1)  # non-zero status so callers can detect the failure
    elif window % binsize != 0:
        print("Error: Window size must be multiple of binsize")
        sys.exit(1)

    # Open the cooler that matches the selected bin size. This lookup works for
    # both layouts; previously the old layout always opened the highest-numbered
    # cooler regardless of the requested bin size.
    cooler_path = str(f) + '::' + cooler_list[binsize_list.index(binsize)]
    c = cooler.Cooler(cooler_path)
    print(c)

    # Chromosome sizes as a Series indexed by chromosome name
    chroms = c.chroms()[:]
    chromsizes = pd.Series(chroms['length'].values,
                           index=chroms['name'].values)

    # calculate insulation score
    ins_table = calculate_insulation_score(c, window)

    # Find boundaries
    ins_table = find_boundaries(ins_table, pixels_frac, cutoff)

    # Keep only bins whose number of valid pixels reaches the pixels_frac
    # fraction of the (window / binsize)^2 pixels
    icutoff = pow((window / binsize), 2) * pixels_frac

    ins_table_filtered = ins_table[
        ins_table[f'n_valid_pixels_{window}'] >= icutoff]
    # Convert to BigWig
    bioframe.to_bigwig(ins_table_filtered, chromsizes,
                       f'{outdir}/{filename}.bw',
                       f'log2_insulation_score_{window}')

    # Classify the boundaries as strong or weak:
    # strength >= bstrong is 'Strong', bweak <= strength < bstrong is 'Weak';
    # rows below bweak are dropped. The .copy() avoids assigning into a view.
    boun_table = ins_table_filtered[
        ins_table_filtered[f'boundary_strength_{window}'] >= bweak].copy()
    boun_table['boundary_classification'] = np.where(
        boun_table[f'boundary_strength_{window}'] >= bstrong, 'Strong', 'Weak')
    # Columns written to the BED file, in order
    columns = [
        'chrom', 'start', 'end', 'boundary_classification',
        f'boundary_strength_{window}'
    ]
    boun_table.to_csv(f'{outdir}/{filename}_boundaries.bed',
                      sep='\t',
                      header=None,
                      index=False,
                      columns=columns)
                               for chrom in cool.chromnames[0:22]]}
#supports

# Computing eigenvalues and eigenvectors for the cis regions in supports['cis'],
# using the 'gene_count' column of `genes` as the phasing track.
lams, vectors = eigdecomp.cooler_cis_eig(cool, genes, regions=supports['cis'], 
                                                   phasing_track_col='gene_count', 
                                                   sort_metric='spearmanr')
# cooler_cis_eig sorts eigenvectors by decreasing Spearman correlation with the phasing track 
# (gene count in this case). In the past, people have used the eigenvector associated with the max 
# eigenvalue. It is worth considering situations where these two sorting processes differ.

# Output: eigenvalues and eigenvectors as tab-separated tables, plus the 'E1'
# column of the eigenvector table as a BigWig track. `res` is concatenated
# into the file names, so it must be a string here.
contact_type='cis'
lams.to_csv(out_prefix + '.' + contact_type +'.'+  res+ '.lam.txt', sep='\t', index=False)
vectors.to_csv(out_prefix + '.' + contact_type +'.'+ res+'.vecs.tsv', sep='\t', index=False)
bioframe.to_bigwig(vectors,cool.chromsizes,out_prefix +'.' + contact_type + '.'+ res+ '.bw',value_field='E1')


# Expected tables, keyed by contact type
exp = {}
# Cis expected, computed with a pool of 10 worker processes.
# Note: the lambda's parameter `p` shadows the pool `p` only inside the lambda
# body; `map=p.map` still refers to the pool.
with mp.Pool(10) as p:
    result = expected.diagsum(cool, supports['cis'],
                    transforms={'balanced': lambda p: p['count']*p['weight1']*p['weight2']},
                    ignore_diags=2, map=p.map)
# Stack the per-region tables under a (chrom, region) index, where region is
# formatted as "chrom:start-end".
result = pd.concat([result[arm] for arm in supports['cis']], 
                   keys=[(arm[0], f'{arm[0]}:{arm[1]}-{arm[2]}') for arm in supports['cis']], 
                   names=['chrom', 'region'])
# Sum regions of the same chromosome together, per diagonal
result = result.groupby(['chrom','diag']).sum()
# Average balanced signal per diagonal = summed balanced counts / number of valid pixels
result['balanced.avg'] = result['balanced.sum'] / result['n_valid']
result = result.reset_index()
exp['cis'] = result
def call_compartments(
    cool_path,
    reference_track,
    regions,
    contact_type,
    n_eigs,
    verbose,
    out_prefix,
    bigwig,
):
    """
    Perform eigen value decomposition on a cooler matrix to calculate
    compartment signal by finding the eigenvector that correlates best with the
    phasing track.


    COOL_PATH : the paths to a .cool file with a balanced Hi-C map. Use the
    '::' syntax to specify a group path in a multicooler file.

    TRACK_PATH : the path to a BedGraph-like file that stores phasing track as
    track-name named column.

    BedGraph-like format assumes tab-separated columns chrom, start, stop and
    track-name.

    """
    # NOTE(review): the docstring above reads like CLI help text (presumably
    # consumed by click), so it is kept verbatim -- confirm before rewording.
    #
    # reference_track, when given, is a (path, column) pair; column is either
    # an integer position or a column name.
    # regions is an optional tab-separated file with 3 or 4 columns
    # (chrom, start, end[, name]); it is only supported for "cis".
    # contact_type must be "cis" or "trans"; anything else raises ValueError.
    # Results are written to "<out_prefix>.<contact_type>.lam.txt",
    # "<out_prefix>.<contact_type>.vecs.tsv" and, if bigwig is truthy,
    # "<out_prefix>.<contact_type>.bw" (column "E1").
    # Raises ValueError when merging the track with the cooler bins yields
    # more rows than there are bins, or when the regions file has an
    # unexpected number of columns.

    # Validate the argument combination before touching any file. An unknown
    # contact type previously surfaced only as a NameError at output time.
    if contact_type not in ("cis", "trans"):
        raise ValueError(
            'contact_type must be "cis" or "trans", got "{}"'.format(
                contact_type))
    if regions is not None and contact_type == "trans":
        raise NotImplementedError(
            "Regions not yet supported with trans contact type")

    clr = cooler.Cooler(cool_path)

    if reference_track is not None:

        # TODO: This all needs to be refactored into a more generic tabular file parser
        # Needs to handle stdin case too.
        track_path, col = reference_track
        buf, names = sniff_for_header(track_path)

        if names is None:
            # Headerless file: only a positional column reference can work.
            if not isinstance(col, int):
                raise click.BadParameter(
                    "No header found. "
                    'Cannot find "{}" column without a header.'.format(col))

            track_name = "ref"
            kwargs = dict(
                header=None,
                usecols=[0, 1, 2, col],
                names=["chrom", "start", "end", track_name],
            )
        else:
            if isinstance(col, int):
                # Translate the position into the header's column name.
                try:
                    col = names[col]
                except IndexError:
                    raise click.BadParameter(
                        'Column #{} not compatible with header "{}".'.format(
                            col, ",".join(names)))
            elif col not in names:
                raise click.BadParameter(
                    'Column "{}" not found in header "{}"'.format(
                        col, ",".join(names)))

            track_name = col
            kwargs = dict(header="infer",
                          usecols=["chrom", "start", "end", track_name])

        track_df = pd.read_table(buf,
                                 dtype={
                                     "chrom": str,
                                     "start": np.int64,
                                     "end": np.int64,
                                     track_name: np.float64,
                                 },
                                 comment="#",
                                 verbose=verbose,
                                 **kwargs)

        # we need to merge phasing track DataFrame with the cooler bins to get
        # a DataFrame with phasing info aligned and validated against bins inside of
        # the cooler file.
        track = pd.merge(left=clr.bins()[:],
                         right=track_df,
                         how="left",
                         on=["chrom", "start", "end"])

        # A left merge can only grow past the number of bins when the track
        # does not line up one-to-one with the bin keys
        # ["chrom", "start", "end"]. The exception used to be constructed
        # without `raise`, so this check never fired.
        if len(track) > len(clr.bins()):
            raise ValueError(
                "There is something in the {} that ".format(track_path) +
                "couldn't be merged with cooler-bins {}".format(cool_path))
    else:
        # use entire bin-table from cooler, when reference-track is not provided:
        track = clr.bins()[["chrom", "start", "end"]][:]
        track_name = None

    # define regions for cis compartment-calling
    # use input "regions" BED file or all chromosomes mentioned in "track":
    track_chroms = track["chrom"].unique()
    if regions is None:
        # use full chromosomes referred to in the track :
        cis_regions_table = bioframe.parse_regions(track_chroms,
                                                   clr.chromsizes)
        cis_regions_table["name"] = cis_regions_table["chrom"]
    else:
        # Flexible reading of the regions table (any header info is ignored):
        regions_buf, _ = sniff_for_header(regions)
        cis_regions_table = pd.read_csv(regions_buf, sep="\t", header=None)
        n_columns = cis_regions_table.shape[1]
        if n_columns not in (3, 4):
            raise ValueError(
                "The region file does not have three or four tab-delimited columns. "
                "We expect a bed file with columns chrom, start, end, and optional name"
            )
        # Name the positional columns; "name" is used only when a 4th column exists.
        column_names = ["chrom", "start", "end", "name"][:n_columns]
        cis_regions_table = cis_regions_table.rename(
            columns=dict(enumerate(column_names)))
        cis_regions_table = bioframe.parse_regions(cis_regions_table)
        # make sure custom regions are compatible with the track:
        cis_regions_table = cis_regions_table[cis_regions_table["chrom"].isin(
            track_chroms)].reset_index(drop=True)

    # it's contact_type dependent (already validated to be "cis" or "trans"):
    if contact_type == "cis":
        eigvals, eigvec_table = eigdecomp.cooler_cis_eig(
            clr=clr,
            bins=track,
            regions=cis_regions_table,
            n_eigs=n_eigs,
            phasing_track_col=track_name,
            clip_percentile=99.9,
            sort_metric=None,
        )
    else:
        eigvals, eigvec_table = eigdecomp.cooler_trans_eig(
            clr=clr,
            bins=track,
            n_eigs=n_eigs,
            partition=None,
            phasing_track_col=track_name,
            sort_metric=None,
        )

    # Output
    eigvals.to_csv(out_prefix + "." + contact_type + ".lam.txt",
                   sep="\t",
                   index=False)
    eigvec_table.to_csv(out_prefix + "." + contact_type + ".vecs.tsv",
                        sep="\t",
                        index=False)
    if bigwig:
        bioframe.to_bigwig(
            eigvec_table,
            clr.chromsizes,
            out_prefix + "." + contact_type + ".bw",
            value_field="E1",
        )