Example #1
def load_bed(bed_path, as_tensor=True, strict=False):
    """
    Load a single BED of bins with SV counts and features as a tuple of Tensors (features, labels).
    If the 'sv' column is not present in bed_path, labels will be returned as None.
    """

    df = dfutils.load_feature_df(bed_path)

    if 'sv' not in df.columns:
        if strict:
            err = 'ERROR: required column "sv" not found in input BED {}'
            from sys import exit
            exit(err.format(bed_path))
        else:
            warn = 'WARNING: column "sv" not found in input BED {}; only loading features'
            print(warn.format(bed_path))
            # No labels available; keep features as a DataFrame unless a Tensor is requested
            features, labels = df, None
            if as_tensor:
                features = torch.tensor(df.values).float()

    else:
        # Split the SV counts from the remaining feature columns
        features = df.drop('sv', axis=1)
        labels = df[['sv']]
        if as_tensor:
            features = torch.tensor(features.values).float()
            labels = torch.tensor(labels.values).float()

    return features, labels
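
A minimal usage sketch for this loader, assuming a hypothetical annotated BED path and that the surrounding module's torch and dfutils imports are available:

# 'training_bins.bed.gz' is a hypothetical path; any BED with per-bin features
# (and optionally an 'sv' column) should work
features, labels = load_bed('training_bins.bed.gz', as_tensor=True, strict=False)

if labels is None:
    print('No "sv" column found; loaded features only:', tuple(features.shape))
else:
    print('Loaded {:,} bins with {:,} features each'.format(*features.shape))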
Example #2
def feature_hists(bed,
                  png_prefix,
                  skip_cols=3,
                  log_transform=None,
                  sqrt_transform=None,
                  exp_transform=None,
                  square_transform=None,
                  boxcox_transform=None,
                  fill_missing=None):
    """
    Plot simple histograms for all columns in a BED file
    """

    # Load & sanitize features
    df = dfutils.load_feature_df(bed, skip_cols, log_transform, sqrt_transform,
                                 exp_transform, square_transform,
                                 boxcox_transform, fill_missing)

    def _simple_hist(vals, title):
        """
        Plot a single simple histogram of values
        """

        # Subset to values within the middle 99.9% of data
        n_nan = len(vals[np.isnan(vals)])
        vals_num = vals[~np.isnan(vals)]
        vlims = np.quantile(vals_num, q=[0.0005, 0.9995])
        n_outlier = len(vals_num[(vals_num < vlims[0]) |
                                 (vals_num > vlims[1])])
        vals_plot = vals_num[(vals_num >= vlims[0]) & (vals_num <= vlims[1])]

        # Plot & format histogram
        fig, ax = plt.subplots()
        plt.hist(vals_plot, 25)
        plt.subplots_adjust(top=0.8)

        # Add axes & title
        ax.set_xlabel(title)
        ax.set_ylabel('Bins')
        fulltitle = '\n'.join([
            title, '{:,} total bins'.format(len(vals)),
            '{:,} bins with missing values (not shown)'.format(n_nan),
            '{:,} outlier bins (not shown)'.format(n_outlier)
        ])
        ax.set_title(fulltitle)

    # Plot one histogram per column, closing each figure after saving
    for title in df.columns:
        vals = df[title]
        plot_title = title.replace('/', '_').replace(' ', '_')
        _simple_hist(vals, title)
        plt.savefig('.'.join([png_prefix, plot_title, 'png']), format='png')
        plt.close()
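
A minimal call sketch, assuming a hypothetical annotated BED and an existing output directory; judging from decompose_bins below, the *_transform arguments appear to take lists of column names to transform before plotting:

# Hypothetical input path and column name, shown only for illustration
feature_hists('bins.annotated.bed.gz',
              png_prefix='qc/feature_hists',
              skip_cols=3,
              log_transform=['coverage'],  # assumed: list of columns to log-transform
              fill_missing=0)
# Writes one PNG per feature column, e.g. qc/feature_hists.coverage.png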
Example #3
def load_bed(bed_path, as_tensor=True):
    """
    Load a single BED of bins with SV counts and features as a tuple of Tensors (features, labels)
    """

    df = dfutils.load_feature_df(bed_path)

    if 'sv' not in df.columns:
        err = 'ERROR: required column "sv" not found in input BED {}'
        from sys import exit
        exit(err.format(bed_path))

    # Split the SV counts from the remaining feature columns
    features = df.drop('sv', axis=1)
    labels = df[['sv']]
    if as_tensor:
        features = torch.tensor(features.values).float()
        labels = torch.tensor(labels.values).float()

    return features, labels
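
A minimal sketch of calling this stricter variant, assuming a hypothetical BED that already carries an 'sv' column (the function exits otherwise):

# Hypothetical path for illustration
features, labels = load_bed('bins_with_sv.bed.gz', as_tensor=True)
print(features.shape, labels.shape)  # e.g. torch.Size([n_bins, n_features]), torch.Size([n_bins, 1])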
Example #4
def mu_predict(pairs, model_pkl, outfile, raw_mu, keep_features, maxfloat,
               bgzip):
    """
    Apply a trained mutation rate model to new bin-pairs
    """

    # Load pairs and split coordinates from features
    coords = pd.read_csv(pairs, sep='\t', usecols=range(3))
    feats, labels = load_bed(pairs)
    if keep_features:
        feats_df = dfutils.load_feature_df(pairs)

    # Load model from .pkl and switch to evaluation mode
    model = torch.load(model_pkl)
    model.eval()

    # Predict mutation rates for all bins
    with torch.no_grad():
        preds = model(feats).numpy()
        if not raw_mu:
            preds = log10(preds)
        preds_df = pd.DataFrame(preds, columns=['mu'])

    # Format output dataframe
    out_df = pd.concat([coords, preds_df], axis=1)
    if keep_features:
        out_df = pd.concat([out_df, feats_df], axis=1)
    out_df = dfutils.float_cleanup(out_df, maxfloat=maxfloat, start_idx=3)

    # Save pairs with predicted mutation rates
    if 'compressed' in determine_filetype(outfile):
        outfile = path.splitext(outfile)[0]
    out_df.to_csv(outfile, sep='\t', index=False)

    # Bgzip bins, if optioned
    if bgzip:
        bgz(outfile)
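
A minimal call sketch, assuming hypothetical filenames and a model .pkl saved with torch.save() on the same feature set as the pairs file:

# All paths below are placeholders for illustration
mu_predict(pairs='pairs.annotated.bed.gz',
           model_pkl='mu_model.pkl',
           outfile='pairs.mu.bed',
           raw_mu=False,          # report log10(mu) rather than raw predicted rates
           keep_features=False,
           maxfloat=5,
           bgzip=True)            # writes pairs.mu.bed, then bgzips it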
Example #5
def decompose_bins(bins, bins_outfile=None, parameters_outfile=None, precomp_model=None, 
                   components=10, minvar=None, trans_dict=None, whiten=False, 
                   fill_missing=0, first_column=3, maxfloat=5, max_pcs=100, 
                   pca_stats=None, eigen_prefix='eigenfeature', bgzip=False):
    """
    Master function for Eigendecomposition of bin annotations
    """

    # Set certain defaults prior to loading precomputed model
    whitener = None

    # Load precomputed model, if optioned
    if precomp_model is not None:
        df_fills, trans_dict, scaler, pca, components, whitener = \
            _load_precomp_model(precomp_model)
        fill_missing = df_fills

    # Expand feature transformation dictionary (may be None if no transforms requested)
    if trans_dict is None:
        trans_dict = {}
    log_transform = trans_dict.get('log', [])
    sqrt_transform = trans_dict.get('sqrt', [])
    exp_transform = trans_dict.get('exp', [])
    square_transform = trans_dict.get('square', [])
    boxcox_transform = trans_dict.get('boxcox', [])

    # Read bins, then sanitize and transform annotations
    df_bins = pd.read_csv(bins, sep='\t', usecols=range(first_column))
    df_annos, df_fills = \
        dfutils.load_feature_df(bins, first_column, log_transform, sqrt_transform, 
                                exp_transform, square_transform,  boxcox_transform, 
                                fill_missing, return_fills=True)
    feature_names = df_annos.columns.tolist()

    # Scale all columns
    if precomp_model is None:
        scaler = StandardScaler().fit(df_annos)
    df_annos = scaler.transform(df_annos)

    # Learn covariance matrix & determine number of components to keep
    if precomp_model is None:
        pcs_to_calc = min([df_annos.shape[1], max_pcs])
        pca = PCA(n_components=pcs_to_calc).fit(df_annos)
        if minvar is None:
            components = pcs_to_calc
        else:
            components = len([i for i in np.cumsum(pca.explained_variance_ratio_) \
                              if i < minvar])

    # Decompose annotations
    pcs = pca.transform(df_annos)
    eigen_names = ['_'.join([eigen_prefix, str(i+1)]) for i in range(components)]
    df_pcs = pd.DataFrame(pcs[:, :components], columns=eigen_names)

    # "Whiten" eigenfeatures, if optioned
    if whiten:
        if precomp_model is None:
            whitener = StandardScaler().fit(df_pcs)
    if whitener is not None:
        df_pcs = pd.DataFrame(whitener.transform(df_pcs), columns=eigen_names)

    # Write output bins with PCs
    if bins_outfile is not None:
        if 'compressed' in determine_filetype(bins_outfile):
            bins_outfile = path.splitext(bins_outfile)[0]
        out_df = dfutils.float_cleanup(pd.concat([df_bins, df_pcs], axis=1), 
                                       maxfloat, first_column)
        out_df.to_csv(bins_outfile, sep='\t', index=False)
        if bgzip:
            bgz(bins_outfile)

    # Save model for future use, if optioned
    if parameters_outfile is not None:
        _save_model_params(df_fills, trans_dict, scaler, pca, components, 
                           whitener, parameters_outfile)

    # Perform extra assessments of PCA & feature fits, if optioned
    if pca_stats is not None:
        get_feature_stats(df_annos, feature_names, pca, pcs, pca_stats, 
                          eigen_prefix, components)
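
A minimal call sketch, assuming hypothetical filenames; trans_dict maps transformation names ('log', 'sqrt', 'exp', 'square', 'boxcox') to lists of feature columns to transform before decomposition:

# Placeholder paths and column names, for illustration only
decompose_bins('bins.annotated.bed.gz',
               bins_outfile='bins.eigen.bed',
               parameters_outfile='eigen_params.pkl',
               minvar=0.99,                     # keep enough PCs to cover ~99% of variance
               trans_dict={'log': ['coverage']},
               whiten=True,
               bgzip=True)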
Example #6
def count_sv(bins_in, sv_in, outfile, paired, binsize, breakpoints, probs,
             sv_ci, maxfloat, bgzip):
    """
    Master function to annotate bins_in with count (or probability) of SVs
    """

    # Load bins, split bin coordinates from annotations, and retain header
    if 'compressed' in determine_filetype(bins_in):
        with gzip.open(bins_in, 'rt') as bins_fin:
            bins_header = bins_fin.readline().rstrip().split('\t')
    else:
        with open(bins_in, 'r') as bins_fin:
            bins_header = bins_fin.readline().rstrip().split('\t')
    bins_bt = pbt.BedTool(bins_in).cut(range(3)).saveas()
    bins_df = bins_bt.to_dataframe()
    feats_df = dfutils.load_feature_df(bins_in)
    if binsize is None:
        binsize = calc_binsize(bins_in)

    # Parse input SV file depending on format
    # If breakpoints == False, returns a simple four-column BED with the variant ID in column 4
    # If breakpoints == True, returns two rows per record (one per breakpoint) with columns:
    #   4 = variant ID, 5 = POS or END, 6 = original POS or END coordinate,
    #   7 = std dev of left side of breakpoint, 8 = std dev of right side of breakpoint,
    #   9 = number of std deviations extended left & right (i.e., z_extend)
    sv_format = determine_filetype(sv_in)
    if 'vcf' in sv_format:
        vcf = pysam.VariantFile(sv_in)
        sv = vcf2bed(vcf,
                     breakpoints=breakpoints,
                     add_ci_to_bkpts=probs,
                     ci=sv_ci)
    elif 'bed' in sv_format:
        sv = _load_sv_from_bed(sv_in, breakpoints=breakpoints)
    else:
        from sys import exit
        exit('ERROR: unrecognized format for input SV file {}'.format(sv_in))

    # Perform intersection with bins depending on input parameters
    if breakpoints:
        bins_bt = add_names_to_bed(bins_bt)
        bin_ids = [b.name for b in bins_bt]

        # Split pairs if necessary
        if paired:
            bins_bt = _split_pairs(bins_bt,
                                   binsize=binsize,
                                   add_name=True,
                                   add_side=True)

        # Intersect breakpoints with bins
        hits = bins_bt.intersect(sv, wa=True, wb=True)
        bkpt_res = parse_breakpoint_hits(hits, paired, probs)
        sv_column = pd.Series([bkpt_res.get(b_id, 0) for b_id in bin_ids])

    # --comparison "overlap" (i.e., breakpoints == False) is the same for both 1D and 2D bins
    else:
        if probs:
            sv_column = pd.Series(
                [min([1, int(x[-1])]) for x in bins_bt.intersect(sv, c=True)])
        else:
            sv_column = pd.Series(
                [int(x[-1]) for x in bins_bt.intersect(sv, c=True)])

    # Paste bin coordinates, SV counts, and original features into single dataframe
    out_df = dfutils.float_cleanup(
        pd.concat([bins_df, sv_column, feats_df], axis=1), maxfloat, 3)
    out_df.columns = bins_header[:3] + ['sv'] + bins_header[3:]

    # Save bins with SV counts
    if 'compressed' in determine_filetype(outfile):
        outfile = path.splitext(outfile)[0]
    out_df.to_csv(outfile, sep='\t', header=True, index=False)

    # Bgzip bins, if optioned
    if bgzip:
        bgz(outfile)
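
A minimal call sketch, assuming hypothetical input files; sv_in may be a VCF or BED of SVs, and breakpoints=False counts simple overlaps per bin:

# Placeholder filenames for illustration
count_sv(bins_in='bins.annotated.bed.gz',
         sv_in='germline_svs.vcf.gz',
         outfile='bins.sv_counts.bed',
         paired=False,
         binsize=None,        # inferred from the bins file when not provided
         breakpoints=False,   # count any overlap rather than breakpoint hits
         probs=False,
         sv_ci=0.95,
         maxfloat=5,
         bgzip=True)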