def prepare_expression(counts_df, tpm_df, vcf_lookup_s, sample_frac_threshold=0.2, count_threshold=6, tpm_threshold=0.1, mode='tmm'):
    """
    Filter genes by expression level, then normalize the genes that pass.

    Only the samples (columns) of counts_df whose IDs also occur in the index
    of vcf_lookup_s are used; counts_df and tpm_df are both subset to them.

    A gene (row) passes the filter when both rules hold:
      TPM >= tpm_threshold in >= sample_frac_threshold*samples
      read counts >= count_threshold in >= sample_frac_threshold*samples

    vcf_lookup_s: lookup table mapping sample IDs to VCF IDs (only its index
                  is read here)

    Between-sample normalization modes (matched case-insensitively):
      tmm: rnaseqnorm.edgeR_cpm (normalized library sizes) on all genes of
           counts_df; the gene filter is applied to its result
      qn:  rnaseqnorm.normalize_quantiles on the filtered rows of tpm_df

    The result of either mode is passed through
    rnaseqnorm.inverse_normal_transform and returned.

    Raises ValueError for any other mode.
    """

    shared_samples = np.intersect1d(counts_df.columns, vcf_lookup_s.index)
    tpm_df = tpm_df[shared_samples]
    counts_df = counts_df[shared_samples]

    # minimum number of samples in which a gene must clear each threshold
    min_samples = sample_frac_threshold * tpm_df.shape[1]

    # per-gene pass/fail for each rule, combined into one boolean row mask
    enough_tpm = (tpm_df >= tpm_threshold).sum(axis=1) >= min_samples
    enough_counts = (counts_df >= count_threshold).sum(axis=1) >= min_samples
    mask = (enough_tpm & enough_counts).values

    method = mode.lower()
    if method == 'tmm':
        # CPM is computed on the full count table; the filter is applied after
        cpm_df = rnaseqnorm.edgeR_cpm(counts_df, normalized_lib_sizes=True)
        return rnaseqnorm.inverse_normal_transform(cpm_df[mask])
    if method == 'qn':
        filtered_qn_df = rnaseqnorm.normalize_quantiles(tpm_df.loc[mask])
        return rnaseqnorm.inverse_normal_transform(filtered_qn_df)
    raise ValueError('Unsupported mode {}'.format(mode))
def prepare_expression(counts_df, tpm_df, mode='tmm'):
    """
    Normalize an expression matrix (adapted from the official GTEx pipeline).

    No gene filtering is done here: every row of the selected input table is
    normalized. Only the table required by the chosen mode is read, so the
    other argument is not touched.

    Between-sample normalization modes (matched case-insensitively):
      tmm: rnaseqnorm.edgeR_cpm (normalized library sizes) on counts_df
      qn:  rnaseqnorm.normalize_quantiles on tpm_df

    The result of either mode is passed through
    rnaseqnorm.inverse_normal_transform and returned.

    Raises ValueError for any other mode.
    """
    method = mode.lower()
    if method == 'tmm':
        expr_df = rnaseqnorm.edgeR_cpm(counts_df,
                                       normalized_lib_sizes=True)
    elif method == 'qn':
        expr_df = rnaseqnorm.normalize_quantiles(tpm_df)
    else:
        raise ValueError('Unsupported mode {}'.format(mode))

    return rnaseqnorm.inverse_normal_transform(expr_df)
# Example no. 3
# 0
def normalize_expression_and_generate_expression_pcs(
        raw_pseudobulk_expression, sample_level_normalization,
        gene_level_normalization, num_pcs, pb_expression_output_root):
    """Normalize pseudobulk expression, save it, and run PCA on the saved file.

    Each column of raw_pseudobulk_expression is normalized on its own as one
    gene (rows are presumably samples -- confirm with the caller).

    Args:
        raw_pseudobulk_expression: 2-D numeric array of expression values.
        sample_level_normalization: 'qn' applies
            rnaseqnorm.normalize_quantiles to the transposed matrix; any
            other value skips sample-level normalization.
        gene_level_normalization: 'zscore' standardizes each column, clips
            it to [-10, 10] and re-centers it; 'ign' applies
            rnaseqnorm.inverse_normal_transform to the transposed matrix.
        num_pcs: forwarded to generate_pca_scores_and_variance_explained.
        pb_expression_output_root: prefix for the three output paths
            ('normalized_expression.txt', 'pca_scores.txt', 'pca_pve.txt').

    Raises:
        ValueError: if gene_level_normalization is not 'zscore' or 'ign'.
    """
    # Fail fast, before any normalization or output is produced, instead of
    # dropping into a debugger and then saving an all-zero matrix.
    if gene_level_normalization not in ('zscore', 'ign'):
        raise ValueError(
            '{} gene level normalization method currently not implemented'
            .format(gene_level_normalization))

    ##################################
    # Perform sample level normalization
    ##################################
    if sample_level_normalization == 'qn':
        # rnaseqnorm is handed the transpose of the input matrix
        qn_df = rnaseqnorm.normalize_quantiles(
            pd.DataFrame(np.transpose(raw_pseudobulk_expression)))
        raw_pseudobulk_expression = np.transpose(np.asarray(qn_df))

    ##################################
    # Perform gene level normalization
    ##################################
    if gene_level_normalization == 'zscore':
        normalized_expression = np.zeros(raw_pseudobulk_expression.shape)
        for gene_num in range(normalized_expression.shape[1]):
            gene_expr = raw_pseudobulk_expression[:, gene_num]
            zscores = (gene_expr - np.mean(gene_expr)) / np.std(gene_expr)
            # Cap extreme values, then re-center because clipping can shift
            # the mean away from zero
            zscores = np.clip(zscores, -10.0, 10.0)
            normalized_expression[:, gene_num] = zscores - np.mean(zscores)
    else:
        # 'ign': code from GTEx v8 -- project each gene onto a gaussian
        norm_df = rnaseqnorm.inverse_normal_transform(
            pd.DataFrame(np.transpose(raw_pseudobulk_expression)))
        normalized_expression = np.transpose(np.asarray(norm_df))

    # Save normalized pseudobulk gene expression to output file
    pseudobulk_expression_file = pb_expression_output_root + 'normalized_expression.txt'
    np.savetxt(pseudobulk_expression_file,
               normalized_expression,
               fmt="%s",
               delimiter='\t')

    # Run PCA on pseudobulk data (the PCA step reads the file just written)
    pca_file = pb_expression_output_root + 'pca_scores.txt'
    pca_ve_file = pb_expression_output_root + 'pca_pve.txt'
    generate_pca_scores_and_variance_explained(pseudobulk_expression_file,
                                               num_pcs, pca_file, pca_ve_file)
def standardize_expression(tpm_expression_matrix_file,
                           standardized_tpm_expression_matrix_file):
    """Quantile-normalize and inverse-normal-transform a TPM matrix file.

    The input is a tab-delimited table read entirely as strings: the first
    row (from the second field on) holds gene names, the first column (from
    the second row on) holds sample IDs, and the remaining cells are parsed
    as floats.

    The transposed matrix is passed through rnaseqnorm.normalize_quantiles
    and then rnaseqnorm.inverse_normal_transform. The output is
    tab-delimited with one row per gene: a header line starting with
    'SampleId' followed by the sample IDs, then each gene name followed by
    its transformed values.

    Args:
        tpm_expression_matrix_file: path of the input table.
        standardized_tpm_expression_matrix_file: path of the output table
            (overwritten if it exists).
    """
    tpm_full = np.loadtxt(tpm_expression_matrix_file,
                          dtype=str,
                          delimiter='\t')
    samples = tpm_full[1:, 0]
    genes = tpm_full[0, 1:]
    tpm = tpm_full[1:, 1:].astype(float)

    # rnaseqnorm is handed the transpose of the matrix as read from the file
    qn_df = rnaseqnorm.normalize_quantiles(pd.DataFrame(np.transpose(tpm)))
    norm_df = rnaseqnorm.inverse_normal_transform(qn_df)
    standardized_tpm = np.transpose(np.asarray(norm_df))

    # The context manager closes the file even if a write fails part-way
    with open(standardized_tpm_expression_matrix_file, 'w') as out:
        # print header
        out.write('SampleId\t' + '\t'.join(samples) + '\n')
        for gene_num, gene_name in enumerate(genes):
            expr = standardized_tpm[:, gene_num].astype(str)
            out.write(gene_name + '\t' + '\t'.join(expr) + '\n')
    '''
# Example no. 5
# 0
def main():
    """Quantile-normalize a count matrix and average replicate columns.

    Reads the tab-separated matrix at args.count_matrix (first column used
    as the index), normalizes it with normalize_quantiles, groups the
    columns by sample ID with the trailing digit removed, and writes two
    tab-separated files into args.output_dir:
      <prefix>.accessibility.txt  per-group row means
      <prefix>.quant_norm.pct     the normalized matrix with all replicates

    Raises:
        ValueError: if a sample ID does not end in a digit, so no group name
            can be derived from it.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(args.output_dir, exist_ok=True)

    count_matrix_df = pd.read_csv(args.count_matrix, sep='\t', index_col=0)

    print("Normalizing...")
    norm_matrix_df = normalize_quantiles(count_matrix_df)
    print(norm_matrix_df.head())

    print("Averaging replicate counts...")

    # regex for groups from sample ids
    # NOTE(review): the greedy \w+ leaves only the final digit to the
    # lookahead, so 'rep12' is grouped as 'rep1' -- confirm this is intended
    # for replicate numbers >= 10.
    group_pattern = re.compile(r'\w+(?=[0-9]+$)')
    groups = OrderedDict()
    for i, sid in enumerate(norm_matrix_df.columns):
        match = group_pattern.search(sid)
        if match is None:
            raise ValueError(
                'Cannot derive a group from sample id {!r}: expected a '
                'trailing digit'.format(sid))
        groups.setdefault(match.group(0), []).append(i)

    # get mean across group indices
    mu_norm_matrix_df = pd.DataFrame(0,
                                     index=norm_matrix_df.index,
                                     columns=list(groups))

    for grp, idc in groups.items():
        group_cols = norm_matrix_df.columns[idc]
        mu_norm_matrix_df[grp] = norm_matrix_df[group_cols].mean(axis=1)

    outfile = os.path.join(args.output_dir, args.prefix + '.accessibility.txt')
    mu_norm_matrix_df.to_csv(outfile, sep='\t')

    outfile = os.path.join(args.output_dir, args.prefix + '.quant_norm.pct')
    norm_matrix_df.to_csv(outfile, sep='\t')

    print(
        "wrote to *.accessibility (averaged) and *.quant_norm.pct (w/ replicates) to: {}"
        .format(args.output_dir))
def qn_normalization(tpm_df, mask):
    """Quantile-normalize the rows of tpm_df selected by mask, then apply
    rnaseqnorm.inverse_normal_transform and return the result.

    mask is used as a row selector via tpm_df.loc[mask].
    """
    selected_rows = tpm_df.loc[mask]
    return rnaseqnorm.inverse_normal_transform(
        rnaseqnorm.normalize_quantiles(selected_rows))