# Shared imports assumed by the examples below (these functions are excerpted
# from a larger module): numpy as np, copy, os, pandas as pd,
# matplotlib.pyplot as plt, scipy.stats as stats, scikit-learn's
# StratifiedKFold, RandomForestClassifier, and LogisticRegression, plus the
# project's own dataclasses/dc, datasetIO, softmax, batchnorm, and
# load_metadata helpers.
def encode_and_decode(dm, W, Be, Bd, activation, apply_activation_to_embedding=False, use_softmax=False, apply_activation_to_output=False, return_embedding=False, return_reconstruction_error=False, bn_encode_variables=None, bn_decode_variables=None):
    mat = dm.matrix.copy()
    if bn_encode_variables is None:
        for i, (w, b) in enumerate(zip(W, Be)):
            if i+1 < len(W):
                mat = activation(mat.dot(w) + b)
            else:
                if apply_activation_to_embedding:
                    if use_softmax:
                        mat = softmax(mat.dot(w) + b)
                    else:
                        mat = activation(mat.dot(w) + b)
                else:
                    mat = mat.dot(w) + b
    else:
        gammas, betas, moving_means, moving_variances = bn_encode_variables
        for i, (w, b, gamma, beta, moving_mean, moving_variance) in enumerate(zip(W, Be, gammas, betas, moving_means, moving_variances)):
            if i+1 < len(W) or apply_activation_to_embedding:  # note: use_softmax is not honored in the batchnorm path
                mat = activation(batchnorm(mat.dot(w), gamma, beta, moving_mean, moving_variance))
            else:
                mat = batchnorm(mat.dot(w), gamma, beta, moving_mean, moving_variance)
    if return_embedding:
        em = dataclasses.datamatrix(rowname=dm.rowname,
                                    rowlabels=dm.rowlabels.copy(),
                                    rowmeta=copy.deepcopy(dm.rowmeta),
                                    columnname='latent_component',
                                    columnlabels=np.array(['LC'+str(x) for x in range(mat.shape[1])], dtype='object'),
                                    columnmeta={},
                                    matrixname='sdae_encoding_of_'+dm.matrixname,
                                    matrix=mat.copy())
    if bn_decode_variables is None:
        for i, (w, b) in enumerate(zip(W[::-1], Bd[::-1])):
            if i+1 < len(W) or apply_activation_to_output:
                mat = activation(mat.dot(w.T) + b)
            else:
                mat = mat.dot(w.T) + b
    else:
        gammas, betas, moving_means, moving_variances = bn_decode_variables
        for i, (w, b, gamma, beta, moving_mean, moving_variance) in enumerate(zip(W[::-1], Bd[::-1], gammas, betas, moving_means, moving_variances)):
            if i+1 < len(W) or apply_activation_to_output:
                mat = activation(batchnorm(mat.dot(w.T), gamma, beta, moving_mean, moving_variance))
            else:
                mat = batchnorm(mat.dot(w.T), gamma, beta, moving_mean, moving_variance)
    rm = dataclasses.datamatrix(rowname=dm.rowname,
                                rowlabels=dm.rowlabels.copy(),
                                rowmeta=copy.deepcopy(dm.rowmeta),
                                columnname=dm.columnname,
                                columnlabels=dm.columnlabels.copy(),
                                columnmeta=copy.deepcopy(dm.columnmeta),
                                matrixname='decoding_from_sdae_encoding_of_'+dm.matrixname,
                                matrix=mat)
    reconstruction_error = np.mean((rm.matrix - dm.matrix)**2)
    if return_embedding and return_reconstruction_error:
        return rm, em, reconstruction_error
    elif return_embedding:
        return rm, em
    elif return_reconstruction_error:
        return rm, reconstruction_error
    else:
        return rm
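# Usage sketch (not from the source): the tied-weights math encode_and_decode
# performs, in plain numpy with made-up shapes and a relu activation. The
# decoder reuses the encoder weights transposed, in reverse layer order.
import numpy as np

rng = np.random.default_rng(0)
relu = lambda x: np.maximum(x, 0)

X = rng.standard_normal((8, 100))                    # 8 samples, 100 features
W = [0.1 * rng.standard_normal((100, 32)),
     0.1 * rng.standard_normal((32, 2))]             # encoder weights
Be = [np.zeros(32), np.zeros(2)]                     # encoder biases
Bd = [np.zeros(100), np.zeros(32)]                   # decoder biases

H = relu(X.dot(W[0]) + Be[0])                        # hidden layer
Z = H.dot(W[1]) + Be[1]                              # linear embedding
R = relu(Z.dot(W[1].T) + Bd[1]).dot(W[0].T) + Bd[0]  # tied-weights decode
print(np.mean((R - X) ** 2))                         # reconstruction error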
Example #2
def load_splitdata(rowdatapath,
                   columndatapath,
                   matrixdatapath,
                   studyname='',
                   dtype='float64',
                   delimiter='\t',
                   matrix_has_labels=True):
    rowname, rowlabels, rowmeta = load_metadata(rowdatapath, delimiter)
    columnname, columnlabels, columnmeta = load_metadata(
        columndatapath, delimiter)
    if matrix_has_labels:
        matrix = np.loadtxt(matrixdatapath,
                            dtype=dtype,
                            delimiter=delimiter,
                            skiprows=1,
                            usecols=range(1,
                                          len(columnlabels) + 1),
                            ndmin=2)
    else:
        matrix = np.loadtxt(matrixdatapath,
                            dtype=dtype,
                            delimiter=delimiter,
                            ndmin=2)
    if studyname == '':
        studyname = matrixdatapath
    matrixname = '{0}-{1}_data_from_{2}'.format(rowname, columnname, studyname)
    return dc.datamatrix(rowname, rowlabels, columnname, columnlabels,
                         matrixname, matrix, rowmeta, columnmeta)
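# Note (assumption, based on skiprows=1 and usecols=range(1, n+1)): with
# matrix_has_labels=True, the matrix file is expected to carry a header row
# and a leading label column, both of which np.loadtxt skips, e.g.:
#
#   gene_id  sample1  sample2
#   ENSG01   3.1      0.2
#   ENSG02   1.7      5.4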
def encode(dm, W, Be, activation, apply_activation_to_embedding=False, use_softmax=False, bn_variables=None):
    mat = dm.matrix.copy()
    if bn_variables is None:
        for i, (w, b) in enumerate(zip(W, Be)):
            if i+1 < len(W):
                mat = activation(mat.dot(w) + b)
            else:
                if apply_activation_to_embedding:
                    if use_softmax:
                        mat = softmax(mat.dot(w) + b)
                    else:
                        mat = activation(mat.dot(w) + b)
                else:
                    mat = mat.dot(w) + b
    else:
        gammas, betas, moving_means, moving_variances = bn_variables
        for i, (w, b, gamma, beta, moving_mean, moving_variance) in enumerate(zip(W, Be, gammas, betas, moving_means, moving_variances)):
            if i+1 < len(W) or apply_activation_to_embedding:
                mat = activation(batchnorm(mat.dot(w), gamma, beta, moving_mean, moving_variance))
            else:
                mat = batchnorm(mat.dot(w), gamma, beta, moving_mean, moving_variance)
    em = dataclasses.datamatrix(rowname=dm.rowname,
                                rowlabels=dm.rowlabels.copy(),
                                rowmeta=copy.deepcopy(dm.rowmeta),
                                columnname='latent_component',
                                columnlabels=np.array(['LC'+str(x) for x in range(mat.shape[1])], dtype='object'),
                                columnmeta={},
                                matrixname='sdae_encoding_of_'+dm.matrixname,
                                matrix=mat)
    return em
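# Usage sketch (not from the source): a numerically stable row-wise softmax
# of the kind the use_softmax option presumably applies to the embedding
# layer; the actual softmax helper is defined elsewhere in this module.
import numpy as np

def softmax(x):
    e = np.exp(x - x.max(axis=1, keepdims=True))  # shift for stability
    return e / e.sum(axis=1, keepdims=True)

print(softmax(np.array([[1.0, 2.0, 3.0]])))  # rows sum to 1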
Example #4
def sdae_reconstruction(dm, W, Be, Bd, activation, apply_activation_to_output=False, return_embedding=False, return_reconstruction_error=False):
    mat = dm.matrix.copy()
    for i, (w, b) in enumerate(zip(W, Be)):
        if i+1 < len(W) or apply_activation_to_output:
            mat = activation(mat.dot(w) + b)
        else:
            mat = mat.dot(w) + b
    if return_embedding:
        em = dataclasses.datamatrix(rowname=dm.rowname,
                                    rowlabels=dm.rowlabels.copy(),
                                    rowmeta=copy.deepcopy(dm.rowmeta),
                                    columnname='latent_component',
                                    columnlabels=np.array(['LC'+str(x) for x in range(mat.shape[1])], dtype='object'),
                                    columnmeta={'activation_applied':np.full(mat.shape[1], apply_activation_to_output, dtype='bool')},
                                    matrixname='sdae_transform_of_'+dm.matrixname,
                                    matrix=mat.copy())
    if not apply_activation_to_output:
        # the embedding above was left linear; activate it before decoding
        mat = activation(mat)
    for i, (w, b) in enumerate(zip(W[::-1], Bd[::-1])):
        if i+1 < len(W) or apply_activation_to_output:
            mat = activation(mat.dot(w.T) + b)
        else:
            mat = mat.dot(w.T) + b
    rm = dataclasses.datamatrix(rowname=dm.rowname,
                                rowlabels=dm.rowlabels.copy(),
                                rowmeta=copy.deepcopy(dm.rowmeta),
                                columnname='reconstructed_' + dm.columnname,
                                columnlabels=dm.columnlabels.copy(),
                                columnmeta=copy.deepcopy(dm.columnmeta),
                                matrixname='reconstruction_from_sdae_transform_of_'+dm.matrixname,
                                matrix=mat)
    reconstruction_error = np.mean((rm.matrix - dm.matrix)**2)
    if return_embedding and return_reconstruction_error:
        return rm, em, reconstruction_error
    elif return_embedding:
        return rm, em
    elif return_reconstruction_error:
        return rm, reconstruction_error
    else:
        return rm
Example #5
def sdae_transform(dm, W, Be, activation, apply_activation_to_output=False):
    mat = dm.matrix.copy()
    for i, (w, b) in enumerate(zip(W, Be)):
        if i+1 < len(W) or apply_activation_to_output:
            mat = activation(mat.dot(w) + b)
        else:
            mat = mat.dot(w) + b
    em = dataclasses.datamatrix(rowname=dm.rowname,
                                rowlabels=dm.rowlabels.copy(),
                                rowmeta=copy.deepcopy(dm.rowmeta),
                                columnname='latent_component',
                                columnlabels=np.array(['LC'+str(x) for x in range(mat.shape[1])], dtype='object'),
                                columnmeta={'activation_applied':np.full(mat.shape[1], apply_activation_to_output, dtype='bool')},
                                matrixname='sdae_transform_of_'+dm.matrixname,
                                matrix=mat)
    return em
Example #6
def decode(em,
           W,
           Bd,
           activation,
           apply_activation_to_output=False,
           output_activation_mask=None,
           bn_variables=None):
    mat = em.matrix.copy()
    # avoid a mutable default argument; an empty mask activates no columns
    if output_activation_mask is None:
        output_activation_mask = []
    if bn_variables is None:
        for i, (w, b) in enumerate(zip(W[::-1], Bd[::-1])):
            if i + 1 < len(W):
                mat = activation(mat.dot(w.T) + b)
            elif apply_activation_to_output:
                mat = mat.dot(w.T) + b
                mat[:, output_activation_mask] = activation(
                    mat[:, output_activation_mask])
            else:
                mat = mat.dot(w.T) + b
    else:
        gammas, betas, moving_means, moving_variances = bn_variables
        for i, (w, b, gamma, beta, moving_mean, moving_variance) in enumerate(
                zip(W[::-1], Bd[::-1], gammas, betas, moving_means,
                    moving_variances)):
            if i + 1 < len(W):
                mat = activation(
                    batchnorm(mat.dot(w.T), gamma, beta, moving_mean,
                              moving_variance))
            elif apply_activation_to_output:
                mat = batchnorm(mat.dot(w.T), gamma, beta, moving_mean,
                                moving_variance)
                mat[:, output_activation_mask] = activation(
                    mat[:, output_activation_mask])
            else:
                mat = batchnorm(mat.dot(w.T), gamma, beta, moving_mean,
                                moving_variance)
    rm = dataclasses.datamatrix(
        rowname=em.rowname,
        rowlabels=em.rowlabels.copy(),
        rowmeta=copy.deepcopy(em.rowmeta),
        columnname='reconstructed_feature',
        columnlabels=np.array(['RF' + str(x) for x in range(mat.shape[1])],
                              dtype='object'),
        columnmeta={},
        matrixname='decoding_from_' + em.matrixname,
        matrix=mat)
    return rm
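# Usage sketch (not from the source): the output_activation_mask branch
# activates only the selected output columns and leaves the rest linear.
import numpy as np

relu = lambda x: np.maximum(x, 0)
out = np.array([[-1.0, -2.0, 3.0],
                [ 4.0, -5.0, -6.0]])
mask = np.array([0, 2])              # columns that get the activation
out[:, mask] = relu(out[:, mask])
print(out)                           # column 1 keeps its negative values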
Example #7
def sdae_inverse_transform(em, W, Bd, activation, apply_activation_to_output=False):
    if not em.columnmeta['activation_applied'].any():
        mat = activation(em.matrix)
    else:
        mat = em.matrix.copy()
    for i, (w, b) in enumerate(zip(W[::-1], Bd[::-1])):
        if i+1 < len(W) or apply_activation_to_output:
            mat = activation(mat.dot(w.T) + b)
        else:
            mat = mat.dot(w.T) + b
    rm = dataclasses.datamatrix(rowname=em.rowname,
                                rowlabels=em.rowlabels.copy(),
                                rowmeta=copy.deepcopy(em.rowmeta),
                                columnname='reconstructed_feature',
                                columnlabels=np.array(['RF'+str(x) for x in range(mat.shape[1])], dtype='object'),
                                columnmeta={},
                                matrixname='reconstruction_from_'+em.matrixname,
                                matrix=mat)
    return rm
Example #8
def main(study_name='your_study'):

    # load your data and create datamatrix object
    with open('data/original_data/{0}/ensembl_gene_ids.txt'.format(study_name),
              mode='rt',
              encoding='utf-8',
              errors='surrogateescape') as fr:
        ensembl_gene_ids = np.array([x.strip() for x in fr.read().split('\n')],
                                    dtype='object')

    with open('data/original_data/{0}/sample_ids.txt'.format(study_name),
              mode='rt',
              encoding='utf-8',
              errors='surrogateescape') as fr:
        sample_ids = np.array([x.strip() for x in fr.read().split('\n')],
                              dtype='object')

    counts_matrix = np.loadtxt(
        'data/original_data/{0}/expression_matrix.txt.gz'.format(study_name),
        dtype='float64',
        delimiter='\t',
        ndmin=2)
    total_counts_per_sample = counts_matrix.sum(0)

    gene_sample = dataclasses.datamatrix(
        rowname='ensembl_gene_id',
        rowlabels=ensembl_gene_ids,
        rowmeta={},
        columnname='sample_id',
        columnlabels=sample_ids,
        columnmeta={'total_counts': total_counts_per_sample},
        matrixname='rnaseq_gene_counts_from_{0}'.format(study_name),
        matrix=counts_matrix)
    del ensembl_gene_ids, sample_ids, counts_matrix, total_counts_per_sample

    # scale counts to a common library size:
    # exp(log(c) - log(total) + log(4e7)) = c / total * 4e7,
    # i.e. counts per 40 million
    gene_sample.matrix = np.exp(
        np.log(gene_sample.matrix) -
        np.log(gene_sample.columnmeta['total_counts'].reshape(1, -1)) +
        (np.log(4) + 7 * np.log(10)))
    gene_sample.matrixname = 'rnaseq_scaled_counts_from_{0}'.format(study_name)

    # shuffle the data
    gene_sample.reorder(np.random.permutation(gene_sample.shape[0]), 0)
    gene_sample.reorder(np.random.permutation(gene_sample.shape[1]), 1)
    print(gene_sample)

    # load the reference data
    gene_sample_ref = datasetIO.load_datamatrix(
        'data/prepared_data/fat/train.pickle').totranspose()
    print(gene_sample_ref)

    # align genes
    tobediscarded = ~np.in1d(gene_sample.rowlabels,
                             gene_sample_ref.rowmeta['ensembl_gene_id'])
    gene_sample.discard(tobediscarded, 0)
    missing_ensembl_ids = gene_sample_ref.rowmeta['ensembl_gene_id'][~np.in1d(
        gene_sample_ref.rowmeta['ensembl_gene_id'], gene_sample.rowlabels)]
    gene_sample = gene_sample.tolabels(
        rowlabels=gene_sample_ref.rowmeta['ensembl_gene_id'].copy(),
        columnlabels=[])
    gene_sample.rowlabels = gene_sample_ref.rowlabels.copy()
    gene_sample.rowname = gene_sample_ref.rowname
    for k, v in gene_sample_ref.rowmeta.items():
        gene_sample.rowmeta[k] = v.copy()
    gene_sample.rowmeta['is_missing'] = np.in1d(
        gene_sample.rowmeta['ensembl_gene_id'], missing_ensembl_ids)
    gene_sample.rowmeta['all_zero'] = (gene_sample.matrix == 0).all(1)
    print('missing data for {0!s} genes'.format(
        gene_sample.rowmeta['is_missing'].sum()))
    print('no counts for {0!s} genes'.format(
        gene_sample.rowmeta['all_zero'].sum()))
    print(gene_sample)

    # handle zeros
    nonzeromins = np.zeros(gene_sample.shape[1], dtype='float64')
    for j in range(gene_sample.shape[1]):
        nonzeromins[j] = gene_sample.matrix[gene_sample.matrix[:, j] > 0,
                                            j].min()
        gene_sample.matrix[gene_sample.matrix[:, j] == 0,
                           j] = nonzeromins[j] / 2.0

    # distributions
    #    plt.figure(); plt.hist(gene_sample.matrix[:,:5], 50)
    #    plt.figure(); plt.hist(((gene_sample.matrix[:5,:] - gene_sample.matrix[:5,:].mean(1, keepdims=True))/gene_sample.matrix[:5,:].std(1, ddof=1, keepdims=True)).T, 10)

    # log2
    gene_sample.matrix = np.log2(gene_sample.matrix)

    # distributions
    #    plt.figure(); plt.hist(gene_sample.matrix[:,:5], 50)
    #    plt.figure(); plt.hist(((gene_sample.matrix[:5,:] - gene_sample.matrix[:5,:].mean(1, keepdims=True))/gene_sample.matrix[:5,:].std(1, ddof=1, keepdims=True)).T, 10)

    # normalize samples
    median_shift_from_median = np.median(
        gene_sample.matrix -
        gene_sample.rowmeta['median_sample_ref'].reshape(-1, 1), 0)
    gene_sample.matrix -= median_shift_from_median.reshape(1, -1)

    # distributions
    #    plt.figure(); plt.hist(gene_sample.matrix[:,:5], 50)
    #    plt.figure(); plt.hist(((gene_sample.matrix[:5,:] - gene_sample.matrix[:5,:].mean(1, keepdims=True))/gene_sample.matrix[:5,:].std(1, ddof=1, keepdims=True)).T, 10)

    # standardize the data
    gene_sample.matrix = (
        gene_sample.matrix - gene_sample.rowmeta['row_mean_ref'].reshape(
            -1, 1)) / gene_sample.rowmeta['row_stdv_ref'].reshape(-1, 1)

    # handle missing genes
    gene_sample.matrix[gene_sample.rowmeta['is_missing'], :] = 0
    #    gene_sample.matrix[gene_sample.rowmeta['is_missing'],:] = gene_sample_ref.matrix[gene_sample.rowmeta['is_missing'],:].min(1, keepdims=True)/2.0

    # distributions
    #    plt.figure(); plt.hist(gene_sample.matrix[:,:5], 50)
    #    plt.figure(); plt.hist(gene_sample.matrix[:5,:].T, 10)
    #    plt.figure(); plt.hist(gene_sample.matrix.reshape(-1), 1000)

    # transpose the data
    atb_gene = gene_sample.totranspose()

    # split the data
    test_fraction = 0.1
    tobepopped = np.random.permutation(gene_sample.shape[0]) < round(
        max([test_fraction * gene_sample.shape[0], 2.0]))
    gene_sample_test = gene_sample.pop(tobepopped, 0)
    valid_fraction = 0.1
    tobepopped = np.random.permutation(gene_sample.shape[0]) < round(
        max([valid_fraction * gene_sample.shape[0], 2.0]))
    gene_sample_valid = gene_sample.pop(tobepopped, 0)
    gene_sample_train = gene_sample
    del gene_sample, tobepopped

    # save the data
    if not os.path.exists('data/prepared_data'):
        os.mkdir('data/prepared_data')
    if not os.path.exists('data/prepared_data/{0}'.format(study_name)):
        os.mkdir('data/prepared_data/{0}'.format(study_name))
    if not os.path.exists('data/prepared_data/{0}/skinny'.format(study_name)):
        os.mkdir('data/prepared_data/{0}/skinny'.format(study_name))
    datasetIO.save_datamatrix(
        'data/prepared_data/{0}/skinny/test.pickle'.format(study_name),
        gene_sample_test)
    datasetIO.save_datamatrix(
        'data/prepared_data/{0}/skinny/valid.pickle'.format(study_name),
        gene_sample_valid)
    datasetIO.save_datamatrix(
        'data/prepared_data/{0}/skinny/train.pickle'.format(study_name),
        gene_sample_train)
    del gene_sample_test, gene_sample_valid, gene_sample_train

    # split the data
    test_fraction = 0.1
    tobepopped = np.random.permutation(atb_gene.shape[0]) < round(
        max([test_fraction * atb_gene.shape[0], 2.0]))
    atb_gene_test = atb_gene.pop(tobepopped, 0)
    valid_fraction = 0.1
    tobepopped = np.random.permutation(atb_gene.shape[0]) < round(
        max([valid_fraction * atb_gene.shape[0], 2.0]))
    atb_gene_valid = atb_gene.pop(tobepopped, 0)
    atb_gene_train = atb_gene
    del atb_gene, tobepopped

    # save the data
    if not os.path.exists('data/prepared_data'):
        os.mkdir('data/prepared_data')
    if not os.path.exists('data/prepared_data/{0}'.format(study_name)):
        os.mkdir('data/prepared_data/{0}'.format(study_name))
    if not os.path.exists('data/prepared_data/{0}/fat'.format(study_name)):
        os.mkdir('data/prepared_data/{0}/fat'.format(study_name))
    datasetIO.save_datamatrix(
        'data/prepared_data/{0}/fat/test.pickle'.format(study_name),
        atb_gene_test)
    datasetIO.save_datamatrix(
        'data/prepared_data/{0}/fat/valid.pickle'.format(study_name),
        atb_gene_valid)
    datasetIO.save_datamatrix(
        'data/prepared_data/{0}/fat/train.pickle'.format(study_name),
        atb_gene_train)
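# Usage sketch (not from the source): the random-split trick used above.
# permutation(n) < k marks exactly k rows at random positions, with a floor
# of 2 rows per held-out split.
import numpy as np

n, test_fraction = 25, 0.1
k = round(max(test_fraction * n, 2.0))
tobepopped = np.random.permutation(n) < k
print(tobepopped.sum(), 'of', n, 'rows held out')  # always exactly k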
Example #9
def main():

    # load class examples
    print('loading class examples...', flush=True)
    class_examples_folder = 'targets/pharmaprojects'
    class_examples = {
        'positive':
        datasetIO.load_examples(
            '{0}/positive.txt'.format(class_examples_folder)),
        'negative':
        datasetIO.load_examples(
            '{0}/negative.txt'.format(class_examples_folder)),
        'unknown':
        datasetIO.load_examples(
            '{0}/unknown.txt'.format(class_examples_folder))
    }

    # load dataset info
    print('loading dataset info...', flush=True)
    dataset_info_path = 'datasets/harmonizome/dataset_info.txt'
    dataset_infos = datasetIO.load_datasetinfo(dataset_info_path)

    # specify results folder
    print('specifying results folder...', flush=True)
    results_folder = 'datasets/candidate_features'
    if not os.path.exists(results_folder):
        os.mkdir(results_folder)

    # iterate over datasets
    print('iterating over datasets...', flush=True)
    for dataset_info in dataset_infos:

        #        # just work with hpatissuesmrna for testing/debugging the pipeline
        #        if dataset_info['abbreviation'] != 'hpatissuesmrna_cleaned':
        #            print('skipping {0}. not in testing set...'.format(dataset_info['abbreviation']), flush=True)
        #            continue

        # check if another python instance is already working on this dataset
        if os.path.exists('{0}/{1}_in_progress.txt'.format(
                results_folder, dataset_info['abbreviation'])):
            print('skipping {0}. already in progress...'.format(
                dataset_info['abbreviation']),
                  flush=True)
            continue

        # log start of processing
        with open('{0}/{1}_in_progress.txt'.format(
                results_folder, dataset_info['abbreviation']),
                  mode='wt',
                  encoding='utf-8',
                  errors='surrogateescape') as fw:
            print('working on {0}...'.format(dataset_info['abbreviation']),
                  flush=True)
            fw.write('working on {0}...'.format(dataset_info['abbreviation']))

        # load dataset
        print('loading dataset...', flush=True)
        gene_atb = datasetIO.load_datamatrix(datasetpath=dataset_info['path'])
        dataset_info['original_genes'] = gene_atb.shape[0]
        dataset_info['original_features'] = gene_atb.shape[1]

        # decide feature normalization
        print('deciding feature normalization...', flush=True)
        if ('standardized' in dataset_info['abbreviation']
                or 'cleaned' in dataset_info['abbreviation']
            ) and (gene_atb.matrix == 0).sum() / gene_atb.size <= 0.5:
            # dataset is many-valued and filled-in
            print('    dataset is many-valued and filled-in...', flush=True)
            print('    z-scoring features...', flush=True)
            dataset_info['feature_normalization'] = 'z-score'
            mnv = np.nanmean(gene_atb.matrix, axis=0, keepdims=True)
            sdv = np.nanstd(gene_atb.matrix, axis=0, keepdims=True)
            gene_atb.matrix = (gene_atb.matrix - mnv) / sdv
            gene_atb.columnmeta['mean'] = mnv.reshape(-1)
            gene_atb.columnmeta['stdv'] = sdv.reshape(-1)
        else:
            # dataset is binary or tertiary or sparse
            print('    dataset is binary, tertiary, or sparse...', flush=True)
            print('    no feature normalization...', flush=True)
            dataset_info['feature_normalization'] = 'none'

        # assign class labels to genes
        print('assigning class labels to genes...', flush=True)
        gene_atb.rowmeta['class'] = np.full(gene_atb.shape[0],
                                            'unknown',
                                            dtype='object')
        gene_atb.rowmeta['class'][np.in1d(
            gene_atb.rowlabels, list(class_examples['positive']))] = 'positive'
        gene_atb.rowmeta['class'][np.in1d(
            gene_atb.rowlabels, list(class_examples['negative']))] = 'negative'

        # add dataset mean and stdv as features
        print('adding dataset mean and stdv as features...', flush=True)
        gene_stat = dataclasses.datamatrix(
            rowname=gene_atb.rowname,
            rowlabels=gene_atb.rowlabels.copy(),
            rowmeta=copy.deepcopy(gene_atb.rowmeta),
            columnname=gene_atb.columnname,
            columnlabels=np.array(['mean', 'stdv'], dtype='object'),
            columnmeta={},
            matrixname=gene_atb.matrixname,
            matrix=np.append(gene_atb.matrix.mean(1, keepdims=True),
                             gene_atb.matrix.std(1, keepdims=True), 1))
        gene_atb.append(gene_stat, 1)
        gene_atb.columnmeta['isrowstat'] = np.in1d(gene_atb.columnlabels,
                                                   gene_stat.columnlabels)
        del gene_stat

        # identify features with little information about labelled examples
        print(
            'identifying features with little information about labelled examples...',
            flush=True)
        isunknown = gene_atb.rowmeta['class'] == 'unknown'
        tobediscarded = np.logical_or.reduce(
            ((gene_atb.matrix[~isunknown, :] != 0).sum(axis=0) < 3,
             (gene_atb.matrix[~isunknown, :] != 1).sum(axis=0) < 3,
             np.isnan(gene_atb.matrix[~isunknown, :]).any(axis=0)))
        if tobediscarded.any():
            # discard features
            print('    discarding {0!s} features. {1!s} features remaining...'.
                  format(tobediscarded.sum(), (~tobediscarded).sum()),
                  flush=True)
            gene_atb.discard(tobediscarded, axis=1)
        else:
            # keep all features
            print('    no features to discard. {0!s} features remaining...'.
                  format(gene_atb.shape[1]),
                  flush=True)

        # save if dataset has content
        print('saving if dataset has content...', flush=True)
        if gene_atb.shape[0] == 0 or gene_atb.shape[1] == 0:
            # no content
            print('    nothing to save...', flush=True)
        else:
            # save candidate features
            print('    saving {0!s} candidate features...'.format(
                gene_atb.shape[1]),
                  flush=True)
            dataset_info['path'] = '{0}/{1}.txt.gz'.format(
                results_folder, dataset_info['abbreviation'])
            dataset_info['candidate_genes'] = gene_atb.shape[0]
            dataset_info['candidate_features'] = gene_atb.shape[1]
            dataset_info['positive_examples'] = (
                gene_atb.rowmeta['class'] == 'positive').sum()
            dataset_info['negative_examples'] = (
                gene_atb.rowmeta['class'] == 'negative').sum()
            dataset_info['unknown_examples'] = (
                gene_atb.rowmeta['class'] == 'unknown').sum()
            datasetIO.save_datamatrix(dataset_info['path'], gene_atb)
            datasetIO.append_datasetinfo(
                '{0}/dataset_info.txt'.format(results_folder), dataset_info)

    print('done.', flush=True)
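# Usage sketch (not from the source): the feature filter above discards a
# column unless, among labelled rows, it has at least 3 non-zero values, at
# least 3 non-one values, and no NaNs.
import numpy as np

M = np.array([[0., 1., np.nan, 2.],
              [0., 1., 2.,     3.],
              [0., 3., 4.,     4.],
              [1., 1., 5.,     5.]])
tobediscarded = np.logical_or.reduce((
    (M != 0).sum(axis=0) < 3,    # too few non-zero entries
    (M != 1).sum(axis=0) < 3,    # too few non-one entries
    np.isnan(M).any(axis=0)))    # any missing values
print(tobediscarded)             # [ True  True  True False]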
Example #10
def main(model_folders_path):
    
    print('reading list of model folders...', flush=True)
    with open(model_folders_path, mode='rt', encoding='utf-8', errors='surrogateescape') as fr:
        model_folders = fr.read().split('\n')
#    if '_v' in model_folders_path:
#        version = model_folders_path.replace('.txt', '').split('_')[-1]
    
    print('loading input datamatrix...', flush=True)
    model_folder_parts = model_folders[0].split('/')
    dataset_name = model_folder_parts[model_folder_parts.index('hp_search')+1]
    observed_ = datasetIO.load_datamatrix('../../input_data/{0}/datamatrix.pickle'.format(dataset_name))
    print(observed_, flush=True)
    
    print('attaching hla types...', flush=True)
    columnlabel_idx = {l:i for i,l in enumerate(observed_.columnlabels)}
    hla_types_df = pd.read_csv('../../original_data/1000genomes/20140702_hla_diversity.csv', index_col=False)
    for metalabel in hla_types_df.columns.values[1:]:
        observed_.columnmeta[metalabel] = np.full(observed_.shape[1], 'NA', dtype='object')
        for columnlabel, value in zip(hla_types_df['id'].values, hla_types_df[metalabel].values):
            if columnlabel in columnlabel_idx:
                columnidx = columnlabel_idx[columnlabel]
                observed_.columnmeta[metalabel][columnidx] = value
        uvals, counts = np.unique(observed_.columnmeta[metalabel], return_counts=True)
        max_num_uvals = 25
        if uvals.size > max_num_uvals:
            si = np.argsort(counts)[::-1]
            low_freq_uvals = uvals[si[max_num_uvals:]]
            observed_.columnmeta[metalabel][np.in1d(observed_.columnmeta[metalabel], low_freq_uvals)] = 'NA'
    
    for model_folder in model_folders:
        
        print('working on model_folder: {0}...'.format(model_folder), flush=True)
        input_path = '{0}/embedding.csv.gz'.format(model_folder)
        output_folder = '/'.join(model_folder.replace('/hp_search/', '/output_data/').split('/')[:-1]) + '/embeddings'
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)
        output_path_prefix = '{0}/{1}'.format(output_folder, model_folder.split('/')[-1])
        print('input_path: {0}'.format(input_path), flush=True)
        print('output_folder: {0}'.format(output_folder), flush=True)
        print('output_path_prefix: {0}'.format(output_path_prefix), flush=True)
        
        if os.path.exists(input_path):
            
            print('loading embedding datamatrix...', flush=True)
            df = pd.read_csv(input_path, index_col=False, usecols=[observed_.rowname, 'Latent1', 'Latent2'])
            hidden = dc.datamatrix(rowname=observed_.rowname,
                                   rowlabels=df[observed_.rowname].values,
                                   rowmeta={},
                                   columnname='latent_component',
                                   columnlabels=np.array(['Latent1', 'Latent2'], dtype='object'),
                                   columnmeta={},
                                   matrixname=observed_.rowname + '_embedding_from_' + observed_.matrixname,
                                   matrix=np.concatenate((df.Latent1.values.reshape(-1,1), df.Latent2.values.reshape(-1,1)), 1))
            del df
            print(hidden, flush=True)
            
            print('aligning input datamatrix and embedding datamatrix...', flush=True)
            if observed_.shape[0] == hidden.shape[0] and (observed_.rowlabels == hidden.rowlabels).all():
                observed = copy.deepcopy(observed_)
            else:
                observed = observed_.tolabels(rowlabels=hidden.rowlabels.copy())
            hidden.rowmeta = copy.deepcopy(observed.rowmeta)
            print(observed, flush=True)
            
            # visualization
            print('plotting embedding...', flush=True)
            fg, ax = plt.subplots(1, 1, figsize=(6.5,4.3))
            ax.set_position([0.15/6.5, 0.15/4.3, 4.0/6.5, 4.0/4.3])
            ax.plot(hidden.matrix[:,0], hidden.matrix[:,1], 'ok', markersize=1, markeredgewidth=0, alpha=0.5, zorder=0)
            ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False, left=False, right=False, labelleft=False, labelright=False)
#            ax.set_title('expl_var_frac: {0:1.3g}'.format(pca_model.explained_variance_ratio_.sum()), fontsize=8)
            ax.set_frame_on(False)
            fg.savefig('{0}.png'.format(output_path_prefix), transparent=True, pad_inches=0, dpi=300)
            plt.close()
            for metalabel in ['mean', 'stdv', 'position']:
                z = hidden.rowmeta[metalabel].astype('float64')
                fg, ax = plt.subplots(1, 1, figsize=(6.5,4.3))
                ax.set_position([0.15/6.5, 0.15/4.3, 4.0/6.5, 4.0/4.3])
                ax.scatter(hidden.matrix[:,0], hidden.matrix[:,1],  s=1, c=z, marker='o', edgecolors='none', cmap=plt.get_cmap('jet'), alpha=0.5, vmin=z.min(), vmax=z.max())
                ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False, left=False, right=False, labelleft=False, labelright=False)
#                ax.set_title('expl_var_frac: {0:1.3g}'.format(pca_model.explained_variance_ratio_.sum()), fontsize=8)
                ax.set_frame_on(False)
                fg.savefig('{0}_colored_by_{1}.png'.format(output_path_prefix, metalabel), transparent=True, pad_inches=0, dpi=300)
                plt.close()
            for metalabel in ['gene_name']:
                categories = np.unique(hidden.rowmeta[metalabel])
                cmap = plt.get_cmap('gist_rainbow')
                colors = [cmap(float((i+0.5)/len(categories))) for i in range(len(categories))]
                fg, ax = plt.subplots(1, 1, figsize=(6.5,4.3))
                ax.set_position([0.15/6.5, 0.15/4.3, 4.0/6.5, 4.0/4.3])
                for category, color in zip(categories, colors):
                    if category == 'NA':
                        color = 'k'
                        alpha = 0.1
                        zorder = 0
                    else:
                        alpha = 0.5
                        zorder = 1
                    hit = hidden.rowmeta[metalabel] == category
                    ax.plot(hidden.matrix[hit,0], hidden.matrix[hit,1], linestyle='None', linewidth=0, marker='o', markerfacecolor=color, markeredgecolor=color, markersize=2, markeredgewidth=0, alpha=alpha, zorder=zorder, label=category)
                ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False, left=False, right=False, labelleft=False, labelright=False)
                ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), borderaxespad=0, frameon=False, ncol=1, numpoints=1, markerscale=2, fontsize=8, labelspacing=0.25)
#                ax.set_title('expl_var_frac: {0:1.3g}'.format(pca_model.explained_variance_ratio_.sum()), fontsize=8)
                ax.set_frame_on(False)
                fg.savefig('{0}_colored_by_{1}.png'.format(output_path_prefix, metalabel), transparent=True, pad_inches=0, dpi=300)
                plt.close()
            hla_hit = np.array(['HLA-' in x for x in hidden.rowmeta['gene_name']], dtype='bool')
            hla_names = hidden.rowmeta['gene_name'].copy()
            hla_names[~hla_hit] = 'NA'
            categories = np.unique(hla_names)
            cmap = plt.get_cmap('gist_rainbow')
            colors = [cmap(float((i+0.5)/len(categories))) for i in range(len(categories))]
            fg, ax = plt.subplots(1, 1, figsize=(6.5,4.3))
            ax.set_position([0.15/6.5, 0.15/4.3, 4.0/6.5, 4.0/4.3])
            for category, color in zip(categories, colors):
                if category == 'NA':
                    color = 'k'
                    alpha = 0.1
                    zorder = 0
                else:
                    alpha = 0.5
                    zorder = 1
                hit = hla_names == category
                ax.plot(hidden.matrix[hit,0], hidden.matrix[hit,1], linestyle='None', linewidth=0, marker='o', markerfacecolor=color, markeredgecolor=color, markersize=1, markeredgewidth=0, alpha=alpha, zorder=zorder, label=category)
            ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False, left=False, right=False, labelleft=False, labelright=False)
            ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), borderaxespad=0, frameon=False, ncol=1, numpoints=1, markerscale=2, fontsize=8, labelspacing=0.25)
#            ax.set_title('expl_var_frac: {0:1.3g}'.format(pca_model.explained_variance_ratio_.sum()), fontsize=8)
            ax.set_frame_on(False)
            fg.savefig('{0}_colored_by_hlagene.png'.format(output_path_prefix), transparent=True, pad_inches=0, dpi=300)
            plt.close()
            
            print('computing right factor matrix...', flush=True)
            rightfactormat, residuals, rank, singular_values = np.linalg.lstsq(hidden.matrix, observed.matrix, rcond=None)
            factored = dc.datamatrix(rowname=observed.columnname,
                                     rowlabels=observed.columnlabels.copy(),
                                     rowmeta=copy.deepcopy(observed.columnmeta),
                                     columnname='latent_component',
                                     columnlabels=np.array(['Latent1', 'Latent2'], dtype='object'),
                                     columnmeta={},
                                     matrixname=observed.columnname + '_embedding_from_' + observed.matrixname,
                                     matrix=rightfactormat.T)
            
            print('plotting transpose embedding...', flush=True)
            fg, ax = plt.subplots(1, 1, figsize=(6.5,4.3))
            ax.set_position([0.15/6.5, 0.15/4.3, 4.0/6.5, 4.0/4.3])
            ax.plot(factored.matrix[:,0], factored.matrix[:,1], 'ok', markersize=2, markeredgewidth=0, alpha=0.5, zorder=0)
            ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False, left=False, right=False, labelleft=False, labelright=False)
#            ax.set_title('expl_var_frac: {0:1.3g}'.format(pca_model.explained_variance_ratio_.sum()), fontsize=8)
            ax.set_frame_on(False)
            fg.savefig('{0}_transpose.png'.format(output_path_prefix), transparent=True, pad_inches=0, dpi=300)
            plt.close()
            for metalabel in factored.rowmeta: # ['population', 'super_population', 'gender']:
                categories = np.unique(factored.rowmeta[metalabel])
                cmap = plt.get_cmap('gist_rainbow')
                colors = [cmap(float((i+0.5)/len(categories))) for i in range(len(categories))]
                fg, ax = plt.subplots(1, 1, figsize=(6.5,4.3))
                ax.set_position([0.15/6.5, 0.15/4.3, 4.0/6.5, 4.0/4.3])
                for category, color in zip(categories, colors):
                    if category == 'NA':
                        color = 'k'
                        alpha = 0.1
                        zorder = 0
                    else:
                        alpha = 0.5
                        zorder = 1
                    hit = factored.rowmeta[metalabel] == category
                    ax.plot(factored.matrix[hit,0], factored.matrix[hit,1], linestyle='None', linewidth=0, marker='o', markerfacecolor=color, markeredgecolor=color, markersize=2, markeredgewidth=0, alpha=alpha, zorder=zorder, label=category)
                ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False, left=False, right=False, labelleft=False, labelright=False)
                ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), borderaxespad=0, frameon=False, ncol=1, numpoints=1, markerscale=2, fontsize=8, labelspacing=0.25)
#                ax.set_title('expl_var_frac: {0:1.3g}'.format(pca_model.explained_variance_ratio_.sum()), fontsize=8)
                ax.set_frame_on(False)
                fg.savefig('{0}_transpose_colored_by_{1}.png'.format(output_path_prefix, metalabel), transparent=True, pad_inches=0, dpi=300)
                plt.close()

    print('done plot_embeddings.py', flush=True)
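# Usage sketch (not from the source): the least-squares step above solves
# hidden @ R ~= observed, so R.T places each observed column in the same
# 2-d latent space as the rows.
import numpy as np

rng = np.random.default_rng(0)
H = rng.standard_normal((500, 2))           # sample embedding (rows x 2)
X = H.dot(rng.standard_normal((2, 40)))     # observed matrix it explains
R, *_ = np.linalg.lstsq(H, X, rcond=None)   # right factor, shape (2, 40)
print(R.T.shape)                            # (40, 2): one point per column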
Example #11
def get_classifier_performance_stats(Y,
                                     P,
                                     uP=1000,
                                     classifier_stats='all',
                                     plot_curves=True,
                                     get_priority_cutoffs=True,
                                     pp_min_frac=0.1,
                                     xx_min_frac=0.01):
    if isinstance(uP, int):
        uP = get_unique_pcuts(P=P, max_cuts=uP).reshape(-1, 1)
    elif len(uP.shape) == 1:
        uP = uP.reshape(-1, 1)
    if isinstance(classifier_stats, str):
        classifier_stats = np.array([
            'p', 'n', 'ap', 'an', 'pp', 'pn', 'tp', 'fp', 'tn', 'fn', 'tpr',
            'fpr', 'auroc', 'fnr', 'tnr', 'mcr', 'acc', 'fdr', 'ppv', 'auprc',
            'fomr', 'npv', 'plr', 'nlr', 'dor', 'drr', 'darr', 'mrr', 'marr',
            'f1s', 'mcc', 'fnlp'
        ],
                                    dtype='object')
    # additive smoothing of the confusion-matrix counts: adding 0.05 to a
    # count of 5 introduces ~1% error, so don't take seriously any cutoff
    # where a count is less than 5 (nnt is extremely sensitive to this
    # adjustment, but not where nnt is actually reasonable)
    n = np.float64(Y.size) + 0.2
    ap = Y.sum().astype('float64') + 0.1
    an = (~Y).sum().astype('float64') + 0.1
    pp = (P >= uP).sum(1).astype('float64') + 0.1
    pn = (P < uP).sum(1).astype('float64') + 0.1
    tp = np.logical_and(P >= uP, Y).sum(1).astype('float64') + 0.05
    fp = np.logical_and(P >= uP, ~Y).sum(1).astype('float64') + 0.05
    tn = np.logical_and(P < uP, ~Y).sum(1).astype('float64') + 0.05
    fn = np.logical_and(P < uP, Y).sum(1).astype('float64') + 0.05
    uP = uP.reshape(-1)
    tpr = tp / ap  # sensitivity, recall, 1-fnr
    fpr = fp / an  # fall-out, 1-tnr, 1-specificity
    auroc = np.trapz(tpr, fpr)
    fnr = fn / ap  # miss rate
    tnr = tn / an  # specificity
    mcr = (fp + fn) / n
    acc = (tp + tn) / n
    fdr = fp / pp
    ppv = tp / pp  # precision = 1-fdr
    auprc = np.trapz(ppv, tpr)
    fomr = fn / pn  # false omission rate
    npv = tn / pn
    plr = (tp / fp) / (
        ap / an
    )  # ratio of positives to negatives in positive predictions relative to ratio in whole sample, higher is better, tpr/fpr
    nlr = (fn / tn) / (
        ap / an
    )  # ratio of positives to negatives in negative predictions relative to ratio in whole sample, lower is better, fnr/tnr
    dor = (tp / fp) / (
        fn / tn
    )  # ratio of positives to negatives in positive predictions, divided by ratio of positives to negatives in negative predictions, positivelikelihoodratio/negativelikelihoodratio
    drr = (tp / pp) / (
        fn / pn
    )  # relative risk or risk ratio, fraction of positives in positive predictions divided by fraction of positives in negative predictions, ppv/fomr
    darr = (tp / pp) - (
        fn / pn
    )  # absolute risk reduction, fraction of positives in positive predictions minus fraction of positives in negative predictions, ppv - fomr
    mrr = (tp / pp) / (
        ap / n
    )  # modified (by me) relative risk or risk ratio, fraction of positives in positive predictions divided by fraction of positives in whole sample, ppv/prevalence
    marr = (tp / pp) - (
        ap / n
    )  # modified (by me) absolute risk reduction, fraction of positives in positive predictions minus fraction of positives in whole sample, ppv - prevalence
    f1s = 2 * tp / (2 * tp + fp + fn)
    mcc = (tp * tn - fp * fn) / np.sqrt(
        (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    fnlp = -stats.hypergeom.logsf(tp, n, ap, pp, loc=1) / np.log(10)  # -log10 p-value of the positives-vs-predictions overlap (hypergeometric test)
    results_dict = {
        'p': uP,
        'n': n,
        'ap': ap,
        'an': an,
        'pp': pp,
        'pn': pn,
        'tp': tp,
        'fp': fp,
        'tn': tn,
        'fn': fn,
        'tpr': tpr,
        'fpr': fpr,
        'auroc': auroc,
        'fnr': fnr,
        'tnr': tnr,
        'mcr': mcr,
        'acc': acc,
        'fdr': fdr,
        'ppv': ppv,
        'auprc': auprc,
        'fomr': fomr,
        'npv': npv,
        'plr': plr,
        'nlr': nlr,
        'dor': dor,
        'drr': drr,
        'darr': darr,
        'mrr': mrr,
        'marr': marr,
        'f1s': f1s,
        'mcc': mcc,
        'fnlp': fnlp
    }
    stat_cut = dataclasses.datamatrix(
        rowname='classifier_performance_stat',
        rowlabels=classifier_stats.copy(),
        rowmeta={},
        columnname='probability_cutoff',
        columnlabels=uP.copy(),
        columnmeta={},
        matrixname='classifier_performance_stats_vs_probability_cutoffs',
        matrix=np.zeros((classifier_stats.size, uP.size), dtype='float64'))
    for i, stat in enumerate(stat_cut.rowlabels):
        stat_cut.matrix[i, :] = results_dict[stat]
    if get_priority_cutoffs:
        get_priority_cutoff_metadata(stat_cut, pp_min_frac, xx_min_frac)
    if plot_curves:
        plt.figure()
        plt.subplot(2, 2, 1)
        plt.plot(fpr, tpr, 'k-')
        plt.ylabel('tpr, sensitivity, recall')
        plt.xlabel('fpr, 1-specificity, fall-out')
        plt.axis([0, 1, 0, 1])
        plt.subplot(2, 2, 2)
        plt.plot(tpr, ppv, 'k-')
        plt.ylabel('ppv, precision, 1-fdr')
        plt.xlabel('tpr, sensitivity, recall')
        plt.axis([0, 1, 0, 1])
        plt.subplot(2, 2, 3)
        plt.plot(uP, mcr, 'k-')
        plt.ylabel('mcr')
        plt.xlabel('p')
        plt.axis([0, 1, 0, 1])
        plt.gca().invert_xaxis()
        plt.subplot(2, 2, 4)
        plt.plot(uP, mcc, 'k-')
        plt.ylabel('mcc')
        plt.xlabel('p')
        plt.axis([0, 1, 0, 1])
        plt.gca().invert_xaxis()
    return stat_cut
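# Usage sketch (not from the source): the smoothed ROC computation above, in
# plain numpy with made-up labels and scores. Cutoffs are sorted descending
# (an assumption about get_unique_pcuts) so fpr increases along the curve.
import numpy as np

rng = np.random.default_rng(0)
Y = rng.random(200) < 0.3                     # boolean labels
P = np.clip(rng.random(200) + 0.2 * Y, 0, 1)  # slightly informative scores
uP = np.unique(P)[::-1].reshape(-1, 1)        # descending cutoffs

ap = Y.sum() + 0.1
an = (~Y).sum() + 0.1
tp = np.logical_and(P >= uP, Y).sum(1) + 0.05
fp = np.logical_and(P >= uP, ~Y).sum(1) + 0.05
print('auroc ~', np.trapz(tp / ap, fp / an))  # should exceed 0.5 here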
Example #12
def get_classifier_performance_stats(Y,
                                     P,
                                     uP=1000,
                                     classifier_stats='all',
                                     plot_curves=True,
                                     get_priority_cutoffs=True,
                                     pp_min_frac=0.1,
                                     xx_min_frac=0.01):
    if isinstance(uP, int):
        uP = get_unique_pcuts(P=P, max_cuts=uP).reshape(-1, 1)
    elif len(uP.shape) == 1:
        uP = uP.reshape(-1, 1)
    if isinstance(classifier_stats, str):
        classifier_stats = np.array([
            'p', 'n', 'ap', 'an', 'pp', 'pn', 'tp', 'fp', 'tn', 'fn', 'tpr',
            'fpr', 'auroc', 'fnr', 'tnr', 'mcr', 'acc', 'fdr', 'ppv', 'auprc',
            'fomr', 'npv', 'plr', 'nlr', 'dor', 'drr', 'darr', 'mrr', 'marr',
            'f1s', 'mcc', 'fnlp'
        ],
                                    dtype='object')
    # additive smoothing of the confusion-matrix counts: adding 0.05 to a
    # count of 5 introduces ~1% error, so don't take seriously any cutoff
    # where a count is less than 5 (nnt is extremely sensitive to this
    # adjustment, but not where nnt is actually reasonable)
    n = np.float64(Y.size) + 0.2
    ap = Y.sum().astype('float64') + 0.1
    an = (~Y).sum().astype('float64') + 0.1
    pp = (P >= uP).sum(1).astype('float64') + 0.1
    pn = (P < uP).sum(1).astype('float64') + 0.1
    tp = np.logical_and(P >= uP, Y).sum(1).astype('float64') + 0.05
    fp = np.logical_and(P >= uP, ~Y).sum(1).astype('float64') + 0.05
    tn = np.logical_and(P < uP, ~Y).sum(1).astype('float64') + 0.05
    fn = np.logical_and(P < uP, Y).sum(1).astype('float64') + 0.05
    uP = uP.reshape(-1)
    stat_fun_params = (uP, n, ap, an, pp, pn, tp, fp, tn, fn)
    stat_fun = {
        'p':
        lambda P, N, AP, AN, PP, PN, TP, FP, TN, FN: P,
        'n':
        lambda P, N, AP, AN, PP, PN, TP, FP, TN, FN: N,
        'ap':
        lambda P, N, AP, AN, PP, PN, TP, FP, TN, FN: AP,
        'an':
        lambda P, N, AP, AN, PP, PN, TP, FP, TN, FN: AN,
        'pp':
        lambda P, N, AP, AN, PP, PN, TP, FP, TN, FN: PP,
        'pn':
        lambda P, N, AP, AN, PP, PN, TP, FP, TN, FN: PN,
        'tp':
        lambda P, N, AP, AN, PP, PN, TP, FP, TN, FN: TP,
        'fp':
        lambda P, N, AP, AN, PP, PN, TP, FP, TN, FN: FP,
        'tn':
        lambda P, N, AP, AN, PP, PN, TP, FP, TN, FN: TN,
        'fn':
        lambda P, N, AP, AN, PP, PN, TP, FP, TN, FN: FN,
        'tpr':
        lambda P, N, AP, AN, PP, PN, TP, FP, TN, FN: TP / AP,
        'fpr':
        lambda P, N, AP, AN, PP, PN, TP, FP, TN, FN: FP / AN,
        'auroc':
        lambda P, N, AP, AN, PP, PN, TP, FP, TN, FN: np.trapz(
            TP / AP, FP / AN),
        'fnr':
        lambda P, N, AP, AN, PP, PN, TP, FP, TN, FN: FN / AP,
        'tnr':
        lambda P, N, AP, AN, PP, PN, TP, FP, TN, FN: TN / AN,
        'mcr':
        lambda P, N, AP, AN, PP, PN, TP, FP, TN, FN: (FP + FN) / N,
        'acc':
        lambda P, N, AP, AN, PP, PN, TP, FP, TN, FN: (TP + TN) / N,
        'fdr':
        lambda P, N, AP, AN, PP, PN, TP, FP, TN, FN: FP / PP,
        'ppv':
        lambda P, N, AP, AN, PP, PN, TP, FP, TN, FN: TP / PP,
        'auprc':
        lambda P, N, AP, AN, PP, PN, TP, FP, TN, FN: np.trapz(
            TP / PP, TP / AP),
        'fomr':
        lambda P, N, AP, AN, PP, PN, TP, FP, TN, FN: FN / PN,
        'npv':
        lambda P, N, AP, AN, PP, PN, TP, FP, TN, FN: TN / PN,
        'plr':
        lambda P, N, AP, AN, PP, PN, TP, FP, TN, FN: (TP / FP) / (AP / AN),
        'nlr':
        lambda P, N, AP, AN, PP, PN, TP, FP, TN, FN: (FN / TN) / (AP / AN),
        'dor':
        lambda P, N, AP, AN, PP, PN, TP, FP, TN, FN: (TP / FP) / (FN / TN),
        'drr':
        lambda P, N, AP, AN, PP, PN, TP, FP, TN, FN: (TP / PP) / (FN / PN),
        'darr':
        lambda P, N, AP, AN, PP, PN, TP, FP, TN, FN: (TP / PP) - (FN / PN),
        'mrr':
        lambda P, N, AP, AN, PP, PN, TP, FP, TN, FN: (TP / PP) / (AP / N),
        'marr':
        lambda P, N, AP, AN, PP, PN, TP, FP, TN, FN: (TP / PP) - (AP / N),
        'f1s':
        lambda P, N, AP, AN, PP, PN, TP, FP, TN, FN: 2 * TP /
        (2 * TP + FP + FN),
        'mcc':
        lambda P, N, AP, AN, PP, PN, TP, FP, TN, FN:
        (TP * TN - FP * FN) / np.sqrt((TP + FP) * (TP + FN) * (TN + FP) *
                                      (TN + FN)),
        'fnlp':
        lambda P, N, AP, AN, PP, PN, TP, FP, TN, FN: -stats.hypergeom.logsf(
            TP, N, AP, PP, loc=1) / np.log(10)
    }
    stat_cut = dataclasses.datamatrix(
        rowname='classifier_performance_stat',
        rowlabels=classifier_stats.copy(),
        rowmeta={},
        columnname='probability_cutoff',
        columnlabels=uP.copy(),
        columnmeta={},
        matrixname='classifier_performance_stats_vs_probability_cutoffs',
        matrix=np.zeros((classifier_stats.size, uP.size), dtype='float64'))
    for i, stat in enumerate(stat_cut.rowlabels):
        stat_cut.matrix[i, :] = stat_fun[stat](*stat_fun_params)
    if get_priority_cutoffs:
        get_priority_cutoff_metadata(stat_cut, pp_min_frac, xx_min_frac)
    if plot_curves:
        plt.figure()
        plt.subplot(2, 2, 1)
        plt.plot(stat_cut.select('fpr', []), stat_cut.select('tpr', []), 'k-')
        plt.ylabel('tpr, sensitivity, recall')
        plt.xlabel('fpr, 1-specificity, fall-out')
        plt.axis([0, 1, 0, 1])
        plt.subplot(2, 2, 2)
        plt.plot(stat_cut.select('tpr', []), stat_cut.select('ppv', []), 'k-')
        plt.ylabel('ppv, precision, 1-fdr')
        plt.xlabel('tpr, sensitivity, recall')
        plt.axis([0, 1, 0, 1])
        plt.subplot(2, 2, 3)
        plt.plot(stat_cut.select('p', []), stat_cut.select('mcr', []), 'k-')
        plt.ylabel('mcr')
        plt.xlabel('p')
        plt.axis([0, 1, 0, 1])
        plt.gca().invert_xaxis()
        plt.subplot(2, 2, 4)
        plt.plot(stat_cut.select('p', []), stat_cut.select('mcc', []), 'k-')
        plt.ylabel('mcc')
        plt.xlabel('p')
        plt.axis([0, 1, 0, 1])
        plt.gca().invert_xaxis()
    return stat_cut
Example #13
stat_names = [
    'pc1loadings_mean', 'pc1loadings_stdv', 'reconerrors_mean',
    'reconerrors_stdv', 'tvalues_mean', 'tvalues_stdv', 'dvalues_mean',
    'dvalues_stdv', 'pranks_mean', 'pranks_stdv', 'tranks_mean',
    'tranks_stdv', 'dranks_mean', 'dranks_stdv',
    'significanceindicators_mean', 'significanceindicators_stdv'
]
results = dc.datamatrix(
    rowname=subds.columnname,
    rowlabels=subds.columnlabels.copy(),
    rowmeta=copy.deepcopy(subds.columnmeta),
    columnname='statistic_summary_numsamples',
    columnlabels=np.array(['{0}_N{1!s}'.format(x, y) for x in stat_names
                           for y in num_samples], dtype='object'),
    columnmeta={
        'statistic_summary': np.array(
            [x for x in stat_names for y in num_samples], dtype='object'),
        'statistic': np.array(
            [x.split('_')[0] for x in stat_names for y in num_samples],
            dtype='object'),
        'summary': np.array(
            [x.split('_')[1] for x in stat_names for y in num_samples],
            dtype='object'),
        'numsamples': np.array(
            [y for x in stat_names for y in num_samples], dtype='int64')
    },
    matrixname='gene_statistics_for_{0}_vs_{1}'.format(group_i, group_j),
    matrix=np.concatenate(
        (pc1loadings_mean, pc1loadings_stdv,
         np.broadcast_to(reconerrors_mean.reshape(-1, 1),
                         (num_samples.size, subds.shape[1])),
         np.broadcast_to(reconerrors_stdv.reshape(-1, 1),
                         (num_samples.size, subds.shape[1])),
         tvalues_mean, tvalues_stdv, dvalues_mean, dvalues_stdv,
         pranks_mean, pranks_stdv, tranks_mean, tranks_stdv,
         dranks_mean, dranks_stdv, significanceindicators_mean,
         significanceindicators_stdv), 0).T)
Example #14
def main(validation_rep=0, validation_fold=0):

    # load dataset info
    print('loading dataset info...', flush=True)
    dataset_info_path = 'datasets/merged_features/rep{0!s}_fold{1!s}/dataset_info.txt'.format(
        validation_rep, validation_fold)
    dataset_info = datasetIO.load_datasetinfo(dataset_info_path)[0]

    # load validation examples
    print('loading validation examples...', flush=True)
    validation_examples_path = 'targets/validation_examples/rep{0!s}_fold{1!s}.txt'.format(
        validation_rep, validation_fold)
    with open(validation_examples_path,
              mode='rt',
              encoding='utf-8',
              errors='surrogateescape') as fr:
        validation_examples = fr.read().split('\n')

    # specify results folder
    print('specifying results folder...', flush=True)
    results_folder = 'datasets/useful_features/rep{0!s}_fold{1!s}'.format(
        validation_rep, validation_fold)
    results_folder_parts = results_folder.split('/')
    for i in range(len(results_folder_parts)):
        results_folder_part = '/'.join(results_folder_parts[:i + 1])
        if not os.path.exists(results_folder_part):
            os.mkdir(results_folder_part)

    # load dataset
    print('loading dataset {0}...'.format(dataset_info['abbreviation']),
          flush=True)
    gene_atb = datasetIO.load_datamatrix(datasetpath=dataset_info['path'])

    # specify cross-validation parameters
    print('specifying cross-validation parameters...', flush=True)
    reps = 20
    folds = 5
    rf_trees = 1000
    include_logistic_regression = True
    skf = StratifiedKFold(n_splits=folds, shuffle=True)
    print('    reps: {0!s}'.format(reps))
    print('    folds: {0!s}'.format(folds))

    # initialize models
    print('initializing models...', flush=True)
    rfmodel = RandomForestClassifier(n_estimators=rf_trees,
                                     oob_score=False,
                                     n_jobs=-1,
                                     class_weight='balanced')
    print(rfmodel)
    lrmodel = LogisticRegression(penalty='l2',
                                 dual=False,
                                 tol=0.0001,
                                 C=1e3,
                                 fit_intercept=True,
                                 intercept_scaling=1e3,
                                 class_weight='balanced',
                                 random_state=None,
                                 solver='liblinear',
                                 max_iter=100,
                                 multi_class='ovr',
                                 verbose=0,
                                 warm_start=False,
                                 n_jobs=1)
    print(lrmodel)

    # initialize data matrices for collecting model feature importances and cross-validation performance stats
    print(
        'initializing data matrices for collecting model feature importances and cross-validation performance stats...',
        flush=True)
    classifier_stats = np.array([
        'p', 'n', 'ap', 'an', 'pp', 'pn', 'tp', 'fp', 'tn', 'fn', 'tpr', 'fpr',
        'auroc', 'fnr', 'tnr', 'mcr', 'acc', 'fdr', 'ppv', 'auprc', 'fomr',
        'npv', 'plr', 'nlr', 'dor', 'drr', 'darr', 'mrr', 'marr', 'f1s', 'mcc',
        'fnlp'
    ],
                                dtype='object')
    sm = dataclasses.datamatrix(
        rowname='classifier_performance_stat',
        rowlabels=classifier_stats.copy(),
        rowmeta={},
        columnname='model',
        columnlabels=np.array(['M' + str(x) for x in range(gene_atb.shape[1])],
                              dtype='object'),
        columnmeta={
            'num_features': np.zeros(gene_atb.shape[1], dtype='int64'),
            'features': np.full(gene_atb.shape[1], '', dtype='object'),
            'oob_score': np.zeros(gene_atb.shape[1], dtype='float64')
        },
        matrixname='crossvalidation_classifier_performance_stats_vs_models',
        matrix=np.zeros((classifier_stats.size, gene_atb.shape[1]),
                        dtype='float64'))
    stat_model_rf_mean = copy.deepcopy(sm)
    stat_model_rf_stdv = copy.deepcopy(sm)
    stat_model_lr_mean = copy.deepcopy(sm)
    stat_model_lr_stdv = copy.deepcopy(sm)
    del sm
    fm = dataclasses.datamatrix(
        rowname=gene_atb.columnname,
        rowlabels=gene_atb.columnlabels.copy(),
        rowmeta=copy.deepcopy(gene_atb.columnmeta),
        columnname='model',
        columnlabels=np.array(['M' + str(x) for x in range(gene_atb.shape[1])],
                              dtype='object'),
        columnmeta={
            'num_features': np.zeros(gene_atb.shape[1], dtype='int64'),
            'features': np.full(gene_atb.shape[1], '', dtype='object'),
            'oob_score': np.zeros(gene_atb.shape[1], dtype='float64')
        },
        matrixname='model_feature_importances',
        matrix=np.zeros((gene_atb.shape[1], gene_atb.shape[1]),
                        dtype='float64'))
    feature_model_rf = copy.deepcopy(fm)
    feature_model_lr = copy.deepcopy(fm)
    del fm

    # exclude validation and unlabeled examples from cross-validation loop
    print(
        'excluding validation and unlabeled examples from cross-validation loop...',
        flush=True)
    isvalidation = np.in1d(gene_atb.rowlabels, validation_examples)
    isunknown = gene_atb.rowmeta['class'] == 'unknown'
    istraintest = ~np.logical_or(isvalidation, isunknown)
    Y = (gene_atb.rowmeta['class'][istraintest] == 'positive')
    #X = gene_atb.matrix[istraintest,:]

    # perform incremental feature elimination with cross-validation
    print(
        'performing incremental feature elimination with cross-validation...',
        flush=True)
    for i in range(gene_atb.shape[1]):
        print('    features: {0!s}...'.format(gene_atb.shape[1] - i),
              flush=True)
        if i == 0:
            hit_rf = np.ones(gene_atb.shape[1], dtype='bool')
            hit_lr = np.ones(gene_atb.shape[1], dtype='bool')
        else:
            # keep features whose importance in the previous model exceeds
            # the smallest nonzero importance (i.e. drop the weakest feature)
            prev_importances = feature_model_rf.matrix[:, i - 1]
            hit_rf = prev_importances > prev_importances[
                prev_importances > 0].min()
            #hit_lr = feature_model_lr.matrix[:,i-1] > feature_model_lr.matrix[feature_model_lr.matrix[:,i-1] > 0,i-1].min()
            hit_lr = hit_rf
        X_rf = gene_atb.matrix[istraintest, :][:, hit_rf]
        X_lr = gene_atb.matrix[istraintest, :][:, hit_lr]
        stat_rep_rf = np.zeros((classifier_stats.size, reps), dtype='float64')
        stat_rep_lr = np.zeros((classifier_stats.size, reps), dtype='float64')
        fi_rep_rf = np.zeros((X_rf.shape[1], reps), dtype='float64')
        fi_rep_lr = np.zeros((X_lr.shape[1], reps), dtype='float64')
        for rep in range(reps):
            print('        rep {0!s} of {1!s}...'.format(rep + 1, reps),
                  flush=True)
            Ptest_rf = np.zeros(Y.size, dtype='float64')
            Ptest_lr = np.zeros(Y.size, dtype='float64')
            fi_fold_rf = np.zeros((X_rf.shape[1], folds), dtype='float64')
            fi_fold_lr = np.zeros((X_lr.shape[1], folds), dtype='float64')
            for fold, (train_indices,
                       test_indices) in enumerate(skf.split(X_rf, Y)):
                print('            fold {0!s} of {1!s}...'.format(
                    fold + 1, folds),
                      flush=True)
                Y_train = Y[train_indices]
                X_rf_train = X_rf[train_indices]
                X_lr_train = X_lr[train_indices]
                #Y_test = Y[test_indices]
                X_rf_test = X_rf[test_indices]
                X_lr_test = X_lr[test_indices]
                rfmodel.fit(X_rf_train, Y_train)
                Ptest_rf[test_indices] = rfmodel.predict_proba(
                    X_rf_test)[:, rfmodel.classes_ == 1].reshape(-1)
                fi_fold_rf[:, fold] = rfmodel.feature_importances_
                lrmodel.fit(X_lr_train, Y_train)
                Ptest_lr[test_indices] = lrmodel.predict_proba(
                    X_lr_test)[:, lrmodel.classes_ == 1].reshape(-1)
                fi_fold_lr[:, fold] = np.abs(lrmodel.coef_.reshape(-1))
            fi_rep_rf[:, rep] = fi_fold_rf.mean(1)
            stat_cut = modelevaluation.get_classifier_performance_stats(
                Y=Y,
                P=Ptest_rf,
                classifier_stats=classifier_stats,
                plot_curves=False,
                get_priority_cutoffs=True)
            stat_rep_rf[:, rep] = stat_cut.matrix[:, stat_cut.columnmeta[
                'p50_cutoff']].reshape(-1)
            fi_rep_lr[:, rep] = fi_fold_lr.mean(1)
            stat_cut = modelevaluation.get_classifier_performance_stats(
                Y=Y,
                P=Ptest_lr,
                classifier_stats=classifier_stats,
                plot_curves=False,
                get_priority_cutoffs=True)
            stat_rep_lr[:, rep] = stat_cut.matrix[:, stat_cut.columnmeta[
                'p50_cutoff']].reshape(-1)
        feature_model_rf.matrix[hit_rf, i] = fi_rep_rf.mean(1)
        feature_model_rf.columnmeta['num_features'][i] = gene_atb.shape[1] - i
        feature_model_rf.columnmeta['features'][i] = '|'.join(
            gene_atb.columnlabels[hit_rf].tolist())
        stat_model_rf_mean.matrix[:, i] = stat_rep_rf.mean(1)
        stat_model_rf_mean.columnmeta['num_features'][
            i] = gene_atb.shape[1] - i
        stat_model_rf_mean.columnmeta['features'][i] = '|'.join(
            gene_atb.columnlabels[hit_rf].tolist())
        stat_model_rf_stdv.matrix[:, i] = stat_rep_rf.std(1)
        stat_model_rf_stdv.columnmeta['num_features'][
            i] = gene_atb.shape[1] - i
        stat_model_rf_stdv.columnmeta['features'][i] = '|'.join(
            gene_atb.columnlabels[hit_rf].tolist())
        feature_model_lr.matrix[hit_lr, i] = fi_rep_lr.mean(1)
        feature_model_lr.columnmeta['num_features'][i] = gene_atb.shape[1] - i
        feature_model_lr.columnmeta['features'][i] = '|'.join(
            gene_atb.columnlabels[hit_lr].tolist())
        stat_model_lr_mean.matrix[:, i] = stat_rep_lr.mean(1)
        stat_model_lr_mean.columnmeta['num_features'][
            i] = gene_atb.shape[1] - i
        stat_model_lr_mean.columnmeta['features'][i] = '|'.join(
            gene_atb.columnlabels[hit_lr].tolist())
        stat_model_lr_stdv.matrix[:, i] = stat_rep_lr.std(1)
        stat_model_lr_stdv.columnmeta['num_features'][
            i] = gene_atb.shape[1] - i
        stat_model_lr_stdv.columnmeta['features'][i] = '|'.join(
            gene_atb.columnlabels[hit_lr].tolist())
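    # worked example of the elimination rule above (illustrative numbers):
    # with previous importances [0.0, 0.31, 0.05, 0.64], the smallest nonzero
    # importance is 0.05, so the kept mask is [False, True, False, True] and
    # both the zero-importance feature and the weakest survivor are dropped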

    # concatenate data matrices with model feature importances
    print('concatenating data matrices with model feature importances...',
          flush=True)
    feature_model_rf.columnlabels += '_rf'
    feature_model_rf.columnmeta['model_type'] = np.full(
        feature_model_rf.shape[1], 'random_forest', dtype='object')
    feature_model_lr.columnlabels += '_lr'
    feature_model_lr.columnmeta['model_type'] = np.full(
        feature_model_lr.shape[1], 'logistic_regression', dtype='object')
    feature_model_rf.append(feature_model_lr, 1)
    feature_model = feature_model_rf
    del feature_model_rf, feature_model_lr

    # concatenate data matrices with model cross-validation performance stats
    print(
        'concatenating data matrices with model cross-validation performance stats...',
        flush=True)
    stat_model_rf_mean.rowlabels += '_mean'
    stat_model_rf_stdv.rowlabels += '_stdv'
    stat_model_rf_mean.append(stat_model_rf_stdv, 0)
    stat_model_rf_mean.columnlabels += '_rf'
    stat_model_rf_mean.columnmeta['model_type'] = np.full(
        stat_model_rf_mean.shape[1], 'random_forest', dtype='object')
    stat_model_lr_mean.rowlabels += '_mean'
    stat_model_lr_stdv.rowlabels += '_stdv'
    stat_model_lr_mean.append(stat_model_lr_stdv, 0)
    stat_model_lr_mean.columnlabels += '_lr'
    stat_model_lr_mean.columnmeta['model_type'] = np.full(
        stat_model_lr_mean.shape[1], 'logistic_regression', dtype='object')
    stat_model_rf_mean.append(stat_model_lr_mean, 1)
    stat_model = stat_model_rf_mean
    del stat_model_rf_mean

    # select simplest model (fewest features) with auroc and auprc within 95% of max
    print(
        'selecting simplest model (fewest features) with auroc and auprc within 95% of max...',
        flush=True)
    model_scores = 0.5 * (stat_model.select('auroc_mean', []) +
                          stat_model.select('auprc_mean', []))
    if include_logistic_regression:
        selected_model_index = np.where(
            model_scores >= 0.95 * model_scores.max())[0][-1]
    else:
        selected_model_index = np.where(
            np.logical_and(
                model_scores >=
                0.95 * model_scores[stat_model.columnmeta['model_type'] ==
                                    'random_forest'].max(),
                stat_model.columnmeta['model_type'] == 'random_forest'))[0][-1]
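        # model columns are ordered by decreasing feature count, so taking the
        # last qualifying index selects the model with the fewest features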
    selected_model_name = stat_model.columnlabels[selected_model_index]
    selected_model_features = feature_model.rowlabels[
        feature_model.matrix[:, selected_model_index] != 0]
    selected_model_type = stat_model.columnmeta['model_type'][
        selected_model_index]
    selected_model = rfmodel if selected_model_type == 'random_forest' else lrmodel
    gene_atb = gene_atb.tolabels(columnlabels=selected_model_features)
    feature_model_selected = feature_model.tolabels(
        columnlabels=selected_model_name)
    stat_model_selected = stat_model.tolabels(columnlabels=selected_model_name)
    print('    selected_model_name: {0}'.format(selected_model_name),
          flush=True)
    print('    selected_model_features: {0}'.format(
        '|'.join(selected_model_features)),
          flush=True)

    # iterate over selected features to rebuild design matrix
    print('iterating over selected features to rebuild design matrix...',
          flush=True)
    for i, (selected_feature, dataset_abbreviation) in enumerate(
            zip(gene_atb.columnlabels,
                gene_atb.columnmeta['dataset_abbreviation'])):

        # load dataset
        print('    loading dataset {0}...'.format(dataset_abbreviation),
              flush=True)
        dataset_path = 'datasets/generalizable_features/rep{0!s}_fold{1!s}/{2}.txt.gz'.format(
            validation_rep, validation_fold, dataset_abbreviation)
        gene_atb_i = datasetIO.load_datamatrix(dataset_path)
        gene_atb_i.columnmeta[
            'generalizability_pvalues_corrected'] = gene_atb_i.columnmeta[
                'generalizability_pvalues_corrected'].astype('float64')
        gene_atb_i.columnmeta['dataset_abbreviation'] = np.full(
            gene_atb_i.shape[1], dataset_abbreviation, dtype='object')
        gene_atb_i.columnmeta[
            'dataset_feature'] = gene_atb_i.columnlabels.copy()
        gene_atb_i.columnlabels += '_' + dataset_abbreviation
        gene_atb_i.rowname = 'GeneSym'
        gene_atb_i.columnname = 'Feature'
        if dataset_abbreviation == 'gtextissue_cleaned':
            gene_atb_i.discard(gene_atb_i.rowlabels == 'C12ORF55',
                               0)  # pesky duplicate row
        print(gene_atb_i)

        # select feature
        print('    selecting feature {0}...'.format(selected_feature),
              flush=True)
        gene_atb_i.discard(gene_atb_i.columnlabels != selected_feature, 1)

        # merge dataset
        print('    merging dataset...', flush=True)
        if i == 0:
            gene_atb_selected = copy.deepcopy(gene_atb_i)
            gene_atb_selected.matrixname = 'merged_target_features'
            print('        first dataset, no merge...', flush=True)
        else:
            common_genes = np.intersect1d(gene_atb_selected.rowlabels,
                                          gene_atb_i.rowlabels)
            gene_atb_selected = gene_atb_selected.tolabels(
                rowlabels=common_genes)
            gene_atb_i = gene_atb_i.tolabels(rowlabels=common_genes)
            gene_atb_selected.append(gene_atb_i, 1)
            print('        common_genes: {0!s}...'.format(common_genes.size),
                  flush=True)

    # normalize features
    print('normalizing features...', flush=True)
    gene_atb_selected.columnmeta['min'] = gene_atb_selected.matrix.min(0)
    gene_atb_selected.columnmeta['max'] = gene_atb_selected.matrix.max(0)
    gene_atb_selected.matrix = (
        gene_atb_selected.matrix - gene_atb_selected.columnmeta['min'].reshape(
            1, -1)) / (gene_atb_selected.columnmeta['max'].reshape(1, -1) -
                       gene_atb_selected.columnmeta['min'].reshape(1, -1))
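    # note: a column with max == min would divide by zero here; features are
    # assumed to have nonzero range after selection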

    # update metadata
    print('updating metadata...', flush=True)
    assert (gene_atb.columnlabels == gene_atb_selected.columnlabels).all()
    for field, values in gene_atb.columnmeta.items():
        if field not in gene_atb_selected.columnmeta:
            gene_atb_selected.columnmeta[field] = values
    print('old_num_genes:{0!s}\tnew_num_genes:{1!s}'.format(
        gene_atb.shape[0], gene_atb_selected.shape[0]),
          flush=True)
    del gene_atb

    # refit selected model
    print('refitting selected model...', flush=True)
    isvalidation = np.in1d(gene_atb_selected.rowlabels, validation_examples)
    isunknown = gene_atb_selected.rowmeta['class'] == 'unknown'
    istraintest = ~np.logical_or(isvalidation, isunknown)
    selected_model.fit(
        gene_atb_selected.matrix[istraintest, :],
        gene_atb_selected.rowmeta['class'][istraintest] == 'positive')

    # get predictions for validation and unlabeled examples
    print('getting predictions for validation and unlabeled examples...',
          flush=True)
    gene_model_selected = dataclasses.datamatrix(
        rowname=gene_atb_selected.rowname,
        rowlabels=gene_atb_selected.rowlabels.copy(),
        rowmeta=copy.deepcopy(gene_atb_selected.rowmeta),
        columnname=stat_model_selected.columnname,
        columnlabels=stat_model_selected.columnlabels.copy(),
        columnmeta=copy.deepcopy(stat_model_selected.columnmeta),
        matrixname=
        'success_probabilities_for_validation_and_unlabelled_examples',
        matrix=selected_model.predict_proba(
            gene_atb_selected.matrix)[:, selected_model.classes_ == 1])
    gene_model_selected.discard(istraintest, 0)

    # save results
    print('saving {0!s} useful features and model results...'.format(
        gene_atb_selected.shape[1]),
          flush=True)
    dataset_info['path'] = '{0}/{1}.txt.gz'.format(
        results_folder, dataset_info['abbreviation'])
    dataset_info['selected_model_name'] = selected_model_name
    dataset_info['selected_model_features'] = '|'.join(selected_model_features)
    dataset_info['selected_model_type'] = selected_model_type
    dataset_info['crossvalidation_reps'] = reps
    dataset_info['crossvalidation_folds'] = folds
    dataset_info['rf_trees'] = rf_trees
    dataset_info['include_logistic_regression'] = include_logistic_regression
    for stat_name, stat_values in zip(stat_model_selected.rowlabels,
                                      stat_model_selected.matrix):
        dataset_info[stat_name] = stat_values.item()
    datasetIO.save_datamatrix(dataset_info['path'], gene_atb_selected)
    datasetIO.save_datamatrix('{0}/stat_model.txt.gz'.format(results_folder),
                              stat_model)
    datasetIO.save_datamatrix(
        '{0}/feature_model.txt.gz'.format(results_folder), feature_model)
    datasetIO.save_datamatrix(
        '{0}/stat_model_selected.txt.gz'.format(results_folder),
        stat_model_selected)
    datasetIO.save_datamatrix(
        '{0}/feature_model_selected.txt.gz'.format(results_folder),
        feature_model_selected)
    datasetIO.save_datamatrix(
        '{0}/gene_model_selected.txt.gz'.format(results_folder),
        gene_model_selected)
    datasetIO.append_datasetinfo('{0}/dataset_info.txt'.format(results_folder),
                                 dataset_info)

    print('done.', flush=True)
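
A minimal sketch of driving this script for one outer validation split (the original entry point is not shown; the call below is an assumption):

if __name__ == '__main__':
    main(validation_rep=0, validation_fold=0)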
Example #15
classifier_cutoff = 'mcc_cutoff'
classifier_stats = np.array([
    'p', 'n', 'ap', 'an', 'pp', 'pn', 'tp', 'fp', 'tn', 'fn', 'tpr', 'fpr',
    'auroc', 'fnr', 'tnr', 'mcr', 'acc', 'fdr', 'ppv', 'auprc', 'fomr', 'npv',
    'plr', 'nlr', 'dor', 'drr', 'darr', 'mrr', 'marr', 'f1s', 'mcc', 'fnlp'
],
                            dtype='object')

# classifier stats for each of 200 repetitions of cross-validation
stat_rep = dataclasses.datamatrix(
    rowname='classifier_performance_stat',
    rowlabels=classifier_stats.copy(),
    rowmeta={},
    columnname='validation_rep',
    columnlabels=np.array(['Rep' + str(x) for x in range(validation_reps)],
                          dtype='object'),
    columnmeta={'validation_folds': np.zeros(validation_reps, dtype='int64')},
    matrixname=
    'crossvalidation_classifier_performance_stats_across_validation_reps',
    matrix=np.zeros((classifier_stats.size, validation_reps), dtype='float64'))

# classifier stats for each of 200 reps x 5 folds = 1000 train-test cycles
stat_fold = dataclasses.datamatrix(
    rowname='classifier_performance_stat',
    rowlabels=classifier_stats.copy(),
    rowmeta={},
    columnname='validation_rep_and_fold',
    columnlabels=np.full(validation_reps * validation_folds,
                         '',
                         dtype='object'),
Example #16
                    gene_names.append('NA')
                    gene_types.append('NA')
                    entrez_ids.append('NA')

    print('creating datamatrix object...', flush=True)
    dataset[partition] = dc.datamatrix(
        rowname='rsid',
        rowlabels=np.array(rsids, dtype='object'),
        rowmeta={
            'chromosome': np.array(chroms, dtype='object'),
            'position': np.array(poss, dtype='object'),
            'ref_allele': np.array(refs, dtype='object'),
            'alt_allele': np.array(alts, dtype='object'),
            'ensembl_gene_id': np.array(ensembl_gene_ids, dtype='object'),
            'gene_name': np.array(gene_names, dtype='object'),
            'gene_type': np.array(gene_types, dtype='object'),
            'entrez_id': np.array(entrez_ids, dtype='object')
        },
        columnname='genome_id',
        columnlabels=np.array(genome_ids, dtype='object'),
        columnmeta={
            'population': np.array(pops, dtype='object'),
            'super_population': np.array(super_pops, dtype='object'),
            'gender': np.array(genders, dtype='object')
        },
        matrixname='MHC_phased_genotypes_from_1000_genomes',
        matrix=np.array(genotype_matrix, dtype='float32'))
    print(dataset[partition], flush=True)

    for i in range(5):
        printdict = {
            dataset[partition].rowname: dataset[partition].rowlabels[i]
Example #17
def load_datamatrix(datasetpath,
                    delimiter='\t',
                    dtype='float64',
                    getmetadata=True,
                    getmatrix=True):
    if '.pickle' in datasetpath:
        with open(datasetpath, 'rb') as fr:
            return pickle.load(fr)
    else:
        if '.gz' in datasetpath:
            openfunc = gzip.open
        else:
            openfunc = open
        with openfunc(datasetpath,
                      mode='rt',
                      encoding="utf-8",
                      errors="surrogateescape") as fr:
            rowmeta = {}
            columnmeta = {}
            rowlabels = []
            entries = [x.strip() for x in fr.readline().split(delimiter)]
            skipcolumns = sum([entry == '#' for entry in entries]) + 1
            columnname = entries[skipcolumns - 1]
            columnlabels = np.array(entries[skipcolumns:], dtype='object')
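            # the header row marks row-metadata columns with '#', then gives
            # the column-axis name followed by the column labels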
            firstentry = entries[0]
            skiprows = 1
            if getmetadata:
                while firstentry == '#':
                    entries = [
                        x.strip() for x in fr.readline().split(delimiter)
                    ]
                    columnmetaname = entries[skipcolumns - 1].split('/')[-1]
                    if columnmetaname.lower() != 'na':
                        columnmeta[columnmetaname] = np.array(
                            entries[skipcolumns:], dtype='object')
                    firstentry = entries[0]
                    skiprows += 1
                rowname = firstentry
                rowmetanames = entries[1:skipcolumns]
                if len(rowmetanames) > 0:
                    rowmetanames[-1] = rowmetanames[-1].split('/')[0]
                rowmetaname_idx = {}
                for i, rowmetaname in enumerate(rowmetanames):
                    if rowmetaname.lower() != 'na':
                        rowmeta[rowmetaname] = []
                        rowmetaname_idx[rowmetaname] = i
                for line in fr:
                    entries = [
                        x.strip() for x in line.split(
                            delimiter, maxsplit=skipcolumns)[:skipcolumns]
                    ]
                    rowlabels.append(entries.pop(0))
                    for rowmetaname, idx in rowmetaname_idx.items():
                        rowmeta[rowmetaname].append(entries[idx])
                rowlabels = np.array(rowlabels, dtype='object')
                for rowmetaname, rowmetavalues in rowmeta.items():
                    rowmeta[rowmetaname] = np.array(rowmetavalues,
                                                    dtype='object')
            else:
                while firstentry == '#':
                    entries = [
                        x.strip() for x in fr.readline().split(delimiter)
                    ]
                    firstentry = entries[0]
                    skiprows += 1
                rowname = firstentry
                for line in fr:
                    rowlabels.append(
                        line.split(delimiter, maxsplit=1)[0].strip())
                rowlabels = np.array(rowlabels, dtype='object')
        if getmatrix:
            matrix = np.loadtxt(datasetpath,
                                dtype=dtype,
                                delimiter=delimiter,
                                skiprows=skiprows,
                                usecols=range(skipcolumns,
                                              len(columnlabels) + skipcolumns),
                                ndmin=2)
        else:
            matrix = np.zeros((0, 0), dtype=dtype)
        matrixname = rowname + '_' + columnname + '_associations_from_' + datasetpath
        return dc.datamatrix(rowname, rowlabels, columnname, columnlabels,
                             matrixname, matrix, rowmeta, columnmeta)
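
A minimal usage sketch (the path is hypothetical):

dm = load_datamatrix('datasets/example/gene_atb.txt.gz')
print(dm.matrixname)
print(dm.matrix.shape)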
Example #18
hit = np.in1d(sample_metadata['sample_id'], chosen_samples)
for field, values in sample_metadata.items():
    sample_metadata[field] = values[hit]
run_ids = run_ids[hit]

matrix = np.loadtxt(
    '../../original_data/GTEXv6plus/counts_gene.tsv.gz',
    dtype='float64',
    delimiter='\t',
    skiprows=1,
    usecols=hit.nonzero()[0],
    ndmin=2)

gene_tissue = dataclasses.datamatrix(
    rowname='ensembl_gene_id',
    rowlabels=ensembl_gene_ids,
    rowmeta={},
    columnname='recount2_run_id',
    columnlabels=run_ids,
    columnmeta=sample_metadata,
    matrixname='recount2_processed_rnaseq_counts_from_gtexv6',
    matrix=matrix)

datasetIO.save_datamatrix(
    '../../original_data/GTEXv6plus/gene_tissue_recount2gtexv6_chosen_samples_counts.pickle',
    gene_tissue)
datasetIO.save_datamatrix(
    '../../original_data/GTEXv6plus/gene_tissue_recount2gtexv6_chosen_samples_counts.txt.gz',
    gene_tissue)
Example #19
def main(dictionaries, year, datestamp, min_score):

    print('dictionaries: {0}, {1}'.format(dictionaries[0], dictionaries[1]))
    print('year: {0}'.format(year))
    print('datestamp: {0}'.format(datestamp))
    print('min_score: {0!s}'.format(min_score))

    # set term dictionaries and paths to dicts containing PMIDs for each term
    # these files are generated by get_term_pmids_from_termite.py
    row_dictionary = dictionaries[
        0]  # 'HUCELL', 'ANAT', 'INDICATION', 'HUCELLANAT', 'HUCELLANATINDICATION'
    row_pmids_path = 'term_pmid_dict_dictionary_{0}_year_{1}_datestamp_{2}_minscore_{3!s}.pickle'.format(
        row_dictionary, year, datestamp, min_score)
    column_dictionary = dictionaries[
        1]  # 'HUCELL', 'ANAT', 'INDICATION', 'HUCELLANAT', 'HUCELLANATINDICATION'
    column_pmids_path = 'term_pmid_dict_dictionary_{0}_year_{1}_datestamp_{2}_minscore_{3!s}.pickle'.format(
        column_dictionary, year, datestamp, min_score)

    hucellanat_path = 'term_pmid_dict_dictionary_{0}_year_{1}_datestamp_{2}_minscore_{3!s}.pickle'.format(
        'HUCELLANAT', year, datestamp, min_score)
    if 'HUCELLANAT' in dictionaries and not os.path.exists(hucellanat_path):
        # combine HUCELL and ANAT term-pmid dicts into a single dict
        print('creating {0}...'.format(hucellanat_path), flush=True)
        with open(hucellanat_path.replace('ANAT', ''), 'rb') as fr:
            term_pmids = pickle.load(fr)
        with open(hucellanat_path.replace('HUCELL', ''), 'rb') as fr:
            term_pmids.update(pickle.load(fr))
        with open(hucellanat_path, 'wb') as fw:
            pickle.dump(term_pmids, fw)
        del term_pmids

    hucellanatindication_path = 'term_pmid_dict_dictionary_{0}_year_{1}_datestamp_{2}_minscore_{3!s}.pickle'.format(
        'HUCELLANATINDICATION', year, datestamp, min_score)
    if 'HUCELLANATINDICATION' in dictionaries and not os.path.exists(
            hucellanatindication_path):
        # combine HUCELL ANAT and INDICATION term-pmid dicts into a single dict
        print('creating {0}...'.format(hucellanatindication_path), flush=True)
        with open(
                hucellanatindication_path.replace('HUCELLANATINDICATION',
                                                  'HUCELL'), 'rb') as fr:
            term_pmids = pickle.load(fr)
        with open(
                hucellanatindication_path.replace('HUCELLANATINDICATION',
                                                  'ANAT'), 'rb') as fr:
            term_pmids.update(pickle.load(fr))
        with open(
                hucellanatindication_path.replace('HUCELLANATINDICATION',
                                                  'INDICATION'), 'rb') as fr:
            term_pmids.update(pickle.load(fr))
        with open(hucellanatindication_path, 'wb') as fw:
            pickle.dump(term_pmids, fw)
        del term_pmids

    # first dictionary of biomedical terms
    # load dict mapping terms to PMID sets
    # parse dict to rowlabels and rowmetadata
    print('loading row_dictionary: {0}...'.format(row_dictionary), flush=True)
    with open(row_pmids_path, 'rb') as fr:
        rowterm_pmids = pickle.load(fr)
    rowlabels, rowmeta = get_labels_and_metadata(rowterm_pmids)

    # second dictionary of biomedical terms
    # load dict mapping terms to PMID sets
    # parse dict to columnlabels and columnmetadata
    print('loading column_dictionary: {0}...'.format(column_dictionary),
          flush=True)
    if column_dictionary == row_dictionary:
        columnterm_pmids = rowterm_pmids
        columnlabels = rowlabels
        columnmeta = rowmeta
    else:
        with open(column_pmids_path, 'rb') as fr:
            columnterm_pmids = pickle.load(fr)
        columnlabels, columnmeta = get_labels_and_metadata(columnterm_pmids)

    # create datamatrix object for storing co-occurrence counts and marginal counts
    print(
        'creating datamatrix object for storing co-occurrence counts and marginal counts...'
    )
    term_term = dataclasses.datamatrix(
        rowname='term_dictidname',
        rowlabels=rowlabels.copy(),
        rowmeta=copy.deepcopy(rowmeta),
        columnname='term_dictidname',
        columnlabels=columnlabels.copy(),
        columnmeta=copy.deepcopy(columnmeta),
        matrixname='literature_cooccurrence_from_termite',
        matrix=np.zeros((rowlabels.size, columnlabels.size), dtype='int64'))
    del rowlabels, rowmeta, columnlabels, columnmeta
    print(term_term)

    # get co-occurrence counts and marginal counts
    print('calculating co-occurrence counts and marginal counts...')
    # row_pmids_intersectionunion[i]: PMIDs mentioning row term i and any column term (union of all of the intersections)
    row_pmids_intersectionunion = defaultdict(set)
    # column_pmids_intersectionunion[j]: PMIDs mentioning column term j and any row term (union of all of the intersections)
    column_pmids_intersectionunion = defaultdict(set)
    # PMIDs mentioning any row term AND any column term ("universe" is limited to publications that have at least one row term association AND at least one column term association)
    all_pmids_intersectionunion = set()
    # PMIDs mentioning any row term OR any column term ("universe" is limited to publications that have at least one row term association OR at least one column term association)
    all_pmids_union = set()
    # *** term_term_union_matrix = np.zeros(term_term.shape, dtype='int64') # the count of PMIDs mentioning row term i OR column term j
    for i, rowlabel in enumerate(term_term.rowlabels):
        if np.mod(i, 100) == 0 or i + 1 == term_term.shape[0]:
            print('working on row {0!s} of {1!s}...'.format(
                i + 1, term_term.shape[0]),
                  flush=True)
        row_pmids = rowterm_pmids[rowlabel]
        for j, columnlabel in enumerate(term_term.columnlabels):
            column_pmids = columnterm_pmids[columnlabel]
            intersection_pmids = row_pmids.intersection(column_pmids)
            # the count of PMIDs mentioning row term i AND column term j
            term_term.matrix[i, j] = len(intersection_pmids)
            #        all_pmids_union = row_pmids.union(column_pmids)
            #        term_term_union_matrix[i,j] = len(all_pmids_union) # the count of PMIDs mentioning row term i OR column term j
            if rowlabel != columnlabel:
                row_pmids_intersectionunion[rowlabel].update(
                    intersection_pmids)
                column_pmids_intersectionunion[columnlabel].update(
                    intersection_pmids)
        all_pmids_union.update(row_pmids)
        all_pmids_intersectionunion.update(
            row_pmids_intersectionunion[rowlabel])
    for column_pmids in columnterm_pmids.values():
        all_pmids_union.update(column_pmids)

    # include marginal counts as metadata
    print('including marginal counts as datamatrix metadata...')
    #     relevant universe
    term_term.rowmeta['term_count_intersectionunion'] = np.array([
        len(row_pmids_intersectionunion[rowlabel])
        for rowlabel in term_term.rowlabels
    ],
                                                                 dtype='int64')
    term_term.columnmeta['term_count_intersectionunion'] = np.array(
        [
            len(column_pmids_intersectionunion[columnlabel])
            for columnlabel in term_term.columnlabels
        ],
        dtype='int64')
    term_term.rowmeta['all_count_intersectionunion'] = np.full(
        term_term.shape[0], len(all_pmids_intersectionunion), dtype='int64')
    term_term.columnmeta['all_count_intersectionunion'] = np.full(
        term_term.shape[1], len(all_pmids_intersectionunion), dtype='int64')
    #     whole universe
    term_term.rowmeta['term_count_union'] = np.array(
        [len(rowterm_pmids[rowlabel]) for rowlabel in term_term.rowlabels],
        dtype='int64')
    term_term.columnmeta['term_count_union'] = np.array([
        len(columnterm_pmids[columnlabel])
        for columnlabel in term_term.columnlabels
    ],
                                                        dtype='int64')
    term_term.rowmeta['all_count_union'] = np.full(term_term.shape[0],
                                                   len(all_pmids_union),
                                                   dtype='int64')
    term_term.columnmeta['all_count_union'] = np.full(term_term.shape[1],
                                                      len(all_pmids_union),
                                                      dtype='int64')

    # *** no need to calculate term_term_union_matrix
    #     if this is wanted as the universe size:
    #        start with universe size = all_count_intersectionunion
    #        calculate true positives, true negatives, false positives, false negatives
    #        subtract true negatives from universe size and set true negatives to zero

    # save results
    print('saving results...')
    datasetIO.save_datamatrix(
        '{0}_{1}_datamatrix_pmidcounts_year_{2}_datestamp_{3}_minscore_{4!s}.txt.gz'
        .format(row_dictionary, column_dictionary, year, datestamp,
                min_score), term_term)
    datasetIO.save_datamatrix(
        '{0}_{1}_datamatrix_pmidcounts_year_{2}_datestamp_{3}_minscore_{4!s}.pickle'
        .format(row_dictionary, column_dictionary, year, datestamp,
                min_score), term_term)

    print('done count_term-term_pmids_from_termite.py', flush=True)
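
The saved counts are sufficient for downstream enrichment tests. A minimal sketch (not part of this script) of assembling one term pair's 2x2 contingency table from the stored marginals, using the whole-universe counts and scipy's Fisher's exact test:

import numpy as np
from scipy import stats

def cooccurrence_pvalue(k_both, k_row, k_col, n_universe):
    # k_both: PMIDs mentioning both terms; k_row, k_col: marginal counts
    # for each term alone; n_universe: e.g. all_count_union
    table = np.array([[k_both, k_row - k_both],
                      [k_col - k_both, n_universe - k_row - k_col + k_both]])
    oddsratio, p = stats.fisher_exact(table, alternative='greater')
    return p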