Example #1
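These snippets appear to share one clustering/visualization module, where `util` and `clustering` are project-local helpers. A plausible shared import block (assumed, not recovered from the original source) is:

import random

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib import colors
from sklearn import metrics
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import DictVectorizer

import clustering  # project-local: get_nmf_w_h, get_cluster_assignments, get_nonzero_featurenames
import util        # project-local: is_server, get_output_dir

Example #11 is the tail of a separate image-denoising script and additionally needs cv2, tqdm, and math.tanh.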
def pca_projected_clusters(feature_matrix):
    n_components = 5

    # Off the server, plot the projection computed on a previous server run
    if not util.is_server():
        color_names = colors.cnames

        df = pd.read_csv('{}/pca_projected_clusters.csv'.format(util.get_output_dir()))
        df = df[['x', 'y', 'label']]
        ax = df[df['label'] == 0].plot(kind='scatter', x='x', y='y', c=random.choice(list(color_names.keys())),
                                       label='Phenotype 0')
        for k in range(1, n_components):
            cluster = df[df['label'] == k]
            if cluster.shape[0] > 0:
                cluster.plot(kind='scatter', x='x', y='y', c=random.choice(list(color_names.keys())),
                             label='Phenotype {}'.format(k), ax=ax)
        plt.title('PCA Projected Clusters')
        plt.savefig('{}/pca_projected_clusters.png'.format(util.get_output_dir()))
        plt.clf()
        return

    W, H, _ = clustering.get_nmf_w_h(feature_matrix, n_components=n_components, alpha=5, l1_ratio=1.0)
    labels = [label for eid, label in clustering.get_cluster_assignments(np.zeros(W.shape[0]), W)]
    # TruncatedSVD stands in for PCA here since it works directly on sparse input
    model = TruncatedSVD(n_components=2)
    data_r = model.fit_transform(feature_matrix)
    df = pd.DataFrame({'x': data_r[:, 0], 'y': data_r[:, 1], 'label': labels})
    df.to_csv('{}/pca_projected_clusters.csv'.format(util.get_output_dir()))
Example #2
def hyperparameter_tuning(feature_matrix):
    alphas = [1, 5, 15]
    l1_ratios = [0, .5, 1]
    ks = range(2, 40, 5)
    threshold = 1

    d = {}
    for k in ks:
        for alpha in alphas:
            for l1_ratio in l1_ratios:
                W, H, _ = get_nmf_w_h(feature_matrix, n_components=k, alpha=alpha, l1_ratio=l1_ratio)
                avg_count_above_threshold = np.mean(get_count_features_above_threshold(H, threshold))
                num_labels = len(set(label for _, label in get_cluster_assignments(np.zeros(W.shape[0]), W)))
                d.setdefault('k', []).append(k)
                d.setdefault('alpha', []).append(alpha)
                d.setdefault('l1_ratio', []).append(l1_ratio)
                d.setdefault('num_labels', []).append(num_labels)
                d.setdefault('avg_nonzero_feature_count', []).append(avg_count_above_threshold)
                print('k={0}, alpha={1}, l1_ratio={2}, num_labels={3}, count={4}'.format(
                    k, alpha, l1_ratio, num_labels, avg_count_above_threshold))
    # Output all results
    df = pd.DataFrame(d)
    df.to_csv('{}/hyperparamtuning.csv'.format(util.get_output_dir()))

    # Output the settings that keep the average nonzero feature count in [5, 10]
    df = df[(df['avg_nonzero_feature_count'] >= 5) & (df['avg_nonzero_feature_count'] <= 10)]
    df.to_csv('{}/optimalparams.csv'.format(util.get_output_dir()))
Example #3
def k_vs_silhouette(feature_matrix, distance_metric='euclidean'):

    if not util.is_server():
        df = pd.read_csv('{}/silhouette_avg.csv'.format(util.get_output_dir()))
        df = df[['k', 'silhouette_avg']].set_index('k')
        df.plot(title='K vs Silhouette Average')
        plt.xlabel('K')
        plt.ylabel('Silhouette average')
        plt.show()
        return

    silhouette_avgs = []
    ks = range(2, 40, 4)
    for k in ks:
        W, H, _ = clustering.get_nmf_w_h(feature_matrix, n_components=k)
        labels = [label for eid, label in clustering.get_cluster_assignments(np.zeros(W.shape[0]), W)]
        # Average the silhouette score over three 10,000-row samples
        avgs = []
        for _ in range(3):
            avgs.append(metrics.silhouette_score(feature_matrix, labels,
                                                 metric=distance_metric,
                                                 sample_size=10000))
        silhouette_avgs.append(sum(avgs) / len(avgs))
    df = pd.DataFrame({'k': list(ks), 'silhouette_avg': silhouette_avgs})
    df.to_csv('{}/silhouette_avg.csv'.format(util.get_output_dir()))
Example #4
def vis_feature_bar(feature_matrix):
    # Assumes a scipy sparse matrix: sum(axis=0) yields an np.matrix row vector
    frequency = feature_matrix.sum(axis=0)
    bar_num = frequency.shape[1]
    plt.bar(range(bar_num), frequency.getA().flatten())
    plt.ylabel('Frequency')
    plt.xlabel('Feature#')
    plt.title('Frequency of Features')
    plt.savefig('{}/frequency.png'.format(util.get_output_dir()))
    plt.clf()
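vis_feature_bar relies on the np.matrix-style .getA() that a scipy sparse matrix returns from sum(axis=0); a dense 1-D result would fail on both .shape[1] and .getA(). A minimal hypothetical call:

# Hypothetical usage with a toy scipy sparse matrix
from scipy.sparse import csr_matrix

toy = csr_matrix([[1, 0, 2],
                  [0, 3, 0]])
vis_feature_bar(toy)  # writes <output_dir>/frequency.png with bar heights 1, 3, 2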
Example #5
def vis_info_table(H, feature_names):
    nonzero_feature_names = clustering.get_nonzero_featurenames(H, .1, feature_names)
    feature_names_weight_df = pd.DataFrame(nonzero_feature_names).set_index('featureName')

    def join_coding(rel_path, code_col, field_id, category):
        # Load a coding table, prefix its codes with the field id so they match
        # the featureName index, and join it against the selected features.
        info_df = pd.read_csv('{0}/{1}'.format(util.get_output_dir(), rel_path),
                              delimiter='\t',
                              dtype={code_col: str},
                              usecols=[code_col, 'meaning']).rename(columns={code_col: 'node_id'})
        info_df['node_id'] = field_id + '=' + info_df['node_id'].astype(str)
        info_df['category'] = category
        return feature_names_weight_df\
            .join(info_df.set_index('node_id'), how='inner')\
            .reset_index(drop=True)

    med_info = join_coding('medicationCodes/coding4.tsv', 'coding', '20003', 'med')
    noncan_info = join_coding('nonCancerIllnessCodes/coding6.tsv', 'node_id', '20002', 'noncan')
    can_info = join_coding('cancerCodes/coding3.tsv', 'node_id', '20001', 'cancer')

    frames = [med_info, noncan_info, can_info]
    # Use a context manager so the output file is flushed and closed
    with open('{}/info_table.csv'.format(util.get_output_dir()), 'w+') as f:
        pd.concat(frames).groupby(['label', 'category'])\
            .apply(lambda g: g.loc[:, ['label', 'category', 'meaning', 'weight']]
                   .to_csv(f, index=False))
Example #6
def iteration_vs_error(feature_matrix, n_components):
    iterations = range(10, 300, 10)
    errors = []
    for iteration in iterations:
        _, _, error = clustering.get_nmf_w_h(feature_matrix, n_components, max_iter=iteration)
        errors.append(error)

    d = {'error': errors, 'iteration': iterations}
    df = pd.DataFrame(d)
    df.plot(x='iteration', y='error')
    plt.xlabel('Iteration')
    plt.ylabel('Error')
    plt.savefig('{}/iteration_vs_error.png'.format(util.get_output_dir()))
    plt.clf()
Example #7
def vis_grs(labels):
    grs_df = pd.read_csv('{}/GWAS_Catalog_20170317_50traits_Good_CALCULATED_GRS.txt'.format(util.get_output_dir()),sep='\t')
    cluster_assignment_df = pd.DataFrame.from_records(labels)
    cluster_assignment_df.columns = ['Indiv', 'cluster']
    grs_cluster_df = grs_df.join(cluster_assignment_df.set_index('Indiv'), on = 'Indiv')
    grs_cluster_df = grs_cluster_df[pd.notnull(grs_cluster_df['cluster'])].drop('Indiv',axis = 1)
    grs_cluster_df_group = grs_cluster_df.groupby(["cluster"])
    matrix = grs_cluster_df_group.mean().values  # .values instead of the deprecated .as_matrix()
    plt.imshow(matrix, interpolation='nearest', cmap=plt.cm.ocean, aspect = 'auto')
    plt.colorbar()
    plt.ylabel('Phenotype')
    plt.xlabel('Disease')
    plt.title('Average Genetic Risk Score of Each Cluster')
    plt.savefig('{}/grs.png'.format(util.get_output_dir()))
    plt.clf()
Example #8
def nonzero_alpha_beta_plot(k):
    hyperpara_df = pd.read_csv('{}/hyperparamtuning.csv'.format(util.get_output_dir()),
                              dtype={'k':int, 'alpha':float,'beta':int,'count':int},
                              usecols=['k', 'alpha','beta','count'])
    alphas = [.01, .1, 1, 10, 100]
    betas = [1, 10, 100, 1000]
    hyperpara_df = hyperpara_df[hyperpara_df['k']==k]
    alpha_beta_array = []
    for alpha in alphas:
        temp = []
        for beta in betas:
            index = np.logical_and(hyperpara_df['alpha'] == alpha, hyperpara_df['beta'] == beta)
            temp.append(hyperpara_df[index]['count'].values[0])
        alpha_beta_array.append(temp)
    alpha_beta_array = np.asarray(alpha_beta_array)
    group_bar(alpha_beta_array, alphas, betas, 'Alpha-Beta-Count Bar Plot')
Example #9
def group_bar(data, x_ticklabel, legend, category):
    width = 0.15
    fig, ax = plt.subplots(figsize=(15, 10))
    pos = range(data.shape[0])
    # One bar series per column, offset so the groups sit side by side
    for i in range(data.shape[1]):
        position = [p + i * width for p in pos]
        plt.bar(position, data[:, i], width, alpha=0.5)
    ax.set_ylabel("Count")
    ax.set_xticks([p + width * (data.shape[1] - 1) / 2 for p in pos])
    ax.set_xticklabels(x_ticklabel)
    ax.set_title(category)
    if category in ['Medication', 'Cancer', 'NonCancer Illness']:
        ax.set_xlabel("Threshold")
        plt.legend(["cluster" + str(i) for i in legend], loc="upper left")
    else:
        ax.set_xlabel('Alpha')
        plt.legend(["Beta = " + str(i) for i in legend], loc="upper left")
    plt.ylim([0, max(data[0, :])])  # y-axis scaled to the tallest bar in the first row
    plt.grid()
    plt.savefig('{0}/{1}_threshold_bar.png'.format(util.get_output_dir(), category))
    plt.clf()
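group_bar's data layout is easy to get backwards: rows are x-tick groups and columns are legend series (nonzero_alpha_beta_plot above builds it that way). A standalone sketch with made-up counts:

# Hypothetical call: 3 alpha groups x 2 beta series of made-up counts
data = np.array([[12, 7],
                 [9, 4],
                 [5, 2]])
group_bar(data, [0.1, 1, 10], [10, 100], 'Alpha-Beta-Count Bar Plot')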
Example #10
def load_sparse_feature_matrix():
    medication_df = pd.read_csv('{}/MEDICATIONS/part-00000'.format(
        get_output_dir()),
                                dtype={'featureName': str})
    cancer_df = pd.read_csv('{}/CANCERS/part-00000'.format(get_output_dir()),
                            dtype={'featureName': str})
    non_cancer_illness_df = pd.read_csv(
        '{}/NON_CANCER_ILLNESSES/part-00000'.format(get_output_dir()),
        dtype={'featureName': str})
    lifestyle_df = pd.read_csv('{}/LIFESTYLE/part-00000'.format(
        get_output_dir()),
                               dtype={'featureName': str})
    demographic_df = pd.read_csv('{}/DEMOGRAPHIC/part-00000'.format(
        get_output_dir()),
                                 dtype={'featureName': str})
    physical_measure_df = pd.read_csv('{}/PHYSICALMEASURE/part-00000'.format(
        get_output_dir()),
                                      dtype={'featureName': str})

    # physical_measure_df, lifestyle_df, and demographic_df are loaded above
    # but are currently excluded from the combined frame
    df = non_cancer_illness_df \
        .append(cancer_df) \
        .append(medication_df)

    cols = ['featureName', 'featureValue']
    df = df.sort_values(by='eid')
    grouped = df.groupby('eid')
    # One {featureName: featureValue} dict per patient, in eid order
    feature_dicts = [dict(zip(*g[cols].T.values)) for _, g in grouped]

    vec = DictVectorizer()
    sparse_feature_matrix = vec.fit_transform(feature_dicts)

    eids = [group['eid'].unique()[0] for _, group in grouped]

    feature_names = vec.get_feature_names()

    return eids, sparse_feature_matrix, feature_names
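A plausible end-to-end driver (a sketch assuming these functions live in the same package and the server-side branches run first) would wire the loader into the other examples:

# Hypothetical driver
eids, feature_matrix, feature_names = load_sparse_feature_matrix()
pca_projected_clusters(feature_matrix)    # Example #1
hyperparameter_tuning(feature_matrix)     # Example #2
k_vs_silhouette(feature_matrix)           # Example #3
silhouette_plot(eids, feature_matrix)     # Example #12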
Example #11
        # Mean-field update: combine the neighbour term m with the
        # log-likelihood ratio of the observed pixel, then squash with tanh
        lp = likelihood(Y[i, j], 1)
        lm = likelihood(Y[i, j], -1)
        a = m + 0.5 * (lp - lm)
        X[i, j] = tanh(a)

    for _ in tqdm(range(T)):
        for i in range(N):
            for j in range(M):
                optimize(i, j)

    # Threshold the relaxed values back to binary pixel intensities
    X[X < 0] = 0
    X[X > 0] = 255

    img_name = filename.split('/')[-1].split('.')[0]
    cv2.imwrite('../output/denoise_vi_{}.jpg'.format(img_name), X)


if __name__ == '__main__':
    get_output_dir('../output')

    img_files = [
        '../a1/1_noise.txt',
        '../a1/2_noise.txt',
        '../a1/3_noise.txt',
        '../a1/4_noise.txt',
    ]

    for img in img_files:
        gibbs(img, 1, 1, 20)
        vi(img, 1, 1, 20)
Example #12
def silhouette_plot(eids, feature_matrix, distance_metric='euclidean'):
    for k in range(2, 21, 4):
        # Create a subplot with 1 row and 2 columns
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        # The 1st subplot is the silhouette plot
        # The silhouette coefficient can range from -1, 1 but in this example all
        # lie within [-0.1, 1]
        ax1.set_xlim([-0.1, 1])
        # The (n_clusters+1)*10 is for inserting blank space between silhouette
        # plots of individual clusters, to demarcate them clearly.
        ax1.set_ylim([0, feature_matrix.shape[0] + (k + 1) * 10])

        W, H, _ = clustering.get_nmf_w_h(feature_matrix, n_components=k)
        cluster_assignments = clustering.get_cluster_assignments(eids, W)
        cluster_labels = np.array([assignment for eid, assignment in cluster_assignments])

        sample_size = 30000
        indices = np.random.permutation(feature_matrix.shape[0])[:sample_size]
        feature_matrix_sample = feature_matrix[indices]
        cluster_labels_sample = cluster_labels[indices]
        sample_silhouette_values = metrics.silhouette_samples(feature_matrix_sample, cluster_labels_sample, distance_metric)

        silhouette_avg = metrics.silhouette_score(feature_matrix, cluster_labels, distance_metric, sample_size=sample_size)

        #project data onto 2 dimensions for visualization
        model = TruncatedSVD(n_components=2)
        data_r = model.fit_transform(feature_matrix)

        y_lower = 10
        for i in range(k):
            # Aggregate the silhouette scores for samples belonging to
            # cluster i, and sort them
            ith_cluster_silhouette_values = \
                sample_silhouette_values[cluster_labels_sample == i]

            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.nipy_spectral(float(i) / k)  # 'spectral' was renamed 'nipy_spectral' in matplotlib
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the middle
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Compute the new y_lower for next plot
            y_lower = y_upper + 10  # 10 for the 0 samples

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # The vertical line for average silhouette score of all the values
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd Plot showing the actual clusters formed
        colors = cm.nipy_spectral(cluster_labels.astype(float) / k)
        ax2.scatter(data_r[:, 0], data_r[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                    c=colors)

        # # Labeling the clusters
        # centers = H
        # # Draw white circles at cluster centers
        # ax2.scatter(centers[:, 0], centers[:, 1],
        #             marker='o', c="white", alpha=1, s=200)
        #
        # for i, c in enumerate(centers):
        #     ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50)

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")

        plt.suptitle(("Silhouette analysis for NMF clustering "
                      "with n_clusters = %d" % k),
                     fontsize=14, fontweight='bold')

        plt.savefig('{0}/silAnalysis_k={1}.png'.format(util.get_output_dir(), k))
        plt.clf()
Example #13
def percentage_feature_in_cluster_vs_cohort(mode, labels, H, feature_names, pop_size):
    valid_modes = ['medication', 'non-cancer-illness', 'cancer']
    if mode not in valid_modes:
        raise ValueError('mode must be one of {}'.format(', '.join(valid_modes)))

    if not util.is_server():
        df = pd.read_csv('{0}/{1}_percent_ftr_vs_clstr.csv'.format(util.get_output_dir(), mode))
        pre = 'percentFeatureInCluster'
        d1 = df.set_index(['meaning', 'label'])[pre].unstack(fill_value=0).add_prefix(pre).reset_index()
        df = d1.set_index('meaning').join(df[['meaning', 'percentFeatureInPop']].set_index('meaning'), how='inner').drop_duplicates()
        df.plot.bar(title='Percent Feature in Cluster Vs Pop', fontsize=8, rot=90, grid=True, stacked=True, figsize=[20, 20], colormap=plt.cm.ocean)
        plt.xlabel(mode)
        plt.gcf().subplots_adjust(bottom=0.5)
        plt.savefig('{0}/{1}_percent_ftr_vs_clstr.png'.format(util.get_output_dir(), mode), bbox_inches='tight')
        plt.clf()
        return

    medication_df = pd.read_csv('{}/MEDICATIONS/part-00000'.format(util.get_output_dir()), dtype={'featureName': str})
    cancer_df = pd.read_csv('{}/CANCERS/part-00000'.format(util.get_output_dir()), dtype={'featureName': str})
    non_cancer_illness_df = pd.read_csv('{}/NON_CANCER_ILLNESSES/part-00000'.format(util.get_output_dir()), dtype={'featureName': str})

    if mode == 'medication':
        df = medication_df
        info_df = pd.read_csv('{}/medicationCodes/coding4.tsv'.format(util.get_output_dir()),
                              delimiter='\t',
                              dtype={'coding': str},
                              usecols=['coding', 'meaning'])
        join_on = 'coding'
    elif mode == 'non-cancer-illness':
        df = non_cancer_illness_df
        info_df = pd.read_csv('{}/nonCancerIllnessCodes/coding6.tsv'.format(util.get_output_dir()),
                              delimiter='\t',
                              dtype={'node_id': str},
                              usecols=['node_id', 'meaning'])
        join_on = 'node_id'
    elif mode == 'cancer':
        df = cancer_df
        info_df = pd.read_csv('{}/cancerCodes/coding3.tsv'.format(util.get_output_dir()),
                              delimiter='\t',
                              dtype={'node_id': str},
                              usecols=['node_id', 'meaning'])
        join_on = 'node_id'

    nonzero_feature_names = clustering.get_nonzero_featurenames(H, 1, feature_names)
    feature_names_df = pd.DataFrame(nonzero_feature_names).drop('weight', axis=1)


    #join data with cluster labels
    cluster_labels_df = pd.DataFrame.from_records(labels, columns=['eid', 'label']).set_index('eid')
    data = df.join(cluster_labels_df, on='eid', how='inner')

    num_patients_in_cluster_df = pd.DataFrame(data.groupby(by='label')['eid']
                                              .apply(lambda x: len(x.unique()))
                                              .rename('numPatientsInCluster'))

    count_feature_in_pop_df = pd.DataFrame(data.groupby(by='featureName')['eid']
                                           .apply(lambda x: len(x.unique()))
                                           .rename('countFeatureInPop'))

    features_selected_by_H_df = data\
        .set_index(['label', 'featureName'])\
        .join(feature_names_df.set_index(['label', 'featureName']), how='inner')\
        .reset_index()
    feature_count_in_cluster_df = pd.DataFrame(features_selected_by_H_df
                                               .groupby(by=['label', 'featureName'])['eid']
                                               .apply(lambda x: len(x.unique()))
                                               .rename('featureCountInCluster'))

    data = features_selected_by_H_df.set_index(['label', 'featureName']).join(feature_count_in_cluster_df, how='inner').reset_index()
    data = data.set_index('label').join(num_patients_in_cluster_df, how='inner').reset_index()
    data = data.set_index('featureName').join(count_feature_in_pop_df, how='inner').reset_index()

    # Percent of patients in each cluster who have the feature
    data['percentFeatureInCluster'] = data['featureCountInCluster'] / data['numPatientsInCluster'] * 100
    # Percent of the whole cohort who have the feature
    data['percentFeatureInPop'] = data['countFeatureInPop'] / pop_size * 100
    # Attach human-readable meanings via the coding table
    data['coding_or_node_id'] = data['featureName'].apply(lambda name: name.split('=')[1])
    data = data.set_index('coding_or_node_id').join(info_df.set_index(join_on), how='inner').reset_index()
    data = data[['meaning', 'label', 'percentFeatureInCluster', 'percentFeatureInPop']].drop_duplicates()
    data.to_csv('{0}/{1}_percent_ftr_vs_clstr.csv'.format(util.get_output_dir(), mode))