def cluster_embeddings(Convo1_Embeddings, Convo2_Embeddings, df_labels1,
                       df_labels2, common_speaker):
    """Cluster the utterance embeddings of conversation 2 into two groups.

    The conversation-2 embeddings are standardised, projected onto two
    principal components, and split into two clusters with agglomerative
    clustering.  The conversation-1 clustering path is disabled, so the
    first returned frame is empty; the parameters are kept for interface
    compatibility.

    Args:
        Convo1_Embeddings: torch tensor of embeddings for conversation 1
            (currently unused — its clustering path is disabled).
        Convo2_Embeddings: torch tensor (n_utterances x dim) of embeddings
            for conversation 2.
        df_labels1: DataFrame of per-utterance speaker labels for
            conversation 1, indexed by speaker name (currently unused).
        df_labels2: DataFrame of per-utterance speaker labels for
            conversation 2, indexed by speaker name.
        common_speaker: index label of the speaker present in both
            conversations.

    Returns:
        (df_conv_1, df_conv_2): ``df_conv_1`` is an empty DataFrame;
        ``df_conv_2`` has one row per utterance with columns
        ``['cluster', common_speaker, <second speaker label>]``.
    """
    # Standardise, reduce to 2 PCA components, then assign each utterance
    # a 0/1 cluster label with agglomerative clustering.
    scaled_embeddings = StandardScaler().fit_transform(
        Convo2_Embeddings.cpu().numpy())
    principal_components = PCA(n_components=2).fit_transform(
        scaled_embeddings)
    cluster_labels_2 = AgglomerativeClustering(
        n_clusters=2).fit_predict(principal_components)
    print(cluster_labels_2)

    # Per-utterance label rows for the two speakers of conversation 2.
    common_speaker_2 = df_labels2.loc[common_speaker, :].values.tolist()
    # The "other" speaker is whichever index label does not contain the
    # common speaker's name.
    second_speaker_2_label = [
        x for x in list(df_labels2.index.values) if common_speaker not in x
    ][0]
    second_speaker_2 = df_labels2.loc[
        second_speaker_2_label, :].values.tolist()

    # Conversation 1 is not clustered; return an empty frame to preserve
    # the two-frame return contract.
    df_conv_1 = pd.DataFrame()
    df_conv_2 = pd.DataFrame(
        list(zip(cluster_labels_2, common_speaker_2, second_speaker_2)),
        columns=['cluster', common_speaker, second_speaker_2_label])
    return df_conv_1, df_conv_2
def run_dimensionality_reduction(
    feature_extractor,
    dataset_types,
    reduction_model,
    already_fit=False,
):
    """Extract features, scale them, and project them with a reducer.

    Features (and labels, where the dataset has them) are pulled for every
    requested dataset type, concatenated, standardised, and passed through
    ``reduction_model``.

    Args:
        feature_extractor: extractor handed to ``FeatureDatasets`` to
            produce per-dataset feature tensors.
        dataset_types: a single ``DatasetType`` or a list of them.
            ``DatasetType.Competition`` contributes features only (it is
            unlabelled).
        reduction_model: a fit/transform-style reducer (e.g. sklearn PCA).
        already_fit: if True, only ``transform`` is called on the reducer;
            otherwise ``fit_transform`` is run.

    Returns:
        DataFrame with one ``pcN`` column per reduced component plus a
        ``target`` column (empty when no labelled data was requested).
    """
    print("Getting features")
    feature_datasets = FeatureDatasets(feature_extractor)
    xs = []
    ys = []
    # Accept a bare DatasetType as well as a list of them.
    if isinstance(dataset_types, DatasetType):
        dataset_types = [dataset_types]
    for dataset_type in dataset_types:
        if dataset_type == DatasetType.Competition:
            # Competition data is unlabelled: features only.
            new_xs, new_ys = feature_datasets.get_features(dataset_type), None
        else:
            new_xs, new_ys = feature_datasets.get_features_and_labels(
                dataset_type)
        xs.append(new_xs)
        if new_ys is not None:
            ys.append(new_ys)
    xs = torch.cat(xs)
    if ys:
        ys = torch.cat(ys)

    print("Scaling data")
    xs = StandardScaler().fit_transform(xs.cpu().detach())

    if already_fit:
        print("Using pre-fit model")
        reduced_features = reduction_model.transform(xs)
    else:
        print("Running " + str(reduction_model))
        reduced_features = reduction_model.fit_transform(xs, ys)
        # Only variance-based reducers (e.g. PCA) expose this attribute;
        # guarding avoids an AttributeError for reducers like t-SNE.
        if hasattr(reduction_model, "explained_variance_ratio_"):
            print(reduction_model.explained_variance_ratio_)

    print("Aggregating principal components")
    # One column per component rather than assuming exactly two.
    component_columns = [
        "pc" + str(i + 1) for i in range(reduced_features.shape[1])
    ]
    principal_df = pd.DataFrame(data=reduced_features,
                                columns=component_columns)
    target_df = pd.DataFrame(data=ys, columns=["target"])
    final_df = pd.concat([principal_df, target_df], axis=1)
    return final_df