Example #1
import torch
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


def cluster_embeddings(Convo1_Embeddings, Convo2_Embeddings, df_labels1,
                       df_labels2, common_speaker):
    # Concatenate both conversations' embeddings (computed but unused below).
    Agg_EmbeddingList = torch.cat([Convo1_Embeddings, Convo2_Embeddings],
                                  dim=0).cpu().numpy()

    # Two candidate clusterers; only agglomerative clustering is used below.
    kmeans = KMeans(n_clusters=2, n_init=20, algorithm="elkan")
    ac = AgglomerativeClustering(n_clusters=2)

    # Standardize the conversation-2 embeddings, then project them onto two
    # principal components.
    pca = PCA(n_components=2)
    scaled_embeddings = StandardScaler().fit_transform(
        Convo2_Embeddings.cpu().numpy())
    principalComponents = pca.fit_transform(scaled_embeddings)

    # Cluster the reduced embeddings into two groups, one per speaker.
    cluster_labels_2 = ac.fit_predict(principalComponents)
    print(cluster_labels_2)

    # Ground-truth indicator row for the speaker shared by both conversations.
    common_speaker_2 = df_labels2.loc[common_speaker, :].values.tolist()

    speaker_labels = list(df_labels1.index.values) + list(
        df_labels2.index.values)
    # Find the other (non-common) speaker in conversation 2 and pull their
    # indicator row.
    second_speaker_2_label = [
        x for x in list(df_labels2.index.values) if common_speaker not in x
    ][0]
    second_speaker_2 = df_labels2.loc[
        second_speaker_2_label, :].values.tolist()

    # Only conversation 2 is clustered here; conversation 1 is returned empty.
    df_conv_1 = pd.DataFrame()
    df_conv_2 = pd.DataFrame(
        list(zip(cluster_labels_2, common_speaker_2, second_speaker_2)),
        columns=['cluster', common_speaker, second_speaker_2_label])

    return df_conv_1, df_conv_2
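
A minimal, hypothetical usage sketch: the tensor shapes, speaker names, and
label layout below are illustrative assumptions, not taken from the original
code.

import torch
import pandas as pd

# Four utterance embeddings of dimension 8 per conversation (random stand-ins).
convo1 = torch.randn(4, 8)
convo2 = torch.randn(4, 8)
# One-hot "who spoke each utterance" rows, indexed by speaker name.
labels1 = pd.DataFrame([[1, 0, 1, 0], [0, 1, 0, 1]], index=["alice", "bob"])
labels2 = pd.DataFrame([[1, 1, 0, 0], [0, 0, 1, 1]], index=["alice", "carol"])

df1, df2 = cluster_embeddings(convo1, convo2, labels1, labels2, "alice")
print(df2)  # cluster assignment alongside each speaker's indicator values
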
Example #2

import torch
import pandas as pd
from sklearn.preprocessing import StandardScaler

# FeatureDatasets and DatasetType are project-specific helpers whose import
# path is not shown in this snippet.


def run_dimensionality_reduction(
    feature_extractor,
    dataset_types,
    reduction_model,
    already_fit=False,
):
    print("Getting features")
    feature_datasets = FeatureDatasets(feature_extractor)
    xs = []
    ys = []
    # Accept either a single DatasetType or a list of them.
    if isinstance(dataset_types, DatasetType):
        dataset_types = [dataset_types]
    for dataset_type in dataset_types:
        if dataset_type == DatasetType.Competition:
            # The competition split is unlabelled, so it contributes no ys.
            new_xs, new_ys = feature_datasets.get_features(dataset_type), None
        else:
            new_xs, new_ys = feature_datasets.get_features_and_labels(
                dataset_type)
        xs.append(new_xs)
        if new_ys is not None:
            ys.append(new_ys)
    xs = torch.cat(xs)
    if ys:
        ys = torch.cat(ys)
    print("Scaling data")
    xs = StandardScaler().fit_transform(xs.cpu().detach())

    if already_fit:
        print("Using pre-fit model")
        reduced_features = reduction_model.transform(xs)
    else:
        print("Running " + str(reduction_model))
        # ys is ignored by unsupervised reducers such as PCA.
        reduced_features = reduction_model.fit_transform(xs, ys)
    # Not every reducer exposes explained variance, so guard the report.
    if hasattr(reduction_model, "explained_variance_ratio_"):
        print(reduction_model.explained_variance_ratio_)

    print("Aggregating principal components")
    principal_df = pd.DataFrame(data=reduced_features, columns=["pc1", "pc2"])
    # Move labels off the GPU before building the DataFrame, if needed.
    if torch.is_tensor(ys):
        ys = ys.cpu()
    target_df = pd.DataFrame(data=ys, columns=["target"])
    final_df = pd.concat([principal_df, target_df], axis=1)
    return final_df
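
A hedged call sketch, assuming PCA as the reduction model; the DatasetType
member name (Train) and the origin of feature_extractor are assumptions about
the surrounding project, not part of this snippet.

from sklearn.decomposition import PCA

# The reducer must yield two components to match the "pc1"/"pc2" columns.
pca = PCA(n_components=2)
final_df = run_dimensionality_reduction(
    feature_extractor,  # assumed to be constructed elsewhere in the project
    DatasetType.Train,  # assumed enum member; any labelled split would do
    pca,
)
print(final_df.head())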