def compare_components_vs_topic_distance():
    """
    Examine the relationship between the number of SVD components and the
    average topic distance of the suggested posters
    """
    # training to get poster vectors
    result = []
    poster_vect_comp = []
    N = len(poster_df)  # total number of posters
    N_trials = 1000
    n_suggest = 10
    poster_idx_list = np.random.randint(N, size=N_trials)  # pre-select random posters
    n_components_list = [50, 75, 100, 150, 200, 300, 400, 500]
    for n_c in n_components_list:
        poster_vect = scc.svd_vectorizer(tfidf_matrix, n_components=n_c)
        poster_vect_comp.append(poster_vect)

    # loop over the model built for each number of components
    for n_model in range(len(n_components_list)):
        nbrs_model = scc.build_nearest_neighbors(poster_vect_comp[n_model])
        for poster_idx in poster_idx_list:  # same pre-selected posters for every model
            poster_idx_same_topic = get_poster_same_topic(poster_idx,
                                                          poster_df,
                                                          n_posters=5)
            poster_likes = [
                poster_idx
            ] + poster_idx_same_topic  # list of posters with same topic
            # query suggestions using only the selected poster as the "like"
            distance, poster_idx_abs = scc.get_schedule_rocchio(
                nbrs_model,
                poster_vect_comp[n_model],
                like_posters=poster_likes[0:1])
            poster_list = poster_idx_abs.flatten()[1:1 + n_suggest]  # skip the poster itself
            avg_distance = np.array([
                compute_node_distance(poster_df.tree.iloc[poster_idx],
                                      poster_df.tree.iloc[idx])
                for idx in poster_list
            ]).mean()
            result.append([poster_idx, avg_distance, n_components_list[n_model]])

    result_df = pd.DataFrame(
        result, columns=['poster_number', 'distance', 'n_components'])

    return result_df


# note that we use data provided by SfN, which you can request through the society
# from http://www.sfn.org/

import science_concierge as scc
import pandas as pd
import numpy as np

path_to_file = ''  # add path to poster pickle file
poster_df = pd.read_pickle(path_to_file)
abstracts = list(poster_df.abstract)
abstracts_preprocess = [scc.preprocess(abstract) for abstract in abstracts]

# poster vector or abstract vector
tfidf_matrix = scc.tfidf_vectorizer(abstracts_preprocess)
poster_vect = scc.svd_vectorizer(tfidf_matrix, n_components=200)
nbrs_model = scc.build_nearest_neighbors(poster_vect)

# keywords vector
tfidf_matrix_kw = scc.tfidf_vectorizer(poster_df.keywords)
keywords_vect = scc.svd_vectorizer(tfidf_matrix_kw, n_components=30)
nbrs_model_kw = scc.build_nearest_neighbors(keywords_vect)
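
# A small illustrative query (not in the original snippet): rank all posters for
# someone who "liked" poster 0, mirroring how get_schedule_rocchio is called in
# compare_components_vs_topic_distance above (variable names here are illustrative)
distance, poster_idx_abs = scc.get_schedule_rocchio(nbrs_model, poster_vect,
                                                    like_posters=[0])
suggested_posters = poster_idx_abs.flatten()[1:11]  # top 10, skipping poster 0 itself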


def compute_node_distance(node_1, node_2):
    """
    Compute distance between two string nodes in format 'F.01.r'
    """
    node_1 = node_1.split('.')
    node_2 = node_2.split('.')
    # assumed scoring: the distance shrinks as more levels of the 'F.01.r' code match
    if node_1[0] != node_2[0]:
        return 3
    elif node_1[1] != node_2[1]:
        return 2
    elif node_1[2] != node_2[2]:
        return 1
    else:
        return 0
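

# get_poster_same_topic is called by compare_components_vs_topic_distance above
# but is not defined in this snippet; the sketch below is a hypothetical
# reconstruction, assuming poster_df.tree holds topic codes such as 'F.01.r'
def get_poster_same_topic(poster_idx, poster_df, n_posters=5):
    """
    Return the indices of up to n_posters other posters that share the same
    top-level topic code (e.g. the 'F' in 'F.01.r') as poster_idx
    """
    top_levels = poster_df.tree.str.split('.').str[0].values
    candidates = np.where(top_levels == top_levels[poster_idx])[0]
    candidates = candidates[candidates != poster_idx]  # exclude the poster itself
    n_pick = min(n_posters, len(candidates))
    return list(np.random.choice(candidates, size=n_pick, replace=False))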
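

# Example usage (illustrative, not part of the original snippet): compare how the
# number of SVD components affects the average topic distance of the suggestions
result_df = compare_components_vs_topic_distance()
print(result_df.groupby('n_components')['distance'].mean())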