def compare_components_vs_topic_distance():
    """
    Relate the number of SVD components to the average topic-tree distance
    of the posters suggested by the Rocchio scheduler.

    Returns
    -------
    result_df : pd.DataFrame
        One row per (random poster, model) pair with columns
        ['poster_number', 'distance', 'n_components'].
    """
    result = []
    # BUG FIX: the original read `len(poster_vect)` here, but `poster_vect`
    # is assigned later in this function (inside the loop below), which makes
    # the name function-local and raises UnboundLocalError on this line.
    # The poster count is the number of rows in the tf-idf matrix.
    N = tfidf_matrix.shape[0]  # total number of posters
    N_trials = 1000
    n_suggest = 10
    n_posters = np.random.randint(N, size=N_trials)  # random poster indices
    n_components_list = [50, 75, 100, 150, 200, 300, 400, 500]

    # Train one SVD model (one poster-vector matrix) per component count.
    poster_vect_comp = []
    for n_c in n_components_list:
        poster_vect = scc.svd_vectorizer(tfidf_matrix, n_components=n_c)
        poster_vect_comp.append(poster_vect)

    # Evaluate each model on the same set of random posters.
    for n_model in range(len(n_components_list)):
        nbrs_model = scc.build_nearest_neighbors(poster_vect_comp[n_model])
        for n in n_posters:
            poster_idx = n  # randomly selected poster (pre-drawn above)
            # Posters sharing this poster's topic; note that only the first
            # element of `poster_likes` (the poster itself) is actually fed
            # to the scheduler below via the [0:1] slice.
            poster_idx_same_topic = get_poster_same_topic(poster_idx, poster_df, n_posters=5)
            poster_likes = [poster_idx] + poster_idx_same_topic  # list of posters with same topic
            distance, poster_idx_abs = scc.get_schedule_rocchio(nbrs_model, poster_vect_comp[n_model], like_posters=poster_likes[0:1])
            # Drop the first suggestion (the query poster itself), keep n_suggest.
            poster_list = poster_idx_abs.flatten()[1:1 + n_suggest]
            # Mean topic-tree distance between the query poster and each suggestion.
            avg_distance = np.array([compute_node_distance(poster_df.tree.iloc[poster_idx], poster_df.tree.iloc[idx]) for idx in poster_list]).mean()
            result.append([poster_idx] + [avg_distance] + [n_components_list[n_model]])

    result_df = pd.DataFrame(result, columns=['poster_number', 'distance', 'n_components'])

    return result_df
# Example #2 (original scrape marker: "예제 #2") — separator commented out so the file stays parseable
# 0
def compare_components_vs_topic_distance():
    """
    Relate the number of SVD components to the average topic-tree distance
    of the posters suggested by the Rocchio scheduler.

    Returns
    -------
    result_df : pd.DataFrame
        One row per (random poster, model) pair with columns
        ['poster_number', 'distance', 'n_components'].
    """
    result = []
    # BUG FIX: the original computed `len(poster_vect)` here, but
    # `poster_vect` is assigned later in this function body, making the
    # name function-local and raising UnboundLocalError on this line.
    # Derive the poster count from the tf-idf matrix instead.
    N = tfidf_matrix.shape[0]  # total number of posters
    N_trials = 1000
    n_suggest = 10
    n_posters = np.random.randint(N, size=N_trials)  # pre-drawn random indices
    n_components_list = [50, 75, 100, 150, 200, 300, 400, 500]

    # One SVD model (poster-vector matrix) per component count.
    poster_vect_comp = []
    for n_c in n_components_list:
        poster_vect = scc.svd_vectorizer(tfidf_matrix, n_components=n_c)
        poster_vect_comp.append(poster_vect)

    # Evaluate every model on the same random posters.
    for n_model in range(len(n_components_list)):
        nbrs_model = scc.build_nearest_neighbors(poster_vect_comp[n_model])
        for n in n_posters:
            poster_idx = n  # randomly selected poster (pre-random)
            poster_idx_same_topic = get_poster_same_topic(poster_idx,
                                                          poster_df,
                                                          n_posters=5)
            # Built for context; only the first element (the poster itself)
            # is passed to the scheduler via the [0:1] slice below.
            poster_likes = [
                poster_idx
            ] + poster_idx_same_topic  # list of posters with same topic
            distance, poster_idx_abs = scc.get_schedule_rocchio(
                nbrs_model,
                poster_vect_comp[n_model],
                like_posters=poster_likes[0:1])
            # Skip the first suggestion (the query poster itself).
            poster_list = poster_idx_abs.flatten()[1:1 + n_suggest]
            # Mean topic-tree distance from the query to each suggestion.
            avg_distance = np.array([
                compute_node_distance(poster_df.tree.iloc[poster_idx],
                                      poster_df.tree.iloc[idx])
                for idx in poster_list
            ]).mean()
            result.append([poster_idx] + [avg_distance] +
                          [n_components_list[n_model]])

    result_df = pd.DataFrame(
        result, columns=['poster_number', 'distance', 'n_components'])

    return result_df
# Example #3 (original scrape marker: "예제 #3") — separator commented out so the file stays parseable
# 0
    # NOTE(review): this indented span is a truncated fragment from the
    # scrape — its enclosing `def` header is not visible, and the final
    # two lines appear to be the tail of a separate helper (a list->vector
    # averager) fused onto it. Left byte-identical; comments only.
    poster_df = pd.read_pickle('poster_df.pickle')
    abstracts = list(poster_df.abstract)
    # Spark pipeline: preprocess each abstract, then remove stop words.
    abstract_rdd = sc.parallelize(abstracts, numSlices=1000).\
        map(lambda a: scc.preprocess(a, stem=False)).\
        map(remove_stop_words)
    # NOTE(review): despite the name, stemming is disabled here
    # (stem=False) — confirm whether stem=True was intended.
    abstract_stem_rdd = sc.parallelize(abstracts, numSlices=1000).\
        map(lambda a: scc.preprocess(a, stem=False)).\
        collect()

    # average word vectors in abstract
    print(
        'compute abstract vector using word vectors (takes around 40 mins)...')
    vectors_df = pd.read_json('wordvec_df.json')
    # Map each vocabulary key to its vector as a numpy array.
    word2vec_dict = dict(zip(vectors_df.key, vectors_df.vector.map(np.array)))
    # Broadcast the lookup table to Spark workers once.
    word2vec_dict_bc = sc.broadcast(word2vec_dict)
    abstract_vec_wv = np.vstack(
        abstract_rdd.map(lambda x: list2vec(x, word2vec_dict_bc)).collect())

    print('compute abstract vector using LSA...')
    tfidf_matrix = scc.tfidf_vectorizer(
        abstracts_preprocess)  # convert to tf-idf matrix
    abstract_vec_lsa = scc.svd_vectorizer(tfidf_matrix,
                                          n_components=200,
                                          n_iter=150)

    print('save dataframe to pickle file...')
    # NOTE(review): `poster_vect_wv` is not defined anywhere in this
    # fragment — presumably `abstract_vec_wv` was meant; verify upstream.
    poster_vect_multiple = pd.DataFrame(zip(range(len(poster_vect_wv)),
                                            abstract_vec_wv, abstract_vec_lsa),
                                        columns=['number', 'wordvec', 'lsa'])
    poster_vect_multiple.to_pickle('poster_vec_df.pickle')
    # NOTE(review): the two lines below read `word_list`, which is undefined
    # in this scope — they look like the tail of a helper that averages the
    # word vectors of a token list, glued here by the scrape.
    vec = [word2vec_dict_bc.value[a] for a in word_list if a in word2vec_dict_bc.value.keys()]
    return np.array(vec).mean(axis=0)


if __name__ == '__main__':
    # Read dataframe pickle file with an 'abstract' column.
    poster_df = pd.read_pickle('poster_df.pickle')
    abstracts = list(poster_df.abstract)
    # Spark pipeline: preprocess each abstract, then remove stop words.
    abstract_rdd = sc.parallelize(abstracts, numSlices=1000).\
        map(lambda a: scc.preprocess(a, stem=False)).\
        map(remove_stop_words)
    # NOTE(review): despite the name this collects the UN-stemmed output
    # (stem=False) and the result is never used below — confirm whether
    # stem=True was intended.
    abstract_stem_rdd = sc.parallelize(abstracts, numSlices=1000).\
        map(lambda a: scc.preprocess(a, stem=False)).\
        collect()

    # Average word vectors over each abstract's tokens.
    print('compute abstract vector using word vectors (takes around 40 mins)...')
    vectors_df = pd.read_json('wordvec_df.json')
    # Map each vocabulary key to its vector as a numpy array.
    word2vec_dict = dict(zip(vectors_df.key, vectors_df.vector.map(np.array)))
    word2vec_dict_bc = sc.broadcast(word2vec_dict)  # ship lookup table to workers once
    abstract_vec_wv = np.vstack(abstract_rdd.map(lambda x: list2vec(x, word2vec_dict_bc)).collect())

    # LSA vectors: tf-idf followed by truncated SVD.
    print('compute abstract vector using LSA...')
    tfidf_matrix = scc.tfidf_vectorizer(abstracts_preprocess) # convert to tf-idf matrix
    abstract_vec_lsa = scc.svd_vectorizer(tfidf_matrix, n_components=200, n_iter=150)

    print('save dataframe to pickle file...')
    # BUG FIX: the original used `len(poster_vect_wv)`, a name never defined
    # in this script — the word-vector matrix is `abstract_vec_wv`.
    # `list(...)` materializes the zip so pandas receives a concrete sequence.
    poster_vect_multiple = pd.DataFrame(
        list(zip(range(len(abstract_vec_wv)), abstract_vec_wv, abstract_vec_lsa)),
        columns=['number', 'wordvec', 'lsa'])
    poster_vect_multiple.to_pickle('poster_vec_df.pickle')
# Example #5 (original scrape marker: "예제 #5") — separator commented out so the file stays parseable
# 0
# note that we use data provide by SfN, which you can request through the society
# from http://www.sfn.org/

import science_concierge as scc
import pandas as pd
import numpy as np

path_to_file = ''  # add path to poster pickle file
poster_df = pd.read_pickle(path_to_file)
abstracts = list(poster_df.abstract)
# BUG FIX (Python 3): a bare `map(...)` is a one-shot iterator that would
# be empty on any second use; materialize the preprocessed abstracts once.
abstracts_preprocess = [scc.preprocess(abstract) for abstract in abstracts]

# Poster (abstract) vectors: tf-idf followed by truncated SVD, plus a
# nearest-neighbors index over the resulting vectors.
tfidf_matrix = scc.tfidf_vectorizer(abstracts_preprocess)
poster_vect = scc.svd_vectorizer(tfidf_matrix, n_components=200)
nbrs_model = scc.build_nearest_neighbors(poster_vect)

# Keyword vectors: same pipeline with fewer components (smaller vocabulary).
tfidf_matrix_kw = scc.tfidf_vectorizer(poster_df.keywords)
keywords_vect = scc.svd_vectorizer(tfidf_matrix_kw, n_components=30)
nbrs_model_kw = scc.build_nearest_neighbors(keywords_vect)


def compute_node_distance(node_1, node_2):
    """
    Compute distance between two string nodes in format 'F.01.r'

    NOTE(review): this definition is truncated in this scrape — the body
    is cut mid-branch, so only the prefix-comparison setup is visible.
    """
    node_1 = node_1.split('.')  # e.g. 'F.01.r' -> ['F', '01', 'r']
    node_2 = node_2.split('.')
    # Differing first components mean the nodes lie in different subtrees;
    # the handling for that case is missing from this fragment.
    if node_1[0] != node_2[0]:
# experiment skeleton code for output figure in publication
# note that we use data provide by SfN, which you can request through the society
# from http://www.sfn.org/

import science_concierge as scc
import pandas as pd
import numpy as np

path_to_file = '' # add path to poster pickle file
poster_df = pd.read_pickle(path_to_file)
abstracts = list(poster_df.abstract)
# BUG FIX (Python 3): `map(...)` returns a one-shot iterator that would be
# empty on any second use; build a concrete list instead.
abstracts_preprocess = [scc.preprocess(abstract) for abstract in abstracts]

# Poster (abstract) vectors: tf-idf -> truncated SVD -> NN index.
tfidf_matrix = scc.tfidf_vectorizer(abstracts_preprocess)
poster_vect = scc.svd_vectorizer(tfidf_matrix, n_components=200)
nbrs_model = scc.build_nearest_neighbors(poster_vect)

# Keyword vectors use fewer components (smaller vocabulary).
tfidf_matrix_kw = scc.tfidf_vectorizer(poster_df.keywords)
keywords_vect = scc.svd_vectorizer(tfidf_matrix_kw, n_components=30)
nbrs_model_kw = scc.build_nearest_neighbors(keywords_vect)


def compute_node_distance(node_1, node_2):
    """
    Compute distance between two string nodes in format 'F.01.r'

    NOTE(review): this duplicate definition is also truncated in this
    scrape — the body ends mid-branch.
    """
    node_1 = node_1.split('.')  # e.g. 'F.01.r' -> ['F', '01', 'r']
    node_2 = node_2.split('.')
    # Differing first components mean different subtrees; the handling
    # for that case is missing from this fragment.
    if node_1[0] != node_2[0]: