Exemplo n.º 1
0
def list2vec(word_list, word2vec_dict_bc):
    """Average the word vectors of the words in ``word_list``.

    ``word2vec_dict_bc`` is any object whose ``value`` attribute is a dict
    mapping a word to its vector (the ``__main__`` section passes the result
    of ``sc.broadcast``). Words missing from that dict are ignored.

    Returns the element-wise mean of the matched vectors as a numpy array.
    If no word matches, an all-NaN array with the shape of one stored vector
    is returned so that callers can still stack the results; if the dict
    itself is empty, a scalar NaN is returned.
    """
    # Dereference the wrapper once instead of once (or twice) per word.
    word2vec = word2vec_dict_bc.value
    vec = [word2vec[a] for a in word_list if a in word2vec]
    if not vec:
        # The mean of an empty array emits a RuntimeWarning and yields a
        # scalar NaN, which np.vstack cannot combine with real vectors.
        sample = next(iter(word2vec.values()), None)
        if sample is None:
            return np.float64('nan')
        return np.full(np.shape(sample), np.nan)
    return np.array(vec).mean(axis=0)


if __name__ == '__main__':
    # Load the pickled poster dataframe and pull out its 'abstract' column.
    poster_df = pd.read_pickle('poster_df.pickle')
    abstracts = list(poster_df.abstract)

    # Preprocess every abstract without stemming, then pass the result
    # through remove_stop_words; this stays distributed for the averaging
    # step further down.
    abstract_rdd = (
        sc.parallelize(abstracts, numSlices=1000)
        .map(lambda text: scc.preprocess(text, stem=False))
        .map(remove_stop_words)
    )
    # Same preprocessing call (also stem=False), collected immediately.
    abstract_stem_rdd = (
        sc.parallelize(abstracts, numSlices=1000)
        .map(lambda text: scc.preprocess(text, stem=False))
        .collect()
    )

    # Represent each abstract by the mean of its word vectors.
    print('compute abstract vector using word vectors (takes around 40 mins)...')
    vectors_df = pd.read_json('wordvec_df.json')
    word2vec_dict = {
        key: vector
        for key, vector in zip(vectors_df.key, vectors_df.vector.map(np.array))
    }
    # Broadcast the lookup so the mapped function reads it via ``.value``.
    word2vec_dict_bc = sc.broadcast(word2vec_dict)
    abstract_vec_wv = np.vstack(
        abstract_rdd
        .map(lambda words: list2vec(words, word2vec_dict_bc))
        .collect()
    )

    print('compute abstract vector using LSA...')
Exemplo n.º 2
0
# experiment skeleton code for output figure in publication
# note that we use data provided by SfN, which you can request through the society
# from http://www.sfn.org/

import science_concierge as scc
import pandas as pd
import numpy as np

path_to_file = ''  # add path to poster pickle file
poster_df = pd.read_pickle(path_to_file)
abstracts = list(poster_df.abstract)
# Materialize the preprocessed abstracts as a list: on Python 3 ``map``
# returns a one-shot iterator, so a second consumer of this module-level
# name would silently see no data.
abstracts_preprocess = list(map(scc.preprocess, abstracts))

# poster vector or abstract vector:
# tf-idf matrix -> SVD with 200 components -> nearest-neighbors model
tfidf_matrix = scc.tfidf_vectorizer(abstracts_preprocess)
poster_vect = scc.svd_vectorizer(tfidf_matrix, n_components=200)
nbrs_model = scc.build_nearest_neighbors(poster_vect)

# keywords vector: same pipeline on the 'keywords' column, 30 components
tfidf_matrix_kw = scc.tfidf_vectorizer(poster_df.keywords)
keywords_vect = scc.svd_vectorizer(tfidf_matrix_kw, n_components=30)
nbrs_model_kw = scc.build_nearest_neighbors(keywords_vect)


def compute_node_distance(node_1, node_2):
    """
    Compute distance between two string nodes in format 'F.01.r'

    NOTE(review): the body below does not compute a distance. After
    splitting both node strings it tokenizes ``abstract`` and returns the
    tokens that are not in ``stops``. ``abstract``, ``w_tokenizer`` and
    ``stops`` are neither parameters of this function nor defined in the
    visible source. This looks like the tail of a stop-word removal helper
    spliced onto this function's header -- confirm against the original file
    before relying on it.
    """
    # Split each dotted node label into its parts (unused by the lines below).
    node_1 = node_1.split('.')
    node_2 = node_2.split('.')
    # NOTE(review): `abstract` is not in scope here unless a global of that
    # name exists elsewhere -- TODO confirm.
    words = w_tokenizer.tokenize(abstract)
    return [w for w in words if w not in stops]


def list2vec(word_list, word2vec_dict_bc):
    """Average the word vectors of the words in ``word_list``.

    ``word2vec_dict_bc`` is any object whose ``value`` attribute is a dict
    mapping a word to its vector (the ``__main__`` section passes the result
    of ``sc.broadcast``). Words missing from that dict are ignored.

    Returns the element-wise mean of the matched vectors as a numpy array.
    If no word matches, an all-NaN array with the shape of one stored vector
    is returned so that callers can still stack the results; if the dict
    itself is empty, a scalar NaN is returned.
    """
    # Dereference the wrapper once instead of once (or twice) per word.
    word2vec = word2vec_dict_bc.value
    vec = [word2vec[a] for a in word_list if a in word2vec]
    if not vec:
        # The mean of an empty array emits a RuntimeWarning and yields a
        # scalar NaN, which np.vstack cannot combine with real vectors.
        sample = next(iter(word2vec.values()), None)
        if sample is None:
            return np.float64('nan')
        return np.full(np.shape(sample), np.nan)
    return np.array(vec).mean(axis=0)


if __name__ == '__main__':
    # Load the pickled poster dataframe and pull out its 'abstract' column.
    poster_df = pd.read_pickle('poster_df.pickle')
    abstracts = list(poster_df.abstract)

    # Preprocess every abstract without stemming, then pass the result
    # through remove_stop_words; this stays distributed for the averaging
    # step further down.
    abstract_rdd = (
        sc.parallelize(abstracts, numSlices=1000)
        .map(lambda text: scc.preprocess(text, stem=False))
        .map(remove_stop_words)
    )
    # Same preprocessing call (also stem=False), collected immediately.
    abstract_stem_rdd = (
        sc.parallelize(abstracts, numSlices=1000)
        .map(lambda text: scc.preprocess(text, stem=False))
        .collect()
    )

    # Represent each abstract by the mean of its word vectors.
    print('compute abstract vector using word vectors (takes around 40 mins)...')
    vectors_df = pd.read_json('wordvec_df.json')
    word2vec_dict = {
        key: vector
        for key, vector in zip(vectors_df.key, vectors_df.vector.map(np.array))
    }
    # Broadcast the lookup so the mapped function reads it via ``.value``.
    word2vec_dict_bc = sc.broadcast(word2vec_dict)
    abstract_vec_wv = np.vstack(
        abstract_rdd
        .map(lambda words: list2vec(words, word2vec_dict_bc))
        .collect()
    )

    print('compute abstract vector using LSA...')
    # NOTE(review): `abstracts_preprocess` is not assigned in this section;
    # it has to exist at module level already -- TODO confirm.
    # Convert to a tf-idf matrix, then reduce it with SVD.
    tfidf_matrix = scc.tfidf_vectorizer(abstracts_preprocess)
    abstract_vec_lsa = scc.svd_vectorizer(
        tfidf_matrix,
        n_components=200,
        n_iter=150,
    )
# experiment skeleton code for output figure in publication
# note that we use data provided by SfN, which you can request through the society
# from http://www.sfn.org/

import science_concierge as scc
import pandas as pd
import numpy as np

path_to_file = ''  # add path to poster pickle file
poster_df = pd.read_pickle(path_to_file)
abstracts = list(poster_df.abstract)
# Materialize the preprocessed abstracts as a list: on Python 3 ``map``
# returns a one-shot iterator, so a second consumer of this module-level
# name would silently see no data.
abstracts_preprocess = list(map(scc.preprocess, abstracts))

# poster vector or abstract vector:
# tf-idf matrix -> SVD with 200 components -> nearest-neighbors model
tfidf_matrix = scc.tfidf_vectorizer(abstracts_preprocess)
poster_vect = scc.svd_vectorizer(tfidf_matrix, n_components=200)
nbrs_model = scc.build_nearest_neighbors(poster_vect)

# keywords vector: same pipeline on the 'keywords' column, 30 components
tfidf_matrix_kw = scc.tfidf_vectorizer(poster_df.keywords)
keywords_vect = scc.svd_vectorizer(tfidf_matrix_kw, n_components=30)
nbrs_model_kw = scc.build_nearest_neighbors(keywords_vect)


def compute_node_distance(node_1, node_2):
    """
    Compute distance between two string nodes in format 'F.01.r'
    """
    node_1 = node_1.split('.')
    node_2 = node_2.split('.')
    if node_1[0] != node_2[0]: