# Build poster-abstract vectors two ways (averaged word2vec and LSA) and
# save them to a pickle file.
# NOTE(review): this chunk was collapsed onto a single physical line in the
# source; it is reformatted here. Two undefined-name bugs are fixed below.

# Read dataframe pickle file with an 'abstract' column.
poster_df = pd.read_pickle('poster_df.pickle')
abstracts = list(poster_df.abstract)

# Tokenized abstracts with stop words removed (word-vector averaging path).
abstract_rdd = sc.parallelize(abstracts, numSlices=1000).\
    map(lambda a: scc.preprocess(a, stem=False)).\
    map(remove_stop_words)

# Preprocessed abstracts collected to the driver (tf-idf / LSA path).
# NOTE(review): the name says '_stem_' but preprocess is called with
# stem=False, same as above — confirm whether stem=True was intended.
abstract_stem_rdd = sc.parallelize(abstracts, numSlices=1000).\
    map(lambda a: scc.preprocess(a, stem=False)).\
    collect()

# average word vectors in abstract
print('compute abstract vector using word vectors (takes around 40 mins)...')
vectors_df = pd.read_json('wordvec_df.json')
word2vec_dict = dict(zip(vectors_df.key, vectors_df.vector.map(np.array)))
word2vec_dict_bc = sc.broadcast(word2vec_dict)  # share dict with executors
abstract_vec_wv = np.vstack(
    abstract_rdd.map(lambda x: list2vec(x, word2vec_dict_bc)).collect())

print('compute abstract vector using LSA...')
# FIX: original referenced undefined 'abstracts_preprocess'; the collected
# preprocessed list above is bound to 'abstract_stem_rdd'.
tfidf_matrix = scc.tfidf_vectorizer(abstract_stem_rdd)  # convert to tf-idf matrix
abstract_vec_lsa = scc.svd_vectorizer(tfidf_matrix, n_components=200,
                                      n_iter=150)

print('save dataframe to pickle file...')
# FIX: original referenced undefined 'poster_vect_wv'; the word-vector
# matrix computed above is 'abstract_vec_wv'.
poster_vect_multiple = pd.DataFrame(
    zip(range(len(abstract_vec_wv)), abstract_vec_wv, abstract_vec_lsa),
    columns=['number', 'wordvec', 'lsa'])
poster_vect_multiple.to_pickle('poster_vec_df.pickle')
# experiment skeleton code for output figure in publication
# note that we use data provided by SfN, which you can request through the
# society from http://www.sfn.org/
import science_concierge as scc
import pandas as pd
import numpy as np

path_to_file = ''  # add path to poster pickle file
poster_df = pd.read_pickle(path_to_file)
abstracts = list(poster_df.abstract)
# FIX: wrap in list() so the preprocessed abstracts are a reusable sequence
# under Python 3, where map() returns a one-shot iterator.
abstracts_preprocess = list(map(lambda abstract: scc.preprocess(abstract),
                                abstracts))

# poster vector or abstract vector: tf-idf followed by truncated SVD (LSA),
# then a nearest-neighbors index over the resulting vectors.
tfidf_matrix = scc.tfidf_vectorizer(abstracts_preprocess)
poster_vect = scc.svd_vectorizer(tfidf_matrix, n_components=200)
nbrs_model = scc.build_nearest_neighbors(poster_vect)

# keywords vector: same pipeline over the posters' keyword strings.
tfidf_matrix_kw = scc.tfidf_vectorizer(poster_df.keywords)
keywords_vect = scc.svd_vectorizer(tfidf_matrix_kw, n_components=30)
nbrs_model_kw = scc.build_nearest_neighbors(keywords_vect)


def compute_node_distance(node_1, node_2):
    """Compute distance between two string nodes in format 'F.01.r'"""
    node_1 = node_1.split('.')
    node_2 = node_2.split('.')
    # NOTE(review): the function body is truncated at this point in the
    # visible chunk; the remainder lies outside this view.
# NOTE(review): this chunk opens mid-function — the 'def' line of list2vec
# lies outside the visible chunk. The signature below is reconstructed from
# the call site list2vec(x, word2vec_dict_bc) and the body's variable names;
# confirm against the full file.
def list2vec(word_list, word2vec_dict_bc):
    """Average the broadcast word vectors of the known words in word_list.

    word_list : list of tokens; tokens missing from the vocabulary are
        skipped.
    word2vec_dict_bc : broadcast whose .value maps token -> numpy vector.
    Returns the element-wise mean vector of the matched tokens.
    """
    # FIX idiom: test membership on the dict directly instead of .keys(),
    # and hoist the broadcast .value lookup out of the comprehension.
    vocab = word2vec_dict_bc.value
    vec = [vocab[a] for a in word_list if a in vocab]
    return np.array(vec).mean(axis=0)


if __name__ == '__main__':
    # read dataframe pickle file with 'abstract' column
    poster_df = pd.read_pickle('poster_df.pickle')
    abstracts = list(poster_df.abstract)

    # tokenized abstracts with stop words removed (word-vector path)
    abstract_rdd = sc.parallelize(abstracts, numSlices=1000).\
        map(lambda a: scc.preprocess(a, stem=False)).\
        map(remove_stop_words)
    # preprocessed abstracts collected to the driver (tf-idf / LSA path)
    # NOTE(review): named '_stem_' but calls preprocess with stem=False —
    # confirm whether stem=True was intended.
    abstract_stem_rdd = sc.parallelize(abstracts, numSlices=1000).\
        map(lambda a: scc.preprocess(a, stem=False)).\
        collect()

    # average word vectors in abstract
    print('compute abstract vector using word vectors (takes around 40 mins)...')
    vectors_df = pd.read_json('wordvec_df.json')
    word2vec_dict = dict(zip(vectors_df.key, vectors_df.vector.map(np.array)))
    word2vec_dict_bc = sc.broadcast(word2vec_dict)  # share dict with executors
    abstract_vec_wv = np.vstack(
        abstract_rdd.map(lambda x: list2vec(x, word2vec_dict_bc)).collect())

    print('compute abstract vector using LSA...')
    # FIX: original referenced undefined 'abstracts_preprocess'; the collected
    # preprocessed list above is bound to 'abstract_stem_rdd'.
    tfidf_matrix = scc.tfidf_vectorizer(abstract_stem_rdd)  # convert to tf-idf matrix
    abstract_vec_lsa = scc.svd_vectorizer(tfidf_matrix, n_components=200,
                                          n_iter=150)

    print('save dataframe to pickle file...')
    # FIX: original referenced undefined 'poster_vect_wv'; the word-vector
    # matrix computed above is 'abstract_vec_wv'.
    poster_vect_multiple = pd.DataFrame(
        zip(range(len(abstract_vec_wv)), abstract_vec_wv, abstract_vec_lsa),
        columns=['number', 'wordvec', 'lsa'])
    poster_vect_multiple.to_pickle('poster_vec_df.pickle')
# experiment skeleton code for output figure in publication
# note that we use data provided by SfN, which you can request through the
# society from http://www.sfn.org/
import science_concierge as scc
import pandas as pd
import numpy as np

path_to_file = ''  # add path to poster pickle file
poster_df = pd.read_pickle(path_to_file)
abstracts = list(poster_df.abstract)
# FIX: wrap in list() so the preprocessed abstracts are a reusable sequence
# under Python 3, where map() returns a one-shot iterator.
abstracts_preprocess = list(map(lambda abstract: scc.preprocess(abstract),
                                abstracts))

# poster vector or abstract vector: tf-idf followed by truncated SVD (LSA),
# then a nearest-neighbors index over the resulting vectors.
tfidf_matrix = scc.tfidf_vectorizer(abstracts_preprocess)
poster_vect = scc.svd_vectorizer(tfidf_matrix, n_components=200)
nbrs_model = scc.build_nearest_neighbors(poster_vect)

# keywords vector: same pipeline over the posters' keyword strings.
tfidf_matrix_kw = scc.tfidf_vectorizer(poster_df.keywords)
keywords_vect = scc.svd_vectorizer(tfidf_matrix_kw, n_components=30)
nbrs_model_kw = scc.build_nearest_neighbors(keywords_vect)


def compute_node_distance(node_1, node_2):
    """Compute distance between two string nodes in format 'F.01.r'"""
    node_1 = node_1.split('.')
    node_2 = node_2.split('.')
    if node_1[0] != node_2[0]:
        # NOTE(review): SOURCE is truncated here — the branch body and the
        # rest of the function lie outside this chunk; 'pass' is a
        # placeholder only, not the original logic.
        pass