def compare_node_distance():
    """Compare average topic-tree ("node") distance of suggested posters.

    Relies on module-level globals (not parameters): ``poster_df``,
    ``poster_vect``, ``keywords_vect``, ``nbrs_model``, ``nbrs_model_kw``,
    plus helpers ``scc``, ``get_poster_same_topic``,
    ``compute_node_distance``.

    ``poster_df`` is expected to have columns:
        - abstract: abstract text of each poster
        - tree: human-curated topic such as 'F.01.r' (a.k.a. node)
        - keywords: string of conference-provided keywords

    Each trial picks one seed poster at random; suggestions produced from
    abstract vectors, keyword vectors, and a purely random draw are scored
    by their node distance to the seed poster.

    Returns a list of rows:
        [seed_idx, mean_dist_abstract, mean_dist_keywords,
         mean_dist_random, n_likes_used]
    """
    result = []
    N = len(poster_df)  # total number of posters
    N_trials = 1000     # number of random trials
    n_suggest = 10      # suggestions scored per method
    n_posters = 5       # max number of liked posters used to predict

    for _ in range(N_trials):
        poster_idx = np.random.randint(N)  # randomly select one seed poster
        poster_idx_same_topic = get_poster_same_topic(
            poster_idx, poster_df, n_posters=n_posters)
        # seed poster plus other posters sharing its topic
        poster_likes = [poster_idx] + poster_idx_same_topic
        for j in range(1, n_posters):
            # abstract-based suggestions; the rocchio distance is unused
            _, poster_idx_abs = scc.get_schedule_rocchio(
                nbrs_model, poster_vect, like_posters=poster_likes[0:j])
            # keyword-based suggestions
            _, poster_idx_kw = scc.get_schedule_rocchio(
                nbrs_model_kw, keywords_vect, like_posters=poster_likes[0:j])
            # random baseline: pick n_suggest posters uniformly at random
            poster_idx_random = np.random.randint(N, size=n_suggest)
            # rows: abstract-based, keyword-based, random suggestions;
            # slot 0 of each rocchio output is the seed itself, so skip it
            poster_list = np.vstack(
                (np.vstack((poster_idx_abs.flatten(),
                            poster_idx_kw.flatten()))[:, 1:1 + n_suggest],
                 poster_idx_random))
            node_distances = [
                [compute_node_distance(poster_df.tree.iloc[poster_idx],
                                       poster_df.tree.iloc[idx])
                 for idx in row]
                for row in poster_list
            ]
            result.append([poster_idx]
                          + list(np.array(node_distances).mean(axis=1))
                          + [j])
    return result
def compare_components_vs_topic_distance():
    """Relate the number of SVD components to mean topic (node) distance.

    Relies on module-level globals (not parameters): ``poster_df``,
    ``poster_vect``, ``tfidf_matrix``, plus helpers ``scc``,
    ``get_poster_same_topic``, ``compute_node_distance``.

    One SVD vectorization and nearest-neighbors model is fit per candidate
    component count; every model is then evaluated on the same pre-drawn
    random seed posters by the average node distance of its suggestions.

    Returns a DataFrame with columns
    ['poster_number', 'distance', 'n_components'].
    """
    result = []
    poster_vect_comp = []
    # BUG FIX: the original assigned to `poster_vect` inside the loop
    # below, which made the name function-local and caused this read to
    # raise UnboundLocalError; the loop now binds a different local name
    # so the module-level `poster_vect` is read here as intended.
    N = len(poster_vect)  # total number of posters
    N_trials = 1000       # number of random seed posters
    n_suggest = 10        # suggestions scored per seed
    # pre-drawn random seed-poster indices, shared across all models
    # (renamed from `n_posters`, which misleadingly suggested a count)
    seed_posters = np.random.randint(N, size=N_trials)
    n_components_list = [50, 75, 100, 150, 200, 300, 400, 500]

    # train one SVD vectorization per candidate component count
    for n_c in n_components_list:
        vect = scc.svd_vectorizer(tfidf_matrix, n_components=n_c)
        poster_vect_comp.append(vect)

    # evaluate each model on the same seed posters
    for n_model in range(len(n_components_list)):
        nbrs_model = scc.build_nearest_neighbors(poster_vect_comp[n_model])
        for poster_idx in seed_posters:
            poster_idx_same_topic = get_poster_same_topic(
                poster_idx, poster_df, n_posters=5)
            # seed poster plus other posters sharing its topic
            poster_likes = [poster_idx] + poster_idx_same_topic
            # only the seed itself (poster_likes[0:1]) is used as a "like";
            # the rocchio distance is unused
            _, poster_idx_abs = scc.get_schedule_rocchio(
                nbrs_model, poster_vect_comp[n_model],
                like_posters=poster_likes[0:1])
            # slot 0 of the rocchio output is the seed itself — skip it
            poster_list = poster_idx_abs.flatten()[1:1 + n_suggest]
            avg_distance = np.array(
                [compute_node_distance(poster_df.tree.iloc[poster_idx],
                                       poster_df.tree.iloc[idx])
                 for idx in poster_list]).mean()
            result.append([poster_idx, avg_distance,
                           n_components_list[n_model]])
    return pd.DataFrame(
        result, columns=['poster_number', 'distance', 'n_components'])
def compare_node_distance():
    """Benchmark suggestion quality via topic-tree ("node") distance.

    Expects module-level globals: ``poster_df`` (columns: abstract, tree,
    keywords), ``poster_vect``, ``keywords_vect``, ``nbrs_model``,
    ``nbrs_model_kw``, plus helpers ``scc``, ``get_poster_same_topic``,
    ``compute_node_distance``.

    Per trial, a random seed poster is drawn; the mean node distance of
    suggestions made from abstracts, from keywords, and from a purely
    random pick are compared against it.

    Returns a list of rows:
        [seed_idx, mean_abs_dist, mean_kw_dist, mean_rand_dist, n_likes]
    """
    rows = []
    total = len(poster_df)   # total number of posters
    trials = 1000            # number of random trials
    top_k = 10               # suggestions scored per method
    max_likes = 5            # upper bound on liked posters supplied

    for _ in range(trials):
        seed = np.random.randint(total)  # random seed poster
        same_topic = get_poster_same_topic(seed, poster_df,
                                           n_posters=max_likes)
        likes = [seed] + same_topic  # seed plus same-topic posters
        for n_likes in range(1, max_likes):
            # abstract-based suggestions (rocchio distance not needed)
            _, abs_idx = scc.get_schedule_rocchio(
                nbrs_model, poster_vect, like_posters=likes[0:n_likes])
            # keyword-based suggestions
            _, kw_idx = scc.get_schedule_rocchio(
                nbrs_model_kw, keywords_vect, like_posters=likes[0:n_likes])
            # random baseline of the same size
            rand_idx = np.random.randint(total, size=top_k)
            # stack the two ranked suggestion rows, dropping slot 0
            # (the seed poster itself), then add the random baseline
            ranked = np.vstack((abs_idx.flatten(),
                                kw_idx.flatten()))[:, 1:1 + top_k]
            candidates = np.vstack((ranked, rand_idx))
            per_method = [
                [compute_node_distance(poster_df.tree.iloc[seed],
                                       poster_df.tree.iloc[i])
                 for i in row]
                for row in candidates
            ]
            rows.append([seed]
                        + list(np.array(per_method).mean(axis=1))
                        + [n_likes])
    return rows
def compare_components_vs_topic_distance():
    """See how the SVD component count affects mean topic (node) distance.

    Relies on module-level globals (not parameters): ``poster_df``,
    ``poster_vect``, ``tfidf_matrix``, plus helpers ``scc``,
    ``get_poster_same_topic``, ``compute_node_distance``.

    Fits one SVD vectorization and nearest-neighbors model per candidate
    component count, then scores every model on the same pre-drawn random
    seed posters using the average node distance of its suggestions.

    Returns a DataFrame with columns
    ['poster_number', 'distance', 'n_components'].
    """
    result = []
    poster_vect_comp = []
    # BUG FIX: the original rebound `poster_vect` in the training loop,
    # turning it into a function-local name, so this read of the global
    # raised UnboundLocalError; the loop now uses a distinct local name.
    N = len(poster_vect)  # total number of posters
    N_trials = 1000       # number of random seed posters
    n_suggest = 10        # suggestions scored per seed
    # pre-drawn random seed-poster indices, shared across all models
    # (renamed from `n_posters`, which misleadingly suggested a count)
    seed_posters = np.random.randint(N, size=N_trials)
    n_components_list = [50, 75, 100, 150, 200, 300, 400, 500]

    # training: one SVD vectorization per candidate component count
    for n_c in n_components_list:
        svd_vect = scc.svd_vectorizer(tfidf_matrix, n_components=n_c)
        poster_vect_comp.append(svd_vect)

    # evaluation: loop through the models over identical seed posters
    for n_model in range(len(n_components_list)):
        nbrs_model = scc.build_nearest_neighbors(poster_vect_comp[n_model])
        for poster_idx in seed_posters:
            poster_idx_same_topic = get_poster_same_topic(
                poster_idx, poster_df, n_posters=5)
            # seed poster plus other posters sharing its topic
            poster_likes = [poster_idx] + poster_idx_same_topic
            # only the seed itself (poster_likes[0:1]) is used as a "like";
            # the rocchio distance is unused
            _, poster_idx_abs = scc.get_schedule_rocchio(
                nbrs_model, poster_vect_comp[n_model],
                like_posters=poster_likes[0:1])
            # slot 0 of the rocchio output is the seed itself — skip it
            poster_list = poster_idx_abs.flatten()[1:1 + n_suggest]
            avg_distance = np.array(
                [compute_node_distance(poster_df.tree.iloc[poster_idx],
                                       poster_df.tree.iloc[idx])
                 for idx in poster_list]).mean()
            result.append([poster_idx, avg_distance,
                           n_components_list[n_model]])
    return pd.DataFrame(
        result, columns=['poster_number', 'distance', 'n_components'])