def evaluate(partitioned_pairs_df, models_dict, patterns=None, verbose=False):
    """Fit each model per derivational pattern and score all of its pairs.

    For every (model, pattern) combination: train the model on the
    partition-0 pairs of that pattern, then compute reciprocal-rank scores
    for *all* pairs of the pattern.

    Parameters
    ----------
    partitioned_pairs_df : pandas.DataFrame
        Must contain 'pattern', 'word1', 'word2' and 'partition' columns.
    models_dict : dict
        Maps model name -> model object exposing ``fit(pairs, verbose=...)``.
    patterns : iterable or None
        If given, restrict evaluation to these patterns only.
    verbose : bool
        Forwarded to model fitting and scoring.

    Returns
    -------
    pandas.DataFrame
        One row per pair with columns 'word1', 'word2', 'partition' and a
        score column named after the model.
    """
    if patterns is not None:
        partitioned_pairs_df = partitioned_pairs_df[
            partitioned_pairs_df['pattern'].isin(patterns)]
    for model_name, model in models_dict.items():
        # NOTE(review): dfs is reset for every model while the final concat
        # happens after the loop, so with more than one model only the last
        # model's frames are returned — confirm whether multi-model output
        # was intended to be merged instead.
        dfs = []
        for pattern, pairs_df in partitioned_pairs_df.groupby('pattern'):
            # Fixed: was a Python 2 print statement; the rest of this file
            # uses the print() function.
            print(model_name, pattern)
            # Train only on partition 0 of this pattern's pairs.
            train_pairs = get_word_pairs(filter_pairs(pairs_df, pattern, 0))
            model.fit(train_pairs, verbose=verbose)
            _, target_pos = pattern_pos(pattern)
            # Score every pair of the pattern (train and test partitions).
            scores_test = reciprocal_rank_scores(
                model, get_word_pairs(pairs_df), pos=target_pos,
                verbose=verbose)
            df = pairs_df[['word1', 'word2', 'partition']].copy()
            df.loc[:, model_name] = pd.Series(scores_test, index=df.index)
            dfs.append(df)
    return pd.concat(dfs)
def prediction_features(partitioned_pairs_df, model, patterns=None,
                        verbose=False, pattern_map=None):
    """Compute per-pair prediction features, training one model per superpattern.

    Patterns are grouped into "superpatterns" via ``pattern_map``; the model
    is trained on the partition-0 pairs of each superpattern group and then
    produces features (reciprocal rank, neighbor similarity, vector norm,
    base/derived similarity) for the selected pairs of that group.

    Parameters
    ----------
    partitioned_pairs_df : pandas.DataFrame
        Must contain 'pattern', 'word1', 'word2' and 'partition' columns.
        NOTE: this function mutates it in place by adding a 'superpattern'
        column.
    model : object
        Exposes ``fit(pairs, verbose=...)`` and is consumed by the feature
        helpers (``reciprocal_rank`` etc.).
    patterns : iterable or None
        If given, features are computed only for pairs of these patterns;
        superpattern groups containing none of them are skipped entirely.
    verbose : bool
        Forwarded to model fitting.
    pattern_map : dict or None
        Maps pattern -> superpattern; unmapped patterns are their own
        superpattern. Defaults to no mapping.

    Returns
    -------
    pandas.DataFrame
        The input frame merged with the computed feature columns
        ('avg_neighbors_sim', 'derived_norm', 'base_derived_sim', 'rr').
    """
    # Fixed: mutable default argument ({}) replaced with a None sentinel.
    if pattern_map is None:
        pattern_map = {}

    def map_pattern(p):
        return pattern_map.get(p, p)

    partitioned_pairs_df['superpattern'] = partitioned_pairs_df.apply(
        lambda x: map_pattern(x['pattern']), axis=1)

    records = []
    indices = []
    for superpattern, pairs_df in partitioned_pairs_df.groupby('superpattern'):
        # Training on all patterns of a supergroup
        print('Running on superpattern "%s" with %d pairs' % (superpattern, len(pairs_df)))
        # Skip supergroup if none of the selected patterns is in this group.
        if (patterns is not None) and not (set(pairs_df['pattern']) & set(patterns)):
            print('Skipping this supergroup')
            continue
        pairs_train_df = pairs_df[pairs_df['partition'] == 0]
        print('Training on %d pairs...' % len(pairs_train_df))
        train_pairs = get_word_pairs(pairs_train_df)
        model.fit(train_pairs, verbose=verbose)
        # Filter selected patterns for testing.
        if patterns is not None:
            pairs_filtered_df = pairs_df[pairs_df['pattern'].isin(patterns)]
        else:
            # Fixed: was a NameError when patterns is None (the variable
            # was never assigned before use).
            pairs_filtered_df = pairs_df
        print('Testing on %d pairs...' % len(pairs_filtered_df))
        for i, pair in pairs_filtered_df.iterrows():
            _, target_pos = pattern_pos(pair['pattern'])
            base = pair['word1']
            derived = pair['word2']
            print('\t %s %s' % (pair['word1'], pair['word2']))
            rr = reciprocal_rank(model, base, derived, pos=target_pos)
            ns = neighbors_avg_sim(model, base, pos=target_pos)
            # NOTE(review): both helpers below receive `base`, not `derived`,
            # despite their names — confirm this is intentional.
            vn = derived_vector_norm(model, base)
            bs = base_derived_sim(model, base)
            records.append({'pattern': pair['pattern'], 'word1': base,
                            'word2': derived, 'avg_neighbors_sim': ns,
                            'derived_norm': vn, 'base_derived_sim': bs,
                            'rr': rr})
            indices.append(i)
    # Fixed: DataFrame.append() was removed in pandas 2.0 — build the frame
    # in one shot from the accumulated records instead.
    df = pd.DataFrame(records, index=indices) if records else pd.DataFrame()
    return partitioned_pairs_df.merge(df)