Exemplo n.º 1
0
def evaluate(partitioned_pairs_df, models_dict, patterns=None, verbose=False):
    """Score every model in ``models_dict`` on each pattern group of pairs.

    For each model and each distinct value of the ``pattern`` column, the
    model is re-fitted on the pairs selected by
    ``filter_pairs(pairs_df, pattern, 0)`` (presumably partition 0 is the
    training split -- confirm against ``filter_pairs``) and then scored with
    ``reciprocal_rank_scores`` on all pairs of that pattern.

    Args:
        partitioned_pairs_df: DataFrame with at least the columns
            ``pattern``, ``word1``, ``word2`` and ``partition``.
        models_dict: Mapping of model name to a model object exposing
            ``fit(pairs, verbose=...)``.
        patterns: Optional collection of pattern values; when given, only
            rows whose ``pattern`` is in it are evaluated.
        verbose: Forwarded to ``model.fit`` and ``reciprocal_rank_scores``.

    Returns:
        A DataFrame with ``word1``, ``word2``, ``partition`` and one score
        column per model (named after the model), or ``None`` when
        ``models_dict`` is empty.

    Raises:
        ValueError: Propagated from ``pd.concat`` when no pattern groups
            remain after filtering.
    """
    if patterns is not None:
        partitioned_pairs_df = partitioned_pairs_df[partitioned_pairs_df['pattern'].isin(patterns)]

    results = None
    for model_name, model in models_dict.items():
        dfs = []
        for pattern, pairs_df in partitioned_pairs_df.groupby('pattern'):
            # %-formatting prints the same text on Python 2 and Python 3.
            print('%s %s' % (model_name, pattern))
            train_pairs = get_word_pairs(filter_pairs(pairs_df, pattern, 0))
            model.fit(train_pairs, verbose=verbose)
            _, target_pos = pattern_pos(pattern)
            scores_test = reciprocal_rank_scores(model, get_word_pairs(pairs_df), pos=target_pos, verbose=verbose)
            df = pairs_df[['word1', 'word2', 'partition']].copy()
            df.loc[:, model_name] = pd.Series(scores_test, index=df.index)
            dfs.append(df)

        model_df = pd.concat(dfs)
        if results is None:
            results = model_df
        else:
            # The groupby iteration order is the same for every model, so the
            # rows line up positionally; using .values avoids index alignment
            # problems should the input index contain duplicate labels.
            results[model_name] = model_df[model_name].values

    # Return only after every model has been evaluated (previously the
    # function returned from inside the loop, after the first model).
    return results
Exemplo n.º 2
0
def prediction_features(partitioned_pairs_df, model, patterns=None, verbose=False, pattern_map=None):
    """Compute per-pair prediction features, training once per superpattern.

    Patterns are grouped into "superpatterns" via ``pattern_map`` (patterns
    missing from the map form their own group). For each group the model is
    fitted on the rows with ``partition == 0`` and then, for every tested
    pair, the reciprocal rank, average neighbour similarity, derived-vector
    norm and base/derived similarity are computed.

    Side effect: a ``superpattern`` column is added to
    ``partitioned_pairs_df`` in place.

    Args:
        partitioned_pairs_df: DataFrame with at least the columns
            ``pattern``, ``word1``, ``word2`` and ``partition``.
        model: Model object exposing ``fit(pairs, verbose=...)``.
        patterns: Optional collection of patterns to test on. Groups that
            contain none of them are skipped entirely; when ``None``, every
            pair of every group is tested.
        verbose: Forwarded to ``model.fit``.
        pattern_map: Optional mapping from pattern to superpattern.

    Returns:
        ``partitioned_pairs_df`` merged (pandas default: inner join on the
        shared columns ``pattern``, ``word1``, ``word2``) with the feature
        columns ``avg_neighbors_sim``, ``derived_norm``,
        ``base_derived_sim`` and ``rr``.
    """
    if pattern_map is None:
        pattern_map = {}
    # Build the lookup set once instead of once per group.
    selected = None if patterns is None else set(patterns)

    partitioned_pairs_df['superpattern'] = partitioned_pairs_df['pattern'].map(
        lambda p: pattern_map.get(p, p))

    feature_columns = ['pattern', 'word1', 'word2',
                       'avg_neighbors_sim', 'derived_norm', 'base_derived_sim', 'rr']
    feature_rows = []

    for superpattern, pairs_df in partitioned_pairs_df.groupby('superpattern'):

        # Training on all patterns of a supergroup
        print('Running on superpattern "%s" with %d pairs' % (superpattern, len(pairs_df)))

        # Skip supergroup if none of the selected patterns in this superpattern group
        if selected is not None and not (set(pairs_df['pattern']) & selected):
            print('Skipping this supergroup')
            continue

        pairs_train_df = pairs_df[pairs_df['partition'] == 0]
        print('Training on %d pairs...' % len(pairs_train_df))
        train_pairs = get_word_pairs(pairs_train_df)
        model.fit(train_pairs, verbose=verbose)

        # Filter selected patterns for testing; with no selection the whole
        # group is tested (previously this left the variable undefined or
        # stale from an earlier group).
        if selected is None:
            pairs_filtered_df = pairs_df
        else:
            pairs_filtered_df = pairs_df[pairs_df['pattern'].isin(selected)]

        # Test on selected patterns only
        print('Testing on %d pairs...' % len(pairs_filtered_df))
        for _, pair in pairs_filtered_df.iterrows():
            _, target_pos = pattern_pos(pair['pattern'])
            base = pair['word1']
            derived = pair['word2']
            print('\t %s %s' % (base, derived))
            feature_rows.append({
                'pattern': pair['pattern'],
                'word1': base,
                'word2': derived,
                'avg_neighbors_sim': neighbors_avg_sim(model, base, pos=target_pos),
                'derived_norm': derived_vector_norm(model, base),
                'base_derived_sim': base_derived_sim(model, base),
                'rr': reciprocal_rank(model, base, derived, pos=target_pos),
            })

    # Build the frame once (row-by-row DataFrame.append is quadratic and no
    # longer exists in pandas 2.x). Explicit columns keep the merge keys
    # present even when no pair was tested, so the merge yields an empty
    # frame instead of raising.
    features_df = pd.DataFrame(feature_rows, columns=feature_columns)
    return partitioned_pairs_df.merge(features_df)