예제 #1
0
def get_most_similar_restricted(limit=120):
    """Print nearest-neighbour ranking statistics for embedded ingredients.

    Ingredients are taken from the ``ingredient`` column of the second frame
    returned by ``import_data()``, in ``value_counts()`` order.  Embeddings
    are looked up with ``retrieve_embeddings`` using the module-level
    ``model``, ranked with ``get_nearest_neighbors``, and scored with
    ``calc_score``.  The score arrays are then restricted to the entries
    selected by ``found_ings`` before the summary numbers are printed.

    Four values are printed, one per line:

    1. the number of entries with ``highest_ranks <= 3`` divided by the
       number of entries whose ``highest_ranks`` value is finite;
    2. the mean of the finite ``highest_ranks`` values;
    3. the mean of the finite ``avg_rankings`` values;
    4. the mean of the finite ``random_avg_rankings`` values.

    Args:
        limit: Number of leading ingredients passed to
            ``print_nearest_neighbors`` and the limit forwarded to
            ``calc_score``.

    Returns:
        None.  Results are written to standard output.
    """
    # Only the ingredient frame is needed; the first frame is discarded.
    _, df_i = import_data()
    counts = df_i['ingredient'].value_counts()
    ings = counts.index.values

    # NOTE: the original kept an alternative here that loaded precomputed
    # values with np.load('word2vec_embeddings.npy') instead of querying
    # the model.
    found_ings, embeddings = retrieve_embeddings(model, ings)
    ranks = get_nearest_neighbors(embeddings)
    print_nearest_neighbors(ings[:limit], found_ings, ranks)
    highest_ranks, avg_rankings, random_avg_rankings = calc_score(
        ranks, limit, print_scores=False, score_path='../model/scores.csv')

    # found_ings is used as an index array into the score arrays; entries
    # that would fall outside the scored range are dropped first.
    # NOTE(review): presumably these are positions into `ings` -- confirm
    # against retrieve_embeddings.
    indices = found_ings[found_ings < highest_ranks.shape[0]]
    highest_ranks = highest_ranks[indices]
    avg_rankings = avg_rankings[indices]
    random_avg_rankings = random_avg_rankings[indices]

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    # The value is computed first so that `.sum()` can never bind to the
    # result of print() when the file is run under Python 3.
    top3_count = (highest_ranks <= 3).sum(dtype=float)
    finite_count = np.isfinite(highest_ranks).sum()
    print(top3_count / finite_count)

    # Non-finite scores are excluded from each mean.
    for scores in (highest_ranks, avg_rankings, random_avg_rankings):
        print(scores[np.isfinite(scores)].mean())
예제 #2
0
def main():
    """Load the rasff records, derive cleaned columns and print size statistics.

    Reads the category mapping from '../rasff/rasff_mapping.csv', loads the
    records with ``rasff.load_df()``, adds the derived columns ``category_``
    and ``chemical_``, matches the cleaned chemical names against ingredient
    counts via ``search_chemicals``, and prints five summary counts.

    NOTE(review): several locals below (``ings``, ``d_c``, ``unknown_chems``,
    ``chemical_counts``, ``category_counts``) are not read in the lines shown
    here -- confirm whether they are consumed elsewhere before removing them.
    NOTE(review): the print statements use Python 2 syntax.
    """
    mapping = pd.read_csv('../rasff/rasff_mapping.csv')
    # Lookup table from the mapping file's `category` column to its `shelf` column.
    d = {a : b for a,b in zip(mapping.category.values, mapping.shelf.values)}
    df_ = rasff.load_df()
    # `category_` is `category` with every value that is a key of `d` replaced
    # by its mapped value.
    df_['category_'] = df_['category'].replace(d)
    # `chemical_` holds clean_chemical() applied to each raw `chemical` value.
    df_['chemical_'] = [clean_chemical(c) for c in df_['chemical'].values]
    # `df` is not used in the lines shown here; only the ingredient frame is read.
    df, df_i = gather_data.import_data()
    # Occurrence count per distinct ingredient, and the distinct ingredient values.
    counts = df_i['ingredient'].value_counts()
    ings = counts.index.values
    chemicals = [i for i in df_['chemical_'].unique() if i] # drop falsy entries (e.g. the empty string)
    # NOTE(review): search_chemicals is defined elsewhere; presumably found_chems
    # is the collection of chemicals it matched in `counts` -- confirm.
    found_chems, d_c = search_chemicals(counts, chemicals)
    # Cleaned chemical names that are not contained in found_chems.
    unknown_chems = [c for c in chemicals if c not in found_chems]

    # Group sizes, sorted ascending and then reversed (largest group first).
    chemical_counts = df_.groupby('chemical_').size().sort_values()[::-1]
    category_counts = df_.groupby('category_').size().sort_values()[::-1]
    
    # One entry per distinct (chemical_, product) combination, largest first.
    pairs = df_.groupby(['chemical_', 'product']).size().sort_values()[::-1]
    print 'Number of entries  :', len(df_)
    print 'Unique entries     :', len(pairs)
    # Counts every distinct `chemical_` value, without the falsy-value filter
    # that was applied when building `chemicals`.
    print 'Unique adulterants :', len(df_['chemical_'].unique())
    print 'Unique products    :', len(df_['product'].unique())
    # Uses the raw `category` column, not the mapped `category_` column.
    print 'Unique categories  :', len(df_['category'].unique())