Пример #1
0
# Load the four tab-separated input tables from the working directory.
user_profiles = pd.read_csv('user_profile.csv', sep='\t')
target_users = pd.read_csv('target_users.csv', sep='\t')
item_profiles = pd.read_csv('item_profile.csv', sep='\t')
interactions = pd.read_csv('interactions.csv', sep='\t')

##FOR RATING ESTIMATION WE BASICALLY WANT TO COMPARE RECOMMENDABLE ITEMS TO
##ITEMS WITH KNOWN RATING

# Rows of item_profiles whose 'active_during_test' flag equals 1.
items_r = item_profiles[item_profiles['active_during_test'] == 1]

# Plain Python list of the target user ids.
users = target_users['user_id'].values.tolist()

# Build the item feature matrix.
# Missing values become 0; zeros in 'country' are then replaced by the string
# 'null' before label-encoding (presumably so the encoder only sees strings --
# confirm against how `le` is configured).
items = item_profiles.fillna(0)
items['country'] = items['country'].replace(0, 'null')
items['country'] = le.fit_transform(items['country'])
to_encode = ['discipline_id', 'industry_id', 'country', 'region']
# NOTE(review): the return value is discarded here, so encode_feature
# presumably adds the encoded columns to `items` in place -- confirm.
util.encode_feature(items, to_encode)
# `axis` is passed by keyword: the positional form was removed in pandas 2.0.
items = items.drop(to_encode, axis=1)
items = items.drop(
    ['title', 'tags', 'latitude', 'longitude', 'created_at',
     'active_during_test', 'id'],
    axis=1,
)
# `.values` yields the same ndarray as DataFrame.as_matrix(), which was
# removed in pandas 1.0.
items = items.values

# Single-column DataFrame holding the 'id' column of item_profiles.
ids_r = pd.DataFrame(item_profiles['id'])
# Map each positional row number (0, 1, 2, ...) of item_profiles to its id.
ids = dict(enumerate(item_profiles['id'].values))
### optimization related stuff ###

# Positional row numbers of items_r after renumbering from zero.
# NOTE(review): because the index is reset first, this is always
# {0, ..., len(items_r) - 1}, not the original item_profiles row labels --
# confirm that this is what downstream code expects.
items_r2 = set(items_r.reset_index(drop=True).index.tolist())

# `axis` is passed by keyword: the positional form was removed in pandas 2.0.
interactions = interactions.drop('created_at', axis=1)

# Sum the remaining columns per (user_id, item_id) pair and flatten the result
# to an array whose columns 0 and 1 are user_id and item_id.
sommati = interactions.groupby(['user_id', 'item_id']).sum().reset_index().values

# (user_id, item_id) -> summed value from column 2 of the flattened result.
# On Python 3 `keys` is a one-shot iterator that is exhausted by the dict
# construction on the next line.
keys = zip(sommati[:, 0], sommati[:, 1])
fast_int = dict(zip(keys, sommati[:, 2]))
# View of all (user_id, item_id) pairs that have at least one interaction row.
fast_interacted = fast_int.keys()
Пример #2
0
# Add one indicator column per common tag, initialised to 0.
for tag in common_tags:
    items['tag' + tag] = np.zeros(items.shape[0])

# Set the indicator to 1 on every row whose 'tags' value contains the tag.
for index, row in items.iterrows():
    print(str(index))  # progress output, one line per row
    for tag in common_tags:
        if tag in row['tags']:
            # `.at` replaces DataFrame.set_value, which was deprecated in
            # pandas 0.21 and removed in pandas 1.0.
            items.at[index, 'tag' + tag] = 1

# Build a flat list of title tokens and count how often each one occurs.
# myf and myf2 are defined outside this section; myf2 must return an iterable
# per title, because the result is flattened one level below.
# `.values` is used instead of Series.ravel(), which is deprecated in
# pandas 2.2; both give the underlying array.
corpus2 = items['title'].values
corpus2 = list(map(myf, corpus2))
corpus2 = list(map(myf2, corpus2))
corpus2 = [item for sublist in corpus2 for item in sublist]
fdist2 = nltk.FreqDist(corpus2)

# Keep the 70 most frequent title tokens. This reads fdist2, the distribution
# built from the titles just above; the previous code queried `fdist`, a name
# not defined in this section, which left fdist2 unused.
common_titles = [token for token, _count in fdist2.most_common(70)]

# Make every title a string so the substring checks further down cannot fail
# on non-string values.
items['title'] = items['title'].astype(str)

# Add one indicator column per common title token, initialised to 0.
for title in common_titles:
    items['title' + title] = np.zeros(items.shape[0])

# Set the indicator to 1 on every row whose 'title' contains the token
# (substring match).
for index, row in items.iterrows():
    print(str(index))  # progress output, one line per row
    for title in common_titles:
        if title in row['title']:
            # `.at` replaces DataFrame.set_value, which was deprecated in
            # pandas 0.21 and removed in pandas 1.0.
            items.at[index, 'title' + title] = 1

# Drop the raw text columns.
# `axis` is passed by keyword: the positional form was removed in pandas 2.0.
items = items.drop(['title', 'tags'], axis=1)
# Encode every remaining column except 'id', then drop the un-encoded
# originals.
# NOTE(review): this assumes encode_feature returns the frame with the encoded
# columns added under new names -- confirm against utils.encode_feature.
to_encode = items.columns.difference(['id'])
items = utils.encode_feature(items, to_encode)
items = items.drop(to_encode, axis=1)