# Load the raw, tab-separated datasets from the working directory.
user_profiles = pd.read_csv("user_profile.csv", delimiter='\t')
target_users = pd.read_csv("target_users.csv", delimiter='\t')
item_profiles = pd.read_csv("item_profile.csv", delimiter='\t')
interactions = pd.read_csv('interactions.csv', delimiter="\t")

## FOR RATING ESTIMATION WE BASICALLY WANT TO COMPARE RECOMMENDABLE ITEMS TO
## ITEMS WITH KNOWN RATING
# Items flagged as active during the test period, and the list of target users.
items_r = item_profiles[item_profiles['active_during_test'] == 1]
users = target_users['user_id'].values.tolist()

# Build the item feature matrix.  Missing values become 0; for 'country' the
# 0 placeholder is turned into the string 'null' before label encoding.
items = item_profiles.fillna(0)
items['country'] = items['country'].replace(0, 'null')
# `le` is defined outside this chunk -- presumably a label encoder exposing
# fit_transform(); confirm against the file's setup code.
items['country'] = le.fit_transform(items['country'])

to_encode = ['discipline_id', 'industry_id', 'country', 'region']
# NOTE(review): the return value is discarded, so this only has an effect if
# util.encode_feature mutates `items` in place -- confirm against the helper.
util.encode_feature(items, to_encode)
# `axis` is passed by keyword: the positional form was removed in pandas 2.0.
items = items.drop(to_encode, axis=1)
items = items.drop(
    ['title', 'tags', 'latitude', 'longitude',
     'created_at', 'active_during_test', 'id'],
    axis=1,
)
# DataFrame.as_matrix() was removed in pandas 1.0; `.values` yields the same
# ndarray and is available in every pandas version.
items = items.values

# Map the positional row number of each item to its item id.
ids_r = pd.DataFrame(item_profiles['id'])  # NOTE(review): built from the unfiltered frame, not items_r -- confirm intent
ids = dict(enumerate(item_profiles['id'].values))

### optimization related stuff ###
# NOTE(review): after reset_index() the index is 0..len(items_r)-1, so this set
# holds positions within items_r, not row positions in `items` -- confirm intent.
items_r2 = set(items_r.reset_index().drop('index', axis=1).index.tolist())

# Sum the remaining interaction columns per (user_id, item_id) pair and expose
# the first summed column through a dict keyed by that pair for fast lookup.
interactions = interactions.drop('created_at', axis=1)
# .sum() instead of aggregate(np.sum): same result, without the pandas 2.x
# FutureWarning about callables passed to aggregate.
sommati = interactions.groupby(['user_id', 'item_id']).sum().reset_index().values
keys = zip(sommati[:, 0], sommati[:, 1])
fast_int = dict(zip(keys, sommati[:, 2]))
fast_interacted = fast_int.keys()
def _flag_token_columns(frame, source_column, prefix, tokens):
    """Add one 0/1 indicator column per token to *frame* and return it.

    For every token a float column named ``prefix + token`` is created and
    initialised to zero; it is set to 1 on each row whose *source_column*
    value contains the token (Python ``in`` test).  The row index is printed
    as progress output while rows are processed.
    """
    for token in tokens:
        frame[prefix + token] = np.zeros(frame.shape[0])
    for index, row in frame.iterrows():
        print(str(index))
        cell = row[source_column]
        for token in tokens:
            if token in cell:
                # DataFrame.set_value was removed in pandas 1.0; `.at` is the
                # supported scalar setter.
                frame.at[index, prefix + token] = 1
    return frame


# Indicator columns for the most common tags (`common_tags` is computed
# outside this chunk).
items = _flag_token_columns(items, 'tags', 'tag', common_tags)

# Build a flat list of title tokens; `myf` / `myf2` are defined elsewhere and
# `myf2` is expected to return an iterable of tokens per title.
corpus2 = items['title'].values
corpus2 = list(map(myf, corpus2))
corpus2 = list(map(myf2, corpus2))
corpus2 = [item for sublist in corpus2 for item in sublist]
fdist2 = nltk.FreqDist(corpus2)
# Use the title distribution computed just above; the previous code read
# `fdist` here, which left `fdist2` unused.
common_titles = [token for token, _count in fdist2.most_common(70)]

# Indicator columns for the 70 most common title tokens.
items['title'] = items['title'].astype(str)
items = _flag_token_columns(items, 'title', 'title', common_titles)

# Drop the raw text columns, then pass every remaining column except 'id'
# through utils.encode_feature and drop the originals.
items = items.drop(['title', 'tags'], axis=1)
to_encode = items.columns.difference(['id'])
items = utils.encode_feature(items, to_encode)
items = items.drop(to_encode, axis=1)