def data_process(config):
    # Load the raw splits and build a shared vocabulary over both of them.
    train_data, test_data = util.get_data(config['data_name'])
    vocab2index = util.get_vocab(
        train_data["text"] + test_data["text"], max_size=config["vocab_size"])

    # Encode every example into token ids and expose the tensors PyTorch needs.
    train_data = train_data.map(lambda e: util.encode_sentence(
        e["text"], vocab2index, config))
    train_data.set_format(type='torch', columns=['input_ids', 'label'])
    test_data = test_data.map(lambda e: util.encode_sentence(
        e["text"], vocab2index, config))
    test_data.set_format(type='torch', columns=['input_ids', 'label'])

    train_dl = DataLoader(
        train_data, batch_size=config['batch_size'], shuffle=True)
    valid_dl = DataLoader(test_data, batch_size=config['batch_size'])

    # Build the pretrained embedding matrix and the keyword relatedness matrix.
    pretrained_emb = util.load_glove('glove.6B.300d.txt')
    pretrained_embeddings = util.get_emb_matrix(
        pretrained_emb, vocab2index, emb_size=config['embed_dim'])
    keywords_matrix = [pretrained_emb[k] for k in config["keywords"]]
    related_embeddings = util.create_relatedness_matrix(
        keywords_matrix, pretrained_embeddings)

    print(f'embedding matrix shape: {pretrained_embeddings.shape}')
    print(f'relatedness matrix shape: {related_embeddings.shape}')
    return train_dl, valid_dl, pretrained_embeddings, related_embeddings
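
# Hedged usage sketch for data_process. Only the config keys come from the
# function body above; every value below is an illustrative assumption, and
# the real project config may use a different dataset, sizes, and keywords.
if __name__ == '__main__':
    example_config = {
        'data_name': 'imdb',            # assumed dataset identifier
        'vocab_size': 20000,            # assumed vocabulary cap
        'batch_size': 32,               # assumed batch size
        'embed_dim': 300,               # matches glove.6B.300d.txt
        'keywords': ['good', 'bad'],    # assumed keyword list
    }
    train_dl, valid_dl, emb_matrix, rel_matrix = data_process(example_config)
    batch = next(iter(train_dl))
    print(batch['input_ids'].shape, batch['label'].shape)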
def embedding(self):
    # Build the embedding matrix row by row: GloVe vectors for known words,
    # random uniform vectors for out-of-vocabulary words.
    self.embeddingMatrix = np.zeros((len(self.word2int), self.embeddingDimension))
    self.word2emb = load_glove()
    for word, idx in self.word2int.items():
        if word in self.word2emb:
            self.embeddingMatrix[idx] = self.word2emb[word]
        else:
            tempEmbedding = np.random.uniform(-1.0, 1.0, self.embeddingDimension)
            self.word2emb[word] = tempEmbedding
            self.embeddingMatrix[idx] = tempEmbedding
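
# Hedged sketch of what the load_glove helper used above is assumed to return:
# a dict mapping each word to its embedding vector, parsed from a plain-text
# GloVe file. The project's actual implementation (path, encoding, dimension
# handling) may differ.
def load_glove_sketch(path='glove.6B.300d.txt', dim=300):
    import numpy as np
    word2emb = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            if len(parts) != dim + 1:
                continue  # skip malformed lines
            word2emb[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    return word2emb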
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

# count the occurrences of each word across all cleaned news articles
mergedNews = list(chain.from_iterable(list(map(str.split, cleanedNews))))
wordFreq = dict(Counter(mergedNews).most_common())
print()
print(wordFreq)
print("Total Number of words:", len(wordFreq))

# load the 300d GloVe file for word embeddings
word2emb = load_glove()
print()
print("Total Number of embedding words:", len(word2emb))

# keep words that occur at least `threshold` times or have a GloVe vector,
# and build the word-to-index dictionary
word2int = dict()
threshold = 10
cnt = 0
for word, count in wordFreq.items():
    if count >= threshold or word in word2emb:
        word2int[word] = cnt
        cnt += 1
# words below the threshold and without an embedding will be assigned unk / pad
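
# Hedged sketch of the step described in the comment above: below-threshold
# words without an embedding are typically mapped to special unk / pad tokens.
# The token strings and indices used by the project itself may differ.
UNK_IDX = len(word2int)      # hypothetical index for unknown words
PAD_IDX = len(word2int) + 1  # hypothetical index for padding

def encode_word_sketch(word):
    # known words get their vocabulary index, everything else falls back to unk
    return word2int.get(word, UNK_IDX)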
def prepare_data(self):
    self.restaurants = pd.read_csv('data/geoplaces2.csv', index_col='placeID')

    # Restaurant features
    df_res_accept = pd.read_csv('data/chefmozaccepts.csv', index_col='placeID')
    df_res_cuisine = pd.read_csv('data/chefmozcuisine.csv', index_col='placeID')
    df_res_parking = pd.read_csv('data/chefmozparking.csv', index_col='placeID')
    df_res_hours = pd.read_csv('data/chefmozhours4.csv', index_col='placeID')
    df_res_location = pd.read_csv('data/geoplaces2.csv', index_col='placeID')
    df_res_accept['placeID'] = df_res_accept.index
    df_res_cuisine['placeID'] = df_res_cuisine.index
    df_res_parking['placeID'] = df_res_parking.index
    df_res_hours['placeID'] = df_res_hours.index
    df_res_location['placeID'] = df_res_location.index

    # User features
    df_user_cuisine = pd.read_csv('data/usercuisine.csv', index_col='userID')
    df_user_payment = pd.read_csv('data/userpayment.csv', index_col='userID')
    df_user_profile = pd.read_csv('data/userprofile.csv', index_col='userID')
    df_user_cuisine['userID'] = df_user_cuisine.index
    df_user_payment['userID'] = df_user_payment.index
    df_user_profile['userID'] = df_user_profile.index

    # Pre-process opening hours into a binary categorical feature
    # (closing hour after 21:00)
    late_hours = [
        int(item.split('-')[1].strip(';').split(':')[0]) > 21
        for item in df_res_hours.hours
    ]
    df_res_latehours = pd.DataFrame(index=df_res_hours.index,
                                    data=late_hours,
                                    columns=['late_hours'])
    df_res_latehours['placeID'] = df_res_latehours.index

    # Pre-process cuisines of both restaurants and users: average the GloVe
    # vectors of each cuisine label, then cluster the averages into 4 groups.
    glv_model = load_glove()
    user_cuisine_list = [re.sub("[^a-zA-Z0-9 ]", " ", item.lower()).strip().split(' ')
                         for item in df_user_cuisine['Rcuisine'].values]
    res_cuisine_list = [re.sub("[^a-zA-Z0-9 ]", " ", item.lower()).strip().split(' ')
                        for item in df_res_cuisine['Rcuisine'].values]
    total_list = user_cuisine_list + res_cuisine_list
    vector_list = []
    for item_list in total_list:
        cur_vector = np.zeros([300])
        for item in item_list:
            cur_vector += glv_model[item]
        cur_vector = cur_vector / len(item_list)
        vector_list.append(cur_vector)
    vector_matrix = np.asarray(vector_list)

    model = MiniBatchKMeans(n_clusters=4)
    clusters = model.fit_transform(vector_matrix)
    cluster_idx = np.argmin(clusters, axis=1)  # nearest centroid per cuisine
    df_user_cuisine['cuisine_id'] = cluster_idx[:df_user_cuisine.shape[0]]
    df_res_cuisine['cuisine_id'] = cluster_idx[df_user_cuisine.shape[0]:]

    df_res_location = df_res_location.rename(
        columns={'Rambience': 'Rambiance'})
    res_location_cols = [
        'placeID', 'latitude', 'longitude', 'alcohol', 'smoking_area',
        'dress_code', 'accessibility', 'franchise', 'Rambiance'
    ]
    df_res_info = df_res_location[res_location_cols]

    # Merge all restaurant features into one frame
    dfs = [
        df_res_accept, df_res_cuisine, df_res_latehours, df_res_parking,
        df_res_info
    ]
    df_res = reduce(
        lambda left, right: pd.merge(left, right, on='placeID', how='outer'),
        dfs)
    df_res.set_index('placeID', inplace=True)

    dfs = [df_user_cuisine, df_user_payment, df_user_profile]
    df_user = reduce(
        lambda left, right: pd.merge(left, right, on='userID', how='outer'),
        dfs)
    df_user.set_index('userID', inplace=True)

    # Interaction (rating) data
    df_interaction = pd.read_csv('data/rating_final.csv')
    place_ids = list(
        set(df_interaction['placeID'].tolist()).intersection(
            set(df_res.index)))
    interaction_res = df_res.loc[place_ids]

    # Fill NA values in the restaurant features
    interaction_res.loc[:, 'cuisine_id'] = interaction_res.loc[:, 'cuisine_id'].fillna(
        method='bfill')
    interaction_res.loc[:, 'late_hours'] = interaction_res.loc[:, 'late_hours'].fillna(
        interaction_res['late_hours'].value_counts().idxmax())
    interaction_res.loc[:, 'Rpayment'] = interaction_res.loc[:, 'Rpayment'].fillna('cash')
    interaction_res['placeID'] = interaction_res.index

    user_ids = list(
        set(df_interaction['userID'].tolist()).intersection(
            set(df_user.index)))
    interaction_user = df_user.loc[user_ids, :]
    # Fill NA values in the user features
    interaction_user = interaction_user.fillna(method='bfill')
    interaction_user['userID'] = interaction_user.index

    restaurant_cat_columns = [
        'Rpayment', 'cuisine_id_x', 'late_hours', 'parking_lot', 'alcohol',
        'smoking_area', 'dress_code', 'accessibility', 'franchise', 'Rambiance'
    ]
    user_cat_columns = [
        'cuisine_id_y', 'smoker', 'drink_level', 'dress_preference',
        'ambiance', 'transport', 'marital_status', 'hijos', 'interest',
        'personality', 'religion', 'activity', 'color', 'budget', 'Upayment'
    ]

    # Join interactions with restaurant and user features
    merged = pd.merge(df_interaction, interaction_res, on='placeID', how='left')
    merged = merged.drop_duplicates(keep='first', subset=['placeID', 'userID'])
    merged = pd.merge(merged, interaction_user, on='userID', how='left')
    merged = merged.drop_duplicates(keep='first', subset=['placeID', 'userID'])
    merged = merged.rename(columns={'ambience': 'ambiance'})
    self.data = merged

    self.features = [
        'Rambiance', 'Upayment', 'accessibility', 'activity', 'alcohol',
        'ambiance', 'birth_year', 'budget', 'color', 'cuisine_id_x',
        'cuisine_id_y', 'dress_code', 'dress_preference', 'drink_level',
        'franchise', 'height', 'hijos', 'interest', 'late_hours',
        'marital_status', 'parking_lot', 'personality', 'religion', 'smoker',
        'smoking_area', 'transport', 'weight', 'Rpayment'
    ]
    categorical_features = restaurant_cat_columns + user_cat_columns
    self.categorical_feature_indices = [
        self.features.index(feature_name)
        for feature_name in categorical_features
    ]
    self.encoded_data, self.categorical_names = encode_data(
        merged, self.features, categorical_features)
    # Binary target: did the user rate the place above 1?
    self.target = merged['rating'] > 1

    # One-hot encode the categorical columns (pre-0.22 scikit-learn API)
    self.one_hot_encoder = OneHotEncoder(
        categorical_features=self.categorical_feature_indices)
    self.one_hot_encoder.fit(self.encoded_data)

    self.data_with_ids = self.encoded_data.copy()
    self.data_with_ids['userID'] = merged['userID']
    self.data_with_ids['placeID'] = merged['placeID']
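
# Standalone, hedged sketch of the cuisine-clustering idea used in
# prepare_data above: each cuisine label is represented by the mean of its
# tokens' GloVe vectors, and the averages are grouped with MiniBatchKMeans.
# The function name, the n_clusters default, and the zero-vector fallback are
# illustrative assumptions, not the project's exact choices.
def cluster_cuisines_sketch(cuisine_labels, glv_model, n_clusters=4):
    import numpy as np
    from sklearn.cluster import MiniBatchKMeans
    vectors = []
    for label in cuisine_labels:
        tokens = label.lower().split()
        vecs = [glv_model[t] for t in tokens if t in glv_model]
        vectors.append(np.mean(vecs, axis=0) if vecs else np.zeros(300))
    km = MiniBatchKMeans(n_clusters=n_clusters)
    return km.fit_predict(np.asarray(vectors))  # cluster id per cuisine label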