def __init__(self, dataset):
    super(BOWTrainer, self).__init__(dataset, num_classes=None)
    print(datetime.datetime.now(), 'Building vocabulary')
    self.vocabulary = util.build_vocabulary(self.dataset.df_train['words'].values)
    print(datetime.datetime.now(), 'Preparing data')
    # 'ids_string' is a workaround to make *Vectorizer work with a given dictionary,
    # so we have more flexibility when building the dictionary.
    self.dataset.df_train['ids'] = self.dataset.df_train['words'].apply(
        lambda x: util.words2ids(x, self.vocabulary))
    self.dataset.df_train['ids_string'] = self.dataset.df_train['ids'].apply(
        lambda x: ' '.join([str(word).zfill(4) for word in x]))
    self.dataset.df_test['ids'] = self.dataset.df_test['words'].apply(
        lambda x: util.words2ids(x, self.vocabulary))
    self.dataset.df_test['ids_string'] = self.dataset.df_test['ids'].apply(
        lambda x: ' '.join([str(word).zfill(4) for word in x]))
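# The 'ids_string' column above is built so that scikit-learn's *Vectorizer classes can be driven
# by a vocabulary we construct ourselves. A minimal, standalone sketch of the idea, assuming
# scikit-learn's CountVectorizer (the id strings below are made-up examples, not project data):
from sklearn.feature_extraction.text import CountVectorizer

example_ids_strings = ["0001 0002 0002", "0003 0001"]   # zero-padded word ids, one string per document
example_vectorizer = CountVectorizer()                   # the default token pattern keeps 4-digit ids intact
example_bow = example_vectorizer.fit_transform(example_ids_strings)  # counts indexed by our own id space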
def prepare_wordlevel(self, opt_trainer):
    print(datetime.datetime.now(), 'Loading Glove')
    glove_vocabulary = util.load_glove_vocabulary(opt_trainer)
    print(datetime.datetime.now(), 'Building vocabulary')
    self.vocabulary = util.build_vocabulary(self.df_train['words'].values, glove_vocabulary)
    print(datetime.datetime.now(), 'Preparing data')
    self.df_train['ids'] = self.df_train['words'].apply(lambda x: util.words2ids(x, self.vocabulary))
    self.df_test['ids'] = self.df_test['words'].apply(lambda x: util.words2ids(x, self.vocabulary))
    print(datetime.datetime.now(), 'Building embeddings')
    self.embeddings = util.build_embeddings(opt_trainer, self.vocabulary)
    self.status = 'WordLevel'
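# util.words2ids is used above but not shown in this excerpt; a rough sketch of what a helper with
# that shape could look like (UNK_ID is an assumed placeholder for out-of-vocabulary words, not
# necessarily what util.py actually uses):
UNK_ID = 0

def words2ids_sketch(words, vocabulary):
    """Map a list of word strings to integer ids, falling back to UNK_ID for unknown words."""
    return [vocabulary.get(word, UNK_ID) for word in words]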
def __init__(self, batch_size, dynamic_padding=False, preprocessing=False, embedding=True, saved=False, max_length=None):
    train = ElectionData.read_data('../data/election-data/training/')
    test = ElectionData.read_data('../data/election-data/testing/')
    self.batch_size = batch_size
    self.dynamic_padding = dynamic_padding
    self.train_tweets, self.train_targets, self.train_y = zip(*train)
    self.test_tweets, self.test_targets, self.test_y = zip(*test)

    # Split each tweet into the context to the left and right of its target
    self.train_left_tweets = [ElectionData.split_tweet(self.train_tweets[i], self.train_targets[i])[0]
                              for i in range(len(self.train_tweets))]
    self.train_right_tweets = [ElectionData.split_tweet(self.train_tweets[i], self.train_targets[i])[1]
                               for i in range(len(self.train_tweets))]
    self.test_left_tweets = [ElectionData.split_tweet(self.test_tweets[i], self.test_targets[i])[0]
                             for i in range(len(self.test_tweets))]
    self.test_right_tweets = [ElectionData.split_tweet(self.test_tweets[i], self.test_targets[i])[1]
                              for i in range(len(self.test_tweets))]
    self.train_tweets = [ElectionData.replace_target(self.train_tweets[i], self.train_targets[i])
                         for i in range(len(self.train_tweets))]
    self.test_tweets = [ElectionData.replace_target(self.test_tweets[i], self.test_targets[i])
                        for i in range(len(self.test_tweets))]
    self.train_targets = [train_target.split('_') for train_target in self.train_targets]
    self.test_targets = [test_target.split('_') for test_target in self.test_targets]

    # Padding tweets (manually adding '<PAD>' tokens)
    if not self.dynamic_padding:
        self.train_tweets = util.pad_sequences(self.train_tweets, pad_location='RIGHT')
        self.test_tweets = util.pad_sequences(self.test_tweets, pad_location='RIGHT')

    # Building vocabulary
    self.vocab, self.vocab_inv = util.build_vocabulary(self.train_tweets + self.test_tweets)

    if embedding:
        # Vectorizing tweets - GloVe embedding
        start = time.clock()
        print(' - Loading embedding..')
        glove, self.glove_vec, self.glove_shape, glove_vocab = util.gensim_load_vec(
            '../resources/wordemb/glove.twitter.word2vec.27B.100d.txt')
        glove_vocab = [token.encode('utf-8') for token in glove_vocab]
        self.glove_vocab_dict = {j: i for i, j in enumerate(glove_vocab)}
        # Append an all-zero vector used as the fallback for out-of-vocabulary tokens
        self.glove_vec = np.append(self.glove_vec, [[0] * self.glove_shape[1]], axis=0)
        self.glove_shape = [self.glove_shape[0] + 1, self.glove_shape[1]]
        print(' - DONE')
        print("time taken: %f mins" % ((time.clock() - start) / 60))

        if not saved:
            start = time.clock()
            print(' - Matching words-indices')
            # 1193514 is the index of the appended all-zero vector (out-of-vocabulary fallback)
            self.train_x = np.array([[self.glove_vocab_dict.get(token, 1193514) for token in tweet]
                                     for tweet in self.train_tweets])
            self.train_left_x = np.array([[self.glove_vocab_dict.get(token, 1193514) for token in tweet]
                                          for tweet in self.train_left_tweets])
            self.train_right_x = np.array([[self.glove_vocab_dict.get(token, 1193514) for token in tweet]
                                           for tweet in self.train_right_tweets])
            self.test_x = np.array([[self.glove_vocab_dict.get(token, 1193514) for token in tweet]
                                    for tweet in self.test_tweets])
            self.test_left_x = np.array([[self.glove_vocab_dict.get(token, 1193514) for token in tweet]
                                         for tweet in self.test_left_tweets])
            self.test_right_x = np.array([[self.glove_vocab_dict.get(token, 1193514) for token in tweet]
                                          for tweet in self.test_right_tweets])
            self.train_target_x = np.array([[self.glove_vocab_dict.get(token, 1193514) for token in target]
                                            for target in self.train_targets])
            self.test_target_x = np.array([[self.glove_vocab_dict.get(token, 1193514) for token in target]
                                           for target in self.test_targets])
            self.train_y = pd.get_dummies(self.train_y).values.astype(np.int32)
            self.train_df = [(self.train_x[i], self.train_left_x[i], self.train_right_x[i],
                              self.train_target_x[i], self.train_y[i]) for i in range(len(self.train_x))]
            self.test_df = [(self.test_x[i], self.test_left_x[i], self.test_right_x[i],
                             self.test_target_x[i], self.test_y[i]) for i in range(len(self.test_x))]
            train_y = np.array([d[-1] for d in self.train_df])
            self.train_df, self.dev_df = self.build_train_dev(train_y)  # Dividing into train and dev sets
            print(' - DONE')
            print("time taken: %f mins" % ((time.clock() - start) / 60))

            print(" - Saving data")
            np.save('../data/election-data/train_df.npy', self.train_df)
            np.save('../data/election-data/dev_df.npy', self.dev_df)
            np.save('../data/election-data/test_df.npy', self.test_df)
            print(' - DONE')
        else:
            print(" - Loading data")
            self.train_df = np.load('../data/election-data/train_df.npy')
            self.dev_df = np.load('../data/election-data/dev_df.npy')
            self.test_df = np.load('../data/election-data/test_df.npy')
            print(' - DONE')
    else:
        # Vectorizing tweets - one-hot indices into the local vocabulary
        self.train_x = np.array([[self.vocab[token] for token in tweet] for tweet in self.train_tweets])
        self.test_x = np.array([[self.vocab[token] for token in tweet] for tweet in self.test_tweets])

    self.create_batches()
    self.reset_batch_pointer()
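# util.pad_sequences is used above but not shown in this excerpt; a minimal sketch of the idea,
# assuming tokenized tweets (lists of strings) padded with a '<PAD>' token (the parameter names
# mirror the call above, the body is only illustrative):
def pad_sequences_sketch(sequences, pad_token='<PAD>', pad_location='RIGHT', max_length=None):
    """Pad token lists to a common length on the left or right."""
    if max_length is None:
        max_length = max(len(seq) for seq in sequences)
    padded = []
    for seq in sequences:
        padding = [pad_token] * (max_length - len(seq))
        padded.append(list(seq) + padding if pad_location == 'RIGHT' else padding + list(seq))
    return padded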
# Creating the categories from train_image_paths
categories = []
for i in train_image_paths:
    a = i.split("/")
    if a[2] not in categories:
        categories.append(a[2])
print("Categories: ", categories)

'''
Step 1: Represent each image with the appropriate feature.
Each function to construct features should return an N x d matrix, where
N is the number of paths passed to the function and d is the dimensionality
of each image representation. See the starter code for each function for more details.
'''
print('Extracting SIFT features\n')

# TODO: implement the build_vocabulary function in util.py
voc_size = 50
kmeans = build_vocabulary(train_image_paths, vocab_size=voc_size)

# TODO: implement the get_bags_of_sifts function in util.py
train_image_feats = get_bags_of_sifts(train_image_paths, kmeans)
test_image_feats = get_bags_of_sifts(test_image_paths, kmeans)

# If you want to avoid recomputing the features while debugging the
# classifiers, you can either 'save' and 'load' the extracted features
# to/from a file.
# np.save('train_image_feats', train_image_feats)
# np.save('test_image_feats', test_image_feats)
# train_image_feats = np.load('train_image_feats.npy')
# test_image_feats = np.load('test_image_feats.npy')
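# The TODOs above point at build_vocabulary and get_bags_of_sifts in util.py, which are not shown
# in this excerpt. A rough sketch of the usual approach (SIFT descriptors clustered with k-means,
# then per-image histograms over the visual words), assuming OpenCV and scikit-learn; the real
# util.py implementation may sample descriptors or use a different SIFT backend:
import cv2
import numpy as np
from sklearn.cluster import KMeans

def build_vocabulary_sketch(image_paths, vocab_size=50):
    """Cluster SIFT descriptors from the training images into vocab_size visual words."""
    sift = cv2.SIFT_create()
    descriptors = []
    for path in image_paths:
        img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        _, desc = sift.detectAndCompute(img, None)
        if desc is not None:
            descriptors.append(desc)
    return KMeans(n_clusters=vocab_size, random_state=0).fit(np.vstack(descriptors))

def get_bags_of_sifts_sketch(image_paths, kmeans):
    """Represent each image as a normalized histogram over the k-means visual words (N x vocab_size)."""
    sift = cv2.SIFT_create()
    feats = []
    for path in image_paths:
        img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        _, desc = sift.detectAndCompute(img, None)
        hist = np.zeros(kmeans.n_clusters)
        if desc is not None:
            words = kmeans.predict(desc)
            hist = np.bincount(words, minlength=kmeans.n_clusters).astype(np.float64)
            hist /= hist.sum()
        feats.append(hist)
    return np.array(feats)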
# ==============
# Params
# ==============
# Use precomputed models
is_using_saved = True
VOCAB_SIZE = 800          # number of k-means clusters (visual words) in the BoW of SIFT vocabulary
K_CLASSIFIER = 10         # k in KNN for the classifier
C_PEN_CLASSIFIER = 16     # SVM penalty parameter

if not is_using_saved:
    print("Creating new BoW representation.")
    kmeans = build_vocabulary(train_image_paths, vocab_size=VOCAB_SIZE)
    train_image_feats = get_bags_of_sifts(train_image_paths, kmeans, 'train')
    test_image_feats = get_bags_of_sifts(test_image_paths, kmeans, 'test')
else:
    print("Using saved BoW representation.")
    kmeans = pickle.load(open(f"models/kmeans-vocab-{VOCAB_SIZE}.pkl", "rb"))
    train_image_feats = pickle.load(open(f"models/train-features-{VOCAB_SIZE}.pkl", "rb"))
    test_image_feats = pickle.load(open(f"models/test-features-{VOCAB_SIZE}.pkl", "rb"))

'''
Step 2: Classify each test image by training and using the appropriate classifier.
Each function to classify test features will return an N x 1 array, where N is the
number of test cases and each entry is a string indicating the predicted category
for that test image. See the starter code for each function for more details.
'''
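# Step 2 is not implemented in this excerpt. A minimal sketch of the two usual classifiers for
# this setup, assuming train_labels holds one category string per training image (the name
# train_labels is an assumption, not taken from the code above):
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

def nearest_neighbor_classify_sketch(train_image_feats, train_labels, test_image_feats, k=K_CLASSIFIER):
    """Predict one category string per test image with a k-nearest-neighbor vote."""
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(train_image_feats, train_labels)
    return knn.predict(test_image_feats)

def svm_classify_sketch(train_image_feats, train_labels, test_image_feats, c=C_PEN_CLASSIFIER):
    """Predict one category string per test image with a linear support vector machine."""
    svm = LinearSVC(C=c)
    svm.fit(train_image_feats, train_labels)
    return svm.predict(test_image_feats)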