Example #1
    def __init__(self, dataset):
        super(BOWTrainer, self).__init__(dataset, num_classes=None)    

        print(datetime.datetime.now(), 'Building vocabulary')
        self.vocabulary = util.build_vocabulary(self.dataset.df_train['words'].values)

        print(datetime.datetime.now(), 'Preparing data')
        #ids_string is a hack to make *Vectorizer work with a given dictionary so we have more flexibility when building the dictionary
        self.dataset.df_train['ids'] = self.dataset.df_train['words'].apply(lambda x: util.words2ids(x, self.vocabulary))
        self.dataset.df_train['ids_string'] = self.dataset.df_train['ids'].apply(lambda x: ' '.join([str(word).zfill(4) for word in x]))
        self.dataset.df_test['ids'] = self.dataset.df_test['words'].apply(lambda x: util.words2ids(x, self.vocabulary))
        self.dataset.df_test['ids_string'] = self.dataset.df_test['ids'].apply(lambda x: ' '.join([str(word).zfill(4) for word in x]))
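
# The snippet above relies on util.build_vocabulary and util.words2ids, which are not
# shown here. A minimal sketch of what such helpers could look like; the actual
# implementation in util.py may differ, and the <UNK> id below is an assumption:

from collections import Counter

def build_vocabulary(token_lists, min_count=1):
    # Count tokens across all documents and assign an integer id to each token
    # seen at least min_count times; id 0 is reserved for unknown words.
    counts = Counter(token for tokens in token_lists for token in tokens)
    vocabulary = {'<UNK>': 0}
    for token, count in counts.most_common():
        if count >= min_count:
            vocabulary[token] = len(vocabulary)
    return vocabulary

def words2ids(tokens, vocabulary):
    # Map each token to its id, falling back to the <UNK> id for unseen tokens.
    return [vocabulary.get(token, vocabulary['<UNK>']) for token in tokens]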
Example #2
    def prepare_wordlevel(self, opt_trainer):
        print(datetime.datetime.now(), 'Loading Glove')
        glove_vocabulary = util.load_glove_vocabulary(opt_trainer)
        print(datetime.datetime.now(), 'Building vocabulary')
        self.vocabulary = util.build_vocabulary(self.df_train['words'].values, glove_vocabulary)
        print(datetime.datetime.now(), 'Preparing data')
        self.df_train['ids'] = self.df_train['words'].apply(lambda x: util.words2ids(x, self.vocabulary))
        self.df_test['ids'] = self.df_test['words'].apply(lambda x: util.words2ids(x, self.vocabulary))
        print(datetime.datetime.now(), 'Building embeddings')
        self.embeddings = util.build_embeddings(opt_trainer, self.vocabulary)

        self.status = 'WordLevel'
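
# util.build_embeddings and util.load_glove_vocabulary are not shown. A plausible
# sketch of the embedding builder, assuming opt_trainer exposes a GloVe file path and
# an embedding size (the attribute names glove_path and embedding_dim are hypothetical):

import numpy as np

def build_embeddings(opt_trainer, vocabulary):
    # Start from small random vectors, then overwrite the rows of words that
    # appear in the GloVe text file with their pretrained vectors.
    embeddings = np.random.uniform(
        -0.05, 0.05, (len(vocabulary), opt_trainer.embedding_dim)).astype(np.float32)
    with open(opt_trainer.glove_path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            word, vector = parts[0], parts[1:]
            if word in vocabulary and len(vector) == opt_trainer.embedding_dim:
                embeddings[vocabulary[word]] = np.asarray(vector, dtype=np.float32)
    return embeddings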
Example #3
	def __init__(self, batch_size, dynamic_padding=False, preprocessing=False, embedding=True, saved=False, max_length=None):
		train = ElectionData.read_data('../data/election-data/training/')
		test = ElectionData.read_data('../data/election-data/testing/')
		self.batch_size = batch_size
		self.dynamic_padding = dynamic_padding
		self.train_tweets, self.train_targets, self.train_y = zip(*train)
		self.test_tweets, self.test_targets, self.test_y = zip(*test)

		self.train_left_tweets = [ElectionData.split_tweet(self.train_tweets[i], self.train_targets[i])[0] for i in range(len(self.train_tweets))]
		self.train_right_tweets = [ElectionData.split_tweet(self.train_tweets[i], self.train_targets[i])[1] for i in range(len(self.train_tweets))]
		self.test_left_tweets = [ElectionData.split_tweet(self.test_tweets[i], self.test_targets[i])[0] for i in range(len(self.test_tweets))]
		self.test_right_tweets = [ElectionData.split_tweet(self.test_tweets[i], self.test_targets[i])[1] for i in range(len(self.test_tweets))]

		self.train_tweets = [ElectionData.replace_target(self.train_tweets[i], self.train_targets[i]) for i in range(len(self.train_tweets))]
		self.test_tweets = [ElectionData.replace_target(self.test_tweets[i], self.test_targets[i]) for i in range(len(self.test_tweets))]
		self.train_targets = [train_target.split('_') for train_target in self.train_targets]
		self.test_targets = [test_target.split('_') for test_target in self.test_targets]

		# Padding tweets (manually adding '<PAD>' tokens)
		if not self.dynamic_padding:
			self.train_tweets = util.pad_sequences(self.train_tweets, pad_location='RIGHT')
			self.test_tweets = util.pad_sequences(self.test_tweets, pad_location='RIGHT')

		# Building vocabulary
		self.vocab, self.vocab_inv = util.build_vocabulary(self.train_tweets + self.test_tweets)

		if embedding:
			# Vectorizing tweets - Glove embedding
			start = time.clock()
			print(' - Loading embedding..')
			glove, self.glove_vec, self.glove_shape, glove_vocab = util.gensim_load_vec('../resources/wordemb/glove.twitter.word2vec.27B.100d.txt')
			glove_vocab = [token.encode('utf-8') for token in glove_vocab]
			self.glove_vocab_dict = {j:i for i, j in enumerate(glove_vocab)}
			self.glove_vec = np.append(self.glove_vec, [[0]*self.glove_shape[1]], axis=0)
			self.glove_shape = [self.glove_shape[0]+1, self.glove_shape[1]]
			print(' - DONE')
			print("time taken: %f mins"%((time.clock() - start)/60))

			if saved==False:
				start = time.clock()
				print(' - Matching words-indices')
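				# NOTE: 1193514 appears to be the index of the all-zero row appended to glove_vec above,
				# used as the out-of-vocabulary fallback id (this reading is an assumption).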
				self.train_x = np.array([[self.glove_vocab_dict[token] if token in glove_vocab else 1193514 for token in tweet] for tweet in self.train_tweets])
				self.train_left_x = np.array([[self.glove_vocab_dict[token] if token in glove_vocab else 1193514 for token in tweet] for tweet in self.train_left_tweets])
				self.train_right_x = np.array([[self.glove_vocab_dict[token] if token in glove_vocab else 1193514 for token in tweet] for tweet in self.train_right_tweets])
				self.test_x = np.array([[self.glove_vocab_dict[token] if token in glove_vocab else 1193514 for token in tweet] for tweet in self.test_tweets])
				self.test_left_x = np.array([[self.glove_vocab_dict[token] if token in glove_vocab else 1193514 for token in tweet] for tweet in self.test_left_tweets])
				self.test_right_x = np.array([[self.glove_vocab_dict[token] if token in glove_vocab else 1193514 for token in tweet] for tweet in self.test_right_tweets])
				self.train_target_x = np.array([[self.glove_vocab_dict[token] if token in glove_vocab else 1193514 for token in target] for target in self.train_targets])
				self.test_target_x = np.array([[self.glove_vocab_dict[token] if token in glove_vocab else 1193514 for token in target] for target in self.test_targets])
				self.train_y = pd.get_dummies(self.train_y).values.astype(np.int32)

				self.train_df = [(self.train_x[i], self.train_left_x[i], self.train_right_x[i], self.train_target_x[i], self.train_y[i]) 
								for i in range(len(self.train_x))]
				self.test_df = [(self.test_x[i], self.test_left_x[i], self.test_right_x[i], self.test_target_x[i], self.test_y[i]) 
								for i in range(len(self.test_x))]

				train_y = np.array([d[-1] for d in self.train_df])
				self.train_df, self.dev_df = self.build_train_dev(train_y) # Dividing to train and dev set
				print(' - DONE')
				print("time taken: %f mins"%((time.clock() - start)/60))
				print(" - Saving data")
				np.save('../data/election-data/train_df.npy', self.train_df)
				np.save('../data/election-data/dev_df.npy', self.dev_df)
				np.save('../data/election-data/test_df.npy', self.test_df)
				print(' - DONE')
			else:
				print(" - Loading data")
				self.train_df = np.load('../data/election-data/train_df.npy')
				self.dev_df = np.load('../data/election-data/dev_df.npy')
				self.test_df = np.load('../data/election-data/test_df.npy')
				print(' - DONE')

		else:
			# Vectorizing tweets - one-hot-vector
			self.train_x = np.array([[self.vocab[token] for token in tweet] for tweet in self.train_tweets])
			self.test_x = np.array([[self.vocab[token] for token in tweet] for tweet in self.test_tweets])

		self.create_batches()
		self.reset_batch_pointer()
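
# util.pad_sequences (used above when dynamic_padding is off) is another external
# helper. A rough sketch consistent with the call signature, assuming each tweet is
# already a list of tokens; the real util.py version may behave differently:

def pad_sequences(sequences, pad_token='<PAD>', pad_location='RIGHT', max_length=None):
    # Pad every token list to a common length so a batch can be stacked into one tensor.
    max_length = max_length or max(len(seq) for seq in sequences)
    padded = []
    for seq in sequences:
        padding = [pad_token] * (max_length - len(seq))
        padded.append(list(seq) + padding if pad_location == 'RIGHT' else padding + list(seq))
    return padded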
Example #4
# creating the list of categories from train_image_paths
categories = []
for i in train_image_paths:
    a = i.split("/")
    if a[2] not in categories: categories.append(a[2])
print("Categories: ", categories)
''' Step 1: Represent each image with the appropriate feature
 Each function to construct features should return an N x d matrix, where
 N is the number of paths passed to the function and d is the 
 dimensionality of each image representation. See the starter code for
 each function for more details. '''

print('Extracting SIFT features\n')
# TODO: Implement the build_vocabulary function in util.py
voc_size = 50
kmeans = build_vocabulary(train_image_paths, vocab_size=voc_size)

# TODO: Implement the get_bags_of_sifts function in util.py

train_image_feats = get_bags_of_sifts(train_image_paths, kmeans)
test_image_feats = get_bags_of_sifts(test_image_paths, kmeans)
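
# build_vocabulary and get_bags_of_sifts are the TODOs in util.py referenced above.
# One possible sketch of the bag-of-SIFT step, assuming the vocabulary is a fitted
# sklearn KMeans model and OpenCV (>= 4.4) provides cv2.SIFT_create; the sampling and
# normalization choices here are illustrative, not the reference implementation:

import cv2
import numpy as np

def get_bags_of_sifts_sketch(image_paths, kmeans):
    # For each image: compute SIFT descriptors, assign each descriptor to its nearest
    # cluster centre, and return a normalized histogram of assignments per image.
    sift = cv2.SIFT_create()
    vocab_size = kmeans.n_clusters
    feats = np.zeros((len(image_paths), vocab_size), dtype=np.float32)
    for i, path in enumerate(image_paths):
        image = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        _, descriptors = sift.detectAndCompute(image, None)
        if descriptors is None:
            continue
        assignments = kmeans.predict(descriptors.astype(np.float32))
        histogram = np.bincount(assignments, minlength=vocab_size).astype(np.float32)
        feats[i] = histogram / max(histogram.sum(), 1.0)
    return feats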

# If you want to avoid recomputing the features while debugging the
# classifiers, you can 'save' and 'load' the extracted features
# to/from a file:

#np.save('train_image_feats', train_image_feats)
#np.save('test_image_feats', test_image_feats)

#train_image_feats = np.load('train_image_feats.npy')
#test_image_feats = np.load('test_image_feats.npy')

# ==============
#   Params
# ==============

# Use precomputed models
is_using_saved = True

VOCAB_SIZE = 800 # vocabulary size (k in k-means) for the BoW-of-SIFT representation
K_CLASSIFIER = 10 # k in the KNN classifier
C_PEN_CLASSIFIER = 16 # SVM penalty parameter

if not is_using_saved:
    print("Creating new BoW representation.")
    kmeans = build_vocabulary(train_image_paths, vocab_size=VOCAB_SIZE)
    train_image_feats = get_bags_of_sifts(train_image_paths, kmeans, 'train')
    test_image_feats = get_bags_of_sifts(test_image_paths, kmeans, 'test')

else:
    print("Using saved BoW representation.")
    kmeans = pickle.load(open(f"models/kmeans-vocab-{VOCAB_SIZE}.pkl", "rb"))
    train_image_feats = pickle.load(open(f"models/train-features-{VOCAB_SIZE}.pkl", "rb"))
    test_image_feats = pickle.load(open(f"models/test-features-{VOCAB_SIZE}.pkl", "rb"))


''' Step 2: Classify each test image by training and using the appropriate classifier
 Each function to classify test features will return an N x 1 array,
 where N is the number of test cases and each entry indicates the
 predicted category (encoded as a one-hot vector) for each test image.
 See the starter code for each function for more details. '''
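
# The classifiers for Step 2 are not shown here. A compact sketch of both options
# using scikit-learn, reusing K_CLASSIFIER and C_PEN_CLASSIFIER from the params above;
# the function names are placeholders and train_labels is assumed to be a 1-D array of
# category names rather than one-hot vectors:

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

def knn_classify_sketch(train_feats, train_labels, test_feats, k=K_CLASSIFIER):
    # Each test image receives the majority label of its k nearest training images
    # in bag-of-SIFT feature space.
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(train_feats, train_labels)
    return knn.predict(test_feats)

def svm_classify_sketch(train_feats, train_labels, test_feats, c=C_PEN_CLASSIFIER):
    # One-vs-rest linear SVMs; C is the penalty parameter defined above.
    svm = LinearSVC(C=c)
    svm.fit(train_feats, train_labels)
    return svm.predict(test_feats)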