def process_text_data(file_path, vocab_size):
    """
    Build a DataReader for ``file_path`` and preprocess it for training.

    The work itself is delegated to ``DataReader.process_data`` (defined
    outside this function). According to the original description of this
    pipeline, the preprocessing consists of the following steps:

    1. Split the text into an array of words. For example, the text
       'I want to learn wordvec to do cool stuff' becomes:
           ['I', 'want', 'to', 'learn', 'wordvec', 'to', 'do', 'cool',
            'stuff']

    2. Count how often each word occurs:
           [('I', 1), ('want', 1), ('to', 2), ('learn', 1), ('wordvec', 1),
            ('do', 1), ('cool', 1), ('stuff', 1)]

    3. Keep the most frequent words as the vocabulary; ``vocab_size``
       decides how many words are kept.

    4. Build two lookup dictionaries from the count array:
           index2word: {0: 'I', 1: 'want', 2: 'to', 3: 'learn',
                        4: 'wordvec', 5: 'do', 6: 'cool', 7: 'stuff'}
           word2index: {'I': 0, 'want': 1, 'to': 2, 'learn': 3,
                        'wordvec': 4, 'do': 5, 'cool': 6, 'stuff': 7}

    5. Translate the word array into an array of indices using the
       word2index dictionary, so the example above becomes:
           [0, 1, 2, 3, 4, 2, 5, 6, 7]
       A word that is missing from word2index is treated as unknown, and
       every unknown word is mapped to the same index.

    :param file_path: location of the text file, passed to ``DataReader``.
    :param vocab_size: number of words to keep in the vocabulary, passed
        to ``DataReader.process_data``.
    :return: the ``DataReader`` instance after ``process_data`` has run.
    """
    reader = DataReader(file_path)
    reader.process_data(vocab_size)
    return reader
def test_run_training(self):
    """
    Check that a short SkipGram training run is fast and reaches a low loss.

    The basic corpus is loaded and preprocessed with a vocabulary of 500
    words, a ``wv.SkipGramModel`` is configured for 200 steps, and
    ``wv.run_training`` is expected to return a ``(duration, loss)`` pair
    where ``duration <= 1.7`` and ``loss < 7``.

    NOTE(review): the duration bound is presumably in seconds and depends
    on the machine running the test -- confirm against ``wv.run_training``.
    """
    vocab_size = 500
    # Upper bounds carried over unchanged from the original assertions.
    max_duration = 1.7
    max_loss = 7

    data = DataReader(get_path_basic_corpus())
    data.process_data(vocab_size)

    config = wv.Config(num_steps=200, vocab_size=vocab_size, show_step=2)
    model = wv.SkipGramModel(config)
    duration, loss = wv.run_training(
        model,
        data,
        verbose=False,
        visualization=False,
        debug=True,
    )

    # Dedicated comparison asserts report both operands on failure, unlike
    # assertTrue(a <= b), which only reports "False is not true".
    self.assertLessEqual(duration, max_duration)
    self.assertLess(loss, max_loss)