class PreProcessor():
    """Pre-processing pipeline: convert raw text into a fixed-length
    list of word-embedding dictionary indexes.

    Steps: clean -> tokenize -> map tokens to indexes -> pad/trim to
    ``padding_size``.
    """

    def __init__(self, padding_size=20, max_dictionary_size=500000):
        """Build the text processor and embedding lookup.

        Args:
            padding_size: target length of every output sequence.
            max_dictionary_size: cap on the number of embedding entries
                loaded from the dictionary file.
        """
        self.text_processor = TextProcessor()
        self.embedding = WordEmbedding(max_dictionary_size=max_dictionary_size)
        self.embedding.load_embedding_dictionary(self.embedding.dictionary_path)
        self.padding_size = padding_size

    def pre_process_text(self, text):
        """Return *text* as a zero-padded list of embedding indexes.

        Args:
            text: raw input string.

        Returns:
            list[int] of exactly ``self.padding_size`` indexes.
        """
        cleaned_text = self.text_processor.clean_text(text)
        tokens = self.text_processor.tokenize_text(cleaned_text)
        # Fix: removed a leftover debug ``print(tokens)`` that polluted
        # stdout on every call; also corrected the misspelled local name.
        embedding_indexes = self.embedding.replace_tokens_with_index(tokens)
        return self.pad_sequence(embedding_indexes)

    def pad_sequence(self, input_sequence):
        """Trim/pad *input_sequence* to exactly ``self.padding_size``.

        Keeps the LAST ``padding_size`` items (most recent tokens) and
        right-pads with zeros when the sequence is shorter.
        """
        sequence = input_sequence[-self.padding_size:]
        # shortfall is never negative because the slice above already
        # capped the length at padding_size.
        shortfall = self.padding_size - len(sequence)
        return sequence + [0] * shortfall
def setUp(self):
    """Create a small WordEmbedding fixture and load its dictionary."""
    embeddings = WordEmbedding(max_dictionary_size=1000)
    dictionary_path = embeddings.dictionary_path
    self.embeddings = embeddings
    self.embedding_dictionary = embeddings.load_embedding_dictionary(
        dictionary_path)
def __init__(self, padding_size=20, max_dictionary_size=500000):
    """Set up the text processor and word embedding used for pre-processing.

    Args:
        padding_size: target length for padded index sequences.
        max_dictionary_size: cap on loaded embedding dictionary entries.
    """
    self.padding_size = padding_size
    self.text_processor = TextProcessor()
    embedding = WordEmbedding(max_dictionary_size=max_dictionary_size)
    embedding.load_embedding_dictionary(embedding.dictionary_path)
    self.embedding = embedding
class PreProcessor():
    """End-to-end text pre-processing: clean, tokenize, index, pad."""

    def __init__(self, padding_size=20, max_dictionary_size=500000):
        """Prepare the cleaning/tokenizing helper and the embedding lookup."""
        self.padding_size = padding_size
        self.text_processor = TextProcessor()
        self.embedding = WordEmbedding(max_dictionary_size=max_dictionary_size)
        self.embedding.load_embedding_dictionary(self.embedding.dictionary_path)

    def pre_process_text(self, text):
        """Turn raw *text* into a zero-padded list of embedding indexes."""
        tokens = self.text_processor.tokenize_text(
            self.text_processor.clean_text(text))
        indexes = self.embedding.replace_tokens_with_index(tokens)
        return self.pad_sequence(indexes)

    def pad_sequence(self, input_squence):
        """Keep the last ``padding_size`` items; right-pad with zeros."""
        tail = input_squence[-self.padding_size:]
        return tail + [0] * (self.padding_size - len(tail))
class PreProcessor():
    """Pre-process tweets into fixed-length index sequences."""

    def __init__(self, padding_size=20, max_dictionary_size=500000):
        """Load the embedding dictionary and remember the padding length."""
        self.text_processor = TextProcessor()
        embedding = WordEmbedding(max_dictionary_size=max_dictionary_size)
        embedding.load_embedding_dictionary(embedding.dictionary_path)
        self.embedding = embedding
        self.padding_size = padding_size

    def pre_process_text(self, text):
        """Clean and tokenize *text*, map tokens to indexes, then pad."""
        cleaned = self.text_processor.clean_text(text)
        token_list = self.text_processor.tokenize_text(cleaned)
        index_list = self.embedding.replace_tokens_with_index(token_list)
        return self.pad_sequence(index_list)

    def pad_sequence(self, input_squence):
        """Trim to the last ``padding_size`` items and zero-fill the rest."""
        window = input_squence[-self.padding_size:]
        # window is a fresh slice, so appending never mutates the caller's list
        while len(window) < self.padding_size:
            window.append(0)
        return window
class Test(unittest.TestCase):
    """Unit tests for the WordEmbedding dictionary loader and token lookup."""

    def setUp(self):
        """Build a 1000-entry embedding fixture and load its dictionary."""
        self.embeddings = WordEmbedding(max_dictionary_size=1000)
        self.embedding_dictionary = self.embeddings.load_embedding_dictionary(
            self.embeddings.dictionary_path)

    def test_load_embedding_dictionary_resource(self):
        """The bundled dictionary loads and is capped at max_dictionary_size."""
        self.embeddings.load_embedding_dictionary(
            self.embeddings.dictionary_path)
        self.assertEqual(len(self.embeddings.embedding_dictionary), 1000)

    def test_load_from_zip(self):
        """A dictionary stored inside a zip archive loads the same way."""
        archive_root = self.embeddings.dictionary_path.replace(
            "glove.twitter.txt", "test_resources.zip")
        zip_resource = os.path.join(
            archive_root, "pre_processing", "..", "test_resources",
            "glove.twitter.txt")
        self.embeddings.load_embedding_dictionary(zip_resource)
        self.assertEqual(len(self.embeddings.embedding_dictionary), 1000)

    def test_replace_tokens_with_index(self):
        """Known tokens resolve to their expected dictionary indexes."""
        token_list = ['love', '<user>', 'moon', 'back']
        result = self.embeddings.replace_tokens_with_index(token_list)
        self.assertEqual(result, [56, 2, 1, 98])