def merge_dictionary(candidates):
    dictionary = Dictionary([])
    for candidate in candidates:
        dictionary = dictionary.merge(candidate.dictionary())
    return dictionary
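# A minimal usage sketch for merge_dictionary, assuming each candidate exposes
# the dictionary() accessor shown later in this section; the Candidate class
# here is purely illustrative.
class Candidate:
    def __init__(self, words):
        self.words = words

    def dictionary(self):
        return Dictionary(self.words)

merged = merge_dictionary([Candidate(["foo"]), Candidate(["bar", "baz"])])
# merged now holds the union of all candidate word lists.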
def test_get_part_of_speech(self):
    dictionary = Dictionary()
    filedir = os.path.dirname(os.path.realpath('__file__'))
    filename = os.path.join(filedir, "tests/dictionary.json")
    dictionary.load_words(filename)
    pos = dictionary.get_part_of_speech("torch")
    self.assertEqual(pos, "noun")
def test_words(self):
    dictionary = Dictionary()
    filedir = os.path.dirname(os.path.realpath('__file__'))
    filename = os.path.join(filedir, "tests/dictionary.json")
    dictionary.load_words(filename)
    words = dictionary.words
    # assertTrue(len(words), 16) would always pass; assertEqual is intended
    self.assertEqual(len(words), 16)
def init_systems(self):
    self.dictionary = Dictionary()
    self.input_handler = InputHandler(self.dictionary)
    self.mob_factory = MobFactory()
    self.renderer = Renderer()
    self.event_queue = EventQueue()
    self.event_queue.register_system(self)
    self.message_log = MessageLog(self.event_queue)
    self.combat_system = CombatSystem(self.event_queue)
    self.init_crafting_systems()
def test_lexer(self):
    dictionary = Dictionary()
    filedir = os.path.dirname(os.path.realpath('__file__'))
    filename = os.path.join(filedir, "tests/dictionary.json")
    dictionary.load_words(filename)
    parser = Parser(dictionary)
    src = "take the torch from the table"
    tokens = parser.lexer(src)
    self.assertEqual(len(tokens), 6)
def test_parser_with_adjectives_wrong(self):
    dictionary = Dictionary()
    filedir = os.path.dirname(os.path.realpath('__file__'))
    filename = os.path.join(filedir, "tests/dictionary.json")
    dictionary.load_words(filename)
    parser = Parser(dictionary)
    src = "take the key rusty from the table"
    tree = parser.parse(src)
    self.assertFalse(tree)
def test_verb(self):
    dictionary = Dictionary()
    filedir = os.path.dirname(os.path.realpath('__file__'))
    filename = os.path.join(filedir, "tests/dictionary.json")
    dictionary.load_words(filename)
    parser = Parser(dictionary)
    src = "move"
    tree = parser.parse(src)
    self.assertTrue(tree)
    self.assertEqual(tree.verb.verb.word, "move")
def test_parser_with_adjectives(self):
    dictionary = Dictionary()
    filedir = os.path.dirname(os.path.realpath('__file__'))
    filename = os.path.join(filedir, "tests/dictionary.json")
    dictionary.load_words(filename)
    parser = Parser(dictionary)
    src = "take the rusty key from the table"
    tree = parser.parse(src)
    self.assertTrue(tree)
    self.assertTrue(tree.noun_phrase)
    self.assertEqual(tree.noun_phrase.modifier.adjective.word, "rusty")
def test_parser_phrasal_verb(self):
    dictionary = Dictionary()
    filedir = os.path.dirname(os.path.realpath('__file__'))
    filename = os.path.join(filedir, "tests/dictionary.json")
    dictionary.load_words(filename)
    parser = Parser(dictionary)
    src = "pick up the torch"
    tree = parser.parse(src)
    self.assertTrue(tree)
    self.assertEqual(tree.verb.verb.word, "pick up")
    self.assertEqual(tree.noun_phrase.noun.word, "torch")
def test_visitor_simple(self):
    dictionary = Dictionary()
    filedir = os.path.dirname(os.path.realpath('__file__'))
    filename = os.path.join(filedir, "tests/dictionary.json")
    dictionary.load_words(filename)
    parser = Parser(dictionary)
    src = "move"
    tree = parser.parse(src)
    visitor = Visitor()
    command = tree.accept(visitor)
    self.assertTrue(command)
    self.assertEqual(command["verb"], "move")
def test_parser_complex(self):
    dictionary = Dictionary()
    filedir = os.path.dirname(os.path.realpath('__file__'))
    filename = os.path.join(filedir, "tests/dictionary.json")
    dictionary.load_words(filename)
    parser = Parser(dictionary)
    src = "take the torch from the table"
    tree = parser.parse(src)
    self.assertTrue(tree)
    self.assertTrue(tree.prep_phrase)
    self.assertEqual(tree.prep_phrase.prep.prep.word, "from")
    self.assertEqual(tree.prep_phrase.noun_phrase.noun.word, "table")
def read_txt_embeddings(lang, full_vocab=False):
    """
    Reload pretrained embeddings from a text file.
    """
    word2id = {}
    vectors = []

    # load pretrained embeddings
    emb_path = 'vectors/wiki.{}.vec'.format(lang)
    _emb_dim_file = 300
    max_vocab = 200000
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        for i, line in enumerate(f):
            if i == 0:
                # first line holds the vocabulary size and the embedding dimension
                split = line.split()
                assert len(split) == 2
                assert _emb_dim_file == int(split[1])
            else:
                word, vect = line.rstrip().split(' ', 1)
                if not full_vocab:
                    word = word.lower()
                vect = np.fromstring(vect, sep=' ')
                if np.linalg.norm(vect) == 0:  # avoid having null embeddings
                    vect[0] = 0.01
                if word in word2id:
                    if full_vocab:
                        print("Word '%s' found twice in embedding file" % word)
                else:
                    if not vect.shape == (_emb_dim_file,):
                        print("Invalid dimension (%i) for word '%s' in line %i."
                              % (vect.shape[0], word, i))
                        continue
                    assert vect.shape == (_emb_dim_file,), i
                    word2id[word] = len(word2id)
                    vectors.append(vect[None])
            if max_vocab > 0 and len(word2id) >= max_vocab and not full_vocab:
                break

    assert len(word2id) == len(vectors)
    # logger.info("Loaded %i pre-trained word embeddings." % len(vectors))

    # compute new vocabulary / embeddings
    id2word = {v: k for k, v in word2id.items()}
    dico = Dictionary(id2word, word2id, lang)
    embeddings = np.concatenate(vectors, 0)
    assert embeddings.shape == (len(dico), _emb_dim_file)
    return dico, embeddings
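# Usage sketch for read_txt_embeddings, assuming a fastText-style text file
# exists at the path pattern the function builds (vectors/wiki.en.vec here),
# and that the MUSE-style Dictionary exposes the word2id mapping it was
# constructed from (an assumption).
dico, embeddings = read_txt_embeddings('en')
print(len(dico), embeddings.shape)            # e.g. 200000 (200000, 300)
print(embeddings[dico.word2id['hello']][:5])  # first 5 dims of "hello"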
def test_parser_simple(self):
    dictionary = Dictionary()
    filedir = os.path.dirname(os.path.realpath('__file__'))
    filename = os.path.join(filedir, "tests/dictionary.json")
    dictionary.load_words(filename)
    parser = Parser(dictionary)
    src = "take the torch"
    tree = parser.parse(src)
    self.assertTrue(tree)
    self.assertTrue(isinstance(tree, VerbPhrase))
    self.assertTrue(isinstance(tree.verb.verb.word, str))
    # assertTrue(x, y) treats y as a message; assertEqual is intended here
    self.assertEqual(tree.verb.verb.word, "take")
    self.assertEqual(tree.noun_phrase.noun.word, "torch")
def test_parser_with_adverb(self):
    dictionary = Dictionary()
    filedir = os.path.dirname(os.path.realpath('__file__'))
    filename = os.path.join(filedir, "tests/dictionary.json")
    dictionary.load_words(filename)
    parser = Parser(dictionary)
    src = "move north"
    tree = parser.parse(src)
    self.assertTrue(tree)
    self.assertTrue(isinstance(tree, VerbPhrase))
    self.assertTrue(isinstance(tree.verb.verb.word, str))
    # assertTrue(x, y) treats y as a message; assertEqual is intended here
    self.assertEqual(tree.verb.verb.word, "move")
    self.assertEqual(tree.adverb_phrase.adverb.word, "north")
def test_load_words(self):
    dictionary = Dictionary()
    filedir = os.path.dirname(os.path.realpath('__file__'))
    filename = os.path.join(filedir, "tests/dictionary.json")
    try:
        dictionary.load_words(filename)
    except Exception:
        self.fail("load_words raised an unexpected exception")
    # assertTrue(len(...), n) would always pass; assertEqual is intended
    self.assertEqual(len(dictionary.verbs), 3)
    self.assertEqual(len(dictionary.nouns), 3)
    self.assertEqual(len(dictionary.adverbs), 4)
    self.assertEqual(len(dictionary.prepositions), 3)
    self.assertEqual(len(dictionary.adjectives), 1)
    self.assertEqual(len(dictionary.articles), 3)
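# For reference, a hypothetical tests/dictionary.json consistent with the
# counts asserted above (3 verbs, 3 nouns, 4 adverbs, 3 prepositions,
# 1 adjective, 3 articles) and the words the parser tests exercise. The key
# names and any words not referenced by a test are assumptions.
import json

words = {
    "verbs": ["move", "take", "pick up"],
    "nouns": ["torch", "key", "table"],
    "adverbs": ["north", "south", "east", "west"],
    "prepositions": ["from", "to", "on"],
    "adjectives": ["rusty"],
    "articles": ["the", "a", "an"],
}

with open("tests/dictionary.json", "w") as f:
    json.dump(words, f, indent=2)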
def test_parser_phrasal_verb_complex(self):
    dictionary = Dictionary()
    filedir = os.path.dirname(os.path.realpath('__file__'))
    filename = os.path.join(filedir, "tests/dictionary.json")
    dictionary.load_words(filename)
    parser = Parser(dictionary)
    src = "pick up the rusty key from the table"
    tree = parser.parse(src)
    self.assertTrue(tree)
    self.assertEqual(tree.verb.verb.word, "pick up")
    self.assertEqual(tree.noun_phrase.modifier.adjective.word, "rusty")
    self.assertEqual(tree.noun_phrase.noun_phrase.noun.word, "key")
    self.assertEqual(tree.prep_phrase.prep.prep.word, "from")
    self.assertEqual(tree.prep_phrase.noun_phrase.noun.word, "table")
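# The attribute chains asserted in these parser tests imply a tree shape
# roughly like the sketch below. This is reconstructed from the tests alone;
# the real node classes almost certainly differ in detail.
from dataclasses import dataclass
from typing import Optional

@dataclass
class Word:
    word: str

@dataclass
class Verb:
    verb: Word                    # tree.verb.verb.word == "pick up"

@dataclass
class Preposition:
    prep: Word

@dataclass
class Modifier:
    adjective: Word               # ...modifier.adjective.word == "rusty"

@dataclass
class AdverbPhrase:
    adverb: Word                  # tree.adverb_phrase.adverb.word == "north"

@dataclass
class NounPhrase:
    noun: Optional[Word] = None
    modifier: Optional[Modifier] = None
    noun_phrase: Optional["NounPhrase"] = None  # nested under a modifier

@dataclass
class PrepPhrase:
    prep: Preposition
    noun_phrase: NounPhrase

@dataclass
class VerbPhrase:                 # the root node parser.parse returns
    verb: Verb
    noun_phrase: Optional[NounPhrase] = None
    prep_phrase: Optional[PrepPhrase] = None
    adverb_phrase: Optional[AdverbPhrase] = None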
def test_attack_when_one_repeated(self):
    self.attacker = DictionaryAttacker(FREQUENCIES, Dictionary(["foo"]))
    self.assertEqual(self.attacker.attack(self.cipher.encrypt("foo foo fo")),
                     "foo foo fo")
from src.dictionary import Dictionary
from Crypto.Cipher import AES
from Crypto.Util.Padding import pad

d = Dictionary()
message = "e18d1c933da3f3bf1517ec5033fff8f7998d8108d88d04688510f94dde36d5d69f3a3fb92e4515d72204fd420d079156e0434caeee957c9784390c85e4354acb"

for word in d.wordarray:
    key = bytearray(word, "utf-8")
    # pad the candidate word up to the next valid AES key size
    if len(key) < 16:
        key = pad(key, 16)
    elif 16 < len(key) < 24:
        key = pad(key, 24)
    elif 24 < len(key) < 32:
        key = pad(key, 32)
    if len(key) not in (16, 24, 32):  # AES keys are 128/192/256 bits only
        continue
    cipher = AES.new(key, AES.MODE_ECB)
    # the 64-byte ciphertext is already block-aligned, so decrypt it as-is
    plain_text = cipher.decrypt(bytearray.fromhex(message))
    try:
        print(plain_text.decode("utf-8"))
    except UnicodeDecodeError:
        continue
def test_contains_when_dictionary_is_empty(self):
    self.assertFalse("baz" in Dictionary([]))
def test_contains_when_dictionary_is_none(self):
    with self.assertRaises(TypeError, msg="Words must be a list."):
        "baz" in Dictionary(None)
def test_merge_when_other_is_empty(self):
    self.assertEqual(set(self.dictionary.merge(Dictionary([])).words),
                     set(self.dictionary.words))
def test_merge_when_this_is_empty(self):
    # .words added for parity with test_merge_when_other_is_empty
    self.assertEqual(set(Dictionary([]).merge(self.dictionary).words),
                     set(self.dictionary.words))
def setUp(self):
    self.dictionary = Dictionary(["foo", "bar"])
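# Taken together, the word-list tests above pin down a small Dictionary API:
# construction from a list (rejecting None with TypeError), membership via
# "in", a words attribute, and merge. A minimal sketch that would satisfy
# them; the real implementation may differ.
class Dictionary:
    def __init__(self, words):
        if not isinstance(words, list):
            raise TypeError("Words must be a list.")
        self.words = words

    def __contains__(self, word):
        return word in self.words

    def merge(self, other):
        # union of both word lists, preserving first-seen order
        seen = dict.fromkeys(self.words)
        seen.update(dict.fromkeys(other.words))
        return Dictionary(list(seen))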
def __init__(
        self,
        checkpoint_path=r"C:\Workplaces\NLP\Project\test\MachineTranslation\outputs\train1",
        dictionary_path='datasets/vi_zh') -> None:
    loader = DataLoader()
    content_cn = loader.np_load('lst_cn_all_with6k_except_1001')
    content_vn = loader.np_load('lst_vi_all_with6k_except_1001')
    for i in range(len(content_vn)):
        content_vn[i] = content_vn[i].lower()
    for i in range(len(content_cn)):
        content_cn[i] = self.preproces_cn(content_cn[i])

    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(
        content_cn, content_vn, test_size=0.2, random_state=1)
    X_val, y_val = [X_train[0]], [y_train[0]]

    full_dataset = self.create_dataset(content_cn, content_vn)
    train_examples = self.create_dataset(X_train, y_train)
    test_dataset = self.create_dataset(X_test, y_test)
    val_dataset = self.create_dataset(X_val, y_val)

    self.tokenizer_cn = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
        (cn.numpy() for cn, _ in full_dataset), target_vocab_size=2**13)
    self.tokenizer_vn = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
        (vn.numpy() for _, vn in full_dataset), target_vocab_size=2**13)

    BUFFER_SIZE = 2000
    BATCH_SIZE = 64
    train_dataset = train_examples.map(self.tf_encode)
    train_dataset = train_dataset.filter(self.filter_max_length)
    train_dataset = train_dataset.cache()
    train_dataset = train_dataset.shuffle(BUFFER_SIZE)
    train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)
    val_dataset = val_dataset.map(self.tf_encode)
    val_dataset = val_dataset.filter(self.filter_max_length)
    test_dataset = test_dataset.map(self.tf_encode)
    test_dataset = test_dataset.filter(self.filter_max_length)

    num_layers = 6
    d_model = 256  # model dimension
    dff = 512      # feed-forward dimension
    num_heads = 8  # number of attention heads (d_model % num_heads must be 0)
    input_vocab_size = self.tokenizer_cn.vocab_size + 2
    target_vocab_size = self.tokenizer_vn.vocab_size + 2
    dropout_rate = 0.1

    learning_rate = CustomSchedule(d_model)
    optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                         epsilon=1e-9)
    self.dic = Dictionary().create_dict(dictionary_path)
    self.transformer = Transformer(num_layers, d_model, num_heads, dff,
                                   input_vocab_size, target_vocab_size,
                                   pe_input=input_vocab_size,
                                   pe_target=target_vocab_size,
                                   rate=dropout_rate)

    ckpt = tf.train.Checkpoint(transformer=self.transformer, optimizer=optimizer)
    ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)
    if ckpt_manager.latest_checkpoint:
        ckpt.restore(ckpt_manager.latest_checkpoint).expect_partial()
        print('Latest checkpoint restored!!')
    else:
        # raising a plain string is invalid in Python 3; raise an exception
        raise FileNotFoundError('Checkpoint not found!')
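# Note: the "+ 2" on both vocabulary sizes mirrors the TensorFlow transformer
# tutorial this setup appears to follow; the two extra ids are reserved for
# the start and end tokens that tf_encode presumably adds to each sentence.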
def dictionary(self):
    return Dictionary(self.words)
def convert_dic(dic, lang):
    id2word, word2id = {}, {}
    for i, entry in enumerate(dic):
        id2word[i] = entry[0]
        word2id[entry[0]] = i
    return Dictionary(id2word, word2id, lang)
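# Usage sketch for convert_dic: dic is assumed to be a sequence whose entries
# carry the word at position 0 (e.g. (word, score) pairs), since that is all
# the function reads.
scored_words = [("hello", 0.91), ("world", 0.87)]
dico = convert_dic(scored_words, "en")
# dico now maps ids 0..n-1 to words and back, tagged with the language code.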
logger = create_logger(config.root_path + '/logs/train.log')
logger.info('Building dictionary')
# build the dictionary
if os.path.exists(args.dictionary):
    dictionary = joblib.load(args.dictionary)
else:
    logger.info("Loading data...")
    data = build_dict_dataset()  # build the dataset the dictionary is fit on
    # word-level or character-level granularity
    if args.word:
        data = data['raw_words'].values.tolist()
    else:
        data = data['raw_words'].apply(
            lambda x: " ".join("".join(x.split())))
    dictionary = Dictionary()
    dictionary.build_dictionary(data)
    del data
    joblib.dump(dictionary, config.dict_path)

logger.info('Loading dataset')
# dataset definitions
train_dataset = NewsDataset(config.train_path, dictionary=dictionary, word=args.word)
train_dataloader = DataLoader(train_dataset,
                              batch_size=config.batch_size,
                              collate_fn=collate_fn,
                              shuffle=True)
dev_dataset = NewsDataset(config.valid_path, dictionary=dictionary,