Example #1
def merge_dictionary(candidates):
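    """Merge each candidate's dictionary into one combined Dictionary."""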
    dictionary = Dictionary([])

    for candidate in candidates:
        dictionary = dictionary.merge(candidate.dictionary())

    return dictionary
Example #2
    def test_get_part_of_speech(self):
        dictionary = Dictionary()
        filedir = os.path.dirname(os.path.realpath('__file__'))

        filename = os.path.join(filedir, "tests/dictionary.json")
        dictionary.load_words(filename)

        pos = dictionary.get_part_of_speech("torch")
        self.assertEqual(pos, "noun")
Example #3
    def test_words(self):
        dictionary = Dictionary()
        filedir = os.path.dirname(os.path.realpath('__file__'))

        filename = os.path.join(filedir, "tests/dictionary.json")
        dictionary.load_words(filename)

        words = dictionary.words

        self.assertEqual(len(words), 16)
Example #4
 def init_systems(self):
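     """Create and register the core game systems."""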
     self.dictionary = Dictionary()
     self.input_handler = InputHandler(self.dictionary)
     self.mob_factory = MobFactory()
     self.renderer = Renderer()
     self.event_queue = EventQueue()
     self.event_queue.register_system(self)
     self.message_log = MessageLog(self.event_queue)
     self.combat_system = CombatSystem(self.event_queue)
     self.init_crafting_systems()
Example #5
    def test_lexer(self):
        dictionary = Dictionary()
        filedir = os.path.dirname(os.path.realpath('__file__'))
        filename = os.path.join(filedir, "tests/dictionary.json")
        dictionary.load_words(filename)

        parser = Parser(dictionary)
        src = "take the torch from the table"
        tokens = parser.lexer(src)
        self.assertEqual(len(tokens), 6)
Example #6
    def test_parser_with_adjectives_wrong(self):
        dictionary = Dictionary()
        filedir = os.path.dirname(os.path.realpath('__file__'))
        filename = os.path.join(filedir, "tests/dictionary.json")
        dictionary.load_words(filename)

        parser = Parser(dictionary)
        src = "take the key rusty from the table"
        tree = parser.parse(src)

        self.assertFalse(tree)
Example #7
    def test_verb(self):
        dictionary = Dictionary()
        filedir = os.path.dirname(os.path.realpath('__file__'))
        filename = os.path.join(filedir, "tests/dictionary.json")
        dictionary.load_words(filename)

        parser = Parser(dictionary)
        src = "move"
        tree = parser.parse(src)

        self.assertTrue(tree)
        self.assertEqual(tree.verb.verb.word, "move")
Example #8
    def test_parser_with_adjectives(self):
        dictionary = Dictionary()
        filedir = os.path.dirname(os.path.realpath('__file__'))
        filename = os.path.join(filedir, "tests/dictionary.json")
        dictionary.load_words(filename)

        parser = Parser(dictionary)
        src = "take the rusty key from the table"
        tree = parser.parse(src)

        self.assertTrue(tree)
        self.assertTrue(tree.noun_phrase)
        self.assertEqual(tree.noun_phrase.modifier.adjective.word, "rusty")
Example #9
    def test_parser_phrasal_verb(self):
        dictionary = Dictionary()
        filedir = os.path.dirname(os.path.realpath('__file__'))
        filename = os.path.join(filedir, "tests/dictionary.json")
        dictionary.load_words(filename)

        parser = Parser(dictionary)
        src = "pick up the torch"
        tree = parser.parse(src)

        self.assertTrue(tree)
        self.assertEqual(tree.verb.verb.word, "pick up")
        self.assertEqual(tree.noun_phrase.noun.word, "torch")
Example #10
    def test_visitor_simple(self):
        dictionary = Dictionary()
        filedir = os.path.dirname(os.path.realpath('__file__'))
        filename = os.path.join(filedir, "tests/dictionary.json")
        dictionary.load_words(filename)

        parser = Parser(dictionary)
        src = "move"
        tree = parser.parse(src)

        visitor = Visitor()
        command = tree.accept(visitor)
        self.assertTrue(command)
        self.assertEqual(command["verb"], "move")
Example #11
    def test_parser_complex(self):
        dictionary = Dictionary()
        filedir = os.path.dirname(os.path.realpath('__file__'))
        filename = os.path.join(filedir, "tests/dictionary.json")
        dictionary.load_words(filename)

        parser = Parser(dictionary)
        src = "take the torch from the table"
        tree = parser.parse(src)

        self.assertTrue(tree)
        self.assertTrue(tree.prep_phrase)
        self.assertEqual(tree.prep_phrase.prep.prep.word, "from")
        self.assertEqual(tree.prep_phrase.noun_phrase.noun.word, "table")
Example #12
import io

import numpy as np


def read_txt_embeddings(lang, full_vocab=False):
    """
    Reload pretrained embeddings from a text file.
    """
    word2id = {}
    vectors = []

    # load pretrained embeddings
    emb_path = 'vectors/wiki.{}.vec'.format(lang)
    _emb_dim_file = 300
    max_vocab = 200000

    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        for i, line in enumerate(f):
            if i == 0:
                split = line.split()
                assert len(split) == 2
                assert _emb_dim_file == int(split[1])
            else:
                word, vect = line.rstrip().split(' ', 1)
                if not full_vocab:
                    word = word.lower()
                vect = np.fromstring(vect, sep=' ')
                if np.linalg.norm(vect) == 0:  # avoid null embeddings
                    vect[0] = 0.01
                if word in word2id:
                    if full_vocab:
                        print("Word '%s' found twice in embedding file" % word)
                else:
                    if not vect.shape == (_emb_dim_file,):
                        print("Invalid dimension (%i) for word '%s' in line %i." % (vect.shape[0], word, i))
                        continue
                    word2id[word] = len(word2id)
                    vectors.append(vect[None])
            if max_vocab > 0 and len(word2id) >= max_vocab and not full_vocab:
                break

    assert len(word2id) == len(vectors)
    # logger.info("Loaded %i pre-trained word embeddings." % len(vectors))

    # compute new vocabulary / embeddings
    id2word = {v: k for k, v in word2id.items()}
    dico = Dictionary(id2word, word2id, lang)
    embeddings = np.concatenate(vectors, 0)

    assert embeddings.shape == (len(dico), _emb_dim_file)
    return dico, embeddings
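# Usage sketch (assumes a fastText-style .vec file exists at vectors/wiki.en.vec):
#   dico, embeddings = read_txt_embeddings('en')
#   embeddings.shape == (len(dico), 300)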
Example #13
    def test_parser_simple(self):
        dictionary = Dictionary()
        filedir = os.path.dirname(os.path.realpath('__file__'))
        filename = os.path.join(filedir, "tests/dictionary.json")
        dictionary.load_words(filename)

        parser = Parser(dictionary)
        src = "take the torch"
        tree = parser.parse(src)

        self.assertTrue(tree)
        self.assertTrue(isinstance(tree, VerbPhrase))
        self.assertTrue(isinstance(tree.verb.verb.word, str))
        self.assertEqual(tree.verb.verb.word, "take")
        self.assertEqual(tree.noun_phrase.noun.word, "torch")
Example #14
    def test_parser_with_adverb(self):
        dictionary = Dictionary()
        filedir = os.path.dirname(os.path.realpath('__file__'))
        filename = os.path.join(filedir, "tests/dictionary.json")
        dictionary.load_words(filename)

        parser = Parser(dictionary)
        src = "move north"
        tree = parser.parse(src)

        self.assertTrue(tree)
        self.assertTrue(isinstance(tree, VerbPhrase))
        self.assertTrue(isinstance(tree.verb.verb.word, str))
        self.assertEqual(tree.verb.verb.word, "move")
        self.assertEqual(tree.adverb_phrase.adverb.word, "north")
Example #15
    def test_load_words(self):
        dictionary = Dictionary()
        filedir = os.path.dirname(os.path.realpath('__file__'))

        filename = os.path.join(filedir, "tests/dictionary.json")
        try:
            dictionary.load_words(filename)
        except Exception as e:
            self.fail("load_words raised an exception: %s" % e)

        self.assertEqual(len(dictionary.verbs), 3)
        self.assertEqual(len(dictionary.nouns), 3)
        self.assertEqual(len(dictionary.adverbs), 4)
        self.assertEqual(len(dictionary.prepositions), 3)
        self.assertEqual(len(dictionary.adjectives), 1)
        self.assertEqual(len(dictionary.articles), 3)
Example #16
    def test_parser_phrasal_verb_complex(self):
        dictionary = Dictionary()
        filedir = os.path.dirname(os.path.realpath('__file__'))
        filename = os.path.join(filedir, "tests/dictionary.json")
        dictionary.load_words(filename)

        parser = Parser(dictionary)
        src = "pick up the rusty key from the table"
        tree = parser.parse(src)

        self.assertTrue(tree)
        self.assertEqual(tree.verb.verb.word, "pick up")
        self.assertEqual(tree.noun_phrase.modifier.adjective.word, "rusty")
        self.assertEqual(tree.noun_phrase.noun_phrase.noun.word, "key")
        self.assertEqual(tree.prep_phrase.prep.prep.word, "from")
        self.assertEqual(tree.prep_phrase.noun_phrase.noun.word, "table")
Example #17
 def test_attack_when_one_repeated(self):
     self.attacker = DictionaryAttacker(FREQUENCIES, Dictionary(["foo"]))
     self.assertEqual(self.attacker.attack(self.cipher.encrypt("foo foo fo")), "foo foo fo")
Example #18
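# Dictionary attack on an AES-ECB ciphertext: try each dictionary word as the
# key and print any decryption that is valid UTF-8.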
from src.dictionary import Dictionary
from Crypto.Cipher import AES
from Crypto.Util.Padding import pad

d = Dictionary()

message = "e18d1c933da3f3bf1517ec5033fff8f7998d8108d88d04688510f94dde36d5d69f3a3fb92e4515d72204fd420d079156e0434caeee957c9784390c85e4354acb"
ciphertext = bytes.fromhex(message)  # 64 bytes, already a multiple of the AES block size

for word in d.wordarray:
    key = bytearray(word.encode("utf-8"))
    # Pad the candidate key up to the next valid AES key size.
    if len(key) < 16:
        key = pad(key, 16)
    elif 16 < len(key) < 24:
        key = pad(key, 24)
    elif 24 < len(key) < 32:
        key = pad(key, 32)

    # AES only accepts 128-, 192- or 256-bit keys; skip anything else.
    if len(key) not in (16, 24, 32):
        continue

    cipher = AES.new(key, AES.MODE_ECB)

    plain_text = cipher.decrypt(ciphertext)
    try:
        print(plain_text.decode("utf-8"))
    except UnicodeDecodeError:
        continue
Example #19
 def test_contains_when_dictionary_is_empty(self):
     self.assertFalse("baz" in Dictionary([]))
Example #20
 def test_contains_when_dictionary_is_none(self):
     with self.assertRaises(TypeError, msg="Words must be a list."):
         "baz" in Dictionary(None)
Example #21
 def test_merge_when_other_is_empty(self):
     self.assertEqual(set(self.dictionary.merge(Dictionary([])).words),
                      set(self.dictionary.words))
Example #22
 def test_merge_when_this_is_empty(self):
     self.assertEqual(set(Dictionary([]).merge(self.dictionary).words),
                      set(self.dictionary.words))
Example #23
 def setUp(self):
     self.dictionary = Dictionary(["foo", "bar"])
Example #24
    def __init__(
            self,
            checkpoint_path=r"C:\Workplaces\NLP\Project\test\MachineTranslation\outputs\train1",
            dictionary_path='datasets/vi_zh') -> None:
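        """Build tokenizers and the Transformer, then restore the latest checkpoint."""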
        loader = DataLoader()
        content_cn = loader.np_load('lst_cn_all_with6k_except_1001')
        content_vn = loader.np_load('lst_vi_all_with6k_except_1001')
        for i in range(len(content_vn)):
            content_vn[i] = content_vn[i].lower()
        for i in range(len(content_cn)):
            content_cn[i] = self.preproces_cn(content_cn[i])

        from sklearn.model_selection import train_test_split
        X_train, X_test, y_train, y_test = train_test_split(content_cn,
                                                            content_vn,
                                                            test_size=0.2,
                                                            random_state=1)
        X_val, y_val = [X_train[0]], [y_train[0]]

        full_dataset = self.create_dataset(content_cn, content_vn)
        train_examples = self.create_dataset(X_train, y_train)
        test_dataset = self.create_dataset(X_test, y_test)
        val_dataset = self.create_dataset(X_val, y_val)

        self.tokenizer_cn = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
            (en.numpy() for en, _ in full_dataset), target_vocab_size=2**13)

        self.tokenizer_vn = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
            (vn.numpy() for _, vn in full_dataset), target_vocab_size=2**13)

        BUFFER_SIZE = 2000
        BATCH_SIZE = 64
        train_dataset = train_examples.map(self.tf_encode)
        train_dataset = train_dataset.filter(self.filter_max_length)
        train_dataset = train_dataset.cache()
        train_dataset = train_dataset.shuffle(BUFFER_SIZE)
        # BATCH_SIZE is otherwise unused; batching plausibly belongs here (assumption).
        train_dataset = train_dataset.padded_batch(BATCH_SIZE)
        train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)
        val_dataset = val_dataset.map(self.tf_encode)
        val_dataset = val_dataset.filter(self.filter_max_length)
        test_dataset = test_dataset.map(self.tf_encode)
        test_dataset = test_dataset.filter(self.filter_max_length)

        num_layers = 6
        d_model = 256  # model dim
        dff = 512  # feed forward dim
        num_heads = 8  # number of multi head attention d_model%num_heads == 0

        input_vocab_size = self.tokenizer_cn.vocab_size + 2
        target_vocab_size = self.tokenizer_vn.vocab_size + 2
        dropout_rate = 0.1
        learning_rate = CustomSchedule(d_model)
        optimizer = tf.keras.optimizers.Adam(learning_rate,
                                             beta_1=0.9,
                                             beta_2=0.98,
                                             epsilon=1e-9)
        self.dic = Dictionary().create_dict(dictionary_path)
        self.transformer = Transformer(num_layers,
                                       d_model,
                                       num_heads,
                                       dff,
                                       input_vocab_size,
                                       target_vocab_size,
                                       pe_input=input_vocab_size,
                                       pe_target=target_vocab_size,
                                       rate=dropout_rate)

        ckpt = tf.train.Checkpoint(transformer=self.transformer,
                                   optimizer=optimizer)

        ckpt_manager = tf.train.CheckpointManager(ckpt,
                                                  checkpoint_path,
                                                  max_to_keep=5)
        if ckpt_manager.latest_checkpoint:
            ckpt.restore(ckpt_manager.latest_checkpoint).expect_partial()
            print('Latest checkpoint restored!!')
        else:
            raise FileNotFoundError('Checkpoint not found!')
Example #25
 def dictionary(self):
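     """Return a Dictionary built from this object's words."""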
     return Dictionary(self.words)
Example #26
def convert_dic(dic, lang):
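    """Build a Dictionary from entries whose first element is the word."""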
    id2word, word2id = {}, {}
    for i, entry in enumerate(dic):
        word = entry[0]
        id2word[i] = word
        word2id[word] = i
    return Dictionary(id2word, word2id, lang)
Example #27
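# Training-script fragment: load a cached dictionary if present, otherwise
# build one from the raw corpus, then construct the news datasets.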
    logger = create_logger(config.root_path + '/logs/train.log')

    logger.info('Building dictionary')
    # build the dictionary
    if os.path.exists(args.dictionary):
        dictionary = joblib.load(args.dictionary)
    else:
        logger.info("Loading data...")
        data = build_dict_dataset()  # build the dictionary dataset
        # word-level or character-level granularity
        if args.word:
            data = data['raw_words'].values.tolist()
        else:
            data = data['raw_words'].apply(
                lambda x: " ".join("".join(x.split())))
        dictionary = Dictionary()
        dictionary.build_dictionary(data)
        del data
        joblib.dump(dictionary, config.dict_path)

    logger.info('Loading dataset')
    # define the datasets
    train_dataset = NewsDataset(config.train_path,
                                dictionary=dictionary,
                                word=args.word)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=config.batch_size,
                                  collate_fn=collate_fn,
                                  shuffle=True)
    dev_dataset = NewsDataset(config.valid_path,
                              dictionary=dictionary,