Example #1
    def __init__(self,
                 path,
                 vocab_path=None,
                 batch_size=1,
                 shuffle=False,
                 pin_memory=False,
                 update_vocab=False,
                 min_freq=1,
                 concat=False,
                 bptt=35):
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.pin_memory = pin_memory
        self.base_path = path
        self.update_vocab = update_vocab
        self.bptt = bptt
        self.concat = concat

        self.vocab = get_vocab(path, ['train.txt'],
                               min_freq=min_freq,
                               vocab_file=vocab_path)
        if self.concat:
            # set the frequencies for special tokens by trial and error
            self.vocab.idx2count[1] = self.vocab.freqs[BOS]  # <s>
            self.vocab.idx2count[2] = 0  # </s>

        self.train = self.get_dataloader('train.txt', self.batch_size)
        self.valid = self.get_dataloader('valid.txt', 1)
        self.test = self.get_dataloader('test.txt', 1)
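The get_vocab(path, files, min_freq=..., vocab_file=...) helper used above is project-specific and not shown. As a rough illustration only, a frequency-based vocabulary builder with that shape might look like the sketch below; the function name, the special-token list, and the counting logic are assumptions, not the project's code.

from collections import Counter
import os

def build_vocab_sketch(path, files, min_freq=1, specials=("<unk>", "<s>", "</s>")):
    # Count token frequencies over the listed corpus files.
    freqs = Counter()
    for name in files:
        with open(os.path.join(path, name), encoding="utf-8") as f:
            for line in f:
                freqs.update(line.split())
    # Keep tokens at or above min_freq, after the reserved special tokens.
    itos = list(specials) + [tok for tok, n in freqs.most_common() if n >= min_freq]
    stoi = {tok: i for i, tok in enumerate(itos)}
    return itos, stoi, freqs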
Example #2
def _gen_embedding(ndim, alignment=False):
    print "Generating %d-dim word embedding ..." %ndim
    int2ch, ch2int = get_vocab()
    ch_lists = []
    quatrains = get_quatrains()
    for idx, poem in enumerate(quatrains):
        for sentence in poem['sentences']:
            ch_lists.append(filter(lambda ch: ch in ch2int, sentence))
        if alignment:
            # the i-th characters in the poem, used to boost Dui Zhang
            i_characters = [[sentence[j] for sentence in poem['sentences']] for j in range(len(poem['sentences'][0]))]
            for characters in i_characters:
                ch_lists.append(filter(lambda ch: ch in ch2int, characters))
        if 0 == (idx+1)%10000:
            print "[Word2Vec] %d/%d poems have been processed." %(idx+1, len(quatrains))
    print "Hold on. This may take some time ..."
    model = models.Word2Vec(ch_lists, size = ndim, min_count = 5)
    embedding = uniform(-1.0, 1.0, [VOCAB_SIZE, ndim])
    for idx, ch in enumerate(int2ch):
        if ch in model.wv:
            embedding[idx,:] = model.wv[ch]
    if alignment:
        model.save(_w2v_with_alignment_model_path)
        print "Word2Vec model is saved."
        np.save(_w2v_with_alignment_path, embedding)
        print "Word embedding is saved."
    else:
        model.save(_w2v_model_path)
        print "Word2Vec model is saved."
        np.save(_w2v_path, embedding)
        print "Word embedding is saved."
Example #3
    def __init__(self, path: str, vocab_path: str) -> None:
        self.model = kenlm.Model(path)

        def probability_function(tokens: List[str]) -> float:
            return self.model.score(" ".join(tokens))

        super().__init__(self.model,
                         probability_function=probability_function,
                         vocab=get_vocab(vocab_path))
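For context, the kenlm model wrapped above can also be exercised directly. A minimal standalone check of the probability function might look like this; the model path is a placeholder for a trained ARPA or binary LM.

import kenlm

model = kenlm.Model("lm.arpa")  # placeholder path
tokens = ["this", "is", "a", "test"]
# score() returns the log10 probability of the whole token sequence.
print(model.score(" ".join(tokens)))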
Example #4
def get_quatrains():
    _, ch2int = get_vocab()
    def quatrain_filter(poem):
        if not is_quatrain(poem):
            return False
        else:
            for sentence in poem['sentences']:
                for ch in sentence:
                    if ch not in ch2int:
                        return False
            return True
    return list(filter(quatrain_filter, get_all_corpus()))
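is_quatrain is not shown in this example. Assuming the usual definition of a jueju (four lines, each of five or seven characters), a plausible sketch of such a check could be the following; the function name and the exact rule are assumptions.

def is_quatrain_sketch(poem):
    # A jueju has exactly four lines of equal length, either 5 or 7 characters each.
    sentences = poem['sentences']
    return (len(sentences) == 4
            and len(sentences[0]) in (5, 7)
            and all(len(s) == len(sentences[0]) for s in sentences))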
Example #5
def get_quatrains():  # return quatrains in which every character is in the vocabulary ch2int
    _, ch2int = get_vocab()
    def quatrain_filter(poem):
        if not is_quatrain(poem):
            return False
        else:
            for sentence in poem['sentences']:
                for ch in sentence:
                    if ch not in ch2int:
                        return False
            return True
    return filter(quatrain_filter, get_all_corpus())  # get_all_corpus() returns the poem records from all corpus files; each record holds a poem's title, author, dynasty, and sentences
Example #6
def get_deck(jlpt_level):
    vocab = get_vocab(jlpt_level)
    deck = genanki.Deck(DECK_BASE_ID + jlpt_level, 'JLPT Vocab::N{}'.format(jlpt_level))
    media = []
    for v in vocab:
        if v.path is not None:
            media.append(v.path)
            audio = '[sound:{}]'.format(v.path)
        else:
            audio = ''

        note = KanjiNote(
            model=VOCAB_MODEL,
            fields=[str(v.id), v.kana, v.kanji, ', '.join(v.pos), v.defn, audio]
        )

        deck.add_note(note)
    return deck, media
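The deck and media list returned above are typically written out with genanki's Package. A minimal usage sketch, with the JLPT level and output filename as placeholders:

import genanki

deck, media = get_deck(5)              # e.g. JLPT N5
package = genanki.Package(deck)
package.media_files = media            # attach the collected audio files
package.write_to_file("jlpt_n5.apkg")  # placeholder output filename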
Example #7
def _gen_embedding(ndim):
    print "Generating %d-dim word embedding ..." % ndim
    int2ch, ch2int = get_vocab()
    ch_lists = []
    quatrains = get_quatrains()
    for idx, poem in enumerate(quatrains):
        for sentence in poem['sentences']:
            ch_lists.append(filter(lambda ch: ch in ch2int, sentence))
        if 0 == (idx + 1) % 10000:
            print "[Word2Vec] %d/%d poems have been processed." % (
                idx + 1, len(quatrains))
    print "Hold on. This may take some time ..."
    model = models.Word2Vec(ch_lists, size=ndim, min_count=5)
    embedding = uniform(-1.0, 1.0, [VOCAB_SIZE, ndim])
    for idx, ch in enumerate(int2ch):
        if ch in model.wv:
            embedding[idx, :] = model.wv[ch]
    np.save(_w2v_path, embedding)
    print "Word embedding is saved."
Example #8
def _gen_embedding(ndim):  # generate ndim-dimensional character embeddings
    print "Generating %d-dim word embedding ..." % ndim
    int2ch, ch2int = get_vocab()  # load the vocabulary
    ch_lists = []
    quatrains = get_quatrains()  # get all quatrains that pass the filtering rules
    for idx, poem in enumerate(quatrains):  # for each quatrain
        for sentence in poem['sentences']:  # for each sentence in the poem
            ch_lists.append(filter(lambda ch: ch in ch2int,
                                   sentence))  # keep only the characters present in ch2int
        if 0 == (idx + 1) % 10000:
            print "[Word2Vec] %d/%d poems have been processed." % (
                idx + 1, len(quatrains))
    print "Hold on. This may take some time ..."
    model = models.Word2Vec(ch_lists, size=ndim,
                            min_count=5)  # ch_lists is the training corpus; ndim is the embedding dimension
    embedding = uniform(-1.0, 1.0,
                        [VOCAB_SIZE, ndim])  # uniform random matrix; each row is an ndim-dimensional character vector
    for idx, ch in enumerate(int2ch):
        if ch in model.wv:  # if this character from int2ch has a vector in the trained model
            embedding[idx, :] = model.wv[ch]  # store the vector for ch in the corresponding row
    np.save(_w2v_path, embedding)
    print "Word embedding is saved."
Example #9
    def __init__(self,
                 path,
                 vocab_path=None,
                 batch_size=1,
                 shuffle=False,
                 pin_memory=False,
                 update_vocab=False,
                 min_freq=1,
                 concat=False,
                 bptt=35):
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.pin_memory = pin_memory
        self.base_path = path
        self.update_vocab = update_vocab
        self.bptt = bptt
        self.concat = concat

        self.vocab = get_vocab(path, ['train.txt'],
                               min_freq=min_freq,
                               vocab_file=vocab_path)
        self.train = self.get_dataloader('train.txt', self.batch_size)
        self.valid = self.get_dataloader('valid.txt', 1)
        self.test = self.get_dataloader('test.txt', 1)
Example #10
    def __init__(self):
        self.int2ch, self.ch2int = get_vocab(if_segment)
Example #11
        
        TRAIN ACC:  90.906 VALID ACC:  91.181  LOSS:  0.02215
    """
    s = "TRAIN ACC: {: 3.3f} VALID ACC: {: 3.3f}  LOSS: {: 3.5f}"
    print(s.format(100 * train_acc, 100 * valid_acc, loss))


################################################################################
#                                                                           DATA
################################################################################
hyper = load_hyper_params(HYPERPARAMS_FILE)

# LOAD VOCAB
# TODO: make vocab files contain FULL vocab from imdb
#       And make get_vocab() load only the first MAX_VOCAB words
id2word, word2id = get_vocab(VOCAB_FILE, DATA_DIR, hyper["MAX_VOCAB"])
n_words = len(id2word)

# CLASS MAPPINGS
id2class = ["neg", "pos"]
class2id = {label: id for id, label in enumerate(id2class)}

# LOAD DATA
data = get_data(DATA_DIR, CACHED_DATA, vocab_file=VOCAB_FILE)
limit_data_vocab(data, n=hyper["MAX_VOCAB"], unknown_id=1)
n_samples = len(data["xtrain"])

################################################################################
#                                                                          MODEL
################################################################################
model = Model(n_vocab=n_words,
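The snippet cuts off at the Model(...) call. limit_data_vocab is also project-specific; from the call site it appears to clip token ids above MAX_VOCAB to the unknown id. A rough sketch under that assumption (the split keys other than "xtrain" are guesses):

def limit_data_vocab_sketch(data, n, unknown_id=1):
    # Map any token id >= n to unknown_id, in place.
    for split in ("xtrain", "xvalid", "xtest"):
        if split in data:
            data[split] = [[tok if tok < n else unknown_id for tok in seq]
                           for seq in data[split]]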
Example #12
    parser.add_argument('--batch_size',
                        help="Enter the batch size",
                        type=int,
                        default=64)
    parser.add_argument('--epochs',
                        help="Enter the number of epochs",
                        type=int,
                        default=2)
    args = parser.parse_args()
    # path = 'data/eng-fra.txt'

    train, val, test = data_preprocess.split(args.path)
    eng_lm = spacy.load('en')
    fre_lm = spacy.load('fr')

    w2i_eng_train, _, w2i_fre_train, _ = vocab.get_vocab(train, eng_lm, fre_lm)
    w2i_eng_val, i2w_eng_val, w2i_fre_val, i2w_fre_val = vocab.get_vocab(
        val, eng_lm, fre_lm)
    w2i_eng_test, i2w_eng_test, w2i_fre_test, i2w_fre_test = vocab.get_vocab(
        test, eng_lm, fre_lm)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    inp_vocab_dim = len(w2i_eng_train)
    label_vocab_dim = len(w2i_fre_train)

    m = model.enc_dec_attn(args.enc_hid, args.dec_hid, args.emb_dim,
                           args.drop_prob, device, inp_vocab_dim,
                           label_vocab_dim)
    # print(m)
    # print(f'The model has {model.count_parameters(m):,} trainable parameters')
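Note that the spacy.load('en') and spacy.load('fr') shortcuts in this snippet only work on older spaCy releases with linked shortcut names; on spaCy 3.x the full package names are loaded instead, assuming the small English and French models are installed:

import spacy

# spaCy 3.x removed shortcut links such as 'en'; load full package names instead
# (install beforehand, e.g. python -m spacy download en_core_web_sm)
eng_lm = spacy.load("en_core_web_sm")
fre_lm = spacy.load("fr_core_news_sm")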