from gensim.corpora import Dictionary


def build_dict_by_gensim(docs, special_tokens=None):
    if special_tokens is None:
        special_tokens = {}
    dct = Dictionary(docs)
    # keep tokens appearing in at least 3 documents; no upper-frequency or size cap
    dct.filter_extremes(no_below=3, no_above=1.0, keep_n=None)
    # force the given special tokens onto fixed ids (conflicting ids are remapped)
    dct.patch_with_special_tokens(special_tokens)
    return dct
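A minimal usage sketch (the toy documents and special-token ids below are illustrative, not part of the original snippet):

docs = [["the", "cat", "sat"], ["the", "cat", "ran"], ["the", "cat", "slept"]]
dct = build_dict_by_gensim(docs, special_tokens={"<pad>": 0, "<unk>": 1})
print(dct.token2id)  # '<pad>' -> 0, '<unk>' -> 1; 'the' and 'cat' survive the no_below=3 filter
# unseen tokens map to the '<unk>' id
print(dct.doc2idx(["the", "dog"], unknown_word_index=dct.token2id["<unk>"]))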
Example No. 2
from gensim.corpora import Dictionary


def build_dictionary():
    letters = []

    with open('data/unicodes.txt') as f:
        for line in f:
            # the second whitespace-separated column holds the character
            c = line.split()[1]
            letters.append(c)

    special_tokens = {
        '<pad>': 0,
        '<sos>': 1,
        '<eos>': 2,
        '<ctx>': 3,
        '<unk>': 4,
        '<sep>': 5
    }
    # special_tokens = {'<pad>': 0, '<sos>': 1, '<eos>': 2, '<sep>': 3, '<unk>': 4}

    dictionary = Dictionary([letters])  # initialize a Dictionary from one document of characters
    dictionary.patch_with_special_tokens(special_tokens)

    return dictionary
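A brief usage sketch for the patched dictionary (the sample character sequence is illustrative; 'data/unicodes.txt' must exist as in the original):

dictionary = build_dictionary()
# characters not listed in unicodes.txt fall back to the '<unk>' id (4)
ids = dictionary.doc2idx(list("abc"), unknown_word_index=dictionary.token2id['<unk>'])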
Example No. 3
    def test_patch_with_special_tokens(self):
        special_tokens = {'pad': 0, 'space': 1, 'quake': 3}
        corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
        d = Dictionary(corpus)
        self.assertEqual(len(d.token2id), 5)
        d.patch_with_special_tokens(special_tokens)
        self.assertEqual(d.token2id['pad'], 0)
        self.assertEqual(d.token2id['space'], 1)
        self.assertEqual(d.token2id['quake'], 3)
        self.assertEqual(len(d.token2id), 8)
        self.assertNotIn((0, 1), d.doc2bow(corpus[0]))
        self.assertIn((0, 1), d.doc2bow(['pad'] + corpus[0]))
        corpus_with_special_tokens = [["máma", "mele", "maso"], ["ema", "má", "máma", "space"]]
        d = Dictionary(corpus_with_special_tokens)
        self.assertEqual(len(d.token2id), 6)
        self.assertNotEqual(d.token2id['space'], 1)
        d.patch_with_special_tokens(special_tokens)
        self.assertEqual(len(d.token2id), 8)
        self.assertEqual(max(d.token2id.values()), 7)
        self.assertEqual(d.token2id['space'], 1)
        self.assertNotIn((1, 1), d.doc2bow(corpus_with_special_tokens[0]))
        self.assertIn((1, 1), d.doc2bow(corpus_with_special_tokens[1]))
Example No. 4
            for word_document in tqdm(word_documents):
                # split each word into subwords and flatten back into one document
                sws = [sp.tokenize(word) for word in word_document]
                sw_documents.append(list(chain.from_iterable(sws)))
            sw_documents_dict[sp_key] = sw_documents

            if os.path.exists(config_dic.get(
                    "cache_dir")) and not os.path.exists(sp_cache_path):
                print(f"Write Cache data. {sp_cache_path}")
                with open(sp_cache_path, "wb") as f:
                    f.write(cloudpickle.dumps(sw_documents))

    print("=========== Build vocabulary ===========")
    special_token_dict = {PADDING: 0, UNKNOWN: 1, START: 2, END: 3}
    word_dic = Dictionary(word_documents)
    word_dic.filter_extremes(no_below=5, no_above=1.0, keep_n=None)
    word_dic.patch_with_special_tokens(special_token_dict)

    sw_dicts = {}
    for sp_key, sw_documents in sw_documents_dict.items():
        sw_dic = Dictionary(sw_documents)
        sw_dic.filter_extremes(no_below=5, no_above=1.0, keep_n=None)
        sw_dic.patch_with_special_tokens(special_token_dict)
        sw_dicts[sp_key] = sw_dic

    # char_dic = Dictionary([[char for word in word_document for char in word] for word_document in word_documents])
    # char_dic.filter_extremes(no_below=5, no_above=1.0, keep_n=None)
    # char_dic.patch_with_special_tokens(special_token_dict)

    print("============= Save Vocabulary ================")
    word_dic.save(
        os.path.join(config_dic.get("vocab_dir"), f"{args.config}.word.dic"))
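Once patched, the dictionaries keep the reserved ids, so documents can be converted to index sequences with out-of-vocabulary words sent to UNKNOWN; a small sketch, assuming the word_dic and word_documents built above:

# words dropped by filter_extremes (or never seen) map to the UNKNOWN id (1)
ids = word_dic.doc2idx(word_documents[0], unknown_word_index=word_dic.token2id[UNKNOWN])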
Example No. 5
            sw_dicts[sp_key] = Dictionary.load(
                os.path.join(config_dic.get("vocab_dir"),
                             f"{args.config}.{sp_key}.dic"))
    else:
        special_token_dict = {PADDING: 0, UNKNOWN: 1, START: 2, END: 3}
        word_dic = Dictionary()
        # copy the mapping so each Dictionary gets its own token2id dict
        word_dic.token2id = dict(special_token_dict)
        #char_dic = Dictionary()
        #char_dic.token2id = dict(special_token_dict)
        sw_dicts = {}
        for sp_key, sp in sps.items():
            _dic = Dictionary()
            _dic.token2id = dict(special_token_dict)
            sw_dicts[sp_key] = _dic
    label_dic = Dictionary(train_label_documents)
    label_dic.patch_with_special_tokens({PADDING: 0})
    # rebuild the reverse id -> label mapping after patching
    label_dic.id2token = {
        _id: label
        for label, _id in label_dic.token2id.items()
    }

    # add vocabulary
    word_dic.add_documents(train_word_documents)
    #char_dic.add_documents(list(chain.from_iterable(train_char_documents)))
    for sp_key, train_sw_documents in train_sw_documents_dicts.items():
        sw_dicts[sp_key].add_documents(train_sw_documents)

    # load GloVe
    if config_dic.get("glove_path"):
        print("========= Load Pretrain Word Embeddings ==========")
        word2vec = load_pretrain_embeddings(
Example No. 6
class Dataset(object):
    '''
    Create dataset for training supervised model
    '''
    def __init__(self, config):
        self.config = config
        self.train_data = None
        self.test_data = None
        self.val_data = None
        self.vocab = None
        self.word_embeddings = None

    def get_pandas_df(self, filename):
        '''
        Load the data into a pandas.DataFrame object; each line is expected
        to be "<label> <text>". This will be used to convert the data into
        a torchtext object.
        '''
        with open(filename, 'r', encoding='utf-8') as datafile:
            data = [line.strip().split(' ', maxsplit=1) for line in datafile]
            data_text = [x[1] for x in data]
            data_label = [x[0] for x in data]

        full_df = pd.DataFrame({"text": data_text, "label": data_label})
        return full_df

    def load_data(self,
                  train_file,
                  test_file,
                  dataname,
                  embed_file=None,
                  val_file=None):
        '''
        Loads the data from files.
        Sets up iterators for training, validation and test data.
        Also creates the vocabulary and word embeddings based on the data.

        Inputs:
            train_file (String): absolute path to training file
            test_file (String): absolute path to test file
            dataname (String): name used to derive the cached vocabulary and embedding file names
            embed_file (String, optional): absolute path to file containing word embeddings (GloVe/Word2Vec)
            val_file (String, optional): absolute path to validation file; if omitted, 20% of the training set is held out for validation
        '''
        # cache file names for the vocabulary and the prepared embedding matrix
        voc_file = dataname + '_vocab.txt'
        new_embed = dataname + '_embed.pkl'
        train_X, train_Y = read_labeled(train_file)
        test_X, test_Y = read_labeled(test_file)
        val_X = None
        val_Y = None
        if val_file:
            val_X, val_Y = read_labeled(val_file)
        else:
            sp = int(len(train_X) * 0.8)
            train_X, val_X = (train_X[:sp], train_X[sp:])
            train_Y, val_Y = (train_Y[:sp], train_Y[sp:])
        train_X = [doc_padding(x, self.config.max_sen_len) for x in train_X]
        test_X = [doc_padding(x, self.config.max_sen_len) for x in test_X]
        val_X = [doc_padding(x, self.config.max_sen_len) for x in val_X]

        # build (or load) the vocabulary
        if os.path.isfile(voc_file):
            self.vocab = Dictionary.load_from_text(voc_file)
        else:
            self.vocab = Dictionary(train_X)
            special_tokens = {'<pad>': 0, '<unk>': 1}
            self.vocab.patch_with_special_tokens(special_tokens)
            self.vocab.save_as_text(voc_file)
        # transform words to indices; out-of-vocabulary words map to '<unk>' (id 1)
        train_X = [self.vocab.doc2idx(x, 1) for x in train_X]
        test_X = [self.vocab.doc2idx(x, 1) for x in test_X]
        val_X = [self.vocab.doc2idx(x, 1) for x in val_X]
        # load the cached embedding matrix, or build it from the pretrained vectors
        if os.path.isfile(new_embed):
            self.word_embeddings = torch.load(new_embed)
        else:
            embeds = Vectors(embed_file,
                             unk_init=lambda x: torch.Tensor(
                                 np.random.normal(scale=0.6, size=x.size())))
            self.word_embeddings = weight_matrix(self.vocab, embeds)
            torch.save(self.word_embeddings, new_embed)
        self.train_data = (train_X, train_Y)
        self.test_data = (test_X, test_Y)
        self.val_data = (val_X, val_Y)

        print("Loaded {} training examples".format(len(train_X)))
        print("Loaded {} test examples".format(len(test_X)))
        print("Loaded {} validation examples".format(len(val_X)))

    def train_iterator(self):
        return batch_iter(*self.train_data, self.config.batch_size)

    def test_iterator(self):
        return batch_iter(*self.test_data, self.config.batch_size, False)

    def val_iterator(self):
        return batch_iter(*self.val_data, self.config.batch_size, False)
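A rough usage sketch for the class above (the Config object, file names, and embedding path are illustrative assumptions; batch_iter is expected to yield (inputs, labels) batches):

config = Config(max_sen_len=100, batch_size=32)  # hypothetical config exposing max_sen_len and batch_size
dataset = Dataset(config)
dataset.load_data('train.txt', 'test.txt', 'mydata', embed_file='glove.6B.300d.txt')
for batch_x, batch_y in dataset.train_iterator():
    pass  # feed padded index sequences and labels to the model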