Example #1
def make_vocab(filename, vocab_size, ngram_max, pct_bpe, sep, ignore_cols, v):
    '''
    Creates word or byte-pair encoding vocabulary and mappings from a sample of
    text. Because this script will load the entire input text into memory, for
    large corpora it is recommended to use a representative sample of text.

    Vocabulary will be saved in a JSON file with the same base name as the input
    file, suffixed with "_word" or "_bpe" depending on the encoding used.
    '''

    kind = _BPE if pct_bpe else _WORD

    with open(filename, 'r') as f:
        sample = f.readlines()

    new_sep = f' {sep} ' if kind == _BPE else ' '
    sample = ['<s> ' + x.replace(' ', '_').replace(sep, new_sep) + '</s>'
              for i, x in enumerate(sample) if i not in ignore_cols]

    enc = Encoder(vocab_size,
                  pct_bpe=pct_bpe,
                  silent=not v,
                  ngram_max=ngram_max,
                  required_tokens={'<s>', '</s>'},
                  PAD='<pad>',
                  UNK='<unk>')
    enc.fit(sample)
    enc.vocab_size = len(enc.word_vocab) + len(enc.bpe_vocab)
    enc.mute()
    dir_, name = split(filename)
    enc.save(join(dir_, name.split('.')[0] + f'_{kind[1]}.json'))
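
A minimal call sketch for the function above, assuming the surrounding module defines the `_WORD`/`_BPE` constants and imports `split`/`join` from `os.path`; the file name and parameter values are illustrative only.

# Hypothetical invocation: fit a mixed word/BPE vocabulary on a corpus sample.
# Per the docstring, with a non-zero pct_bpe the vocabulary is written next to
# the input file with a "_bpe" suffix.
make_vocab('sample.txt',
           vocab_size=32000,
           ngram_max=4,
           pct_bpe=0.75,        # fraction of the vocabulary reserved for byte pairs
           sep='\t',
           ignore_cols=set(),   # indices skipped when building the sample
           v=True)              # verbose fitting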
Example #2
def fit_encoder(df):

    df = preparedata(df)
    encoder = Encoder(200, pct_bpe=0.88)
    encoder.fit(df["Text"].values)

    return encoder
Example #3
def run_bpe(params):
    bpe_encoder = Encoder(vocab_size=params.vocab_size,
                          pct_bpe=params.pct_bpe,
                          silent=not params.verbose)
    if params.encoder_load_file:
        sys.stdout.write('Using pre-computed BPE encoder\n')
        sys.stdout.flush()
        bpe_encoder = Encoder.load(params.encoder_load_file)
    else:
        sys.stdout.write('Generating new BPE encoder\n')
        sys.stdout.flush()
        text = open(params.source_file).read().split('\n')
        bpe_encoder.fit(text)
        bpe_encoder.save(params.encoder_save_file)
    f_src = open(params.source_file)
    f_dst = open(params.destination_file, 'w')

    for line in tqdm.tqdm(f_src.readlines()):
        line = line.strip()
        tokens = bpe_encoder.tokenize(line)
        encoded_line = ' '.join(tokens).strip()
        if encoded_line.strip() != '':
            f_dst.write(encoded_line + '\n')
    f_src.close()
    f_dst.close()
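
The helper expects an argparse-style object; a hedged sketch using `argparse.Namespace` with illustrative field values (the file paths are hypothetical):

from argparse import Namespace

params = Namespace(vocab_size=8000,
                   pct_bpe=0.9,
                   verbose=True,
                   encoder_load_file=None,            # falsy -> fit and save a new encoder
                   encoder_save_file='encoder.json',
                   source_file='corpus.txt',
                   destination_file='corpus.bpe.txt')
run_bpe(params)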
Example #4
def bpe_encoder_for_lines(cfg: Seq2SeqConfig, lines) -> Encoder:
    """ Calculate BPE encoder for provided lines of text """
    encoder = Encoder(vocab_size=cfg.vocab_size,
                      required_tokens=[
                          cfg.start_token, AT_TOKEN, HASH_TOKEN,
                          SIGNATURE_TOKEN
                      ])
    encoder.fit(lines)
    encoder.save('latest_encoder.json')
    return encoder
Example #5
class BPETokenizer:
    """ Wrapper class for calling the BPE tokenizer in the bpe module
    """
    def set_train(self, data_file):
        """ Set training data for the BPE tokenizer.

        :param data_file: The file with the data.
        """
        with open(data_file) as f:
            self.data = f.readlines()

    def set_data(self, data):
        """ Set the training data via a list of strings.

        :param data: The data matrix.
        """
        self.data = data

    def train_model(self, iterations=1000, pct_bpe=0.9):
        """ Train the BPE model.

        :param iterations: The number of iterations to perform.
        :param pct_bpe: The percentage of splits to perform.
        """
        self.encoder = Encoder(iterations, pct_bpe=pct_bpe)
        self.encoder.fit([x.lower() for x in self.data])

    def tokenize(self, data):
        """ Tokenize new data with a trained model.

        :param data: The list of strings to tokenize.
        """
        return self.encoder.tokenize(data)

    def save_model(self, model_file):
        """ Save the BPE model to a file.

        :param model_file: The file to save the model to.
        """
        logger.info("Saving BPE model to {}".format(model_file))
        import pickle
        with open(model_file, 'wb') as f:
            pickle.dump(self.encoder, f)

    def load_model(self, model_file):
        """ Load the BPE model from a file.

        :param model_file: The file to load the model from.
        """
        logger.info("Loading BPE model from {}".format(model_file))
        import pickle
        with open(model_file, 'rb') as f:
            self.encoder = pickle.load(f)
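
A round-trip sketch of the wrapper above; file names are illustrative. Note that `train_model` lower-cases its training data, so callers may want to lower-case text passed to `tokenize` as well.

tok = BPETokenizer()
tok.set_train('train.txt')                      # or tok.set_data(list_of_strings)
tok.train_model(iterations=5000, pct_bpe=0.9)   # "iterations" is forwarded to Encoder as the vocab size
print(tok.tokenize("an example sentence"))
tok.save_model('bpe_model.pkl')

restored = BPETokenizer()
restored.load_model('bpe_model.pkl')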
Example #6
def main():
	ap = argparse.ArgumentParser()
	ap.add_argument("data", help="Path to data file")
	ap.add_argument("-v", "--vocabulary", help="Path to output vocab file")
	args = ap.parse_args()

	encoder = Encoder(vocab_size=32000, pct_bpe=1.0)

	with open(args.data) as f:
		data = json.load(f)

	data = list(get_data(data))
	data = list(itertools.chain.from_iterable(data))
	encoder.fit(data)
	encoder.save(args.vocabulary)
Example #7
def train_and_test_lr(xtrain, ytrain, xtest, ytest, xdev, ydev):
    encoder = Encoder(1000, pct_bpe=0.88)
    encoder.fit(xtrain)
    xtrain = [' '.join(encoder.tokenize(name)) for name in xtrain]
    xtest = [' '.join(encoder.tokenize(name)) for name in xtest]
    xdev = [' '.join(encoder.tokenize(name)) for name in xdev]

    vectorizer = CountVectorizer(ngram_range=(1, 2), lowercase=False)
    x_train = vectorizer.fit_transform(xtrain)
    x_test = vectorizer.transform(xtest)
    x_dev = vectorizer.transform(xdev)

    maxC = tuning_lr(x_train, ytrain, x_dev, ydev)
    clf = LogisticRegression(C=maxC)
    clf.fit(x_train.toarray(), ytrain)
    pred = clf.predict(x_test.toarray())
    f1score = f1_score(ytest, pred)
    return f1score
Example #8
def train_and_test_nb(xtrain, ytrain, xtest, ytest, xdev, ydev):
    encoder = Encoder(5000, pct_bpe=0.88)
    encoder.fit(xtrain)
    xtrain = [' '.join(encoder.tokenize(name)) for name in xtrain]
    xtest = [' '.join(encoder.tokenize(name)) for name in xtest]
    xdev = [' '.join(encoder.tokenize(name)) for name in xdev]

    vectorizer = CountVectorizer(ngram_range=(1, 2), lowercase=False)
    x_train = vectorizer.fit_transform(xtrain)
    x_test = vectorizer.transform(xtest)
    x_dev = vectorizer.transform(xdev)

    maxAlpha = tuning_nb(x_train, ytrain, x_dev, ydev)
    clf = MultinomialNB(maxAlpha)
    clf.fit(x_train.toarray(), ytrain)
    pred = clf.predict(x_test.toarray())
    f1score = f1_score(ytest, pred)
    return f1score
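
Both helpers follow the same recipe (BPE-retokenize, bag of uni/bi-grams, tune on dev, score on test); a hedged call sketch with hypothetical pre-split lists of strings and matching labels:

# xtrain/xdev/xtest are lists of raw strings; ytrain/ydev/ytest are the matching labels.
lr_f1 = train_and_test_lr(xtrain, ytrain, xtest, ytest, xdev, ydev)
nb_f1 = train_and_test_nb(xtrain, ytrain, xtest, ytest, xdev, ydev)
print(f"LogisticRegression F1: {lr_f1:.3f}, MultinomialNB F1: {nb_f1:.3f}")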
Example #9
def create_vocab_bpe(data_path='data/mt_corpus_ts.txt',
                     vocab_path='data/vocab.txt',
                     vocab_size=25000,
                     simple_subword=False):
    print('start create vocab from:', data_path)
    line_num = 0
    encoder = Encoder(vocab_size,
                      pct_bpe=0.75,
                      ngram_min=1,
                      ngram_max=4,
                      required_tokens=required_tokens,
                      word_tokenizer=WhitespaceTokenizer().tokenize)
    texts = []
    with open(data_path, encoding='utf-8') as fin:
        for line in fin:
            line_num += 1
            if line_num == 1:
                continue
            if line_num % 100000 == 0:
                print(line_num)
            #tuples = line.strip().split('\t')
            #zh = tuples[1]
            texts.append(line)
    encoder.fit(texts)
    bpe_dict = encoder.vocabs_to_dict()
    with open(vocab_path + '.dict', 'w', encoding='utf-8') as fout:
        fout.write(json.dumps(bpe_dict))

    terms = list(encoder.bpe_vocab.keys())
    terms += list(encoder.word_vocab.keys())
    terms = set(terms)
    terms = list(terms)
    vocabs = special_terms + terms
    vocabs_dict = dict()
    for i, term in enumerate(vocabs):
        vocabs_dict[term] = i
    if not simple_subword:
        for i, term in enumerate(vocabs):
            if term not in special_terms and term not in required_tokens:
                vocabs_dict["@@" + term] = i + len(vocabs)
    with open(vocab_path, 'w', encoding='utf-8') as fout:
        fout.write(json.dumps(vocabs_dict, indent=0))
    print('create vocab done. save to: ', vocab_path)
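
An illustrative invocation of the routine above; it writes the raw encoder vocabularies to `<vocab_path>.dict` and the merged term-to-id mapping (special terms first, optionally followed by `@@`-prefixed subword copies) to `vocab_path`. The `special_terms` and `required_tokens` lists are assumed to be defined at module level.

create_vocab_bpe(data_path='data/mt_corpus_ts.txt',
                 vocab_path='data/vocab.txt',
                 vocab_size=25000,
                 simple_subword=False)   # False -> also emit "@@"-prefixed subword entries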
Example #10
def pipe_run(poem_path, save_path, min_count=5, embed_size=512):

    poems = load_poems(poem_path)
    logging.info('read %d poems ' % len(poems))

    logging.info('BPE based vocab mining.....')
    bpe_encoder = Encoder(
        vocab_size=30000, pct_bpe=1.0, ngram_min=2,
        ngram_max=2)  # params chosen for demonstration purposes
    bpe_poems = ["".join(s[:-1]) for p in poems for s in p]
    # print(bpe_poems[:10])
    bpe_encoder.fit(bpe_poems)

    flatten_poems = [[BEGIN_TOKEN] +
                     [SEP if c in {',', '。'} else c for sen in p
                      for c in sen] + [END_TOKEN] for p in poems]

    logging.info('%d training lines' % len(flatten_poems))
    logging.info('training poem embedding model......')

    model = gensim.models.Word2Vec(flatten_poems,
                                   size=embed_size,
                                   min_count=min_count,
                                   workers=4,
                                   window=7,
                                   iter=5,
                                   negative=5)

    logging.info('store poem embedding model.......')
    logging.info('%d vocab is stored', len(model.wv.vocab))
    model.save(os.path.join(save_path, "word2vec.model"))
    logging.info('compute swap embedding...')
    compute_embed_swap(poems, bpe_encoder)

    logging.info('compute the phrase rank in all poems')
    phrase_rank = compute_poem_rank(poems, bpe_encoder)
    # print( [(k, phrase_rank[k])for k in sorted(phrase_rank, key=phrase_rank.get, reverse=True)])
    with open(save_path + 'phrase_rank.json', 'w', encoding='utf-8') as fp:
        json.dump(phrase_rank, fp)
    logging.info('build the vocabulary....')
    token2id = {v: i + 2 for i, v in enumerate(set(model.wv.vocab))}
    token2id[PAD] = 0
    token2id[UNK] = 1
    id2token = {v: k for k, v in token2id.items()}

    with open(save_path + 'token2id.json', 'w', encoding='utf-8') as fp:
        json.dump(token2id, fp)
    with open(save_path + 'id2token.json', 'w', encoding='utf-8') as fp:
        json.dump(id2token, fp)

    logging.info('vocab size is %d ' % len(token2id))
    logging.info('build embedding table......')
    embed_table = np.zeros((len(token2id), embed_size))
    for index, token in id2token.items():

        if token != UNK and token != PAD:
            embed_table[token2id[token]] = model.wv[token]
        else:
            embed_table[token2id[token]] = np.random.uniform(size=embed_size)

    logging.info(embed_table.shape)
    np.save(save_path + 'embed_table.npy', embed_table)
    logging.info('generate train and val and test......')
    dgen = process_utils.DatasetGen(token2id, phrase_rank, bpe_encoder)
    train_dataset, val_dataset, test_dataset = dgen.gen_train_val_test(
        poems, size=8000)
    logging.info('size of training %d' % len(train_dataset))
    logging.info('size of validation %d' % len(val_dataset))
    logging.info('size of test %d' % len(test_dataset))
    with open(save_path + 'train.pkl', 'wb') as fp:
        pickle.dump(train_dataset, fp)
    with open(save_path + 'test.pkl', 'wb') as fp:
        pickle.dump(test_dataset, fp)
    with open(save_path + 'val.pkl', 'wb') as fp:
        pickle.dump(val_dataset, fp)

    return bpe_encoder
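
An illustrative driver call; note that `save_path` is concatenated directly with file names in most places, so a trailing separator is expected (paths are hypothetical).

bpe_encoder = pipe_run('data/poems.txt', 'artifacts/', min_count=5, embed_size=512)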
Example #11
def prepare_data(data_path,
                 freq_dist_path,
                 embedding_path,
                 vocabulary_size=10000,
                 embedding_size=200,
                 predict=False,
                 max_length=None,
                 use_bpe=False):
    max_length_provided = max_length is not None

    separator = ","
    if data_path.endswith("tsv"):
        separator = "\t"

    # construct vocabulary
    vocabulary = None
    if not use_bpe:
        with open(freq_dist_path, "rb") as freq_dist_file:
            freq_dist = pickle.load(freq_dist_file)
        vocabulary = {"<pad>": 0, "<unk>": 1, "<user>": 2, "<url>": 3}
        # Offset word indices past the special tokens so they do not collide
        # with <user> and <url>.
        initial_vocab_length = len(vocabulary)
        most_common = freq_dist.most_common(vocabulary_size - initial_vocab_length)
        vocabulary.update({w[0]: i + initial_vocab_length for i, w in enumerate(most_common)})
        print("Constructed vocabulary of size {}.".format(vocabulary_size))

    # load data and convert it to indices
    data = []
    labels = []
    if not max_length_provided:
        max_length = 0
    with open(data_path, "r") as data_file:
        lines = data_file.readlines()
        for i, line in enumerate(lines):
            if not predict:
                tweet_id, sentiment, tweet = line.split(separator)
            else:
                tweet_id, tweet = line.split(separator)
            data.append(tweet.strip())

            if not predict:
                labels.append(int(sentiment))
    print("Loaded data ({} tweets).".format(len(data)))

    if not use_bpe:
        new_data = []
        for tweet in data:
            words = tweet.split()
            indices = []
            for w_idx, w in enumerate(words):
                if max_length_provided and w_idx == max_length:
                    break

                index = vocabulary.get(w)
                if index is not None:
                    indices.append(index)
                else:
                    indices.append(vocabulary.get("<unk>"))

            if not max_length_provided and len(indices) > max_length:
                max_length = len(indices)

            new_data.append(indices)
        data = new_data

        pad_value = vocabulary.get("<pad>")
    else:
        print("Training BPE encoder...")
        encoder = Encoder(vocab_size=vocabulary_size,
                          required_tokens=["<user>", "<url>"],
                          UNK="<unk>",
                          PAD="<pad>")
        encoder.fit(data)
        vocabulary = encoder.vocabs_to_dict()
        print("Constructed BPE vocabulary of size {}.".format(vocabulary_size))

        new_data = []
        for tweet in data:
            indices = list(next(encoder.transform([tweet])))
            if not max_length_provided and len(indices) > max_length:
                max_length = len(indices)
            new_data.append(indices)
        data = new_data

        pad_value = encoder.word_vocab[encoder.PAD]

    # load embedding vectors
    embedding_vectors = {}
    if not use_bpe:
        with open(embedding_path, "r") as glove_file:
            for i, line in enumerate(glove_file):
                tokens = line.split()
                word = tokens[0]
                if vocabulary.get(word):
                    vector = [float(e) for e in tokens[1:]]
                    embedding_vectors[word] = np.array(vector)
        print("Found {} GLOVE vectors for vocabulary of size {}.".format(
            len(embedding_vectors), len(vocabulary)))
        print(
            "Loaded embedding vectors ({} dimensions).".format(embedding_size))

    # construct embedding matrix
    embedding_matrix = np.random.randn(vocabulary_size, embedding_size) * 0.01
    if not use_bpe:
        for word, i in list(vocabulary.items()):
            embedding_vector = embedding_vectors.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    print("Constructed embedding matrix.")

    # pad data (might want to change max_length to be CLI argument)
    data = pad_sequences(data,
                         maxlen=max_length,
                         padding="post",
                         value=pad_value)
    if not predict:
        labels = np.array(labels)
    print("Padded sequences to length {}.".format(max_length))

    if not predict:
        return vocabulary, data, labels, embedding_matrix
    return vocabulary, data, embedding_matrix
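
A hedged call sketch for the BPE branch, where the frequency-distribution and GLOVE paths are not read and the embedding matrix stays randomly initialised; paths and values are hypothetical.

vocabulary, data, labels, embedding_matrix = prepare_data(
    "tweets.tsv",
    freq_dist_path=None,       # unused when use_bpe=True
    embedding_path=None,       # unused when use_bpe=True
    vocabulary_size=10000,
    embedding_size=200,
    max_length=64,
    use_bpe=True)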
Example #12
def construct_vocabulary(data: Union[str, List[Union[List[str], str]]],
                         vocabulary_size: int = 10000,
                         use_bpe: bool = False,
                         bpe_percentage: float = 0.2,
                         vocabulary_save_file: str = None) -> dict:
    counts = None
    if type(data) == str and ".pkl" in data:
        with open(data, "rb") as f:
            counts = pickle.load(f)
        if type(counts) != nltk.FreqDist:
            logger.info("Loaded vocabulary from file.")
            return counts
        elif use_bpe:
            logger.error("Cannot construct BPE vocabulary from frequency distribution file.")
            raise ValueError("Cannot construct BPE vocabulary from frequency distribution file.")
        else:
            logger.info("Constructing vocabulary from frequency distribution file.")
    elif not use_bpe:
        logger.info("Constructing vocabulary from data.")

        if type(data) == str:
            separator = ","
            if data.endswith("tsv"):
                separator = "\t"

            # load data from file
            new_data = []
            with open(data, "r") as data_file:
                lines = data_file.readlines()
                for i, line in enumerate(lines):
                    _, _, tweet = line.split(separator)
                    new_data.append(TOKENIZER.tokenize(tweet))
            data = new_data
        elif type(data[0]) != list:
            data = [TOKENIZER.tokenize(t) for t in data]

        all_words = []
        for tweet in data:
            all_words.extend(tweet)

        counts = nltk.FreqDist(all_words)

    if use_bpe:
        logger.info("Training BPE encoder...")
        encoder = Encoder(vocab_size=vocabulary_size,
                          pct_bpe=bpe_percentage,
                          word_tokenizer=lambda x: TOKENIZER.tokenize(x),
                          required_tokens=["<start>", "<extract>", "<user>", "<url>"],
                          UNK="<unk>", PAD="<pad>")
        encoder.fit(data)
        vocabulary = encoder.vocabs_to_dict()
        logger.info("Constructed BPE vocabulary of size {}.".format(vocabulary_size))
    else:
        vocabulary = {"<pad>": 0, "<unk>": 1, "<start>": 2, "<extract>": 3}
        initial_vocab_length = len(vocabulary)
        most_common = counts.most_common(vocabulary_size - initial_vocab_length)
        vocabulary.update({w[0]: i + initial_vocab_length for i, w in enumerate(most_common)})
        logger.info("Constructed embedding vocabulary of size {}.".format(len(vocabulary)))

    if vocabulary_save_file:
        if not vocabulary_save_file.endswith(".pkl"):
            vocabulary_save_file += ".pkl"
        with open(vocabulary_save_file, "wb") as f:
            pickle.dump(vocabulary, f)
        logger.info("Saved vocabulary to \"{}\".".format(vocabulary_save_file))

    return vocabulary
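
A sketch of the two main input forms accepted above: a list of raw strings tokenised with the module's `TOKENIZER`, and the same list fed through a BPE encoder; the tweets and file name are illustrative.

tweets = ["some tweet mentioning <user>", "another tweet with a <url>"]

word_vocab = construct_vocabulary(tweets, vocabulary_size=5000)
bpe_vocab = construct_vocabulary(tweets,
                                 vocabulary_size=5000,
                                 use_bpe=True,
                                 bpe_percentage=0.5,
                                 vocabulary_save_file="vocab.pkl")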
Example #13
from bpe import Encoder
from tqdm import tqdm

sequences = []
with open("../../datasets/multi30k/train.en") as f:
    for line in f:
        sequences.append(line.strip())

ref = [x.split() for x in sequences]
ref_len = [len(x) for x in ref]
print("REF:", max(ref_len))

def parse(x):
    return x.split()

enc = Encoder(4096, ngram_min=1, ngram_max=2, pct_bpe=0.8, silent=True, word_tokenizer=parse)
enc.fit(sequences)

base = enc.vocabs_to_dict()
duplicate_keys = []
for key in base['byte_pairs']:
    if key in base['words']:
        duplicate_keys.append(key)
if len(duplicate_keys) > 0:
    print("got duplicates:")
    print(duplicate_keys)
else:
    print("NO DUPLICATES! :)")

keybase = {**base['words'], **base['byte_pairs']}

Example #14
from bpe import Encoder

# Generated with http://pythonpsum.com
test_corpus = '''
    Object raspberrypi functools dict kwargs. Gevent raspberrypi functools. Dunder raspberrypi decorator dict didn't lambda zip import pyramid, she lambda iterate?
    Kwargs raspberrypi diversity unit object gevent. Import fall integration decorator unit django yield functools twisted. Dunder integration decorator he she future. Python raspberrypi community pypy. Kwargs integration beautiful test reduce gil python closure. Gevent he integration generator fall test kwargs raise didn't visor he itertools...
    Reduce integration coroutine bdfl he python. Cython didn't integration while beautiful list python didn't nit!
    Object fall diversity 2to3 dunder script. Python fall for: integration exception dict kwargs dunder pycon. Import raspberrypi beautiful test import six web. Future integration mercurial self script web. Return raspberrypi community test she stable.
    Django raspberrypi mercurial unit import yield raspberrypi visual rocksdahouse. Dunder raspberrypi mercurial list reduce class test scipy helmet zip?
'''

encoder = Encoder(200,
                  pct_bpe=0.88)  # params chosen for demonstration purposes
encoder.fit(test_corpus.split('\n'))

example = "Vizzini: He didn't fall? INCONCEIVABLE!"
print(encoder.tokenize(example))
# ['__sow', 'vi', 'z', 'zi', 'ni', '__eow', '__sow', ':', '__eow', 'he', 'didn', "'", 't', 'fall', '__sow', '?', '__eow', '__sow', 'in', 'co', 'n', 'ce', 'iv', 'ab', 'le', '__eow', '__sow', '!', '__eow']
print(next(encoder.transform([example])))
# [26, 108, 79, 104, 72, 24, 26, 117, 24, 9, 11, 8, 12, 10, 26, 90, 24, 26, 154, 56, 37, 149, 80, 169, 84, 24, 26, 156, 24]
print(next(encoder.inverse_transform(encoder.transform([example]))))
# vizzini : he didn ' t fall ? inconceivable !