Example #1
def main(_):
	tf.logging.set_verbosity(tf.logging.INFO)

	output_files = FLAGS.output_file.split(",")
	writers = [tf.python_io.TFRecordWriter(out) for out in output_files]

	rng = random.Random(FLAGS.random_seed)

	tokenizer = tokenization.WordpieceTokenizer(
			vocab=tokenization.load_vocab(FLAGS.vocab_file))

	estimator = get_embedding_estimator()

	sample = get_sample(FLAGS.input_sentence_file, FLAGS.input_mapping_file,
		rng, FLAGS.sample_size)
	batches = list(range(0, len(sample), 3000)) + [len(sample)]

	for brange in zip(batches, batches[1:]):
		batch_sample = sample[brange[0]:brange[1]]

		instances = create_training_instances(
			FLAGS.input_sentence_file, FLAGS.input_mapping_file, tokenizer,
			FLAGS.max_seq_length, rng, FLAGS.do_lower_case, batch_sample)

		write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
																		writers, estimator)

	for writer in writers:
		writer.close()
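The main() above reads everything from FLAGS, which is defined outside this excerpt (the excerpt also relies on random, the project's tokenization module, and helpers such as get_embedding_estimator, get_sample, create_training_instances and write_instance_to_example_files). A minimal sketch of the flag definitions it appears to depend on, using TF 1.x's tf.flags; the flag names come from the code above, but every type, default, and help string here is an assumption:

import tensorflow as tf

flags = tf.flags
FLAGS = flags.FLAGS

# Flag names taken from the excerpt; types and defaults are guesses.
flags.DEFINE_string("output_file", None, "Comma-separated list of output TFRecord files.")
flags.DEFINE_string("vocab_file", None, "WordPiece vocabulary file.")
flags.DEFINE_string("input_sentence_file", None, "Input file with one sentence per line.")
flags.DEFINE_string("input_mapping_file", None, "File mapping sentences to their aligned pairs.")
flags.DEFINE_integer("random_seed", 12345, "Seed for the sampling RNG.")
flags.DEFINE_integer("sample_size", 50000, "Number of sentences to sample.")
flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length in WordPiece tokens.")
flags.DEFINE_bool("do_lower_case", False, "Whether to lowercase the input text.")

if __name__ == "__main__":
    tf.app.run()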
Example #2
    def test_wordpiece_tokenizer(self):
        vocab_tokens = [
            "[UNK]",
            "[CLS]",
            "[SEP]",
            "want",
            "##want",
            "##ed",
            "wa",
            "un",
            "runn",
            "##ing",
        ]

        vocab = {}
        for (i, token) in enumerate(vocab_tokens):
            vocab[token] = i
        tokenizer = tokenization.WordpieceTokenizer(vocab=vocab)

        self.assertAllEqual(tokenizer.tokenize(""), [])

        self.assertAllEqual(
            tokenizer.tokenize("unwanted running"),
            ["un", "##want", "##ed", "runn", "##ing"],
        )

        self.assertAllEqual(tokenizer.tokenize("unwantedX running"),
                            ["[UNK]", "runn", "##ing"])
Example #3
    def test_wordpiece_tokenizer(self):
        vocab_tokens = [
            "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un",
            "runn", "##ing"
        ]
        vocab = {token: i for i, token in enumerate(vocab_tokens)}
        tokenizer = tokenization.WordpieceTokenizer(vocab=vocab)
        self.assertAllEqual(tokenizer.tokenize(""), [])
        self.assertAllEqual(tokenizer.tokenize("unwanted running"),
                            ["un", "##want", "##ed", "runn", "##ing"])
        self.assertAllEqual(tokenizer.tokenize("unwantedX running"),
                            ["[UNK]", "runn", "##ing"])
        print('test_wordpiece_tokenizer',
              tokenizer.tokenize("unwa wanted warunn runned wawant want"))
Example #4
    def test_wordpiece_tokenizer(self):
        vocab_tokens = [
            '[UNK]', '[CLS]', '[SEP]', 'want', '##want', '##ed', 'wa', 'un',
            'runn', '##ing'
        ]

        vocab = {}
        for (i, token) in enumerate(vocab_tokens):
            vocab[token] = i
        tokenizer = tokenization.WordpieceTokenizer(vocab=vocab)

        self.assertAllEqual(tokenizer.tokenize(''), [])

        self.assertAllEqual(tokenizer.tokenize('unwanted running'),
                            ['un', '##want', '##ed', 'runn', '##ing'])

        self.assertAllEqual(tokenizer.tokenize('unwantedX running'),
                            ['[UNK]', 'runn', '##ing'])
Example #5
    def __init__(self, vocab_file, do_lower_case=False):
        self.vocab = tokenization.load_vocab(vocab_file)
        self.inv_vocab = {v: k for k, v in self.vocab.items()}
        self.wordpiece_tokenizer = tokenization.WordpieceTokenizer(
            vocab=self.vocab)
        self.do_lower_case = do_lower_case
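Example #5 only shows the constructor of a thin wrapper around BERT's tokenization module. One plausible companion method is sketched below as an assumption; encode is not part of the original class, while load_vocab, WordpieceTokenizer and convert_tokens_to_ids are the real helpers from tokenization.py:

    def encode(self, text):
        # Hypothetical helper: optionally lowercase, wordpiece-tokenize,
        # then map the tokens to vocabulary ids.
        if self.do_lower_case:
            text = text.lower()
        tokens = self.wordpiece_tokenizer.tokenize(text)
        return tokenization.convert_tokens_to_ids(self.vocab, tokens)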
Example #6
# counts = pair_counts(lines, maps)
# most_frequent_pairs(counts)

pairs = [
    ('year', 'ano'),
    ('wanted', 'queria'),
    ('question', 'questão'),
    ('I', 'eu'),
    ('opportunity', 'oportunidade'),
    ('problem', 'problema'),
    ('love', 'amor'),
]
K = 10

tokenizer = tokenization.WordpieceTokenizer(vocab=tokenization.load_vocab(
    "/home/arthur/Projects/bert/models/multi_aligned_cased_L-12_H-768_A-12/vocab.txt"
))

# lines, sent_path, map_path and the helpers (pair_counts, get_sample,
# get_sentences, get_embeddings) are defined elsewhere in this notebook.
rng = random.Random(1234)
sample = set(get_sample(sent_path, map_path, rng, 50000))
lines = [l for i, l in enumerate(lines) if i in sample]

sents = get_sentences(lines, pairs, tokenizer, k=K)
embs = get_embeddings(sents)

from sklearn.manifold import TSNE
X = embs.reshape((-1, embs.shape[-1]))
X_embedded = TSNE(n_components=2, perplexity=20,
                  metric='cosine').fit_transform(X)
X_embedded.shape
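The t-SNE call above projects every contextual embedding to two dimensions; a common next step is to scatter-plot the result. A minimal sketch, assuming X_embedded from the cell above (per-word-pair coloring is omitted because the exact shape of embs/sents is not shown in this excerpt):

import matplotlib.pyplot as plt

plt.figure(figsize=(8, 8))
plt.scatter(X_embedded[:, 0], X_embedded[:, 1], s=10)
plt.title("t-SNE of contextual embeddings (cosine metric, perplexity=20)")
plt.show()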
Example #7

    def __init__(self):
        vocab_file = 'vocab.txt'
        vocab = tokenization.load_vocab(vocab_file=vocab_file)
        tokenizer = tokenization.WordpieceTokenizer(vocab=vocab)
        path = 'train_processed.txt'

        with open(path, 'r', encoding='utf-8') as train_file:
            lines = train_file.read().split('\n')

        # Use the character length of the longest sentence as a safe upper
        # bound on the number of WordPiece tokens (plus one slot for [CLS]).
        max_length = 0

        for i in range(len(lines)):
            TK = lines[i].split(' \t')

            if max_length < len(TK[0]):
                max_length = len(TK[0])

        max_length += 1

        self.input_ids = np.zeros(shape=[len(lines), max_length],
                                  dtype=np.int32)
        self.input_mask = np.zeros(shape=[len(lines), max_length],
                                   dtype=np.int32)
        self.label = np.zeros(shape=[len(lines)], dtype=np.int32)

        # Skip the trailing empty entry left by the file's final newline.
        for i in range(len(lines) - 1):
            TK = lines[i].split(' \t')
            if len(TK) != 2:
                TK = lines[i].split('\t')

            sentence = TK[0]
            token = tokenizer.tokenize(sentence)
            tk_ids = tokenization.convert_tokens_to_ids(vocab=vocab,
                                                        tokens=token)

            for j in range(len(tk_ids)):
                self.input_ids[i, j + 1] = tk_ids[j]
                self.input_mask[i, j + 1] = 1
            self.input_ids[i, 0] = tokenization.convert_tokens_to_ids(
                vocab=vocab, tokens=['[CLS]'])[0]
            self.input_mask[i, 0] = 1
            self.label[i] = int(TK[1])

        path = 'test_processed.txt'

        with open(path, 'r', encoding='utf-8') as test_file:
            lines = test_file.read().split('\n')

        max_length = 0

        for i in range(len(lines)):
            TK = lines[i].split(' \t')

            if max_length < len(TK[0]):
                max_length = len(TK[0])

        print(max_length)
        max_length += 1

        self.test_input_ids = np.zeros(shape=[len(lines), max_length],
                                       dtype=np.int32)
        self.test_input_ids_masking = np.zeros(shape=[len(lines), max_length],
                                               dtype=np.int32)
        self.test_label = np.zeros(shape=[len(lines)], dtype=np.int32)

        for i in range(len(lines) - 1):
            TK = lines[i].split(' \t')
            if len(TK) != 2:
                TK = lines[i].split('\t')

            sentence = TK[0]
            token = tokenizer.tokenize(sentence)
            tk_ids = tokenization.convert_tokens_to_ids(vocab=vocab,
                                                        tokens=token)

            for j in range(len(tk_ids)):
                self.test_input_ids[i, j + 1] = tk_ids[j]
                self.test_input_ids_masking[i, j + 1] = 1
            self.test_input_ids[i, 0] = tokenization.convert_tokens_to_ids(
                vocab=vocab, tokens=['[CLS]'])[0]
            self.test_input_ids_masking[i, 0] = 1

            self.test_label[i] = int(TK[1])

        self.Batch_Size = 8

        self.random_idx = np.array(range(self.label.shape[0]), dtype=np.int32)
        np.random.shuffle(self.random_idx)

        self.Batch_Idx = 0
        self.Test_Batch_Idx = 0
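The constructor only builds the arrays and a shuffled index; Batch_Size, random_idx and Batch_Idx suggest a companion method that serves training mini-batches. A sketch of what such a method could look like; next_batch is hypothetical and not part of the original class:

    def next_batch(self):
        # Hypothetical helper: return the next shuffled training mini-batch,
        # reshuffling once the epoch has been consumed.
        start = self.Batch_Idx * self.Batch_Size
        end = start + self.Batch_Size
        if end > self.random_idx.shape[0]:
            np.random.shuffle(self.random_idx)
            self.Batch_Idx = 0
            start, end = 0, self.Batch_Size
        idx = self.random_idx[start:end]
        self.Batch_Idx += 1
        return self.input_ids[idx], self.input_mask[idx], self.label[idx]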