Example #1
    def __init__(self,
                 tokenizer,
                 n_classes,
                 min_bucket=5,
                 max_bucket=45,
                 bucket_steps=5,
                 preprocessing=True,
                 multi_label=False):
        self.min_bucket = min_bucket
        self.max_bucket = max_bucket
        self.bucket_steps = bucket_steps
        self.buckets = ut.single_finetuning_bucketing(self.min_bucket,
                                                      self.max_bucket,
                                                      self.bucket_steps)
        self.tokenizer = tokenizer
        self.n_classes = n_classes
        self.multi_label = multi_label
        self.preprocessing = ut.preprocessing() if preprocessing else None
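This signature matches the SingleFinetuningGenerator instantiated in Example #7, and Example #4 shows the corresponding generator() method. A minimal construction sketch (the argument values below are illustrative only, and `tokenizer` is assumed to be the vocabulary-backed tokenizer expected by `convert_tokens_to_ids`):

    gen_tr = SingleFinetuningGenerator(tokenizer,
                                       n_classes=3,
                                       min_bucket=5,
                                       max_bucket=45,
                                       bucket_steps=5,
                                       preprocessing=True,
                                       multi_label=False)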
Example #2
    def generator(self, ids, x1, x2, y):
        bucked_samples = {}
        # Preprocess, tokenize and assign every (x1, x2) pair to its length bucket.
        n_samples = len(x1)
        for i in range(n_samples):
            x1_i = x1[i]
            x2_i = x2[i]
            if self.preprocessing:
                x1_i = self.preprocessing(x1_i)
                x2_i = self.preprocessing(x2_i)
            x1_tok = ut.tokenize(x1_i, self.tokenizer)
            x2_tok = ut.tokenize(x2_i, self.tokenizer)
            bucked_samples = self.buckets(ids[i], x1_tok, x2_tok, y[i])

        # Cycle over the buckets indefinitely, yielding one full bucket per step.
        while True:
            for bucket in bucked_samples:
                bucket_size = len(bucked_samples[bucket])
                bucket_1 = bucket[0]
                bucket_2 = bucket[1]

                # Each padded sequence has bucket_1 + bucket_2 + 3 positions:
                # [CLS] + x1 + [SEP] + x2 + [SEP].
                position_indices = list(range(bucket_1 + bucket_2 + 3))
                position_indices = np.array(
                    [position_indices for _ in range(bucket_size)],
                    dtype="int32")

                # Segment 0 covers [CLS] + x1 + [SEP]; segment 1 covers x2 + [SEP].
                segment_indices = [0 for _ in range(bucket_1 + 2)] + \
                                  [1 for _ in range(bucket_2 + 1)]
                segment_indices = np.array(
                    [segment_indices for _ in range(bucket_size)],
                    dtype="int32")
                batch_x = np.zeros((bucket_size, bucket_1 + bucket_2 + 3),
                                   dtype="int32")
                batch_y = np.zeros((bucket_size, ), dtype="int32")

                for i in range(bucket_size):
                    ids_i, x1_i, x2_i, y_i = bucked_samples[bucket][i]
                    x = ut.prepare_input(x1_i, x2_i)
                    x_ids = convert_tokens_to_ids(self.tokenizer.vocab, x)
                    batch_x[i] = x_ids
                    batch_y[i] = y_i
                p = np.random.permutation(bucket_size)
                batch_x = batch_x[p]
                batch_y = batch_y[p]
                batch_y = to_categorical(batch_y, num_classes=self.n_classes)
                yield ([batch_x, position_indices, segment_indices], batch_y)
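A minimal consumption sketch for this pair generator (hedged: `gen_tr`, `ids_tr`, `x1_tr`, `x2_tr` and `y_tr` stand for the MultipleFinetuningGenerator and the arrays built with ut.load_multiple_dataset in Example #11; the Keras call is illustrative):

    # The generator yields ([token_ids, position_ids, segment_ids], labels) forever.
    batch_gen = gen_tr.generator(ids_tr, x1_tr, x2_tr, y_tr)

    # Inspect one bucketed batch.
    (batch_x, pos_idx, seg_idx), batch_y = next(batch_gen)
    print(batch_x.shape, pos_idx.shape, seg_idx.shape, batch_y.shape)

    # Or feed it directly to a compiled Keras model:
    # model.fit_generator(batch_gen, steps_per_epoch=100, epochs=1)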
Example #3
    def __init__(self,
                 tokenizer,
                 n_classes,
                 min_bucket_a=5,
                 min_bucket_b=5,
                 max_bucket_a=45,
                 max_bucket_b=45,
                 bucket_steps=5,
                 preprocessing=True):
        self.min_bucket_a = min_bucket_a
        self.min_bucket_b = min_bucket_b
        self.max_bucket_a = max_bucket_a
        self.max_bucket_b = max_bucket_b
        self.bucket_steps = bucket_steps
        self.buckets = ut.multiple_finetuning_bucketing(
            self.min_bucket_a, self.min_bucket_b, self.max_bucket_a,
            self.max_bucket_b, self.bucket_steps)
        self.tokenizer = tokenizer
        self.n_classes = n_classes
        self.preprocessing = ut.preprocessing() if preprocessing else None
Example #4
    def generator(self, ids, x, y):
        bucked_samples = {}
        # Preprocess, tokenize and assign every sample to its length bucket.
        lx = len(x)
        for i in range(lx):
            x_i = x[i]
            if self.preprocessing:
                x_i = self.preprocessing(x_i)
            x_tok = ut.tokenize(x_i, self.tokenizer)
            bucked_samples = self.buckets(ids[i], x_tok, y[i])

        while True:
            for bucket in bucked_samples:
                bucket_size = len(bucked_samples[bucket])
                # Each padded sequence has bucket + 2 positions: [CLS] + x + [SEP].
                position_indices = list(range(bucket + 2))
                position_indices = np.array(
                    [position_indices for _ in range(bucket_size)],
                    dtype="int32")
                # Single-sentence input: every position belongs to segment 0.
                segment_indices = [0 for _ in range(bucket + 2)]
                segment_indices = np.array(
                    [segment_indices for _ in range(bucket_size)],
                    dtype="int32")

                batch_x = np.zeros((bucket_size, bucket + 2), dtype="int32")
                # Multi-label targets are binary vectors of length n_classes;
                # single-label targets are class indices, one-hot encoded below.
                if self.multi_label:
                    batch_y = np.zeros((bucket_size, self.n_classes),
                                       dtype="int32")
                else:
                    batch_y = np.zeros((bucket_size, ), dtype="int32")
                for i in range(bucket_size):
                    ids_i, x_i, y_i = bucked_samples[bucket][i]
                    x_i = ut.prepare_single_input(x_i)
                    x_ids = convert_tokens_to_ids(self.tokenizer.vocab, x_i)
                    batch_x[i] = x_ids
                    batch_y[i] = y_i
                p = np.random.permutation(bucket_size)
                batch_x = batch_x[p]
                batch_y = batch_y[p]
                if not self.multi_label:
                    batch_y = to_categorical(batch_y,
                                             num_classes=self.n_classes)
                yield ([batch_x, position_indices, segment_indices], batch_y)
Example #5
    def generator(self):
        while True:

            # One pass over the TSV dataset: (id, text, reply id, reply) per line.
            fr = open(self.dataset_file, "r", encoding="utf8")
            fr.readline()  # skip the header line
            for line in fr.readlines():
                id_, text, id_reply, reply = line.strip().split("\t")
                text, reply = text.strip(), reply.strip()
                text = ut.tokenize(text, self.tokenizer)
                reply = ut.tokenize(reply, self.tokenizer)

                # Original (text, reply) order: positive reply-order label.
                batch = self.buckets(text, reply, y=1)
                res = self.__batching(batch)
                if res is not None:
                    yield res

                # Swapped (reply, text) order: negative reply-order label.
                batch = self.buckets(reply, text, y=0)
                res = self.__batching(batch)
                if res is not None:
                    yield res

            fr.close()
Example #6
    def __init__(self, dataset_file, tokenizer, batch_size, mlm_type,
                 mlm_max_span, mask_prob, probs_mlm, min_bucket_a,
                 min_bucket_b, max_bucket_a, max_bucket_b, bucket_steps,
                 use_rop):

        self.dataset_file = dataset_file
        # Force the batch size to be even.
        self.batch_size = batch_size if batch_size % 2 == 0 else batch_size + 1
        self.min_bucket_a = min_bucket_a
        self.min_bucket_b = min_bucket_b
        self.max_bucket_a = max_bucket_a
        self.max_bucket_b = max_bucket_b
        self.bucket_steps = bucket_steps
        self.buckets = ut.bucketing(self.min_bucket_a, self.min_bucket_b,
                                    self.max_bucket_a, self.max_bucket_b,
                                    self.bucket_steps, self.batch_size)
        self.mask_prob = mask_prob
        self.probs_mlm = probs_mlm
        self.mlm_type = mlm_type
        self.mlm_max_span = mlm_max_span
        self.tokenizer = tokenizer
        # Vocabulary used by the masking routines; the first five entries are excluded.
        self.vocab_words = list(self.tokenizer.vocab.keys())[5:]
        self.vocab_size = len(self.vocab_words)
        self.use_rop = use_rop
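A hypothetical construction sketch for this pretraining generator (its class name is not shown in the excerpt, so `PretrainingGenerator` below is a placeholder; every argument value is illustrative, and `probs_mlm` is passed through as whatever structure ut.mask_tokens / ut.mask_spans expect):

    pre_gen = PretrainingGenerator(dataset_file="pairs.tsv",
                                   tokenizer=tokenizer,
                                   batch_size=64,     # rounded up to an even value internally
                                   mlm_type="span",   # "token" or "span" (see Example #9)
                                   mlm_max_span=3,
                                   mask_prob=0.15,
                                   probs_mlm=probs_mlm,
                                   min_bucket_a=5,
                                   min_bucket_b=5,
                                   max_bucket_a=45,
                                   max_bucket_b=45,
                                   bucket_steps=5,
                                   use_rop=True)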
Example #7
    pkm = config["model"]["pkm"]
    pkm_params = config["model"]["pkm_params"]

    use_rop = config["model"]["rop"]["use_rop"]
    rop_n_hidden = config["model"]["rop"]["n_hidden"]
    rop_hidden_size = config["model"]["rop"]["hidden_size"]

    # Replicate the per-layer hyperparameters for each of the n_encoders layers.
    output_encoder_size = [hidden_size for _ in range(n_encoders)]
    attention_size = [attention_size for _ in range(n_encoders)]
    n_heads = [n_heads for _ in range(n_encoders)]

    ##################################

    # Load Data #
    ids_tr, x_tr, y_tr = ut.load_dataset(train_file, id_header, text_header,
                                         class_header, categories, multi_label,
                                         delimiter)
    ids_dv, x_dv, y_dv = ut.load_dataset(dev_file, id_header, text_header,
                                         class_header, categories, multi_label,
                                         delimiter)
    ids_ts, x_ts, y_ts = ut.load_dataset(test_file, id_header, text_header,
                                         class_header, categories, multi_label,
                                         delimiter)

    if multi_label:
        n_classes = len(y_tr[0])

    gen_tr = SingleFinetuningGenerator(tokenizer, n_classes, bucket_min,
                                       bucket_max, bucket_steps, preprocessing,
                                       multi_label)
    gen_dv = SingleFinetuningGenerator(tokenizer, n_classes, bucket_min,
Example #8
                               None,
                               None,
                               pkm,
                               pkm_params,
                               input_length=None,
                               use_rop=use_rop)

    twilbert_model.build()
    model = twilbert_model.model

    twilbert_model.compile(model)

    twilbert_model.load(model, path_load_weights)
    model.summary()  # summary() already prints; no print() wrapper needed

    dataset = ut.load_lm_dataset(dataset_file)
    preprocess = ut.preprocessing()
    dataset = [ut.tokenize(preprocess(text), tokenizer) for text in dataset]
    gamma = 0.
    N = len(dataset)
    for i in range(N):
        if i % 50 == 0:
            print("T=%d P(X)=%.3f" % (i + 1, (gamma / (i + 1))))
        X = ut.prepare_single_input(dataset[i])  # add [CLS] and [SEP]
        T = len(X)
        # Each sample X has as many possible maskings as |X|; the slice drops
        # the maskings at the [CLS] and [SEP] positions.
        maskings = [ut.mask_lm_eval(X, t) for t in range(T)][1:-1]
        c = 1
        alpha = 0
        for masking in maskings:
            x = convert_tokens_to_ids(tokenizer.vocab, masking)
Example #9
    def __batching(self, batch):

        if batch is not None:
            x1, x2, y = batch[0], batch[1], batch[2]
            # All samples in a bucket share the same (x1, x2) token lengths.
            bucket_1, bucket_2 = len(x1[0]), len(x2[0])
            batch_x = np.zeros((self.batch_size, bucket_1 + bucket_2 + 3),
                               dtype="int32")
            batch_rop = np.zeros(self.batch_size, dtype="int32")
            batch_mlm = np.zeros((self.batch_size, bucket_1 + bucket_2 + 3),
                                 dtype="int32")
            # Each padded sequence has bucket_1 + bucket_2 + 3 positions:
            # [CLS] + x1 + [SEP] + x2 + [SEP].
            position_indices = list(range(bucket_1 + bucket_2 + 3))
            position_indices = np.array(
                [position_indices for _ in range(self.batch_size)],
                dtype="int32")
            # Segment 0 covers [CLS] + x1 + [SEP]; segment 1 covers x2 + [SEP].
            segment_indices = [0 for _ in range(bucket_1 + 2)] + \
                              [1 for _ in range(bucket_2 + 1)]

            segment_indices = np.array(
                [segment_indices for _ in range(self.batch_size)],
                dtype="int32")

            for i in range(self.batch_size):
                x = ut.prepare_input(x1[i], x2[i])
                x_ids = convert_tokens_to_ids(self.tokenizer.vocab, x)
                masked_x, mask = None, None
                if self.mlm_type == "token":
                    try:
                        masked_x, mask = ut.mask_tokens(
                            x, self.mask_prob, self.probs_mlm,
                            self.vocab_words)
                    except Exception:
                        print("Error sample")
                        continue

                elif self.mlm_type == "span":
                    try:
                        masked_x, mask = ut.mask_spans(x, self.mask_prob,
                                                       self.probs_mlm,
                                                       self.vocab_words,
                                                       self.mlm_max_span)
                    except Exception:
                        print("Error sample")
                        continue

                mask = np.array(mask, dtype="int")
                masked_x_ids = convert_tokens_to_ids(self.tokenizer.vocab,
                                                     masked_x)
                mlm_output = ut.prepare_mlm_output(x_ids, mask)
                batch_x[i] = masked_x_ids
                batch_mlm[i] = mlm_output
                batch_rop[i] = y[i]
            p = np.random.permutation(self.batch_size)
            batch_x = batch_x[p]
            batch_rop = batch_rop[p]
            batch_mlm = batch_mlm[p]
            batch_mlm = np.expand_dims(batch_mlm, -1)
            if self.use_rop:
                return ([batch_x, position_indices,
                         segment_indices], [batch_rop, batch_mlm])
            else:
                return ([batch_x, position_indices,
                         segment_indices], [batch_mlm])
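A shape summary of one batch returned by __batching, restating the code above (B = self.batch_size, L = bucket_1 + bucket_2 + 3):

    # inputs : [batch_x (B, L), position_indices (B, L), segment_indices (B, L)]
    # targets: [batch_rop (B,), batch_mlm (B, L, 1)]   if self.use_rop
    # targets: [batch_mlm (B, L, 1)]                   otherwise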
Example #10
    model = twilbert_model.pretrained_model

    model.summary()

    fr = open(dataset_file, "r", encoding="utf8")
    fr.readline()  # skip the header line
    tweets = []
    replies = []
    labels = []
    for line in fr.readlines():
        sline = line.strip().split("\t")
        tweets.append(sline[0].strip())
        replies.append(sline[1].strip())
        labels.append(int(sline[2].strip()))
    fr.close()

    preprocess = ut.preprocessing()
    tweets = [ut.tokenize(preprocess(text), tokenizer) for text in tweets]
    replies = [ut.tokenize(preprocess(text), tokenizer) for text in replies]
    N = len(tweets)
    embeddings = []
    for i in range(N):
        X = ut.prepare_input(tweets[i], replies[i])
        indices = convert_tokens_to_ids(tokenizer.vocab, X)
        position_indices = list(range(len(X)))
        segment_indices = [0 for _ in range(len(tweets[i]) + 2)] + \
                          [1 for _ in range(len(replies[i]) + 1)]
        pred = model.predict([
            np.array([indices]),
            np.array([position_indices]),
            np.array([segment_indices])
        ])[0]
Example #11
                               pkm,
                               pkm_params,
                               input_length=None)

    twilbert_model.build()

    model = twilbert_model.model
    pretrained_model = twilbert_model.pretrained_model
    twilbert_model.compile(model)
    model.load_weights(pretrained_model_weights)

    #########################

    # Load Data #
    ids_tr, x1_tr, x2_tr, y_tr = ut.load_multiple_dataset(
        train_file, id_header, text_header, aux_header, class_header,
        categories, delimiter)

    ids_dv, x1_dv, x2_dv, y_dv = ut.load_multiple_dataset(
        dev_file, id_header, text_header, aux_header, class_header, categories,
        delimiter)

    ids_ts, x1_ts, x2_ts, y_ts = ut.load_multiple_dataset(
        test_file, id_header, text_header, aux_header, class_header,
        categories, delimiter)

    gen_tr = MultipleFinetuningGenerator(tokenizer, n_classes, bucket_min_a,
                                         bucket_min_b, bucket_max_a,
                                         bucket_max_b, bucket_steps,
                                         preprocessing)
Example #12
                               pkm,
                               pkm_params,
                               input_length=None)

    twilbert_model.build()

    model = twilbert_model.model
    pretrained_model = twilbert_model.pretrained_model
    twilbert_model.compile(model)
    model.load_weights(pretrained_model_weights)

    #########################

    # Load Data #
    ids_tr, x_tr, y_tr = ut.load_dataset(train_file, id_header, text_header,
                                         class_header, categories, multi_label,
                                         delimiter)
    ids_dv, x_dv, y_dv = ut.load_dataset(dev_file, id_header, text_header,
                                         class_header, categories, multi_label,
                                         delimiter)
    ids_ts, x_ts, y_ts = ut.load_dataset(test_file, id_header, text_header,
                                         class_header, categories, multi_label,
                                         delimiter)

    if multi_label:
        n_classes = len(y_tr[0])

    gen_tr = SingleFinetuningGenerator(tokenizer, n_classes, bucket_min,
                                       bucket_max, bucket_steps, preprocessing,
                                       multi_label)
    gen_dv = SingleFinetuningGenerator(tokenizer, n_classes, bucket_min,