def create_examples(self, x_s, y_s):
    """Gets a collection of labelled `InputExample` (for training and eval),
    plus the one-hot label rows of the samples actually retained."""
    examples = []
    valid_classes = np.zeros((y_s.shape[0], len(self.list_classes)))
    accumul = 0
    for (i, x) in enumerate(x_s):
        y = y_s[i]
        guid = i
        text_a = tokenization.convert_to_unicode(x)
        # decode the one-hot label row into a class name
        ind, = np.where(y == 1)
        if ind.size == 0:
            # no class set for this sample, skip it
            continue
        the_class = self.list_classes[ind[0]]
        if the_class is None or the_class not in self.list_classes:
            # unknown class, skip the sample
            continue
        label = tokenization.convert_to_unicode(the_class)
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        valid_classes[accumul] = y
        accumul += 1
    # drop the unused zero rows so valid_classes aligns with examples
    return examples, valid_classes[:accumul]
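# Minimal, self-contained sketch of the one-hot decoding that create_examples
# relies on, using hypothetical class names: np.where returns the indices of
# the positive entries of a label row, and the first index selects the class.
def _demo_one_hot_decoding():
    import numpy as np
    list_classes = ["cat", "dog", "other"]   # hypothetical label set
    y_row = np.array([0, 1, 0])              # one-hot row encoding "dog"
    ind, = np.where(y_row == 1)
    assert list_classes[ind[0]] == "dog"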
def create_inputs(self, x_s, dummy_label='dummy'):
    """Gets a collection of `InputExample` for input to be classified (for prediction)."""
    examples = []
    # dummy label to avoid breaking the BERT base code
    label = tokenization.convert_to_unicode(dummy_label)
    for (i, x) in enumerate(x_s):
        guid = i
        text_a = tokenization.convert_to_unicode(x)
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples
def _get_example(self, x, y):
    """Gets a collection of `InputExample` already labelled (for training and eval)."""
    examples = []
    for i in range(len(x)):
        guid = i
        tokens = []
        labels = []
        for j in range(len(x[i])):
            tokens.append(tokenization.convert_to_unicode(x[i][j]))
            labels.append(tokenization.convert_to_unicode(y[i][j]))
        example = InputExample(guid=guid, tokens=tokens, labels=labels)
        examples.append(example)
    return examples
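# Shape sketch for _get_example: x and y are parallel lists of token and
# label sequences, one pair per sentence, as in sequence-labelling data.
# The token and label values below are hypothetical.
def _demo_sequence_labelling_shapes():
    x = [["John", "lives", "in", "Paris"]]
    y = [["B-PER", "O", "O", "B-LOC"]]
    assert len(x) == len(y) and len(x[0]) == len(y[0])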
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
        if i == 0:
            # skip the TSV header line
            continue
        guid = "%s-%s" % (set_type, i)
        text_a = tokenization.convert_to_unicode(line[3])
        text_b = tokenization.convert_to_unicode(line[4])
        if set_type == "test":
            # the test set is unlabelled, use a placeholder label
            label = "0"
        else:
            label = tokenization.convert_to_unicode(line[0])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
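# Column sketch for the TSV rows consumed above: the indexing assumes an
# MRPC-style layout where column 0 holds the label and columns 3 and 4 hold
# the sentence pair. The row content below is illustrative only.
def _demo_pair_row_layout():
    line = ["1", "id_a", "id_b", "first sentence .", "second sentence ."]
    label, text_a, text_b = line[0], line[3], line[4]
    assert (label, text_a, text_b) == ("1", "first sentence .", "second sentence .")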
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
        # Only the test set has a header
        if set_type == "test" and i == 0:
            continue
        guid = "%s-%s" % (set_type, i)
        if set_type == "test":
            text_a = tokenization.convert_to_unicode(line[1])
            # the test set is unlabelled, use a placeholder label
            label = "0"
        else:
            text_a = tokenization.convert_to_unicode(line[3])
            label = tokenization.convert_to_unicode(line[1])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples
def get_dev_examples(self, data_dir):
    """See base class."""
    lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
    examples = []
    for (i, line) in enumerate(lines):
        if i == 0:
            continue
        guid = "dev-%d" % (i)
        language = tokenization.convert_to_unicode(line[0])
        if language != tokenization.convert_to_unicode(self.language):
            continue
        text_a = tokenization.convert_to_unicode(line[6])
        text_b = tokenization.convert_to_unicode(line[7])
        label = tokenization.convert_to_unicode(line[1])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
def get_train_examples(self, data_dir):
    """See base class."""
    lines = self._read_tsv(
        os.path.join(data_dir, "multinli",
                     "multinli.train.%s.tsv" % self.language))
    examples = []
    for (i, line) in enumerate(lines):
        if i == 0:
            # skip the TSV header line
            continue
        guid = "train-%d" % (i)
        text_a = tokenization.convert_to_unicode(line[0])
        text_b = tokenization.convert_to_unicode(line[1])
        label = tokenization.convert_to_unicode(line[2])
        # normalize the MultiNLI label spelling to the XNLI one
        if label == tokenization.convert_to_unicode("contradictory"):
            label = tokenization.convert_to_unicode("contradiction")
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples
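# Label-normalization sketch: the translated MultiNLI training files may
# spell the label "contradictory" where the XNLI dev set uses
# "contradiction"; folding the former into the latter keeps one label set.
def _demo_label_normalization():
    label = "contradictory"
    if label == "contradictory":
        label = "contradiction"
    assert label in ("contradiction", "entailment", "neutral")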
def create_inputs(self, x_s, dummy_label='O'):
    """Gets a collection of `InputExample` for input to be labelled (for prediction)."""
    examples = []
    # dummy label to avoid breaking the BERT base code
    for (i, x) in enumerate(x_s):
        guid = i
        tokens = []
        labels = []
        # if x is not already segmented, tokenize it
        if isinstance(x, list):
            simple_tokens = x
        else:
            simple_tokens = tokenizeAndFilterSimple(x)
        for j in range(len(simple_tokens)):
            tokens.append(tokenization.convert_to_unicode(simple_tokens[j]))
            labels.append(tokenization.convert_to_unicode(dummy_label))
        examples.append(
            InputExample(guid=guid, tokens=tokens, labels=labels))
    return examples
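# Input-normalization sketch for create_inputs: raw strings and already
# tokenized lists are both accepted, and the isinstance check routes them.
# A hypothetical whitespace tokenizer stands in for tokenizeAndFilterSimple.
def _demo_input_normalization():
    def tokenize(text):
        return text.split()
    for x in ["John lives in Paris", ["John", "lives", "in", "Paris"]]:
        tokens = x if isinstance(x, list) else tokenize(x)
        assert tokens == ["John", "lives", "in", "Paris"]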