def _create_example(self, lines, set_type):
    """Create `InputExample`s from raw TSV-like rows.

    Args:
        lines: iterable of rows; each row is indexable with the label at
            index 0 and the text at index 1.
        set_type: split name (e.g. "train", "dev") used to build the guid.

    Returns:
        A list of `InputExample`s with guid "<set_type>-<row index>".
    """
    examples = []
    for i, line in enumerate(lines):
        guid = "%s-%s" % (set_type, i)
        text = tokenization.convert_to_unicode(line[1])
        label = tokenization.convert_to_unicode(line[0])
        examples.append(InputExample(guid=guid, text=text, label=label))
    return examples
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    # One single-sentence example per row: label in column 0, text in column 1.
    return [
        InputExample(
            guid="%s-%s" % (set_type, idx),
            text_a=tokenization.convert_to_unicode(row[1]),
            text_b=None,
            label=tokenization.convert_to_unicode(row[0]),
        )
        for idx, row in enumerate(lines)
    ]
def _create_examples(self, lines, set_type):
    """Create shuffled `InputExample`s and record the observed label set.

    Args:
        lines: list of rows with label at index 0 and text at index 1.
            NOTE: shuffled IN PLACE (the caller's list is mutated).
        set_type: split name used to build each example's guid.

    Returns:
        A list of `InputExample`s in shuffled order.

    Side effects:
        Adds every label seen to ``self.labels``.
    """
    examples = []
    # Shuffle in place so downstream batching sees a random order.
    np.random.shuffle(lines)
    for i, line in enumerate(lines):
        guid = '%s-%s' % (set_type, i)
        text_a = tokenization.convert_to_unicode(line[1])
        label = tokenization.convert_to_unicode(line[0])
        self.labels.add(label)
        examples.append(
            InputExample(guid=guid, text_a=text_a, label=label, text_b=None))
    return examples
def read_line_examples(lst_strs):
    """Read a list of `InputExample`s from a list of strings.

    Empty strings are skipped (and do not consume a unique_id). A line
    containing " ||| " is split into text_a / text_b; otherwise the whole
    line becomes text_a and text_b is None.
    """
    splitter = re.compile(r"^(.*) \|\|\| (.*)$")
    uid = 0
    for raw in lst_strs:
        decoded = tokenization.convert_to_unicode(raw)
        if not decoded:
            continue
        decoded = decoded.strip()
        match = splitter.match(decoded)
        if match:
            first, second = match.group(1), match.group(2)
        else:
            first, second = decoded, None
        yield InputExample(unique_id=uid, text_a=first, text_b=second)
        uid += 1
def read_tokenized_examples(lst_strs):
    """Yield `InputExample`s built from pre-tokenized sequences.

    Args:
        lst_strs: list of sequences; each sequence is a list of tokens.

    If the separator token '|||' occurs in a sequence, tokens before its
    first occurrence become text_a and tokens after it become text_b
    (two-sentence input); otherwise the whole sequence is text_a and
    text_b is None.
    """
    uid = 0
    for seq in lst_strs:
        tokens = [tokenization.convert_to_unicode(w) for w in seq]
        if '|||' in tokens:
            sep = tokens.index('|||')
            first, second = tokens[:sep], tokens[sep + 1:]
        else:
            first, second = tokens, None
        yield InputExample(unique_id=uid, text_a=first, text_b=second)
        uid += 1
def read_examples(input_file):
    """Read a list of `InputExample`s from an input file.

    Reads the file line by line until EOF. A line containing " ||| " is
    split into text_a / text_b; otherwise the whole stripped line becomes
    text_a and text_b is None.
    """
    splitter = re.compile(r"^(.*) \|\|\| (.*)$")
    examples = []
    uid = 0
    with tf.gfile.GFile(input_file, "r") as reader:
        while True:
            decoded = tokenization.convert_to_unicode(reader.readline())
            # readline() returns "" only at EOF.
            if not decoded:
                break
            decoded = decoded.strip()
            match = splitter.match(decoded)
            if match:
                first, second = match.group(1), match.group(2)
            else:
                first, second = decoded, None
            examples.append(
                InputExample(unique_id=uid, text_a=first, text_b=second))
            uid += 1
    return examples