예제 #1
0
    def _read_file(self, input_file, phase=None):
        """Read a tab-separated file and convert each row to an InputExample.

        The number of columns in the first row decides how every row is
        interpreted:

        * phase other than "predict": two columns are (text_a, label) and
          three columns are (text_a, text_b, label).
        * phase "predict": one column is text_a and two columns are
          (text_a, text_b).

        Args:
            input_file: Path of the UTF-8 encoded file to read.
            phase: Key looked up in ``self.if_file_with_header`` to decide
                whether the first row is a header to skip; the value
                "predict" selects the label-free layout.

        Returns:
            A list of InputExample objects whose guid is the zero-based row
            index in the file (a skipped header row still counts).

        Raises:
            Exception: If the column count of the first row is not supported
                for the given phase.
            IndexError: If a row has fewer columns than the layout needs.
        """
        has_warned = False
        with io.open(input_file, "r", encoding="UTF-8") as file:
            reader = csv.reader(file, delimiter="\t", quotechar=None)
            examples = []
            for (i, line) in enumerate(reader):
                if i == 0:
                    # The first row fixes the expected layout for the file.
                    ncol = len(line)
                    if self.if_file_with_header[phase]:
                        continue
                if len(line) != ncol:
                    # Report ragged rows through the logger rather than
                    # dumping them on stdout; processing continues as before.
                    logger.warning(
                        "line %d of file %s has %d columns instead of %d: %s"
                        % (i, input_file, len(line), ncol, line))
                if phase != "predict":
                    if ncol == 1:
                        raise Exception(
                            "the %s file: %s only has one column but it is not a predict file"
                            % (phase, input_file))
                    elif ncol == 2:
                        example = InputExample(guid=i,
                                               text_a=line[0],
                                               label=line[1])
                    elif ncol == 3:
                        example = InputExample(guid=i,
                                               text_a=line[0],
                                               text_b=line[1],
                                               label=line[2])
                    else:
                        raise Exception(
                            "the %s file: %s has too many columns (should <=3)"
                            % (phase, input_file))
                else:
                    if ncol == 1:
                        example = InputExample(guid=i, text_a=line[0])
                    elif ncol == 2:
                        # Warn only once per file that the second column is
                        # taken as text_b, not as a label.
                        if not has_warned:
                            logger.warning(
                                "the predict file: %s has 2 columns, as it is a predict file, the second one will be regarded as text_b"
                                % (input_file))
                            has_warned = True
                        example = InputExample(guid=i,
                                               text_a=line[0],
                                               text_b=line[1])
                    else:
                        raise Exception(
                            "the predict file: %s has too many columns (should <=2)"
                            % (input_file))
                examples.append(example)
            return examples
    def _read_file(self, input_file, phase=None):
        """Read a tab-separated file whose fields are Python expressions.

        The first line is treated as a header and skipped. For every other
        non-blank line the first field is evaluated and used as text_a; when
        a second field is present it is evaluated and used as the label.

        Args:
            input_file: Path of the UTF-8 encoded file to read.
            phase: Accepted for interface compatibility; not used here.

        Returns:
            A list of InputExample objects whose guid is the zero-based index
            of the line among the lines that follow the header.
        """
        examples = []
        with open(input_file, 'r', encoding='utf-8') as f:
            # Skip the header; the default makes this a no-op on an empty file.
            next(f, None)
            for i, line in enumerate(f):
                if not line.strip():
                    # A blank (typically trailing) line has nothing to
                    # evaluate and would otherwise raise SyntaxError.
                    continue
                fields = line.split('\t')
                # NOTE(review): eval() executes arbitrary expressions read
                # from the data file. Only use this reader on trusted files;
                # if the fields are plain literals, ast.literal_eval would be
                # a safer replacement.
                text = eval(fields[0])
                if len(fields) < 2:
                    example = InputExample(guid=i, text_a=text)
                else:
                    label = eval(fields[1])
                    example = InputExample(guid=i, text_a=text, label=label)
                examples.append(example)

        return examples
예제 #3
0
    def _read_file(self, input_file, phase=None):
        """Parse a "_!_"-delimited text file into InputExample objects.

        Each line is stripped and split on "_!_"; field 0 becomes the guid,
        field 1 the label and field 3 the text_a of the example.

        Args:
            input_file: Path of the UTF-8 encoded file to read.
            phase: Accepted for interface compatibility; not used here.

        Returns:
            A list with one InputExample per line of the file.
        """
        with io.open(input_file, "r", encoding="UTF-8") as file:
            rows = (raw.strip().split("_!_") for raw in file)
            return [
                InputExample(guid=fields[0],
                             label=fields[1],
                             text_a=fields[3])
                for fields in rows
            ]
예제 #4
0
    def _read_csv(self, input_file, quotechar=None):
        """Load a CSV file with pandas and build one InputExample per row.

        The "id" column supplies the guid, the "comment_text" column supplies
        text_a, and every value from the third column onward is converted to
        int to form the label list.

        Args:
            input_file: Path of the UTF-8 encoded CSV file.
            quotechar: Accepted for interface compatibility; not used here.

        Returns:
            A list of InputExample objects in file order.
        """

        def build(record):
            # Convert one DataFrame row into an example.
            guid = record["id"]
            text = record["comment_text"]
            labels = list(map(int, record[2:]))
            return InputExample(guid=guid, label=labels, text_a=text)

        frame = pd.read_csv(input_file, encoding="UTF-8")
        return [build(record) for _, record in frame.iterrows()]
예제 #5
0
 def _read_file(self, input_file, is_training):
     """Parse a "_!_"-delimited text file into InputExample objects.

     Each line is stripped and split on "_!_"; field 0 is the label,
     field 2 is text_a and field 3 is text_b. When is_training is truthy
     the first line of the file is skipped.

     Args:
         input_file: Path of the UTF-8 encoded file to read.
         is_training: Whether to skip the first line.

     Returns:
         A list of InputExample objects whose guid is the zero-based line
         index in the file.
     """
     examples = []
     with io.open(input_file, "r", encoding="UTF-8") as file:
         for idx, raw in enumerate(file):
             if idx == 0 and is_training:
                 continue
             fields = raw.strip().split("_!_")
             examples.append(
                 InputExample(guid=idx,
                              label=fields[0],
                              text_a=fields[2],
                              text_b=fields[3]))
     return examples
예제 #6
0
파일: SelfDataset.py 프로젝트: wshzd/Paddle
def _read_tsv(input_file):
    """Build InputExample objects from an iterable of rows.

    Despite the parameter name, ``input_file`` is iterated directly and each
    item is indexed: item[0] is used as text_a and item[1] as the label.

    Returns:
        A list of InputExample objects with guids numbered from 0.
    """
    # This step matters: text_a is filled with the unsegmented string (the
    # framework performs word segmentation internally), and label carries
    # the classification category.
    return [
        InputExample(guid=idx, label=row[1], text_a=row[0])
        for idx, row in enumerate(input_file)
    ]
예제 #7
0
 def _read_file(self, input_file, phase=None):
     """Parse a "_!_"-delimited text file into InputExample objects.

     Each line is stripped and split on "_!_"; field 0 is the label and
     field 1 is text_a. Lines with fewer than two fields are skipped.

     Args:
         input_file: Path of the UTF-8 encoded file to read.
         phase: Accepted for interface compatibility; not used here.

     Returns:
         A list of InputExample objects whose guid is the zero-based line
         index in the file (skipped lines leave gaps in the numbering).
     """
     with io.open(input_file, "r", encoding="UTF-8") as file:
         examples = []
         for (i, line) in enumerate(file):
             data = line.strip().split("_!_")
             if len(data) < 2:
                 # Malformed or blank line: skip it explicitly instead of
                 # relying on a bare ``except`` that would also swallow
                 # KeyboardInterrupt and genuine programming errors.
                 continue
             example = InputExample(
                 guid=i, label=str(data[0]), text_a=data[1], text_b=None)
             examples.append(example)
         return examples
 def _read_csv(self, input_file, quotechar=None):
     """Read a comma-separated file into InputExample objects.

     The first row is treated as a header and skipped. In every following
     row, column 0 is the label and column 1 is text_a.

     Args:
         input_file: Path of the UTF-8 encoded CSV file.
         quotechar: Quote character passed through to ``csv.reader``.

     Returns:
         A list of InputExample objects with guids numbered from 0; an empty
         list when the file has no rows.
     """
     with codecs.open(input_file, "r", encoding="UTF-8") as f:
         reader = csv.reader(f, delimiter=",", quotechar=quotechar)
         # Skip the header row; the default keeps an empty file from
         # raising StopIteration.
         next(reader, None)
         return [
             InputExample(guid=seq_id, label=line[0], text_a=line[1])
             for seq_id, line in enumerate(reader)
         ]
예제 #9
0
    def _read_file(self, input_file, phase=None):
        """Read a header-less tab-separated file into InputExample objects.

        When ``self.model`` is 'mcls' every row must hold exactly three
        values (sentence, entity, label); the entity is converted to str and
        used as text_b. For any other model every row must hold exactly two
        values (sentence, label) and no text_b is passed.

        The label column is required in every phase. It is evaluated with
        ``eval`` unless phase is 'predict', in which case the label is None.

        Args:
            input_file: Path of the file, read with ``pandas.read_csv``.
            phase: 'predict' to discard labels; any other value keeps them.

        Returns:
            A list of InputExample objects with guids numbered from 0.

        Raises:
            ValueError: If a row does not have the expected number of values.
        """
        data = pd.read_csv(input_file, sep='\t', header=None)
        with_entity = self.model == 'mcls'
        keep_label = phase != 'predict'
        examples = []
        for i, row in enumerate(data.values):
            fields = {}
            if with_entity:
                sent, entity, label = row
                fields['text_b'] = str(entity)
            else:
                sent, label = row
            if keep_label:
                # NOTE(review): eval() executes arbitrary expressions read
                # from the data file. Only use this reader on trusted files;
                # if labels are plain literals, ast.literal_eval would be a
                # safer replacement.
                parsed_label = eval(label)
            else:
                parsed_label = None
            examples.append(
                InputExample(guid=i, text_a=sent, label=parsed_label,
                             **fields))
        return examples
예제 #10
0
 def _read_file(self, input_file, phase=False):
     """
     Read a JSON dataset and build one multi-label example per paragraph.

     The file must contain a top-level "data" list whose entries hold
     "paragraphs"; each paragraph holds a "context" string and a "qas"
     list of items with "id" and "question" keys. For every question, the
     suffix '的主体是什么?' is removed and the remainder is looked up in
     ``self.label_list``; the matching position of the label vector is set
     to 1.

     Args:
         input_file: Path of the UTF-8 encoded JSON file.
         phase: Accepted for interface compatibility; not used here.

     Returns:
         A list of InputExample objects. The guid is the string form of the
         list of distinct question ids, in order of first appearance.

     Raises:
         ValueError: If a stripped question is not in ``self.label_list``.
     """
     examples = []
     # NOTE(review): nothing below increments this counter, so the warning
     # at the end always reports 0; kept so the log output stays unchanged.
     drop = 0
     # Read explicitly as UTF-8 so the result does not depend on the
     # platform's default encoding.
     with open(input_file, "r", encoding="UTF-8") as reader:
         input_data = json.load(reader)["data"]
     for entry in input_data:
         for paragraph in entry["paragraphs"]:
             paragraph_text = paragraph["context"]
             guid = []
             labels = [0] * len(self.label_list)
             for qa in paragraph["qas"]:
                 guid.append(qa["id"])
                 label_name = qa["question"].replace('的主体是什么?', '')
                 labels[self.label_list.index(label_name)] = 1
             # De-duplicate while keeping first-seen order: iterating a set
             # of strings can yield a different order on every run, which
             # made the guid non-deterministic.
             guid = str(list(dict.fromkeys(guid)))
             example = InputExample(guid=guid,
                                    label=labels,
                                    text_a=paragraph_text)
             examples.append(example)
     logger.warning("%i bad examples has been dropped" % drop)
     return examples
예제 #11
0
파일: glue.py 프로젝트: VVJY/PaddleHub
    def _read_tsv(self, input_file, quotechar=None, wo_label=False):
        """Read a tab-separated file for the dataset in ``self.sub_dataset``.

        The columns used for the label, text_a and text_b depend on
        ``self.sub_dataset`` and on whether the file carries labels. The
        first row is skipped as a header, except for labelled 'CoLA' files.
        Rows that are too short for the selected columns are logged and
        discarded.

        Args:
            input_file: Path of the UTF-8 encoded TSV file.
            quotechar: Quote character passed through to ``csv.reader``.
            wo_label: True when the file has no label column; examples then
                get ``label=None``.

        Returns:
            A list of InputExample objects whose guids count the kept rows
            from 0. The list is empty when ``self.sub_dataset`` is not one
            of the supported names.
        """
        # (label, text_a, text_b) column indices per sub-dataset, stored as
        # (layout for labelled files, layout for files without labels).
        # None means the field is absent.
        column_layouts = {
            'MRPC': ((0, -2, -1), (None, -2, -1)),
            'QNLI': ((3, 1, 2), (None, 1, 2)),
            'QQP': ((5, 3, 4), (None, 1, 2)),
            'RTE': ((3, 1, 2), (None, 1, 2)),
            'SST-2': ((1, 0, None), (None, 1, None)),
            'MNLI': ((-1, 8, 9), (None, 8, 9)),
            'CoLA': ((1, 3, None), (None, 1, None)),
            'STS-B': ((-1, -3, -2), (None, -2, -1)),
        }
        with io.open(input_file, "r", encoding="UTF-8") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            examples = []
            if self.sub_dataset != 'CoLA' or wo_label:
                # Skip the header row; the default keeps an empty file from
                # raising StopIteration.
                next(reader, None)

            layouts = column_layouts.get(self.sub_dataset)
            if layouts is None:
                # Previously every row was discarded one by one because the
                # column indices were never assigned; report the real cause
                # once and return the same empty result.
                logger.warning(
                    "unsupported sub_dataset %s: no examples read from %s"
                    % (self.sub_dataset, input_file))
                return examples
            label_index, text_a_index, text_b_index = layouts[
                1 if wo_label else 0]

            seq_id = 0
            for line in reader:
                try:
                    text_a = line[text_a_index]
                    text_b = (line[text_b_index]
                              if text_b_index is not None else None)
                    label = (line[label_index]
                             if label_index is not None else None)
                except IndexError:
                    # Row is shorter than the layout requires.
                    logger.info("[Discard Incorrect Data] " + "\t".join(line))
                    continue
                examples.append(
                    InputExample(guid=seq_id,
                                 text_a=text_a,
                                 text_b=text_b,
                                 label=label))
                seq_id += 1
            return examples