def _read_classification_record(self, line, label=0):
    """Parse one classification record.

    Expected line format::

        <source words> [tab] <target words>

    The label is not read from the line; it is supplied by the caller
    through ``label``.

    :param line: one raw text line of the data file.
    :param label: label returned with the record when not in inference
        mode (defaults to 0).
    :return: ``(source, target, label)`` when ``self.is_infer`` is false
        (train or test), otherwise ``(source, target)``. ``source`` and
        ``target`` are whatever ``sent2ids`` returns for the respective
        field with ``self.source_dic`` / ``self.target_dic``.
    :raises AssertionError: if the line does not consist of exactly two
        tab-separated fields.
    """
    parts = line.strip().split("\t")
    if len(parts) != 2:
        # Raised explicitly instead of using an ``assert`` statement so the
        # format check is still performed when Python runs with ``-O``.
        raise AssertionError(
            "wrong format for classification\n"
            "the format is: <source words> [tab] <target words>")

    source = sent2ids(parts[0], self.source_dic)
    target = sent2ids(parts[1], self.target_dic)

    if self.is_infer:
        return source, target
    # train or test
    return source, target, label
def _read_rank_record(self, line):
    """Parse one pairwise-rank record.

    Expected line format::

        <source words> [TAB] <left_target words> [TAB] <right_target words> [TAB] <label>

    All four fields must be present even in inference mode, where the
    label field is ignored.

    :param line: one raw text line of the data file.
    :return: ``(source, left_target, right_target, label)`` when
        ``self.is_infer`` is false, with ``label`` converted by ``int``;
        otherwise ``(source, left_target, right_target)``. The word
        fields are whatever ``sent2ids`` returns for them with
        ``self.source_dic`` (source) or ``self.target_dic`` (targets).
    :raises AssertionError: if the line does not consist of exactly four
        tab-separated fields.
    :raises ValueError: if the label field is not a valid integer
        (only outside inference mode).
    """
    fs = line.strip().split('\t')
    if len(fs) != 4:
        # Raised explicitly instead of using an ``assert`` statement so the
        # format check is still performed when Python runs with ``-O``.
        raise AssertionError(
            "wrong format for rank\n"
            "the format should be "
            "<source words> [TAB] <left_target words> [TAB] <right_target words> [TAB] <label>")

    source = sent2ids(fs[0], self.source_dic)
    left_target = sent2ids(fs[1], self.target_dic)
    right_target = sent2ids(fs[2], self.target_dic)

    if self.is_infer:
        return source, left_target, right_target
    # The label is only parsed when it is actually returned.
    label = int(fs[3])
    return (source, left_target, right_target, label)
def _read_regression_record(self, line):
    """Parse one regression record.

    Expected line format::

        <source words> [TAB] <target words> [TAB] <label>

    All three fields must be present even in inference mode, where the
    label field is ignored.

    :param line: a string line which represents one record.
    :return: ``(source, target, [label])`` when ``self.is_infer`` is
        false, with ``label`` converted by ``float`` and wrapped in a
        one-element list; otherwise ``(source, target)``. ``source`` and
        ``target`` are whatever ``sent2ids`` returns for the respective
        field with ``self.source_dic`` / ``self.target_dic``.
    :raises AssertionError: if the line does not consist of exactly three
        tab-separated fields.
    :raises ValueError: if the label field is not a valid float
        (only outside inference mode).
    """
    fs = line.strip().split('\t')
    if len(fs) != 3:
        # Raised explicitly instead of using an ``assert`` statement so the
        # format check is still performed when Python runs with ``-O``.
        # The message text is kept exactly as it was.
        raise AssertionError(
            "wrong format for regression\n"
            "the format shoud be "
            "<source words> [TAB] <target words> [TAB] <label>'")

    source = sent2ids(fs[0], self.source_dic)
    target = sent2ids(fs[1], self.target_dic)

    if self.is_infer:
        return source, target
    # The label is only parsed when it is actually returned.
    label = float(fs[2])
    return (
        source,
        target,
        [label],
    )