import os
import random
from typing import Dict, List

# `Dataset` and the helper functions used below (`loads`, `dumps`,
# `add_noise`, `partion_list`, `get_filter_data`) come from elsewhere in
# this project; illustrative sketches are given after their call sites.


def construct_noise_corpus(path: str, noise_prob: float) -> List[Dict]:
    ori_datas = loads(open(path))[1:]
    statistic = {'length': len(ori_datas)}
    datas = [statistic]
    count = 0
    src_len, tgt_len = [], []
    for d in ori_datas:
        data = {
            'source': add_noise(d['source'], noise_prob),
            'target': d['target']
        }
        datas.append(data)
        src_len.append(len(data['source'].split(' ')))
        tgt_len.append(len(data['target'].split(' ')))
        count += 1
        if count % 100000 == 0:
            print(count)
    print('max len: ', max(src_len), max(tgt_len))
    print('avg len: ', sum(src_len) * 1.0 / len(src_len),
          sum(tgt_len) * 1.0 / len(tgt_len))
    return datas
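# Minimal sketches of the assumed I/O and noising helpers. `loads`/`dumps`
# are taken to wrap a JSON-lines file (one object per line, with a
# statistics header as the first record), and `add_noise` to drop tokens
# independently with probability `noise_prob`. These bodies are assumptions
# inferred from the call sites, not this repo's actual implementations.
import json


def loads(f) -> List[Dict]:
    # One JSON object per line.
    return [json.loads(line) for line in f]


def dumps(datas: List[Dict], f) -> None:
    # Mirror image of `loads`: one JSON object per line.
    for d in datas:
        print(json.dumps(d), file=f)


def add_noise(sentence: str, noise_prob: float) -> str:
    # Drop each token with probability `noise_prob`; keep the sentence
    # unchanged if every token would be dropped.
    words = sentence.split(' ')
    kept = [w for w in words if random.random() >= noise_prob]
    return ' '.join(kept) if kept else sentence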
def construct_table2pivot(path: str, index: List[int] = None) -> List[Dict]:
    ori_datas = loads(open(path))[1:]
    if index is not None:
        ori_datas = partion_list(ori_datas, index)
    statistic = {'length': len(ori_datas)}
    datas = [statistic]
    count = 0
    src_len, tgt_len = [], []
    for d in ori_datas:
        data = {
            'value': d['value'],
            'label': d['label'],
            'field': d['field'],
            'lpos': d['lpos'],
            'rpos': d['rpos']
        }
        datas.append(data)
        src_len.append(len(data['value'].split(' ')))
        tgt_len.append(len(data['label'].split(' ')))
        count += 1
        if count % 100000 == 0:
            print(count)
    print('max len: ', max(src_len), max(tgt_len))
    print('avg len: ', sum(src_len) * 1.0 / len(src_len),
          sum(tgt_len) * 1.0 / len(tgt_len))
    return datas
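# Sketch of the assumed `partion_list` helper: from its call sites it
# appears to select the records at the given indices. The spelling is kept
# as used above; the body is an assumption.


def partion_list(datas: List, index: List[int]) -> List:
    return [datas[i] for i in index]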
def construct_pivot2text(path: str, index: List[int] = None) -> List[Dict]:
    ori_datas = loads(open(path))[1:]
    if index is not None:
        index = set(index)
    statistic = {'length': len(ori_datas)}
    datas = [statistic]
    count = 0
    src_len, tgt_len = [], []
    for i, d in enumerate(ori_datas):
        if index is None or i in index:
            data = {'source': d['pivot'], 'target': d['text']}
        else:
            data = {'source': d['entity'], 'target': d['text']}
        datas.append(data)
        src_len.append(len(data['source'].split(' ')))
        tgt_len.append(len(data['target'].split(' ')))
        count += 1
        if count % 100000 == 0:
            print(count)
    print('max len: ', max(src_len), max(tgt_len))
    print('avg len: ', sum(src_len) * 1.0 / len(src_len),
          sum(tgt_len) * 1.0 / len(tgt_len))
    return datas
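# For example, with index = [0], a file holding the two records
#     {'pivot': 'name : john', 'entity': 'john', 'text': 'john lives in london'}
#     {'pivot': 'name : mary', 'entity': 'mary', 'text': 'mary lives in paris'}
# yields source 'name : john' for record 0 (gold pivot kept) and source
# 'mary' for record 1 (entity fallback). The field values above are purely
# illustrative.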
def construct_super_corpus(path: str, index: List[int] = None) -> List[Dict]:
    # `index` was referenced but never defined in the original; it is made
    # an optional parameter here, mirroring construct_table2pivot.
    ori_datas = loads(open(path))[1:]
    if index is not None:
        ori_datas = partion_list(ori_datas, index)
    statistic = {'length': len(ori_datas)}
    datas = [statistic] + ori_datas
    return datas
def test_p2t_dataset(r_path: str, w_path: str) -> None:
    '''
    value, text, field, lpos, rpos, pivot, entity
    '''
    ori_datas = loads(open(r_path))[1:]
    statistic = {'length': len(ori_datas)}
    datas = [statistic]
    for d in ori_datas:
        datas.append({'source': d['pivot'], 'target': d['text']})
    dumps(datas, open(w_path, 'w'))
def train_p2t_dataset(r_path: str, w_path: str, index: List[int]) -> None:
    '''
    value, text, field, lpos, rpos, pivot, entity
    '''
    ori_datas = loads(open(r_path))[1:]
    statistic = {'length': len(ori_datas)}
    datas = [statistic]
    index = set(index)
    for i, d in enumerate(ori_datas):
        if i in index:
            # datas.append({'source': d['pivot'], 'target': d['text']})
            datas.append(get_filter_data(d))
        else:
            datas.append({'source': d['entity'], 'target': d['text']})
    dumps(datas, open(w_path, 'w'))
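# Sketch of the assumed `get_filter_data` helper. Judging from the
# commented-out line it replaced, it plausibly builds the pivot-to-text
# pair, perhaps with extra filtering; the minimal version below is an
# assumption, not this repo's actual code.


def get_filter_data(d: Dict) -> Dict:
    return {'source': d['pivot'], 'target': d['text']}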
def print_result_into_file(self, model_outputs: Dict, dataset: Dataset):
    # `model_outputs['output_ids']` acts as a per-token keep/drop mask over
    # the source tokens: positions with id > 0 are kept. For example, with
    # ids = [1, 0, 1] and source = ['john', 'was', 'born'], the kept words
    # are ['john', 'born'].
    model_ids = model_outputs['output_ids']
    sources, model_words = [], []
    fields, lposs, rposs = [], [], []
    _fields, _lposs, _rposs = [], [], []
    for data in dataset.read():
        sources.append(data['value'].split(' '))
        fields.append(data['field'].split(' '))
        lposs.append(data['lpos'].split(' '))
        rposs.append(data['rpos'].split(' '))
    for ids, source, field, lpos, rpos in zip(model_ids, sources, fields,
                                              lposs, rposs):
        # Keep the tokens, and their aligned field/position tags, that the
        # model marked with a positive id.
        words = [s for i, s in zip(ids, source) if i > 0]
        _field = [s for i, s in zip(ids, field) if i > 0]
        _lpos = [s for i, s in zip(ids, lpos) if i > 0]
        _rpos = [s for i, s in zip(ids, rpos) if i > 0]
        model_words.append(' '.join(words))
        _fields.append(' '.join(_field))
        _lposs.append(' '.join(_lpos))
        _rposs.append(' '.join(_rpos))
    with open(
            os.path.join(self.data_path,
                         'predict-{0}.txt'.format(self.scale)), 'w') as f:
        print('\n'.join(model_words), file=f)
    ori_datas = loads(open(os.path.join(self.data_path, 'test.p2t.jsonl')))
    # Overwrite each test record's source and aligned tags with the model's
    # predictions, keeping the statistics header at index 0 untouched.
    for m, d, f, l, r in zip(model_words, ori_datas[1:], _fields, _lposs,
                             _rposs):
        d['source'] = m
        d['field'] = f
        d['lpos'] = l
        d['rpos'] = r
    dumps(
        ori_datas,
        open(
            os.path.join(self.data_path,
                         'test.predict.{0}.jsonl'.format(self.scale)), 'w'))
def test_t2p_dataset(r_path: str, w_path: str) -> None:
    '''
    value, label, field, lpos, rpos
    '''
    ori_datas = loads(open(r_path))[1:]
    statistic = {'length': len(ori_datas)}
    datas = [statistic]
    for d in ori_datas:
        data = {
            'value': d['value'],
            'label': d['label'],
            'field': d['field'],
            'lpos': d['lpos'],
            'rpos': d['rpos']
        }
        datas.append(data)
    dumps(datas, open(w_path, 'w'))
def test_parallel_dataset(r_path: str, w_path: str) -> None:
    '''
    source: value; target: text.
    '''
    ori_datas = loads(open(r_path))[1:]
    statistic = {'length': len(ori_datas)}
    datas = [statistic]
    for d in ori_datas:
        data = {
            'source': d['value'],
            'target': d['text'],
            'field': d['field'],
            'lpos': d['lpos'],
            'rpos': d['rpos']
        }
        datas.append(data)
    dumps(datas, open(w_path, 'w'))
def get_partion_index(path: str, limit: int) -> List[int]:
    # Sample `limit` random record indices without replacement.
    ori_datas = loads(open(path))[1:]
    index = list(range(len(ori_datas)))
    random.shuffle(index)
    return index[:limit]
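# Example end-to-end usage for the semi-supervised split. The file names
# and the limit are placeholders, not paths taken from this repo:
#
#     index = get_partion_index('train.p2t.jsonl', limit=1000)
#     train_p2t_dataset('train.p2t.jsonl', 'train.mixed.jsonl', index)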