def rouge_extraction(read_file, write_file):
    rouge = Rouge()
    data = JSON.load(read_file)
    for line in data:
        scores = rouge.get_scores(line["data"]["text_information"]["comment"],
                                  line["data"]["text_information"]["reply"])[0]
        line["data"]["rouge_scores"] = scores
    JSON.dump(data, write_file)
    return data
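# Illustrative sketch of the `rouge` package call used in rouge_extraction (assumes the
# PyPI `rouge` library; the example texts are made up, not taken from this project's data):
def rouge_example():
    from rouge import Rouge
    hypothesis = "thank you for your kind review of our hotel"
    reference = "we appreciate your feedback about the hotel"
    scores = Rouge().get_scores(hypothesis, reference)[0]
    # scores maps "rouge-1", "rouge-2" and "rouge-l" to {"r": ..., "p": ..., "f": ...}
    return scores["rouge-l"]["f"]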
def prepare_iter(filename, firstline=True, task=2):
    # load datasets to map into indexes
    if filename.split(".")[-1] == "csv":
        data_iter = CSV.get_iterator(filename, firstline=firstline, task=task)
        num_lines = CSV._len(filename)
    elif filename.split(".")[-1] == "json":
        data_iter = JSON.get_iterator(filename, task=task)
        num_lines = JSON._len(filename)
    else:
        raise Exception("Not implemented yet")
    return data_iter, num_lines
def pack_batch(self, test_file, batch_size=8):
    ftype = test_file.split(".")[-1]
    if ftype == "json":
        data = JSON.load(test_file)
        # random.shuffle(data)
        entries = []
        for entry in data:
            if len(entries) == batch_size:
                yield entries
                entries = []
            entries.append(entry)
        if len(entries) != 0:
            yield entries
    elif ftype == "csv":
        data = CSV.read(test_file)
        entries = []
        for row in data:
            if len(entries) == batch_size:
                yield entries
                entries = []
            entry = self.prepare_entry(row[0])
            entry["gold_output"] = CSV.process_target(row[-1])
            entries.append(entry)
        if len(entries) != 0:
            yield entries
    else:
        print("not implemented yet")
        return
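# Standalone sketch of the batching pattern used by pack_batch: yield fixed-size chunks
# plus a final partial chunk (plain Python, no project-specific dependencies assumed):
def chunk_example():
    def chunks(items, batch_size):
        batch = []
        for item in items:
            if len(batch) == batch_size:
                yield batch
                batch = []
            batch.append(item)
        if batch:
            yield batch
    return list(chunks(range(10), 4))  # [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]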
def write_csv(data, file_name, title=""):
    # data.sort(key=lambda x: len(x[0]), reverse=False)
    file_type = file_name.split(".")[-1]
    if file_type == "json":
        JSON.dump(data, file_name)
    else:
        with open(file_name, "w", newline='') as f:
            if file_type != "csv":
                if len(title) != 0:
                    f.write(title + "\n")
                for line in data:
                    f.write(line + "\n")
            else:
                writer = csv.writer(f, delimiter=",")
                if len(title) != 0:
                    writer.writerow(title)
                writer.writerows(data)
def read_data(filename, firstline=True):
    # load datasets to map into indexes
    if filename.split(".")[-1] == "csv":
        data = CSV.read(filename, firstline=firstline, slices=[0, 1])
    elif filename.split(".")[-1] == "txt":
        data = TXT.read(filename, firstline=firstline)
    elif filename.split(".")[-1] == "json":
        data = JSON.load(filename)
    else:
        raise Exception("Not implemented yet")
    return data
def load_file(files, firstline=True, task=2):
    datasets = []
    for fname in files:
        # Read input files
        if fname.split(".")[-1] == "csv":
            datasets.append(CSV(fname, limit=-1, firstline=firstline, task=task))
        elif fname.split(".")[-1] == "json":
            datasets.append(JSON(fname, limit=-1, task=task))
        else:
            raise Exception("Not implemented yet")
    return datasets
def build(self, files, limit=-1, firstline=True):
    """
    Read a list of file names, return vocabulary
    :param files: list of file names
    :param limit: read number of lines
    """
    swcnt, swl = Counter(), 0
    twcnt, twl = Counter(), 0
    count = 0
    for fname in files:
        # Read input files
        if fname.split(".")[-1] == "csv":
            raw = CSV(fname, limit=limit, firstline=firstline)
        elif fname.split(".")[-1] == "json":
            raw = JSON(fname, source2idx=None, target2idx=None, limit=-1)
        else:
            raise Exception("Not implemented yet")
        for line in raw:
            count += 1
            (nl, target) = line
            nl = Vocab.process_nl(nl)
            target = Vocab.process_target(target)
            swcnt, swl = Vocab.update_sent(nl, swcnt, swl)
            twcnt, twl = Vocab.update_sent(target, twcnt, twl)

    swvocab = Vocab.update_vocab(swcnt, self.swcutoff, sys_tokens)
    twvocab = Vocab.update_vocab(twcnt, self.twcutoff, sys_tokens)

    self.sw2i = swvocab
    self.i2sw = Vocab.reversed_dict(swvocab)
    self.swl = swl if self.swl < 0 else min(swl, self.swl)

    self.tw2i = twvocab
    self.i2tw = Vocab.reversed_dict(twvocab)
    self.twl = twl if self.twl < 0 else min(twl, self.twl)

    print("\t- Extracting vocabulary: %d total samples" % count)
    print("\t\t- Natural Language Side: ")
    print("\t\t\t- %d total words" % (sum(swcnt.values())))
    print("\t\t\t- %d unique words" % (len(swcnt)))
    print("\t\t\t- %d unique words appearing at least %d times" % (len(swvocab) - 4, self.swcutoff))
    print("\t\t- Label Side: ")
    print("\t\t\t- %d total words" % (sum(twcnt.values())))
    print("\t\t\t- %d unique words" % (len(twcnt)))
    print("\t\t\t- %d unique words appearing at least %d times" % (len(twvocab) - 4, self.twcutoff))
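# Minimal sketch of the cutoff-based vocabulary construction that build() relies on
# (a plain-Python stand-in for Vocab.update_vocab; `specials` is an illustrative
# placeholder for the sys_tokens used in the real implementation):
def vocab_cutoff_example():
    from collections import Counter
    counts = Counter("the the hotel hotel hotel staff was was great".split())
    cutoff = 2
    specials = ["<pad>", "<unk>", "<s>", "</s>"]  # illustrative placeholders
    vocab = {tok: idx for idx, tok in enumerate(specials)}
    for word, freq in counts.most_common():
        if freq >= cutoff:
            vocab[word] = len(vocab)
    return vocab  # e.g. {"<pad>": 0, ..., "hotel": 4, "the": 5, "was": 6}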
def write_dataset(data_file, train_file, val_file, test_file, tr_ratio=0.9, val_ratio=0.95,
                  shuffle=True, readfirstline=False, writefirstline=False):
    title = ""
    file_type = data_file.split(".")[-1]
    if file_type == "json":
        corpus = JSON.load(data_file)
    else:
        corpus = set()
        with open(data_file, "r") as f:
            if file_type == "csv":
                csvreader = csv.reader(f)
                if readfirstline:
                    title = next(csvreader)
                for line in csvreader:
                    corpus.update([tuple([line[0], line[-1]])])
            else:
                if readfirstline:
                    title = next(f)
                for line in f:
                    corpus.update([line.strip()])
    corpus = list(corpus)

    train_len = int(tr_ratio * len(corpus))
    val_len = int(val_ratio * len(corpus))
    if shuffle:
        np.random.shuffle(corpus)
        train, val, test = np.split(corpus, [train_len, val_len])
        train = train.tolist()
        val = val.tolist()
        test = test.tolist()
    else:
        train = corpus[:train_len]
        val = corpus[train_len:val_len]
        test = corpus[val_len:]

    if not writefirstline:
        title = ""
    if len(train) != 0:
        write_csv(train, train_file, title)
    if len(val) != 0:
        write_csv(val, val_file, title)
    if len(test) != 0:
        write_csv(test, test_file, title)
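# Self-contained sketch of the split logic in write_dataset: np.split with the cut points
# [train_len, val_len] partitions the corpus into 90% / 5% / 5% (only numpy assumed):
def split_example():
    import numpy as np
    corpus = list(range(100))
    tr_ratio, val_ratio = 0.9, 0.95
    train_len = int(tr_ratio * len(corpus))
    val_len = int(val_ratio * len(corpus))
    np.random.shuffle(corpus)
    train, val, test = np.split(corpus, [train_len, val_len])
    return len(train), len(val), len(test)  # (90, 5, 5)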
def prepare_entry(task, rv_text, rv_rate, rv_name, rv_title, rv_hotel, rouge_score=None):
    if task == "sentiment":
        prompt_text = rv_text
    elif task == "paraphrase":
        prompt_text = rv_text
    else:
        if rouge_score is not None:
            # prompt_text = " ".join([rouge_score, SENSP, rv_rate.lower(), SENSP, rv_hotel.lower(), SENSP,
            #                         rv_name.lower(), SENSP, rv_title.lower(), SENSP, rv_text.lower(), SENGE])
            prompt_text = " ".join([rouge_score, rv_hotel.lower(), rv_name.lower(), rv_text.lower(), SENGE])
        else:
            # prompt_text = " ".join([rv_rate.lower(), SENSP, rv_hotel.lower(), SENSP, rv_name.lower(), SENSP,
            #                         rv_title.lower(), SENSP, rv_text.lower(), SENGE])
            prompt_text = " ".join([rv_hotel.lower(), rv_name.lower(), rv_text.lower(), SENGE])
    prompt_text = JSON.process_nl(prompt_text)
    return prompt_text
tg2ids = BPE.tokens2ids(tokenizer, sos=False, eos=False, add_special_tokens=False)
pad_id = tokenizer.token_to_id(BPAD) if tokenizer.token_to_id(BPAD) else 0
collate_fn = BPE.collate_fn(pad_id, True)
# load datasets to map into indexes
if filename.split(".")[-1] == "csv":
    train_data = CSV.get_iterator(filename, firstline=True, task=2)
    num_lines = CSV._len(filename)
elif filename.split(".")[-1] == "json":
    train_data = JSON.get_iterator(filename, task=2)
    num_lines = JSON._len(filename)
else:
    raise Exception("Not implemented yet")

train_iterdataset = IterDataset(train_data, source2idx=nl2ids, target2idx=lb2ids,
                                num_lines=num_lines, bpe=True)
train_dataloader = DataLoader(train_iterdataset, pin_memory=True, batch_size=8,
                              collate_fn=collate_fn)

for i, batch in enumerate(train_dataloader):
filename = "/media/data/review_response/Dev.json" s_paras = [-1, 1] t_paras = [-1, 1] vocab = Vocab(s_paras, t_paras) vocab.build([filename]) nl2ids = vocab.lst2idx(vocab_words=vocab.sw2i, unk_words=True, eos=True) tg2ids = vocab.lst2idx(vocab_words=vocab.tw2i, unk_words=False, sos=True, eos=True) train_data = JSON(filename, source2idx=nl2ids, target2idx=tg2ids) # train_data = Csvfile(filename) data_idx = [] batch = 8 for d in Vocab.minibatches(train_data, batch): data_idx.append(d) nl, target = list(zip(*d)) nl_pad_ids, nl_lens = seqPAD.pad_sequences(nl, pad_tok=vocab.sw2i[PAD], nlevels=1) nl_tensor = Data2tensor.idx2tensor(nl_pad_ids, dtype=torch.long, device=device) nl_len_tensor = Data2tensor.idx2tensor(nl_lens,