Example #1
def rouge_extraction(read_file, write_file):
    rouge = Rouge()
    data = JSON.load(read_file)
    for line in data:
        # score each (comment, reply) pair and attach the result in place
        scores = rouge.get_scores(line["data"]["text_information"]["comment"],
                                  line["data"]["text_information"]["reply"])[0]
        line["data"]["rouge_scores"] = scores
    JSON.dump(data, write_file)
    return data
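
A minimal driver sketch, assuming JSON.load/JSON.dump accept file paths (as the other examples suggest) and that Rouge comes from the third-party rouge package; the file names are placeholders:

from rouge import Rouge  # assumed dependency

# Hypothetical paths; rouge_extraction also returns the scored data.
scored = rouge_extraction("replies.json", "replies_with_rouge.json")
print(scored[0]["data"]["rouge_scores"])  # {'rouge-1': ..., 'rouge-2': ..., 'rouge-l': ...}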
Example #2
def prepare_iter(filename, firstline=True, task=2):
    # load datasets to map into indexes
    if filename.split(".")[-1] == "csv":
        data_iter = CSV.get_iterator(filename,
                                     firstline=firstline,
                                     task=task)
        num_lines = CSV._len(filename)
    elif filename.split(".")[-1] == "json":
        data_iter = JSON.get_iterator(filename, task=task)
        num_lines = JSON._len(filename)
    else:
        raise Exception("Not implemented yet")
    return data_iter, num_lines
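
A consumption sketch, assuming the iterator yields (source, target) pairs as Example #7 suggests; the file name is a placeholder:

data_iter, num_lines = prepare_iter("train.csv", firstline=True, task=2)
print("samples:", num_lines)
for nl, target in data_iter:  # assumed pair layout
    pass  # placeholder for per-sample work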
Example #3
    def pack_batch(self, test_file, batch_size=8):
        ftype = test_file.split(".")[-1]
        if ftype == "json":
            data = JSON.load(test_file)
            # random.shuffle(data)
            entries = []
            for entry in data:
                if len(entries) == batch_size:
                    yield entries
                    entries = []
                entries.append(entry)
            if len(entries) != 0:
                yield entries
        elif ftype == "csv":
            data = CSV.read(test_file)
            entries = []
            for row in data:
                if len(entries) == batch_size:
                    yield entries
                    entries = []
                entry = self.prepare_entry(row[0])
                entry["gold_output"] = CSV.process_target(row[-1])
                entries.append(entry)
            if len(entries) != 0:
                yield entries
        else:
            print("not implemented yet")
            return
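
Because pack_batch is a generator, batches can be consumed lazily; a sketch, where model stands in for whatever object defines pack_batch and prepare_entry, and the file name is a placeholder:

for batch in model.pack_batch("test.json", batch_size=8):
    assert len(batch) <= 8  # the tail batch may be smaller
    run_inference(batch)    # hypothetical per-batch step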
Example #4
def write_csv(data, file_name, title=""):
    # data.sort(key=lambda x: len(x[0]), reverse=False)
    file_type = file_name.split(".")[-1]
    if file_type == "json":
        JSON.dump(data, file_name)
    else:
        with open(file_name, "w", newline='') as f:
            if file_type != "csv":
                if len(title) != 0:
                    f.write(title + "\n")
                for line in data:
                    f.write(line + "\n")
            else:
                writer = csv.writer(f, delimiter=",")
                if len(title) != 0:
                    writer.writerow(title)  # expects a sequence of column names
                writer.writerows(data)
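
A sketch of the three dispatch paths with placeholder file names; note that the CSV branch expects title to be a sequence of column names, while the plain-text branch expects a string:

write_csv([{"id": 1}], "dump.json")                                    # delegates to JSON.dump
write_csv(["line one", "line two"], "notes.txt", title="header")       # plain text, one line per item
write_csv([["hello", "hi"]], "pairs.csv", title=["source", "target"])  # proper CSV rows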
Example #5
def read_data(filename, firstline=True):
    # load datasets to map into indexes
    if filename.split(".")[-1] == "csv":
        data = CSV.read(filename, firstline=firstline, slices=[0, 1])
    elif filename.split(".")[-1] == "txt":
        data = TXT.read(filename, firstline=firstline)
    elif filename.split(".")[-1] == "json":
        data = JSON.load(filename)
    else:
        raise Exception("Not implemented yet")
    return data
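
A call sketch; the file name is a placeholder, and the slices=[0, 1] argument suggests only the first two CSV columns are kept:

pairs = read_data("data.csv", firstline=True)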
Example #6
def load_file(files, firstline=True, task=2):
    datasets = []
    for fname in files:
        # Read input files
        if fname.split(".")[-1] == "csv":
            datasets.append(
                CSV(fname, limit=-1, firstline=firstline, task=task))
        elif fname.split(".")[-1] == "json":
            datasets.append(JSON(fname, limit=-1, task=task))
        else:
            raise Exception("Not implemented yet")
    return datasets
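
A usage sketch; both file names are placeholders, and the (source, target) pair layout is assumed from Example #7:

datasets = load_file(["train.csv", "extra.json"], firstline=True, task=2)
for dataset in datasets:
    for nl, target in dataset:
        pass  # placeholder for per-sample work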
Example #7
    def build(self, files, limit=-1, firstline=True):
        """
        Read a list of file names, return vocabulary
        :param files: list of file names
        :param limit: read number of lines
        """
        swcnt, swl = Counter(), 0
        twcnt, twl = Counter(), 0
        count = 0

        for fname in files:
            # Read input files
            if fname.split(".")[-1] == "csv":
                raw = CSV(fname, limit=limit, firstline=firstline)
            elif fname.split(".")[-1] == "json":
                raw = JSON(fname, source2idx=None, target2idx=None, limit=-1)
            else:
                raise Exception("Not implemented yet")

            for line in raw:
                count += 1
                (nl, target) = line
                nl = Vocab.process_nl(nl)
                target = Vocab.process_target(target)
                swcnt, swl = Vocab.update_sent(nl, swcnt, swl)
                twcnt, twl = Vocab.update_sent(target, twcnt, twl)

        swvocab = Vocab.update_vocab(swcnt, self.swcutoff, sys_tokens)
        twvocab = Vocab.update_vocab(twcnt, self.twcutoff, sys_tokens)
        self.sw2i = swvocab
        self.i2sw = Vocab.reversed_dict(swvocab)
        self.swl = swl if self.swl < 0 else min(swl, self.swl)

        self.tw2i = twvocab
        self.i2tw = Vocab.reversed_dict(twvocab)
        self.twl = twl if self.twl < 0 else min(twl, self.twl)

        print("\t- Extracting vocabulary: %d total samples" % count)

        print("\t\t- Natural Language Side: ")
        print("\t\t\t- %d total words" % (sum(swcnt.values())))
        print("\t\t\t- %d unique words" % (len(swcnt)))
        print("\t\t\t- %d unique words appearing at least %d times" %
              (len(swvocab) - 4, self.swcutoff))
        print("\t\t- Label Side: ")
        print("\t\t\t- %d total words" % (sum(twcnt.values())))
        print("\t\t\t- %d unique words" % (len(twcnt)))
        print("\t\t\t- %d unique words appearing at least %d times" %
              (len(twvocab) - 4, self.twcutoff))
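
Example #11 below shows this method in context; a condensed sketch, where the [-1, 1] parameter pairs are taken verbatim from that example and the file name is a placeholder:

vocab = Vocab([-1, 1], [-1, 1])
vocab.build(["train.json"])
print(len(vocab.sw2i), len(vocab.tw2i))  # source/target vocabulary sizes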
Example #8
def write_dataset(data_file,
                  train_file,
                  val_file,
                  test_file,
                  tr_ratio=0.9,
                  val_ratio=0.95,
                  shuffle=True,
                  readfirstline=False,
                  writefirstline=False):

    title = ""
    file_type = data_file.split(".")[-1]
    if file_type == "json":
        corpus = JSON.load(data_file)
    else:
        corpus = set()
        with open(data_file, "r") as f:
            if file_type == "csv":
                csvreader = csv.reader(f)
                if readfirstline:
                    title = next(csvreader)
                for line in csvreader:
                    corpus.update([tuple([line[0], line[-1]])])
            else:
                if readfirstline:
                    title = next(f)
                for line in f:
                    corpus.update([line.strip()])
            corpus = list(corpus)
    train_len = int(tr_ratio * len(corpus))
    val_len = int(val_ratio * len(corpus))
    if shuffle:
        np.random.shuffle(corpus)
        train, val, test = np.split(corpus, [train_len, val_len])
        train = train.tolist()
        val = val.tolist()
        test = test.tolist()
    else:
        train = corpus[:train_len]
        val = corpus[train_len:val_len]
        test = corpus[val_len:]

    if not writefirstline:
        title = ""
    if len(train) != 0:
        write_csv(train, train_file, title)
    if len(val) != 0:
        write_csv(val, val_file, title)
    if len(test) != 0:
        write_csv(test, test_file, title)
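
A call sketch with placeholder file names; note that tr_ratio and val_ratio are cumulative split points, so 0.9 and 0.95 yield a 90/5/5 train/val/test split:

write_dataset("corpus.csv", "train.csv", "val.csv", "test.csv",
              tr_ratio=0.9, val_ratio=0.95, shuffle=True)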
Example #9
def prepare_entry(task, rv_text, rv_rate, rv_name, rv_title, rv_hotel, rouge_score=None):
    if task == "sentiment":
        prompt_text = rv_text
    elif task == "paraphrase":
        prompt_text = rv_text
    else:
        if rouge_score is not None:
            # prompt_text = " ".join([rouge_score, SENSP, rv_rate.lower(), SENSP, rv_hotel.lower(), SENSP,
            #                         rv_name.lower(), SENSP, rv_title.lower(), SENSP, rv_text.lower(), SENGE])
            prompt_text = " ".join([rouge_score, rv_hotel.lower(), rv_name.lower(), rv_text.lower(), SENGE])
        else:
            # prompt_text = " ".join([rv_rate.lower(), SENSP, rv_hotel.lower(), SENSP, rv_name.lower(), SENSP,
            #                         rv_title.lower(), SENSP, rv_text.lower(), SENGE])
            prompt_text = " ".join([rv_hotel.lower(), rv_name.lower(), rv_text.lower(), SENGE])
    prompt_text = JSON.process_nl(prompt_text)
    return prompt_text
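
A call sketch; the task value "response" is hypothetical (any value other than "sentiment" or "paraphrase" reaches the prompt-building branch), and SENGE plus JSON.process_nl come from the surrounding project:

prompt = prepare_entry("response", rv_text="Great stay!", rv_rate="5",
                       rv_name="Alex", rv_title="Loved it", rv_hotel="Hotel X")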
Example #10
    tg2ids = BPE.tokens2ids(tokenizer,
                            sos=False,
                            eos=False,
                            add_special_tokens=False)

    pad_id = tokenizer.token_to_id(BPAD) or 0  # fall back to 0 when BPAD is absent

    collate_fn = BPE.collate_fn(pad_id, True)

    # load datasets to map into indexes
    if filename.split(".")[-1] == "csv":
        train_data = CSV.get_iterator(filename, firstline=True, task=2)
        num_lines = CSV._len(filename)
    elif filename.split(".")[-1] == "json":
        train_data = JSON.get_iterator(filename, task=2)
        num_lines = JSON._len(filename)
    else:
        raise Exception("Not implemented yet")

    train_iterdataset = IterDataset(train_data,
                                    source2idx=nl2ids,
                                    target2idx=lb2ids,
                                    num_lines=num_lines,
                                    bpe=True)
    train_dataloader = DataLoader(train_iterdataset,
                                  pin_memory=True,
                                  batch_size=8,
                                  collate_fn=collate_fn)

    for i, batch in enumerate(train_dataloader):
Example #11
    filename = "/media/data/review_response/Dev.json"

    s_paras = [-1, 1]
    t_paras = [-1, 1]

    vocab = Vocab(s_paras, t_paras)
    vocab.build([filename])

    nl2ids = vocab.lst2idx(vocab_words=vocab.sw2i, unk_words=True, eos=True)

    tg2ids = vocab.lst2idx(vocab_words=vocab.tw2i,
                           unk_words=False,
                           sos=True,
                           eos=True)

    train_data = JSON(filename, source2idx=nl2ids, target2idx=tg2ids)
    # train_data = Csvfile(filename)

    data_idx = []
    batch = 8
    for d in Vocab.minibatches(train_data, batch):
        data_idx.append(d)
        nl, target = list(zip(*d))

        nl_pad_ids, nl_lens = seqPAD.pad_sequences(nl,
                                                   pad_tok=vocab.sw2i[PAD],
                                                   nlevels=1)
        nl_tensor = Data2tensor.idx2tensor(nl_pad_ids,
                                           dtype=torch.long,
                                           device=device)
        nl_len_tensor = Data2tensor.idx2tensor(nl_lens,