Example #1
def load_cola_data(input_path, task, input_format, max_seq_len):
    # Dispatch on input_format: raw text lines, train/dev TSV, or test TSV.
    if input_format == "text":
        with open(input_path, "r") as f_in:
            sentences = f_in.readlines()
        tokens = [
            tokenize_and_truncate(tokenizer_name=task.tokenizer_name,
                                  sent=sentence,
                                  max_seq_len=max_seq_len)
            for sentence in sentences
        ]
        labels = None
    elif input_format in ("train", "dev"):
        data = load_tsv(task.tokenizer_name,
                        input_path,
                        max_seq_len,
                        s1_idx=3,
                        s2_idx=None,
                        label_idx=1)
        tokens, labels = data[0], data[2]
    elif input_format == "test":
        data = load_tsv(
            task.tokenizer_name,
            input_path,
            max_seq_len,
            s1_idx=1,
            s2_idx=None,
            has_labels=False,
            return_indices=True,
            skip_rows=1,
        )
        tokens, labels = data[0], None
    else:
        raise KeyError(input_format)
    return tokens, labels
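
Every example in this section is built around a jiant-style load_tsv. As a rough mental model (inferred from the call sites shown here, not jiant's actual implementation), it reads a TSV file, tokenizes the selected sentence columns, and returns a (sent1s, sent2s, labels[, indices]) tuple. A minimal hypothetical stand-in:

# Hypothetical stand-in for load_tsv, inferred from the call sites in
# this section; the real jiant implementation also handles tokenizer
# selection, label types, and more. Example #9 below calls an older
# variant that takes targ_idx/targ_map instead.
def load_tsv(tokenizer_name, data_file, max_seq_len, s1_idx=0, s2_idx=None,
             label_idx=None, label_fn=None, has_labels=True,
             return_indices=False, skip_rows=0):
    sent1s, sent2s, labels, indices = [], [], [], []
    with open(data_file, "r") as f:
        for row_idx, line in enumerate(f):
            if row_idx < skip_rows:
                continue
            cols = line.rstrip("\n").split("\t")
            # Whitespace tokenization stands in for the real tokenizer.
            sent1s.append(cols[s1_idx].split()[:max_seq_len])
            if s2_idx is not None:
                sent2s.append(cols[s2_idx].split()[:max_seq_len])
            if has_labels and label_idx is not None:
                raw = cols[label_idx]
                labels.append(label_fn(raw) if label_fn is not None else raw)
            indices.append(row_idx)
    if return_indices:
        return sent1s, sent2s, labels, indices
    return sent1s, sent2s, labels
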
Example #2
    def load_data(self):
        fold_no = self.fold_no
        tr_data = load_tsv(
            self._tokenizer_name,
            os.path.join(self.path, "fold{}/train.tsv".format(fold_no)),
            self.max_seq_len,
            s1_idx=1,
            s2_idx=None,
            label_idx=2,
            label_fn=lambda label_str: {
                "acceptable": 1,
                "unacceptable": 0
            }[label_str],
            skip_rows=0,
        )
        val_data = load_tsv(
            self._tokenizer_name,
            os.path.join(self.path, "fold{}/dev.tsv".format(fold_no)),
            self.max_seq_len,
            s1_idx=1,
            s2_idx=None,
            label_idx=2,
            label_fn=lambda label_str: {
                "acceptable": 1,
                "unacceptable": 0
            }[label_str],
            skip_rows=0,
        )
        te_data = load_tsv(
            self._tokenizer_name,
            os.path.join(self.path, "fold{}/test.tsv".format(fold_no)),
            self.max_seq_len,
            s1_idx=1,
            s2_idx=None,
            label_idx=2,
            label_fn=lambda label_str: {
                "acceptable": 1,
                "unacceptable": 0
            }[label_str],
            skip_rows=0,
        )

        self.train_data_text = tr_data
        self.val_data_text = val_data
        self.test_data_text = te_data
        self.sentences = self.train_data_text[0] + self.val_data_text[0]
        log.info("\tFinished loading acceptability probing {} data (fold{}).".
                 format(self.name, fold_no))
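
The acceptability label_fn lambda above is repeated verbatim for all three splits. One way to avoid the duplication (a refactor suggestion, not part of the original code) is to share a single mapping, mirroring the targ_map.__getitem__ pattern used in Examples #3 and #4:

# Suggested refactor (hypothetical, not in the original code):
ACCEPTABILITY_MAP = {"acceptable": 1, "unacceptable": 0}
# ...then pass label_fn=ACCEPTABILITY_MAP.__getitem__ to each load_tsv call.
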
Example #3
    def load_data(self):
        targ_map = {"neutral": 0, "entailment": 1, "contradiction": 2}
        prob_data = load_tsv(
            self._tokenizer_name,
            os.path.join(self.path, "dev.tsv"),
            max_seq_len=self.max_seq_len,
            s1_idx=0,
            s2_idx=1,
            label_idx=2,
            label_fn=targ_map.__getitem__,
            skip_rows=0,
        )
        # Probing tasks reuse the same split for train, val, and test.
        self.train_data_text = self.val_data_text = self.test_data_text = prob_data
        self.sentences = self.val_data_text[0] + self.val_data_text[1]
        log.info("\tFinished loading NP/S data.")
Example #4
    def load_data(self):
        targ_map = self.targ_map
        prob_data = load_tsv(
            self._tokenizer_name,
            os.path.join(self.path, "dev.tsv"),
            max_seq_len=self.max_seq_len,
            s1_idx=0,
            s2_idx=1,
            label_idx=2,
            label_fn=targ_map.__getitem__,
            skip_rows=0,
        )
        self.train_data_text = self.val_data_text = self.test_data_text = prob_data
        self.sentences = self.val_data_text[0] + self.val_data_text[1]
        log.info("\tFinished loading NLI-type probing data on %s." % self.name)
Example #5
    def load_data(self):
        prob_data = load_tsv(
            data_file=os.path.join(self.path,
                                   "all.prepswap.turk.newlabels.tsv"),
            max_seq_len=self.max_seq_len,
            s1_idx=8,
            s2_idx=9,
            label_idx=0,
            skip_rows=0,
            tokenizer_name=self._tokenizer_name,
        )
        self.train_data_text = self.val_data_text = self.test_data_text = prob_data
        # Note: train and val alias the same tuple here, so this
        # concatenation includes every sentence twice.
        self.sentences = (self.train_data_text[0] + self.train_data_text[1] +
                          self.val_data_text[0] + self.val_data_text[1])
        log.info("\tFinished loading preposition swap data.")
Example #6
    def load_data(self):
        targ_map = {"0": 0, "1": 1, "2": 2}
        prob_data = load_tsv(
            data_file=os.path.join(self.path, self.probe_path),
            max_seq_len=self.max_seq_len,
            s1_idx=9,
            s2_idx=10,
            label_idx=1,
            label_fn=targ_map.__getitem__,
            skip_rows=1,
            return_indices=True,
            tokenizer_name=self._tokenizer_name,
        )

        self.train_data_text = self.val_data_text = self.test_data_text = prob_data
        self.sentences = self.val_data_text[0] + self.val_data_text[1]
        log.info("\tFinished loading NLI-alt probing data.")
Example #7
    def load_data(self):
        targ_map = {"neutral": 0, "entailment": 1, "contradiction": 2}

        prob_data = load_tsv(
            data_file=self.path,
            max_seq_len=self.max_seq_len,
            s1_idx=8,
            s2_idx=9,
            label_idx=10,
            label_fn=targ_map.__getitem__,
            skip_rows=1,
            tokenizer_name=self._tokenizer_name,
        )

        self.train_data_text = self.val_data_text = self.test_data_text = prob_data
        self.sentences = self.val_data_text[0] + self.val_data_text[1]
        log.info("\tFinished loading negation data.")
Example #8
    def test(self):
        max_seq_len = 30
        sent1s, sent2s, labels = data_loaders.load_tsv(
            "MosesTokenizer",
            self.path,
            max_seq_len,
            s1_idx=0,
            s2_idx=None,
            label_idx=1,
            skip_rows=1,
        )
        print(sent2s)
        assert sent2s == []
        assert (
            len(sent1s) == 2
        ), "The length of the set of first sentences != total rows in data file"
        assert len(sent2s) == 0, "Second sentence does not exist, yet len(sent2s) != 0"
        assert len(labels) == 2, "The length of labels should equal the number of rows in the data file"
Example #9
    def get_data_iter(self, path):
        """
        Load the data file (combining the entailment and contradiction
        sentences), tokenize the text, and concatenate sentences to create
        long-range dependencies.

        Args:
            path: (str) data file path
        """
        seq_len = self.max_seq_len
        targ_map = {"neutral": 0, "entailment": 1, "contradiction": 2}
        # This example uses an older load_tsv signature with
        # targ_idx/targ_map instead of label_idx/label_fn.
        data = load_tsv(
            path,
            1000,
            skip_rows=1,
            s1_idx=8,
            s2_idx=9,
            targ_idx=11,
            targ_map=targ_map,
        )
        tokens = []
        # Strip boundary tokens from each sentence and join the pair with <EOS>.
        for x, y in zip(data[0], data[1]):
            tokens += x[1:-1] + ["<EOS>"] + y[1:-1] + ["<EOS>"]
        # Yield fixed-length windows over the concatenated token stream.
        for i in range(0, len(tokens), seq_len):
            yield tokens[i : i + seq_len]
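
Since get_data_iter is a generator, a caller streams windows from it directly. A minimal usage sketch (the task object and file path are hypothetical):

# Hypothetical usage: stream fixed-length token windows from a TSV file.
for window in task.get_data_iter("multinli/train.tsv"):
    assert len(window) <= task.max_seq_len  # the last window may be shorter
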
Example #10
    def test(self):
        max_seq_len = 30
        sent1s, sent2s, labels, indices = data_loaders.load_tsv(
            "MosesTokenizer",
            self.path,
            max_seq_len,
            s1_idx=0,
            s2_idx=1,
            return_indices=True,
            label_idx=1,
            skip_rows=1,
        )
        assert "charming" in sent1s[0], "sent1s is not the tokenized first sentence"
        assert "agree" in sent2s[0], "sent2s is not the tokenized second sentence"
        assert (
            len(sent1s) == 2
        ), "The length of the set of first sentences != total rows in data file"
        assert (
            len(sent2s) == 2
        ), "The length of the set of second sentences != total rows in data file"
        assert len(labels) == 2, "The length of labels should equal the number of rows in the data file"
        assert (
            len(indices) == 2
        ), "The length of returned indices should equal the number of rows in the data file"