def load_cola_data(input_path, task, input_format, max_seq_len):
    """Load CoLA-style acceptability data in one of several formats.

    Args:
        input_path: path to the data file to read.
        task: task object supplying ``tokenizer_name``.
        input_format: one of "text" (raw sentences, one per line),
            "train"/"dev" (labeled TSV), or "test" (unlabeled TSV with indices).
        max_seq_len: maximum tokenized sequence length.

    Returns:
        (tokens, labels) — ``labels`` is None for "text" and "test" input.

    Raises:
        KeyError: if ``input_format`` is not one of the recognized values.
    """
    if input_format == "text":
        # Raw text: one sentence per line, no labels available.
        with open(input_path, "r") as f_in:
            raw_lines = f_in.readlines()
        tokens = [
            tokenize_and_truncate(
                tokenizer_name=task.tokenizer_name, sent=line, max_seq_len=max_seq_len
            )
            for line in raw_lines
        ]
        return tokens, None

    if input_format in ("train", "dev"):
        # Labeled TSV: sentence in column 3, label in column 1.
        data = load_tsv(
            task.tokenizer_name, input_path, max_seq_len, s1_idx=3, s2_idx=None, label_idx=1
        )
        return data[0], data[2]

    if input_format == "test":
        # Unlabeled TSV with a header row; indices are requested but labels are absent.
        data = load_tsv(
            task.tokenizer_name,
            input_path,
            max_seq_len,
            s1_idx=1,
            s2_idx=None,
            has_labels=False,
            return_indices=True,
            skip_rows=1,
        )
        return data[0], None

    raise KeyError(input_format)
def load_data(self):
    """Load train/dev/test splits for one acceptability-probing CV fold.

    Reads ``fold{n}/{train,dev,test}.tsv`` under ``self.path``; each split has
    the sentence in column 1 and an "acceptable"/"unacceptable" label in
    column 2, mapped to 1/0.

    Fixes over the original: the identical ``load_tsv`` call was duplicated
    three times, and the train call used ``self._tokenizer_name`` while
    dev/test used ``self.tokenizer_name`` — now one helper and one attribute
    (``_tokenizer_name``, which the original train call proves exists).
    """
    fold_no = self.fold_no
    label_map = {"acceptable": 1, "unacceptable": 0}

    def _load_split(split):
        # All three splits share the same TSV layout.
        return load_tsv(
            self._tokenizer_name,
            os.path.join(self.path, "fold{}/{}.tsv".format(fold_no, split)),
            self.max_seq_len,
            s1_idx=1,
            s2_idx=None,
            label_idx=2,
            label_fn=label_map.__getitem__,
            skip_rows=0,
        )

    self.train_data_text = _load_split("train")
    self.val_data_text = _load_split("dev")
    self.test_data_text = _load_split("test")
    # Vocabulary is built from train + dev sentences only.
    self.sentences = self.train_data_text[0] + self.val_data_text[0]
    log.info(
        "\tFinished loading acceptability probing {} data (fold{}).".format(self.name, fold_no)
    )
def load_data(self):
    """Load NP/S probing data; the single dev.tsv split doubles as
    train/val/test (probing tasks only evaluate)."""
    label_map = {"neutral": 0, "entailment": 1, "contradiction": 2}
    data = load_tsv(
        self._tokenizer_name,
        os.path.join(self.path, "dev.tsv"),
        max_seq_len=self.max_seq_len,
        s1_idx=0,
        s2_idx=1,
        label_idx=2,
        label_fn=lambda label: label_map[label],
        skip_rows=0,
    )
    # Reuse the same data for every split.
    self.train_data_text = data
    self.val_data_text = data
    self.test_data_text = data
    # Sentences come from both sides of the pair.
    self.sentences = self.val_data_text[0] + self.val_data_text[1]
    log.info("\tFinished loading NP/S data.")
def load_data(self):
    """Load NLI-type probing data; dev.tsv is reused as train/val/test.

    The label mapping comes from ``self.targ_map`` (set by the subclass).
    """
    mapping = self.targ_map
    data = load_tsv(
        self._tokenizer_name,
        os.path.join(self.path, "dev.tsv"),
        max_seq_len=self.max_seq_len,
        s1_idx=0,
        s2_idx=1,
        label_idx=2,
        label_fn=lambda label: mapping[label],
        skip_rows=0,
    )
    # Probing tasks evaluate only, so all splits point at the same data.
    self.train_data_text = data
    self.val_data_text = data
    self.test_data_text = data
    self.sentences = self.val_data_text[0] + self.val_data_text[1]
    log.info("\tFinished loading NLI-type probing data on %s." % self.name)
def load_data(self):
    """Load preposition-swap probing data from a single TSV file.

    Sentences are in columns 8 and 9, the label in column 0; the one file
    serves as train/val/test alike.
    """
    data_file = os.path.join(self.path, "all.prepswap.turk.newlabels.tsv")
    data = load_tsv(
        data_file=data_file,
        max_seq_len=self.max_seq_len,
        s1_idx=8,
        s2_idx=9,
        label_idx=0,
        skip_rows=0,
        tokenizer_name=self._tokenizer_name,
    )
    # One shared split for this probing task.
    self.train_data_text = data
    self.val_data_text = data
    self.test_data_text = data
    self.sentences = (
        self.train_data_text[0]
        + self.train_data_text[1]
        + self.val_data_text[0]
        + self.val_data_text[1]
    )
    log.info("\tFinished loading preposition swap data.")
def load_data(self):
    """Load NLI-alt probing data from ``self.probe_path``.

    Labels are string digits "0"/"1"/"2" mapped to ints; row indices are
    returned alongside the sentence pairs, and the single file is reused as
    train/val/test.
    """
    label_map = {"0": 0, "1": 1, "2": 2}
    data = load_tsv(
        data_file=os.path.join(self.path, self.probe_path),
        max_seq_len=self.max_seq_len,
        s1_idx=9,
        s2_idx=10,
        label_idx=1,
        label_fn=lambda label: label_map[label],
        skip_rows=1,
        return_indices=True,
        tokenizer_name=self._tokenizer_name,
    )
    # Same data for every split; probing tasks only evaluate.
    self.train_data_text = data
    self.val_data_text = data
    self.test_data_text = data
    self.sentences = self.val_data_text[0] + self.val_data_text[1]
    log.info("\tFinished loading NLI-alt probing data.")
def load_data(self):
    """Load negation probing data from the TSV at ``self.path``.

    Sentence pair in columns 8/9, NLI label in column 10; the single file is
    shared across train/val/test.
    """
    label_map = {"neutral": 0, "entailment": 1, "contradiction": 2}
    data = load_tsv(
        data_file=os.path.join(self.path),
        max_seq_len=self.max_seq_len,
        s1_idx=8,
        s2_idx=9,
        label_idx=10,
        label_fn=lambda label: label_map[label],
        skip_rows=1,
        tokenizer_name=self._tokenizer_name,
    )
    # One shared split — this task is evaluation-only.
    self.train_data_text = data
    self.val_data_text = data
    self.test_data_text = data
    self.sentences = self.val_data_text[0] + self.val_data_text[1]
    log.info("\tFinished loading negation data.")
def test(self):
    """load_tsv with ``s2_idx=None`` should return an empty second-sentence
    list while still parsing first sentences and labels.

    Fixes over the original: removed a leftover debug ``print(sent2s)`` and a
    duplicate emptiness assertion (``sent2s == []`` and ``len(sent2s) == 0``
    checked the same thing).
    """
    max_seq_len = 30
    sent1s, sent2s, labels = data_loaders.load_tsv(
        "MosesTokenizer",
        self.path,
        max_seq_len,
        s1_idx=0,
        s2_idx=None,
        label_idx=1,
        skip_rows=1,
    )
    assert (
        len(sent1s) == 2
    ), "The length of the set of first sentences != total rows in data file"
    assert sent2s == [], "Second sentence does not exist yet len(sent2s) != 0"
    assert len(labels) == 2, "The length of labels should be equal to rows in data file"
def get_data_iter(self, path):
    """
    Load data file (combine the entailment and contradiction sentence), tokenize
    text and concat sentences to create long term dependencies.
    Args:
        path: (str) data file path
    Yields:
        list of tokens, each chunk at most ``self.max_seq_len`` long.
    """
    seq_len = self.max_seq_len
    targ_map = {"neutral": 0, "entailment": 1, "contradiction": 2}
    # NOTE(review): unlike the other load_tsv calls in this file, this one
    # passes no tokenizer_name and uses targ_idx/targ_map instead of
    # label_idx/label_fn — presumably it targets a different/older load_tsv
    # signature; confirm against the callee. The hard-coded 1000 appears in
    # the position other calls use for max_seq_len — verify.
    data = load_tsv(
        os.path.join(path),
        1000,
        skip_rows=1,
        s1_idx=8,
        s2_idx=9,
        targ_idx=11,
        targ_map=targ_map,
    )
    tokens = []
    # Concatenate both sentences of every pair into one long token stream,
    # separated by "<EOS>". The [1:-1] slices drop the first and last token
    # of each sentence — presumably boundary markers added by the tokenizer;
    # TODO confirm.
    for x, y in zip(data[0], data[1]):
        tokens += x[1:-1] + ["<EOS>"] + y[1:-1] + ["<EOS>"]
    # Emit fixed-size windows of seq_len tokens; the last window may be short.
    for i in range(0, len(tokens), seq_len):
        yield tokens[i : i + seq_len]
def test(self):
    """load_tsv with ``return_indices`` should return tokenized sentence
    pairs, labels, and row indices, one entry per data row.

    Fix over the original: the boolean keyword was passed as
    ``return_indices=1``; it now uses ``True`` explicitly.
    """
    max_seq_len = 30
    sent1s, sent2s, labels, indices = data_loaders.load_tsv(
        "MosesTokenizer",
        self.path,
        max_seq_len,
        s1_idx=0,
        s2_idx=1,
        return_indices=True,
        label_idx=1,
        skip_rows=1,
    )
    assert "charming" in sent1s[0], "sent1s is not tokenized first sentence"
    assert "agree" in sent2s[0], "sent2s is not tokenized second sentence"
    assert (
        len(sent1s) == 2
    ), "The length of the set of first sentences != total rows in data file"
    assert (
        len(sent2s) == 2
    ), "The length of the set of second sentences != total rows in data file"
    assert len(labels) == 2, "The length of labels should be equal to num rows in data file"
    assert (
        len(indices) == 2
    ), "The length of returned indices should be equal to num rows in data file"