def get_iter(self, split, tokenizer, batch_size=16, shuffle=False, random_state=1, max_length=64, supp_query_split=False):
    """Return a DataLoader over the requested split of this dataset.

    Args:
        split: One of 'train', 'dev', 'test'; selects the pre-loaded dataframe.
        tokenizer: Tokenizer forwarded to ``self.fn_tokenizer``.
        batch_size: Batch size for the returned DataLoader.
        shuffle: Whether the DataLoader shuffles between epochs.
        random_state: Unused here; kept for a uniform ``get_iter`` signature
            across dataset classes.
        max_length: Maximum tokenized sequence length.
        supp_query_split: Forwarded to ``make_dataloader``.

    Returns:
        Iterable (DataLoader) for the specified split.

    Raises:
        ValueError: If ``split`` is not one of the accepted values.
    """
    # Explicit exception instead of ``assert`` so the validation survives
    # running under ``python -O`` (asserts are stripped by optimization).
    if split not in ('train', 'dev', 'test'):
        raise ValueError("split must be 'train', 'dev' or 'test', got {!r}".format(split))
    if split == 'train':
        df = self.df_train
    elif split == 'dev':
        df = self.df_dev
    else:
        df = self.df_test
    sentences = adjust_twitter_tokenization(df.text)
    input_ids, attention_masks = self.fn_tokenizer(sentences, tokenizer, max_length=max_length)
    labels = torch.tensor(df.label.values)
    return make_dataloader(self.NAME, input_ids, labels, attention_masks, batch_size, shuffle, supp_query_split=supp_query_split)
def get_iter(self, split, tokenizer, batch_size=16, shuffle=False, random_state=1, max_length=64, supp_query_split=False):
    """Return a DataLoader over the requested split of this dataset.

    Args:
        split: One of 'train', 'dev', 'test'; resolved via ``self._get_dataframe``.
        tokenizer: Tokenizer forwarded to ``self.fn_tokenizer``.
        batch_size: Batch size for the returned DataLoader.
        shuffle: Whether the DataLoader shuffles between epochs.
        random_state: Unused here; kept for a uniform ``get_iter`` signature
            across dataset classes.
        max_length: Maximum tokenized sequence length.
        supp_query_split: Forwarded to ``make_dataloader``.

    Returns:
        Iterable (DataLoader) for the specified split.
    """
    # NOTE: current iter has only two classes; could be extended to more.
    df = self._get_dataframe(split)
    sentences = adjust_twitter_tokenization(df.Tweet_text.values)
    labels = df.Label.values
    input_ids, attention_masks = self.fn_tokenizer(sentences, tokenizer, max_length=max_length)
    labels = torch.tensor(labels)
    return make_dataloader(self.NAME, input_ids, labels, attention_masks, batch_size, shuffle, supp_query_split=supp_query_split)
def get_iter(self, split, tokenizer, batch_size=16, shuffle=False, random_state=1, max_length=64, supp_query_split=False):
    """Return a DataLoader over the requested split of the SemEval-18 task-1 data.

    Args:
        split: One of 'train', 'dev', 'test'; selects the on-disk file.
        tokenizer: Tokenizer forwarded to ``self.fn_tokenizer``.
        batch_size: Batch size for the returned DataLoader.
        shuffle: Whether the DataLoader shuffles between epochs.
        random_state: Unused here; kept for a uniform ``get_iter`` signature
            across dataset classes.
        max_length: Maximum tokenized sequence length.
        supp_query_split: Forwarded to ``make_dataloader``.

    Returns:
        Iterable (DataLoader) for the specified split; labels are the
        multi-label emotion columns listed in ``self.emotions``.

    Raises:
        ValueError: If ``split`` is not one of the accepted values.
    """
    # Explicit exception instead of ``assert`` so the validation survives
    # running under ``python -O`` (asserts are stripped by optimization).
    if split not in ('train', 'dev', 'test'):
        raise ValueError("split must be 'train', 'dev' or 'test', got {!r}".format(split))
    # Load dataset into a pandas DataFrame, then extract columns as numpy arrays.
    data_df = pd.read_csv('./data/semeval18_task1_class/{}.txt'.format(split), sep='\t')
    sentences = adjust_twitter_tokenization(data_df.Tweet.values)
    labels = data_df[self.emotions].values
    input_ids, attention_masks = self.fn_tokenizer(sentences, tokenizer, max_length=max_length)
    labels = torch.tensor(labels)
    return make_dataloader(self.NAME, input_ids, labels, attention_masks, batch_size, shuffle, supp_query_split=supp_query_split)
def get_iter(self, split, tokenizer, batch_size=16, shuffle=False, random_state=1, max_length=64, supp_query_split=False):
    """Return a DataLoader over the requested split of the OLID v1.0 data.

    Args:
        split: 'train', 'dev' or 'test'. NOTE: 'dev' currently aliases the
            official test set — there is no separate dev split yet.
        tokenizer: Tokenizer forwarded to ``self.fn_tokenizer``.
        batch_size: Batch size for the returned DataLoader.
        shuffle: Whether the DataLoader shuffles between epochs.
        random_state: Unused here; kept for a uniform ``get_iter`` signature
            across dataset classes.
        max_length: Maximum tokenized sequence length.
        supp_query_split: Forwarded to ``make_dataloader``.

    Returns:
        Iterable (DataLoader) for the specified split; labels are binary
        (1 = 'OFF' offensive, 0 = 'NOT').
    """
    # Load dataset into a pandas DataFrame, then extract columns as numpy arrays.
    # Use non-inplace .replace() on a fresh Series: chained
    # ``df[col].replace(..., inplace=True)`` is chained assignment, which
    # modern pandas deprecates (and silently drops under copy-on-write).
    if split in ('test', 'dev'):
        data_df = pd.read_csv('data/OLIDv1.0/testset-levela.tsv', sep='\t')
        sentences = data_df.tweet.values
        data_df_labels = pd.read_csv('data/OLIDv1.0/labels-levela.csv', sep=',', header=None)
        labels = data_df_labels[1].replace({'OFF': 1, 'NOT': 0}).values
        # TODO Make Dev set
    else:
        data_df = pd.read_csv('data/OLIDv1.0/olid-training-v1.0.tsv', sep='\t')
        sentences = data_df.tweet.values
        labels = data_df.subtask_a.replace({'OFF': 1, 'NOT': 0}).values
    sentences = adjust_twitter_tokenization(sentences)
    input_ids, attention_masks = self.fn_tokenizer(sentences, tokenizer, max_length=max_length)
    labels = torch.tensor(labels)
    return make_dataloader(self.NAME, input_ids, labels, attention_masks, batch_size, shuffle, supp_query_split=supp_query_split)
def get_iter(self, split, tokenizer, batch_size=16, shuffle=False, random_state=1, max_length=64, supp_query_split=False):
    """Return a DataLoader over the requested split of the Twitter sarcasm data.

    Args:
        split: One of 'train', 'dev', 'test'; selects the on-disk JSON file.
        tokenizer: Tokenizer forwarded to ``self.fn_tokenizer``.
        batch_size: Batch size for the returned DataLoader.
        shuffle: Whether the DataLoader shuffles between epochs.
        random_state: Seed for the row shuffle below, so splits are
            reproducible across runs.
        max_length: Maximum tokenized sequence length.
        supp_query_split: Forwarded to ``make_dataloader``.

    Returns:
        Iterable (DataLoader) for the specified split; labels are binary
        (1 = 'SARCASM', 0 otherwise).
    """
    df = pd.read_json('data/atcs_sarcasm_data/sarcasm_twitter_{}.json'.format(split), lines=True, encoding='utf8')
    # BUG FIX: the shuffle previously ignored ``random_state``, making the
    # row order (and any downstream supp/query split) non-reproducible even
    # though the parameter exists for exactly that purpose.
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    # Build the model input as the response followed by the first two
    # context turns, joined with '; '.
    df['context'] = [l[:2] for l in df['context']]
    df['contextstr'] = ['; '.join(map(str, l)) for l in df['context']]
    df['sentence'] = df['response'] + df['contextstr']
    sentences = adjust_twitter_tokenization(df.sentence.values)
    labels = np.where(df.label.values == 'SARCASM', 1, 0)
    input_ids, attention_masks = self.fn_tokenizer(sentences, tokenizer, max_length=max_length)
    labels = torch.tensor(labels)
    return make_dataloader(self.NAME, input_ids, labels, attention_masks, batch_size, shuffle, supp_query_split=supp_query_split)