示例#1
0
 def get_iter(self, split, tokenizer, batch_size=16, shuffle=False, random_state=1, max_length=64, supp_query_split=False):
     """
     Build an iterable DataLoader over one pre-loaded split of this dataset.

     Args:
         split: one of 'train', 'dev', 'test'
         tokenizer: tokenizer forwarded to self.fn_tokenizer
         batch_size: batch size of the returned loader
         shuffle: whether the loader shuffles batches
         random_state: unused here; kept for a uniform get_iter signature
         max_length: maximum token length per sentence
         supp_query_split: forwarded to make_dataloader
     Returns:
         Iterable DataLoader for the specified split
     """
     assert split in ['train', 'dev', 'test']
     # Splits were loaded up front; pick the matching dataframe by name.
     split_frames = {'train': self.df_train, 'dev': self.df_dev, 'test': self.df_test}
     frame = split_frames[split]
     tweets = adjust_twitter_tokenization(frame.text)
     input_ids, attention_masks = self.fn_tokenizer(tweets, tokenizer, max_length=max_length)
     label_tensor = torch.tensor(frame.label.values)
     return make_dataloader(self.NAME, input_ids, label_tensor, attention_masks, batch_size, shuffle, supp_query_split=supp_query_split)
示例#2
0
    def get_iter(self, split, tokenizer, batch_size=16, shuffle=False, random_state=1, max_length=64, supp_query_split=False):
        """
        Build an iterable DataLoader for one split of this dataset.

        Args:
            split: train/dev/test
            tokenizer: tokenizer forwarded to self.fn_tokenizer
            batch_size: batch size of the returned loader
            shuffle: whether the loader shuffles batches
            random_state: unused here; kept for a uniform get_iter signature
            max_length: maximum token length per sentence
            supp_query_split: forwarded to make_dataloader
        Returns:
            Iterable for the specified split
        """
        # current iter will have only two classes; we could extend it to have more
        frame = self._get_dataframe(split)

        tweets = adjust_twitter_tokenization(frame.Tweet_text.values)
        input_ids, attention_masks = self.fn_tokenizer(tweets, tokenizer, max_length=max_length)
        label_tensor = torch.tensor(frame.Label.values)

        return make_dataloader(self.NAME, input_ids, label_tensor, attention_masks, batch_size, shuffle, supp_query_split=supp_query_split)
示例#3
0
    def get_iter(self, split, tokenizer, batch_size=16, shuffle=False, random_state=1, max_length=64, supp_query_split=False):
        """
        Build an iterable DataLoader for one split of the SemEval-18 task-1 data.

        Args:
            split: train/dev/test
            tokenizer: tokenizer forwarded to self.fn_tokenizer
            batch_size: batch size of the returned loader
            shuffle: whether the loader shuffles batches
            random_state: unused here; kept for a uniform get_iter signature
            max_length: maximum token length per sentence
            supp_query_split: forwarded to make_dataloader
        Returns:
            Iterable for the specified split
        """
        assert split in ['train', 'dev', 'test']
        # Read the split file; the label columns are those listed in self.emotions.
        frame = pd.read_csv('./data/semeval18_task1_class/{}.txt'.format(split), sep='\t')
        tweets = adjust_twitter_tokenization(frame.Tweet.values)
        emotion_labels = torch.tensor(frame[self.emotions].values)

        input_ids, attention_masks = self.fn_tokenizer(tweets, tokenizer, max_length=max_length)

        return make_dataloader(self.NAME, input_ids, emotion_labels, attention_masks, batch_size, shuffle, supp_query_split=supp_query_split)
示例#4
0
    def get_iter(self, split, tokenizer, batch_size=16, shuffle=False, random_state=1, max_length=64, supp_query_split=False):
        """
        Build an iterable DataLoader for OLID level-A data.

        Args:
            split: 'train', 'dev' or 'test' ('dev' currently falls back to the
                test set — see TODO below)
            tokenizer: tokenizer forwarded to self.fn_tokenizer
            batch_size: batch size of the returned loader
            shuffle: whether the loader shuffles batches
            random_state: unused here; kept for a uniform get_iter signature
            max_length: maximum token length per sentence
            supp_query_split: forwarded to make_dataloader
        Returns:
            Iterable DataLoader for the specified split
        Raises:
            ValueError: if split is not one of train/dev/test (previously any
                unknown value silently returned the training data)
        """
        if split not in ('train', 'dev', 'test'):
            raise ValueError("split must be 'train', 'dev' or 'test', got {!r}".format(split))
        # OFF -> 1, NOT -> 0. Series.map replaces the chained inplace .replace()
        # calls, which are deprecated under pandas copy-on-write and can fail to
        # write through an intermediate selection.
        label_map = {'OFF': 1, 'NOT': 0}
        # Load dataset into Pandas Dataframe, then extract columns as numpy arrays
        if split == 'test' or split == 'dev':
            data_df = pd.read_csv('data/OLIDv1.0/testset-levela.tsv', sep='\t')
            sentences = data_df.tweet.values
            data_df_labels = pd.read_csv('data/OLIDv1.0/labels-levela.csv', sep=',', header=None)
            labels = data_df_labels[1].map(label_map).values
        # TODO Make Dev set
        else:
            data_df = pd.read_csv('data/OLIDv1.0/olid-training-v1.0.tsv', sep='\t')
            sentences = data_df.tweet.values
            labels = data_df.subtask_a.map(label_map).values

        sentences = adjust_twitter_tokenization(sentences)
        input_ids, attention_masks = self.fn_tokenizer(sentences, tokenizer, max_length=max_length)
        labels = torch.tensor(labels)

        return make_dataloader(self.NAME, input_ids, labels, attention_masks, batch_size, shuffle, supp_query_split=supp_query_split)
示例#5
0
    def get_iter(self, split, tokenizer, batch_size=16, shuffle=False, random_state=1, max_length=64, supp_query_split=False):
        """
        Build an iterable DataLoader for one split of the Twitter sarcasm data.

        Args:
            split: train/dev/test
            tokenizer: tokenizer forwarded to self.fn_tokenizer
            batch_size: batch size of the returned loader
            shuffle: whether the loader shuffles batches
            random_state: seed for the row shuffle below, making it reproducible
            max_length: maximum token length per sentence
            supp_query_split: forwarded to make_dataloader
        Returns:
            Iterable for the specified split
        """
        assert split in ['train', 'dev', 'test']
        df = pd.read_json('data/atcs_sarcasm_data/sarcasm_twitter_{}.json'.format(split), lines=True, encoding='utf8')
        # Seed the shuffle with random_state so runs are reproducible
        # (previously the parameter was accepted but ignored).
        df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

        # Keep only the first two context entries and append them to the response.
        df['context'] = [l[:2] for l in df['context']]
        df['contextstr'] = ['; '.join(map(str, l)) for l in df['context']]
        df['sentence'] = df['response'] + df['contextstr']

        sentences = adjust_twitter_tokenization(df.sentence.values)
        # Binary target: 1 for 'SARCASM', 0 otherwise.
        labels = np.where(df.label.values == 'SARCASM', 1, 0)

        input_ids, attention_masks = self.fn_tokenizer(sentences, tokenizer, max_length=max_length)
        labels = torch.tensor(labels)

        return make_dataloader(self.NAME, input_ids, labels, attention_masks, batch_size, shuffle, supp_query_split=supp_query_split)