def test_load_data(self):
        """ Test Case for correct loading of dataframes.

        Checks that load_data, both frames returned by get_datasets, and
        the result of concatenate_datasets are pandas DataFrames.
        """
        # All fixture CSVs live under ../src/data relative to the test dir.
        data_dir = os.path.join(os.path.pardir, 'src', 'data')

        self.assertIsInstance(
            load_data(os.path.join(data_dir, 'tweets.csv')),
            pd.core.frame.DataFrame)

        # Load the dataset pair ONCE and check both returned frames — the
        # original called get_datasets twice with identical arguments,
        # re-reading both CSVs just to index [0] and then [1].
        df2, df3 = get_datasets(
            os.path.join(data_dir, 'labeled_data.csv'),
            os.path.join(data_dir,
                         'hatespeech_text_label_vote_RESTRICTED_100K.csv'))
        self.assertIsInstance(df2, pd.core.frame.DataFrame)
        self.assertIsInstance(df3, pd.core.frame.DataFrame)

        self.assertIsInstance(
            concatenate_datasets(
                os.path.join(data_dir, 'tweets.csv'),
                self.df2, self.df3), pd.core.frame.DataFrame)
# Example #2
# 0
def load_labeled_dataset():
    """
    Concatenate the data sets from csv-files (labeled_data.csv,
    hatespeech_text_label_vote_RESTRICTED_100K.csv, tweets.csv) together
    and return it as a pandas dataframe.

    Returns
    -------
    df_concatenated:        Pandas dataframe
                            The dataframe containing all data from the
                            mentioned csv-files.
    """
    # If tweets were not already loaded from the Twitter API, fetch them by
    # tweet id (dataset from https://github.com/zeerakw/hatespeech).
    # NOTE(review): the original bound the result to an unused local `df`;
    # the call is kept purely for its side effect — presumably it writes
    # data/tweets.csv, which concatenate_datasets reads below. Confirm.
    if not os.path.isfile(os.path.join('data', 'tweets.csv')):
        get_tweets_by_id(config, os.path.join('data', 'NAACL_SRW_2016.csv'))

    # load datasets from
    #  https://github.com/t-davidson/hate-speech-and-offensive-language/tree/master/data (df2)
    #  and https://github.com/jaeyk/intersectional-bias-in-ml (df3)
    df2, df3 = get_datasets(
        os.path.join('data', 'labeled_data.csv'),
        os.path.join('data', 'hatespeech_text_label_vote_RESTRICTED_100K.csv'))

    df_concatenated = concatenate_datasets(os.path.join('data', 'tweets.csv'),
                                           df2, df3)

    return df_concatenated
 def setUp(self):
     """Load the fixture dataframes shared by the tests in this class."""
     # Every fixture CSV lives under ../src/data; build the base path once.
     data_dir = os.path.join(os.path.pardir, 'src', 'data')
     self.df = load_data(os.path.join(data_dir, 'tweets.csv'))
     self.df2, self.df3 = get_datasets(
         os.path.join(data_dir, 'labeled_data.csv'),
         os.path.join(data_dir,
                      'hatespeech_text_label_vote_RESTRICTED_100K.csv'))
     self.df_concatenated = concatenate_datasets(
         os.path.join(data_dir, 'tweets.csv'), self.df2, self.df3)
    def setUp(self):
        """Build the fixture dataframes and a 75/25 train/test split."""
        # Every fixture CSV lives under ../src/data; build the base path once.
        data_dir = os.path.join(os.path.pardir, 'src', 'data')
        self.df = load_data(os.path.join(data_dir, 'tweets.csv'))
        self.df2, self.df3 = get_datasets(
            os.path.join(data_dir, 'labeled_data.csv'),
            os.path.join(data_dir,
                         'hatespeech_text_label_vote_RESTRICTED_100K.csv'))
        self.df_concatenated = concatenate_datasets(
            os.path.join(data_dir, 'tweets.csv'), self.df2, self.df3)

        # Split on the 'text' feature / 'hate_speech' label, 25% held out.
        (self.training_data, self.testing_data,
         self.training_y, self.testing_y) = split_data(
            self.df_concatenated, 'text', 'hate_speech', 0.25)
# Example #5
# 0
 def setUp(self):
     """Load the fixture dataframes plus a tiny corpus and its expected
     term-count matrix for vectorizer tests."""
     # Every fixture CSV lives under ../src/data; build the base path once.
     data_dir = os.path.join(os.path.pardir, 'src', 'data')
     self.df = load_data(os.path.join(data_dir, 'tweets.csv'))
     self.df2, self.df3 = get_datasets(
         os.path.join(data_dir, 'labeled_data.csv'),
         os.path.join(data_dir,
                      'hatespeech_text_label_vote_RESTRICTED_100K.csv'))
     self.df_concatenated = concatenate_datasets(
         os.path.join(data_dir, 'tweets.csv'), self.df2, self.df3)
     # Four-sentence toy corpus in a single 'text' column.
     self.test_set = pd.DataFrame(
         ['This is the first document.',
          'This document is the second document.',
          'And this is the third one.',
          'Is this the first document?'],
         columns=["text"])
     # Expected count matrix (one row per sentence above).
     self.test_result_count = [
         [0, 1, 1, 1, 0, 0, 1, 0, 1],
         [0, 2, 0, 1, 0, 1, 1, 0, 1],
         [1, 0, 0, 1, 1, 0, 1, 1, 1],
         [0, 1, 1, 1, 0, 0, 1, 0, 1],
     ]