Пример #1
0
    def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
        """Download both LCC sentiment corpora and record their local file paths.

        :param cache_dir: directory the datasets are downloaded to.
        """
        # First LCC sentiment corpus.
        name1 = 'lcc1.sentiment'
        self.dataset_name1 = name1
        self.file_extension1 = DATASETS[name1]['file_extension']
        self.dataset_dir1 = download_dataset(name1, cache_dir=cache_dir)
        self.file_path1 = os.path.join(self.dataset_dir1,
                                       name1 + self.file_extension1)

        # Second LCC sentiment corpus, fetched the same way.
        name2 = 'lcc2.sentiment'
        self.dataset_name2 = name2
        self.file_extension2 = DATASETS[name2]['file_extension']
        self.dataset_dir2 = download_dataset(name2, cache_dir=cache_dir)
        self.file_path2 = os.path.join(self.dataset_dir2,
                                       name2 + self.file_extension2)
Пример #2
0
    def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
        """Download the WikiANN dataset into the cache and note its location.

        :param cache_dir: directory the dataset is downloaded to.
        """
        self.dataset_name = 'wikiann'
        self.file_extension = DATASETS[self.dataset_name]['file_extension']
        # The raw download is run through a dataset-specific post-processing
        # step before being stored in the cache.
        self.dataset_dir = download_dataset(
            self.dataset_name,
            process_func=_wikiann_process_func,
            cache_dir=cache_dir)
Пример #3
0
 def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
     """Download the 'ddt' dataset (unzipped) into the local cache.

     :param cache_dir: directory the dataset is downloaded to.
     """
     self.dataset_name = 'ddt'
     self.file_extension = DATASETS[self.dataset_name]['file_extension']
     # Pass self.dataset_name rather than repeating the 'ddt' literal, so the
     # recorded name and the downloaded dataset cannot drift out of sync.
     self.dataset_dir = download_dataset(self.dataset_name,
                                         process_func=_unzip_process_func,
                                         cache_dir=cache_dir,
                                         verbose=True)
Пример #4
0
    def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
        """Download the Europarl sentiment dataset and record its file path.

        :param cache_dir: directory the dataset is downloaded to.
        """
        name = 'europarl.sentiment1'
        self.dataset_name = name
        self.file_extension = DATASETS[name]['file_extension']
        self.dataset_dir = download_dataset(name, cache_dir=cache_dir)
        # Full path to the downloaded data file inside the dataset directory.
        self.file_path = os.path.join(self.dataset_dir,
                                      name + self.file_extension)
Пример #5
0
    def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
        """Download the AngryTweets sentiment dataset into the cache.

        :param cache_dir: directory the dataset is downloaded to.
        """
        name = 'angrytweets.sentiment'
        self.dataset_name = name
        # Twitter data needs a dedicated post-processing step on download.
        self.dataset_dir = download_dataset(
            name,
            cache_dir=cache_dir,
            process_func=_twitter_data_process_func)
        # NOTE(review): the path is joined against cache_dir, not dataset_dir —
        # presumably the process func writes the CSV there; confirm.
        self.file_path = os.path.join(cache_dir, name + '.csv')
Пример #6
0
 def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
     """Download the twitter_sentiment dataset into the local cache.

     Only the dataset name and its download directory are recorded; no
     file path is resolved here.

     :param cache_dir: directory the dataset is downloaded to.
     """
     # Removed commented-out dead code (file_extension/file_path lookups and
     # an unused pandas loader) that was cluttering the class.
     self.dataset_name = 'twitter_sentiment'
     self.dataset_dir = download_dataset(self.dataset_name, cache_dir=cache_dir)
Пример #7
0
    def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
        """Download the Danish WordSim-353 dataset and record its file path.

        :param cache_dir: directory the dataset is downloaded to.
        """
        name = 'wordsim353.da'
        self.dataset_name = name
        self.file_extension = DATASETS[name]['file_extension']
        # The download is post-processed by the word-similarity process func.
        self.dataset_dir = download_dataset(
            name,
            process_func=_word_sim_process_func,
            cache_dir=cache_dir)
        self.file_path = os.path.join(self.dataset_dir,
                                      name + self.file_extension)
Пример #8
0
    def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
        """Download the DaNED dataset and load its Wikidata side files.

        :param cache_dir: directory the dataset is downloaded to.
        """
        self.dataset_name = 'daned'
        self.file_extension = DATASETS[self.dataset_name]['file_extension']
        self.dataset_dir = download_dataset(self.dataset_name, process_func=_unzip_process_func, cache_dir=cache_dir)

        # Open the JSON side files with an explicit UTF-8 encoding: the
        # previous calls used the platform default, which breaks on systems
        # with a non-UTF-8 locale (e.g. cp1252 on Windows).
        # dictionary of Wikidata properties for each QID of the dataset
        with open(os.path.join(self.dataset_dir, self.dataset_name + '.props.json'), encoding='utf-8') as f:
            self.properties = json.load(f)
        # dictionary of Wikidata description for each QID of the dataset
        with open(os.path.join(self.dataset_dir, self.dataset_name + '.desc.json'), encoding='utf-8') as f:
            self.descriptions = json.load(f)
Пример #9
0
    def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
        """Download the UniMorph dataset and load it into a DataFrame.

        :param cache_dir: directory the dataset is downloaded to.
        :param verbose: accepted for interface compatibility; not used here.
        """
        self.dataset_name = 'unimorph'
        self.file_extension = DATASETS[self.dataset_name]['file_extension']
        self.dataset_dir = download_dataset(self.dataset_name, process_func=_unzip_process_func, cache_dir=cache_dir)
        # NOTE(review): joined against cache_dir, not dataset_dir — presumably
        # the unzip step places the TSV there; confirm.
        self.file_path = os.path.join(cache_dir, self.dataset_name + '.tsv')

        # Load the lemma/form/feature-string triples. 'unicode_escape' matches
        # how the file is shipped; the first three columns are kept as strings.
        self.database = pd.read_csv(
            self.file_path,
            sep='\t',
            names=['lemma', 'form', 'feats'],
            encoding='unicode_escape',
            usecols=[0, 1, 2],
            dtype={'lemma': str, 'form': str, 'feats': str})

        # The POS tag is the first ';'-separated token of the feature string.
        self.database['pos'] = self.database['feats'].apply(
            lambda feats: feats.split(';')[0])
Пример #10
0
    def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
        """Download DanNet and load its four CSV tables into DataFrames.

        :param cache_dir: directory the dataset is downloaded to.
        :param verbose: accepted for interface compatibility; not used here.
        """
        self.dataset_name = 'dannet'
        self.file_extension = DATASETS[self.dataset_name]['file_extension']

        self.dataset_dir = download_dataset(self.dataset_name,
                                            process_func=_unzip_process_func,
                                            cache_dir=cache_dir)

        def _read_table(filename, names, usecols, dtype):
            # All DanNet tables share the '@' separator and the
            # 'unicode_escape' encoding; only the column layout differs,
            # so the shared arguments are factored out here.
            return pd.read_csv(os.path.join(self.dataset_dir, filename),
                               sep='@',
                               names=names,
                               encoding='unicode_escape',
                               usecols=usecols,
                               dtype=dtype)

        self.words = _read_table(
            'words.csv',
            names=['word_id', 'form', 'pos', 'nan'],
            usecols=[0, 1, 2],
            dtype={'word_id': str})
        self.wordsenses = _read_table(
            'wordsenses.csv',
            names=['wordsense_id', 'word_id', 'synset_id', 'register', 'nan'],
            usecols=[1, 2],
            dtype={'wordsense_id': str, 'word_id': str, 'synset_id': str})
        self.relations = _read_table(
            'relations.csv',
            names=['synset_id', 'wordnetowl', 'relation', 'value',
                   'taxonomic', 'inheritance_comment', 'nan'],
            usecols=[0, 1, 2, 3, 4, 5],
            dtype={'synset_id': str, 'value': str})
        self.synsets = _read_table(
            'synsets.csv',
            names=['synset_id', 'label', 'gloss', 'ontological_type'],
            usecols=[0, 1, 2, 3],
            dtype={'synset_id': str})
Пример #11
0
 def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
     """Download the Europarl sentiment2 dataset (unzipped) into the cache.

     :param cache_dir: directory the dataset is downloaded to.
     """
     name = 'europarl.sentiment2'
     self.dataset_name = name
     self.dataset_dir = download_dataset(name,
                                         cache_dir=cache_dir,
                                         process_func=_unzip_process_func)
     # NOTE(review): joined against cache_dir, not dataset_dir — presumably
     # the unzip step places the CSV there; confirm.
     self.file_path = os.path.join(cache_dir, name + '.csv')
Пример #12
0
 def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
     """Download the Dacoref dataset (unzipped) into the local cache.

     :param cache_dir: directory the dataset is downloaded to.
     """
     self.dataset_name = 'dacoref'
     self.dataset_dir = download_dataset(
         self.dataset_name,
         process_func=_unzip_process_func,
         cache_dir=cache_dir)
Пример #13
0
 def _download(self):
     """Download every corpus in ``self.corpuses`` into the cache.

     ``download_dataset`` is called purely for its download side effect;
     the directory it returns was previously bound to an unused local,
     which has been removed. Keyword spacing is also fixed per PEP 8.
     """
     for corpus in self.corpuses:
         download_dataset(corpus,
                          process_func=_opus_process_func,
                          cache_dir=self.cache_dir,
                          verbose=self.verbose)