def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
    """Download both LCC sentiment datasets and record their local paths.

    :param cache_dir: directory used to cache the downloaded datasets
    """
    # The corpus ships as two separate datasets (lcc1/lcc2). Fetch both and
    # expose dataset_name<N>, file_extension<N>, dataset_dir<N> and
    # file_path<N> attributes for each, exactly as before.
    for idx, name in enumerate(('lcc1.sentiment', 'lcc2.sentiment'), start=1):
        extension = DATASETS[name]['file_extension']
        dataset_dir = download_dataset(name, cache_dir=cache_dir)
        setattr(self, 'dataset_name{}'.format(idx), name)
        setattr(self, 'file_extension{}'.format(idx), extension)
        setattr(self, 'dataset_dir{}'.format(idx), dataset_dir)
        setattr(self, 'file_path{}'.format(idx),
                os.path.join(dataset_dir, name + extension))
def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
    """Download the WikiANN dataset into the local cache.

    :param cache_dir: directory used to cache the downloaded dataset
    """
    self.dataset_name = 'wikiann'
    self.file_extension = DATASETS[self.dataset_name]['file_extension']
    # The raw download needs dataset-specific post-processing.
    self.dataset_dir = download_dataset(self.dataset_name,
                                        process_func=_wikiann_process_func,
                                        cache_dir=cache_dir)
def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR, verbose: bool = True):
    """Download the Danish Dependency Treebank (DDT) dataset.

    :param cache_dir: directory used to cache the downloaded dataset
    :param verbose: whether the download prints progress; defaults to True,
        matching the previously hard-coded behaviour
    """
    self.dataset_name = 'ddt'
    self.file_extension = DATASETS[self.dataset_name]['file_extension']
    # Pass self.dataset_name (was a duplicated 'ddt' literal) for consistency
    # with the other dataset loaders; the archive needs unzipping on arrival.
    self.dataset_dir = download_dataset(self.dataset_name,
                                        process_func=_unzip_process_func,
                                        cache_dir=cache_dir,
                                        verbose=verbose)
def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
    """Download the Europarl sentiment (v1) dataset and record its path.

    :param cache_dir: directory used to cache the downloaded dataset
    """
    self.dataset_name = 'europarl.sentiment1'
    self.file_extension = DATASETS[self.dataset_name]['file_extension']
    self.dataset_dir = download_dataset(self.dataset_name, cache_dir=cache_dir)
    # Full path to the downloaded data file inside the dataset directory.
    self.file_path = os.path.join(self.dataset_dir,
                                  self.dataset_name + self.file_extension)
def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
    """Download the AngryTweets sentiment dataset and record its path.

    :param cache_dir: directory used to cache the downloaded dataset
    """
    self.dataset_name = 'angrytweets.sentiment'
    self.dataset_dir = download_dataset(self.dataset_name,
                                        cache_dir=cache_dir,
                                        process_func=_twitter_data_process_func)
    # NOTE(review): the csv is looked up directly under cache_dir (not
    # dataset_dir) — presumably the twitter process func writes it there.
    self.file_path = os.path.join(cache_dir, self.dataset_name + '.csv')
def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
    """Download the Twitter sentiment dataset into the local cache.

    :param cache_dir: directory used to cache the downloaded dataset
    """
    self.dataset_name = 'twitter_sentiment'
    # Unlike the other loaders, no file_extension/file_path is exposed here;
    # the previously commented-out remnants of that code have been removed.
    self.dataset_dir = download_dataset(self.dataset_name, cache_dir=cache_dir)
def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
    """Download the Danish WordSim-353 dataset and record its path.

    :param cache_dir: directory used to cache the downloaded dataset
    """
    self.dataset_name = 'wordsim353.da'
    self.file_extension = DATASETS[self.dataset_name]['file_extension']
    # The raw download needs word-similarity-specific post-processing.
    self.dataset_dir = download_dataset(self.dataset_name,
                                        process_func=_word_sim_process_func,
                                        cache_dir=cache_dir)
    self.file_path = os.path.join(self.dataset_dir,
                                  self.dataset_name + self.file_extension)
def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
    """Download the DaNED dataset and load its Wikidata side files.

    :param cache_dir: directory used to cache the downloaded dataset
    """
    self.dataset_name = 'daned'
    self.file_extension = DATASETS[self.dataset_name]['file_extension']
    self.dataset_dir = download_dataset(self.dataset_name,
                                        process_func=_unzip_process_func,
                                        cache_dir=cache_dir)

    def _load_json(suffix):
        # Read a "<dataset><suffix>" JSON side file from the dataset dir.
        path = os.path.join(self.dataset_dir, self.dataset_name + suffix)
        with open(path) as fp:
            return json.load(fp)

    # Dictionary of Wikidata properties for each QID of the dataset.
    self.properties = _load_json('.props.json')
    # Dictionary of Wikidata descriptions for each QID of the dataset.
    self.descriptions = _load_json('.desc.json')
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    """Download the Danish UniMorph dataset and load it into a DataFrame.

    :param cache_dir: directory used to cache the downloaded dataset
    :param verbose: unused here; kept for interface compatibility
    """
    self.dataset_name = 'unimorph'
    self.file_extension = DATASETS[self.dataset_name]['file_extension']
    self.dataset_dir = download_dataset(self.dataset_name,
                                        process_func=_unzip_process_func,
                                        cache_dir=cache_dir)
    # NOTE(review): the tsv is looked up directly under cache_dir (not
    # dataset_dir) — presumably the unzip step writes it there; confirm.
    self.file_path = os.path.join(cache_dir, self.dataset_name + '.tsv')

    columns = ['lemma', 'form', 'feats']
    self.database = pd.read_csv(self.file_path,
                                sep='\t',
                                names=columns,
                                encoding='unicode_escape',
                                usecols=[0, 1, 2],
                                dtype={name: str for name in columns})
    # The POS tag is the first ';'-separated item of the feature string.
    self.database['pos'] = self.database['feats'].apply(
        lambda feats: feats.split(';')[0])
def __init__(self, cache_dir=DEFAULT_CACHE_DIR, verbose=False):
    """Download DanNet and load its four '@'-separated CSV tables.

    :param cache_dir: directory used to cache the downloaded dataset
    :param verbose: unused here; kept for interface compatibility
    """
    self.dataset_name = 'dannet'
    self.file_extension = DATASETS[self.dataset_name]['file_extension']
    self.dataset_dir = download_dataset(self.dataset_name,
                                        process_func=_unzip_process_func,
                                        cache_dir=cache_dir)

    def _read_table(filename, names, usecols, dtype):
        # All DanNet tables share the '@' separator and escape-style encoding.
        return pd.read_csv(os.path.join(self.dataset_dir, filename),
                           sep='@',
                           names=names,
                           encoding='unicode_escape',
                           usecols=usecols,
                           dtype=dtype)

    self.words = _read_table('words.csv',
                             ['word_id', 'form', 'pos', 'nan'],
                             [0, 1, 2],
                             {'word_id': str})
    self.wordsenses = _read_table(
        'wordsenses.csv',
        ['wordsense_id', 'word_id', 'synset_id', 'register', 'nan'],
        [1, 2],
        {'wordsense_id': str, 'word_id': str, 'synset_id': str})
    self.relations = _read_table(
        'relations.csv',
        ['synset_id', 'wordnetowl', 'relation', 'value', 'taxonomic',
         'inheritance_comment', 'nan'],
        [0, 1, 2, 3, 4, 5],
        {'synset_id': str, 'value': str})
    self.synsets = _read_table(
        'synsets.csv',
        ['synset_id', 'label', 'gloss', 'ontological_type'],
        [0, 1, 2, 3],
        {'synset_id': str})
def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
    """Download the Europarl sentiment (v2) dataset and record its path.

    :param cache_dir: directory used to cache the downloaded dataset
    """
    self.dataset_name = 'europarl.sentiment2'
    # The archive needs unzipping after download.
    self.dataset_dir = download_dataset(self.dataset_name,
                                        cache_dir=cache_dir,
                                        process_func=_unzip_process_func)
    # NOTE(review): the csv is looked up directly under cache_dir (not
    # dataset_dir) — presumably the unzip step writes it there.
    self.file_path = os.path.join(cache_dir, self.dataset_name + '.csv')
def __init__(self, cache_dir: str = DEFAULT_CACHE_DIR):
    """Download the Dacoref coreference dataset into the local cache.

    :param cache_dir: directory used to cache the downloaded dataset
    """
    self.dataset_name = 'dacoref'
    # The archive needs unzipping after download.
    self.dataset_dir = download_dataset(self.dataset_name,
                                        process_func=_unzip_process_func,
                                        cache_dir=cache_dir)
def _download(self):
    """Download every corpus listed in ``self.corpuses`` into the cache.

    ``download_dataset`` handles per-corpus caching; its return value (the
    dataset directory) was previously bound to an unused local and is now
    simply discarded.
    """
    for corpus in self.corpuses:
        download_dataset(corpus,
                         process_func=_opus_process_func,
                         cache_dir=self.cache_dir,
                         verbose=self.verbose)