def store_pd_frame(data_frame, identifier, session_id):
    """Persist a pandas DataFrame as a pickle file in the session's data folder.

    :param data_frame: pandas DataFrame to store
    :param identifier: name under which the frame is stored
    :param session_id: id of the session the data belongs to
    """
    DiskStorageMisc.create_data_folder(session_id)
    target_path = DiskStorage.get_file_path_pickle(
        identifier,
        session_id,
        create_sub_dirs=1,
        root_path=DiskStorageMisc.get_session_data_path(session_id))
    data_frame.to_pickle(target_path)
def store_model(model, identifier, session_id):
    """Persist a model (via its own ``save`` method) in the session's data folder.

    :param model: object exposing ``save(path)``
    :param identifier: name under which the model is stored
    :param session_id: id of the session the model belongs to
    """
    DiskStorageMisc.create_data_folder(session_id)
    target_path = DiskStorage.get_file_path_model(
        identifier,
        session_id,
        create_sub_dirs=1,
        root_path=DiskStorageMisc.get_session_data_path(session_id))
    model.save(target_path)
def get_file_path_h5_model(identifier, session_id, create_sub_dirs=0, root_path=None):
    """Return the full file path of the h5 model stored under *identifier*.

    :param identifier: model identifier (may contain sub-directory parts)
    :param session_id: id of the session the model belongs to
    :param create_sub_dirs: when truthy, missing sub-directories are created
    :param root_path: root directory passed through to the identifier resolver
    :return: absolute path including the h5 model extension
    """
    data_dir = DiskStorageMisc.get_session_data_path(session_id)
    # Resolve the identifier first so sub-directories can be created on demand.
    id_path = DiskStorageMisc.get_identifier_path(
        identifier, create_sub_dirs=create_sub_dirs, root_path=root_path)
    return os.path.join(data_dir, id_path + DiskStorage.h5_model_ext)
def set_stopwords(session_id, stopwords):
    """Store the session's stopword list as a JSON file on disk.

    :param session_id: id of the session the stopwords belong to
    :param stopwords: iterable of stopword strings; order is preserved
    """
    # Materialize the iterable in one step instead of appending in a loop.
    data = {DiskStorageStopwordHandler.sw_list_key: list(stopwords)}
    data_path = DiskStorageMisc.get_session_data_path(session_id)
    stopwords_path = os.path.join(data_path, DiskStorageStopwordHandler.file_name)
    # Ensure the data folder exists before opening the file for writing.
    DiskStorageMisc.create_data_folder(session_id)
    with open(stopwords_path, 'w+', encoding='utf8') as json_file:
        json.dump(data, json_file, ensure_ascii=False)
def set_categories(session_id, categories):
    """Store the session's category list as a JSON file on disk.

    The file name is taken from the session config value under
    ``cat_id_key`` plus the JSON extension.

    :param session_id: id of the session the categories belong to
    :param categories: iterable of category names; order is preserved
    """
    # Materialize the iterable in one step instead of appending in a loop.
    data = {DiskStorageCategoryListHandler.cat_list_key: list(categories)}
    data_path = DiskStorageMisc.get_session_data_path(session_id)
    file_name = (SessionConfigReader.read_value(DiskStorageCategoryListHandler.cat_id_key)
                 + DiskStorageCategoryListHandler.ext_json)
    categories_path = os.path.join(data_path, file_name)
    # Ensure the data folder exists before opening the file for writing.
    DiskStorageMisc.create_data_folder(session_id)
    with open(categories_path, 'w+', encoding='utf8') as json_file:
        json.dump(data, json_file, ensure_ascii=False)
def import_docs(csv_path=None):
    """Import the TenKGnad corpus CSV into storage and register its categories.

    :param csv_path: path of the corpus CSV; when None, the path is derived
        from the current session config
    :return: identifier under which the imported frame was stored
    """
    if csv_path is None:
        # Resolve the default corpus location inside the current session.
        corpus_id = DiskStorageMisc.get_identifier_path(
            SessionConfigReader.read_value(TenKGnadImporter.corpus_id_key))
        csv_path = os.path.join(
            TenKGnadImporter.sessions_folder,
            SessionConfigReader.get_session_id(),
            corpus_id + TenKGnadImporter.csv_ext)
    frame = pd.read_csv(
        csv_path,
        sep=';',
        quotechar='\'',
        quoting=csv.QUOTE_MINIMAL,
        header=None,
        names=[TenKGnadImporter.category_name, TenKGnadImporter.text_name])
    # Keep the raw category values before wrapping each one in a list.
    raw_categories = frame[TenKGnadImporter.category_name].tolist()
    frame[TenKGnadImporter.category_name] = frame.apply(
        lambda row: [row[TenKGnadImporter.category_name]], axis=1)
    # The identifier is the file name up to its first dot.
    identifier = os.path.basename(csv_path).split('.')[0]
    Storage.store_pd_frame(frame, identifier)
    SessionLogger.log(
        'TenKGnad Corpus (' + str(len(frame.index))
        + ' entries) has been imported into \'' + identifier
        + '\' (columns: \'' + TenKGnadImporter.category_name + '\', \''
        + TenKGnadImporter.text_name + '\').')
    # Deduplicate categories before registering them for the session.
    CategoryListHandler.set_categories(list(set(raw_categories)))
    return identifier
def set_config(session_id, config_id, json_f):
    """Write a session configuration object to its JSON file.

    :param session_id: id of the session the config belongs to
    :param config_id: identifier of the configuration
    :param json_f: JSON-serializable configuration object
    """
    session_root = DiskStorageMisc.get_session_path(session_id)
    config_path = DiskStorageSessionConfigReader.get_config_path(
        session_id, config_id, create_sub_dirs=1, root_path=session_root)
    with open(config_path, 'w+', encoding='utf8') as json_file:
        json.dump(json_f, json_file, ensure_ascii=False, indent=4)
def read_stopwords(session_id):
    """Load the session's stopwords from disk.

    :param session_id: id of the session the stopwords belong to
    :return: set of stopwords; empty set when no stopword file exists
    """
    stopwords_path = os.path.join(
        DiskStorageMisc.get_session_data_path(session_id),
        DiskStorageStopwordHandler.file_name)
    if not os.path.exists(stopwords_path):
        return set()
    with open(stopwords_path, encoding='utf8') as json_file:
        payload = json.load(json_file)
    return set(payload[DiskStorageStopwordHandler.sw_list_key])
def read_categories(session_id):
    """Load the session's category list from disk.

    :param session_id: id of the session the categories belong to
    :return: list of categories; empty list when no category file exists
    """
    file_name = (SessionConfigReader.read_value(DiskStorageCategoryListHandler.cat_id_key)
                 + DiskStorageCategoryListHandler.ext_json)
    categories_path = os.path.join(
        DiskStorageMisc.get_session_data_path(session_id), file_name)
    if not os.path.exists(categories_path):
        return []
    with open(categories_path, encoding='utf8') as json_file:
        payload = json.load(json_file)
    return payload[DiskStorageCategoryListHandler.cat_list_key]
def list_ids(location, session_id):
    """List the identifiers of all files stored under *location* in a session.

    An identifier is the file name with its final extension removed
    (e.g. ``'a.b.csv'`` -> ``'a.b'``; names without a dot are kept whole).
    Sub-directories are skipped.

    :param location: folder name relative to the session path
    :param session_id: id of the session to inspect
    :return: list of identifiers
    """
    location_path = os.path.join(
        DiskStorageMisc.get_session_path(session_id), location)
    ids = []
    for entry in listdir(location_path):
        if os.path.isfile(os.path.join(location_path, entry)):
            # Strip only the last '.'-suffix; this matches the original
            # manual loop, including dotfiles ('.bashrc' -> '').
            ids.append(entry.rsplit('.', 1)[0])
    return ids
def get_config_path(session_id, config_id, create_sub_dirs=0, root_path=None):
    """Return the path of the JSON config file for *config_id*.

    :param session_id: id of the session the config belongs to
    :param config_id: config identifier, or the special ``best_performing`` id
    :param create_sub_dirs: when truthy, missing sub-directories are created
    :param root_path: root directory passed through to the identifier resolver
    :return: path ending in the JSON extension
    """
    if config_id == DiskStorageSessionConfigReader.best_performing:
        # BUG FIX: os.path.join inserted a path separator between the file
        # name and its extension (yielding 'name/.json'); concatenate the
        # extension instead, mirroring the regular branch below.
        return (DiskStorageSessionConfigReader.best_performing_f_name
                + DiskStorageSessionConfigReader.json_ext)
    else:
        session_path = os.path.join(
            DiskStorageSessionConfigReader.sessions_dir, session_id)
        config_id = DiskStorageMisc.get_identifier_path(
            config_id, create_sub_dirs=create_sub_dirs, root_path=root_path)
        return os.path.join(
            session_path, config_id + DiskStorageSessionConfigReader.json_ext)
def delete_location(location, session_id):
    """Delete everything stored under *location* for the given session and log it.

    :param location: folder name relative to the session path
    :param session_id: id of the session to delete from
    """
    target = os.path.join(DiskStorageMisc.get_session_path(session_id), location)
    DiskStorageMisc.delete_from_folder(target)
    SessionLogger.log('Location \'' + location + '\' has been deleted.')
def delete_session_data(session_id):
    """Delete all stored data of a session (delegates to DiskStorageMisc).

    :param session_id: id of the session whose data is removed
    """
    DiskStorageMisc.delete_session_data(session_id)
def delete_from_folder(path):
    """Delete the contents of *path* (delegates to DiskStorageMisc).

    :param path: folder whose contents are removed
    """
    DiskStorageMisc.delete_from_folder(path)