def get_gazetteer(language: str = 'ENG'):
    """Return a dictionary mapping entity types to their gazetteer word lists.

    :param language: language in which to look up the gazetteers
        ('ENG' or 'FR'; any other value yields None)
    :type language: str
    :return: dictionary such as {'LOC': ['paris', ...]}, or None when the
        language is not supported
    :rtype: dict or None
    """
    if language == 'ENG':
        # English gazetteers come from the NLTK corpora, lowercased via no_caps.
        gazloc = no_caps(
            nltk.corpus.gazetteers.words(fileids=[
                'countries.txt', 'uscities.txt', 'usstates.txt',
                'usstateabbrev.txt', 'mexstates.txt', 'caprovinces.txt'
            ]))
        gazper = no_caps(
            nltk.corpus.names.words(fileids=['male.txt', 'female.txt']))
        gazmisc = no_caps(
            nltk.corpus.gazetteers.words(fileids=['nationalities.txt']))
        return {cst.LOC: gazloc, cst.PER: gazper, cst.MISC: gazmisc}
    if language == 'FR':
        # French gazetteers ship as CSV assets; keep only the first column.
        cfg = get_asset_root()
        gazloc = pd.read_csv(get_file_content(cfg, 'gazLOC')).iloc[:, 0].tolist()
        gazper = pd.read_csv(get_file_content(cfg, 'gazPER')).iloc[:, 0].tolist()
        return {cst.LOC: gazloc, cst.PER: gazper}
    # Previously an implicit fall-through; make the None explicit.
    return None
def get_already_trained(
        cls,
        name: str,
        language: str = cst.NO_LANGUAGE,
        entity: list = None) -> Union[object, None]:
    """Load an already trained dataset; you must specify the language.

    :param name: asset name of the trained dataframe CSV
    :type name: str
    :param language: language of the dataset
    :type language: str
    :param entity: entity tags kept in the dataset
        (defaults to ['ORG', 'LOC', 'PER'])
    :type entity: list
    :return: the Training_database object, or an error string when the
        dataframe cannot be read (string kept for backward compatibility)
    :rtype: Union[object, None]
    """
    # None sentinel avoids the shared-mutable-default-argument pitfall.
    if entity is None:
        entity = ['ORG', 'LOC', 'PER']
    # cls() instead of a hard-coded class so subclasses build their own type.
    self = cls()
    self.df_name = name
    self.language = language
    self.entity = entity
    cfg = get_asset_root()
    file = get_file_content(cfg, name)
    try:
        self.df = pd.read_csv(filepath_or_buffer=file, index_col=0)
        self.df = self.df.fillna(0)
        self.categories = list(self.df['NEtag'].unique())
        # 0 is the fillna placeholder; it is absent when no tag was missing,
        # and an unconditional remove() would then raise ValueError and send
        # a perfectly good read down the error path.
        if 0 in self.categories:
            self.categories.remove(0)
        return self
    except Exception:
        # Narrowed from a bare except (which also swallowed SystemExit /
        # KeyboardInterrupt); error string kept for existing callers.
        return "Error when trying to read the dataframe"
def try_trained_model(txt_to_test: str,
                      model: str = "svm_all_features",
                      entity: list = None,
                      name_entity: list = None):
    """Run a persisted model over a text and group words by predicted entity.

    :param txt_to_test: whitespace-separated text to tag
    :type txt_to_test: str
    :param model: asset name of the serialized (joblib) model
    :type model: str
    :param entity: entity tags, aligned index-wise with name_entity
        (defaults to ['ORG', 'LOC', 'MISC', 'PER'])
    :type entity: list
    :param name_entity: display names used as keys of the result
    :type name_entity: list
    :return: dict mapping each display name to the words predicted with the
        matching label
    :rtype: dict
    """
    # None sentinels avoid the shared-mutable-default-argument pitfall.
    if entity is None:
        entity = ['ORG', 'LOC', 'MISC', 'PER']
    if name_entity is None:
        name_entity = [
            'Organisations', 'Locations', 'Miscellaneaous', 'Persons'
        ]
    df_to_do = pd.DataFrame()
    df_to_do['Word'] = txt_to_test.split()
    df_to_do = Training_database.do_feature_dataset(df_to_do)
    cfg = get_asset_root()
    directory = get_file_content(cfg, model)
    model_clone = joblib.load(directory)
    df_test = df_to_do[cst.list_features_en]
    result = model_clone.predict(df_test)
    LOGGER.info(result)
    # Predicted label j + 1 corresponds to entity[j]; label 0 means "none".
    dict_entity = {display_name: [] for display_name in name_entity}
    for i, label in enumerate(result):
        for j in range(len(entity)):
            if label == j + 1:
                dict_entity[name_entity[j]].append(df_to_do['Word'][i])
    return dict_entity
def gazetteer(df: pd.DataFrame, language: str = 'ENG'):
    """Add one binary column per gazetteer type: 1 when the lowercased word
    appears in any gazetteer file of that type, 0 otherwise.

    :param df: dataframe holding cst.WORD and cst.LOWERCASE columns
    :type df: pd.DataFrame
    :param language: currently unused — the 'en' gazetteers are always
        loaded. NOTE(review): wire this through to get_type_of_gazetteers
        once non-English gazetteer assets exist.
    :type language: str
    :return: df with the new gazetteer columns added in place
    :rtype: pd.DataFrame
    """
    cfg = get_asset_root()
    for gaz_type in get_type_of_gazetteers(cfg, 'en'):
        hits = [0] * len(df[cst.WORD])
        for gaz_file in get_file_content(cfg, 'gazetteer_en', gaztype=gaz_type):
            # set membership is O(1); the previous list lookup made the scan
            # O(words * gazetteer_size) per file.
            gaz = set(pd.read_csv(gaz_file)[cst.LOWERCASE])
            # enumerate iterates positionally, so this also works when the
            # dataframe index is not the default RangeIndex (the old
            # df[col][index] access was label-based).
            for pos, word in enumerate(df[cst.LOWERCASE]):
                if word in gaz:
                    hits[pos] = 1
        df[gaz_type] = hits
    return df
def preuni_factory(df: pd.DataFrame, directory: str = "pre_freq_CONLL2003"):
    """Add binary 'preuni*' columns to df.

    Row i is flagged 1 when the lowercased word of row i - 1 belongs to the
    frequent pre-word list of that entity type; the first row has no
    predecessor and always gets 0.
    """
    feature_files = {
        'preuniORG': f'{directory}/preuniORG',
        'preuniLOC': f'{directory}/preuniLOC',
        'preuniPER': f'{directory}/preuniPER',
        'preuniMISC': f'{directory}/preuniMISC',
    }
    root = get_asset_root()
    for feature, asset in feature_files.items():
        path = get_file_content(root, asset)
        # Each JSON asset stores its word list under the feature's own key.
        with open(path) as handle:
            frequent = json.load(handle)[feature]
        LOGGER.info(frequent)
        flags = [0]
        for row in range(1, len(df)):
            previous_word = df.iloc[row - 1][cst.LOWERCASE]
            flags.append(1 if previous_word in frequent else 0)
        df[feature] = flags
    return df
def frequency_factory(df: pd.DataFrame, directory: str = "freq_names_CONLL2003"):
    """Add binary 'Freq*' columns to df: 1 when the row's lowercased word
    belongs to the corresponding frequent-name list loaded from the JSON
    assets, else 0.
    """
    feature_files = {
        'FreqNAMES': f'{directory}/freqNAMES',
        'FreqORG': f'{directory}/freqORG',
        'FreqLOC': f'{directory}/freqLOC',
        'FreqPER': f'{directory}/freqPER',
        'FreqMISC': f'{directory}/freqMISC',
    }
    root = get_asset_root()
    for feature, asset in feature_files.items():
        path = get_file_content(root, asset)
        # Each JSON asset stores its word list under the feature's own key.
        with open(path) as handle:
            frequent = json.load(handle)[feature]
        LOGGER.info(frequent)
        df[feature] = [
            1 if getattr(row, cst.LOWERCASE) in frequent else 0
            for row in df.itertuples(index=True, name='Pandas')
        ]
    return df
def clean_and_setup_training(
        cls,
        name: str,
        language: str = cst.NO_LANGUAGE) -> Union[object, None]:
    """Load a virgin dataset so that features can then be added.

    :param name: name of the virgin dataset
    :type name: str
    :param language: language of the dataset
    :type language: str
    :return: the new training dataset object, or an error string when the
        dataframe cannot be read (string kept for backward compatibility)
    :rtype: Union[object, None]
    """
    # cls() instead of a hard-coded class so subclasses build their own type.
    self = cls()
    self.df_name = name
    self.language = language
    cfg = get_asset_root()
    file = get_file_content(cfg, name)
    try:
        self.df = pd.read_csv(filepath_or_buffer=file, index_col=0)
    except Exception:
        # Narrowed from a bare except (which also swallowed SystemExit /
        # KeyboardInterrupt); error string kept for existing callers.
        return "Error when trying to read the dataframe"
    return self
import jsonpickle
import json

# Globals
###############################################################################
LOGGER = logzero.logger

# Functions and Classes
###############################################################################
# Character vocabulary for per-letter encoding (letters plus basic punctuation).
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

# Load the French Reuters trained dataframe from the asset store.
cfg = get_asset_root()
directory = get_file_content(cfg, "French_own_data/frenchreuters_trained")
df = pd.read_csv(directory)

# Collect, per entity tag, the words labelled 1 in that tag's column.
category_lines = {}
list_NE = ['ORG','LOC', 'PER']
for i in list_NE:
    category_lines[i] = []
category_lines['Nothing']=[]
# NOTE(review): `i` leaks from the init loop above, so len(df[i]) reads the
# length of the LAST tag's column ('PER') — presumably all columns share the
# same length, but confirm. The inner loop then rebinds `i` to each tag.
for word in range(0, len(df[i])):
    g=0
    for i in list_NE:
        if df.loc[word, i] == 1:
            category_lines[i].append(df.loc[word, "Word"])
        # NOTE(review): chunk is truncated here — the commented-out else
        # (and unused counter g) suggest a 'Nothing' branch was planned.
        # else:
import collections
import jsonpickle
import json

# Globals
###############################################################################
LOGGER = logzero.logger

# Functions and Classes
###############################################################################
# Character vocabulary for per-letter encoding (letters plus basic punctuation).
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

# Load the CoNLL-2003 training dataframe from the asset store.
cfg = get_asset_root()
directory = get_file_content(cfg, "CoNLL2003/train")
df = pd.read_csv(directory)

# Collect, per entity tag, the words labelled 1 in that tag's column.
category_lines = {}
list_NE = ['ORG', 'LOC', 'MISC', 'PER']
for i in list_NE:
    category_lines[i] = []
category_lines['Nothing'] = []
# NOTE(review): `i` leaks from the init loop above, so len(df[i]) reads the
# length of the LAST tag's column ('PER') — presumably all columns share the
# same length, but confirm. The inner loop then rebinds `i` to each tag.
for word in range(0, len(df[i])):
    g = 0
    for i in list_NE:
        if df.loc[word, i] == 1:
            category_lines[i].append(df.loc[word, "Word"])
        # NOTE(review): chunk is truncated here — the commented-out else /
        # g += 1 suggest a 'Nothing' counting branch was planned.
        # else:
        # g+=1
# NOTE(review): this chunk opens mid-method — the three lines below are the
# tail of a feature method of the (not visible here) Feature_eng class:
# append the 0 flag, store the finished column, return the dataframe.
# Indentation of the fragment is reconstructed and should be confirmed
# against the full file.
            L.append(0)
        df[key] = L
        return df

    @staticmethod
    def debut(df: pd.DataFrame):
        # Flag sentence starts: row i gets 1 when the previous row's
        # lowercased token is sentence-ending punctuation ('.', '!', '?');
        # the first row has no predecessor and always gets 0.
        freq = [0]
        for i in range(1, len(df)):
            if df.iloc[i - 1][cst.LOWERCASE] in ['.','!', '?']:
                freq.append(1)
            else:
                freq.append(0)
        df['debut'] = freq
        return df

# feature_list
# English feature pipeline that needs no external asset directory.
features_no_directory_eng = [Feature_eng.lowercase, Feature_eng.capitalize,
                             Feature_eng.fullcap, Feature_eng.length,
                             Feature_eng.presufixe, Feature_eng.gazetteer,
                             Feature_eng.number, Feature_eng.debut]

if __name__ == '__main__':
    # Smoke run: load the CoNLL-2003 test set and apply two features.
    cfg = get_asset_root()
    train = get_file_content(cfg, "CoNLL2003/test")
    f = pd.read_csv(train)
    f = Feature_eng.lowercase(f)
    f = Feature_eng.preuni_factory(f, "pre_freq_CONLL2003")
    LOGGER.info(f.columns)
    LOGGER.info(f.head())