예제 #1
0
def get_gazetteer(language: str = 'ENG'):
    """
    Build a mapping from entity type to its gazetteer word list.

    :param language: language for which the gazetteers are loaded
    :type language: str
    :return: dictionary such as {'LOC': [Paris, ...]}; falls through
        (returns None) for an unsupported language
    :rtype: dict
    """
    if language == 'ENG':
        # English gazetteers come bundled with the NLTK corpora.
        location_files = [
            'countries.txt', 'uscities.txt', 'usstates.txt',
            'usstateabbrev.txt', 'mexstates.txt', 'caprovinces.txt'
        ]
        return {
            cst.LOC: no_caps(
                nltk.corpus.gazetteers.words(fileids=location_files)),
            cst.PER: no_caps(
                nltk.corpus.names.words(fileids=['male.txt', 'female.txt'])),
            cst.MISC: no_caps(
                nltk.corpus.gazetteers.words(fileids=['nationalities.txt'])),
        }
    if language == 'FR':
        # French gazetteers are project assets stored as one-column CSVs.
        cfg = get_asset_root()

        def _first_column(asset_name):
            # Read the asset CSV and return its first column as a list.
            path = get_file_content(cfg, asset_name)
            return pd.read_csv(path).iloc[:, 0].tolist()

        return {cst.LOC: _first_column('gazLOC'),
                cst.PER: _first_column('gazPER')}
예제 #2
0
 def get_already_trained(
         cls,
         name: str,
         language: str = cst.NO_LANGUAGE,
         entity: list = None) -> Union[object, None]:
     """
     Load an already trained dataset from the asset directory.

     :param name: asset name of the trained dataframe
     :type name: str
     :param language: language of the dataset
     :type language: str
     :param entity: entity tags expected in the dataset
         (defaults to ["ORG", "LOC", "PER"])
     :type entity: list
     :return: the Training_database object, or an error message string
         when the dataframe cannot be read
     :rtype: Union[object, None]
     """
     # Resolve the default here: a mutable default argument would be
     # shared across every call of this method.
     if entity is None:
         entity = ["ORG", "LOC", "PER"]
     self = Training_database()
     self.df_name = name
     self.language = language
     self.entity = entity
     cfg = get_asset_root()
     file = get_file_content(cfg, name)
     try:
         self.df = pd.read_csv(filepath_or_buffer=file, index_col=0)
         self.df = self.df.fillna(0)
         # NaN cells were filled with 0 just above, so 0 is not a real tag
         # and is dropped from the category list.
         self.categories = list(self.df['NEtag'].unique())
         self.categories.remove(0)
         return self
     except Exception:
         # Narrowed from a bare ``except`` (which also swallowed
         # KeyboardInterrupt/SystemExit); the historical string return is
         # kept so existing callers keep working.
         return "Error when trying to read the dataframe"
    def try_trained_model(txt_to_test: str,
                          model: str = "svm_all_features",
                          entity: list = None,
                          name_entity: list = None):
        """
        Run a persisted model over a raw text and group words by entity.

        :param txt_to_test: whitespace-separated text to tag
        :param model: asset name of the serialized (joblib) model
        :param entity: entity short tags, positionally matching the
            model's numeric labels (defaults to
            ['ORG', 'LOC', 'MISC', 'PER'])
        :param name_entity: display names, parallel to ``entity``
            (defaults to ['Organisations', 'Locations', 'Miscellaneaous',
            'Persons'])
        :return: dict mapping each display name to its predicted words
        """
        # Mutable defaults are resolved here instead of in the signature,
        # so repeated calls never share list state.
        if entity is None:
            entity = ['ORG', 'LOC', 'MISC', 'PER']
        if name_entity is None:
            name_entity = ['Organisations', 'Locations', 'Miscellaneaous',
                           'Persons']
        df_to_do = pd.DataFrame()
        df_to_do['Word'] = txt_to_test.split()
        df_to_do = Training_database.do_feature_dataset(df_to_do)
        cfg = get_asset_root()
        directory = get_file_content(cfg, model)
        model_clone = joblib.load(directory)
        df_test = df_to_do[cst.list_features_en]
        result = model_clone.predict(df_test)
        LOGGER.info(result)
        dict_entity = {display: [] for display in name_entity}
        # Prediction ``pos + 1`` maps to entity[pos]; any other label
        # (e.g. 0 for "no entity") is left unassigned.
        for idx, label in enumerate(result):
            for pos in range(len(entity)):
                if label == pos + 1:
                    dict_entity[name_entity[pos]].append(
                        df_to_do['Word'][idx])

        return dict_entity
예제 #4
0
    def gazetteer(df: pd.DataFrame, language: str = 'ENG'):
        """
        Add one binary column per gazetteer type: 1 when the lowercased
        word appears in any gazetteer file of that type, else 0.

        :param df: dataframe with ``cst.WORD`` and ``cst.LOWERCASE`` columns
        :param language: NOTE(review): currently unused — the gazetteer
            set is hard-coded to 'en' below; confirm intended behaviour
        :return: the same dataframe with the new gazetteer columns
        """
        cfg = get_asset_root()

        list_gaz = get_type_of_gazetteers(cfg, 'en')
        for gaz_type in list_gaz:
            flags = [0] * len(df[cst.WORD])
            list_files = get_file_content(cfg, 'gazetteer_en',
                                          gaztype=gaz_type)
            for gaz_file in list_files:
                # A set gives O(1) membership per word instead of the
                # original list's O(n) scan.
                gaz = set(pd.read_csv(gaz_file)[cst.LOWERCASE])

                for index in range(len(df[cst.LOWERCASE])):
                    if df[cst.LOWERCASE][index] in gaz:
                        flags[index] = 1
            df[gaz_type] = flags
        return df
예제 #5
0
 def preuni_factory(df: pd.DataFrame, directory: str = "pre_freq_CONLL2003"):
     """
     Add one binary column per entity type flagging words whose
     *previous* token is a frequent pre-entity word.

     :param df: dataframe with a ``cst.LOWERCASE`` column
     :param directory: asset directory holding the preuni JSON files
     :return: the same dataframe with the new ``preuni*`` columns
     """
     preuni = {'preuniORG':  f'{directory}/preuniORG',
               'preuniLOC':  f'{directory}/preuniLOC', 'preuniPER': f'{directory}/preuniPER',
               'preuniMISC': f'{directory}/preuniMISC'}
     cfg = get_asset_root()
     for key, value in preuni.items():
         file_name = get_file_content(cfg, value)
         with open(file_name) as json_file:
             data = json.load(json_file)
         frequentname = data[key]
         LOGGER.info(frequentname)
         # Hoisted set: O(1) membership per row instead of a list scan.
         frequent = set(frequentname)
         # Row 0 has no previous token, hence the leading 0.
         flags = [0]
         for i in range(1, len(df)):
             if df.iloc[i - 1][cst.LOWERCASE] in frequent:
                 flags.append(1)
             else:
                 flags.append(0)
         df[key] = flags
     return df
예제 #6
0
 def frequency_factory(df: pd.DataFrame, directory: str = "freq_names_CONLL2003"):
     """
     Add one binary column per frequency list flagging words that are
     themselves frequent entity names.

     :param df: dataframe with a ``cst.LOWERCASE`` column
     :param directory: asset directory holding the frequency JSON files
     :return: the same dataframe with the new ``Freq*`` columns
     """
     freq = {'FreqNAMES': f'{directory}/freqNAMES', 'FreqORG': f'{directory}/freqORG',
             'FreqLOC':   f'{directory}/freqLOC', 'FreqPER': f'{directory}/freqPER',
             'FreqMISC':  f'{directory}/freqMISC'}
     cfg = get_asset_root()
     for key, value in freq.items():
         file_name = get_file_content(cfg, value)
         with open(file_name) as json_file:
             data = json.load(json_file)
         frequentname = data[key]
         LOGGER.info(frequentname)
         # Hoisted set: O(1) membership per row instead of a list scan.
         frequent = set(frequentname)
         freq_entity = [
             1 if getattr(row, cst.LOWERCASE) in frequent else 0
             for row in df.itertuples(index=True, name='Pandas')
         ]
         df[key] = freq_entity
     return df
예제 #7
0
 def clean_and_setup_training(
         cls,
         name: str,
         language: str = cst.NO_LANGUAGE) -> Union[object, None]:
     """
     Load a virgin dataset so that features can then be added to it.

     :param name: name of the virgin dataset
     :type name: str
     :param language: language of the dataset
     :type language: str
     :return: the new training dataset object, or an error message
         string when the dataframe cannot be read
     :rtype: Union[object, None]
     """
     self = Training_database()
     self.df_name = name
     self.language = language
     cfg = get_asset_root()
     file = get_file_content(cfg, name)
     try:
         self.df = pd.read_csv(filepath_or_buffer=file, index_col=0)
     except Exception:
         # Narrowed from a bare ``except`` (which also swallowed
         # KeyboardInterrupt/SystemExit); the historical string return is
         # kept so existing callers keep working.
         return "Error when trying to read the dataframe"
     return self
예제 #8
0
import jsonpickle
import json

# Globals
###############################################################################

LOGGER = logzero.logger


# Functions and Classes
###############################################################################

# Alphabet used for character-level encoding of words.
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)
cfg = get_asset_root()
directory = get_file_content(cfg, "French_own_data/frenchreuters_trained")
df = pd.read_csv(directory)
# Bucket the dataset's words by named-entity category; 'Nothing'
# collects no words in the visible code (its branch is commented out).
category_lines = {}
list_NE = ['ORG','LOC', 'PER']
for i in list_NE:
    category_lines[i] = []
category_lines['Nothing']=[]


# NOTE(review): `i` below is the leftover loop variable from the loop
# above (last value 'PER'), so len(df[i]) is the 'PER' column length —
# presumably len(df) was intended; confirm.
for word in range(0, len(df[i])):
    g=0  # NOTE(review): only used by the commented-out else branch below
    for i in list_NE:

        if df.loc[word, i] == 1:
            category_lines[i].append(df.loc[word, "Word"])
    #     else:
예제 #9
0
import collections
import jsonpickle
import json

# Globals
###############################################################################

LOGGER = logzero.logger

# Functions and Classes
###############################################################################

# Alphabet used for character-level encoding of words.
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)
cfg = get_asset_root()
directory = get_file_content(cfg, "CoNLL2003/train")
df = pd.read_csv(directory)
# Bucket the dataset's words by named-entity category; 'Nothing'
# collects no words in the visible code (its branch is commented out).
category_lines = {}
list_NE = ['ORG', 'LOC', 'MISC', 'PER']
for i in list_NE:
    category_lines[i] = []
category_lines['Nothing'] = []

# NOTE(review): `i` below is the leftover loop variable from the loop
# above (last value 'PER'), so len(df[i]) is the 'PER' column length —
# presumably len(df) was intended; confirm.
for word in range(0, len(df[i])):
    g = 0  # NOTE(review): only used by the commented-out else branch below
    for i in list_NE:

        if df.loc[word, i] == 1:
            category_lines[i].append(df.loc[word, "Word"])
    #     else:
    #         g+=1
예제 #10
0
            L.append(0)
            df[key] = L
        return df

    @staticmethod
    def debut(df: pd.DataFrame):
        """Flag sentence-initial tokens: the 'debut' column is 1 when the
        previous lowercased token is a sentence terminator, else 0."""
        terminators = ('.', '!', '?')
        tokens = df[cst.LOWERCASE]
        # Row 0 has no predecessor, hence the leading 0.
        flags = [0] + [
            1 if tokens.iloc[pos - 1] in terminators else 0
            for pos in range(1, len(df))
        ]
        df['debut'] = flags
        return df


# feature_list
# NOTE(review): presumably the English feature builders that take no
# asset-directory argument (cf. the name) — confirm against Feature_eng.
features_no_directory_eng = [Feature_eng.lowercase, Feature_eng.capitalize, Feature_eng.fullcap,
                         Feature_eng.length, Feature_eng.presufixe, Feature_eng.gazetteer, Feature_eng.number, Feature_eng.debut]



if __name__ == '__main__':
    # Smoke test: load the CoNLL2003 test split, build the lowercase and
    # preuni feature columns, and log the resulting dataframe.
    cfg = get_asset_root()
    train = get_file_content(cfg, "CoNLL2003/test")
    f = pd.read_csv(train)
    f = Feature_eng.lowercase(f)
    f = Feature_eng.preuni_factory(f, "pre_freq_CONLL2003")
    LOGGER.info(f.columns)
    LOGGER.info(f.head())