Example #1
def test_exm_wide():
    """
    ===================
    Example: import wide data
    ===================

    Next, we will import the same data, but from a ‘wide’ shaped file.
    Note that it is also possible to use a dataframe instead of an Excel or csv file as input
    for hgc.io.import_file(). This requires the argument 'dataframe' instead of 'file_path'.
    An advantage of this approach is that it avoids repeatedly reading the input file.
    """

    df_temp = pd.read_excel(Path(__file__).cwd() / 'tests/example1.xlsx',
                            sheet_name='wide',
                            header=None)  # ignore headers!

    # step 1: generate feature map
    feature_map2, feature_unmapped2, df_feature_map2 = ner.generate_feature_map(
        entity_orig=list(df_temp.iloc[2, 5:]))
    assert (feature_map2 == {
        'chloride': 'Cl',
        'manganese': 'Mn',
        'nietrate': 'NO3',
        'nitrate (filtered)': 'NO3'
    })
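    # Note that the misspelled 'nietrate' is still matched to 'NO3': the fuzzy
    # matching tolerates small spelling differences (explained in the feature
    # mapping example further below).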

    # step 2: generate unit map
    unit_map2, unit_unmapped2, df_unit_map2 = ner.generate_unit_map(
        entity_orig=list(df_temp.iloc[3, 5:]))
    assert (unit_map2 == {
        'mg-N/L': 'mg/L N',
        'mg/L': 'mg/L',
        'ug/L': 'μg/L',
        'μS/cm': 'μS/cm'
    })

    # step 3: import file
    df2 = io.import_file(dataframe=df_temp,
                         shape='wide',
                         slice_header=[3, slice(2, 5)],
                         slice_feature=[2, slice(5, None)],
                         slice_unit=[3, slice(5, None)],
                         slice_data=[slice(4, None)],
                         map_header={
                             **io.default_map_header(), 'loc.': 'LocationID',
                             'date': 'Datetime',
                             'sample': 'SampleID'
                         },
                         map_features={
                             **feature_map2, 'EC new sensor': 'ec_field'
                         },
                         map_units=unit_map2)[0]

    # step 4: convert to wide format
    df2_hgc = io.stack_to_hgc(df2)
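
    # The wide import should yield the same HGC table as the stacked import of this
    # data (see the stacked example further below); inspect it to verify:
    df2_hgc.head(3)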
Example #2
def test_VitensOMIVE():
    # WD = Path(tests.__file__).parent / 'provincie_data_long_preprocessed.csv'
    # WD = r'D:/DBOX/Dropbox/008KWR/0081Projects/kennisimpulse/Opkomende stoffen KIWK Zuid_preprocessed.csv'
    WD = r'C:\Users\beta6\Documents\Dropbox\008KWR\0081Projects\kennisimpulse/preprocessed/Vitens_PP_WP_OMIVE_2009_2020.xlsx'
    df_temp = pd.read_excel(WD, header=None)
    # n_row limits how many rows are read; None means until the end of the file
    n_row = None
    feature_map, feature_unmapped, df_feature_map = ner.generate_feature_map(entity_orig=list(df_temp.iloc[1, slice(8, 819)].dropna()))
    unit_map, unit_unmapped, df_unit_map = ner.generate_unit_map(entity_orig=list(df_temp.iloc[2, slice(8, 819)].dropna()))

    # create a df to record what has been mapped and what has not
    df_map = pd.DataFrame((list(feature_map.keys()), list(feature_map.values()),
                           list(unit_map.keys()), list(unit_map.values())),
                          index=['Feature', 'Mapped feature', 'Unit', 'Mapped unit']).transpose()
    if feature_unmapped:
        df_map = df_map.join(pd.DataFrame(feature_unmapped, columns=['Unmapped feature']))
    if unit_unmapped:
        df_map = df_map.join(pd.DataFrame(unit_unmapped, columns=['Unmapped unit']))
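
    # df_map can now be exported and reviewed to decide which manual mappings to
    # add, e.g. the 'pH(1)': 'pH' entry merged into map_features below.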

    dct2_arguments = {
        'file_path': WD,
        'sheet_name': 'Sheet1',
        'shape': 'wide',
        'slice_header': [1, slice(1, 8)],
        'slice_feature': [1, slice(8, None)],
        'slice_unit': [2, slice(8, None)],
        'slice_data': [slice(3, n_row), slice(1, None)],

        'map_header': {
            **io.default_map_header(),
            # 'Monsterpuntcode': 'LocationID',
            # 'Omschrijving (Parameter)':'Feature',
            # 'Eenheid (Parameter)': 'Unit',
            # 'Waarde numeriek': 'Value', # Gerapporteerde waarde, right?!
            'Datum': 'Datetime',
            'Naam': 'SampleID',  # or should the 'Analyse' column be the SampleID?
        },
        'map_features': {**feature_map,'pH(1)':'pH'},
        'map_units': {**unit_map},
    }
    df2 = io.import_file(**dct2_arguments)[0]
    df2_hgc = io.stack_to_hgc(df2)
    # with pd.ExcelWriter(r'D:/DBOX/Dropbox/008KWR/0081Projects/kennisimpulse/KIWK_Zuid_processed.xlsx') as writer:          
    with pd.ExcelWriter(r'C:\Users\beta6\Documents\Dropbox\008KWR\0081Projects\kennisimpulse/VitensOMIVE.xlsx') as writer:          
        df2_hgc.to_excel(writer, sheet_name='hgc_VitenOMIVE')
        # df2.to_excel(writer, sheet_name='VitensOMIVE')
        df_map.to_excel(writer, sheet_name='mapAndUnmap')  
    df2.to_csv(r'C:\Users\beta6\Documents\Dropbox\008KWR\0081Projects\kennisimpulse/VitensOMIVE_ref.csv')
Example #3
def test_LIMS_Ruw_2017_2019():
    # WD = Path(tests.__file__).parent / 'provincie_data_long_preprocessed.csv'
    # WD = r'D:/DBOX/Dropbox/008KWR/0081Projects/kennisimpulse/Opkomende stoffen KIWK Zuid_preprocessed.csv'
    WD = r'C:\Users\beta6\Documents\Dropbox\008KWR\0081Projects\kennisimpulse/LIMS_Ruw_2017_2019_preprocessed.xlsx'
    df_temp = pd.read_excel(WD, header=None)
    # n_row limits how many rows are read; None means until the end of the file
    n_row = None
    feature_map, feature_unmapped, df_feature_map = ner.generate_feature_map(entity_orig=list(df_temp.iloc[slice(2, n_row), 6].dropna()))
    unit_map, unit_unmapped, df_unit_map = ner.generate_unit_map(entity_orig=list(df_temp.iloc[slice(2, n_row), 9].dropna()))

    # create a df to record what has been mapped and what has not
    df_map = pd.DataFrame((list(feature_map.keys()), list(feature_map.values()),
                           list(unit_map.keys()), list(unit_map.values())),
                          index=['Feature', 'Mapped feature', 'Unit', 'Mapped unit']).transpose()
    if feature_unmapped:
        df_map = df_map.join(pd.DataFrame(feature_unmapped, columns=['Unmapped feature']))
    if unit_unmapped:
        df_map = df_map.join(pd.DataFrame(unit_unmapped, columns=['Unmapped unit']))

    dct2_arguments = {
        'file_path': WD,
        'sheet_name': 'Export Worksheet',
        'shape': 'stacked',
        'slice_header': [1, slice(1, 10)],
        'slice_data': [slice(1, n_row), slice(1, 10)],

        'map_header': {
            **io.default_map_header(),
            'POINTDESCR': 'LocationID',
            'ANALYTE':'Feature',
            'UNITS': 'Unit',
            'FINAL': 'Value',  # 'FINAL' appears to hold the reported value ('Gerapporteerde waarde')
            'SAMPDATE': 'Datetime',
            'TESTNO': 'SampleID',  # the test/analysis number serves as SampleID here
        },
        'map_features': {**feature_map,'pH(1)':'pH'},
        'map_units': {**unit_map, 'µg/l Hxdcn-d34':'µg/l'},
    }
    df2 = io.import_file(**dct2_arguments)[0]
    df2_hgc = io.stack_to_hgc(df2)
    # with pd.ExcelWriter(r'D:/DBOX/Dropbox/008KWR/0081Projects/kennisimpulse/KIWK_Zuid_processed.xlsx') as writer:          
    with pd.ExcelWriter(r'C:\Users\beta6\Documents\Dropbox\008KWR\0081Projects\kennisimpulse/LIMS_Ruw_2017_2019_processed.xlsx') as writer:          
        df2_hgc.to_excel(writer, sheet_name='hgc_LIMS_Ruw_2017_2019')
        df2.to_excel(writer, sheet_name='LIMS_Ruw_2017_2019')
        df_map.to_excel(writer, sheet_name='mapAndUnmap')             
Example #4
def test_province():
    # WD = Path(tests.__file__).parent / 'provincie_data_long_preprocessed.csv'
    WD = r'C:\Users\beta6\Documents\Dropbox\008KWR\0081Projects\kennisimpulse/provincie_data_long_preprocessed.csv'
    df_temp = pd.read_csv(WD, encoding='ISO-8859-1', header=None)

    # n_row limits how many rows are read; None means until the end of the file
    n_row = None
    feature_map, feature_unmapped, df_feature_map = ner.generate_feature_map(entity_orig=list(df_temp.iloc[slice(2, n_row), 25].dropna()))
    unit_map, unit_unmapped, df_unit_map = ner.generate_unit_map(entity_orig=list(df_temp.iloc[slice(2, n_row), 26].dropna()))
 
    # create a df to record what has been mapped and what has not
    df_map = pd.DataFrame((list(feature_map.keys()), list(feature_map.values()),
                           list(unit_map.keys()), list(unit_map.values())),
                          index=['Feature', 'Mapped feature', 'Unit', 'Mapped unit']).transpose()
    if feature_unmapped:
        df_map = df_map.join(pd.DataFrame(feature_unmapped, columns=['Unmapped feature']))
    if unit_unmapped:
        df_map = df_map.join(pd.DataFrame(unit_unmapped, columns=['Unmapped unit']))

    dct2_arguments = {
        'file_path': WD,
        'sheet_name': 'stacked',
        'shape': 'stacked',
        'slice_header': [1, slice(1, None)],
        'slice_data': [slice(1, n_row), slice(1, None)],
        'map_header': {
            **io.default_map_header(),
            'MeetpuntId': 'LocationID',
            'parameter':'Feature',
            'eenheid': 'Unit',
            'waarde': 'Value',
            'Opgegeven bemonstering datum': 'Datetime',
            'Monsternummer': 'SampleID',  # a "SampleID" header already exists but contains the wrong data; use "Monsternummer" (sample number) instead
            # 'SampleID': None  # otherwise it would appear twice in the output file
        },
        'map_features': {**feature_map,'pH(1)':'pH'},
        'map_units': {**unit_map, 'oC':'°C'},
    }
    df2 = io.import_file(**dct2_arguments)[0]
    df2_hgc = io.stack_to_hgc(df2)
    with pd.ExcelWriter(r'C:\Users\beta6\Documents\Dropbox\008KWR\0081Projects\kennisimpulse/provincie_processed.xlsx') as writer:
        df2_hgc.to_excel(writer, sheet_name='hgc_prov')
        df2.to_excel(writer, sheet_name='df_prov')
        df_map.to_excel(writer, sheet_name='mapAndUnmap')
Example #5
def test_KIWKVenloschol():
    # WD = Path(tests.__file__).parent / 'provincie_data_long_preprocessed.csv'
    # WD = r'D:/DBOX/Dropbox/008KWR/0081Projects/kennisimpulse/Opkomende stoffen KIWK Zuid_preprocessed.csv'
    WD = r'C:\Users\beta6\Documents\Dropbox\008KWR\0081Projects\kennisimpulse/Opkomende stoffen KIWK Venloschol_preprocessed.xlsx'
    df_temp = pd.read_excel(WD, header=None)
    # n_row limits how many rows are read; None means until the end of the file
    n_row = None
    feature_map, feature_unmapped, df_feature_map = ner.generate_feature_map(entity_orig=list(df_temp.iloc[slice(2, n_row), 20].dropna()))
    unit_map, unit_unmapped, df_unit_map = ner.generate_unit_map(entity_orig=list(df_temp.iloc[slice(2, n_row), 21].dropna()))
 
    # create a df to record what has been mapped and what has not
    df_map = pd.DataFrame((list(feature_map.keys()), list(feature_map.values()),
                           list(unit_map.keys()), list(unit_map.values())),
                          index=['Feature', 'Mapped feature', 'Unit', 'Mapped unit']).transpose()
    if feature_unmapped:
        df_map = df_map.join(pd.DataFrame(feature_unmapped, columns=['Unmapped feature']))
    if unit_unmapped:
        df_map = df_map.join(pd.DataFrame(unit_unmapped, columns=['Unmapped unit']))

    dct2_arguments = {
        'file_path': WD,
        'sheet_name': 'Export KoW 2.0',
        'shape': 'stacked',
        'slice_header': [1, slice(1, 24)],
        'slice_data': [slice(1, n_row), slice(1, 24)],
        'map_header': {
            **io.default_map_header(),
            'Monsterpunt': 'LocationID',
            'Parameter omschrijving':'Feature',
            'Eenheid': 'Unit',
            'Gerapporteerde waarde': 'Value',  # 'Gerapporteerde waarde' = reported value
            'Monstername datum': 'Datetime',
            'Analyse': 'SampleID',  # the analysis number serves as SampleID here
        },
        'map_features': {**feature_map,'pH(1)':'pH'},
        'map_units': {**unit_map, 'µg/l atrazine-D5':'µg/l'},
    }
    df2 = io.import_file(**dct2_arguments)[0]
    df2_hgc = io.stack_to_hgc(df2)
    # with pd.ExcelWriter(r'D:/DBOX/Dropbox/008KWR/0081Projects/kennisimpulse/KIWK_Zuid_processed.xlsx') as writer:          
    with pd.ExcelWriter(r'C:\Users\beta6\Documents\Dropbox\008KWR\0081Projects\kennisimpulse/Opkomende stoffen KIWK Venloschol_processed.xlsx') as writer:          
        df2_hgc.to_excel(writer, sheet_name='hgc_KIWK Venloschol')
        df2.to_excel(writer, sheet_name='KIWK Venloschol')
        df_map.to_excel(writer, sheet_name='mapAndUnmap')
Example #6
def test_mapping_features():
    """
    ===================
    Mapping feature
    ===================

    The functions generate_feature_map() and generate_unit_map() use Named Entity
    Recognition (NER) techniques to match original entities to the entities used by HGC.
    The matching is based on the fuzzywuzzy module, which uses the Levenshtein distance
    to quantify the difference between an original entity and an HGC-compatible entity.
    Each original entity is matched to the HGC entity to which it has the smallest
    distance. A match is only successful if the score based on the Levenshtein distance
    remains above a certain threshold.

    For the features, a default database is provided with the module that contains
    both features and a selection of aliases (synonyms). The NER function will try to
    find which alias provides the best match (= highest score) for each original feature.
    """

    # Print first lines of default database for mapping features.
    print(ner.default_feature_alias_dutch_english.head())
    """
    By default, all columns are used except for 'CAS'.
    It is possible to change the selection of columns through the argument 'alias_cols'.
    In the next example, we will attempt mapping using the CAS number.
    """

    # example with mapping with CAS number
    df_feature_alias = ner.generate_entity_alias(
        df=ner.entire_feature_alias_table,
        entity_col='Feature',
        alias_cols=['CAS'])

    df_temp = pd.read_excel(Path(__file__).cwd() / 'tests/example1.xlsx',
                            sheet_name='wide',
                            header=None)  # ignore headers!
    feature_map3, feature_unmapped3, df_feature_map3 =\
        ner.generate_feature_map(entity_orig=list(df_temp.iloc[1, 5:]),
                                    df_entity_alias=df_feature_alias,
                                    match_method='exact')

    # check if features are correctly mapped
    print(feature_map3)
    """
    The results of the mapping with CAS number are very poor compared to the previous
    mapping. This is logical in this case, since there are no CAS numbers in the
    original file.

    Note that in this case we adjusted the matching method via the argument
    'match_method' = 'exact'. This works faster, but features must then be spelled
    exactly the same as in the feature list.

    It is also possible to load a user defined database with the argument
    'df_entity_alias'.

    ===================
    Mapping units
    ===================

    For mapping units, similar functionalities are available as for mapping features,
    only with a different database and different alias_cols.

    """
    # Print first lines of default database for mapping units.
    print(ner.default_unit_alias.head())
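
    """
    Analogous to the features, a unit mapping can be generated directly. A minimal
    sketch (the input units below are taken from the earlier examples):
    """
    unit_map_exm, unit_unmapped_exm, df_unit_map_exm = ner.generate_unit_map(
        entity_orig=['mg/L', 'ug/L', 'mg-N/L'])
    print(unit_map_exm)  # 'ug/L' is expected to map to 'μg/L', as seen above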
    """
Example #7
def test_exm_stacked():
    """
    ----------------------
    Step 1: ner.generate_feature_map()
    ----------------------
    First, map the features in the original file to feature names recognized by HGC.
    """

    # compile a list of features by slicing the original file
    lst_features = list(
        pd.read_excel('./tests/example1.xlsx',
                      sheet_name='stacked')['Feature'])

    # automatically detect features using text recognition
    feature_map, feature_unmapped, df_feature_map = ner.generate_feature_map(
        entity_orig=lst_features)

    # check if features are correctly mapped
    assert (feature_map == {
        'chloride': 'Cl',
        'nitrate (filtered)': 'NO3',
        'manganese': 'Mn',
        'nietrate': 'NO3'
    })

    # check for which features the algorithm was not able to find a match that met the minimum resemblance.
    assert (feature_unmapped == ['EC new sensor'])

    # The dataframe can be used to inspect in more detail how well the original features
    # were matched to HGC features (the matching scores).
    # This can be handy if you want to identify common errors and update the underlying database.
    df_feature_map.head(5)
    """
    In this case we find that the algorithm was not able to find a match for one 
    of the features ('EC new sensor'). Hence, we need to adjust the mapping manually.
    """

    # manually adjust the mapping by merging with a user defined dictionary (optional)
    feature_map2 = {**feature_map, 'EC new sensor': 'ec_field'}
    """
    ----------------------
    Step 2: ner.generate_unit_map()
    ----------------------
    Next, we need to make a mapping for the units, using the same approach as for the features. 
    """
    lst_units = list(
        pd.read_excel(Path(__file__).cwd() / 'tests/example1.xlsx',
                      sheet_name='stacked')['Unit'])
    unit_map, unit_unmapped, df_unit_map = ner.generate_unit_map(
        entity_orig=lst_units)
    assert (unit_map == {
        'mg-N/L': 'mg/L N',
        'mg/L': 'mg/L',
        'ug/L': 'μg/L',
        'μS/cm': 'μS/cm'
    })
    """
    ----------------------
    Step 3: hgc.io.import_file()
    ----------------------
    The third step is to read the original file and convert the data to the desired
    data model. This requires that we first indicate where to find the data and how
    to convert it.
    """

    # Arguments defining where to find data
    slice_header = [0, slice(0, 6)]  # row 0
    slice_data = [slice(1, None)]  # row 1 till end of file. "None" indicates "end" here.
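
    # For intuition, the slice arguments follow df.iloc conventions (as the wide
    # example above shows with df_temp.iloc): slice_header = [0, slice(0, 6)]
    # addresses row 0, columns 0-5 of the raw sheet. An illustrative check:
    df_raw = pd.read_excel(Path(__file__).cwd() / 'tests/example1.xlsx',
                           sheet_name='stacked', header=None)
    print(df_raw.iloc[slice_header[0], slice_header[1]])  # the six header labels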

    # Arguments how to convert the data

    # map_header --> mapping of how to rename the headers
    # Note: the headers 'Value', 'Unit' and 'SampleID' are compulsory. Other headers can be any string.
    map_header = {
        **io.default_map_header(), 'loc.': 'LocationID',
        'date': 'Datetime',
        'sample': 'SampleID'
    }
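
    # For reference, the default header mapping that is being extended above can
    # be inspected (same pattern as the default dtypes printed further below):
    print(io.default_map_header())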

    # map_features --> see step 1

    # map_units --> see step 2

    # feature_units -->  mapping of the desired units for each feature
    # For instance, we can inspect the default units for Cl, NO3 and ec_field
    assert (io.default_feature_units()['Cl'] == 'mg/L')
    assert (io.default_feature_units()['NO3'] == 'mg/L')
    assert (io.default_feature_units()['ec_field'] == 'mS/m')
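
    # These defaults can be overridden per feature by merging in a dictionary,
    # analogous to map_header above. A hedged sketch ('mmol/L' is a hypothetical
    # target unit; assumes default_feature_units() returns a plain dict):
    feature_units = {**io.default_feature_units(), 'Cl': 'mmol/L'}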

    # column_dtype --> desired dtype for each column
    # we will use the default dtype
    print(io.default_column_dtype())  # use default values
    """
    Now that we have defined all the arguments, let's import the data.
    """

    # import_file() returns several dataframes: the imported data, the duplicated
    # rows, and the rows with NaN values. Calling it once and indexing the result
    # avoids reading the input file three times.
    dfs = io.import_file(file_path=str(Path(__file__).cwd() / 'tests/example1.xlsx'),
                         sheet_name='stacked',
                         shape='stacked',
                         slice_header=slice_header,
                         slice_data=slice_data,
                         map_header=map_header,
                         map_features=feature_map2,
                         map_units=unit_map)
    df = dfs[0]
    df.head(3)  # imported data
    df_1 = dfs[1]
    df_1.head(3)  # duplicated rows
    df_2 = dfs[2]
    df_2.head(3)  # rows with NaN values
    """
    ----------------------
    Step 4: hgc.io.stack_to_hgc()
    ----------------------
    Finally, we need to pivot the stacked data to the wide format used by HGC.
    The default is to use 'LocationID', 'Datetime' and 'SampleID' as index.
    """
    df_hgc = io.stack_to_hgc(df)
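
    # Inspect the resulting wide-format table: one row per sample
    # (LocationID/Datetime/SampleID index) and one column per feature.
    df_hgc.head(3)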
Example #8
def test_hwl():
    """
    Created on Wed Sep 23 15:14:48 2020

    @author: griftba
    """
    import pandas as pd
    import hgc
    from hgc import ner 
    from hgc import io
    from pathlib import Path

    #pip install -U git+https://github.com/KWR-Water/[email protected]

    #    df_temp = pd.read_excel(r'./Chem_Data_bas.xlsx', sheet_name='Raai 2019-2020', header=None) # ignore headers!

    # step 1: generate feature map
    file_name = r'D:\DBOX\Dropbox\008KWR\0081Projects\QSAR_NER\testing_bas\Chem_Data_bas.xlsx'
    lst_features = list(pd.read_excel(file_name, sheet_name='Raai 2019-2020')['Parameter'])
    feature_map, feature_unmapped, df_feature_map = ner.generate_feature_map(entity_orig=lst_features)
    lst_units = list(pd.read_excel(file_name, sheet_name='Raai 2019-2020')['Unit'])
    unit_map, unit_unmapped, df_unit_map = ner.generate_unit_map(entity_orig=lst_units)
    slice_header = [0, slice(0, 8)]  # row 0
    slice_data = [slice(1, None), slice(0, 8)]
    dct2_arguments = {
        'file_path': file_name,
        'sheet_name': 'Raai 2019-2020',
        'shape': 'stacked',
        'slice_header': slice_header,
        'slice_data': slice_data,
        'map_header': {
            **io.default_map_header(), 'Date': 'Datetime', 'Sample': 'SampleID', 'Parameter':'Feature',
        },
        'map_features': feature_map,
        'map_units': unit_map,
    }
    # import once and reuse the result: index [0] holds the imported data and
    # index [2] the rows with NaN values
    dfs = io.import_file(**dct2_arguments)
    df2 = dfs[0]
    df2_2 = dfs[2]

    df2_DLV = hgc.io.stack_to_hgc(df2)

    df2_DLV.to_csv('DLV_raai2019_2020.csv')


    # make the dataframe HGC-compatible and check that it is valid
    df2_DLV.hgc.make_valid()
    is_valid = df2_DLV.hgc.is_valid
    print(is_valid)

    # Recognized HGC columns
    hgc_cols = df2_DLV.hgc.hgc_cols
    print(hgc_cols)

    # sum of anions according to the Stuyfzand method
    sum_anions = df2_DLV.hgc.get_sum_anions_stuyfzand()
    print(sum_anions)

    water_types = df2_DLV.hgc.get_stuyfzand_water_type()
    print(water_types)
    bex = df2_DLV.hgc.get_bex()  # base exchange index
    print(bex)