def test_clean(self):

    values = pd.Series([
        'Mary-ann', 'Bob :)', 'Angel', 'Bob (alias Billy)', 'Mary ann',
        'John', np.nan
    ])
    expected = pd.Series(
        ['mary ann', 'bob', 'angel', 'bob', 'mary ann', 'john', np.nan])

    clean_series = clean(values)

    # Check if series are identical.
    pdt.assert_series_equal(clean_series, expected)

    clean_series_nothing = clean(
        values,
        lowercase=False,
        replace_by_none=False,
        replace_by_whitespace=False,
        strip_accents=False,
        remove_brackets=False)

    # Check if nothing happened.
    pdt.assert_series_equal(clean_series_nothing, values)
def _expand_name(df, col):
    df[col] = preprocessing.clean(df[col])
    df[col + "---initial"] = df[col].str[0]
    df[col + "---soundex"] = preprocessing.phonetic(df[col], method="soundex")
    df[col + "---nysiis"] = preprocessing.phonetic(df[col], method="nysiis")
    df[col + "---metaphone"] = preprocessing.phonetic(df[col],
                                                      method="metaphone")
def test_clean_brackets(self):
    values = pd.Series([np.nan, 'bra(cke)ts', 'brackets with (brackets)'])
    expected = pd.Series([np.nan, 'brats', 'brackets with'])

    clean_series = clean(values, remove_brackets=True)

    # Check if series are identical.
    pdt.assert_series_equal(clean_series, expected)
def test_clean_lower(self):
    values = pd.Series([np.nan, 'LowerHigher', 'HIGHERLOWER'])
    expected = pd.Series([np.nan, 'lowerhigher', 'higherlower'])

    clean_series = clean(values, lowercase=True)

    # Check if series are identical.
    pdt.assert_series_equal(clean_series, expected)
def clean_raw(df: pd.DataFrame, exclude=['date', 'mpi']):

    def _is_excluded(col: str, exclude: list) -> bool:
        return any(e in col for e in exclude)

    # Apply clean() to every object (string) column whose name does not
    # contain one of the excluded substrings.
    obj_columns = df.select_dtypes(include='object').columns
    temp = df.copy()
    for col in obj_columns:
        if not _is_excluded(col, exclude):
            temp[col] = clean(temp[col])
    return temp
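# A minimal usage sketch for clean_raw above; the DataFrame and column names
# are invented for illustration, and `clean` is assumed to be
# recordlinkage.preprocessing.clean as in the rest of these snippets.
example = pd.DataFrame({
    'name': ['Mary-Ann', 'Bob (alias Billy)'],
    'visit_date': ['2020-01-01', '2020-02-02'],  # skipped: name contains 'date'
    'mpi_code': ['A1', 'B2'],                    # skipped: name contains 'mpi'
})
cleaned = clean_raw(example)
# 'name' becomes ['mary ann', 'bob']; the excluded columns pass through
# unchanged.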
from typing import List


def preproc_attributes(df: pd.DataFrame, names: List[str]) -> pd.DataFrame:
    for n in names:
        df[n + '_clean'] = preprocessing.clean(df[n])
        df[n + '_soundex'] = preprocessing.phonetic(df[n + '_clean'],
                                                    'soundex')
        df[n + '_nysiis'] = preprocessing.phonetic(df[n + '_clean'], 'nysiis')
        df[n + '_metaphone'] = preprocessing.phonetic(df[n + '_clean'],
                                                      'metaphone')
        df[n + '_match_rating'] = preprocessing.phonetic(
            df[n + '_clean'], 'match_rating')
    return df
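# Hypothetical call for preproc_attributes; the column names and values are
# made up. Soundex maps similar-sounding surnames to the same code, e.g.
# 'smith' and 'smyth' both encode to S530.
df = pd.DataFrame({'given_name': ['Jonathan', 'Jon'],
                   'surname': ['Smith', 'Smyth']})
df = preproc_attributes(df, ['given_name', 'surname'])
# df now also holds given_name_clean, given_name_soundex, ...,
# surname_match_rating columns for blocking and comparison.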
def preprocess(df1, df2):
    # set index to the id column
    df1 = df1.set_index('id')
    df2 = df2.set_index('id')

    # replace empty cells with NaN
    df1 = df1.replace("", np.nan)
    df2 = df2.replace("", np.nan)

    # drop country, locality and region
    df1 = df1.drop(['country', 'locality', 'region'], axis=1)
    df2 = df2.drop(['country', 'locality', 'region'], axis=1)

    # remove all non-numbers from phone & convert to numeric
    df1.loc[:, 'phone'] = pd.to_numeric(phonenumbers(df1.loc[:, 'phone']))
    df2.loc[:, 'phone'] = pd.to_numeric(phonenumbers(df2.loc[:, 'phone']))

    # convert postal_code to numeric
    df1.loc[:, 'postal_code'] = pd.to_numeric(df1.loc[:, 'postal_code'])
    df2.loc[:, 'postal_code'] = pd.to_numeric(df2.loc[:, 'postal_code'])

    # clean street_address & website
    df1.loc[:, 'street_address'] = clean(df1.loc[:, 'street_address'])
    df1.loc[:, 'website'] = clean(df1.loc[:, 'website'])
    df2.loc[:, 'street_address'] = clean(df2.loc[:, 'street_address'])
    df2.loc[:, 'website'] = clean(df2.loc[:, 'website'])

    # convert NaNs to 0s for numeric columns
    num_cols = ['latitude', 'longitude', 'phone', 'postal_code']
    df1.loc[:, num_cols] = df1.loc[:, num_cols].replace(np.nan, 0)
    df2.loc[:, num_cols] = df2.loc[:, num_cols].replace(np.nan, 0)

    return df1, df2
def test_clean_accent_stripping(self):

    values = pd.Series(['ősdfésdfë', 'without'])
    expected = pd.Series(['osdfesdfe', 'without'])

    values_unicode = pd.Series([u'ősdfésdfë', u'without'])
    expected_unicode = pd.Series([u'osdfesdfe', u'without'])

    # values_callable = pd.Series([u'ősdfésdfë', u'without'])
    # expected_callable = pd.Series([u'ősdfésdfë', u'without'])

    # # Callable.
    # pdt.assert_series_equal(
    #     clean(values_callable, strip_accents=lambda x: x),
    #     expected_callable)

    # Check if series are identical.
    pdt.assert_series_equal(
        clean(values, strip_accents='unicode'), expected)

    # Check if series are identical.
    pdt.assert_series_equal(clean(values, strip_accents='ascii'), expected)

    # Check if series are identical.
    pdt.assert_series_equal(
        clean(values_unicode, strip_accents='unicode'), expected_unicode)

    # Check if series are identical.
    pdt.assert_series_equal(
        clean(values_unicode, strip_accents='ascii'), expected_unicode)

    with pytest.raises(ValueError):
        clean(values, strip_accents='unknown_algorithm')
def _create_pseudo_id(df_a):
    # first letter of the given name
    given_name_1 = pandas.Series(clean(
        df_a['given_name']).str[:1].dropna().apply(
            lambda x: alphabet_position(x)),
        name='given_name_1')

    # second letter of the given name
    given_name_2 = pandas.Series(clean(
        df_a['given_name']).str[1:2].dropna().apply(
            lambda x: alphabet_position(x)),
        name='given_name_2')

    # first letter of the surname
    surname_1 = pandas.Series(clean(df_a['surname']).str[:1].dropna().apply(
        lambda x: alphabet_position(x)),
        name='surname_1')

    # second letter of the surname
    surname_2 = pandas.Series(clean(df_a['surname']).str[1:2].dropna().apply(
        lambda x: alphabet_position(x)),
        name='surname_2')

    # year of birth
    yearb = pandas.Series(df_a['YearB'].dropna().astype(str).replace(
        'nan', ''), name='YearB')

    # day of birth
    dayb = pandas.Series(df_a['DayB'].dropna().astype(str).replace('nan', ''),
                         name='DayB')

    df_u = pandas.concat(
        [given_name_1, given_name_2, surname_1, surname_2, yearb, dayb],
        axis=1,
        sort=False)
    df_u = df_u.fillna('')

    return df_u.apply(lambda x: ''.join(x), axis=1)
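# alphabet_position is not defined in this snippet. A minimal sketch that is
# consistent with its use above (encoding one cleaned, lowercased letter as
# its 1-based position in the alphabet, as a string) might look like this;
# the original helper may differ.
def alphabet_position(letter):
    if letter and letter.isalpha():
        return str(ord(letter.lower()) - ord('a') + 1)
    return ''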
def match_prep(df: pd.DataFrame, add_var: str) -> pd.DataFrame:
    """
    Prepare and clean a dataframe for address matching.

    Parameters
    ----------
    df : pd.DataFrame
        A dataframe to prepare.
    add_var : str
        The variable that contains the full address (minus postcode).

    Returns
    -------
    df : pd.DataFrame
        df with 'clean_address' and 'numbers' columns added. Postcode column
        cleaned.
    """
    df = df.copy()

    # Clean the address, collapse whitespace and strip leading zeros from
    # house numbers.
    df["clean_address"] = (
        clean(df[add_var], replace_by_whitespace="[\\_]")
        .str.replace(" +", " ", regex=True)
        .str.replace("([^0-9])0+([0-9])", "\\1\\2", regex=True)
        .str.replace("^0+", "", regex=True)
        .str.strip()
    )

    # Keep only the numeric tokens of the address, sorted for a stable
    # comparison key.
    df["numbers"] = (
        df.clean_address.str.replace("[a-z]{2,}", "", regex=True)
        .str.replace(" +", " ", regex=True)
        .str.strip()
        .str.split()
        .apply(sorted)
        .apply(lambda x: " ".join(x))
    )

    df.postcode = clean(df.postcode, replace_by_whitespace="[\\_]")

    return df
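# A small illustration of match_prep on made-up data; the 'address' column
# name and the presence of a 'postcode' column are assumptions about the
# expected input shape.
addresses = pd.DataFrame({
    'address': ['Flat 02, 10 High_Street', '10 high street flat 2'],
    'postcode': ['AB1_2CD', 'ab1 2cd'],
})
prepped = match_prep(addresses, 'address')
# Both rows end up with numbers == '10 2' (tokens sorted as strings), so the
# leading zero and the underscore no longer block a match.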
def test_clean_unicode(self):

    values = pd.Series([
        u'Mary-ann', u'Bob :)', u'Angel', u'Bob (alias Billy)', u'Mary ann',
        u'John', np.nan
    ])
    expected = pd.Series([
        u'mary ann', u'bob', u'angel', u'bob', u'mary ann', u'john', np.nan
    ])

    clean_series = clean(values)

    # Check if series are identical.
    pdt.assert_series_equal(clean_series, expected)
def clean_data(df):
    # Treat whitespace-only cells as missing and drop rows without an id.
    df = df.replace(r'^\s*$', np.nan, regex=True)
    df = df[df['customer_id'].notna()]

    df['phone_clean'] = phonenumbers(df['phone'])
    df['first_name_clean'] = clean(df['first_name'])
    df['last_name_clean'] = clean(df['last_name'])
    df['address_clean'] = clean(df['address'])
    df['city_clean'] = clean(df['city'])
    df['state_clean'] = clean(df['state'])
    # Keep only the part of the ZIP before the hyphen (drop the +4 extension).
    df['zip_clean'] = clean(df['zip'].str.split('-').str[0])
    return df
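# Sketch of the input clean_data expects; the column names come from the
# function body, the values are invented.
customers = pd.DataFrame({
    'customer_id': ['c1', '  '],
    'phone': ['(555) 123-4567', ''],
    'first_name': ['Mary-Ann', ''],
    'last_name': ["O'Brien", ''],
    'address': ['12 Main St.', ''],
    'city': ['Springfield', ''],
    'state': ['IL', ''],
    'zip': ['62701-1234', ''],
})
cleaned = clean_data(customers)
# The second row is dropped (its blank customer_id becomes NaN) and
# zip_clean keeps only '62701'.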
def test_clean_parameters(self):

    values = pd.Series([
        u'Mary-ann', u'Bob :)', u'Angel', u'Bob (alias Billy)', u'Mary ann',
        u'John', np.nan
    ])
    expected = pd.Series([
        u'mary ann', u'bob', u'angel', u'bob', u'mary ann', u'john', np.nan
    ])

    clean_series = clean(
        values,
        lowercase=True,
        replace_by_none=r'[^ \-\_A-Za-z0-9]+',
        replace_by_whitespace=r'[\-\_]',
        remove_brackets=True)

    # Check if series are identical.
    pdt.assert_series_equal(clean_series, expected)
def run_phonetic_encoding(df_a, select_encoding):
    """ Calculate the phonetic encoding of the selected fields """
    logging.info("run phonetic encoding ....")
    df_a_processed = df_a.copy()

    # FIXME Errors when selecting non-string columns like soc_sec_id
    # TODO Include double metaphone in Python Toolkit
    for field, encoding in select_encoding.items():
        if encoding == 'double_metaphone':
            df_a_processed[encoding + "_" + field] = df_a[field].apply(
                lambda x: doublemetaphone(str(x))[0]
                if np.all(pd.notnull(x)) else x)
        else:
            df_a_processed[encoding + "_" + field] = phonetic(
                clean(df_a[field]), method=encoding)

    cols = df_a_processed.columns.to_list()
    return df_a_processed, cols
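# Illustrative call for run_phonetic_encoding; the shape of select_encoding
# (field name -> encoding method) is inferred from the loop above, and the
# data is invented.
df = pd.DataFrame({'surname': ['Smith', 'Smyth']})
processed, cols = run_phonetic_encoding(df, {'surname': 'soundex'})
# processed gains a 'soundex_surname' column holding the same Soundex code
# for both rows.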
def get_features(df):
    """Clean and engineer new features for match comparison."""
    # remove non-alpha characters from names and generate phonetic codes:
    df['Original Name_cleaned'] = clean(df['Original Name'])
    df['o_name_phonetic_code'] = phonetic(df['Original Name_cleaned'],
                                          method="soundex")
    df['Age'] = df['Age'].astype('float')
    # df['add2_lat'] = df['add2_lat'].astype('float')
    # df['add2_long'] = df['add2_long'].astype('float')

    # remove non-numeric characters from phone numbers and save as strings
    # for Levenshtein comparison:
    df['Phone'] = df['Phone'].str.replace(r'[^0-9]+', '', regex=True)
    df['Phone'] = df['Phone'].astype(str)

    # remove all characters that aren't part of a URL:
    df['social_media'] = df['social_media'].apply(lambda x: re.findall(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        str(x))).sort_values().str[0]
    df['social_media'] = df['social_media'].fillna(np.nan)
    return df
get_ipython().run_line_magic('pinfo', 'pandas.Series')
from recordlinkage.preprocessing import clean
wikidata
etichette = json.load(
    open('/Users/focs/wikidata/label2qid_1_percent_sample.json'))
etichette
get_ipython().run_line_magic('pinfo', 'pandas.Series')
serie = pandas.Series(etichette)
serie
serie.axes
serie = pandas.Series(etichette.keys())
serie
serie = pandas.Series(list(etichette.keys()))
serie
clean(serie)
clean(serie, replace_by_none=None, strip_accents='unicode')
clean(serie, replace_by_none=None, strip_accents='ascii')
clean(serie, replace_by_none=None, strip_accents='unicode')
discogs
discogs_df
from recordlinkage.preprocessing import phonetic
get_ipython().run_line_magic('pinfo', 'phonetic')
serie[66]
giappa = pandas.Series([serie[66]])
giappa
phonetic(giappa, 'soundex')
phonetic(giappa, 'metaphone')
phonetic(giappa, 'nysiis')
phonetic(giappa, 'match_rating')
get_ipython().run_line_magic('hist', '')
def last_name_pool(view: View, colname='last_name_pool'):
    return clean(view[colname])
pr = PR.lower()

ADD = pd.read_csv(INPUT_CSV_FILE, low_memory=False)

# drop entries with null values
ADD = ADD.dropna(subset=[NUMBER_COL, STREET_COL, CITY_COL])
ADD[NUMBER_COL] = ADD[NUMBER_COL].astype(str)
ADD = ADD.reset_index()
ADD = ADD.fillna('')

# remove line-breaking characters, if they exist
ADD[STREET_COL] = ADD[STREET_COL].str.replace('\n', '')

# perform basic cleaning
ADD[STREET_COL] = clean(ADD[STREET_COL])
ADD[CITY_COL] = clean(ADD[CITY_COL])
ADD[NUMBER_COL] = ADD[NUMBER_COL].str.replace('\n', '')

# abbreviate street types and directions
print('abbreviating types...')
if PR == 'QC':
    ADD = AddressClean_fr(ADD, STREET_COL, STREET_COL)
else:
    ADD = AddressClean_en(ADD, STREET_COL, STREET_COL)

# Drop all but the specified columns in the data frame
ADD = ADD[[LAT_COL, LONG_COL, NUMBER_COL, STREET_COL, CITY_COL,
           POSTAL_CODE_COL]]

N1 = len(ADD)
ADD['Duplicated'] = ADD.duplicated(subset=[NUMBER_COL, STREET_COL, CITY_COL],
                                   keep=False)
def first_name_pool(view: View, colname='first_name_pool'):
    return clean(view[colname])
def middle_name_pool(view: View, colname='middle_name_pool'):
    return clean(view[colname])
def collect_identical_rows_alg(self, schema_id, table_name, sorting_key,
                               fixed_column_names, var_column_names, alg):
    schema_name = 'schema-' + str(schema_id)
    dedup_table_name = '_dedup_' + table_name + "_grouped"

    # TODO When the user selects rows to remove, collect them in a table.
    # Afterwards, when finished selecting rows of all clusters, delete those
    # rows (UNDO).
    try:
        # Remove complete duplicates before the full dedup
        self.remove_identical_rows(schema_id, table_name)

        # SELECT id, 'column' FROM "schema_name"."table";
        data_query = 'SELECT * FROM {}.{}'.format(
            *_ci(schema_name, table_name))
        df = pd.read_sql(data_query, con=db.engine)
        df = df.set_index('id')

        # Clean dataset
        ## Remove leading whitespaces
        # df.columns = df.columns.to_series().apply(lambda x: x.strip())

        if sorting_key not in fixed_column_names:
            fixed_column_names.append(sorting_key)

        string_columns = list(df.select_dtypes(include=['object']).columns)
        numerical_columns = list(df.select_dtypes(include=['int64']).columns)
        numerical_columns.extend(
            list(df.select_dtypes(include=['float64']).columns))
        date_columns = list(
            df.select_dtypes(include=['datetime64[ns]']).columns)

        ## Clean string values
        for column_name in string_columns:
            df[column_name] = clean(df[column_name])

        # Indexation step
        indexer = recordlinkage.SortedNeighbourhoodIndex(on=sorting_key,
                                                         window=3)
        pairs = indexer.index(df)

        # Comparison step
        compare_cl = recordlinkage.Compare()

        ## Exact matches
        for column_name in fixed_column_names:
            compare_cl.exact(column_name, column_name, label=column_name)

        ## Variable matches calculated using an algorithm
        ## (levenshtein / numerical / date)
        for column_name in var_column_names:
            if column_name in numerical_columns:
                compare_cl.numeric(column_name, column_name, method='linear',
                                   offset=10, scale=10)
            elif column_name in date_columns:
                compare_cl.date(column_name, column_name)
            elif column_name in string_columns:
                compare_cl.string(column_name, column_name, method=alg,
                                  threshold=0.75, label=column_name)

        potential_pairs = compare_cl.compute(pairs, df)

        # Classification step
        kmeans = recordlinkage.KMeansClassifier()
        kmeans.learn(potential_pairs)
        matches = kmeans.predict(potential_pairs)

        if len(matches) == 0:
            return False

        # Grouping step
        ## Group matches (A,B), (B,C) into (A,B,C)
        groups = self.group_matches(matches)

        # TODO Create table _dedup_table_groups
        self.create_duplicate_table(schema_id, table_name, groups)
        return True
    except Exception as e:
        app.logger.error(
            "[ERROR] Unable to generate clusters of duplicate rows from "
            "table '{}'".format(dedup_table_name))
        app.logger.exception(e)
        raise e
import pandas as pd
import numpy as np

# record linkage and preprocessing
import recordlinkage as rl
from recordlinkage.preprocessing import clean, phonetic

# inventors = pd.read_csv('../Data/patentsview_inventors.csv')
assignees = pd.read_csv('../Data/patentsview_assignee_org_names.csv')
organizations = pd.read_csv('../Data/fedreporter_org_names_grants.csv')

# Do cleaning for assignees
assignees.head()
assignees['organizations_clean'] = clean(assignees['organization'])
assignees['organization'] = assignees['organizations_clean'].str.replace(
    ' ', '')
assignees["organization_phonetic"] = phonetic(assignees["organization"],
                                              method="nysiis")
assignees = assignees.loc[assignees.org_country == "US", :]

# Do cleaning for organizations
# (note: the FedReporter column names contain a leading space)
organizations[' ORGANIZATION_NAME_CLEAN'] = clean(
    organizations[' ORGANIZATION_NAME'])
organizations['organization'] = organizations[
    ' ORGANIZATION_NAME_CLEAN'].str.replace(' ', '')
organizations['organization_phonetic'] = phonetic(
    organizations["organization"], method="nysiis")
organizations = organizations.loc[
    organizations[' ORGANIZATION_COUNTRY'] == "UNITED STATES", :]

print(assignees.shape)
print(assignees.patent_id.drop_duplicates().shape)

# Explore the data
assignees.head()
organizations.head()
def preprocess(sfdf, repdf):
    print('enter PREPROCESS')
    global key_list, keys

    '''preprocessing'''
    sfdf.update(clean(sfdf.FirstName))
    sfdf.update(clean(sfdf.LastName))
    sfdf.update(clean(sfdf.Email))
    sfdf.update(clean(sfdf.State))
    sfdf.update(phonenumbers(sfdf.Zip))
    sfdf.update(clean(sfdf.City))
    sfdf.update(phonenumbers(sfdf.Phone))
    sfdf.update(clean(sfdf.CRD.astype(str)))

    repdf.update(clean(repdf.FirstName))
    repdf.update(clean(repdf.LastName))
    repdf.update(clean(repdf.Email))
    repdf.update(clean(repdf.State))
    repdf.update(phonenumbers(repdf.Zip))
    repdf.update(clean(repdf.City))
    repdf.update(phonenumbers(repdf.Phone))
    repdf.update(clean(repdf.CRD.astype(str)))

    '''key generating'''
    for df in [sfdf, repdf]:
        for key in keys:
            if len(key) > 1:
                key_col = ''.join(
                    [''.join(c for c in s if c.isupper()) for s in key])
                if key_col not in key_list:
                    key_list.append(key_col)
                df[key_col] = pd.Series(
                    np.add.reduce(df[key].astype(str), axis=1))
            else:
                if key[0] not in key_list:
                    key_list.append(key[0])
    print('exit PREPROCESS')
def test_clean_empty(self):
    """Test the cleaning of an empty Series."""
    # Check empty series
    pdt.assert_series_equal(clean(pd.Series()), pd.Series())
PATH_REACH17_DATASET = "C:/Users/cmcinerney/Desktop/UNOCHA Fellowship/Afghanistan_microdata/Case study datasets/Harmonised/reach_afg_dataset_mcna_aug2017_harmonised.xlsx"
PATH_SDC_DATASET = "C:/Users/cmcinerney/Desktop/UNOCHA Fellowship/Afghanistan_microdata/Case study datasets/Harmonised/sdc-afg-msna-microdata-harmonised.xlsx"

# PATH_REACH_DATASET is presumably defined earlier in the script (not shown).
df_reach18 = pd.read_excel(PATH_REACH_DATASET)
df_reach17 = pd.read_excel(PATH_REACH17_DATASET)
df_sdc = pd.read_excel(PATH_SDC_DATASET)

# =============================================================================
# Preprocessing the data to increase likelihood of finding linkages
# =============================================================================
# string variables to clean
df_reach18.province = clean(df_reach18.province,
                            lowercase=True,
                            replace_by_none='[^ \\-\\_A-Za-z0-9]+',
                            replace_by_whitespace='[\\-\\_]',
                            strip_accents=None,
                            remove_brackets=True,
                            encoding='utf-8',
                            decode_error='strict')
df_reach18.district = clean(df_reach18.district,
                            lowercase=True,
                            replace_by_none='[^ \\-\\_A-Za-z0-9]+',
                            replace_by_whitespace='[\\-\\_]',
                            strip_accents=None,
                            remove_brackets=True,
                            encoding='utf-8',
                            decode_error='strict')
# The original call was truncated here; completed with the same keyword
# arguments as the calls above, which all use an identical configuration.
df_reach18.origin_country = clean(df_reach18.origin_country,
                                  lowercase=True,
                                  replace_by_none='[^ \\-\\_A-Za-z0-9]+',
                                  replace_by_whitespace='[\\-\\_]',
                                  strip_accents=None,
                                  remove_brackets=True,
                                  encoding='utf-8',
                                  decode_error='strict')
def _expand_sex(df, col):
    df[col] = preprocessing.clean(df[col])
def preprocess(sfdf, repdf, keys):
    key_list = list()

    '''preprocessing'''
    sfdf.update(clean(sfdf.FirstName))
    sfdf.update(clean(sfdf.LastName))
    sfdf.update(clean(sfdf.Email))
    sfdf.update(clean(sfdf.MailingState))
    sfdf.update(phonenumbers(sfdf.MailingPostalCode))
    sfdf.update(clean(sfdf.MailingCity))
    sfdf.update(phonenumbers(sfdf.Phone))
    sfdf.update(clean(sfdf.CRD__c.astype(str)))

    repdf.update(clean(repdf.FirstName))
    repdf.update(clean(repdf.LastName))
    repdf.update(clean(repdf.Email))
    repdf.update(clean(repdf.MailingState))
    repdf.update(phonenumbers(repdf.MailingPostalCode))
    repdf.update(clean(repdf.MailingCity))
    repdf.update(phonenumbers(repdf.Phone))
    repdf.update(clean(repdf.CRD__c.astype(str)))

    '''key generating'''
    for df in [sfdf, repdf]:
        for key in keys:
            if len(key[:-1]) > 1:
                key_col = ''.join(
                    [''.join(c for c in s if c.isupper()) for s in key[:-1]])
                if key_col not in key_list:
                    key_list.append(key_col)
                df[key_col] = pd.Series(
                    np.add.reduce(df[key[:-1]].astype(str), axis=1))
            else:
                if key[0] not in key_list:
                    key_list.append(key[0])
    return sfdf, repdf, key_list
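# The shape of `keys` is inferred from the slicing above: each key is a list
# of column names with one trailing element that key[:-1] strips off (its
# role is not visible in this snippet). A hypothetical example:
#
#     keys = [['FirstName', 'LastName', 0.9], ['Email', 0.9]]
#
# would register a combined 'FNLN' key (the uppercase letters of the column
# names) built by concatenating FirstName and LastName, plus the
# single-column key 'Email'.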
def _expand_location(df, col):
    df[col] = preprocessing.clean(df[col])
df["Name"] = df["Name"].str.replace(r, ' ', regex=False) df["Address"] = df["Address"].str.replace(r, ' ', regex=False) #remove excess whitespace df["Name"] = df["Name"].str.replace(r" +", " ", regex=True) df["Address"] = df["Address"].str.replace(r" +", " ", regex=True) #standardise postal codes - just remove empty space and make sure it's all lower case df.loc[~df.PostalCode.isnull(), 'PostalCode'] = df.loc[~df.PostalCode.isnull(), 'PostalCode'].str.replace(' ', '').str.lower() #create an extra temporary Name column with an additional level of cleaning df['NameClean'] = clean(df["Name"]) #Some records have street number and street name, but no address field filled df.loc[(df.Address.isnull())&\ (~df.StreetName.isnull()),'Address']\ =clean(df.loc[(df.Address.isnull())&\ (~df.StreetName.isnull()),'StreetNumber']+' '+\ df.loc[(df.Address.isnull())&\ (~df.StreetName.isnull()),'StreetName']+' '+\ df.loc[(df.Address.isnull())&\ (~df.StreetName.isnull()),'City']) print( '1.b) Simple Deduplication - use Pandas to drop identical rows before additional processing' )
# PREPROCESSING SP500
len(sp500)
sp500.head()

fig_sp500 = plt.figure(figsize=(9, 4))
heatmap_sp500 = sns.heatmap(sp500.isnull(), cbar=False)
plt.title("SP500: Missing Values")
plt.savefig("MV-sp500.png", dpi=700, bbox_inches='tight')
plt.show()

sp500['Date first added'].isnull().sum()
sp500['GICS Sector'].unique()
sp500['GICS Sub Industry'].unique()

sp500_pre = sp500.copy()
sp500_pre['GICS Sector'] = clean(sp500_pre['GICS Sector'])
sp500_pre['GICS Sub Industry'] = clean(sp500_pre['GICS Sub Industry'])
sp500_pre['Security'] = clean(sp500_pre['Security'])

# count and strip common company suffixes from the security name
sp500_pre["Security"].str.count(r'\bcorp\b').sum()
sp500_pre['Security'] = sp500_pre["Security"].str.replace(r'\bcorp\b', '',
                                                          regex=True)
sp500_pre["Security"].str.count(r'\binc\b').sum()
sp500_pre['Security'] = sp500_pre["Security"].str.replace(r'\binc\b', '',
                                                          regex=True)
sp500_pre["Security"].str.count(r'\bco\b').sum()
sp500_pre['Security'] = sp500_pre["Security"].str.replace(r'\bco\b', '',
                                                          regex=True)
sp500_pre["Security"].str.count(r'\bltdp\b').sum()