def test_clean(self):

        values = pd.Series([
            'Mary-ann', 'Bob :)', 'Angel', 'Bob (alias Billy)', 'Mary ann',
            'John', np.nan
        ])

        expected = pd.Series(
            ['mary ann', 'bob', 'angel', 'bob', 'mary ann', 'john', np.nan])

        clean_series = clean(values)

        # Check if series are identical.
        pdt.assert_series_equal(clean_series, expected)

        clean_series_nothing = clean(
            values,
            lowercase=False,
            replace_by_none=False,
            replace_by_whitespace=False,
            strip_accents=False,
            remove_brackets=False)

        # Check if nothing happened.
        pdt.assert_series_equal(clean_series_nothing, values)
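For quick reference, a minimal self-contained sketch (with invented sample values) of what a default clean() call does: lowercasing, bracket removal, punctuation stripping, and hyphen/underscore replacement.

import numpy as np
import pandas as pd
from recordlinkage.preprocessing import clean

# Illustrative sample data, not taken from the test suite above.
names = pd.Series(['Mary-ann', 'Bob :)', 'Bob (alias Billy)', np.nan])

# Default settings: lowercase, strip bracketed text, map '-'/'_' to a space
# and drop remaining punctuation.
print(clean(names))
# Expected with the defaults: 'mary ann', 'bob', 'bob', NaN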
Example #2
def _expand_name(df, col):
    df[col] = preprocessing.clean(df[col])
    df[col + "---initial"] = df[col].str[0]
    df[col + "---soundex"] = preprocessing.phonetic(df[col], method="soundex")
    df[col + "---nysiis"] = preprocessing.phonetic(df[col], method="nysiis")
    df[col + "---metaphone"] = preprocessing.phonetic(df[col], method="metaphone")
    def test_clean_brackets(self):

        values = pd.Series([np.nan, 'bra(cke)ts', 'brackets with (brackets)'])
        expected = pd.Series([np.nan, 'brats', 'brackets with'])

        clean_series = clean(values, remove_brackets=True)

        # Check if series are identical.
        pdt.assert_series_equal(clean_series, expected)
    def test_clean_lower(self):

        values = pd.Series([np.nan, 'LowerHigher', 'HIGHERLOWER'])
        expected = pd.Series([np.nan, 'lowerhigher', 'higherlower'])

        clean_series = clean(values, lowercase=True)

        # Check if series are identical.
        pdt.assert_series_equal(clean_series, expected)
Example #5
def clean_raw(df: pd.DataFrame, exclude=['date', 'mpi']):
    def _is_excluded(col: str, exclude: list) -> bool:
        return True in [e in col for e in exclude]

    obj_columns = df.select_dtypes(include='object').columns
    temp = df.copy()
    for col in obj_columns:
        if not _is_excluded(col, exclude):
            temp[col] = clean(temp[col])
    return temp
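The same selective-cleaning idea written out as a self-contained sketch (column names and values invented): only object-typed columns whose names do not match the exclusion list are passed through clean().

import pandas as pd
from recordlinkage.preprocessing import clean

raw = pd.DataFrame({
    'name': ['Mary-Ann', 'BOB (Billy)'],
    'visit_date': ['2020-01-01', '2020-02-01'],  # skipped: name contains 'date'
    'age': [34, 51],                             # skipped: not an object column
})

exclude = ['date', 'mpi']
cleaned = raw.copy()
for col in raw.select_dtypes(include='object').columns:
    if not any(e in col for e in exclude):
        cleaned[col] = clean(cleaned[col])
print(cleaned)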
Example #6
def preproc_attributes(df: pd.DataFrame, names: List[str]) -> pd.DataFrame:
    for n in names:
        df[n + '_clean'] = preprocessing.clean(df[n])
        df[n + '_soundex'] = preprocessing.phonetic(df[n + '_clean'],
                                                    'soundex')
        df[n + '_nysiis'] = preprocessing.phonetic(df[n + '_clean'], 'nysiis')
        df[n + '_metaphone'] = preprocessing.phonetic(df[n + '_clean'],
                                                      'metaphone')
        df[n + '_match_rating'] = preprocessing.phonetic(
            df[n + '_clean'], 'match_rating')
    return df
def preprocess(df1, df2):

    # set index to the id column
    df1 = df1.set_index('id')
    df2 = df2.set_index('id')

    # replace empty cells with NaN
    df1 = df1.replace("", np.nan)
    df2 = df2.replace("", np.nan)

    # drop country, locality and region
    df1 = df1.drop(['country', 'locality', 'region'], axis=1)
    df2 = df2.drop(['country', 'locality', 'region'], axis=1)

    # remove all non-numbers from phone & convert to numeric
    df1.loc[:, 'phone'] = pd.to_numeric(phonenumbers(df1.loc[:, 'phone']))
    df2.loc[:, 'phone'] = pd.to_numeric(phonenumbers(df2.loc[:, 'phone']))

    # convert postal_code to numeric
    df1.loc[:, 'postal_code'] = pd.to_numeric(df1.loc[:, 'postal_code'])
    df2.loc[:, 'postal_code'] = pd.to_numeric(df2.loc[:, 'postal_code'])

    # clean street_address & website
    df1.loc[:, 'street_address'] = clean(df1.loc[:, 'street_address'])
    df1.loc[:, 'website'] = clean(df1.loc[:, 'website'])

    df2.loc[:, 'street_address'] = clean(df2.loc[:, 'street_address'])
    df2.loc[:, 'website'] = clean(df2.loc[:, 'website'])

    # convert NaNs to 0s for numerics
    df1.loc[:,
            ['latitude', 'longitude', 'phone', 'postal_code']] = df1.loc[:, [
                'latitude', 'longitude', 'phone', 'postal_code'
            ]].replace(np.nan, 0)

    df2.loc[:,
            ['latitude', 'longitude', 'phone', 'postal_code']] = df2.loc[:, [
                'latitude', 'longitude', 'phone', 'postal_code'
            ]].replace(np.nan, 0)

    return df1, df2
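A short sketch of the phone handling used above with invented numbers: phonenumbers() strips non-number characters so the result can be fed to pd.to_numeric.

import pandas as pd
from recordlinkage.preprocessing import phonenumbers

phones = pd.Series(['(02) 1234-5678', '0412 345 678'])
digits = phonenumbers(phones)        # e.g. '0212345678', '0412345678'
print(pd.to_numeric(digits))         # numeric dtype, ready for exact comparison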
    def test_clean_accent_stripping(self):

        values = pd.Series(['ősdfésdfë', 'without'])
        expected = pd.Series(['osdfesdfe', 'without'])

        values_unicode = pd.Series([u'ősdfésdfë', u'without'])
        expected_unicode = pd.Series([u'osdfesdfe', u'without'])

        # values_callable = pd.Series([u'ősdfésdfë', u'without'])
        # expected_callable = pd.Series([u'ősdfésdfë', u'without'])

        # # Callable.
        # pdt.assert_series_equal(
        #     clean(values_callable, strip_accents=lambda x: x),
        #     expected_callable)

        # Check if series are identical.
        pdt.assert_series_equal(
            clean(values, strip_accents='unicode'), expected)

        # Check if series are identical.
        pdt.assert_series_equal(clean(values, strip_accents='ascii'), expected)

        # Check if series are identical.
        pdt.assert_series_equal(
            clean(values_unicode, strip_accents='unicode'), expected_unicode)

        # Check if series are identical.
        pdt.assert_series_equal(
            clean(values_unicode, strip_accents='ascii'), expected_unicode)

        with pytest.raises(ValueError):
            clean(values, strip_accents='unknown_algorithm')
def _create_pseudo_id(df_a):

    #first letter of given name
    given_name_1 = pandas.Series(clean(
        df_a['given_name']).str[:1].dropna().apply(
            lambda x: alphabet_position(x)),
                                 name='given_name_1')
    # second letter of the given name
    given_name_2 = pandas.Series(clean(
        df_a['given_name']).str[1:2].dropna().apply(
            lambda x: alphabet_position(x)),
                                 name='given_name_2')
    # first letter of the surname
    surname_1 = pandas.Series(clean(df_a['surname']).str[:1].dropna().apply(
        lambda x: alphabet_position(x)),
                              name='surname_1')
    # second letter of the surname
    surname_2 = pandas.Series(clean(df_a['surname']).str[1:2].dropna().apply(
        lambda x: alphabet_position(x)),
                              name='surname_2')

    #sex
    surname_2 = pandas.Series(clean(df_a['surname']).str[1:2].dropna().apply(
        lambda x: alphabet_position(x)),
                              name='surname_2')
    # year of birth
    yearb = pandas.Series(df_a['YearB'].dropna().astype(str).replace(
        'nan', ''),
                          name='YearB')
    # day of birth
    dayb = pandas.Series(df_a['DayB'].dropna().astype(str).replace('nan', ''),
                         name='DayB')
    df_u = pandas.concat(
        [given_name_1, given_name_2, surname_1, surname_2, yearb, dayb],
        axis=1,
        sort=False)
    df_u = df_u.fillna('')

    return df_u.apply(lambda x: ''.join(x), axis=1)
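The pseudo-id above depends on an alphabet_position helper that is not shown. A minimal sketch of the idea with a hypothetical alphabet_position that maps a letter to its 1-based position in the alphabet (assumed behaviour, invented data):

import pandas as pd
from recordlinkage.preprocessing import clean

def alphabet_position(letter: str) -> str:
    # Hypothetical stand-in for the helper used above: 'a' -> '1', 'b' -> '2', ...
    return str(ord(letter.lower()) - ord('a') + 1) if letter.isalpha() else ''

df_a = pd.DataFrame({'given_name': ['Mary', 'Bob'], 'surname': ['Smith', 'Jones']})
given_name_1 = clean(df_a['given_name']).str[:1].apply(alphabet_position)
surname_1 = clean(df_a['surname']).str[:1].apply(alphabet_position)
print(given_name_1 + surname_1)   # e.g. '13' + '19' -> '1319' for 'mary smith'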
Example #10
def match_prep(df: pd.DataFrame, add_var: str) -> pd.DataFrame:
    """
    
    Prepare and clean a dataframe for address matching.
    

    Parameters
    ----------
    df : pd.DataFrame
        A dataframe to prepare.
    add_var : str
        The variable that contains the full address (minus postcode).

    Returns
    -------
    df : pd.DataFrame
        df with 'clean_address' and 'numbers' adding. Postcode column cleaned.

    """
    df = df.copy()
    df["clean_address"] = (
        clean(df[add_var], replace_by_whitespace="[\\_]")
        .str.replace(" +", " ")
        .str.replace("([^0-9])0+([0-9])", "\\1\\2")
        .str.replace("^0+", "")
        .str.strip()
    )

    df["numbers"] = (
        df.clean_address.str.replace("[a-z]{2,}", "")
        .str.replace(" +", " ")
        .str.strip()
        .str.split()
        .apply(sorted)
        .apply(lambda x: " ".join(x))
    )
    df.postcode = clean(df.postcode, replace_by_whitespace="[\\_]")

    return df
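A self-contained sketch of the address normalisation above on one invented address; regex=True is passed explicitly because recent pandas versions no longer treat str.replace patterns as regular expressions by default.

import pandas as pd
from recordlinkage.preprocessing import clean

addresses = pd.Series(['Flat_007, High Street'])
clean_address = (
    clean(addresses, replace_by_whitespace='[\\_]')
    .str.replace(' +', ' ', regex=True)                       # collapse repeated spaces
    .str.replace('([^0-9])0+([0-9])', r'\1\2', regex=True)    # drop zero-padding in numbers
    .str.replace('^0+', '', regex=True)
    .str.strip()
)
print(clean_address)   # expected: 'flat 7 high street'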
Example #11
    def test_clean_unicode(self):

        values = pd.Series([
            u'Mary-ann', u'Bob :)', u'Angel', u'Bob (alias Billy)',
            u'Mary  ann', u'John', np.nan
        ])

        expected = pd.Series([
            u'mary ann', u'bob', u'angel', u'bob', u'mary ann', u'john', np.nan
        ])

        clean_series = clean(values)

        # Check if series are identical.
        pdt.assert_series_equal(clean_series, expected)
Example #12
def clean_data(df):
    df = df.replace(r'^\s*$', np.nan, regex=True)
    df = df[df['customer_id'].notna()]
    df['phone_clean'] = phonenumbers(df['phone'])
    df['first_name_clean'] = clean(df['first_name'])
    df['last_name_clean'] = clean(df['last_name'])
    df['address_clean'] = clean(df['address'])
    df['city_clean'] = clean(df['city'])
    df['state_clean'] = clean(df['state'])
    df['zip_clean'] = clean(df['zip'].str.split('-').str[0])
    return df
Example #13
    def test_clean_parameters(self):

        values = pd.Series([
            u'Mary-ann', u'Bob :)', u'Angel', u'Bob (alias Billy)',
            u'Mary  ann', u'John', np.nan
        ])

        expected = pd.Series([
            u'mary ann', u'bob', u'angel', u'bob', u'mary ann', u'john', np.nan
        ])

        clean_series = clean(values,
                             lowercase=True,
                             replace_by_none=r'[^ \-\_A-Za-z0-9]+',
                             replace_by_whitespace=r'[\-\_]',
                             remove_brackets=True)

        # Check if series are identical.
        pdt.assert_series_equal(clean_series, expected)
Example #14
def run_phonetic_encoding(df_a, select_encoding):
    """ Calulate the phonetic encoding of the selected fields """

    logging.info("run phonetic encoding ....")
    df_a_processed = df_a.copy()

    #FIXME Errors when selecting non string columns like soc_sec_id
    #TODO Include double metaphone in Python Toolkit
    for field, encoding in select_encoding.items():
        if (encoding == 'double_metaphone'):
            df_a_processed[encoding + "_" + field] = df_a[field].apply(
                lambda x: doublemetaphone(str(x))[0]
                if (np.all(pd.notnull(x))) else x)
        else:
            df_a_processed[encoding + "_" + field] = phonetic(clean(
                df_a[field]),
                                                              method=encoding)

    cols = df_a_processed.columns.to_list()

    return df_a_processed, cols
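The double-metaphone branch above relies on a doublemetaphone function that the recordlinkage toolkit itself does not provide (see the TODO). A minimal sketch assuming it comes from the separately installed metaphone package:

import numpy as np
import pandas as pd
from metaphone import doublemetaphone  # assumed third-party dependency

names = pd.Series(['Smith', 'Schmidt', np.nan])
# Keep NaN as-is; otherwise take the primary code from the (code, alternate) tuple.
codes = names.apply(lambda x: doublemetaphone(str(x))[0] if pd.notnull(x) else x)
print(codes)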
Example #15
def get_features(df):
    """Clean and engineer new features for match comparison."""

    # remove non-alpha characters from names and generate phonetic codes:
    df['Original Name_cleaned'] = clean(df['Original Name'])
    df['o_name_phonetic_code'] = phonetic(df['Original Name_cleaned'],
                                          method="soundex")

    df['Age'] = df['Age'].astype('float')

    #df['add2_lat'] = df['add2_lat'].astype('float')
    #df['add2_long'] = df['add2_long'].astype('float')

    # remove non-numeric characters from phone numbers and save as strings for levenshtein comparison:
    df['Phone'] = df['Phone'].str.replace(r'[^0-9]+', '')
    df['Phone'] = df['Phone'].astype(str)

    # remove all characters that aren't part of a URL:
    df['social_media'] = df['social_media'].apply(lambda x: re.findall(
        'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        str(x))).sort_values().str[0]
    df['social_media'] = df['social_media'].fillna(np.nan)

    return df
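A small self-contained sketch (invented values) of the URL extraction step above: re.findall keeps only substrings matching the http(s) pattern, and .str[0] picks the first hit or NaN when there is none.

import re
import pandas as pd

url_pattern = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]'
               r'|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
social = pd.Series(['see https://example.com/profile for details', 'no link here'])
first_urls = social.apply(lambda x: re.findall(url_pattern, str(x))).str[0]
print(first_urls)   # the URL for the first row, NaN for the second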
get_ipython().run_line_magic('pinfo', 'pandas.Series')
from recordlinkage.preprocessing import clean

wikidata
etichette = json.load(
    open('/Users/focs/wikidata/label2qid_1_percent_sample.json'))
etichette
get_ipython().run_line_magic('pinfo', 'pandas.Series')
serie = pandas.Series(etichette)
serie
serie.axes
serie = pandas.Series(etichette.keys())
serie
serie = pandas.Series(list(etichette.keys()))
serie
clean(serie)
clean(serie, replace_by_none=None, strip_accents='unicode')
clean(serie, replace_by_none=None, strip_accents='ascii')
clean(serie, replace_by_none=None, strip_accents='unicode')
discogs
discogs_df
from recordlinkage.preprocessing import phonetic
get_ipython().run_line_magic('pinfo', 'phonetic')
serie[66]
giappa = pandas.Series([serie[66]])
giappa
phonetic(giappa, 'soundex')
phonetic(giappa, 'metaphone')
phonetic(giappa, 'nysiis')
phonetic(giappa, 'match_rating')
get_ipython().run_line_magic('hist', '')
Example #17
def last_name_pool(view: View, colname='last_name_pool'):
    return clean(view[colname])
Example #18
pr = PR.lower()

ADD = pd.read_csv(INPUT_CSV_FILE, low_memory=False)

# drop entries with null values
ADD = ADD.dropna(subset=[NUMBER_COL, STREET_COL, CITY_COL])

ADD[NUMBER_COL] = ADD[NUMBER_COL].astype(str)
ADD = ADD.reset_index()
ADD = ADD.fillna('')

# remove line breaking characters, if they exist
ADD[STREET_COL] = ADD[STREET_COL].str.replace('\n', '')

# perform basic cleaning
ADD[STREET_COL] = clean(ADD[STREET_COL])
ADD[CITY_COL] = clean(ADD[CITY_COL])
ADD[NUMBER_COL] = ADD[NUMBER_COL].str.replace('\n', '')

# abbreviate street types and directions
print('abbreviating types...')
if PR == 'QC':
    ADD = AddressClean_fr(ADD, STREET_COL, STREET_COL)
else:
    ADD = AddressClean_en(ADD, STREET_COL, STREET_COL)

# Drop all but the specified columns in the data frame
ADD = ADD[[LAT_COL, LONG_COL, NUMBER_COL, STREET_COL, CITY_COL, POSTAL_CODE_COL]]

N1 = len(ADD)
ADD['Duplicated'] = ADD.duplicated(subset=[NUMBER_COL, STREET_COL, CITY_COL], keep=False)
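A small sketch of the duplicate flagging above with invented rows: keep=False marks every member of a duplicated group, not only the later occurrences.

import pandas as pd

addr = pd.DataFrame({
    'number': ['12', '12', '7'],
    'street': ['main st', 'main st', 'king st'],
    'city': ['ottawa', 'ottawa', 'ottawa'],
})
addr['Duplicated'] = addr.duplicated(subset=['number', 'street', 'city'], keep=False)
print(addr)   # the first two rows are flagged True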
Example #19
def first_name_pool(view: View, colname='first_name_pool'):
    return clean(view[colname])
Example #20
def middle_name_pool(view: View, colname='middle_name_pool'):
    return clean(view[colname])
Example #21
    def collect_identical_rows_alg(self, schema_id, table_name, sorting_key,
                                   fixed_column_names, var_column_names, alg):

        schema_name = 'schema-' + str(schema_id)
        dedup_table_name = '_dedup_' + table_name + "_grouped"

        # TODO When user selects rows to remove, collect in table.
        # Afterwards when finished selecting rows of all clusters, delete those rows (UNDO)

        try:

            # Remove complete duplicates before full dedup
            self.remove_identical_rows(
                schema_id,
                table_name,
            )

            # SELECT id, 'column' FROM "schema_name"."table";
            data_query = 'SELECT * FROM {}.{}'.format(
                *_ci(schema_name, table_name))
            df = pd.read_sql(data_query, con=db.engine)
            df = df.set_index('id')

            # Clean dataset

            ## Remove leading whitespaces
            #df.columns = df.columns.to_series().apply(lambda x: x.strip())

            if sorting_key not in fixed_column_names:
                fixed_column_names.append(sorting_key)

            string_columns = list(df.select_dtypes(include=['object']).columns)
            numerical_columns = list(
                df.select_dtypes(include=['int64']).columns)
            numerical_columns.extend(
                list(df.select_dtypes(include=['float64']).columns))
            date_columns = list(
                df.select_dtypes(include=['datetime64[ns]']).columns)

            ## Clean string values
            for column_name in string_columns:
                df[column_name] = clean(df[column_name])

            # Indexation step
            indexer = recordlinkage.SortedNeighbourhoodIndex(on=sorting_key,
                                                             window=3)
            pairs = indexer.index(df)

            # Comparison step
            compare_cl = recordlinkage.Compare()

            ## Exact matches
            for column_name in fixed_column_names:
                compare_cl.exact(column_name, column_name, label=column_name)

            ## Variable matches calculated using an alg (levenshtein / numerical / date)
            for column_name in var_column_names:
                if column_name in numerical_columns:
                    compare_cl.numeric(column_name,
                                       column_name,
                                       method='linear',
                                       offset=10,
                                       scale=10)
                elif column_name in date_columns:
                    compare_cl.date(column_name, column_name)
                elif column_name in string_columns:
                    compare_cl.string(column_name,
                                      column_name,
                                      method=alg,
                                      threshold=0.75,
                                      label=column_name)

            potential_pairs = compare_cl.compute(pairs, df)

            # Classification step
            kmeans = recordlinkage.KMeansClassifier()
            kmeans.learn(potential_pairs)
            matches = kmeans.predict(potential_pairs)

            if len(matches) == 0:
                return False

            # Grouping step
            ## Group matches (A,B), (B,C) into (A,B,C)
            groups = self.group_matches(matches)

            #TODO Create table _dedup_table_groups
            self.create_duplicate_table(schema_id, table_name, groups)

            return True

        except Exception as e:
            app.logger.error(
                "[ERROR] Unable to generate clusters of duplicate rows from table '{}'"
                .format(dedup_table_name))
            app.logger.exception(e)
            raise e
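For orientation, a compact self-contained sketch of the same pipeline on a toy frame (invented data); a simple score threshold stands in for the KMeans classification step used above.

import pandas as pd
import recordlinkage
from recordlinkage.preprocessing import clean

df = pd.DataFrame({
    'name': ['Jonathan Smith', 'Jonathon Smith', 'Maria Garcia'],
    'city': ['Ottawa', 'Ottawa', 'Toronto'],
})
df['name'] = clean(df['name'])
df['city'] = clean(df['city'])

# Indexation: candidate pairs from a sorted neighbourhood on the name column.
indexer = recordlinkage.SortedNeighbourhoodIndex(on='name', window=3)
pairs = indexer.index(df)

# Comparison: one exact and one approximate string feature.
compare_cl = recordlinkage.Compare()
compare_cl.exact('city', 'city', label='city')
compare_cl.string('name', 'name', method='levenshtein', threshold=0.75, label='name')
features = compare_cl.compute(pairs, df)

# Simple decision rule instead of KMeans: both comparisons must agree.
matches = features[features.sum(axis=1) == 2]
print(matches.index.tolist())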
Example #22
import pandas as pd
import numpy as np
# record linkage and preprocessing
import recordlinkage as rl
from recordlinkage.preprocessing import clean, phonetic

#inventors = pd.read_csv('../Data/patentsview_inventors.csv')
assignees = pd.read_csv('../Data/patentsview_assignee_org_names.csv')
organizations = pd.read_csv('../Data/fedreporter_org_names_grants.csv')


#Do cleaning for assignee
assignees.head()
assignees['organizations_clean'] = clean(assignees['organization'])
assignees['organization'] = assignees['organizations_clean'].str.replace(' ','')
assignees["organization_phonetic"] = phonetic(assignees["organization"], method="nysiis")
assignees  = assignees.loc[assignees.org_country == "US",:]

#Do cleaning for organization
organizations[' ORGANIZATION_NAME_CLEAN'] = clean(organizations[' ORGANIZATION_NAME'])
organizations['organization'] = organizations[' ORGANIZATION_NAME_CLEAN'].str.replace(' ','')
organizations['organization_phonetic'] = phonetic(organizations["organization"], method="nysiis")
organizations  = organizations.loc[organizations[' ORGANIZATION_COUNTRY'] == "UNITED STATES",:]


print(assignees.shape)
print(assignees.patent_id.drop_duplicates().shape)

#Explore stuff
assignees.head()
organizations.head()
Example #23
    def collect_identical_rows_alg(self, schema_id, table_name, sorting_key, fixed_column_names, var_column_names, alg):

        schema_name = 'schema-' + str(schema_id)
        dedup_table_name = '_dedup_' + table_name + "_grouped"

        # TODO When user selects rows to remove, collect in table.
        # Afterwards when finished selecting rows of all clusters, delete those rows (UNDO)

        try:

            # Remove complete duplicates before full dedup
            self.remove_identical_rows(schema_id, table_name, )

            # SELECT id, 'column' FROM "schema_name"."table";
            data_query = 'SELECT * FROM {}.{}'.format(*_ci(schema_name, table_name))
            df = pd.read_sql(data_query, con=db.engine)
            df = df.set_index('id')

            # Clean dataset

            ## Remove leading whitespaces
            #df.columns = df.columns.to_series().apply(lambda x: x.strip())

            if sorting_key not in fixed_column_names:
                fixed_column_names.append(sorting_key)

            string_columns = list(df.select_dtypes(include=['object']).columns)
            numerical_columns = list(df.select_dtypes(include=['int64']).columns)
            numerical_columns.extend(list(df.select_dtypes(include=['float64']).columns))
            date_columns = list(df.select_dtypes(include=['datetime64[ns]']).columns)

            ## Clean string values
            for column_name in string_columns:
                df[column_name] = clean(df[column_name])

            # Indexation step
            indexer = recordlinkage.SortedNeighbourhoodIndex(on=sorting_key, window=3)
            pairs = indexer.index(df)

            # Comparison step
            compare_cl = recordlinkage.Compare()

            ## Exact matches
            for column_name in fixed_column_names:
                compare_cl.exact(column_name, column_name, label=column_name)

            ## Variable matches calculated using an alg (levenshtein / numerical / date)
            for column_name in var_column_names:
                if column_name in numerical_columns:
                    compare_cl.numeric(column_name, column_name, method='linear', offset=10, scale=10)
                elif column_name in date_columns:
                    compare_cl.date(column_name, column_name)
                elif column_name in string_columns:
                    compare_cl.string(column_name, column_name, method=alg, threshold=0.75, label=column_name)

            potential_pairs = compare_cl.compute(pairs, df)

            # Classification step
            kmeans = recordlinkage.KMeansClassifier()
            kmeans.learn(potential_pairs)
            matches = kmeans.predict(potential_pairs)

            if len(matches) == 0:
                return False

            # Grouping step
            ## Group matches (A,B), (B,C) into (A,B,C)
            groups = self.group_matches(matches)


            #TODO Create table _dedup_table_groups
            self.create_duplicate_table(schema_id, table_name, groups)

            return True

        except Exception as e:
            app.logger.error(
                "[ERROR] Unable to generate clusters of duplicate rows from table '{}'".format(dedup_table_name))
            app.logger.exception(e)
            raise e
Example #24
def preprocess(sfdf, repdf):
    print('enter PREPROCESS')

    global key_list, keys
    '''preprocessing'''

    sfdf.update(clean(sfdf.FirstName))
    sfdf.update(clean(sfdf.LastName))
    sfdf.update(clean(sfdf.Email))
    sfdf.update(clean(sfdf.State))
    sfdf.update(phonenumbers(sfdf.Zip))
    sfdf.update(clean(sfdf.City))
    sfdf.update(phonenumbers(sfdf.Phone))
    sfdf.update(clean(sfdf.CRD.astype(str)))

    repdf.update(clean(repdf.FirstName))
    repdf.update(clean(repdf.LastName))
    repdf.update(clean(repdf.Email))
    repdf.update(clean(repdf.State))
    repdf.update(phonenumbers(repdf.Zip))
    repdf.update(clean(repdf.City))
    repdf.update(phonenumbers(repdf.Phone))
    repdf.update(clean(repdf.CRD.astype(str)))

    '''key generating'''

    for df in [sfdf, repdf]:
        for key in keys:
            if len(key) > 1:
                key_col = ''.join([''.join(c for c in s if c.isupper()) for s in key])
                if key_col not in key_list:
                    key_list.append(key_col)
                df[key_col] = pd.Series(np.add.reduce(df[key].astype(str), axis=1))
            else:
                if key[0] not in key_list:
                    key_list.append(key[0])
    print('exit PREPROCESS')
Example #25
    def test_clean_empty(self):
        """ Test the cleaning of an empty Series"""

        # Check empty series
        pdt.assert_series_equal(clean(pd.Series()), pd.Series())
PATH_REACH17_DATASET = "C:/Users/cmcinerney/Desktop/UNOCHA Fellowship/Afghanistan_microdata/Case study datasets/Harmonised/reach_afg_dataset_mcna_aug2017_harmonised.xlsx"

PATH_SDC_DATASET = "C:/Users/cmcinerney/Desktop/UNOCHA Fellowship/Afghanistan_microdata/Case study datasets/Harmonised/sdc-afg-msna-microdata-harmonised.xlsx"

df_reach18 = pd.read_excel(PATH_REACH_DATASET)
df_reach17 = pd.read_excel(PATH_REACH17_DATASET)
df_sdc = pd.read_excel(PATH_SDC_DATASET)

# =============================================================================
# Preprocessing the data to increase likelihood of finding linkages
# =============================================================================
#string variables to clean
df_reach18.province = clean(df_reach18.province,
                            lowercase=True,
                            replace_by_none='[^ \\-\\_A-Za-z0-9]+',
                            replace_by_whitespace='[\\-\\_]',
                            strip_accents=None,
                            remove_brackets=True,
                            encoding='utf-8',
                            decode_error='strict')
df_reach18.district = clean(df_reach18.district,
                            lowercase=True,
                            replace_by_none='[^ \\-\\_A-Za-z0-9]+',
                            replace_by_whitespace='[\\-\\_]',
                            strip_accents=None,
                            remove_brackets=True,
                            encoding='utf-8',
                            decode_error='strict')
df_reach18.origin_country = clean(df_reach18.origin_country,
                                  lowercase=True,
                                  replace_by_none='[^ \\-\\_A-Za-z0-9]+',
                                  replace_by_whitespace='[\\-\\_]',
                                  strip_accents=None,
                                  remove_brackets=True,
                                  encoding='utf-8',
                                  decode_error='strict')
Example #27
def _expand_sex(df, col):
    df[col] = preprocessing.clean(df[col])
Example #28
def preprocess(sfdf, repdf, keys):
    key_list = list()
    '''preprocessing'''
    sfdf.update(clean(sfdf.FirstName))
    sfdf.update(clean(sfdf.LastName))
    sfdf.update(clean(sfdf.Email))
    sfdf.update(clean(sfdf.MailingState))
    sfdf.update(phonenumbers(sfdf.MailingPostalCode))
    sfdf.update(clean(sfdf.MailingCity))
    sfdf.update(phonenumbers(sfdf.Phone))
    sfdf.update(clean(sfdf.CRD__c.astype(str)))

    repdf.update(clean(repdf.FirstName))
    repdf.update(clean(repdf.LastName))
    repdf.update(clean(repdf.Email))
    repdf.update(clean(repdf.MailingState))
    repdf.update(phonenumbers(repdf.MailingPostalCode))
    repdf.update(clean(repdf.MailingCity))
    repdf.update(phonenumbers(repdf.Phone))
    repdf.update(clean(repdf.CRD__c.astype(str)))
    '''key generating'''
    for df in [sfdf, repdf]:
        for key in keys:
            if len(key[:-1]) > 1:
                key_col = ''.join(
                    [''.join(c for c in s if c.isupper()) for s in key[:-1]])
                if key_col not in key_list:
                    key_list.append(key_col)
                df[key_col] = pd.Series(
                    np.add.reduce(df[key[:-1]].astype(str), axis=1))
            else:
                if key[0] not in key_list:
                    key_list.append(key[0])
    return sfdf, repdf, key_list
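The key generation above concatenates several string columns row-wise with np.add.reduce to form a blocking key; a self-contained sketch with invented columns:

import numpy as np
import pandas as pd

df = pd.DataFrame({'FirstName': ['mary', 'bob'], 'LastName': ['smith', 'jones']})
key_cols = ['FirstName', 'LastName']
# Row-wise string concatenation; .to_numpy() gives np.add.reduce a plain object array.
df['FL'] = np.add.reduce(df[key_cols].astype(str).to_numpy(), axis=1)
print(df['FL'])   # 'marysmith', 'bobjones'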
Example #29
def _expand_location(df, col):
    df[col] = preprocessing.clean(df[col])
Example #30
    df["Name"] = df["Name"].str.replace(r, ' ', regex=False)
    df["Address"] = df["Address"].str.replace(r, ' ', regex=False)

#remove excess whitespace
df["Name"] = df["Name"].str.replace(r" +", " ", regex=True)
df["Address"] = df["Address"].str.replace(r" +", " ", regex=True)

#standardise postal codes - just remove empty space and make sure it's all lower case

df.loc[~df.PostalCode.isnull(),
       'PostalCode'] = df.loc[~df.PostalCode.isnull(),
                              'PostalCode'].str.replace(' ', '').str.lower()

#create an extra temporary Name column with an additional level of cleaning

df['NameClean'] = clean(df["Name"])

#Some records have street number and street name, but no address field filled

df.loc[(df.Address.isnull())&\
       (~df.StreetName.isnull()),'Address']\
    =clean(df.loc[(df.Address.isnull())&\
       (~df.StreetName.isnull()),'StreetNumber']+' '+\
           df.loc[(df.Address.isnull())&\
       (~df.StreetName.isnull()),'StreetName']+' '+\
        df.loc[(df.Address.isnull())&\
       (~df.StreetName.isnull()),'City'])

print(
    '1.b) Simple Deduplication - use Pandas to drop identical rows before additional processing'
)
Example #31
    # PREPROCESSING SP500
    len(sp500)
    sp500.head()
    fig_sp500 = plt.figure(figsize=(9, 4))
    heatmap_sp500 = sns.heatmap(sp500.isnull(), cbar=False)
    plt.title("SP500: Missing Values ")
    plt.savefig("MV-sp500.png", dpi=700, bbox_inches='tight')
    plt.show()
    sp500['Date first added'].isnull().sum()

    sp500['GICS Sector'].unique()
    sp500['GICS Sub Industry'].unique()

    sp500_pre = sp500.copy()
    sp500_pre['GICS Sector'] = clean(sp500_pre['GICS Sector'])
    sp500_pre['GICS Sub Industry'] = clean(sp500_pre['GICS Sub Industry'])
    sp500_pre['Security'] = clean(sp500_pre['Security'])
    sp500_pre["Security"].str.count(r'\bcorp\b').sum()
    sp500_pre['Security'] = sp500_pre["Security"].str.replace(r'\bcorp\b',
                                                              '',
                                                              regex=True)
    sp500_pre["Security"].str.count(r'\binc\b').sum()
    sp500_pre['Security'] = sp500_pre["Security"].str.replace(r'\binc\b',
                                                              '',
                                                              regex=True)
    sp500_pre["Security"].str.count(r'\bco\b').sum()
    sp500_pre['Security'] = sp500_pre["Security"].str.replace(r'\bco\b',
                                                              '',
                                                              regex=True)
    sp500_pre["Security"].str.count(r'\bltdp\b').sum()