def test_clean(self):

    values = pd.Series([
        'Mary-ann', 'Bob :)', 'Angel', 'Bob (alias Billy)', 'Mary ann',
        'John', np.nan
    ])
    expected = pd.Series(
        ['mary ann', 'bob', 'angel', 'bob', 'mary ann', 'john', np.nan])

    clean_series = clean(values)

    # Check if series are identical.
    pdt.assert_series_equal(clean_series, expected)

    clean_series_nothing = clean(
        values,
        lowercase=False,
        replace_by_none=False,
        replace_by_whitespace=False,
        strip_accents=False,
        remove_brackets=False)

    # Check if nothing happened.
    pdt.assert_series_equal(clean_series_nothing, values)
def _expand_name(df, col):
    df[col] = preprocessing.clean(df[col])
    df[col + "---initial"] = df[col].str[0]
    df[col + "---soundex"] = preprocessing.phonetic(df[col], method="soundex")
    df[col + "---nysiis"] = preprocessing.phonetic(df[col], method="nysiis")
    df[col + "---metaphone"] = preprocessing.phonetic(df[col],
                                                      method="metaphone")
def test_clean_brackets(self):
    values = pd.Series([np.nan, 'bra(cke)ts', 'brackets with (brackets)'])
    expected = pd.Series([np.nan, 'brats', 'brackets with'])

    clean_series = clean(values, remove_brackets=True)

    # Check if series are identical.
    pdt.assert_series_equal(clean_series, expected)
def test_clean_lower(self):
    values = pd.Series([np.nan, 'LowerHigher', 'HIGHERLOWER'])
    expected = pd.Series([np.nan, 'lowerhigher', 'higherlower'])

    clean_series = clean(values, lowercase=True)

    # Check if series are identical.
    pdt.assert_series_equal(clean_series, expected)
def clean_raw(df: pd.DataFrame, exclude=['date', 'mpi']):

    def _is_excluded(col: str, exclude: list) -> bool:
        return any(e in col for e in exclude)

    # Apply clean() to every object (string) column whose name does not
    # contain one of the excluded substrings.
    obj_columns = df.select_dtypes(include='object').columns
    temp = df.copy()
    for col in obj_columns:
        if not _is_excluded(col, exclude):
            temp[col] = clean(temp[col])
    return temp
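# A minimal usage sketch for clean_raw above; the DataFrame and column names
# are invented for illustration, and `clean` is assumed to be
# recordlinkage.preprocessing.clean as in the rest of these snippets.
example = pd.DataFrame({
    'name': ['Mary-Ann', 'Bob (alias Billy)'],
    'visit_date': ['2020-01-01', '2020-02-02'],  # skipped: name contains 'date'
    'mpi_code': ['A1', 'B2'],                    # skipped: name contains 'mpi'
})
cleaned = clean_raw(example)
# 'name' becomes ['mary ann', 'bob']; the excluded columns pass through
# unchanged.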
from typing import List


def preproc_attributes(df: pd.DataFrame, names: List[str]) -> pd.DataFrame:
    for n in names:
        df[n + '_clean'] = preprocessing.clean(df[n])
        df[n + '_soundex'] = preprocessing.phonetic(df[n + '_clean'],
                                                    'soundex')
        df[n + '_nysiis'] = preprocessing.phonetic(df[n + '_clean'], 'nysiis')
        df[n + '_metaphone'] = preprocessing.phonetic(df[n + '_clean'],
                                                      'metaphone')
        df[n + '_match_rating'] = preprocessing.phonetic(
            df[n + '_clean'], 'match_rating')
    return df
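# Hypothetical call for preproc_attributes; the column names and values are
# made up. Soundex maps similar-sounding surnames to the same code, e.g.
# 'smith' and 'smyth' both encode to S530.
df = pd.DataFrame({'given_name': ['Jonathan', 'Jon'],
                   'surname': ['Smith', 'Smyth']})
df = preproc_attributes(df, ['given_name', 'surname'])
# df now also holds given_name_clean, given_name_soundex, ...,
# surname_match_rating columns for blocking and comparison.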
def preprocess(df1, df2):
    # set index to the id column
    df1 = df1.set_index('id')
    df2 = df2.set_index('id')

    # replace empty cells with NaN
    df1 = df1.replace("", np.nan)
    df2 = df2.replace("", np.nan)

    # drop country, locality and region
    df1 = df1.drop(['country', 'locality', 'region'], axis=1)
    df2 = df2.drop(['country', 'locality', 'region'], axis=1)

    # remove all non-numbers from phone & convert to numeric
    df1.loc[:, 'phone'] = pd.to_numeric(phonenumbers(df1.loc[:, 'phone']))
    df2.loc[:, 'phone'] = pd.to_numeric(phonenumbers(df2.loc[:, 'phone']))

    # convert postal_code to numeric
    df1.loc[:, 'postal_code'] = pd.to_numeric(df1.loc[:, 'postal_code'])
    df2.loc[:, 'postal_code'] = pd.to_numeric(df2.loc[:, 'postal_code'])

    # clean street_address & website
    df1.loc[:, 'street_address'] = clean(df1.loc[:, 'street_address'])
    df1.loc[:, 'website'] = clean(df1.loc[:, 'website'])
    df2.loc[:, 'street_address'] = clean(df2.loc[:, 'street_address'])
    df2.loc[:, 'website'] = clean(df2.loc[:, 'website'])

    # convert NaNs to 0s for numeric columns
    num_cols = ['latitude', 'longitude', 'phone', 'postal_code']
    df1.loc[:, num_cols] = df1.loc[:, num_cols].replace(np.nan, 0)
    df2.loc[:, num_cols] = df2.loc[:, num_cols].replace(np.nan, 0)

    return df1, df2
def test_clean_accent_stripping(self):

    values = pd.Series(['ősdfésdfë', 'without'])
    expected = pd.Series(['osdfesdfe', 'without'])

    values_unicode = pd.Series([u'ősdfésdfë', u'without'])
    expected_unicode = pd.Series([u'osdfesdfe', u'without'])

    # values_callable = pd.Series([u'ősdfésdfë', u'without'])
    # expected_callable = pd.Series([u'ősdfésdfë', u'without'])

    # # Callable.
    # pdt.assert_series_equal(
    #     clean(values_callable, strip_accents=lambda x: x),
    #     expected_callable)

    # Check if series are identical.
    pdt.assert_series_equal(
        clean(values, strip_accents='unicode'), expected)

    # Check if series are identical.
    pdt.assert_series_equal(clean(values, strip_accents='ascii'), expected)

    # Check if series are identical.
    pdt.assert_series_equal(
        clean(values_unicode, strip_accents='unicode'), expected_unicode)

    # Check if series are identical.
    pdt.assert_series_equal(
        clean(values_unicode, strip_accents='ascii'), expected_unicode)

    with pytest.raises(ValueError):
        clean(values, strip_accents='unknown_algorithm')
def _create_pseudo_id(df_a):
    # first letter of the given name
    given_name_1 = pandas.Series(clean(
        df_a['given_name']).str[:1].dropna().apply(
            lambda x: alphabet_position(x)),
        name='given_name_1')

    # second letter of the given name
    given_name_2 = pandas.Series(clean(
        df_a['given_name']).str[1:2].dropna().apply(
            lambda x: alphabet_position(x)),
        name='given_name_2')

    # first letter of the surname
    surname_1 = pandas.Series(clean(df_a['surname']).str[:1].dropna().apply(
        lambda x: alphabet_position(x)),
        name='surname_1')

    # second letter of the surname
    surname_2 = pandas.Series(clean(df_a['surname']).str[1:2].dropna().apply(
        lambda x: alphabet_position(x)),
        name='surname_2')

    # year of birth
    yearb = pandas.Series(df_a['YearB'].dropna().astype(str).replace(
        'nan', ''), name='YearB')

    # day of birth
    dayb = pandas.Series(df_a['DayB'].dropna().astype(str).replace('nan', ''),
                         name='DayB')

    df_u = pandas.concat(
        [given_name_1, given_name_2, surname_1, surname_2, yearb, dayb],
        axis=1,
        sort=False)
    df_u = df_u.fillna('')

    return df_u.apply(lambda x: ''.join(x), axis=1)
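# alphabet_position is not defined in this snippet. A minimal sketch that is
# consistent with its use above (encoding one cleaned, lowercased letter as
# its 1-based position in the alphabet, as a string) might look like this;
# the original helper may differ.
def alphabet_position(letter):
    if letter and letter.isalpha():
        return str(ord(letter.lower()) - ord('a') + 1)
    return ''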
def match_prep(df: pd.DataFrame, add_var: str) -> pd.DataFrame:
    """
    Prepare and clean a dataframe for address matching.

    Parameters
    ----------
    df : pd.DataFrame
        A dataframe to prepare.
    add_var : str
        The variable that contains the full address (minus postcode).

    Returns
    -------
    df : pd.DataFrame
        df with 'clean_address' and 'numbers' columns added. Postcode column
        cleaned.
    """
    df = df.copy()

    # Clean the address, collapse whitespace and strip leading zeros from
    # house numbers.
    df["clean_address"] = (
        clean(df[add_var], replace_by_whitespace="[\\_]")
        .str.replace(" +", " ", regex=True)
        .str.replace("([^0-9])0+([0-9])", "\\1\\2", regex=True)
        .str.replace("^0+", "", regex=True)
        .str.strip()
    )

    # Keep only the numeric tokens of the address, sorted for a stable
    # comparison key.
    df["numbers"] = (
        df.clean_address.str.replace("[a-z]{2,}", "", regex=True)
        .str.replace(" +", " ", regex=True)
        .str.strip()
        .str.split()
        .apply(sorted)
        .apply(lambda x: " ".join(x))
    )

    df.postcode = clean(df.postcode, replace_by_whitespace="[\\_]")

    return df
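# A small illustration of match_prep on made-up data; the 'address' column
# name and the presence of a 'postcode' column are assumptions about the
# expected input shape.
addresses = pd.DataFrame({
    'address': ['Flat 02, 10 High_Street', '10 high street flat 2'],
    'postcode': ['AB1_2CD', 'ab1 2cd'],
})
prepped = match_prep(addresses, 'address')
# Both rows end up with numbers == '10 2' (tokens sorted as strings), so the
# leading zero and the underscore no longer block a match.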
def test_clean_unicode(self):

    values = pd.Series([
        u'Mary-ann', u'Bob :)', u'Angel', u'Bob (alias Billy)', u'Mary ann',
        u'John', np.nan
    ])
    expected = pd.Series([
        u'mary ann', u'bob', u'angel', u'bob', u'mary ann', u'john', np.nan
    ])

    clean_series = clean(values)

    # Check if series are identical.
    pdt.assert_series_equal(clean_series, expected)
def clean_data(df):
    # Treat whitespace-only cells as missing and drop rows without an id.
    df = df.replace(r'^\s*$', np.nan, regex=True)
    df = df[df['customer_id'].notna()]

    df['phone_clean'] = phonenumbers(df['phone'])
    df['first_name_clean'] = clean(df['first_name'])
    df['last_name_clean'] = clean(df['last_name'])
    df['address_clean'] = clean(df['address'])
    df['city_clean'] = clean(df['city'])
    df['state_clean'] = clean(df['state'])
    # Keep only the part of the ZIP before the hyphen (drop the +4 extension).
    df['zip_clean'] = clean(df['zip'].str.split('-').str[0])
    return df
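# Sketch of the input clean_data expects; the column names come from the
# function body, the values are invented.
customers = pd.DataFrame({
    'customer_id': ['c1', '  '],
    'phone': ['(555) 123-4567', ''],
    'first_name': ['Mary-Ann', ''],
    'last_name': ["O'Brien", ''],
    'address': ['12 Main St.', ''],
    'city': ['Springfield', ''],
    'state': ['IL', ''],
    'zip': ['62701-1234', ''],
})
cleaned = clean_data(customers)
# The second row is dropped (its blank customer_id becomes NaN) and
# zip_clean keeps only '62701'.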
def test_clean_parameters(self):

    values = pd.Series([
        u'Mary-ann', u'Bob :)', u'Angel', u'Bob (alias Billy)', u'Mary ann',
        u'John', np.nan
    ])
    expected = pd.Series([
        u'mary ann', u'bob', u'angel', u'bob', u'mary ann', u'john', np.nan
    ])

    clean_series = clean(
        values,
        lowercase=True,
        replace_by_none=r'[^ \-\_A-Za-z0-9]+',
        replace_by_whitespace=r'[\-\_]',
        remove_brackets=True)

    # Check if series are identical.
    pdt.assert_series_equal(clean_series, expected)
def run_phonetic_encoding(df_a, select_encoding):
    """ Calculate the phonetic encoding of the selected fields """
    logging.info("run phonetic encoding ....")
    df_a_processed = df_a.copy()

    # FIXME Errors when selecting non-string columns like soc_sec_id
    # TODO Include double metaphone in Python Toolkit
    for field, encoding in select_encoding.items():
        if encoding == 'double_metaphone':
            df_a_processed[encoding + "_" + field] = df_a[field].apply(
                lambda x: doublemetaphone(str(x))[0]
                if np.all(pd.notnull(x)) else x)
        else:
            df_a_processed[encoding + "_" + field] = phonetic(
                clean(df_a[field]), method=encoding)

    cols = df_a_processed.columns.to_list()
    return df_a_processed, cols
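# Illustrative call for run_phonetic_encoding; the shape of select_encoding
# (field name -> encoding method) is inferred from the loop above, and the
# data is invented.
df = pd.DataFrame({'surname': ['Smith', 'Smyth']})
processed, cols = run_phonetic_encoding(df, {'surname': 'soundex'})
# processed gains a 'soundex_surname' column holding the same Soundex code
# for both rows.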
def get_features(df):
    """Clean and engineer new features for match comparison."""
    # remove non-alpha characters from names and generate phonetic codes:
    df['Original Name_cleaned'] = clean(df['Original Name'])
    df['o_name_phonetic_code'] = phonetic(df['Original Name_cleaned'],
                                          method="soundex")
    df['Age'] = df['Age'].astype('float')
    # df['add2_lat'] = df['add2_lat'].astype('float')
    # df['add2_long'] = df['add2_long'].astype('float')

    # remove non-numeric characters from phone numbers and save as strings
    # for Levenshtein comparison:
    df['Phone'] = df['Phone'].str.replace(r'[^0-9]+', '', regex=True)
    df['Phone'] = df['Phone'].astype(str)

    # remove all characters that aren't part of a URL:
    df['social_media'] = df['social_media'].apply(lambda x: re.findall(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        str(x))).sort_values().str[0]
    df['social_media'] = df['social_media'].fillna(np.nan)
    return df
get_ipython().run_line_magic('pinfo', 'pandas.Series')
from recordlinkage.preprocessing import clean
wikidata
etichette = json.load(
    open('/Users/focs/wikidata/label2qid_1_percent_sample.json'))
etichette
get_ipython().run_line_magic('pinfo', 'pandas.Series')
serie = pandas.Series(etichette)
serie
serie.axes
serie = pandas.Series(etichette.keys())
serie
serie = pandas.Series(list(etichette.keys()))
serie
clean(serie)
clean(serie, replace_by_none=None, strip_accents='unicode')
clean(serie, replace_by_none=None, strip_accents='ascii')
clean(serie, replace_by_none=None, strip_accents='unicode')
discogs
discogs_df
from recordlinkage.preprocessing import phonetic
get_ipython().run_line_magic('pinfo', 'phonetic')
serie[66]
giappa = pandas.Series([serie[66]])
giappa
phonetic(giappa, 'soundex')
phonetic(giappa, 'metaphone')
phonetic(giappa, 'nysiis')
phonetic(giappa, 'match_rating')
get_ipython().run_line_magic('hist', '')
def last_name_pool(view: View, colname='last_name_pool'):
    return clean(view[colname])
pr = PR.lower()

ADD = pd.read_csv(INPUT_CSV_FILE, low_memory=False)

# drop entries with null values
ADD = ADD.dropna(subset=[NUMBER_COL, STREET_COL, CITY_COL])
ADD[NUMBER_COL] = ADD[NUMBER_COL].astype(str)
ADD = ADD.reset_index()
ADD = ADD.fillna('')

# remove line-breaking characters, if they exist
ADD[STREET_COL] = ADD[STREET_COL].str.replace('\n', '')

# perform basic cleaning
ADD[STREET_COL] = clean(ADD[STREET_COL])
ADD[CITY_COL] = clean(ADD[CITY_COL])
ADD[NUMBER_COL] = ADD[NUMBER_COL].str.replace('\n', '')

# abbreviate street types and directions
print('abbreviating types...')
if PR == 'QC':
    ADD = AddressClean_fr(ADD, STREET_COL, STREET_COL)
else:
    ADD = AddressClean_en(ADD, STREET_COL, STREET_COL)

# Drop all but the specified columns in the data frame
ADD = ADD[[LAT_COL, LONG_COL, NUMBER_COL, STREET_COL, CITY_COL,
           POSTAL_CODE_COL]]

N1 = len(ADD)
ADD['Duplicated'] = ADD.duplicated(subset=[NUMBER_COL, STREET_COL, CITY_COL],
                                   keep=False)
def first_name_pool(view: View, colname='first_name_pool'):
    return clean(view[colname])
def middle_name_pool(view: View, colname='middle_name_pool'):
    return clean(view[colname])
def collect_identical_rows_alg(self, schema_id, table_name, sorting_key,
                               fixed_column_names, var_column_names, alg):
    schema_name = 'schema-' + str(schema_id)
    dedup_table_name = '_dedup_' + table_name + "_grouped"

    # TODO When the user selects rows to remove, collect them in a table.
    # Afterwards, when finished selecting rows of all clusters, delete those
    # rows (UNDO).
    try:
        # Remove complete duplicates before the full dedup
        self.remove_identical_rows(schema_id, table_name)

        # SELECT id, 'column' FROM "schema_name"."table";
        data_query = 'SELECT * FROM {}.{}'.format(
            *_ci(schema_name, table_name))
        df = pd.read_sql(data_query, con=db.engine)
        df = df.set_index('id')

        # Clean dataset
        ## Remove leading whitespaces
        # df.columns = df.columns.to_series().apply(lambda x: x.strip())

        if sorting_key not in fixed_column_names:
            fixed_column_names.append(sorting_key)

        string_columns = list(df.select_dtypes(include=['object']).columns)
        numerical_columns = list(df.select_dtypes(include=['int64']).columns)
        numerical_columns.extend(
            list(df.select_dtypes(include=['float64']).columns))
        date_columns = list(
            df.select_dtypes(include=['datetime64[ns]']).columns)

        ## Clean string values
        for column_name in string_columns:
            df[column_name] = clean(df[column_name])

        # Indexation step
        indexer = recordlinkage.SortedNeighbourhoodIndex(on=sorting_key,
                                                         window=3)
        pairs = indexer.index(df)

        # Comparison step
        compare_cl = recordlinkage.Compare()

        ## Exact matches
        for column_name in fixed_column_names:
            compare_cl.exact(column_name, column_name, label=column_name)

        ## Variable matches calculated using an algorithm
        ## (levenshtein / numerical / date)
        for column_name in var_column_names:
            if column_name in numerical_columns:
                compare_cl.numeric(column_name, column_name, method='linear',
                                   offset=10, scale=10)
            elif column_name in date_columns:
                compare_cl.date(column_name, column_name)
            elif column_name in string_columns:
                compare_cl.string(column_name, column_name, method=alg,
                                  threshold=0.75, label=column_name)

        potential_pairs = compare_cl.compute(pairs, df)

        # Classification step
        kmeans = recordlinkage.KMeansClassifier()
        kmeans.learn(potential_pairs)
        matches = kmeans.predict(potential_pairs)

        if len(matches) == 0:
            return False

        # Grouping step
        ## Group matches (A,B), (B,C) into (A,B,C)
        groups = self.group_matches(matches)

        # TODO Create table _dedup_table_groups
        self.create_duplicate_table(schema_id, table_name, groups)
        return True
    except Exception as e:
        app.logger.error(
            "[ERROR] Unable to generate clusters of duplicate rows from "
            "table '{}'".format(dedup_table_name))
        app.logger.exception(e)
        raise e
import pandas as pd
import numpy as np

# record linkage and preprocessing
import recordlinkage as rl
from recordlinkage.preprocessing import clean, phonetic

# inventors = pd.read_csv('../Data/patentsview_inventors.csv')
assignees = pd.read_csv('../Data/patentsview_assignee_org_names.csv')
organizations = pd.read_csv('../Data/fedreporter_org_names_grants.csv')

# Do cleaning for assignees
assignees.head()
assignees['organizations_clean'] = clean(assignees['organization'])
assignees['organization'] = assignees['organizations_clean'].str.replace(
    ' ', '')
assignees["organization_phonetic"] = phonetic(assignees["organization"],
                                              method="nysiis")
assignees = assignees.loc[assignees.org_country == "US", :]

# Do cleaning for organizations
# (note: the FedReporter column names contain a leading space)
organizations[' ORGANIZATION_NAME_CLEAN'] = clean(
    organizations[' ORGANIZATION_NAME'])
organizations['organization'] = organizations[
    ' ORGANIZATION_NAME_CLEAN'].str.replace(' ', '')
organizations['organization_phonetic'] = phonetic(
    organizations["organization"], method="nysiis")
organizations = organizations.loc[
    organizations[' ORGANIZATION_COUNTRY'] == "UNITED STATES", :]

print(assignees.shape)
print(assignees.patent_id.drop_duplicates().shape)

# Explore the data
assignees.head()
organizations.head()
def preprocess(sfdf, repdf):
    print('enter PREPROCESS')
    global key_list, keys

    '''preprocessing'''
    sfdf.update(clean(sfdf.FirstName))
    sfdf.update(clean(sfdf.LastName))
    sfdf.update(clean(sfdf.Email))
    sfdf.update(clean(sfdf.State))
    sfdf.update(phonenumbers(sfdf.Zip))
    sfdf.update(clean(sfdf.City))
    sfdf.update(phonenumbers(sfdf.Phone))
    sfdf.update(clean(sfdf.CRD.astype(str)))

    repdf.update(clean(repdf.FirstName))
    repdf.update(clean(repdf.LastName))
    repdf.update(clean(repdf.Email))
    repdf.update(clean(repdf.State))
    repdf.update(phonenumbers(repdf.Zip))
    repdf.update(clean(repdf.City))
    repdf.update(phonenumbers(repdf.Phone))
    repdf.update(clean(repdf.CRD.astype(str)))

    '''key generating'''
    for df in [sfdf, repdf]:
        for key in keys:
            if len(key) > 1:
                key_col = ''.join(
                    [''.join(c for c in s if c.isupper()) for s in key])
                if key_col not in key_list:
                    key_list.append(key_col)
                df[key_col] = pd.Series(
                    np.add.reduce(df[key].astype(str), axis=1))
            else:
                if key[0] not in key_list:
                    key_list.append(key[0])
    print('exit PREPROCESS')
def test_clean_empty(self):
    """Test the cleaning of an empty Series."""
    # Check empty series
    pdt.assert_series_equal(clean(pd.Series()), pd.Series())
PATH_REACH17_DATASET = "C:/Users/cmcinerney/Desktop/UNOCHA Fellowship/Afghanistan_microdata/Case study datasets/Harmonised/reach_afg_dataset_mcna_aug2017_harmonised.xlsx"
PATH_SDC_DATASET = "C:/Users/cmcinerney/Desktop/UNOCHA Fellowship/Afghanistan_microdata/Case study datasets/Harmonised/sdc-afg-msna-microdata-harmonised.xlsx"

# PATH_REACH_DATASET is presumably defined earlier in the script (not shown).
df_reach18 = pd.read_excel(PATH_REACH_DATASET)
df_reach17 = pd.read_excel(PATH_REACH17_DATASET)
df_sdc = pd.read_excel(PATH_SDC_DATASET)

# =============================================================================
# Preprocessing the data to increase likelihood of finding linkages
# =============================================================================
# string variables to clean
df_reach18.province = clean(df_reach18.province,
                            lowercase=True,
                            replace_by_none='[^ \\-\\_A-Za-z0-9]+',
                            replace_by_whitespace='[\\-\\_]',
                            strip_accents=None,
                            remove_brackets=True,
                            encoding='utf-8',
                            decode_error='strict')
df_reach18.district = clean(df_reach18.district,
                            lowercase=True,
                            replace_by_none='[^ \\-\\_A-Za-z0-9]+',
                            replace_by_whitespace='[\\-\\_]',
                            strip_accents=None,
                            remove_brackets=True,
                            encoding='utf-8',
                            decode_error='strict')
# The original call was truncated here; completed with the same keyword
# arguments as the calls above, which all use an identical configuration.
df_reach18.origin_country = clean(df_reach18.origin_country,
                                  lowercase=True,
                                  replace_by_none='[^ \\-\\_A-Za-z0-9]+',
                                  replace_by_whitespace='[\\-\\_]',
                                  strip_accents=None,
                                  remove_brackets=True,
                                  encoding='utf-8',
                                  decode_error='strict')
def _expand_sex(df, col):
    df[col] = preprocessing.clean(df[col])
def preprocess(sfdf, repdf, keys):
    key_list = list()

    '''preprocessing'''
    sfdf.update(clean(sfdf.FirstName))
    sfdf.update(clean(sfdf.LastName))
    sfdf.update(clean(sfdf.Email))
    sfdf.update(clean(sfdf.MailingState))
    sfdf.update(phonenumbers(sfdf.MailingPostalCode))
    sfdf.update(clean(sfdf.MailingCity))
    sfdf.update(phonenumbers(sfdf.Phone))
    sfdf.update(clean(sfdf.CRD__c.astype(str)))

    repdf.update(clean(repdf.FirstName))
    repdf.update(clean(repdf.LastName))
    repdf.update(clean(repdf.Email))
    repdf.update(clean(repdf.MailingState))
    repdf.update(phonenumbers(repdf.MailingPostalCode))
    repdf.update(clean(repdf.MailingCity))
    repdf.update(phonenumbers(repdf.Phone))
    repdf.update(clean(repdf.CRD__c.astype(str)))

    '''key generating'''
    for df in [sfdf, repdf]:
        for key in keys:
            if len(key[:-1]) > 1:
                key_col = ''.join(
                    [''.join(c for c in s if c.isupper()) for s in key[:-1]])
                if key_col not in key_list:
                    key_list.append(key_col)
                df[key_col] = pd.Series(
                    np.add.reduce(df[key[:-1]].astype(str), axis=1))
            else:
                if key[0] not in key_list:
                    key_list.append(key[0])
    return sfdf, repdf, key_list
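# The shape of `keys` is inferred from the slicing above: each key is a list
# of column names with one trailing element that key[:-1] strips off (its
# role is not visible in this snippet). A hypothetical example:
#
#     keys = [['FirstName', 'LastName', 0.9], ['Email', 0.9]]
#
# would register a combined 'FNLN' key (the uppercase letters of the column
# names) built by concatenating FirstName and LastName, plus the
# single-column key 'Email'.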
def _expand_location(df, col):
    df[col] = preprocessing.clean(df[col])
df["Name"] = df["Name"].str.replace(r, ' ', regex=False) df["Address"] = df["Address"].str.replace(r, ' ', regex=False) #remove excess whitespace df["Name"] = df["Name"].str.replace(r" +", " ", regex=True) df["Address"] = df["Address"].str.replace(r" +", " ", regex=True) #standardise postal codes - just remove empty space and make sure it's all lower case df.loc[~df.PostalCode.isnull(), 'PostalCode'] = df.loc[~df.PostalCode.isnull(), 'PostalCode'].str.replace(' ', '').str.lower() #create an extra temporary Name column with an additional level of cleaning df['NameClean'] = clean(df["Name"]) #Some records have street number and street name, but no address field filled df.loc[(df.Address.isnull())&\ (~df.StreetName.isnull()),'Address']\ =clean(df.loc[(df.Address.isnull())&\ (~df.StreetName.isnull()),'StreetNumber']+' '+\ df.loc[(df.Address.isnull())&\ (~df.StreetName.isnull()),'StreetName']+' '+\ df.loc[(df.Address.isnull())&\ (~df.StreetName.isnull()),'City']) print( '1.b) Simple Deduplication - use Pandas to drop identical rows before additional processing' )
# PREPROCESSING SP500
len(sp500)
sp500.head()

fig_sp500 = plt.figure(figsize=(9, 4))
heatmap_sp500 = sns.heatmap(sp500.isnull(), cbar=False)
plt.title("SP500: Missing Values")
plt.savefig("MV-sp500.png", dpi=700, bbox_inches='tight')
plt.show()

sp500['Date first added'].isnull().sum()
sp500['GICS Sector'].unique()
sp500['GICS Sub Industry'].unique()

sp500_pre = sp500.copy()
sp500_pre['GICS Sector'] = clean(sp500_pre['GICS Sector'])
sp500_pre['GICS Sub Industry'] = clean(sp500_pre['GICS Sub Industry'])
sp500_pre['Security'] = clean(sp500_pre['Security'])

# count and strip common company suffixes from the security name
sp500_pre["Security"].str.count(r'\bcorp\b').sum()
sp500_pre['Security'] = sp500_pre["Security"].str.replace(r'\bcorp\b', '',
                                                          regex=True)
sp500_pre["Security"].str.count(r'\binc\b').sum()
sp500_pre['Security'] = sp500_pre["Security"].str.replace(r'\binc\b', '',
                                                          regex=True)
sp500_pre["Security"].str.count(r'\bco\b').sum()
sp500_pre['Security'] = sp500_pre["Security"].str.replace(r'\bco\b', '',
                                                          regex=True)
sp500_pre["Security"].str.count(r'\bltdp\b').sum()