def _run(df, i):
    nonlocal records
    nonlocal missing
    nonlocal columns
    print("col_name:", cols[i])
    # match cols (spaces in column names were removed/replaced when saving them to the file name)
    col = cols[i]
    ds_name = os.path.basename(gz_paths[i])
    result = match_preprocess(cols[i], {'foo': df.columns}, match_jacc_min)
    if result is not None:
        c = result[COL]
        print('found:', c)
        col = c
    try:
        df = df.select(spark_col(col))  # remove all but the column of interest
    except Exception:
        missing.add(str({'ds_name': ds_name, 'col_name_ta': cols[i]}))
        raise ValueError('missing:', (ds_name, cols[i]), 'cols:', df.columns)
    df_cols = map_cols(df)
    df_counts = get_counts(df_cols)
    df_output = get_n_freq_str(df_counts, top_n)
    df_output = df_output.select(
        lit(ds_name).alias('ds_path'),
        lit(cols[i]).alias('col_name_ta'),
        '*')
    if columns is None:
        columns = df_output.columns  # concat
    records.append([row.asDict() for row in df_output.collect()][0])
    return df_output
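# Hypothetical driver sketch (not in the original source). `_run` above declares
# `records`/`missing`/`columns` as nonlocal, so it is assumed to be nested inside an enclosing
# function that also defines the parallel lists `gz_paths` and `cols`, the SparkSession
# `spark`, and `top_n`. The CSV read options below are an assumption about the input files.
for i in range(len(gz_paths)):
    df_i = spark.read.csv(gz_paths[i], header=True)
    try:
        _run(df_i, i)
    except ValueError as err:
        # the column from `cols[i]` could not be matched; it is already recorded in `missing`
        print('skipping dataset:', err)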
def _match_semantic_vals(col_val, s_type_col, is_spark=True):
    """
    stage 1: run the value matcher ('match_preprocess') on only the matched s_type_col.
    if the cutoff is not passed (avg distance from the column is too high):
        stage 2: use heuristics (from manually examining frequent data for each col (ref_set))
            to limit the number of s_type_vals in ref_set_vals to compare against.
            E.g. null is automatically assigned the matched s_type_col.
            E.g. check for substrings: if 'com' is in the val, then check 'website' s_type_vals
            for similarity; 'co' is implicitly in 'com', so check business_name as well, etc.
            This minimizes misclassifications. Place the candidates in 'check' to later build
            another s_type_vals dict using only those s_types.
        stage 3: run 'match_preprocess' again on all s_types except the matched s_type_col,
            or only on the heuristic matches from stage 2 (if the heuristic check yielded results).
        stage 4: check whether the stage 3 result is significantly better than the stage 1
            result, i.e. whether the avg_dist improved by some percentage ('IMPROVE_RATIO').
            If not, assign the val to the matched s_type_col, as would happen if the value
            were null.
        stage 5 (doesn't work in Spark): if the min_dist is less than some similarity cutoff
            ('MIN_CUTOFF', meaning it is sufficiently close) and larger than some similarity
            cutoff ('IDENTICAL_CUTOFF', meaning it isn't nearly identical to something already
            in the ref_set), add the value to the ref_set. If initial matches are correct,
            later matches should be more accurate. The ref_set tops out at a fixed size to
            prevent slowdown and redundant matching.
    all {col_val: s_type} combinations are cached so that identical column values aren't
    recomputed, and so that Spark can assign each to the dataframe with a udf after they are
    computed outside of Spark. the cache is cleared after each dataset
    """
    col_val = str(col_val)
    s_type_col = str(s_type_col)
    add = False
    # print(col_val, s_type_col, {s_type_col: [ref_set_vals[s_type_col]]})
    if col_val not in cache_col_val:
        AVG_CUTOFF = 0.9  # a similarity measure worse than this triggers a second, more general run
        # MIN_CUTOFF = 0.65
        # IDENTICAL_CUTOFF = 0.10
        IMPROVE_RATIO = 0.2  # the second run must improve the distance by this fraction
        str_col_val = str(col_val).lower()
        # print(str_col_val)
        if str_col_val in ('null', '-', '_', '0', 'none', '') or col_val is None:
            res_final = (s_type_col, col_val, 0.0, 0.0)  # default to s_type_col
        else:
            # compare to the values of the semantic type matched from the column name
            res0 = match_preprocess(col_val, {s_type_col: ref_set_vals[s_type_col]}, match_jacc_avg)
            # print('res0:', res0)
            # res0[MIN_DIST] != 0.0
            # was the cutoff passed, i.e. was the value present for the semantic type matched from the col_name?
            if res0 is None or AVG_CUTOFF < res0[AVG_DIST]:
                # check only these semantic types based on the content of the col_val
                # (more explicit rules after examining data)
                check = []
                remove = []
                is_alpha = str_col_val.isalpha()
                is_digit = str_col_val.isdigit()
                if len(str_col_val) == 1 and is_alpha:
                    possibles = ['person_name (middle_initial)', 'borough']
                    for pos_s_type in possibles:
                        if s_type_col == pos_s_type:  # which of these is the s_type of the col?
                            check.extend([pos_s_type])
                            break
                if len(str_col_val) == 2 and is_alpha:
                    check.extend(['color'])
                if len(str_col_val) == 5 and is_digit:
                    check.extend(['zip_code'])
                if len(str_col_val) >= 3 and is_digit:
                    check.extend(['city_agency', 'street_number', 'phone_number',
                                  'building_classification'])
                if len(str_col_val) >= 1 and is_digit:
                    check.extend(['street_number'])
                if 'ps ' in str_col_val or 'is ' in str_col_val or 'js ' in str_col_val or 'hs ' in str_col_val:
                    check.extend(['school_name'])
                if len(str_col_val) >= 3:  # can have numbers and other chars
                    if 'llc' in str_col_val or 'inc' in str_col_val or 'co' in str_col_val:
                        check.extend(['business_name'])
                    if 'http' in str_col_val or 'www' in str_col_val or 'org' in str_col_val or 'com' in str_col_val:
                        check.extend(['website'])
                    if 'ave' in str_col_val or 'str' in str_col_val:
                        if str_col_val[0].isdigit():
                            check.extend(['address'])
                # if len(check) > 0:
                #     print('check:', check)
                check = list(set(check))
                remove = list(set(remove))
                if len(check) == 0:  # compare to every semantic type except the one already checked
                    if is_spark:  # avoid an expensive, unnecessary copy
                        ref_set_diff = ref_set_vals
                    else:
                        ref_set_diff = copy.deepcopy(ref_set_vals)  # clone
                else:  # compare to only those in check
                    ref_set_diff = {}
                    for s_type in check:
                        if is_spark:
                            ref_set_diff[s_type] = ref_set_vals[s_type]
                        else:
                            ref_set_diff[s_type] = copy.deepcopy(ref_set_vals[s_type])
                # for key, val in ref_set_cols.items():  # compare to column names as well (for ms_core)
                #     if key in ref_set_diff:
                #         ref_set_diff[key].extend(val)
                # ref_set_diff[s_type_col] = []  # prevent key error and delete all values for the already matched s_type
                ref_set_diff[s_type_col] = ref_set_vals[s_type_col]  # prevent key error for the already matched s_type
                for rm in remove:
                    if rm in ref_set_diff:
                        ref_set_diff[rm] = []
                # find similarity with the other semantic value types
                res1 = match_preprocess(col_val, ref_set_diff, match_jacc_avg)
                res_final = res1
                if res0 is None and res1 is None:
                    res_final = (s_type_col, col_val, 0.0, 0.0)
                elif res0 is None:
                    # print('res0:', res0, res1)
                    res_final = res1
                elif res1 is None:
                    # print('res1:', res0, res1)
                    res_final = res0
                else:  # neither is None
                    res_final = min([res0, res1], key=lambda x: x[AVG_DIST])
                    # if AVG_CUTOFF < res_final[AVG_DIST]:  # still greater than cutoff and therefore unknown
                    if not (res_final[AVG_DIST] <= (res0[AVG_DIST] * (1 - IMPROVE_RATIO))):  # dist has not improved
                        res_final = _default(s_type_col, col_val)  # default to s_type_col
                        # ^ should the distance be non-0 to add to ref_set?
            else:
                # print('FALSE')
                res_final = res0  # cutoff passed, return the initial result
        # # not an exact match and up to n different values stored
        # if res_final[MIN_DIST] <= MIN_CUTOFF and res_final[MIN_DIST] >= IDENTICAL_CUTOFF and len(ref_set_vals[res_final[S_TYPE]]) < 30:
        #     if is_spark:
        #         add = True
        #     else:
        #         ref_set_vals[res_final[S_TYPE]].append(col_val)  # append to ref_set
        cache_col_val[col_val] = str(res_final[S_TYPE])
        # print('res_final:', res_final)
    return (cache_col_val[col_val], add)
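# Hypothetical sketch (not in the original source) of how `_match_semantic_vals` could be
# exposed as the `match_semantic_vals` Spark UDF used in the Spark-method `_run` below. The
# struct fields `s_type_val` and `add` mirror the fields selected from `s_type_val_add`; the
# exact registration in the original project may differ.
from pyspark.sql.functions import udf
from pyspark.sql.types import BooleanType, StringType, StructField, StructType

_match_schema = StructType([
    StructField('s_type_val', StringType()),  # semantic type assigned to the value
    StructField('add', BooleanType()),        # whether the value is a candidate to add to the ref_set
])

match_semantic_vals = udf(lambda v, s: _match_semantic_vals(v, s, True), _match_schema)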
def _run(df, i):
    print("col_name:", cols[i])
    col = None
    match_col = match_preprocess(cols[i], {'foo': df.columns})  # match the col from the ta name to the ds column name
    if match_col is not None:
        col = match_col[COL]
    else:  # shouldn't execute
        raise Exception(f'{cols[i]} not matched in {str(df.columns)}')
    df_cols = map_cols(df.select(col))  # filter single col
    # df_cols = df_cols.sample(0.5, seed=3).limit(500)  # TEST
    if col not in cache_col_name:  # currently unnecessary since cache_col_name is cleared after every ds
        cache_col_name[col] = match_preprocess(col, ref_set_cols)[S_TYPE]  # match col to s_type
    s_type_col = cache_col_name[col]
    # print('s_type_col:', s_type_col)
    # print('ref_set_vals[s_type_col]:', ref_set_vals[s_type_col])
    df_cols = df_cols.withColumn('s_type_col', lit(s_type_col))  # populate df with the column's s_type

    # s_types_distinct = [(s_type_col, df_cols.count())]
    # ### Python method: no spark to add to ref_set_vals
    # if i > -10:  # run on small datasets (before it gets slow)
    #     s_types_all = []
    #     for row in df_cols.select('value', 's_type_col').collect():
    #         s_type_i = _match_semantic_vals(row['value'], row['s_type_col'], False)
    #         s_types_all.append(s_type_i[0])
    #     # get (s_type, count)
    #     s_types_distinct = sc.parallelize(s_types_all).countByValue().items()
    # ###

    if i >= -10:  ### Spark method
        df_cols = df_cols.withColumn(
            's_type_val_add',
            match_semantic_vals('value', 's_type_col'))  # match unknown col value to semantic type
        # add to ref set with Spark
        df_cols = df_cols.select('*', 's_type_val_add.s_type_val')
        df_cols = df_cols.select('*', 's_type_val_add.add')
        s_types_distinct = df_cols.select('s_type_val').rdd.map(
            lambda x: x['s_type_val']).countByValue().items()
        # for row in df_cols.filter('add == True').select('value', 's_type_val').distinct().collect():
        #     if len(ref_set_vals[row['s_type_val']]) < 30:
        #         # print('ADD')
        #         # print(row['s_type_val'], 'row:', ref_set_vals[row['s_type_val']][-5:], 'val:', row['value'])
        #         ref_set_vals[row['s_type_val']].append(row['value'])
        #     else:
        #         break

        # # DEBUG
        # df_test = df_cols.groupby('s_type_col', 'value', 's_type_val', 'add').count()
        # df_test = df_test.sort('count', ascending=False)
        # print()
        # print('25 top vals')
        # df_test.show(25)
        # print('s_type_val different than s_type_col')
        # df_test.filter('s_type_val != s_type_col').show(25)
        # ###

    ds_dict = {'column_name': ta_path[i], 'semantic_types': []}
    for s_type, count in s_types_distinct:
        if s_type in LABEL_LIST_TA:
            ds_dict['semantic_types'].append({
                'semantic_type': s_type,
                'count': count
            })
        else:
            ds_dict['semantic_types'].append({
                'semantic_type': 'other',
                'label': s_type,
                'count': count
            })
    master_lst.append(ds_dict)
    print('ta_path[i]:', ds_dict)
    with open("results_similarities/master_dct_0.json", "w") as json_file:
        json.dump(master_lst, json_file, indent=4)
    cache_col_name.clear()
    cache_col_val.clear()
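# For reference, each entry appended to master_lst (and dumped to master_dct_0.json) has the
# shape sketched below; the concrete values are illustrative only, not taken from real output.
# {
#     "column_name": "<ta_path[i]>",
#     "semantic_types": [
#         {"semantic_type": "zip_code", "count": 1234},
#         {"semantic_type": "other", "label": "street_number", "count": 56}
#     ]
# }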
def _run(df, i):
    print("col_name:", cols[i])
    col = None
    match_col = match_preprocess(cols[i], {'foo': df.columns})  # match the col from the ta name to the ds column name
    if match_col is not None:
        col = match_col[COL]
    else:  # shouldn't execute
        raise Exception(f'{cols[i]} not matched in {str(df.columns)}')
    df_cols = map_cols(df.select(col))  # filter single col
    # df_cols = df_cols.sample(0.5, seed=3).limit(500)  # TEST
    if col not in cache_col_name:  # currently unnecessary since cache_col_name is cleared after every ds
        cache_col_name[col] = match_preprocess(col, ref_set_cols)[S_TYPE]  # match col to s_type
    s_type_col = cache_col_name[col]
    print('s_type_col:', s_type_col)
    print('ref_set_vals[s_type_col]:', ref_set_vals[s_type_col])
    df_cols = df_cols.withColumn('s_type_col', lit(s_type_col))  # populate df with the column's s_type

    # if i < 35:  # run on small datasets (before it gets slow)
    s_types_all = []
    ### Python method: no spark to add to ref_set_vals
    for row in df_cols.select('value', 's_type_col').collect():
        s_type_i = _match_semantic_vals(row['value'], row['s_type_col'])
        s_types_all.append(s_type_i)
    # get (s_type, count)
    s_types_distinct = sc.parallelize(s_types_all).countByValue().items()
    ###

    # the udf call below just pulls the s_types out of the cache
    df_cols = df_cols.withColumn(
        's_type_val',
        match_semantic_vals('value', 's_type_col'))  # match unknown col value to semantic type
    df_test = df_cols.groupby('s_type_col', 'value', 's_type_val').count()
    df_test = df_test.sort('count', ascending=False)
    df_test.filter('s_type_val != s_type_col').show(25)
    df_test.show(25)
    # results = [str(list(row.asDict().values())) + '\n' for row in df_test.collect()]
    # print(results[:10])
    # with open('results_similarities/test.txt', '+a') as f:
    #     for s in results:
    #         f.write(s)

    ds_dict = {'column_name': col, 'semantic_types': []}
    for s_type, count in s_types_distinct:
        if s_type in LABEL_LIST_TA:
            ds_dict['semantic_types'].append({
                'semantic_type': s_type,
                'count': count
            })
        else:
            ds_dict['semantic_types'].append({
                'semantic_type': 'other',
                'label': s_type,
                'count': count
            })
    if gz_paths[i] not in master_dct:
        master_dct[gz_paths[i]] = {}
    master_dct[gz_paths[i]].update({col: ds_dict})
    print('gz_paths[i]:', {gz_paths[i]: master_dct[gz_paths[i]]})
    with open("results_similarities/master_dct.json", "w") as json_file:
        json.dump(master_dct, json_file, indent=4)
    cache_col_name.clear()
    cache_col_val.clear()
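# For reference, master_dct (dumped to master_dct.json) is assumed to be keyed by dataset path
# and then by matched column name, as sketched below; the concrete values are illustrative only.
# {
#     "<gz_paths[i]>": {
#         "<col>": {
#             "column_name": "<col>",
#             "semantic_types": [
#                 {"semantic_type": "business_name", "count": 789},
#                 {"semantic_type": "other", "label": "color", "count": 12}
#             ]
#         }
#     }
# }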