def bestMatch(token):
    # Convert the token with soundex
    soundexToken = soundex(token)

    # Find the exact same soundex code in the soundexDict, and save their index numbers
    candidateIndex = []
    for i in range(len(soundexDict)):
        if soundexToken == soundexDict[i]:
            candidateIndex.append(i)

    # Use the index numbers of the matches, extract the original words from the original dictionary
    candidateLs = []
    for i in candidateIndex:
        candidateLs.append(dictLs[i])

    # Use Levenshtein Distance (edit distance) to compare every potential match to the misspelled token,
    # and return the most similar one as the best match
    maxRatio = 0
    bestMatch = ""
    for i in candidateLs:
        # The higher the ratio, the more similar the two strings are
        ratio = lev.ratio(token, i)
        if ratio > maxRatio:
            maxRatio = ratio
            bestMatch = i
    return bestMatch
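# The snippets in this file assume a soundex() helper imported from elsewhere
# (and at least one variant appears to return a tuple, since soundex(word)[1] is
# indexed later on). As a rough illustration only, here is a minimal sketch of the
# classic four-character Soundex encoding; the project's actual helper may differ.
def soundex_sketch(word):
    """Return a classic 4-character Soundex code, e.g. 'Robert' -> 'R163'."""
    codes = {**dict.fromkeys("BFPV", "1"), **dict.fromkeys("CGJKQSXZ", "2"),
             **dict.fromkeys("DT", "3"), "L": "4",
             **dict.fromkeys("MN", "5"), "R": "6"}
    word = "".join(c for c in word.upper() if c.isalpha())
    if not word:
        return ""
    encoded = word[0]
    prev = codes.get(word[0], "")
    for ch in word[1:]:
        digit = codes.get(ch, "")
        # Skip vowels/H/W/Y and collapse adjacent letters sharing the same digit
        if digit and digit != prev:
            encoded += digit
        # H and W do not reset the previous digit; vowels (no digit) do
        if ch not in "HW":
            prev = digit
    return (encoded + "000")[:4]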
def phonetic_candidates_soundex(word, d):
    word = word.lower()
    phonetic_representation = soundex(word)[1]
    # print(phonetic_representation)
    soundex_candidates = []
    if phonetic_representation in dict_inverted_soundex:
        word_list = dict_inverted_soundex[phonetic_representation]
        for w in word_list:
            soundex_candidates.append((w, word_threshold * d))
    return soundex_candidates
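# phonetic_candidates_soundex() relies on module-level names defined elsewhere:
# dict_inverted_soundex (a mapping from a soundex code to the dictionary words
# sharing that code) and word_threshold. A rough sketch of how such an inverted
# index might be built from a plain word list; it assumes the same imported,
# tuple-returning soundex() helper the function above uses, and the function
# name here is illustrative rather than the project's own.
from collections import defaultdict

def build_inverted_soundex(word_list):
    inverted = defaultdict(list)
    for w in word_list:
        code = soundex(w.lower())[1]
        inverted[code].append(w)
    return dict(inverted)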
def __init__(self, corpus_name):
    logger.info(f"initialize index for file {corpus_name}")
    self.file_name = corpus_name
    self.soundex = soundex(N__GRAM)
    self.algo_ref = {
        'levenshtein': levenshtein,
        'c_levenshtein': editdistance.eval,
        'lcs': lcs,
        'ngrams': ngrams_match
    }
    self.load_corpus()
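# self.algo_ref above is a dispatch table: each key names a string-comparison
# function (a pure-Python levenshtein, the C-backed editdistance.eval, longest
# common subsequence, n-gram matching) so the metric can be picked at runtime.
# A hedged, self-contained sketch of the same pattern using only the standard
# library; the function and key names here are illustrative, not the project's.
import difflib

def char_overlap(a, b):
    # crude similarity: shared characters over total distinct characters
    return 2 * len(set(a) & set(b)) / (len(set(a)) + len(set(b)))

algo_ref = {
    'seqmatch': lambda a, b: difflib.SequenceMatcher(None, a, b).ratio(),
    'overlap': char_overlap,
}

def best_match(query, candidates, algo='seqmatch'):
    score = algo_ref[algo]                     # pick the metric at runtime
    return max(candidates, key=lambda w: score(query, w))

print(best_match('arithmatic', ['arithmetic', 'aromatic', 'artistic']))  # -> 'arithmetic'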
def main(argv):
    parser = argparse.ArgumentParser(
        description='Merge split WMO Publication 47 metadata files to one file per country')
    parser.add_argument("-config", dest="config", required=False, default="config.json",
                        help="JSON file containing configuration settings")
    parser.add_argument("-jobs", dest="jobs", required=True, default="jobs.json",
                        help="JSON file containing configuration and list of jobs to run")
    parser.add_argument("-countries", dest="country_file", required=False, default=None,
                        help="JSON file containing list of countries to process")
    parser.add_argument("-index", dest="index", required=False, type=int, default=None,
                        help="Index of country to process")
    parser.add_argument("-country", dest="country", required=False, default=None,
                        help="2 character country code to process")
    # parser.add_argument("-log", dest="log_path", required=False, default='./',
    #                     help="Directory to write log files to")
    # parser.add_argument("-tag", dest="tag", required=False, default='',
    #                     help="Tag appended to log files")
    # add argument to specify index / position in list of countries.

    args = parser.parse_args()

    control_file = args.jobs
    config_file = args.config
    country_file = args.country_file
    country = args.country
    # -index is 1-based; only convert to 0-based when it has been supplied
    country_index = args.index - 1 if args.index is not None else None
    # log_path = args.log_path

    if country_file is None and country is None:
        print("Error, one of countries or country must be supplied")
        assert False

    if country_file is not None and country is not None:
        print("Error, only one of countries or country must be supplied")
        assert False

    # load config options
    with open(config_file) as cf:
        config = json.load(cf)

    with open(control_file) as s:
        control = json.load(s)

    # datapath = config['data_path']
    configpath = config['config_path']
    # verbose = config['verbose']
    outputpath = config['output_path']
    # corrections_file = configpath + './' + config['corrections_file']
    map_file = config["mapping_path"] + "./pub47_common_names.json"

    fmiss = -999999.
    imiss = -1  # -999999

    with open(map_file) as m:
        mapping = json.load(m)

    if country_file is not None:
        with open(country_file) as m:
            countries = json.load(m)
        if country_index is not None:
            country = countries[country_index]
            countries = list()
            countries.append(country)
    else:
        countries = list()
        countries.append(country)

    # iterate over countries
    for country in countries:
        print("Processing " + country)
        master = pd.DataFrame()
        for job in control['jobs']:
            schema = pub47schema(configpath + './schemas/', job['schema'])
            input_file = outputpath + './split/' + os.path.basename(job['data_file']) + "." + country
            # files now only exist for country if data in them, warn if no file found.
            if not os.path.isfile(input_file):
                print('{} not found'.format(input_file))
                continue
            else:
                print(' ... {} '.format(input_file))
            # load data
            datain = pd.read_csv(input_file, sep='|', dtype='object')  # read as object
            datain = datain.drop_duplicates(keep='first')  # some duplicates are appearing from somewhere!
            # check whether we need to handle columns that have been split
            # split_columns = (len( schema.split_fields ) > 0)
            # NOTE: only text / object columns split so don't need to convert those columns
            # need to check / revise this in the future

            # convert to expected data type
            numeric_columns = list()
            columns_processed = list()
            for column in schema.column_name:
                columns_processed.append(column)
                if schema.column_type[column] == 'int':
                    datain[column].replace(cmiss, str(imiss), inplace=True)
                    datain = datain.astype({column: 'int'})
                elif schema.column_type[column] == 'float':
                    datain[column].replace(cmiss, str(fmiss), inplace=True)
                    datain = datain.astype({column: 'float'})
                    numeric_columns.append(column)

            # convert numeric_columns variable to set for later use
            numeric_columns = set(numeric_columns)

            # fill all NAs with fmiss (-99999)
            # datain.fillna( fmiss , inplace = True)

            # convert valid_from and valid_to to datetime objects, these are not in the schema
            # but added in the first step of processing
            datain['valid_from'] = pd.to_datetime(datain['valid_from'])
            datain['valid_to'] = pd.to_datetime(datain['valid_to'])

            # identify which mapping to use
            version = "v" + str(schema.version)
            mapUse = mapping[version][1]
            invMap = dict([[v, k] for k, v in mapUse.items()])

            # map columns in input data to output required (store in tmp df)
            tmpDf = datain.copy()
            tmpDf = tmpDf.assign(source_file=input_file)
            tmpDf = tmpDf.assign(alt_names='')

            # check if year present, if not set to year from schema
            if not ('year' in tmpDf):
                tmpDf = tmpDf.assign(year=job['year'])
                tmpDf = tmpDf.assign(month=job['month'])
                tmpDf = tmpDf.assign(publication_frequency=job['freq'])

            # rename columns to that expected in output schema
            colNewNames = dict()
            for column in tmpDf:
                if column in invMap:
                    colNewNames[column] = invMap[column]
            tmpDf.rename(columns=colNewNames, inplace=True)

            # regularise ship names, first need to fill null strings with 'NULL'
            tmpDf['name'] = tmpDf['name'].fillna('NULL')
            # replace runs of whitespace (e.g. double spaces) with a single space
            tmpDf.at[:, 'name'] = tmpDf.loc[:, 'name'].apply(
                lambda x: re.sub("\\s+", " ", x))
            # now single initials with initial., e.g. A with A.
            tmpDf.at[:, 'name'] = tmpDf.loc[:, 'name'].apply(
                lambda x: re.sub("( )([A-Z]{1})( )", "\\1\\2.\\3", x))
            # finally, add space between dot and letters, e.g. A.ABC with A. ABC
            tmpDf.at[:, 'name'] = tmpDf.loc[:, 'name'].apply(
                lambda x: re.sub("([A-Z]{1})(\\.)([A-Z]{1})", "\\1\\2 \\3", x))

            sx = tmpDf['name'].apply(lambda x: soundex(x))
            tmpDf = tmpDf.assign(sx=sx)
            tmpDf = tmpDf.assign(record_number=1)

            to_add = []
            # now check each callsign and ship name to see if records can be merged with existing
            if master.shape[0] > 0:
                print(" input_file: {} ".format(input_file))
                print(tmpDf['callsign'].dtype)
                for idx in tmpDf.index.values:
                    action = 'new_record'
                    id = tmpDf.loc[idx, 'callsign']
                    if id == cmiss:
                        continue
                    shipname = tmpDf.loc[idx, 'name']
                    matches = master.loc[(master['callsign'] == id)].copy()
                    if matches.shape[0] > 0:
                        # get last record added
                        max_record = max(matches['record_number'])
                        id_match = matches[matches['record_number'] == max_record].index.values
                        # now get similarity in names, either by soundex or type
                        distance = max(
                            float(matches['sx'][id_match[0]] == tmpDf.loc[idx, 'sx']),
                            Levenshtein.ratio(matches['name'][id_match[0]], shipname))
                        # if close match check elements
                        if distance > 0.8:
                            # get list of common fields between new entry and matches
                            common = list(
                                set(list(tmpDf)).intersection(list(matches)).intersection(
                                    config['duplicateChecks']))
                            # perform merge
                            # idx = row in current file
                            # id_match = row in master data frame
                            # if rows are the same excluding missing data, merge copying missing data
                            # else if rows are different add new row.
                            # get list of matching elements (TRUE|FALSE)
                            matching_elements = tmpDf.loc[idx, common] == matches.loc[id_match[0], common]
                            # possible actions
                            # - merge and fill
                            # - merge and correct
                            # - keep old, increment dates
                            # - add new
                            if matching_elements.all():
                                # exact match, merge dates and files
                                action = 'increment_date'
                                min_date = min({tmpDf.loc[idx, 'valid_from'],
                                                matches.loc[id_match[0], 'valid_from']})
                                max_date = max({tmpDf.loc[idx, 'valid_to'],
                                                matches.loc[id_match[0], 'valid_to']})
                                # master.at[matches.index[0], 'valid_to'] = max_date
                                # master.at[matches.index[0], 'valid_from'] = min_date
                                master.at[id_match[0], 'valid_to'] = max_date
                                master.at[id_match[0], 'valid_from'] = min_date
                                master.at[id_match[0], 'source_file'] = \
                                    master.loc[id_match[0], 'source_file'] + ';' + tmpDf.loc[idx, 'source_file']
                                if (tmpDf.loc[idx, 'name'] != master.loc[id_match[0], 'name']) & \
                                        (tmpDf.loc[idx, 'name'] not in master.loc[id_match[0], 'alt_names']):
                                    master.at[id_match[0], 'alt_names'] = \
                                        master.loc[id_match[0], 'alt_names'] + ';' + tmpDf.loc[idx, 'name']
                            else:
                                # remove missing elements and recheck
                                missing_left = ((tmpDf.loc[idx, common] == cmiss) |
                                                (tmpDf.loc[idx, common] == imiss) |
                                                (tmpDf.loc[idx, common] == fmiss))
                                missing_right = ((matches.loc[id_match[0], common] == cmiss) |
                                                 (matches.loc[id_match[0], common] == imiss) |
                                                 (matches.loc[id_match[0], common] == fmiss))
                                missing = (missing_left | missing_right)
                                missing = (missing | matching_elements)
                                if missing.all():
                                    action = 'fill_missing'
                                    mismatch = ~matching_elements
                                    right_columns = missing_right.index[(missing_right & mismatch)].format()
                                    # set valid date range to span both records
                                    min_date = min({tmpDf.loc[idx, 'valid_from'],
                                                    matches.loc[id_match[0], 'valid_from']})
                                    max_date = max({tmpDf.loc[idx, 'valid_to'],
                                                    matches.loc[id_match[0], 'valid_to']})
                                    master.at[id_match[0], 'source_file'] = \
                                        master.loc[id_match[0], 'source_file'] + ';' + tmpDf.loc[idx, 'source_file']
                                    if (tmpDf.loc[idx, 'name'] != master.loc[id_match[0], 'name']) & \
                                            (tmpDf.loc[idx, 'name'] not in master.loc[id_match[0], 'alt_names']):
                                        master.at[id_match[0], 'alt_names'] = \
                                            master.loc[id_match[0], 'alt_names'] + ';' + tmpDf.loc[idx, 'name']
                                    # now update master table
                                    master.at[id_match[0], 'valid_to'] = max_date
                                    master.at[id_match[0], 'valid_from'] = min_date
                                    # now fill master table (this is the one we keep)
                                    if len(right_columns) > 0:
                                        master.at[id_match[0], right_columns] = tmpDf.loc[idx, right_columns]
                                else:
                                    # now check numeric (float) elements
                                    mismatch = ~(matching_elements | missing)
                                    numeric_mismatch = numeric_columns.intersection(
                                        mismatch.index[mismatch].format())
                                    if len(numeric_mismatch) > 0:
                                        print(" **** Numeric mismatch **** ")
                                        print(tmpDf.loc[idx, pd.np.array(common)[mismatch]])
                                        print(matches.loc[id_match[0], pd.np.array(common)[mismatch]])
                                        action = 'correct_numeric'
                                    else:
                                        action = 'new_record'
                                        tmpDf.at[idx, 'record_number'] = max_record + 1
                        else:
                            action = 'new_record'
                            tmpDf.at[idx, 'record_number'] = max_record + 1
                    if action == 'new_record':
                        to_add.append(idx)
            else:
                to_add = tmpDf.index.values

            # concat to master table
            master = pd.concat([master, tmpDf.loc[to_add, ]], ignore_index=True, sort=False)

            # replace nans with expected missing value
            for column in master:
                if master[column].dtype == 'datetime64[ns]':
                    continue
                if master[column].dtype == 'float64':
                    master[column].fillna(fmiss, inplace=True)
                elif master[column].dtype == 'object':
                    master[column].fillna(cmiss, inplace=True)
                elif master[column].dtype == 'int64':
                    master[column].fillna(imiss, inplace=True)
                else:
                    print('Unknown column type: {}'.format(master[column].dtype))

        # final step is sort and addition of record numbers
        # assign UIDs to all records
        uid = master.apply(lambda x: '{}-{}-{}'.format(x['callsign'], x['sx'], x['recruiting_country']), axis=1)
        master = master.assign(uid=uid)

        # sort by id then date
        master.sort_values(['uid', 'valid_from'], inplace=True)
        # reset index
        master.reset_index(inplace=True, drop=True)

        # now reset record numbers based on uid
        uids = master['uid'].unique()
        count = 0
        for uid in uids:
            if count % 200 == 0:
                print('{} / {} '.format(count, len(uids)))
            records = master.loc[master['uid'] == uid, :]
            nrecs = records.shape[0]
            master.at[records.index, 'record_number'] = pd.np.arange(nrecs)
            # adjust valid from and to for 1st and last records
            new_valid_to = records.valid_from.shift(-1)
            to_change = (((records['valid_to'] - new_valid_to)).dt.days >= -3625) & \
                        (((records['valid_to'] - new_valid_to)).dt.days <= 0)
            if to_change.any():
                records.loc[to_change, 'valid_to'] = new_valid_to[to_change]
            # add 5 years to last record and subtract 1 year from first
            records.loc[records.index[0], 'valid_from'] = \
                records.loc[records.index[0], 'valid_from'] - relativedelta(months=12)
            records.loc[records.index[nrecs - 1], 'valid_to'] = \
                records.loc[records.index[nrecs - 1], 'valid_to'] + relativedelta(months=60)
            master.at[records.index, ['valid_from', 'valid_to']] = \
                records.loc[records.index, ['valid_from', 'valid_to']]
            count += 1

        # now save
        # convert each field back to str and replace missing values with NULL
        master = master.astype(str)
        master.replace(str(fmiss), pd.np.nan, inplace=True)
        master.replace(str(imiss), pd.np.nan, inplace=True)
        master.to_csv(outputpath + './master/master.' + country + '.csv',
                      index=False, sep='|', na_rep='NULL')
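# The merge step above classifies an incoming row against the latest master
# record by building boolean masks over the shared duplicate-check columns:
# equal everywhere -> extend the validity dates; differences only where one
# side is missing -> fill the gaps; otherwise flag numeric mismatches or start
# a new record. A stripped-down, self-contained illustration of those masks
# (column names and sentinel values are made up for the example):
import pandas as pd

cmiss, imiss, fmiss = 'MSNG', -1, -999999.0
new = pd.Series({'name': 'SHIP A', 'vsslM': 1, 'freeboard': fmiss})
old = pd.Series({'name': 'SHIP A', 'vsslM': imiss, 'freeboard': 5.2})

matching = new == old
missing = new.isin([cmiss, imiss, fmiss]) | old.isin([cmiss, imiss, fmiss])

if matching.all():
    action = 'increment_date'
elif (matching | missing).all():
    action = 'fill_missing'   # copy the non-missing values across
else:
    action = 'new_record'     # or 'correct_numeric' for numeric-only mismatches
print(action)                 # -> 'fill_missing' for this toy example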
    next(rdr)
    for row in rdr:
        female_names.append(row[1])
        male_names.append(row[0])


parse_csv('/Users/TJiang/Desktop/name_in_english/names.csv')
print(male_names)
exit(0)

# soundex = fuzzy.Soundex(4)
hash_list = []
for n in male_names:
    print('%-10s' % n, soundex(n))
    hash_list.append(soundex(n))

user_name = input('enter desired name to match')
user_hash = soundex(user_name)
gender = 'm'
# gender = raw_input('enter gender m/f')

same_score_list = []
top_match = -1
for idx, one_hash in enumerate(hash_list):
    score = fuzz.ratio(user_hash, one_hash)
    if score > top_match:
        top_match = score
        del same_score_list[:]
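# The matching loop above scores candidates by running fuzz.ratio over the
# soundex codes rather than the raw names, so names that sound alike score
# highly even when spelled differently. A tiny hedged example (assumes
# fuzzywuzzy's fuzz and a classic 4-character soundex like the sketch near the
# top of this file; the helper actually used here may encode differently):
from fuzzywuzzy import fuzz

print(fuzz.ratio('J500', 'J500'))  # 'Jon' vs 'John'  -> 100 (identical codes)
print(fuzz.ratio('J500', 'J520'))  # 'Jon' vs 'Jonas' -> 75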
# Find the best match from the dictionary for a misspelled token
import Levenshtein as lev
from soundex import *

# Convert the dictionary with soundex. Referred to as soundexDict
soundexDict = []
for i in dictLs:
    soundexDict.append(soundex(i))


def bestMatch(token):
    # Convert the token with soundex
    soundexToken = soundex(token)

    # Find the exact same soundex code in the soundexDict, and save their index numbers
    candidateIndex = []
    for i in range(len(soundexDict)):
        if soundexToken == soundexDict[i]:
            candidateIndex.append(i)

    # Use the index numbers of the matches, extract the original words from the original dictionary
    candidateLs = []
    for i in candidateIndex:
        candidateLs.append(dictLs[i])

    # Use Levenshtein Distance (edit distance) to compare every potential match to the misspelled token,
    # and return the most similar one as the best match
    maxRatio = 0
    bestMatch = ""
    for i in candidateLs:
        # The higher the ratio, the more similar the two strings are
        ratio = lev.ratio(token, i)
        if ratio > maxRatio:
            maxRatio = ratio
            bestMatch = i
    return bestMatch
def main(argv):
    parser = argparse.ArgumentParser(
        description='Split WMO Publication 47 metadata files by country')
    parser.add_argument("-config", dest="config", required=False, default="config.json",
                        help="JSON file containing configuration settings")
    parser.add_argument("-jobs", dest="jobs", required=True, default="jobs.json",
                        help="JSON file containing list of jobs to run")
    parser.add_argument("-start", dest="jobIndexStart", type=int, required=True, default=1,
                        help="Index of first job to process")
    parser.add_argument("-end", dest="jobIndexEnd", type=int, required=False, default=None,
                        help="Index of last job to process, defaults to first job")
    parser.add_argument("-log", dest="log_path", required=False, default='./',
                        help="Directory to write log files to")
    parser.add_argument("-tag", dest="tag", required=False, default='',
                        help="Tag appended to log files")

    args = parser.parse_args()

    control_file = args.jobs
    first_job = args.jobIndexStart
    last_job = args.jobIndexEnd
    config_file = args.config
    log_path = args.log_path

    if last_job is None:
        last_job = first_job

    # set validity periods for different editions
    validity = {'annual': 12, 'quarterly': 3, 'semi-annual': 6}

    # global options stored in config file
    # jobs specific options in control_file (need to rename)

    # load config options
    with open(config_file) as cf:
        config = json.load(cf)

    # load controls / list of files to process
    with open(control_file) as s:
        control = json.load(s)

    # parsing using pandas
    # global options
    map_path = config['mapping_path']
    datapath = config['data_path']
    configpath = config['config_path']
    verbose = config['verbose']
    outputpath = config['output_path']
    corrections_file = configpath + './' + config['corrections_file']
    print(corrections_file)

    # read options from control file
    log_file = log_path + './split_pub47_' + args.tag + '.log'

    # load corrections
    with open(corrections_file) as m:
        corrections = json.load(m)

    # open log file for later use
    log = open(log_file, 'w')

    # iterate over jobs in control file
    for job_index in pd.np.arange(first_job, last_job + 1, 1):
        # find job in job list
        for job in control['jobs']:
            if job['jobindex'] == job_index:
                break
        assert job_index == job['jobindex']

        rejects = pd.DataFrame()

        # load schema
        schema = pub47schema(configpath + './schemas/', job['schema'])

        # get input file
        input_file = job['data_file']
        input_file = datapath + input_file

        # set validity dates
        valid_from = datetime.date(job['year'], job['month'], 1)
        valid_to = datetime.date(job['year'], job['month'], 1) + relativedelta(
            months=validity[job['freq']])

        # feedback
        if verbose > 0:
            print("Processing " + os.path.basename(input_file), file=log)

        # now read in the data
        datain = pub47load(schema, input_file, map_path)

        # remove any exact duplicates
        datain = datain.drop_duplicates(keep='first')

        # now we need to identify duplicates within country
        id_counts = datain.loc[:, 'call'].value_counts()
        duplicated_ids = id_counts.index.values[id_counts > 1]
        duplicated_ids = list(duplicated_ids[duplicated_ids != cmiss])
        unique_ids = list(id_counts.index.values[id_counts == 1])
        unique_ids.append(cmiss)

        unique_rows = datain.loc[datain['call'].apply(lambda x: x in unique_ids), :].copy()

        for dup_id in duplicated_ids:
            dup_rows = datain.loc[datain['call'] == dup_id, :]
            # more than two entries for same callsign, reject all for later assessment
            if dup_rows.shape[0] > 2:
                rejects = pd.concat([rejects, dup_rows], ignore_index=True, sort=False)
                continue
            cmp = dup_rows.apply(lambda x: pub47_record_completeness(x), axis=1)
            vsslM = dup_rows.loc[:, 'vsslM']
            most_complete = list(cmp[cmp == max(cmp)].index.values)
            highest_class = list(vsslM[vsslM == min(vsslM)].index.values)
            ix = dup_rows.index.values
            same_name = soundex(dup_rows.loc[ix[0], 'name']) == soundex(dup_rows.loc[ix[1], 'name'])
            same_country = dup_rows.loc[ix[0], schema.recruiting_country] == \
                dup_rows.loc[ix[1], schema.recruiting_country]

            # if same country and name merge if possible
            # if different country but same name use highest VOS class
            # else mark for rejection as ambiguous
            if same_country and same_name:
                # check if we can merge
                if pub47_record_compare(dup_rows.loc[ix[0], schema.duplicate_check],
                                        dup_rows.loc[ix[1], schema.duplicate_check]):
                    record_to_add = dup_rows.loc[[most_complete[0]], ].copy()
                    merged_record = pub47_merge_rows(dup_rows.loc[ix[0], schema.duplicate_check],
                                                     dup_rows.loc[ix[1], schema.duplicate_check])
                    # record_to_add.at[ ix[0], schema['duplicate_check'] ] \
                    merged_record = pd.DataFrame(merged_record).transpose()
                    record_to_add.reset_index(inplace=True, drop=True)
                    merged_record.reset_index(inplace=True, drop=True)
                    record_to_add.at[:, schema.duplicate_check] = \
                        merged_record.loc[:, schema.duplicate_check]
                elif len(highest_class) == 1:
                    record_to_add = dup_rows.loc[[highest_class[0]], :].copy()
                elif len(most_complete) == 1:
                    record_to_add = dup_rows.loc[[most_complete[0]], :].copy()
                else:
                    rejects = pd.concat([rejects, dup_rows], ignore_index=True, sort=False)
                    record_to_add = None
            elif same_country:
                rejects = pd.concat([rejects, dup_rows], ignore_index=True, sort=False)
                record_to_add = None
            else:
                record_to_add = dup_rows

            if record_to_add is not None:
                unique_rows = pd.concat([unique_rows, record_to_add], ignore_index=True, sort=False)

        # save rejects to file
        print("Saving rejects")
        rejects = rejects.astype(str)
        rejects.replace(str(fmiss), pd.np.nan, inplace=True)
        rejects.replace(str(imiss), pd.np.nan, inplace=True)
        rejects.to_csv(outputpath + './split/' + os.path.basename(input_file) + '.' + 'reject',
                       index=False, sep='|', na_rep='NULL')

        datain = unique_rows.copy()

        # get list of countries present in file
        countries = datain.rcnty.unique()
        print(countries, file=log)

        # now loop over countries homogenising
        for country in countries:
            if verbose > 0:
                print("Processing {}".format(country), file=log)
            tmp_data = datain.loc[datain.rcnty == country].copy()
            tmp_data = tmp_data.reindex()
            nrows = tmp_data.shape[0]

            # output file for data from this country
            country_file = os.path.basename(input_file) + '.' + country

            cor = None
            # check if corrections exist for country / edition
            for cor_temp in corrections:
                if cor_temp['file'] == country_file:
                    cor = cor_temp
                    break

            # validate (and correct) data
            for column in tmp_data:
                # ++++++++++ CORRECT DATA ++++++++++
                # check if correction required and apply
                if cor is not None:
                    for f in cor['corrections']:
                        if f['field'] == column:
                            if f['all'] == 1:
                                if verbose > 0:
                                    print("Applying corrections to all values in {}".format(column), file=log)
                                    print("Factor = {}".format(f['factor']), file=log)
                                # getting non missing rows
                                # valid = tmp_data[column] != fmiss
                                valid = tmp_data[column].apply(lambda x: abs(x - fmiss) < tol)
                                # apply to tmp data
                                tmp_data.at[valid, column] = tmp_data.loc[valid, column] * f['factor']
                                # apply to datain
                                datain.at[(datain['rcnty'] == country) & (valid), column] = \
                                    datain.loc[(datain['rcnty'] == country) & (valid), column] * f['factor']
                            else:
                                valid = pv.validate_numeric(
                                    tmp_data[column],
                                    min_value=schema.column_valid_min[column],
                                    max_value=schema.column_valid_max[column],
                                    return_type='mask_series')
                                valid = valid & ~(tmp_data[column].apply(lambda x: abs(x - fmiss) < tol))
                                if any(valid):
                                    if verbose > 0:
                                        print("Applying corrections to invalid values in {}".format(column), file=log)
                                        print("Factor = {}".format(f['factor']), file=log)
                                    # apply to tmp data
                                    tmp_data.at[valid, column] = tmp_data.loc[valid, column] * f['factor']
                                    # now apply to datain
                                    valid = pv.validate_numeric(
                                        datain[column],
                                        min_value=schema.column_valid_min[column],
                                        max_value=schema.column_valid_max[column],
                                        return_type='mask_series')
                                    datain.at[(datain['rcnty'] == country) & (valid), column] = \
                                        datain.loc[(datain['rcnty'] == country) & (valid), column] * f['factor']

                # ++++++++++ VALIDATE CODED DATA ++++++++++
                # get code table to validate against
                tableID = schema.column_code_table[column]
                if tableID in schema.code_tables:
                    codes = schema.code_tables[tableID]
                    if verbose > 1:
                        print("Validating against code table: " + str(tableID), file=log)
                    whitelist = codes['code'].map(str)
                    whitelist = whitelist.append(pd.Series([cmiss, '-1', 'NA', '-999999']))
                    tmp_values = tmp_data[column].map(str)
                    valid = pv.validate_string(tmp_values, whitelist=whitelist,
                                               return_type='mask_series')

                # ++++++++++ VALIDATE NUMERIC ++++++++++
                if tableID is None:
                    if schema.column_type[column] != 'object':
                        # if int convert to float and replace -1 with np.na
                        if str(tmp_data[column].dtype) == 'int64':
                            tmp_values = pd.to_numeric(tmp_data[column])
                            tmp_values = tmp_values.replace(to_replace=imiss, value=fmiss)  # pd.np.nan
                        else:
                            tmp_values = tmp_data[column]
                        valid = pv.validate_numeric(
                            tmp_data[column],
                            min_value=schema.column_valid_min[column],
                            max_value=schema.column_valid_max[column],
                            return_type='mask_series')
                        valid = valid & ~(tmp_data[column].apply(lambda x: abs(x - fmiss) < tol))
                    else:
                        valid = pd.Series([False] * nrows)

                # calculate fraction bad
                fraction_bad = sum(valid) / nrows
                if (fraction_bad > 0.05) & (nrows > 10):
                    mask = valid.apply(lambda x: not x)
                    print("///////////// " + os.path.basename(input_file) + '.' + country +
                          " /////////////", file=log)
                    print("Large number of bad values for " + column + "(" + str(tableID) + ")", file=log)
                    print(tmp_data.loc[valid, column].unique(), file=log)
                elif any(valid):
                    print("Bad values, {} ({}) :: {}".format(
                        column, str(tableID), tmp_values[valid].unique()), file=log)

            dataout = datain[datain.rcnty == country]
            dataout = dataout.assign(valid_from=valid_from)
            dataout = dataout.assign(valid_to=valid_to)
            dataout = dataout.assign(schema=schema.version)

            # convert all columns to object and replace fmiss and imiss with NA
            dataout = dataout.astype(str)
            dataout.replace(str(fmiss), pd.np.nan, inplace=True)
            dataout.replace(str(imiss), pd.np.nan, inplace=True)
            dataout.to_csv(outputpath + './split/' + os.path.basename(input_file) + '.' + country,
                           index=False, sep='|', na_rep='NULL')
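# The validation step above relies on external helpers (pv.validate_numeric /
# pv.validate_string) whose exact API is not shown in this file; the returned
# mask is combined with a missing-value sentinel check and then summarised as a
# fraction of bad rows. As a library-free sketch of that same idea, an
# out-of-range mask over one column can be built with plain pandas (the bounds,
# column name and values below are made up for illustration):
import pandas as pd

fmiss, tol = -999999.0, 1e-6
heights = pd.Series([12.0, 55.0, -999999.0, 7.5])    # e.g. barometer height in metres
out_of_range = ~heights.between(0.0, 50.0)            # True where the value fails the bounds
out_of_range &= (heights - fmiss).abs() >= tol        # ignore the missing-value sentinel
fraction_bad = out_of_range.sum() / len(heights)
print(heights[out_of_range].tolist(), fraction_bad)   # -> [55.0] 0.25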