Example #1
def bestMatch(token):
    # Convert the token with soundex
    soundexToken = soundex(token)

    # Find entries in soundexDict with the same Soundex code and record their indices
    candidateIndex = []
    for i in range(len(soundexDict)):
        if soundexToken == soundexDict[i]:
            candidateIndex.append(i)

    # Use the index numbers of the matches to extract the original words from the dictionary
    candidateLs = []
    for i in candidateIndex:
        candidateLs.append(dictLs[i])

    # Use Levenshtein Distance (edit distance) to compare every potential match to the misspelled token,
    # and return the most similar one as the best match
    maxRatio = 0
    bestMatch = ""
    for i in candidateLs:
        # The higher the ratio, the more similar the two strings are
        ratio = lev.ratio(token, i)
        if ratio > maxRatio:
            maxRatio = ratio
            bestMatch = i
    return bestMatch
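
These examples all rely on a `soundex` helper imported from the surrounding project; some return a bare code, others a tuple that the snippet indexes (e.g. `soundex(word)[1]`). For reference, a minimal sketch of the classic four-character Soundex encoding follows; it is a generic illustration, not the implementation any of these projects import:

def simple_soundex(name):
    """Classic Soundex sketch: first letter plus three digits, padded with zeros."""
    codes = {**dict.fromkeys("BFPV", "1"), **dict.fromkeys("CGJKQSXZ", "2"),
             **dict.fromkeys("DT", "3"), "L": "4",
             **dict.fromkeys("MN", "5"), "R": "6"}
    name = "".join(c for c in name.upper() if c.isalpha())
    if not name:
        return ""
    result = name[0]
    prev = codes.get(name[0], "")
    for c in name[1:]:
        digit = codes.get(c, "")
        # append a digit only if it differs from the previously coded letter
        if digit and digit != prev:
            result += digit
        # vowels reset the previous code; H and W are transparent
        if c not in "HW":
            prev = digit
    return (result + "000")[:4]

For example, simple_soundex("Robert") and simple_soundex("Rupert") both give "R163", which is why the snippets above fall back to edit distance to rank candidates that share a code.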
Example #2
def phonetic_candidates_soundex(word, d):
    word = word.lower()
    phonetic_representation = soundex(word)[1]
    # print(phonetic_representation)
    soundex_candidates = []
    if phonetic_representation in dict_inverted_soundex:
        word_list = dict_inverted_soundex[phonetic_representation]
        for w in word_list:
            soundex_candidates.append((w, word_threshold * d))

    return soundex_candidates
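
This snippet assumes two names defined elsewhere in the project: `dict_inverted_soundex`, an inverted index from Soundex codes to the dictionary words that produce them, and `word_threshold`, a scoring constant. A sketch of how such an inverted index might be built, assuming (as `soundex(word)[1]` implies) that `soundex` returns a pair whose second element is the code:

def build_inverted_soundex(word_list):
    # Sketch only: map each Soundex code to the list of dictionary words that share it.
    inverted = {}
    for w in word_list:
        code = soundex(w.lower())[1]
        inverted.setdefault(code, []).append(w)
    return inverted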
Example #3
def __init__(self, corpus_name):
    logger.info(f"initialize index for file {corpus_name}")
    self.file_name = corpus_name
    self.soundex = soundex(N__GRAM)
    self.algo_ref = {
        'levenshtein': levenshtein,
        'c_levenshtein': editdistance.eval,
        'lcs': lcs,
        'ngrams': ngrams_match
    }
    self.load_corpus()
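
The `algo_ref` table maps algorithm names to string-distance callables defined elsewhere in that project (`levenshtein`, `lcs`, `ngrams_match`) or imported (`editdistance.eval`). As a reference point for what a `levenshtein` entry typically computes, here is a minimal dynamic-programming edit distance; this is generic code, not the project's own function:

def levenshtein_distance(a, b):
    # Classic two-row DP edit distance: insertions, deletions and substitutions all cost 1.
    if len(a) < len(b):
        a, b = b, a
    previous = list(range(len(b) + 1))
    for i, ca in enumerate(a, start=1):
        current = [i]
        for j, cb in enumerate(b, start=1):
            cost = 0 if ca == cb else 1
            current.append(min(previous[j] + 1,          # deletion
                               current[j - 1] + 1,       # insertion
                               previous[j - 1] + cost))  # substitution
        previous = current
    return previous[-1]

For example, levenshtein_distance("kitten", "sitting") returns 3.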
Example #4
def main(argv):

    parser = argparse.ArgumentParser(
        description=
        'Merge split WMO Publication 47 metadata files to one file per country'
    )
    parser.add_argument( "-config", dest = "config", required = False, \
                          default = "config.json", help = "JSON file containing configuration settings")
    parser.add_argument( "-jobs", dest = "jobs", required = True, \
                          default = "jobs.json", help = "JSON file containing configuration and list of jobs to run")
    parser.add_argument("-countries", dest="country_file", required=False, \
                         help="JSON file containing list of countries to process", default = None)
    parser.add_argument("-index", dest="index", required=False,  type = int,  \
                         help="Index of country to process", default = None)
    parser.add_argument("-country", dest="country", required=False, \
                         help="2 character country code to process", default = None)

    #parser.add_argument( "-log", dest="log_path", required=False, default='./', \
    #                     help = "Directory to write log files to")
    #parser.add_argument( "-tag", dest="tag", required=False, default='', \
    #                     help = "Tag appended to log files")

    # add argument to specify index / position in list of countries.
    args = parser.parse_args()
    control_file = args.jobs
    config_file = args.config
    country_file = args.country_file
    country = args.country
    country_index = args.index - 1 if args.index is not None else None
    #log_path = args.log_path

    if country_file is None and country is None:
        print("Error, one of countries or country must be supplied")
        assert False

    if country_file is not None and country is not None:
        print("Error, only one of countries or country must be supplied")
        assert False

    # load config options
    with open(config_file) as cf:
        config = json.load(cf)

    with open(control_file) as s:
        control = json.load(s)

    #datapath         = config['data_path']
    configpath = config['config_path']
    #verbose          = config['verbose']
    outputpath = config['output_path']
    #corrections_file = configpath + './' + config['corrections_file']

    map_file = config["mapping_path"] + "./pub47_common_names.json"

    fmiss = -999999.
    imiss = -1  # -999999

    with open(map_file) as m:
        mapping = json.load(m)

    if country_file is not None:
        with open(country_file) as m:
            countries = json.load(m)
        if country_index is not None:
            country = countries[country_index]
            countries = list()
            countries.append(country)
    else:
        countries = list()
        countries.append(country)

    # iterate over countries
    for country in countries:
        print("Processing " + country)
        master = pd.DataFrame()
        for job in control['jobs']:
            schema = pub47schema(configpath + './schemas/', job['schema'])
            input_file = outputpath + './split/' + os.path.basename(
                job['data_file']) + "." + country

            # files now only exist for country if data in them, warn if no file found.
            if not os.path.isfile(input_file):
                print('{} not found'.format(input_file))
                continue
            else:
                print(' ... {} '.format(input_file))
            # load data
            datain = pd.read_csv(input_file, sep='|',
                                 dtype='object')  # read as object
            datain = datain.drop_duplicates(
                keep='first')  # some duplicates are appearing from somewhere !
            # check whether we need to handle columns that have been split
            # split_columns = (len( schema.split_fields ) > 0)

            # NOTE: only text / object columns split so don't need to convert those columns
            # need to check / revise this in the future
            # convert to expected data type
            numeric_columns = list()
            columns_processed = list()
            for column in schema.column_name:
                columns_processed.append(column)
                if schema.column_type[column] == 'int':
                    datain[column].replace(cmiss, str(imiss), inplace=True)
                    datain = datain.astype({column: 'int'})
                elif schema.column_type[column] == 'float':
                    datain[column].replace(cmiss, str(fmiss), inplace=True)
                    datain = datain.astype({column: 'float'})
                    numeric_columns.append(column)

            # convert numeric_columns variable to set for later use
            numeric_columns = set(numeric_columns)

            # fill all NAs with fmiss (-99999)
            #datain.fillna( fmiss , inplace = True)

            # convert valid_from and valid_to to datetime objects, these are not in the schema but added in the first
            # step of processing
            datain['valid_from'] = pd.to_datetime(datain['valid_from'])
            datain['valid_to'] = pd.to_datetime(datain['valid_to'])

            # identify which mapping to use
            version = "v" + str(schema.version)
            mapUse = mapping[version][1]
            invMap = dict([[v, k] for k, v in mapUse.items()])

            # map columns in input data to output required (store in tmp df)
            tmpDf = datain.copy()
            tmpDf = tmpDf.assign(source_file=input_file)
            tmpDf = tmpDf.assign(alt_names='')

            # check if year present, if not set to year from schema
            if not ('year' in tmpDf):
                tmpDf = tmpDf.assign(year=job['year'])

            tmpDf = tmpDf.assign(month=job['month'])
            tmpDf = tmpDf.assign(publication_frequency=job['freq'])

            # rename columns to that expected in output schema
            colNewNames = dict()
            for column in tmpDf:
                if column in invMap:
                    colNewNames[column] = invMap[column]

            tmpDf.rename(columns=colNewNames, inplace=True)

            # regularise ship names, first need to fill null strings with 'NULL'
            tmpDf['name'] = tmpDf['name'].fillna('NULL')
            # collapse repeated whitespace to a single space
            tmpDf.at[:, 'name'] = tmpDf.loc[:, 'name'].apply(
                lambda x: re.sub("\\s\\s+", " ", x))
            # add a dot after isolated single initials, e.g. "A" becomes "A."
            tmpDf.at[:, 'name'] = tmpDf.loc[:, 'name'].apply(
                lambda x: re.sub("( )([A-Z]{1})( )", "\\1\\2.\\3", x))
            # finally, add a space between a dot and the following letter, e.g. "A.ABC" becomes "A. ABC"
            tmpDf.at[:, 'name'] = tmpDf.loc[:, 'name'].apply(
                lambda x: re.sub("([A-Z]{1})(\\.)([A-Z]{1})", "\\1\\2 \\3", x))

            sx = tmpDf['name'].apply(lambda x: soundex(x))
            tmpDf = tmpDf.assign(sx=sx)
            tmpDf = tmpDf.assign(record_number=1)
            to_add = []
            # now check each callsign and ship name to see if records can be merged with existing
            if master.shape[0] > 0:
                print("   input_file: {}  ".format(input_file))
                print(tmpDf['callsign'].dtype)
                for idx in tmpDf.index.values:
                    action = 'new_record'
                    id = tmpDf.loc[idx, 'callsign']
                    if id == cmiss:
                        continue
                    shipname = tmpDf.loc[idx, 'name']
                    matches = master.loc[(master['callsign'] == id)].copy()
                    if matches.shape[0] > 0:

                        # get last record added
                        max_record = max(matches['record_number'])
                        id_match = matches[matches['record_number'] ==
                                           max_record].index.values

                        # now get similarity in names, either by soundex or type
                        distance = max( float(matches['sx'][id_match[0]] == tmpDf.loc[idx, 'sx' ]),\
                                        Levenshtein.ratio(matches['name'][id_match[0]], shipname) )

                        # if close match check elements
                        if distance > 0.8:
                            # get list of common fields between new entry and matches
                            common = list(
                                set(list(tmpDf)).intersection(
                                    list(matches)).intersection(
                                        config['duplicateChecks']))

                            # perform merge
                            # idx = row in current file
                            # id_match = row in master data frame

                            # if rows are the same excluding missing data, merge copying missing data
                            # else if rows are different add new row.

                            # get list of matching elements (TRUE|FALSE)
                            matching_elements = tmpDf.loc[
                                idx, common] == matches.loc[id_match[0],
                                                            common]
                            # possible actions
                            #  - merge and fill
                            #  - merge and correct
                            #  - keep old, increment dates
                            #  - add new
                            # exact match: merge dates and files
                            if matching_elements.all():
                                action = 'increment_date'
                                min_date = min({
                                    tmpDf.loc[idx, 'valid_from'],
                                    matches.loc[id_match[0], 'valid_from']
                                })
                                max_date = max({
                                    tmpDf.loc[idx, 'valid_to'],
                                    matches.loc[id_match[0], 'valid_to']
                                })
                                #master.at[matches.index[0], 'valid_to'] = max_date
                                #master.at[matches.index[0], 'valid_from'] = min_date
                                master.at[id_match[0], 'valid_to'] = max_date
                                master.at[id_match[0], 'valid_from'] = min_date
                                master.at[id_match[0],
                                          'source_file'] = master.loc[
                                              id_match[0],
                                              'source_file'] + ';' + tmpDf.loc[
                                                  idx, 'source_file']
                                if (tmpDf.loc[idx, 'name'] !=
                                        master.loc[id_match[0], 'name']) & (
                                            tmpDf.loc[idx, 'name']
                                            not in master.loc[id_match[0],
                                                              'alt_names']):
                                    master.at[
                                        id_match[0], 'alt_names'] = master.loc[
                                            id_match[0],
                                            'alt_names'] + ';' + tmpDf.loc[
                                                idx, 'name']
                            else:
                                # remove missing elements and recheck
                                missing_left = (
                                    (tmpDf.loc[idx, common] == cmiss) |
                                    (tmpDf.loc[idx, common] == imiss) |
                                    (tmpDf.loc[idx, common] == fmiss))
                                missing_right = (
                                    (matches.loc[id_match[0], common] == cmiss)
                                    |
                                    (matches.loc[id_match[0], common] == imiss)
                                    | (matches.loc[id_match[0], common]
                                       == fmiss))
                                missing = (missing_left | missing_right)
                                missing = (missing | matching_elements)
                                if missing.all():
                                    action = 'fill_missing'
                                    mismatch = ~matching_elements
                                    right_columns = missing_right.index[(
                                        missing_right & mismatch)].format()
                                    # set valid date range to span both records
                                    min_date = min({
                                        tmpDf.loc[idx, 'valid_from'],
                                        matches.loc[id_match[0], 'valid_from']
                                    })
                                    max_date = max({
                                        tmpDf.loc[idx, 'valid_to'],
                                        matches.loc[id_match[0], 'valid_to']
                                    })
                                    master.at[
                                        id_match[0],
                                        'source_file'] = master.loc[
                                            id_match[0],
                                            'source_file'] + ';' + tmpDf.loc[
                                                idx, 'source_file']
                                    if (tmpDf.loc[idx, 'name'] !=
                                            master.loc[id_match[0], 'name']
                                        ) & (tmpDf.loc[idx, 'name']
                                             not in master.loc[id_match[0],
                                                               'alt_names']):
                                        master.at[
                                            id_match[0],
                                            'alt_names'] = master.loc[
                                                id_match[0],
                                                'alt_names'] + ';' + tmpDf.loc[
                                                    idx, 'name']
                                    # now update master table
                                    master.at[id_match[0],
                                              'valid_to'] = max_date
                                    master.at[id_match[0],
                                              'valid_from'] = min_date
                                    # now fill master table (this is the one we keep)
                                    if len(right_columns) > 0:
                                        master.at[id_match[0],
                                                  right_columns] = tmpDf.loc[
                                                      idx, right_columns]
                                else:
                                    # now check numeric (float) elements
                                    mismatch = ~(matching_elements | missing)
                                    numeric_mismatch = numeric_columns.intersection(
                                        mismatch.index[mismatch].format())
                                    if len(numeric_mismatch) > 0:
                                        print(" **** Numeric mismatch **** ")
                                        print(tmpDf.loc[
                                            idx,
                                            pd.np.array(common)[mismatch]])
                                        print(matches.loc[
                                            id_match[0],
                                            pd.np.array(common)[mismatch]])
                                        action = 'correct_numeric'
                                    else:
                                        action = 'new_record'
                                        tmpDf.at[
                                            idx,
                                            'record_number'] = max_record + 1
                        else:
                            action = 'new_record'
                            tmpDf.at[idx, 'record_number'] = max_record + 1
                    if action == 'new_record':
                        to_add.append(idx)
            else:
                to_add = tmpDf.index.values
            # concat to master table
            master = pd.concat([master, tmpDf.loc[to_add, ]],
                               ignore_index=True,
                               sort=False)
            # replace nans with expected missing value
            for column in master:
                if master[column].dtype == 'datetime64[ns]':
                    continue
                if master[column].dtype == 'float64':
                    master[column].fillna(fmiss, inplace=True)
                elif master[column].dtype == 'object':
                    master[column].fillna(cmiss, inplace=True)
                elif master[column].dtype == 'int64':
                    master[column].fillna(imiss, inplace=True)
                else:
                    print('Unknown column type: {}'.format(
                        master[column].dtype))

        # final step is sort and addition of record numbers

        # assign UIDs to all records
        uid = master.apply(lambda x: '{}-{}-{}'.format(x['callsign'], x[
            'sx'], x['recruiting_country']),
                           axis=1)
        master = master.assign(uid=uid)

        # sort by id then date
        master.sort_values(['uid', 'valid_from'], inplace=True)

        # reset index
        master.reset_index(inplace=True, drop=True)

        # now reset record numbers based on uid
        uids = master['uid'].unique()
        count = 0
        for uid in uids:
            if count % 200 == 0:
                print('{} / {} '.format(count, len(uids)))
            records = master.loc[master['uid'] == uid, :]
            nrecs = records.shape[0]
            master.at[records.index, 'record_number'] = pd.np.arange(nrecs)
            # adjust valid from and to for 1st and last records
            new_valid_to = records.valid_from.shift(-1)
            to_change = (((records['valid_to'] - new_valid_to)).dt.days >= -3625) & \
                        (((records['valid_to'] - new_valid_to)).dt.days  <= 0)
            if to_change.any():
                records.loc[to_change, 'valid_to'] = new_valid_to[to_change]
            # add 5 years to last record and subtract 1 year from first
            records.loc[records.index[0], 'valid_from'] = records.loc[
                records.index[0], 'valid_from'] - relativedelta(months=12)
            records.loc[records.index[nrecs - 1],
                        'valid_to'] = records.loc[records.index[nrecs - 1],
                                                  'valid_to'] + relativedelta(
                                                      months=60)
            master.at[records.index, ['valid_from', 'valid_to']] = records.loc[
                records.index, ['valid_from', 'valid_to']]
            count += 1

        # now save
        # convert each field back to str and replace missing values with NULL
        master = master.astype(str)
        master.replace(str(fmiss), pd.np.nan, inplace=True)
        master.replace(str(imiss), pd.np.nan, inplace=True)
        master.to_csv(outputpath + './master/master.' + country + '.csv',
                      index=False,
                      sep='|',
                      na_rep='NULL')
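
The name-regularisation block above applies three regular expressions to each ship name before the Soundex codes are computed. A standalone sketch of the same cleanup outside pandas; the sample name is made up for illustration:

import re

def regularise_ship_name(name):
    name = re.sub("\\s\\s+", " ", name)                              # collapse repeated whitespace
    name = re.sub("( )([A-Z]{1})( )", "\\1\\2.\\3", name)            # "A" -> "A."
    name = re.sub("([A-Z]{1})(\\.)([A-Z]{1})", "\\1\\2 \\3", name)   # "A.ABC" -> "A. ABC"
    return name

print(regularise_ship_name("MV  A BRAVO"))  # -> "MV A. BRAVO" (hypothetical name)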
Example #5
File: fuzzy.py  Project: happytcj/Euphony
        rdr.next()

        for row in rdr:
            female_names.append(row[1])
            male_names.append(row[0])

parse_csv('/Users/TJiang/Desktop/name_in_english/names.csv')
print male_names
exit(0)

#soundex = fuzzy.Soundex(4)

hash_list = []

for n in male_names:
    print '%-10s' % n, soundex(n)
    hash_list.append(soundex(n))

user_name   = raw_input('enter desired name to match')
user_hash   = soundex(user_name)
gender      = 'm'
#gender      = raw_input('enter gender m/f')

same_score_list = []

top_match = -1
for idx, one_hash in enumerate(hash_list):
    score = fuzz.ratio(user_hash, one_hash)
    if score > top_match:
        top_match = score
        del same_score_list[:]
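
The loop above (Python 2, and truncated here) keeps the highest fuzz.ratio score between the user's Soundex hash and each candidate name's hash, clearing the tie list whenever a new top score appears. A Python 3 sketch of the same idea that also returns the tied names; the function and argument names are illustrative, not from fuzzy.py:

def best_soundex_matches(user_name, names, soundex, ratio):
    # Score every name's Soundex code against the user's and keep the ties for the top score.
    user_hash = soundex(user_name)
    top_score, best = -1, []
    for name in names:
        score = ratio(user_hash, soundex(name))
        if score > top_score:
            top_score, best = score, [name]
        elif score == top_score:
            best.append(name)
    return top_score, best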
Example #6
# Find the best match from the dictionary for a misspelled token

import Levenshtein as lev
from soundex import *

# Convert the dictionary with soundex. Referred as soundexDict
soundexDict = []
for i in dictLs:
    soundexDict.append(soundex(i))


def bestMatch(token):
    # Convert the token with soundex
    soundexToken = soundex(token)

    # Find the exact same soundex code in the soundexDict, and save their index numbers
    candidateIndex = []
    for i in range(len(soundexDict)):
        if soundexToken == soundexDict[i]:
            candidateIndex.append(i)

    # Use the index numbers of the matches to extract the original words from the dictionary
    candidateLs = []
    for i in candidateIndex:
        candidateLs.append(dictLs[i])

    # Use Levenshtein Distance (edit distance) to compare every potential match to the misspelled token,
    # and return the most similar one as the best match
    maxRatio = 0
    bestMatch = ""
    for i in candidateLs:
        # The higher the ratio, the more similar the two strings are
        ratio = lev.ratio(token, i)
        if ratio > maxRatio:
            maxRatio = ratio
            bestMatch = i
    return bestMatch
Example #7
def main(argv):

    parser = argparse.ArgumentParser(
        description='Split WMO Publication 47 metadata files by country')
    parser.add_argument( "-config", dest = "config", required = False, \
                          default = "config.json", help = "JSON file containing configuration settings")
    parser.add_argument( "-jobs", dest = "jobs", required = True, \
                          default = "jobs.json", help = "JSON file containing list of jobs to run")
    parser.add_argument( "-start", dest="jobIndexStart", type=int, required=True, default=1, \
                         help = "Index of first job to process")
    parser.add_argument( "-end", dest="jobIndexEnd", type=int, required=False, default=None, \
                         help = "Index of last job to process, defaults to first job")
    parser.add_argument( "-log", dest="log_path", required=False, default='./', \
                         help = "Directory to write log files to")
    parser.add_argument( "-tag", dest="tag", required=False, default='', \
                         help = "Tag appended to log files")

    args = parser.parse_args()
    control_file = args.jobs
    first_job = args.jobIndexStart
    last_job = args.jobIndexEnd
    config_file = args.config
    log_path = args.log_path

    if last_job is None:
        last_job = first_job

    # set validity periods for different editions
    validity = {'annual': 12, 'quarterly': 3, 'semi-annual': 6}

    # global options stored in config file
    # jobs specific options in control_file (need to rename)

    # load config options
    with open(config_file) as cf:
        config = json.load(cf)

    # load controls / list of files to process
    with open(control_file) as s:
        control = json.load(s)

    # parsing using pandas

    # global options

    map_path = config['mapping_path']
    datapath = config['data_path']
    configpath = config['config_path']
    verbose = config['verbose']
    outputpath = config['output_path']
    corrections_file = configpath + './' + config['corrections_file']

    print(corrections_file)

    # read options from control file
    log_file = log_path + './split_pub47_' + args.tag + '.log'

    # load corrections
    with open(corrections_file) as m:
        corrections = json.load(m)

    # open log file for later use
    log = open(log_file, 'w')

    # iterate over jobs in control file
    for job_index in pd.np.arange(first_job, last_job + 1, 1):
        # find job in job list
        for job in control['jobs']:
            if job['jobindex'] == job_index:
                break
        assert job_index == job['jobindex']

        rejects = pd.DataFrame()

        # load schema
        schema = pub47schema(configpath + './schemas/', job['schema'])

        # get input file
        input_file = job['data_file']
        input_file = datapath + input_file

        # set validity dates
        valid_from = datetime.date(job['year'], job['month'], 1)
        valid_to = datetime.date(job['year'], job['month'], 1) + relativedelta(
            months=validity[job['freq']])

        # feedback
        if verbose > 0:
            print("Processing " + os.path.basename(input_file), file=log)

        # now read in the data
        datain = pub47load(schema, input_file, map_path)

        # remove any exact duplicates
        datain = datain.drop_duplicates(keep='first')

        # now we need to identify duplicates within country
        id_counts = datain.loc[:, 'call'].value_counts()
        duplicated_ids = id_counts.index.values[id_counts > 1]
        duplicated_ids = list(duplicated_ids[duplicated_ids != cmiss])
        unique_ids = list(id_counts.index.values[id_counts == 1])
        unique_ids.append(cmiss)
        unique_rows = datain.loc[
            datain['call'].apply(lambda x: x in unique_ids), :].copy()

        for dup_id in duplicated_ids:
            dup_rows = datain.loc[datain['call'] == dup_id, :]
            # more than two entries for same callsign, reject all for later assessment
            if dup_rows.shape[0] > 2:
                rejects = pd.concat([rejects, dup_rows],
                                    ignore_index=True,
                                    sort=False)
                continue
            cmp = dup_rows.apply(lambda x: pub47_record_completeness(x),
                                 axis=1)
            vsslM = dup_rows.loc[:, 'vsslM']
            most_complete = list(cmp[cmp == max(cmp)].index.values)
            highest_class = list(vsslM[vsslM == min(vsslM)].index.values)
            ix = dup_rows.index.values
            same_name = soundex(dup_rows.loc[ix[0], 'name']) == soundex(
                dup_rows.loc[ix[1], 'name'])
            same_country = dup_rows.loc[ix[0], schema.recruiting_country ] == \
                           dup_rows.loc[ix[1], schema.recruiting_country ]
            # if same country and name merge if possible
            # if different country but same name use highest VOS class
            # else mark for rejection as ambiguous
            if same_country and same_name:
                # check if we can merge
                if pub47_record_compare(
                        dup_rows.loc[ix[0], schema.duplicate_check],
                        dup_rows.loc[ix[1], schema.duplicate_check]):
                    record_to_add = dup_rows.loc[[most_complete[0]], ].copy()
                    merged_record = pub47_merge_rows(
                        dup_rows.loc[ix[0], schema.duplicate_check],
                        dup_rows.loc[ix[1], schema.duplicate_check])
                    # record_to_add.at[ ix[0], schema['duplicate_check'] ] \
                    merged_record = pd.DataFrame(merged_record).transpose()
                    record_to_add.reset_index(inplace=True, drop=True)
                    merged_record.reset_index(inplace=True, drop=True)
                    record_to_add.at[:, schema.duplicate_check] = \
                        merged_record.loc[:, schema.duplicate_check]
                elif len(highest_class) == 1:
                    record_to_add = dup_rows.loc[[highest_class[0]], :].copy()
                elif len(most_complete) == 1:
                    record_to_add = dup_rows.loc[[most_complete[0]], :].copy()
                else:
                    rejects = pd.concat([rejects, dup_rows],
                                        ignore_index=True,
                                        sort=False)
                    record_to_add = None
            elif same_country:
                rejects = pd.concat([rejects, dup_rows],
                                    ignore_index=True,
                                    sort=False)
                record_to_add = None
            else:
                record_to_add = dup_rows
            if record_to_add is not None:
                unique_rows = pd.concat([unique_rows, record_to_add],
                                        ignore_index=True,
                                        sort=False)

        # save rejects to file
        print("Saving rejects")
        rejects = rejects.astype(str)
        rejects.replace(str(fmiss), pd.np.nan, inplace=True)
        rejects.replace(str(imiss), pd.np.nan, inplace=True)
        rejects.to_csv(outputpath + './split/' + os.path.basename(input_file) +
                       '.' + 'reject',
                       index=False,
                       sep='|',
                       na_rep='NULL')
        datain = unique_rows.copy()

        # get list of countries present in file
        countries = datain.rcnty.unique()
        print(countries, file=log)

        # now loop over countries homogenising
        for country in countries:
            if verbose > 0:
                print("Processing {}".format(country), file=log)
            tmp_data = datain.loc[datain.rcnty == country].copy()

            tmp_data = tmp_data.reindex()
            nrows = tmp_data.shape[0]

            # output file for data from this country
            country_file = os.path.basename(input_file) + '.' + country

            cor = None
            # check if corrections exists for country / edition
            for cor_temp in corrections:
                if cor_temp['file'] == country_file:
                    cor = cor_temp
                    break

            # validate (and correct) data
            for column in tmp_data:
                # ++++++++++ CORRECT DATA ++++++++++
                # check if correction required and apply
                if cor is not None:
                    for f in cor['corrections']:
                        if f['field'] == column:
                            if f['all'] == 1:
                                if verbose > 0:
                                    print(
                                        "Applying corrections to all values in {}"
                                        .format(column),
                                        file=log)
                                    print("Factor = {}".format(f['factor']),
                                          file=log)
                                # getting non missing rows
                                # valid = tmp_data[column] != fmiss
                                valid = tmp_data[column].apply(
                                    lambda x: abs(x - fmiss) < tol)
                                # apply to tmp data
                                tmp_data.at[valid, column] = tmp_data.loc[
                                    valid, column] * f['factor']
                                # apply to datain
                                datain.at[ (datain['rcnty'] == country) & (valid) , column] = \
                                                datain.loc[ (datain['rcnty'] == country) & (valid) , column] * f['factor']
                            else:
                                valid = pv.validate_numeric(
                                    tmp_data[column],
                                    min_value=schema.column_valid_min[column],
                                    max_value=schema.column_valid_max[column],
                                    return_type='mask_series')
                                valid = valid & ~(tmp_data[column].apply(
                                    lambda x: abs(x - fmiss) < tol))
                                if any(valid):
                                    if verbose > 0:
                                        print(
                                            "Applying corrections to invalid values in {}"
                                            .format(column),
                                            file=log)
                                        print("Factor = {}".format(
                                            f['factor']),
                                              file=log)
                                    # apply to tmp data
                                    tmp_data.at[valid, column] = tmp_data.loc[
                                        valid, column] * f['factor']
                                    # now apply to datain
                                    valid = pv.validate_numeric(
                                        datain[column],
                                        min_value=schema.column_valid_min[column],
                                        max_value=schema.column_valid_max[column],
                                        return_type='mask_series')
                                    datain.at[ (datain['rcnty'] == country) & (valid), column] = \
                                                    datain.loc[ (datain['rcnty'] == country) & (valid), column] * f['factor']
                # ++++++++++ VALIDATE CODED DATA ++++++++++
                # get code table to validate against
                tableID = schema.column_code_table[column]
                if tableID in schema.code_tables:
                    codes = schema.code_tables[tableID]
                    if verbose > 1:
                        print("Validating against code table: " + str(tableID),
                              file=log)
                    whitelist = codes['code'].map(str)
                    whitelist = whitelist.append(
                        pd.Series([cmiss, '-1', 'NA', '-999999']))
                    tmp_values = tmp_data[column].map(str)
                    valid = pv.validate_string(tmp_values,
                                               whitelist=whitelist,
                                               return_type='mask_series')

                # ++++++++++ VALIDATE NUMERIC ++++++++++
                if tableID is None:
                    if schema.column_type[column] != 'object':
                        # if int convert to float and replace -1 with np.na
                        if str(tmp_data[column].dtype) == 'int64':
                            tmp_values = pd.to_numeric(tmp_data[column])
                            tmp_values = tmp_values.replace(
                                to_replace=imiss, value=fmiss)  # pd.np.nan )
                        else:
                            tmp_values = tmp_data[column]
                        valid = pv.validate_numeric(
                            tmp_data[column],
                            min_value=schema.column_valid_min[column],
                            max_value=schema.column_valid_max[column],
                            return_type='mask_series')
                        valid = valid & ~(tmp_data[column].apply(
                            lambda x: abs(x - fmiss) < tol))
                    else:
                        # no numeric check for object columns: flag nothing as bad
                        valid = pd.Series([False] * nrows, index=tmp_data.index)

                # calculate fraction bad
                fraction_bad = sum(valid) / nrows
                if (fraction_bad > 0.05) & (nrows > 10):
                    mask = valid.apply(lambda x: not x)
                    print("///////////// " + os.path.basename(input_file) +
                          '.' + country + " /////////////",
                          file=log)
                    print("Large number of bad values for " + column + "(" +
                          str(tableID) + ")",
                          file=log)
                    print(tmp_data.loc[valid, column].unique(), file=log)
                elif any(valid):
                    print("Bad values, {} ({})  :: {}".format(
                        column, str(tableID), tmp_values[valid].unique()),
                          file=log)

            dataout = datain[datain.rcnty == country]
            dataout = dataout.assign(valid_from=valid_from)
            dataout = dataout.assign(valid_to=valid_to)
            dataout = dataout.assign(schema=schema.version)
            # convert all columns to object and replace fmiss and imiss with NA
            dataout = dataout.astype(str)
            dataout.replace(str(fmiss), pd.np.nan, inplace=True)
            dataout.replace(str(imiss), pd.np.nan, inplace=True)
            dataout.to_csv(outputpath + './split/' +
                           os.path.basename(input_file) + '.' + country,
                           index=False,
                           sep='|',
                           na_rep='NULL')
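
The duplicate handling in this example turns on two tests: whether two rows with the same callsign share a recruiting country, and whether their ship names agree under Soundex. A stripped-down sketch of that pairwise decision using plain dicts instead of DataFrame rows; the field names mirror the example, and the returned labels are only illustrative, not the project's API:

def resolve_duplicate_pair(row_a, row_b, soundex):
    same_name = soundex(row_a['name']) == soundex(row_b['name'])
    same_country = row_a['rcnty'] == row_b['rcnty']
    if same_country and same_name:
        return 'merge'      # try to merge, else keep the most complete / highest VOS class row
    elif same_country:
        return 'reject'     # same country but different name: ambiguous, reject both rows
    else:
        return 'keep_both'  # different recruiting countries: keep both records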