def get_mean_length(values):
    """ Returns the frequency-weighted mean length of the keys in values.

        Inputs:
           - values - a frequency-distribution dictionary.  Each key is
             measured with len() and its associated value must be
             convertible with int() to the number of occurrences of the key.
        Outputs:
           - the accumulated length divided by the total occurrence count,
             or None when that total is zero (for example when every entry
             was skipped or the input was empty).
        Notes:
           - keys for which field_type.is_unknown() is true are skipped
           - entries that raise ValueError while being measured or counted
             are skipped (usually 'unknown values', sometimes garbage)
    """
    total_length = 0
    total_occurrences = 0

    for key in values:
        if field_type.is_unknown(key):
            continue
        try:
            key_length = len(key)
            occurrences = int(values[key])
        except ValueError:
            # the count could not be interpreted - leave the entry out
            continue
        total_length += key_length * occurrences
        total_occurrences += occurrences

    if total_occurrences == 0:
        return None
    return total_length / total_occurrences
def get_known_values(self, fieldno):
    """ Returns a list of the entries found by iterating over
        self.field_freqs[fieldno], keeping only those for which
        typer.is_unknown() returns exactly False.

        Inputs:
           - fieldno - key used to look up the field in self.field_freqs
        Outputs:
           - a list, in iteration order, of the retained entries
    """
    known = []
    for candidate in self.field_freqs[fieldno]:
        if typer.is_unknown(candidate) is False:
            known.append(candidate)
    return known
def get_max(value_type, values):
    """ Returns the largest known value of the input, as a string.

        Inputs:
           - value_type - one of integer, float, string, timestamp, unknown
             or None (checked with an assert)
           - values - dictionary or list of values; iterating a dictionary
             examines its keys
        Outputs:
           - str() of the maximum value, or None when no usable values
             remain
        Notes:
           - values for which typer.is_unknown() is true are ignored
           - for 'integer' and 'float' each value is converted with int()
             or float() before comparison; values that raise ValueError
             during conversion are ignored
    """
    assert value_type in ['integer', 'float', 'string', 'timestamp',
                          'unknown', None]

    # pick the conversion once rather than re-testing the type per value
    if value_type == 'integer':
        convert = int
    elif value_type == 'float':
        convert = float
    else:
        convert = None

    candidates = []
    for raw in values:
        if typer.is_unknown(raw):
            continue
        try:
            candidates.append(raw if convert is None else convert(raw))
        except ValueError:
            pass   # ignore invalid values

    try:
        return str(max(candidates))
    except ValueError:
        # max() of an empty list: nothing usable was found
        return None
def get_known_values(self, fieldno):
    """ Collects the entries of self.field_freqs[fieldno] that are not
        unknown values.

        Inputs:
           - fieldno - key used to look up the field in self.field_freqs
        Outputs:
           - a list (not a dictionary) of the entries produced by iterating
             the field's collection for which typer.is_unknown() returned
             exactly False
    """
    field_entries = self.field_freqs[fieldno]
    retained = []
    for entry in field_entries:
        verdict = typer.is_unknown(entry)
        if verdict is False:
            retained.append(entry)
    return retained
def get_max(value_type, values):
    """ Returns the largest known value of the input.

        Inputs:
           - value_type - one of integer, float, string, timestamp, unknown
             or None (checked with an assert)
           - values - dictionary or list of values; iterating a dictionary
             examines its keys
        Outputs:
           - for 'integer' and 'float': str() of the maximum converted value
           - for any other type: the maximum value itself, unconverted
           - None when no usable values remain
        Notes:
           - values for which typer.is_unknown() is true are ignored
           - numeric values that raise ValueError in int()/float() are
             ignored
        Issues:
           - doesn't report to caller number of values rejected due to
             unknown or invalid values
    """
    assert (value_type in ['integer', 'float', 'string', 'timestamp',
                           'unknown', None])

    is_numeric = value_type in ['integer', 'float']
    caster = int if value_type == 'integer' else float

    survivors = []
    for item in values:
        if typer.is_unknown(item):
            continue
        if not is_numeric:
            survivors.append(item)
            continue
        try:
            survivors.append(caster(item))
        except ValueError:
            continue   # not valid for the requested numeric type

    try:
        largest = max(survivors)
        return str(largest) if is_numeric else largest
    except ValueError:
        # raised by max() when nothing survived the filtering
        return None
def get_max(value_type, values):
    """ Finds the maximum among the known values of the input.

        Inputs:
           - value_type - one of integer, float, string, timestamp, unknown
             or None (checked with an assert)
           - values - dictionary or list of values; iterating a dictionary
             examines its keys
        Outputs:
           - str() of the maximum when value_type is 'integer' or 'float'
           - the maximum value as-is for every other value_type
           - None when there is nothing left to compare
        Notes:
           - values for which typer.is_unknown() is true are skipped
           - values that raise ValueError during int()/float() conversion
             are skipped
        Issues:
           - doesn't report to caller number of values rejected due to
             unknown or invalid values
    """
    assert (value_type in ['integer', 'float', 'string', 'timestamp',
                           'unknown', None])

    def coerce(raw):
        # convert a single value according to the requested type
        if value_type == 'integer':
            return int(raw)
        if value_type == 'float':
            return float(raw)
        return raw

    usable = []
    for raw in values:
        if not typer.is_unknown(raw):
            try:
                usable.append(coerce(raw))
            except ValueError:
                pass   # invalid for the requested type

    try:
        top = max(usable)
        if value_type in ['integer', 'float']:
            return str(top)
        return top
    except ValueError:
        # max() had no values to work with
        return None
def get_min(value_type, values):
    """ Returns the smallest known value of the input.

        Inputs:
           - value_type - one of integer, float, string, timestamp, unknown
             or None (checked with an assert)
           - values - dictionary or list of values; iterating a dictionary
             examines its keys
        Outputs:
           - for "integer" and "float": str() of the minimum converted value
           - for any other type: the minimum value itself, unconverted
           - None when no usable values remain
        Notes:
           - values for which typer.is_unknown() is true are ignored
           - numeric values that raise ValueError in int()/float() are
             ignored
        Issues:
           - doesn't report to caller number of values rejected due to
             unknown or invalid values
    """
    assert value_type in ["integer", "float", "string", "timestamp", "unknown", None]

    is_numeric = value_type in ["integer", "float"]
    caster = int if value_type == "integer" else float

    # first handle types & unknowns:
    survivors = []
    for item in values:
        if typer.is_unknown(item):
            continue
        if not is_numeric:
            survivors.append(item)
            continue
        try:
            survivors.append(caster(item))
        except ValueError:
            continue  # not valid for the requested numeric type

    # next return the minimum value
    try:
        smallest = min(survivors)
        return str(smallest) if is_numeric else smallest
    except ValueError:
        # raised by min() when nothing survived the filtering
        return None
def get_case(field_type, values):
    """ Determines the case of a list or dictionary of values.

        Args:
           - field_type: if not == 'string', the result is 'n/a'
           - values: could be either dictionary or list.  If it's a
             dictionary, then only the keys are examined.
        Returns:
           - 'n/a' for non-string fields, otherwise one of:
             'mixed','lower','upper','unknown'
        Misc notes:
           - values flagged by typer.is_unknown(), typer.is_integer() or
             typer.is_float() do not influence the result
           - a value that is neither islower() nor isupper() makes the
             result 'mixed', as does seeing both lower and upper values
           - empty values list/dict results in 'unknown' result
        To do:
           - add consistency factor
    """
    if field_type != 'string':
        return 'n/a'

    saw_lower = False
    saw_upper = False
    saw_mixed = False

    # note which kinds of case occur in values:
    for key in values:
        if typer.is_unknown(key):
            continue
        if typer.is_integer(key) or typer.is_float(key):
            continue   # numbers carry no case information
        if key.islower():
            saw_lower = True
        elif key.isupper():
            saw_upper = True
        else:
            saw_mixed = True

    # evaluate what was seen:
    if saw_mixed or (saw_lower and saw_upper):
        return 'mixed'
    if saw_lower:
        return 'lower'
    if saw_upper:
        return 'upper'
    return 'unknown'
def get_min_length(values):
    """ Returns the minimum length value of the input.  If no values are
        found besides unknown it will just return None.

        Inputs:
           - dictionary or list of string values; iterating a dictionary
             examines its keys
        Outputs:
           - the length of the shortest value that typer.is_unknown() does
             not flag, or None when there is no such value
        Notes:
           - the minimum starts out as None rather than a large sentinel
             number, so an empty or all-unknown input yields None as
             documented, and values of any length are handled
    """
    min_length = None
    for value in values:
        if typer.is_unknown(value):
            continue
        length = len(value)
        if min_length is None or length < min_length:
            min_length = length
    return min_length
def get_case(field_type, values):
    """ Determines the case of a list or dictionary of values.

        Args:
           - field_type: if not == 'string', the result is 'n/a'
           - values: could be either dictionary or list.  If it's a
             dictionary, then only the keys are examined.
        Returns:
           - 'n/a' for non-string fields, otherwise one of:
             'mixed','lower','upper','unknown'
        Misc notes:
           - values flagged by typer.is_unknown(), typer.is_integer() or
             typer.is_float() are counted but do not influence the result
           - empty values list/dict results in 'unknown' result
           - the evaluated case is always returned; falling off the end of
             the function would hand callers None instead of one of the
             documented results
        To do:
           - add consistency factor
    """
    if field_type != 'string':
        return 'n/a'

    freq = collections.defaultdict(int)

    # count occurrences of each kind of case in values:
    for key in values:
        if typer.is_unknown(key):
            freq['unk'] += 1
        elif typer.is_integer(key):    # will be ignoring these
            freq['number'] += 1
        elif typer.is_float(key):      # will be ignoring these
            freq['number'] += 1
        elif key.islower():
            freq['lower'] += 1
        elif key.isupper():
            freq['upper'] += 1
        else:
            freq['mixed'] += 1

    # evaluate frequency distribution:
    if 'mixed' in freq:
        case = 'mixed'
    elif 'lower' in freq and 'upper' in freq:
        case = 'mixed'
    elif 'lower' in freq:
        case = 'lower'
    elif 'upper' in freq:
        case = 'upper'
    else:
        case = 'unknown'
    return case
def test_type_d01_is_unknown(self):
    """ mod.is_unknown() must return exactly True for the blank and
        'not available'-style markers, and exactly False for ordinary
        strings, numeric strings, numbers and None.
    """
    unknown_markers = ('', ' ', 'na', 'NA', 'n/a', 'N/A', 'unk', 'unknown')
    known_inputs = ('b', '$3', '4,333.22', '33.22', '3', '-3', 3, 3.3, None)

    for marker in unknown_markers:
        assert mod.is_unknown(marker) is True
    for candidate in known_inputs:
        assert mod.is_unknown(candidate) is False
def test_type_d01_is_unknown(self):
    """ Checks mod.is_unknown() against a table of inputs paired with the
        exact boolean each one must produce.
    """
    expectations = [
        ('', True),
        (' ', True),
        ('na', True),
        ('NA', True),
        ('n/a', True),
        ('N/A', True),
        ('unk', True),
        ('unknown', True),
        ('b', False),
        ('$3', False),
        ('4,333.22', False),
        ('33.22', False),
        ('3', False),
        ('-3', False),
        (3, False),
        (3.3, False),
        (None, False),
    ]
    for value, expected in expectations:
        assert (mod.is_unknown(value) is expected)
def test_is_unknown_basics(self):
    """ Blank and 'not available'-style markers must be reported as
        unknown (any truthy result is accepted); ordinary strings, numeric
        strings, numbers and None must produce exactly False.
    """
    for marker in ('', ' ', 'na', 'NA', 'n/a', 'N/A', 'unk', 'unknown'):
        assert mod.is_unknown(marker)

    for candidate in ('b', '$3', '4,333.22', '33.22', '3', '-3',
                      3, 3.3, None):
        assert mod.is_unknown(candidate) is False
def test_is_unknown_basics(self):
    """ Exercises mod.is_unknown() with two groups of inputs: markers that
        must be treated as unknown (truthy result) and values that must
        yield exactly False.
    """
    treated_as_unknown = ["", " ", "na", "NA", "n/a", "N/A", "unk", "unknown"]
    treated_as_known = ["b", "$3", "4,333.22", "33.22", "3", "-3", 3, 3.3, None]

    for text in treated_as_unknown:
        result = mod.is_unknown(text)
        assert result

    for item in treated_as_known:
        result = mod.is_unknown(item)
        assert result is False