def test_single_entry_three_occurances(self):
    """A single distinct value seen three times has that value as its median."""
    freq_pairs = [(3, 3)]
    expected = 3
    assert mod.get_median(freq_pairs) == expected
def test_single_value(self):
    """A single value with a single occurrence is its own median."""
    assert mod.get_median([(3, 1)]) == 3
def test_alpha_values(self):
    """A string entry next to a numeric one: the expected median is the
    numeric value alone (4).
    """
    mixed_pairs = [('foo', 1), (4, 1)]
    assert mod.get_median(mixed_pairs) == 4
def test_types(self):
    """The median is returned as a float, for both a two-entry and a
    single-entry frequency list.
    """
    # Cases are checked in the same order as before: multi-entry first.
    cases = (
        [(10, 4), (100, 86)],
        [(1, 1)],
    )
    for freq_pairs in cases:
        assert isinstance(mod.get_median(freq_pairs), float)
def test_odd_number_of_values_with_three_occurances_each(self):
    """Six distinct values (1..6, unsorted), each with three occurrences.

    NOTE: despite the test name, the input holds an even number of distinct
    values; the two middle observations are 3 and 4, hence 3.5.
    """
    freq_pairs = [(1, 3), (2, 3), (6, 3), (5, 3), (4, 3), (3, 3)]
    result = mod.get_median(freq_pairs)
    assert result == 3.5
def test_odds_and_even_numbers(self):
    """A negative and a positive value of equal magnitude have median 0."""
    assert mod.get_median([(-1, 1), (1, 1)]) == 0
def test_floats(self):
    """The median of two float values is their midpoint.

    The result is compared with a tolerance instead of ``==`` so the test
    does not depend on the exact rounding of whatever float arithmetic
    ``get_median`` performs internally.
    """
    import math

    values = [(0.1, 1), (0.3, 1)]
    assert math.isclose(mod.get_median(values), 0.2)
def test_even_number_with_skew(self):
    """Twenty observations heavily weighted towards 1.

    The counts sum to 20, so the two middle observations (10th and 11th in
    sorted order) are 1 and 2, giving 1.5.
    """
    skewed_pairs = [(1, 10), (2, 4), (6, 3), (5, 2), (4, 1)]
    result = mod.get_median(skewed_pairs)
    assert result == 1.5
def test_odd_number_of_values_with_one_occurances_each(self):
    """Five distinct values (1..5, unsorted), one occurrence each: median 3."""
    freq_pairs = [(1, 1), (2, 1), (5, 1), (4, 1), (3, 1)]
    expected = 3
    assert mod.get_median(freq_pairs) == expected
def test_two_entries_with_one_occurence_each(self):
    """Two values seen once each: the median is their midpoint."""
    assert mod.get_median([(1, 1), (2, 1)]) == 1.5
def test_none_values(self):
    """A None entry next to a numeric one: the expected median is the
    numeric value alone (4).
    """
    pairs_with_none = [(None, 1), (4, 1)]
    assert mod.get_median(pairs_with_none) == 4
def analyze_fields(self,
                   field_number: Optional[int] = None,
                   field_types_overrides: Optional[Dict[int, str]] = None,
                   max_freq_number: Optional[int] = None,
                   read_limit: int = -1) -> None:
    """ Profiles the fields of the file and stores the results on the instance.

        Arguments:
           - field_number: zero-offset number of the only field to analyze;
             if None, every field is analyzed.
           - field_types_overrides: maps field numbers to type names.  After
             each analyzed field's type has been detected, every entry of
             this mapping is written into self.field_types, replacing any
             detected type for those fields.
           - max_freq_number: limits the size of the collected frequency
             distribution, allowing for faster analysis or analysis of very
             large high-cardinality fields.  If None, a default is used:
             MAX_FREQ_MULTI_COL_DEFAULT when all fields are analyzed,
             MAX_FREQ_SINGLE_COL_DEFAULT when a single field is analyzed.
           - read_limit: a performance setting that stops file reads after
             this number.  The default is -1 which means 'no limit'.
        Returns:
           - Nothing directly - populates the per-field instance variables
             (names, frequencies, types, min/max, string case and lengths,
             mean, median, variance and stddev).
    """
    self.max_freq_number = max_freq_number

    if self.verbose:
        print('Field Analysis Progress: ')

    # The frequency cap depends only on the arguments, never on the field
    # being processed, so it is chosen once before the loop.
    if max_freq_number is not None:
        max_items = max_freq_number
    elif field_number is None:
        max_items = MAX_FREQ_MULTI_COL_DEFAULT
    else:
        max_items = MAX_FREQ_SINGLE_COL_DEFAULT

    for f_no in range(self.field_cnt):
        # Optional analysis of a single field: skip all the others.
        if field_number is not None and f_no != field_number:
            continue

        if self.verbose:
            print(' Analyzing field: %d' % f_no)

        self.field_names[f_no] = miscer.get_field_name(self.filename,
                                                       self.dialect,
                                                       f_no)

        freq_result = miscer.get_field_freq(self.filename, self.dialect,
                                            f_no, max_items, read_limit)
        (self.field_freqs[f_no],
         self.field_trunc[f_no],
         self.field_rows_invalid[f_no]) = freq_result
        field_freqs = list(self.field_freqs[f_no].items())

        self.field_types[f_no] = typer.get_field_type(self.field_freqs[f_no])
        if field_types_overrides:
            for col_no, forced_type in field_types_overrides.items():
                self.field_types[col_no] = forced_type

        # Read the type back only now: an override may have replaced it.
        f_type = self.field_types[f_no]

        self.field_max[f_no] = miscer.get_max(f_type, field_freqs)
        self.field_min[f_no] = miscer.get_min(f_type, field_freqs)

        # Case and length characteristics are only collected for strings.
        is_string = f_type == 'string'
        self.field_case[f_no] = (miscer.get_case(f_type, field_freqs)
                                 if is_string else None)
        self.field_min_length[f_no] = (miscer.get_min_length(field_freqs)
                                       if is_string else None)
        self.field_max_length[f_no] = (miscer.get_max_length(field_freqs)
                                       if is_string else None)
        self.field_mean_length[f_no] = (mather.get_mean_length(field_freqs)
                                        if is_string else None)

        # Numeric statistics are only collected for integers and floats.
        if f_type not in ('integer', 'float'):
            self.field_mean[f_no] = None
            self.field_median[f_no] = None
            self.variance[f_no] = None
            self.stddev[f_no] = None
        else:
            self.field_mean[f_no] = mather.get_mean(field_freqs)
            self.field_median[f_no] = mather.get_median(field_freqs)
            spread = mather.get_variance_and_stddev(field_freqs,
                                                    self.field_mean[f_no])
            (self.variance[f_no], self.stddev[f_no]) = spread
def test_empty(self):
    """With no entries at all there is no median: None is expected."""
    assert mod.get_median([]) is None
def analyze_fields(self,
                   field_number: Optional[int] = None,
                   field_types_overrides: Optional[Dict[int, str]] = None,
                   max_freq_number: Optional[int] = None,
                   read_limit: int = -1) -> None:
    """ Profiles the fields of the file and stores the results on the instance.

        Arguments:
           - field_number: zero-offset number of the only field to analyze;
             if None, every field is analyzed.
           - field_types_overrides: maps field numbers to type names.  After
             each analyzed field's type has been detected, every entry of
             this mapping is written into self.field_types, replacing any
             detected type for those fields.
           - max_freq_number: limits the size of the collected frequency
             distribution, allowing for faster analysis or analysis of very
             large high-cardinality fields.  If None, a default is used:
             MAX_FREQ_MULTI_COL_DEFAULT when all fields are analyzed,
             MAX_FREQ_SINGLE_COL_DEFAULT when a single field is analyzed.
           - read_limit: a performance setting that stops file reads after
             this number.  The default is -1 which means 'no limit'.
        Returns:
           - Nothing directly - populates the per-field instance variables
             (names, frequencies, types, min/max, string case and lengths,
             mean, median, variance and stddev).
    """
    self.max_freq_number = max_freq_number

    if self.verbose:
        print('Field Analysis Progress: ')

    # The frequency cap depends only on the arguments, never on the field
    # being processed, so it is chosen once before the loop.
    if max_freq_number is not None:
        max_items = max_freq_number
    elif field_number is None:
        max_items = MAX_FREQ_MULTI_COL_DEFAULT
    else:
        max_items = MAX_FREQ_SINGLE_COL_DEFAULT

    for f_no in range(self.field_cnt):
        # Optional analysis of a single field: skip all the others.
        if field_number is not None and f_no != field_number:
            continue

        if self.verbose:
            print(' Analyzing field: %d' % f_no)

        self.field_names[f_no] = miscer.get_field_name(self.filename,
                                                       self.dialect,
                                                       f_no)

        freq_result = miscer.get_field_freq(self.filename, self.dialect,
                                            f_no, max_items, read_limit)
        (self.field_freqs[f_no],
         self.field_trunc[f_no],
         self.field_rows_invalid[f_no]) = freq_result
        field_freqs = list(self.field_freqs[f_no].items())

        self.field_types[f_no] = typer.get_field_type(self.field_freqs[f_no])
        if field_types_overrides:
            for col_no, forced_type in field_types_overrides.items():
                self.field_types[col_no] = forced_type

        # Read the type back only now: an override may have replaced it.
        f_type = self.field_types[f_no]

        self.field_max[f_no] = miscer.get_max(f_type, field_freqs)
        self.field_min[f_no] = miscer.get_min(f_type, field_freqs)

        # Case and length characteristics are only collected for strings.
        is_string = f_type == 'string'
        self.field_case[f_no] = (miscer.get_case(f_type, field_freqs)
                                 if is_string else None)
        self.field_min_length[f_no] = (miscer.get_min_length(field_freqs)
                                       if is_string else None)
        self.field_max_length[f_no] = (miscer.get_max_length(field_freqs)
                                       if is_string else None)
        self.field_mean_length[f_no] = (mather.get_mean_length(field_freqs)
                                        if is_string else None)

        # Numeric statistics are only collected for integers and floats.
        if f_type not in ('integer', 'float'):
            self.field_mean[f_no] = None
            self.field_median[f_no] = None
            self.variance[f_no] = None
            self.stddev[f_no] = None
        else:
            self.field_mean[f_no] = mather.get_mean(field_freqs)
            self.field_median[f_no] = mather.get_median(field_freqs)
            spread = mather.get_variance_and_stddev(field_freqs,
                                                    self.field_mean[f_no])
            (self.variance[f_no], self.stddev[f_no]) = spread