def test_language_analysis():
    """Sanity-check sentence analysis and language detection on three languages.

    Builds ``num_sents`` random sentences per language from a fixed word pool
    and asserts that:
      * ``analyze_sentences`` counts every word exactly once;
      * ``get_language_dist`` attributes every sentence to its true language
        and reports zero 'Unknown' sentences.
    """
    from langdetect import DetectorFactory
    # langdetect is non-deterministic by default; pin the seed so the
    # detected language is stable across runs.
    DetectorFactory.seed = 0

    WORDS = {
        'en': ['because', 'tree', 'merge', 'work', 'interpret', 'call', 'think'],
        'ru': ['только', 'говорить', 'когда', 'человек', 'быть', 'первый', 'осень'],
        'de': ['führen', 'stelle', 'heißen', 'konnten', 'schlimm', 'mögen', 'nähe'],
    }

    sent_size = 7
    num_sents = 10
    for lang, words in WORDS.items():
        sentences = [random.sample(words, sent_size) for _ in range(num_sents)]

        nr_words, word_dist, nr_words_dist = analyze_sentences(
            ' '.join(sent) for sent in sentences)
        # Every sentence contributes exactly `sent_size` words.
        # (Original built a throwaway list via `len(sentences * sent_size)`;
        # the product of the lengths is the same value without the allocation.)
        assert nr_words == len(sentences) * sent_size

        lang_dist = get_language_dist(' '.join(sent) for sent in sentences)
        assert lang_dist[lang] == len(sentences)
        assert 'Unknown' in lang_dist and lang_dist['Unknown'] == 0
def get_column_data_type(self, data, full_data, col_name):
    """
    Provided the column data, define its data type and data subtype.

    :param data: an iterable containing a sample of the data frame
    :param full_data: an iterable containing the whole column of a data frame
    :param col_name: name of the column being analyzed

    :return: tuple of (type, subtype, type_distribution, subtype_distribution,
        additional_info); we can later use type_distribution to determine
        data quality.

    NOTE: type distribution is the count that this column has for belonging
    cells to each DATA_TYPE
    """
    additional_info = {
        'other_potential_subtypes': [],
        'other_potential_types': []
    }

    # Empty column: nothing to detect.
    if len(data) == 0:
        self.log.warning(
            f'Column {col_name} has no data in it. '
            f'Please remove {col_name} from the training file or fill in some of the values !'
        )
        return None, None, None, None, additional_info

    type_dist, subtype_dist = {}, {}

    # User-provided dtype takes precedence over any detection.
    if col_name in self.transaction.lmd['data_subtypes']:
        curr_data_type = self.transaction.lmd['data_types'][col_name]
        curr_data_subtype = self.transaction.lmd['data_subtypes'][col_name]
        type_dist[curr_data_type] = len(data)
        subtype_dist[curr_data_subtype] = len(data)
        self.log.info(
            f'Manually setting the types for column {col_name} to {curr_data_type}->{curr_data_subtype}'
        )
        return curr_data_type, curr_data_subtype, type_dist, subtype_dist, additional_info

    # Forced categorical dtype.
    if col_name in self.transaction.lmd['force_categorical_encoding']:
        curr_data_type = DATA_TYPES.CATEGORICAL
        curr_data_subtype = DATA_SUBTYPES.MULTIPLE
        type_dist[DATA_TYPES.CATEGORICAL] = len(data)
        subtype_dist[DATA_SUBTYPES.MULTIPLE] = len(data)
        return curr_data_type, curr_data_subtype, type_dist, subtype_dist, additional_info

    type_dist, subtype_dist, new_additional_info = self.count_data_types_in_column(data)
    if new_additional_info:
        additional_info.update(new_additional_info)

    # @TODO consider removing or flagging rows where data type is unknown in the future, might just be corrupt data...
    known_type_dist = {
        k: v
        for k, v in type_dist.items()
        if k != 'Unknown'
    }

    if known_type_dist:
        # BUGFIX: select the type with the highest *count* (kv[1]).
        # The original keyed on kv[0], i.e. compared the type-name strings,
        # so the "most frequent" type was just the alphabetically-last one.
        max_known_dtype, max_known_dtype_count = max(
            known_type_dist.items(), key=lambda kv: kv[1])
    else:
        max_known_dtype, max_known_dtype_count = None, None

    nr_vals = len(full_data)
    nr_distinct_vals = len(set(full_data))

    # Data is mostly not unknown, go with type counting results
    if max_known_dtype and max_known_dtype_count > type_dist['Unknown']:
        curr_data_type = max_known_dtype
        possible_subtype_counts = [
            (k, v) for k, v in subtype_dist.items()
            if k in DATA_TYPES_SUBTYPES[curr_data_type]
        ]
        # BUGFIX: likewise, pick the most frequent subtype by count
        # (pair[1]), not by subtype-name string (pair[0]).
        curr_data_subtype, _ = max(possible_subtype_counts,
                                   key=lambda pair: pair[1])
    else:
        curr_data_type, curr_data_subtype = None, None

    # Check for Tags subtype: string cells that split into several shared
    # tokens on the configured delimiter.
    if curr_data_subtype != DATA_SUBTYPES.ARRAY:
        lengths = []
        unique_tokens = set()

        can_be_tags = False
        if all(isinstance(x, str) for x in data):
            can_be_tags = True
            delimiter = self.transaction.lmd.get('tags_delimiter', ',')
            for item in data:
                item_tags = [t.strip() for t in item.split(delimiter)]
                lengths.append(len(item_tags))
                unique_tokens = unique_tokens.union(set(item_tags))

        # If more than 30% of the samples contain more than 1 category and there's more than 6 of them and they are shared between the various cells
        if can_be_tags and np.mean(lengths) > 1.3 and len(
                unique_tokens
        ) >= 6 and len(unique_tokens) / np.mean(lengths) < (len(data) / 4):
            curr_data_type = DATA_TYPES.CATEGORICAL
            curr_data_subtype = DATA_SUBTYPES.TAGS

    # Categorical based on unique values: few distinct values relative to
    # the column size indicates a categorical column.
    if curr_data_type != DATA_TYPES.DATE and curr_data_subtype != DATA_SUBTYPES.TAGS:
        if nr_distinct_vals < (nr_vals / 20) or nr_distinct_vals < 6:
            if (curr_data_type != DATA_TYPES.NUMERIC) or (nr_distinct_vals < 20):
                if curr_data_type is not None:
                    # Remember what we would have detected before overriding.
                    additional_info['other_potential_types'].append(curr_data_type)
                    additional_info['other_potential_subtypes'].append(curr_data_subtype)
                curr_data_type = DATA_TYPES.CATEGORICAL

    # If curr_data_type is still None, then it's text or category
    if curr_data_type is None:
        lang_dist = get_language_dist(data)

        # Normalize lang probabilities
        for lang in lang_dist:
            lang_dist[lang] /= len(data)

        # If most cells are unknown language then it's categorical
        if lang_dist['Unknown'] > 0.5:
            curr_data_type = DATA_TYPES.CATEGORICAL
        else:
            nr_words, word_dist, nr_words_dist = analyze_sentences(data)

            if 1 in nr_words_dist and nr_words_dist[1] == nr_words:
                # Every "sentence" is a single word: treat as categorical.
                curr_data_type = DATA_TYPES.CATEGORICAL
            else:
                curr_data_type = DATA_TYPES.TEXT

                if len(word_dist) > 500 and nr_words / len(data) > 5:
                    curr_data_subtype = DATA_SUBTYPES.RICH
                else:
                    curr_data_subtype = DATA_SUBTYPES.SHORT

                type_dist = {curr_data_type: len(data)}
                subtype_dist = {curr_data_subtype: len(data)}

                return curr_data_type, curr_data_subtype, type_dist, subtype_dist, additional_info

    if curr_data_type == DATA_TYPES.CATEGORICAL and curr_data_subtype != DATA_SUBTYPES.TAGS:
        if nr_distinct_vals > 2:
            curr_data_subtype = DATA_SUBTYPES.MULTIPLE
        else:
            curr_data_subtype = DATA_SUBTYPES.SINGLE

    if curr_data_type in [DATA_TYPES.CATEGORICAL, DATA_TYPES.TEXT]:
        type_dist = {curr_data_type: len(data)}
        subtype_dist = {curr_data_subtype: len(data)}

    return curr_data_type, curr_data_subtype, type_dist, subtype_dist, additional_info
def run(self, input_data):
    """
    Compute per-column statistics over a (possibly sampled) data frame and
    store them in ``self.transaction.lmd['stats_v2']``.

    Per column this fills in: emptiness report, histogram and percentage
    buckets, entropy-based bias report, numeric outliers, text language /
    word distributions, tag histograms and guess probabilities for
    categorical columns, plus a warning count.

    :param input_data: object exposing ``data_frame`` (a pandas DataFrame)
        and ``sample_df(...)``.
    """
    stats_v2 = self.transaction.lmd['stats_v2']
    sample_settings = self.transaction.lmd['sample_settings']

    population_size = len(input_data.data_frame)

    # Optionally analyze only a statistically significant sample.
    if sample_settings['sample_for_analysis']:
        sample_margin_of_error = sample_settings['sample_margin_of_error']
        sample_confidence_level = sample_settings['sample_confidence_level']
        sample_percentage = sample_settings['sample_percentage']
        sample_function = self.transaction.hmd['sample_function']

        sample_df = input_data.sample_df(sample_function,
                                         sample_margin_of_error,
                                         sample_confidence_level,
                                         sample_percentage)
        sample_size = len(sample_df)
    else:
        sample_size = population_size
        sample_df = input_data.data_frame

    self.transaction.log.info(f'Analyzing a sample of {sample_size} '
                              f'from a total population of {population_size}, '
                              f'this is equivalent to {round(sample_size * 100 / population_size, 1)}% of your data.')

    # Columns with no data at all only get an emptiness marker.
    for col_name in self.transaction.lmd['empty_columns']:
        stats_v2[col_name] = {}
        stats_v2[col_name]['empty'] = {'is_empty': True}
        self.log.warning(f'Column {col_name} is empty.')

    for col_name in sample_df.columns.values:
        self.log.info(f'Analyzing column: {col_name} !')
        data_type = stats_v2[col_name]['typing']['data_type']
        data_subtype = stats_v2[col_name]['typing']['data_subtype']

        col_data = sample_df[col_name].dropna()
        if data_type == DATA_TYPES.NUMERIC or data_subtype == DATA_SUBTYPES.TIMESTAMP:
            col_data = clean_int_and_date_data(col_data, self.log)

        # Emptiness is computed over the full column, not just the sample.
        stats_v2[col_name]['empty'] = get_column_empty_values_report(input_data.data_frame[col_name])

        if data_type == DATA_TYPES.CATEGORICAL:
            # Categorical histograms use the full column so every category shows up.
            hist_data = input_data.data_frame[col_name]
            stats_v2[col_name]['unique'] = get_uniq_values_report(input_data.data_frame[col_name])
        else:
            hist_data = col_data

        histogram, percentage_buckets = get_histogram(hist_data,
                                                      data_type=data_type,
                                                      data_subtype=data_subtype)
        stats_v2[col_name]['histogram'] = histogram
        stats_v2[col_name]['percentage_buckets'] = percentage_buckets

        if histogram:
            S, biased_buckets = compute_entropy_biased_buckets(histogram['y'], histogram['x'])
            stats_v2[col_name]['bias'] = {
                'entropy': S,
                'description': """Under the assumption of uniformly distributed data (i.e., same probability for Head or Tails on a coin flip) mindsdb tries to detect potential divergences from such case, and it calls this "potential bias". Thus by our data having any potential bias mindsdb means any divergence from all categories having the same probability of being selected."""
            }
            if biased_buckets:
                stats_v2[col_name]['bias']['biased_buckets'] = biased_buckets
            # Low entropy -> distribution is far from uniform; warn the user.
            if S < 0.8:
                if data_type == DATA_TYPES.CATEGORICAL:
                    # BUGFIX: the original message was ungrammatical
                    # ("You may to check ... too often to too little in this columns.")
                    warning_str = "You may want to check if some categories occur too often or too little in this column."
                else:
                    warning_str = "You may want to check if you see something suspicious on the right-hand-side graph."
                stats_v2[col_name]['bias']['warning'] = warning_str + " This doesn't necessarily mean there's an issue with your data, it just indicates a higher than usual probability there might be some issue."

        if data_type == DATA_TYPES.NUMERIC:
            outliers = lof_outliers(data_subtype, col_data)
            stats_v2[col_name]['outliers'] = {
                'outlier_values': outliers,
                'outlier_buckets': compute_outlier_buckets(
                    outlier_values=outliers,
                    hist_x=histogram['x'],
                    hist_y=histogram['y'],
                    percentage_buckets=percentage_buckets,
                    col_stats=stats_v2[col_name]
                ),
                'description': """Potential outliers can be thought as the "extremes", i.e., data points that are far from the center of mass (mean/median/interquartile range) of the data."""
            }

        if data_type == DATA_TYPES.TEXT:
            lang_dist = get_language_dist(col_data)
            nr_words, word_dist, nr_words_dist = analyze_sentences(col_data)

            stats_v2[col_name]['avg_words_per_sentence'] = nr_words / len(col_data)
            stats_v2[col_name]['word_dist'] = shrink_word_dist(word_dist)
            stats_v2[col_name]['nr_words_dist'] = nr_words_dist
            stats_v2[col_name]['lang_dist'] = lang_dist

        # Count every sub-report that carries a 'warning' key.
        stats_v2[col_name]['nr_warnings'] = 0
        for x in stats_v2[col_name].values():
            if isinstance(x, dict) and 'warning' in x:
                self.log.warning(x['warning'])
                stats_v2[col_name]['nr_warnings'] += 1
        self.log.info(f'Finished analyzing column: {col_name} !\n')

        if data_type == DATA_TYPES.CATEGORICAL:
            if data_subtype == DATA_SUBTYPES.TAGS:
                delimiter = self.transaction.lmd.get('tags_delimiter', ',')
                stats_v2[col_name]['tag_hist'] = Counter()
                for item in col_data:
                    arr = [x.strip() for x in item.split(delimiter)]
                    stats_v2[col_name]['tag_hist'].update(arr)
                # Probability of guessing a tag right at random:
                # sum of squared per-tag frequencies.
                stats_v2[col_name]['guess_probability'] = np.mean([(v / len(col_data))**2 for v in stats_v2[col_name]['tag_hist'].values()])
                stats_v2[col_name]['balanced_guess_probability'] = 0.5
            else:
                # NOTE(review): histogram counts come from the full column while
                # the denominator is the sample's non-null count — confirm this
                # mismatch is intended.
                stats_v2[col_name]['guess_probability'] = sum((k / len(col_data))**2 for k in histogram['y'])
                stats_v2[col_name]['balanced_guess_probability'] = 1 / len(histogram['y'])

    self.transaction.lmd['data_preparation']['accepted_margin_of_error'] = self.transaction.lmd['sample_settings']['sample_margin_of_error']
    self.transaction.lmd['data_preparation']['total_row_count'] = len(input_data.data_frame)
    self.transaction.lmd['data_preparation']['used_row_count'] = len(sample_df)