def detect(df, max_avg_length=30, columns_ignore=None):
    """Detect short text columns whose values mix numeric and alphabetic runs.

    For each candidate text column (semantic type http://schema.org/Text)
    not listed in ``columns_ignore``, compute the average length of its
    non-empty stringified values. Columns whose average length is below
    ``max_avg_length``, that are not purely numeric, and that contain
    num/alpha patterns are reported together with the number of parts
    they would split into.

    params:
    - df [DataFrame]: input dataframe to scan
    - max_avg_length [int]: maximum average cell length for a column to
      be considered for splitting
    - columns_ignore [List]: column indices to skip (default: none)

    Returns a dict with parallel lists:
    - "columns_to_perform": indices of columns to split
    - "split_to": number of parts each such column splits into
    """
    # Fix: mutable default argument (list()) replaced with None sentinel.
    if columns_ignore is None:
        columns_ignore = []
    positive_semantic_types = {"http://schema.org/Text"}
    cols_to_detect = HelperFunction.cols_to_clean(df, positive_semantic_types)
    require_checking = list(set(cols_to_detect).difference(columns_ignore))
    extends = {"columns_to_perform": [], "split_to": []}
    for one_column in require_checking:
        # Hoist the column selection: the original re-evaluated
        # df.iloc[:, one_column] four times per iteration.
        rows = df.iloc[:, one_column]
        lengths = [len(str(row)) for row in rows if len(str(row)) > 0]
        if not lengths:
            continue
        avg_len = sum(lengths) / len(lengths)
        if avg_len >= max_avg_length:
            continue
        # Purely numeric columns are not split; only mixed num/alpha ones.
        if NumAlphaParser.num_check(rows):
            continue
        if NumAlphaParser.is_num_alpha(rows):
            result = NumAlphaParser.num_alpha_splitter(rows)
            extends["columns_to_perform"].append(one_column)
            extends["split_to"].append(len(result))
    return extends
def detect(df, columns_ignore=None):
    """Detect text columns whose values look like phone numbers.

    Scans every candidate text column (semantic type
    http://schema.org/Text) not listed in ``columns_ignore`` and records
    the indices for which ``PhoneParser.is_phone`` reports a match.

    params:
    - df [DataFrame]: input dataframe to scan
    - columns_ignore [List]: column indices to skip (default: none)

    Returns a dict with the list "columns_to_perform" (indices of
    phone-like columns) and an always-empty "split_to" list, matching
    the shape returned by the other detectors in this module.
    """
    # Fix: mutable default argument (list()) replaced with None sentinel.
    if columns_ignore is None:
        columns_ignore = []
    positive_semantic_types = {"http://schema.org/Text"}
    cols_to_detect = HelperFunction.cols_to_clean(df, positive_semantic_types)
    require_checking = list(set(cols_to_detect).difference(columns_ignore))
    phone_columns = [
        one_column
        for one_column in require_checking
        if PhoneParser.is_phone(df.iloc[:, one_column])
    ]
    return {"columns_to_perform": phone_columns, "split_to": []}
def detect_date_columns(self, sampled_df, except_list=None):
    """Detect date columns in ``sampled_df``.

    A column is a date column when its semantic type is Time or Text and
    ``self._parse_column`` returns a non-None result for it.

    params:
    - sampled_df [DataFrame]: a sample of rows from the original
      dataframe for detecting dates
    - except_list [List]: list of column indices to be ignored
      (default: none)

    Returns a list of column indices that contain dates.
    """
    # Fix: mutable default argument (list()) replaced with None sentinel.
    if except_list is None:
        except_list = []
    positive_semantic_types = {
        "https://metadata.datadrivendiscovery.org/types/Time",
        "http://schema.org/Text",
    }
    cols_to_detect = HelperFunction.cols_to_clean(sampled_df,
                                                  positive_semantic_types)
    return [
        idx
        for idx in cols_to_detect
        if idx not in except_list
        and self._parse_column(sampled_df, idx) is not None
    ]