示例#1
0
    def _update_type(df, added_cols):
        """Infer and update column metadata (semantic/structural types) for
        the newly added columns of ``df``.

        Each column named in ``added_cols`` is coerced to numbers; if more
        than 90% of values fail to parse the column is tagged as categorical
        or free text, otherwise as Integer or Float depending on how many
        parsed values are whole numbers.  Every retyped column additionally
        receives the Attribute semantic type.

        params:
        - df [DataFrame]: d3m dataframe whose metadata is updated in place
        - added_cols [Iterable[str]]: names of the columns to (re)type

        returns ``df`` with refreshed metadata.
        """
        indices = [df.columns.get_loc(key) for key in added_cols]

        for idx in indices:
            old_metadata = dict(df.metadata.query((mbase.ALL_ELEMENTS, idx)))

            numerics = pd.to_numeric(df.iloc[:, idx], errors='coerce')
            length = numerics.shape[0]
            if length == 0:
                # empty column: nothing to infer, keep existing metadata
                continue
            nans = numerics.isnull().sum()

            if nans / length > 0.9:
                # mostly non-numeric: categorical if few distinct values,
                # otherwise plain text
                if HelperFunction.is_categorical(df.iloc[:, idx]):
                    old_metadata['semantic_types'] = (
                        "https://metadata.datadrivendiscovery.org/types/CategoricalData",)
                else:
                    old_metadata['semantic_types'] = ("http://schema.org/Text",)
                    # was type("type"); `str` is the same object, stated directly
                    old_metadata['structural_type'] = str
            else:
                intcheck = (numerics % 1) == 0
                if np.sum(intcheck) / length > 0.9:
                    old_metadata['semantic_types'] = ("http://schema.org/Integer",)
                    old_metadata['structural_type'] = int
                else:
                    old_metadata['semantic_types'] = ("http://schema.org/Float",)
                    old_metadata['structural_type'] = float

            old_metadata['semantic_types'] += ("https://metadata.datadrivendiscovery.org/types/Attribute",)

            df.metadata = df.metadata.update((mbase.ALL_ELEMENTS, idx), old_metadata)

        return df
    def update_types(self, col_name):
        """Re-infer the semantic types of column ``col_name`` on ``self.df``
        and write them back into the dataframe's metadata.

        Columns that are >90% non-numeric become categorical or text; numeric
        columns become Integer or Float depending on the share of whole
        numbers.  The Attribute semantic type is always appended.
        """
        col_idx = self.df.columns.get_loc(col_name)
        selector = (mbase.ALL_ELEMENTS, col_idx)
        metadata = dict(self.df.metadata.query(selector))

        as_numbers = pd.to_numeric(self.df[col_name], errors='coerce')
        total = as_numbers.shape[0]
        missing = as_numbers.isnull().sum()

        if missing / total > 0.9:
            # mostly unparseable as numbers -> categorical or free text
            if HelperFunction.is_categorical(self.df[col_name]):
                new_types = (
                    "https://metadata.datadrivendiscovery.org/types/CategoricalData",
                )
            else:
                new_types = ("http://schema.org/Text",)
        else:
            # whole numbers dominate -> Integer, otherwise Float
            whole = (as_numbers % 1) == 0
            if np.sum(whole) / total > 0.9:
                new_types = ("http://schema.org/Integer",)
            else:
                new_types = ("http://schema.org/Float",)

        metadata['semantic_types'] = new_types + (
            "https://metadata.datadrivendiscovery.org/types/Attribute",)

        self.df.metadata = self.df.metadata.update(selector, metadata)
示例#3
0
    def detect(df, max_avg_length=30, columns_ignore=None):
        """Find text columns whose cells look like mixed number/alpha values
        that can be split into multiple columns.

        params:
        - df [DataFrame]: dataframe to scan
        - max_avg_length [int]: only columns whose average (non-empty) cell
          length is strictly below this are considered
        - columns_ignore [List]: column indices to skip (default: none)

        returns a dict with "columns_to_perform" (column indices) and
        "split_to" (number of parts each such column splits into).
        """
        if columns_ignore is None:
            # avoid sharing a mutable default argument across calls
            columns_ignore = []
        positive_semantic_types = set(["http://schema.org/Text"])
        cols_to_detect = HelperFunction.cols_to_clean(df,
                                                      positive_semantic_types)
        require_checking = list(
            set(cols_to_detect).difference(set(columns_ignore)))
        extends = {"columns_to_perform": [], "split_to": []}
        for one_column in require_checking:
            column = df.iloc[:, one_column]
            # lengths of the non-empty cells, stringified
            lengths = [len(str(row)) for row in column if len(str(row)) > 0]
            if not lengths:
                continue
            if sum(lengths) / len(lengths) >= max_avg_length:
                # average cell too long to be a splittable code-like value
                continue
            if NumAlphaParser.num_check(column):
                # purely numeric columns are not split
                continue
            if NumAlphaParser.is_num_alpha(column):
                result = NumAlphaParser.num_alpha_splitter(column)
                extends["columns_to_perform"].append(one_column)
                extends["split_to"].append(len(result))

        return extends
示例#4
0
    def _relabel_categorical(inputs: Input) -> Output:
        """Drop the CategoricalData semantic type from columns that no longer
        look categorical and re-tag them as Text, Integer or Float based on
        how their values coerce to numbers.

        params:
        - inputs [Input]: d3m dataframe whose column metadata is rewritten

        returns the same dataframe with updated metadata.
        """
        for col in range(inputs.shape[1]):
            old_metadata = dict(inputs.metadata.query((mbase.ALL_ELEMENTS, col)))
            semantic_type = old_metadata.get('semantic_types', [])

            if 'https://metadata.datadrivendiscovery.org/types/CategoricalData' in semantic_type:
                if not HelperFunction.is_categorical(inputs.iloc[:, col]):
                    # remove the stale CategoricalData tag before re-typing
                    old_metadata['semantic_types'] = tuple(i for i in old_metadata['semantic_types'] if
                                                           i != 'https://metadata.datadrivendiscovery.org/types/CategoricalData')

                    numerics = pd.to_numeric(inputs.iloc[:, col], errors='coerce')
                    length = numerics.shape[0]
                    nans = numerics.isnull().sum()

                    if nans / length > 0.9:
                        # mostly non-numeric -> plain text
                        if "http://schema.org/Text" not in old_metadata['semantic_types']:
                            old_metadata['semantic_types'] += ("http://schema.org/Text",)

                    else:
                        # numeric: whole numbers dominate -> Integer, else Float
                        intcheck = (numerics % 1) == 0
                        if np.sum(intcheck) / length > 0.9:
                            if "http://schema.org/Integer" not in old_metadata['semantic_types']:
                                old_metadata['semantic_types'] += ("http://schema.org/Integer",)
                                # old_metadata['structural_type'] = type(10)
                                # inputs.iloc[:, col] = numerics
                        else:
                            if "http://schema.org/Float" not in old_metadata['semantic_types']:
                                old_metadata['semantic_types'] += ("http://schema.org/Float",)
                                # old_metadata['structural_type'] = type(10.2)
                                # inputs.iloc[:, col] = numerics

            # metadata is written back for every column, changed or not
            inputs.metadata = inputs.metadata.update((mbase.ALL_ELEMENTS, col), old_metadata)

        return inputs
示例#5
0
    def detect(df, columns_ignore=None):
        """Return the columns of ``df`` whose values look like phone numbers.

        params:
        - df [DataFrame]: dataframe to scan
        - columns_ignore [List]: column indices to skip (default: none)

        returns a dict with "columns_to_perform" (indices of phone-like
        columns) and an always-empty "split_to" list.
        """
        if columns_ignore is None:
            # avoid sharing a mutable default argument across calls
            columns_ignore = []
        positive_semantic_types = set(["http://schema.org/Text"])

        cols_to_detect = HelperFunction.cols_to_clean(df,
                                                      positive_semantic_types)
        require_checking = \
            list(set(cols_to_detect).difference(set(columns_ignore)))
        extends = {"columns_to_perform": [], "split_to": []}
        for one_column in require_checking:
            if PhoneParser.is_phone(df.iloc[:, one_column]):
                extends["columns_to_perform"].append(one_column)
        return extends
    def _update_structural_type(self):
        """Coerce columns of ``self._input_data_copy`` to numeric values and
        fill in structural/semantic type metadata, column by column.

        Columns without semantic types get them inferred from a numeric
        coercion heuristic; columns already tagged Integer/Float are converted
        in place and get the matching structural type.
        """
        for col in range(self._input_data_copy.shape[1]):
            old_metadata = dict(
                self._input_data_copy.metadata.query(
                    (mbase.ALL_ELEMENTS, col)))
            semantic_type = old_metadata.get('semantic_types', None)
            if not semantic_type:
                # no semantic types yet: infer them from the data
                numerics = pd.to_numeric(self._input_data_copy.iloc[:, col],
                                         errors='coerce')
                length = numerics.shape[0]
                nans = numerics.isnull().sum()

                if nans / length > 0.9:
                    # mostly non-numeric -> categorical or free text
                    if HelperFunction.is_categorical(
                            self._input_data_copy.iloc[:, col]):
                        old_metadata['semantic_types'] = (
                            "https://metadata.datadrivendiscovery.org/types/CategoricalData",
                        )
                    else:
                        old_metadata['semantic_types'] = (
                            "http://schema.org/Text", )
                else:
                    # numeric: whole numbers dominate -> Integer, else Float;
                    # the column data itself is replaced by the coerced values
                    intcheck = (numerics % 1) == 0
                    if np.sum(intcheck) / length > 0.9:
                        old_metadata['semantic_types'] = (
                            "http://schema.org/Integer", )
                        old_metadata['structural_type'] = type(10)
                        self._input_data_copy.iloc[:, col] = numerics
                    else:
                        old_metadata['semantic_types'] = (
                            "http://schema.org/Float", )
                        old_metadata['structural_type'] = type(10.2)
                        self._input_data_copy.iloc[:, col] = numerics

                old_metadata['semantic_types'] += (
                    "https://metadata.datadrivendiscovery.org/types/Attribute",
                )

            else:
                # semantic types already present: only align the structural
                # type and coerce the column values in place
                if "http://schema.org/Integer" in semantic_type:
                    self._input_data_copy.iloc[:, col] = pd.to_numeric(
                        self._input_data_copy.iloc[:, col], errors='coerce')
                    old_metadata['structural_type'] = type(10)
                elif "http://schema.org/Float" in semantic_type:
                    self._input_data_copy.iloc[:, col] = pd.to_numeric(
                        self._input_data_copy.iloc[:, col], errors='coerce')
                    old_metadata['structural_type'] = type(10.2)

            self._input_data_copy.metadata = self._input_data_copy.metadata.update(
                (mbase.ALL_ELEMENTS, col), old_metadata)
def compute_lang(column, feature):
    """
    compute which language(s) it use for a given series (column); store the result into (feature).
    not apply for numbers

    PROBLEMS:
    1. not accurate when string contains digits
    2. not accurate when string is too short
    maybe need to consider the special cases for the above conditions
    """
    column = column.dropna()  # ignore all missing value
    if column.size == 0:  # if the column is empty, do nothing
        return

    feature["natural_language_of_feature"] = list()
    language_count = {}

    for cell in column:
        # skip plain numbers: language detection does not apply to them
        if cell.isdigit() or HelperFunction.is_Decimal_Number(cell):
            continue
        # detecting language; keep the try body minimal
        try:
            language = detect(cell)
        except Exception:
            # detection can fail on short/odd strings; report and move on
            print(
                "there is something may not be any language nor number: {}"
                .format(cell))
            continue
        language_count[language] = language_count.get(language, 0) + 1

    # record languages ordered by frequency, most common first
    for lang in sorted(language_count, key=language_count.get, reverse=True):
        feature["natural_language_of_feature"].append({
            'name': lang,
            'count': language_count[lang],
        })
示例#8
0
def update_type(extends, df_origin):
    """Append the columns in ``extends`` to ``df_origin`` and infer semantic
    type metadata for each newly added column.

    params:
    - extends [dict]: mapping of new column name -> column values
    - df_origin [d3m DataFrame]: dataframe the new columns are appended to

    returns a new d3m DataFrame with the extra columns and updated metadata.
    """
    extends_df = pd.DataFrame.from_dict(extends)
    extends_df = d3m_DataFrame(extends_df, generate_metadata=True)
    if extends:
        # align the new columns with the original rows
        extends_df.index = df_origin.index.copy()

    new_df = d3m_DataFrame.append_columns(df_origin, extends_df)

    indices = [new_df.columns.get_loc(key) for key in extends]

    for idx in indices:
        old_metadata = dict(new_df.metadata.query((mbase.ALL_ELEMENTS, idx)))

        numerics = pd.to_numeric(new_df.iloc[:, idx], errors='coerce')
        length = numerics.shape[0]
        if length == 0:
            # no rows: nothing to infer for this column
            continue
        nans = numerics.isnull().sum()

        if nans / length > 0.9:
            # mostly non-numeric: categorical if few distinct values, else text
            if HelperFunction.is_categorical(new_df.iloc[:, idx]):
                old_metadata['semantic_types'] = (
                    "https://metadata.datadrivendiscovery.org/types/CategoricalData",
                )
            else:
                old_metadata['semantic_types'] = ("http://schema.org/Text", )
        else:
            # numeric: whole numbers dominate -> Integer, otherwise Float
            intcheck = (numerics % 1) == 0
            if np.sum(intcheck) / length > 0.9:
                old_metadata['semantic_types'] = (
                    "http://schema.org/Integer", )
            else:
                old_metadata['semantic_types'] = ("http://schema.org/Float", )

        old_metadata['semantic_types'] += (
            "https://metadata.datadrivendiscovery.org/types/Attribute", )

        new_df.metadata = new_df.metadata.update((mbase.ALL_ELEMENTS, idx),
                                                 old_metadata)

    return new_df
    def detect_date_columns(self, sampled_df, except_list=None):
        """
        Detects date columns in the sampled_df and returns a list of column indices which have dates

        params:
        - sampled_df [DataFrame]: a sample of rows from the original dataframe for detecting dates
        - except_list [List]: list of column indices to be ignored
        """
        if except_list is None:
            # avoid sharing a mutable default argument across calls
            except_list = []
        positive_semantic_types = set([
            "https://metadata.datadrivendiscovery.org/types/Time",
            "http://schema.org/Text"
        ])
        cols_to_detect = HelperFunction.cols_to_clean(sampled_df,
                                                      positive_semantic_types)

        date_cols = []
        for idx in cols_to_detect:
            # a non-None parse result means the column contains dates
            if idx not in except_list and \
                    self._parse_column(sampled_df, idx) is not None:
                date_cols.append(idx)
        return date_cols
def compute_punctuation(column, feature, weight_outlier):
    """
    compute the statistical values related to punctuations, for details, see the format section of README.

    not apply for numbers (eg: for number 1.23, "." does not count as a punctuation)

    weight_outlier: = number_of_sigma in function "helper_outlier_calcu"
    """
    column = column.dropna()  # get rid of all missing value
    if column.size == 0:  # if the column is empty, do nothing
        return

    number_of_chars = sum(column.apply(len))  # number of all chars in column
    num_chars_cell = np.zeros(column.size)  # number of chars for each cell
    # (number_of_cell * number_of_puncs) sized array of punctuation counts
    puncs_cell = np.zeros([column.size, len(string.punctuation)], dtype=int)

    # step 1: pre-calculations
    for cell_id, cell in enumerate(column):
        num_chars_cell[cell_id] = len(cell)
        # only counts puncs for non-number cell
        if cell.isdigit() or HelperFunction.is_Decimal_Number(cell):
            continue
        puncs_cell[cell_id] = np.asarray(
            [cell.count(c) for c in string.punctuation])

    # occurrences of each punctuation across the whole column
    counts_column_punc = puncs_cell.sum(axis=0)
    cell_density_array = puncs_cell / num_chars_cell.reshape([column.size, 1])
    puncs_density_average = cell_density_array.sum(axis=0) / column.size

    # step 2: only create this feature when punctuations exist
    # bug fix: previously the sort in step 3 ran unconditionally and raised
    # KeyError when no punctuation occurred (the key was never created)
    if sum(counts_column_punc) == 0:
        return

    feature["most_common_punctuations"] = list()  # list of dict

    # extract the counts to feature, for each punctuation
    for i, punc in enumerate(string.punctuation):
        if counts_column_punc[i] == 0:
            # this punctuation never occurs in the whole column, ignore
            continue
        punc_obj = {
            "punctuation": punc,
            "count": counts_column_punc[i],
            "ratio": counts_column_punc[i] / float(number_of_chars),
            "punctuation_density_aggregate": {
                "mean": puncs_density_average[i]
            },
        }
        # calculate outlier; only one element in the outliers list
        outlier_array = helper_outlier_calcu(cell_density_array[:, i],
                                             weight_outlier)
        punc_obj["punctuation_density_outliers"] = [{
            "n": weight_outlier,
            "count": sum(outlier_array)
        }]
        feature["most_common_punctuations"].append(punc_obj)

    # step 3: sort by frequency, most common first
    feature["most_common_punctuations"].sort(key=lambda k: k['count'],
                                             reverse=True)