Example #1
def test_dataframe():
    d2 = DirectAccessV2(
        api_key=DIRECTACCESS_API_KEY,
        client_id=DIRECTACCESS_CLIENT_ID,
        client_secret=DIRECTACCESS_CLIENT_SECRET,
        access_token=DIRECTACCESS_TOKEN,
    )
    df = d2.to_dataframe("rigs", pagesize=10000, deleteddate="null")

    # Check index is set to API endpoint "primary key"
    assert df.index.name == "RigID"

    # Check datetime64 dtypes
    assert is_datetime64_ns_dtype(df.CreatedDate)
    assert is_datetime64_ns_dtype(df.DeletedDate)
    assert is_datetime64_ns_dtype(df.SpudDate)
    assert is_datetime64_ns_dtype(df.UpdatedDate)

    # Check Int64 dtypes
    assert is_int64_dtype(df.PermitDepth)
    assert is_int64_dtype(df.FormationDepth)

    # Check float dtypes
    assert is_float_dtype(df.RigLatitudeWGS84)
    assert is_float_dtype(df.RigLongitudeWGS84)

    return
Example #2
def apply_to_one(dataframe, columns):
    # Log-transform (log1p) the requested columns in place, but only those with int64 dtype.
    for column in columns:
        if types.is_int64_dtype(dataframe[column]):
            c_min = dataframe[column].min()
            c_max = dataframe[column].max()
            dataframe[column] = np.log1p(dataframe[column])

    return dataframe
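A minimal usage sketch for the helper above (the toy DataFrame and column names are made up; pandas.api.types is assumed to be imported as `types`, as in the snippet): only the int64 column is log-transformed, the float column is left untouched.

import numpy as np
import pandas as pd
from pandas.api import types

df = pd.DataFrame({"count": [0, 9, 99], "ratio": [0.1, 0.2, 0.3]})
df = apply_to_one(df, ["count", "ratio"])
# df["count"] is now log1p-transformed (float64); df["ratio"] is unchanged because it was not int64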
Example #3
    def test_search(self, mp_wfs, mp_remote_describefeaturetype, mp_remote_md,
                    mp_remote_fc, mp_remote_wfs_feature, mp_dov_xml):
        """Test the search method with only the query parameter.

        Test whether the result is correct.

        Parameters
        ----------
        mp_wfs : pytest.fixture
            Monkeypatch the call to the remote GetCapabilities request.
        mp_remote_describefeaturetype : pytest.fixture
            Monkeypatch the call to a remote DescribeFeatureType.
        mp_remote_md : pytest.fixture
            Monkeypatch the call to get the remote metadata.
        mp_remote_fc : pytest.fixture
            Monkeypatch the call to get the remote feature catalogue.
        mp_remote_wfs_feature : pytest.fixture
            Monkeypatch the call to get WFS features.
        mp_dov_xml : pytest.fixture
            Monkeypatch the call to get the remote XML data.

        """
        df = self.get_search_object().search(
            query=self.get_valid_query_single())

        assert type(df) is DataFrame

        assert list(df) == self.get_df_default_columns()

        datatype = self.get_type()
        allfields = datatype.get_field_names()
        ownfields = datatype.get_field_names(include_subtypes=False)
        subfields = [f for f in allfields if f not in ownfields]

        assert len(df) >= 1

        for field in list(df):
            if field in ownfields:
                assert len(df[field].unique()) == 1
            elif field in subfields:
                assert len(df[field].unique()) >= 1

        # dtype checks of the resulting df columns
        fields = self.get_type().get_fields(source=('wfs', 'xml', 'custom'))

        for field in list(df):
            datatype = fields[field]['type']
            if datatype == 'string':
                assert (is_object_dtype(df[field])
                        or df[field].isnull().values.all())  # all NaN/None
            elif datatype == 'float':
                assert is_float_dtype(df[field])
            elif datatype == 'integer':
                assert is_int64_dtype(df[field])
            elif datatype == 'date':
                assert is_object_dtype(df[field])
            elif datatype == 'boolean':
                assert is_bool_dtype(df[field])
Example #4
def revisar_resultados(df_reto_3):

    import pandas as pd
    import pandas.api.types as pdtypes

    assert pdtypes.is_int64_dtype(df_reto_3['is_potentially_hazardous_asteroid']), 'The "is_potentially_hazardous_asteroid" column has not been converted to a numeric type'
    assert len(df_reto_3['is_potentially_hazardous_asteroid'].unique()) == 2, 'There was an error mapping the boolean values to numbers: the resulting column has more than two possible values'
    assert df_reto_3['relative_velocity.kilometers_per_minute'].equals(df_reto_3['relative_velocity.kilometers_per_hour'] / 60), 'The conversion from kilometres per hour to kilometres per minute was not done correctly'

    print('All checks passed successfully!')
Example #5
    def test_dataframe_v3(self):
        df = self.v3.to_dataframe("rigs", pagesize=1000, deleteddate="null")

        # Check index is set to API endpoint "primary keys"
        self.assertListEqual(df.index.names, ["CompletionID", "WellID"])

        # Check object dtypes
        self.assertTrue(is_object_dtype(df.API_UWI))
        self.assertTrue(is_object_dtype(df.ActiveStatus))

        # Check datetime64 dtypes
        self.assertTrue(is_datetime64_ns_dtype(df.DeletedDate))
        self.assertTrue(is_datetime64_ns_dtype(df.SpudDate))
        self.assertTrue(is_datetime64_ns_dtype(df.UpdatedDate))

        # Check Int64 dtypes
        self.assertTrue(is_int64_dtype(df.RatedWaterDepth))
        self.assertTrue(is_int64_dtype(df.RatedHP))

        # Check float dtypes
        self.assertTrue(is_float_dtype(df.RigLatitudeWGS84))
        self.assertTrue(is_float_dtype(df.RigLongitudeWGS84))
Example #6
 def test_data_from_zip(self):
     with open('tests/pc6hnr.csv', 'rb') as f:
         res = populate.data_from_zip(f.read())
         # must return a DF
         self.assertTrue(isinstance(res, pd.DataFrame))
         # pc6 must be unique
         self.assertTrue(res['pc6'].is_unique)
         # must contain the following columns
         self.assertListEqual(['pc6', 'buurt', 'wijk', 'gemeente'],
                              list(res.columns))
         # must have the correct data types
         self.assertTrue(ptypes.is_string_dtype(res['pc6']))
         self.assertTrue(
             all(
                 ptypes.is_int64_dtype(res[col])
                 for col in ['buurt', 'wijk', 'gemeente']))
Example #7
def create_dtypes(df):
    dtypes = {}

    for key, value in df.dtypes.items():
        if is_string_dtype(value):
            dtypes.update({'{}'.format(key): String})

        elif is_int64_dtype(value):
            dtypes.update({'{}'.format(key): Integer})

        elif is_float_dtype(value):
            dtypes.update({'{}'.format(key): Float})

        elif is_object_dtype(value):
            dtypes.update({'{}'.format(key): Float})

    return dtypes
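String, Integer and Float above are presumably the SQLAlchemy column types; a hedged sketch of how such a mapping is typically handed to DataFrame.to_sql, assuming the helper above is in scope together with these imports (the in-memory engine and the 'items' table name are placeholders):

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.types import Float, Integer, String
from pandas.api.types import (
    is_float_dtype, is_int64_dtype, is_object_dtype, is_string_dtype,
)

df = pd.DataFrame({"name": ["a", "b"], "qty": [1, 2], "price": [1.5, 2.5]})
dtypes = create_dtypes(df)                    # column name -> SQLAlchemy type
engine = create_engine("sqlite:///:memory:")  # placeholder engine
df.to_sql("items", engine, dtype=dtypes, index=False)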
Example #8
 def factorize(self, na_sentinel=-1):
     # type: (int) -> Tuple[np.ndarray, ExtensionArray]
     """Encode the extension array as an enumerated type.
     Parameters
     ----------
     na_sentinel : int, default -1
         Value to use in the `labels` array to indicate missing values.
     Returns
     -------
     labels : ndarray
         An integer NumPy array that's an indexer into the original
         ExtensionArray.
     uniques : ExtensionArray
         An ExtensionArray containing the unique values of `self`.
         .. note::
            uniques will *not* contain an entry for the NA value of
            the ExtensionArray if there are any missing values present
            in `self`.
     See Also
     --------
     pandas.factorize : Top-level factorize method that dispatches here.
     Notes
     -----
     :meth:`pandas.factorize` offers a `sort` keyword as well.
     """
     if pa.types.is_dictionary(self.data.type):
         raise NotImplementedError()
     elif self.data.num_chunks == 1:
          # Dictionary-encode the single chunk and derive the integer labels from its indices
         encoded = self.data.chunk(0).dictionary_encode()
         indices = encoded.indices.to_pandas()
         if indices.dtype.kind == "f":
             indices[np.isnan(indices)] = na_sentinel
             indices = indices.astype(int)
         if not is_int64_dtype(indices):
             indices = indices.astype(np.int64)
         return indices, type(self)(encoded.dictionary)
     else:
         np_array = pa.column("dummy", self.data).to_pandas().values
         return pd.factorize(np_array, na_sentinel=na_sentinel)
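The labels/uniques contract described in the docstring is the same one exposed by the top-level pandas.factorize it dispatches to; a small self-contained illustration (the values are made up):

import numpy as np
import pandas as pd

labels, uniques = pd.factorize(np.array(["b", "a", None, "b"], dtype=object))
print(labels)   # [ 0  1 -1  0]  -- -1 is the default sentinel for the missing value
print(uniques)  # ['b' 'a']      -- uniques contains no entry for the NA value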
Example #9
    # 2,569 fraudulent users and 5,582 normal users; 8,151 users in total
    train_data["label"].value_counts()

    # 913 users have an ARPU of 0, and all of them are fraudulent.
    train_data[train_data["arpu"] == 0]["label"].value_counts()
    # After removing those 913 users, 7,238 users remain: 1,656 fraudulent and 5,582 normal
    except_arpu_data = train_data[train_data["arpu"] != 0].copy()
    except_arpu_data["label"].value_counts()
    except_arpu_data[except_arpu_data["flow"] == 0]["label"].value_counts()

    # 43 users have an arpu of 550 or more, and all of them are fraudulent.
    except_arpu_data[except_arpu_data["arpu"] >= 550]["label"].value_counts()
    # 7238 - 43 = 7195 users remain
    except_arpu_data = except_arpu_data[except_arpu_data["arpu"] < 550]

    # 78 users called 135 or more distinct users within a single day, and all of them are fraudulent
    except_arpu_data[except_arpu_data["voc_inner_day_user_cnt_max"] >= 135]["label"].value_counts()
    # 7195 - 78 = 7117 users remain
    except_arpu_data = except_arpu_data[except_arpu_data["voc_inner_day_user_cnt_max"] < 135]

    #
    except_arpu_data[except_arpu_data["passive_voc_inner_hour_cnt_max"]>30]["label"].value_counts()

    dtypes_all = except_arpu_data.dtypes
    for column in except_arpu_data.columns:
        if is_float_dtype(dtypes_all[column]) or (is_int64_dtype(dtypes_all[column])):
            temp_statistic = except_arpu_data[except_arpu_data[column] == 0]["label"].value_counts()
            if len(temp_statistic) == 1:
                print("--- " + column)
                print(temp_statistic)
Example #10
def apply_log1p_transformation(dataframe, columns):
    for column in columns:
        if types.is_int64_dtype(dataframe[column]):
            dataframe[column] = np.log1p(dataframe[column])

    return dataframe
Example #11
def tran_pivot_longer(
    df,
    columns,
    index_to=None,
    names_to=None,
    #names_prefix = None,
    names_sep=None,
    names_pattern=None,
    #names_ptypes = list(),
    #names_transform = list(),
    #names_repair,
    values_to=None,
    #values_drop_na = False,
    #values_ptypes = list(),
    #values_transform = list(),
):
    """Lengthen a dataset

    "Lengthens" data by increasing the number of rows and decreasing the
    number of columns.

    Args:
        df (DataFrame): DataFrame passed through
        columns (str): Label of column(s) to pivot into longer format
        index_to (str): name for a new column representing an index of observations; optional.
        names_to (str): name to use for the 'variable' column; if None, frame.columns.name
                        is used or 'variable'
                          • .value indicates that component of the name defines
                            the name of the column containing the cell values,
                            overriding values_to
        names_sep (str OR list of int): delimiter used to separate the values taken from
                        the 'columns' parameter into 2 new columns, with those
                        values split on that delimiter
                          • A regex expression is a valid input for names_sep
        names_pattern (str): regular expression with capture groups to define targets for names_to.
        values_to (str): name to use for the 'value' column; overridden if ".value" is provided in the names_to argument.

    Notes:
        Only one of names_sep OR names_pattern may be given.

    Returns:
        DataFrame: result of being pivoted into a longer format

    Examples::

        import grama as gr
        ## Simple example
        (
            gr.df_make(
                A=[1, 2, 3],
                B=[4, 5, 6],
                C=[7, 8, 9],
            )
            >> gr.tf_pivot_longer(
                columns=["A", "B", "C"],
                names_to="variable",
                values_to="value",
            )
        )

        ## Matching columns on patterns
        (
            gr.df_make(
                x1=[1, 2, 3],
                x2=[4, 5, 6],
                x3=[7, 8, 9],
            )
            >> gr.tf_pivot_longer(
                columns=gr.matches("\\d+"),
                names_to="variable",
                values_to="value",
            )
        )

        ## Separating column names and data on a names_pattern
        (
            gr.df_make(
                E00=[1, 2, 3],
                E45=[4, 5, 6],
                E90=[7, 8, 9],
            )
            >> gr.tf_pivot_longer(
                columns=gr.matches("\\d+"),
                names_to=[".value", "angle"],
                names_pattern="(E)(\\d+)",
            )
        )

    """

    ########### Pre-Check List #############
    ### Check if tran_select was used
    if isinstance(columns, DataFrame):
        columns = columns.columns.values

    ### Check if selection helper was used:
    if isinstance(columns, Intention):
        columns = pivot_select(df, columns)
        if size(columns) == 0:
            raise ValueError("""Selection helper has found no matches. Revise
                columns input.""")

    ### Check if names_to is a list or str
    names_str = False
    if isinstance(names_to, str):
        names_str = True
        if names_sep is not None:
            raise TypeError("""In order to use names_sep, more than 1 value
                needs to be passed to names_to""")

    ### Check for .value input
    dot_value = False
    if names_str is False:
        for i, v in enumerate(names_to):
            if names_to[i] == ".value":
                dot_value = True
    else:
        if names_to == ".value":
            dot_value = True

    ### Check values_to argument
    if values_to is None:
        values_to = "values"

    if names_pattern and names_sep:
        raise ValueError("""Both names_sep and names_pattern were used,
            only one or the other is required""")

    #######################################

    ########### .value pivot #############

    ### Check if .value operation needs to occur
    if dot_value is True:

        ### collect unused columns to pivot around
        data_index = collect_indexes(df, columns)

        if names_sep is not None or names_pattern is not None:
            ### Add index and split column to dataset
            longer = df.reset_index().melt(id_vars="index",
                                           var_name="split",
                                           value_vars=columns,
                                           value_name=values_to)

            ### DataFrame Cleanup
            longer = split_cleanup(longer=longer,
                                   names_to=names_to,
                                   names_pattern=names_pattern,
                                   names_sep=names_sep,
                                   values_to=values_to)

        else:
            ### Add index column and .value column
            longer = df.reset_index().melt(id_vars="index",
                                           var_name=".value",
                                           value_vars=columns,
                                           value_name=values_to)

        ### clean up index_to call
        longer = index_to_cleanup(df, longer, data_index)

        ### arrange what indexes_from should be
        if names_str is True:
            indexes = ["index"] + data_index
        else:
            names_to = list(names_to)
            value_loc = names_to.index(".value")
            if value_loc == 0:
                indexes = ["index"] + data_index + names_to[1:]
            else:
                indexes = ["index"] + data_index + names_to[0:value_loc] \
                    + names_to[(value_loc+1):]

        ### Pivot wider the .value column
        value_longer = tran_pivot_wider(longer,
                                        indexes_from=indexes,
                                        names_from=".value",
                                        values_from=values_to)

        if index_to is None:
            ### drop "index" column
            value_longer.drop("index", axis=1, inplace=True)
        else:
            ### rename index column to desired: index_to
            value_longer.rename(columns={'index': index_to}, inplace=True)

        return value_longer

    #########################################

    ########### names_sep pivot #############

    ### Only if names_sep is used
    if names_sep is not None or names_pattern is not None:

        ### collect unused columns to pivot around
        data_index = collect_indexes(df, columns)

        if index_to is None:
            ### initial pivoted DataFrame
            longer = df.reset_index().melt(id_vars=data_index,
                                           var_name="split",
                                           value_vars=columns,
                                           value_name=values_to)

            ### DataFrame Cleanup
            longer = split_cleanup(longer=longer,
                                   names_to=names_to,
                                   names_pattern=names_pattern,
                                   names_sep=names_sep,
                                   values_to=values_to)

            return (longer)

        ### Add index column to dataset
        longer = df.reset_index().melt(id_vars="index",
                                       var_name="split",
                                       value_vars=columns,
                                       value_name=values_to)
        ### rename index column to desired: index_to
        longer.rename(columns={'index': index_to}, inplace=True)

        longer = index_to_cleanup(df, longer, data_index)

        ### DataFrame Cleanup
        longer = split_cleanup(longer=longer,
                               names_to=names_to,
                               names_pattern=names_pattern,
                               names_sep=names_sep,
                               values_to=values_to)

        return (longer)

    ######################################

    ########### normal pivot #############

    ### Check if index_to is provided
    if index_to is None:

        ### check to see if all columns are used already
        data_columns = df.columns.values
        data_index = [x for x in data_columns if x not in columns]

        ### check if data_index is empty and if it has a RangeIndex
        if not data_index:
            if is_int64_dtype(df.index.dtype):
                # if so do not add extra index column and pivot
                longer = df.reset_index().melt(id_vars=None,
                                               var_name=names_to,
                                               value_vars=columns,
                                               value_name=values_to)
                return (longer)

            # if it does not have a RangeIndex create new column from ID column
            # and add RangeIndex
            longer = df.reset_index().melt(id_vars="index",
                                           var_name=names_to,
                                           value_vars=columns,
                                           value_name=values_to)
            return (longer)

        ### look for unused columns to pivot around
        data_used = columns
        data_index = [x for x in data_columns if x not in data_used]

        ### pivot with leftover name that would be the index column
        if data_index:
            longer = df.reset_index().melt(id_vars=data_index,
                                           var_name=names_to,
                                           value_vars=columns,
                                           value_name=values_to)
            return (longer)

    ### collect unused columns to preserve post pivot
    data_index = collect_indexes(df, columns)

    ### Add index column to dataset
    longer = df.reset_index().melt(id_vars="index",
                                   var_name=names_to,
                                   value_vars=columns,
                                   value_name=values_to)
    ### rename index column to desired: index_to
    longer.rename(columns={'index': index_to}, inplace=True)

    longer = index_to_cleanup(df, longer, data_index)

    return longer
Example #12
 def test_total_deaths_df_deaths_column_type(self):
     self.assertTrue(ptypes.is_int64_dtype(self.total_deaths_df['deaths']))
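Worth noting for assertions like the one above: pandas upcasts an integer column to float64 as soon as it contains a missing value, so is_int64_dtype then returns False even though every non-missing entry is an integer. A tiny illustration:

import numpy as np
import pandas as pd
import pandas.api.types as ptypes

print(ptypes.is_int64_dtype(pd.Series([1, 2, 3])))       # True  -- plain int64
print(ptypes.is_int64_dtype(pd.Series([1, 2, np.nan])))  # False -- upcast to float64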
Example #13
def explore_global_plot(data, label='label', n_feats=50, id=None, task='classification'):
    '''
    :param data: DataFrame
    :param label: label column name in the data
    :param n_feats: the number of features to be used in the analysis.
    :param id: optional identifier column; checked for duplicates and dropped before analysis.
    :param task: regression or classification
    :return:
    '''
    columns = data.columns.tolist()
    columns.remove(label)

    if id is not None:
        if data[id].duplicated().sum():
            print('{} is duplicated !!!'.format(id))

        columns.remove(id)
        data.drop(id, axis=1, inplace=True)

    numeric_features = [
        any([ptypes.is_integer_dtype(i), ptypes.is_int64_dtype(i), ptypes.is_float_dtype(i)])
        for i in data[columns].dtypes
    ]
    numeric_names = [columns[i] for i, v in enumerate(numeric_features) if v]
    category_names = list(set(columns) - set(numeric_names))

    if task == 'classification':
        if len(category_names):
            # data distribution for each class
            new_data = data.dropna(axis=0)
            famd = prince.FAMD(
                n_components=2,
                n_iter=3,
                copy=True,
                check_input=True,
                engine='auto',
                random_state=42
            )
            famd = famd.fit(new_data[columns])
            ax = famd.plot_row_coordinates(
                new_data,
                ax=None,
                x_component=0,
                y_component=1,
                labels=new_data.index,
                color_labels=['{}'.format(t) for t in new_data[label]],
                ellipse_outline=False,
                ellipse_fill=True,
                show_points=True
            )
            plt.show()
        else:
            new_data = data.dropna(axis=0)
            pca = PCA(n_components=2, random_state=seed)
            X_pca = pca.fit_transform(new_data[columns])
            sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=label, data=new_data)
            plt.show()

    # sort features for correlation plot
    sorted_feat_name = numeric_names
    if len(numeric_names) > 6:
        n_clusters = 3
        new_data = data[[label] + numeric_names].dropna(axis=0)
        new_data_feat = new_data[numeric_names]
        new_data_stand = StandardScaler().fit_transform(new_data_feat)
        kmean_init = KMeans(n_clusters=n_clusters, random_state=seed)
        # cluster the standardised feature vectors (features as rows) so related
        # features end up adjacent in the correlation heatmap below
        new_data_kmean = kmean_init.fit_transform(new_data_stand.T)
        sorted_feat = sorted(zip(numeric_names, kmean_init.labels_), key=lambda x: x[1])
        sorted_feat_name = [i[0] for i in sorted_feat]

    # correlation plot for all features
    sns.heatmap(data[[label] + sorted_feat_name + category_names].corr())
    plt.show()

    # outlier detection just for numeric features
    outlier = data[numeric_names].apply(mad_based_outlier)
    for i, column in enumerate(outlier.columns):
        print('outlier:\n {}'.format(data[[column]][outlier.iloc[:, i]]))

    # missing value pattern plot for all features
    msno.matrix(data[columns[:n_feats]])
    plt.show()

    msno.bar(data[columns[:n_feats]])
    plt.show()

    miss_data = data[columns[:n_feats]].isnull().sum(axis=1)
    miss_data = miss_data.to_frame()
    miss_data.columns = ['number_of_missing_attributes']
    miss_data.sort_values('number_of_missing_attributes', inplace=True)
    miss_data['index'] = list(range(0, miss_data.shape[0]))
    sns.jointplot(x="index", y="number_of_missing_attributes", data=miss_data)
    plt.show()
Example #14
def create_table(ctx, infile, table_name, col_spacing, varchar_factor, sql,
                 encoding, separator):
    """
    Display SQL table create command from a CSV file.
    """

    ordered_columns = OrderedDict()

    if infile.endswith(".xls") or infile.endswith(".xlsx"):
        print("Loading Excel file...")
        df = pd.ExcelFile(infile).parse()
    else:
        print("Loading CSV file...")
        df = pd.read_csv(infile, encoding=encoding, sep=separator)

    count = 0
    for column in df.columns:
        #print(df[column].dtype)
        count += 1
        sys.stdout.write(f"{str(count):3} ")

        # The entire column is empty.  No rows have values.
        if df[column].isna().all():
            ordered_columns[column] = {'type': None, 'length': None}
            print("{:{col_spacing}}: {}".format("No values",
                                                column,
                                                col_spacing=col_spacing))
            continue

        # Handling of numeric fields
        if is_numeric_dtype(df[column]):
            # Find the max value
            maxVal = None
            validVals = [i for i in df[column].dropna()]
            if validVals:
                maxVal = max(validVals)

            if is_float_dtype(df[column]):
                # Pandas stores numerical columns with null values as floats.  We
                # need to do some extra work to determine if the column is an int
                allIntegers = all(i.is_integer() for i in df[column].dropna())

                if allIntegers:
                    # this is an Integer column
                    ordered_columns[column] = {
                        'type': get_int_type(maxVal),
                        'length': maxVal
                    }
                    print(
                        f"int, {str(maxVal):{col_spacing-5}}: {column} : ({df[column].dtype})"
                    )
                    #df[df[column].fillna(0) != 0.0][column].astype(int)
                else:
                    # this is a Float column
                    ordered_columns[column] = {
                        'type': get_float_type(maxVal),
                        'length': maxVal
                    }
                    print(
                        f"{df[column].dtype}, {str(maxVal):{col_spacing-5}}: {column}"
                    )
            else:
                # These types were detected as integers during loading of the file.
                if is_int64_dtype(df[column]) or is_integer(df[column]):
                    ordered_columns[column] = {
                        'type': get_int_type(maxVal),
                        'length': maxVal
                    }
                    print(f"int, {str(maxVal):{col_spacing-5}}: {column}")
                else:
                    unknown = "???"
                    print(f"{unknown:{col_spacing}}: {column}")
        # Handling of Strings
        else:
            # Look for values that look like dates in 2018/01/01 or 01/01/2018 form
            patterns = [
                re.compile(r'^\d{1,2}[-/]\d{1,2}[-/]20\d\d$'),
                # re.compile(r'^\d{1,2}[-/]\d{1,2}[-/]\d{1,4}$'),
                re.compile(r'^20\d\d[-/]\d{1,2}[-/]\d{1,2}$')
                # re.compile(r'^\d{1,4}[-/]\d{1,2}[-/]\d{1,2}$')
            ]

            foundDate = False
            for pattern in patterns:
                if any(i == True for i in df[column].str.contains(pattern)):
                    foundDate = True

            foundBool = False
            try:
                maxVal = str(int(df[column].dropna().str.len().max()))
            except:
                # Could be boolean?
                # if "otc" in column:
                #     import pdb; pdb.set_trace()
                # if all(i.lower == "false" or i.lower() == "true" for i in df[column].dropna()):
                if any(type(i) == bool for i in df[column].dropna()):
                    maxVal = 0
                else:
                    maxVal = 0

            if foundDate:
                ordered_columns[column] = {'type': "DATE", 'length': maxVal}
                print(f"Date, {maxVal:{col_spacing-6}}: {column}")
            # elif foundBool:
            #     ordered_columns[column] = {'type': "BOOL", 'length': maxVal}
            #     print(f"Bool, {maxVal:{col_spacing-6}}: {column}")
            else:
                ordered_columns[column] = {
                    'type': f"VARCHAR({int(maxVal)*varchar_factor})",
                    'length': maxVal
                }
                print(f"String, {maxVal:{col_spacing-8}}: {column}")

    print("-------------------------------------")
    print(f"Total columns are: {len(df.columns)}")
    print("-------------------------------------")

    if sql:
        create_table_sql(ordered_columns, table_name)