예제 #1
0
    def get_dataset(self):
        """
        Form a dataframe with the descriptions from all active openml datasets.

        Every dataset id returned by ``datasets.list_datasets`` is fetched
        (metadata only, ``download_data=False``). Datasets whose description
        or name is ``None`` are skipped, as are datasets whose retrieval
        raises. The collected rows are stored, sorted by id, in ``self.df``.

        :return: the result of ``self._remove_duplicates()``, also stored in
            ``self.df_unique`` (described by the original author as unique
            dataset descriptions with length min=50 -- that helper is not
            visible here, TODO confirm)
        """
        # Local import so this method does not rely on a module-level logger.
        import logging

        log = logging.getLogger(__name__)
        dataset_list = datasets.list_datasets(output_format='dataframe',
                                              status='active')
        data_dict = defaultdict(list)
        for did in dataset_list['did']:
            try:
                data = datasets.get_dataset(did, download_data=False)
                if data.description is None or data.name is None:
                    continue
                name = data.name
                text = data.description + " " + name + " "
            except Exception as err:
                # Best effort: several exception types besides
                # FileNotFoundError have been observed here, so a failing
                # dataset is skipped. ``Exception`` (rather than a bare
                # ``except``) lets KeyboardInterrupt/SystemExit propagate.
                log.debug("Skipping dataset %s: %r", did, err)
                continue
            # Append all three columns only once every value is known, so the
            # columns can never end up with different lengths.
            data_dict['id'].append(did)
            data_dict['name'].append(name)
            data_dict['text'].append(text)

        self.df = pd.DataFrame(data_dict)
        self.df.sort_values(by='id', inplace=True)
        self.df_unique = self._remove_duplicates()
        return self.df_unique
예제 #2
0
    def _get_compatible_rand_dataset(self) -> List:
        """Return the ids of active datasets usable for this task type, shuffled.

        * task type 2 (regression): datasets reporting zero symbolic features
        * task type 5 (clustering): every active dataset
        * any other task type: datasets reporting zero numeric features

        Datasets that do not report the relevant quality are left out.
        """
        active_datasets = list_datasets(status='active')

        if self.task_type_id == 5:
            # clustering task: no restriction on the feature types
            candidates = list(active_datasets.keys())
        else:
            # regression tasks (type 2) filter on symbolic features, all
            # remaining task types filter on numeric features
            if self.task_type_id == 2:
                quality = 'NumberOfSymbolicFeatures'
            else:
                quality = 'NumberOfNumericFeatures'
            # the membership test is an extra check because of:
            # https://github.com/openml/OpenML/issues/959
            candidates = [
                dataset_id
                for dataset_id, dataset_info in active_datasets.items()
                if quality in dataset_info and dataset_info[quality] == 0
            ]

        # shuffle() reorders the list in place
        shuffle(candidates)
        return candidates
예제 #3
0
def test_all_datasets(dash_br):
    """Open the data page of the first 30 listed datasets and record failures.

    For each dataset id the browser fixture is pointed at
    ``BASE_URL + 'data/' + <id>``; after a fixed 5 second wait, any id for
    which ``dash_br.get_logs()`` is not an empty list is printed and
    collected. The collected ids are saved to ``ids.npy``.
    """
    listing = datasets.list_datasets(output_format='dataframe')
    flagged = []
    for did in listing['did'].values[:30]:
        dash_br.server_url = BASE_URL + 'data/' + str(did)
        # give the page time to load before reading the browser logs
        time.sleep(5)
        if dash_br.get_logs() == []:
            continue
        flagged.append(did)
        print(did)
    np.save('ids.npy', np.asarray(flagged))
예제 #4
0
def get_datasets():
    """Collect a ``DatasetInfo`` for every listed dataset of at most 20000 instances.

    Datasets that do not report all of ``NumberOfInstances``,
    ``NumberOfFeatures`` and ``NumberOfClasses`` are skipped, so building the
    ``DatasetInfo`` can never fail on a missing quality.

    :return: list of ``DatasetInfo(dataset_id, instances, features, classes)``
    """
    required = ('NumberOfInstances', 'NumberOfFeatures', 'NumberOfClasses')
    datasets_list = open_ml_dataset.list_datasets()

    selected = []
    for dataset_id, dataset in datasets_list.items():
        # 'NumberOfFeatures' is read below as well, so it has to be present
        # just like the other two qualities.
        if any(key not in dataset for key in required):
            continue
        if dataset['NumberOfInstances'] > 20000.0:
            continue
        selected.append(DatasetInfo(dataset_id,
                                    dataset['NumberOfInstances'],
                                    dataset['NumberOfFeatures'],
                                    dataset['NumberOfClasses']))
    return selected
예제 #5
0
    def _get_compatible_rand_dataset(self) -> int:
        """Return the id of a randomly chosen active dataset for this task type.

        For task type 2 only datasets reporting zero symbolic features
        qualify; for every other task type only datasets reporting zero
        numeric features qualify. Datasets that do not report the relevant
        quality are ignored.

        :return: id of one compatible dataset
        :raises ValueError: if no active dataset is compatible
        """
        active_datasets = list_datasets(status='active')

        # depending on the task type, find either datasets
        # with only symbolic features or datasets with only
        # numerical features.
        if self.task_type_id != 2:
            quality = 'NumberOfNumericFeatures'
        else:
            quality = 'NumberOfSymbolicFeatures'

        # the membership test is an extra check because of:
        # https://github.com/openml/OpenML/issues/959
        compatible_datasets = [
            dataset_id
            for dataset_id, dataset_info in active_datasets.items()
            if quality in dataset_info and dataset_info[quality] == 0
        ]

        if not compatible_datasets:
            # Without this guard the call below would be randint(0, -1) and
            # fail with an error that does not name the actual problem.
            raise ValueError(
                'No compatible active dataset found for task type '
                '{}'.format(self.task_type_id))

        random_dataset_pos = randint(0, len(compatible_datasets) - 1)

        return compatible_datasets[random_dataset_pos]
예제 #6
0
    def dataset_overview(radio):
        """
        Build the overview section of the datasets page.

        :param radio: ``"active"`` uses the default dataset listing; any other
            value requests the listing with ``status="all"``
        :return: an ``html.Div`` with four graphs: attribute types (pie),
            number of classes (violin), and histograms of the binned number
            of instances and number of features
        """
        if radio == "active":
            df = datasets.list_datasets(output_format="dataframe")
        else:
            df = datasets.list_datasets(output_format="dataframe", status="all")

        df.dropna(inplace=True)

        # Binning
        # NOTE(review): pd.cut needs monotonically increasing bin edges, so
        # bins_1 relies on the largest dataset exceeding 500000 instances.
        bins_1 = [
            1,
            500,
            1000,
            5000,
            10000,
            50000,
            100000,
            500000,
            max(df["NumberOfInstances"]),
        ]
        bins_2 = [1, 500, 1000, 5000, 10000, 50000, 100000, 500000, 1000000]
        df["Number of instances"] = pd.cut(
            df["NumberOfInstances"], bins=bins_1, precision=0
        ).astype(str)
        df["Number of features"] = pd.cut(
            df["NumberOfFeatures"], bins=bins_2, precision=0
        ).astype(str)
        # Turn interval labels such as "(1.0, 500.0]" into plain range text.
        # Every pattern is a literal, so regex=False is passed explicitly:
        # "(" is not a valid regular expression and the default of ``regex``
        # differs between pandas versions.
        for col in ["Number of instances", "Number of features"]:
            df[col] = df[col].str.replace(",", " -", regex=False)
            df[col] = df[col].str.replace("(", "", regex=False)
            df[col] = df[col].str.replace("]", "", regex=False)
            df[col] = df[col].str.replace(".0", " ", regex=False)

        title = [
            "Attribute Types",
            "Number of classes",
            "Number of instances across datasets",
            "Number of features across datasets",
        ]

        # Attribute types
        # .loc writes into df itself; chained indexing (df[col][mask] = ...)
        # may assign to a temporary copy. "categorical" is applied last, so
        # it wins where both conditions hold.
        df["Attribute Type"] = "mixed"
        df.loc[df["NumberOfSymbolicFeatures"] <= 1, "Attribute Type"] = "numeric"
        df.loc[df["NumberOfNumericFeatures"] == 0, "Attribute Type"] = "categorical"
        grouped = df.groupby("Attribute Type").size().reset_index(name="counts")
        colors = ["darkblue", "steelblue", "lightsteelblue"]
        types_chart = go.Pie(
            labels=grouped["Attribute Type"],
            values=grouped["counts"],
            marker=dict(colors=colors),
            showlegend=True,
        )
        fig1 = go.Figure(data=[types_chart])
        fig1.update_layout(height=400)

        # No of classes
        showlegend = False
        classes_plot = go.Violin(
            y=df["NumberOfClasses"],
            showlegend=showlegend,
            box_visible=True,
            fillcolor="mediumpurple",
            meanline_visible=True,
            name=" ",
        )
        fig2 = go.Figure(data=[classes_plot])
        fig2.update_xaxes(tickfont=dict(size=10))
        fig2.update_layout(height=400)

        # Instances plot: rows are sorted by the raw count before the binned
        # labels are handed to the histogram.
        df.sort_values(by="NumberOfInstances", inplace=True)

        instances_plot = go.Histogram(
            x=df["Number of instances"], marker_color="#EB89B5", showlegend=showlegend
        )
        fig3 = go.Figure(
            data=[instances_plot],
        )
        fig3.update_layout(bargap=0.4, width=900, height=400)
        fig3.update_xaxes(tickfont=dict(size=10))

        # Features plot: same approach, sorted by the raw feature count.
        df.sort_values(by="NumberOfFeatures", inplace=True)
        features_plot = go.Histogram(x=df["Number of features"], showlegend=showlegend)
        fig4 = go.Figure(data=[features_plot])
        fig4.update_layout(bargap=0.4, width=900, height=400)
        fig4.update_xaxes(tickfont=dict(size=10))

        return html.Div(
            [
                html.Div(
                    [html.P(title[0]), dcc.Graph(figure=fig1, id="fig1")],
                    className="row metric-row",
                    style={
                        "width": "48%",
                        "text-align": "center",
                        "display": "inline-block",
                    },
                ),
                html.Div(
                    [html.P(title[1]), dcc.Graph(figure=fig2, id="fig2")],
                    className="row metric-row",
                    style={
                        "width": "48%",
                        "text-align": "center",
                        "display": "inline-block",
                    },
                ),
                html.P(title[2]),
                dcc.Graph(figure=fig3, id="fig3"),
                html.P(title[3]),
                dcc.Graph(figure=fig4, id="fig4"),
            ],
        )
예제 #7
0
def get_dataset_overview():
    """
    Build the overview figure of the datasets page.

    Four stacked subplots are drawn from the dataset listing: histograms of
    the binned number of instances and number of features, a percentage
    histogram of the attribute type, and a violin plot of the number of
    classes.

    :return: an ``html.Div`` wrapping a single ``dcc.Graph``
    """
    df = datasets.list_datasets(output_format='dataframe')
    df.dropna(inplace=True)
    # NOTE(review): pd.cut needs monotonically increasing bin edges, so
    # bins_1 relies on the largest dataset exceeding 1000000 instances.
    bins_1 = [
        1, 500, 1000, 5000, 10000, 50000, 100000, 500000, 1000000,
        max(df["NumberOfInstances"])
    ]
    bins_2 = [1, 500, 1000, 5000, 10000, 50000, 100000, 500000, 1000000]
    df["Number of instances"] = pd.cut(df["NumberOfInstances"],
                                       bins=bins_1).astype(str)
    df["Number of features"] = pd.cut(df["NumberOfFeatures"],
                                      bins=bins_2).astype(str)

    title = [
        "Number of instances across datasets",
        "Number of features across datasets",
        "Attribute Type percentage distribution", "Number of classes"
    ]

    fig = plotly.subplots.make_subplots(rows=4,
                                        cols=1,
                                        subplot_titles=tuple(title))

    # Turn interval labels such as "(1, 500]" into plain range text. Every
    # pattern is a literal, so regex=False is passed explicitly: "(" is not a
    # valid regular expression and the default of ``regex`` differs between
    # pandas versions.
    for col in ["Number of instances", "Number of features"]:
        df[col] = df[col].str.replace(',', ' -', regex=False)
        df[col] = df[col].str.replace('(', "", regex=False)
        df[col] = df[col].str.replace(']', "", regex=False)
    df.sort_values(by="NumberOfInstances", inplace=True)
    showlegend = False
    fig.add_trace(go.Histogram(x=df["Number of instances"],
                               showlegend=showlegend),
                  row=1,
                  col=1)

    df.sort_values(by="NumberOfFeatures", inplace=True)
    fig.add_trace(go.Histogram(x=df["Number of features"],
                               showlegend=showlegend),
                  row=2,
                  col=1)

    # .loc writes into df itself; chained indexing (df[col][mask] = ...) may
    # assign to a temporary copy. 'categorical' is applied last, so it wins
    # where both conditions hold.
    df["Attribute Type"] = "mixed"
    df.loc[df['NumberOfSymbolicFeatures'] <= 1, "Attribute Type"] = 'numeric'
    df.loc[df['NumberOfNumericFeatures'] == 0,
           "Attribute Type"] = 'categorical'
    fig.add_trace(go.Histogram(x=df["Attribute Type"],
                               histnorm="percent",
                               showlegend=showlegend),
                  row=3,
                  col=1)

    fig.add_trace(go.Violin(x=df["NumberOfClasses"],
                            showlegend=showlegend,
                            name="NumberOfClasses"),
                  row=4,
                  col=1)

    fig.update_layout(height=1000)
    fig.update_xaxes(tickfont=dict(size=10))

    return html.Div(dcc.Graph(figure=fig), style={"fontsize": 10})
예제 #8
0
# flake8: noqa
import os
import openml
from openml import datasets
from openml.datasets.functions import DATASETS_CACHE_DIR_NAME

# get all datasets before running app, so that datasets are loaded faster
from openml.utils import _create_cache_directory_for_id

# Point the openml cache at <filesystem root>/public/python-cache/.openml/cache.
root_dir = os.path.abspath(os.sep)
openml.config.cache_directory = os.path.join(root_dir, "public",
                                             "python-cache", ".openml",
                                             "cache")
# Dataset listing as a dataframe; the loop below reads its "did",
# "NumberOfInstances" and "NumberOfFeatures" columns.
df = datasets.list_datasets(output_format="dataframe")

for idx, row in df.iterrows():
    data_id = row["did"]
    # NOTE(review): instances and cols are not used in the visible part of
    # this script -- presumably consumed by the download step announced at
    # the end of the loop body; confirm before removing.
    instances = row["NumberOfInstances"]
    cols = row["NumberOfFeatures"]
    print(data_id)
    # Resolve this dataset's cache directory, then delete it if it exists.
    # NOTE(review): judging by its name the helper may itself create the
    # directory, which would make the exists() check always true -- confirm.
    did_cache_dir = _create_cache_directory_for_id(
        DATASETS_CACHE_DIR_NAME,
        data_id,
    )
    # print(did_cache_dir)
    if os.path.exists(did_cache_dir):
        openml.utils._remove_cache_dir_for_id(DATASETS_CACHE_DIR_NAME,
                                              did_cache_dir)

    # TODO: download the dataset and cache it again; errors noted for that
    # step: (FileNotFoundError, openml.exceptions.OpenMLServerException).
    # The download code itself is not present in this snippet.
예제 #9
0
from openml import datasets

# Warm-up step: request every listed dataset once before the app starts, so
# that datasets are loaded faster afterwards.
df = datasets.list_datasets(output_format="dataframe")

for data_id in df["did"]:
    try:
        datasets.get_dataset(data_id)
    except FileNotFoundError:
        # best effort: skip this dataset and carry on with the next one
        continue
예제 #10
0
    def dataset_overview(radio):
        """
        Build the overview section of the datasets page.

        :param radio: ``'active'`` uses the default dataset listing; any other
            value requests the listing with ``status='all'``
        :return: an ``html.Div`` with four graphs: attribute types (pie),
            number of classes (violin), and histograms of the binned number
            of instances and number of features
        """
        if radio == 'active':
            df = datasets.list_datasets(output_format='dataframe')
        else:
            df = datasets.list_datasets(output_format='dataframe',
                                        status='all')

        df.dropna(inplace=True)

        # Binning
        # NOTE(review): pd.cut needs monotonically increasing bin edges, so
        # bins_1 relies on the largest dataset exceeding 500000 instances.
        bins_1 = [
            1, 500, 1000, 5000, 10000, 50000, 100000, 500000,
            max(df["NumberOfInstances"])
        ]
        bins_2 = [1, 500, 1000, 5000, 10000, 50000, 100000, 500000, 1000000]
        df["Number of instances"] = pd.cut(df["NumberOfInstances"],
                                           bins=bins_1,
                                           precision=0).astype(str)
        df["Number of features"] = pd.cut(df["NumberOfFeatures"],
                                          bins=bins_2,
                                          precision=0).astype(str)
        # Turn interval labels such as "(1.0, 500.0]" into plain range text.
        # Every pattern is a literal, so regex=False is passed explicitly:
        # "(" is not a valid regular expression and the default of ``regex``
        # differs between pandas versions.
        for col in ["Number of instances", "Number of features"]:
            df[col] = df[col].str.replace(',', ' -', regex=False)
            df[col] = df[col].str.replace('(', "", regex=False)
            df[col] = df[col].str.replace(']', "", regex=False)
            df[col] = df[col].str.replace('.0', " ", regex=False)

        title = [
            "Attribute Types",
            "Number of classes",
            "Number of instances across datasets",
            "Number of features across datasets",
        ]

        # Attribute types
        # .loc writes into df itself; chained indexing (df[col][mask] = ...)
        # may assign to a temporary copy. 'categorical' is applied last, so
        # it wins where both conditions hold.
        df["Attribute Type"] = "mixed"
        df.loc[df['NumberOfSymbolicFeatures'] <= 1,
               "Attribute Type"] = 'numeric'
        df.loc[df['NumberOfNumericFeatures'] == 0,
               "Attribute Type"] = 'categorical'
        grouped = (df.groupby("Attribute Type").size().reset_index(
            name='counts'))
        colors = ['darkblue', 'steelblue', 'lightsteelblue']
        types_chart = go.Pie(labels=grouped["Attribute Type"],
                             values=grouped['counts'],
                             marker=dict(colors=colors),
                             showlegend=True)
        fig1 = go.Figure(data=[types_chart])
        fig1.update_layout(height=400)

        # No of classes
        showlegend = False
        classes_plot = go.Violin(y=df["NumberOfClasses"],
                                 showlegend=showlegend,
                                 box_visible=True,
                                 fillcolor='mediumpurple',
                                 meanline_visible=True,
                                 name=' ')
        fig2 = go.Figure(data=[classes_plot])
        fig2.update_xaxes(tickfont=dict(size=10))
        fig2.update_layout(height=400)

        # Instances plot: rows are sorted by the raw count before the binned
        # labels are handed to the histogram.
        df.sort_values(by="NumberOfInstances", inplace=True)

        instances_plot = go.Histogram(x=df["Number of instances"],
                                      marker_color='#EB89B5',
                                      showlegend=showlegend)
        fig3 = go.Figure(data=[instances_plot], )
        fig3.update_layout(bargap=0.4, width=900, height=400)
        fig3.update_xaxes(tickfont=dict(size=10))

        # Features plot: same approach, sorted by the raw feature count.
        df.sort_values(by="NumberOfFeatures", inplace=True)
        features_plot = go.Histogram(x=df["Number of features"],
                                     showlegend=showlegend)
        fig4 = go.Figure(data=[features_plot])
        fig4.update_layout(bargap=0.4, width=900, height=400)
        fig4.update_xaxes(tickfont=dict(size=10))

        return html.Div([
            html.Div(
                [html.P(title[0]),
                 dcc.Graph(figure=fig1, id='fig1')],
                className="row metric-row",
                style={
                    'width': '48%',
                    'text-align': 'center',
                    'display': 'inline-block',
                }),
            html.Div([html.P(title[1]),
                      dcc.Graph(figure=fig2, id='fig2')],
                     className="row metric-row",
                     style={
                         'width': '48%',
                         'text-align': 'center',
                         'display': 'inline-block'
                     }),
            html.P(title[2]),
            dcc.Graph(figure=fig3, id='fig3'),
            html.P(title[3]),
            dcc.Graph(figure=fig4, id='fig4')
        ], )