def reformat_goldstone(dataset_name, dataset_filename):
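    """Reformat a Goldstone et al. (2013) table into a D3M dataset for
    binary classification of 'sftpcons', with stratified k-fold splits.
    """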
    targets = ['sftpcons']

    dataframe = pd.read_csv(os.path.join(data_input_dir, 'GoldstoneEtAl2013',
                                         dataset_filename),
                            delimiter='\t')

    dataframe.insert(0, 'd3mIndex', range(len(dataframe)))
    dataframe.dropna(inplace=True)
    dataframe.reset_index(drop=True, inplace=True)

    # build intermediate data file
    intermediate_data_path = os.path.join(data_intermediate_dir, dataset_name,
                                          'learningData.csv')
    os.makedirs(os.path.dirname(intermediate_data_path), exist_ok=True)

    dataframe.to_csv(intermediate_data_path, index=False)

    dataset_dir = os.path.join(data_output_dir, dataset_name)
    if os.path.exists(dataset_dir):
        shutil.rmtree(dataset_dir)

    d3m_wrap_dataset(data_output_dir,
                     dataPaths=[intermediate_data_path],
                     about={
                         'datasetName': dataset_name,
                     },
                     problem={
                         'targets': targets,
                         'metrics': ['rocAuc', 'accuracy', 'precision',
                                     'recall', 'f1'],
                         'taskType': 'classification',
                         'taskSubType': 'binary',
                         'dataSplits': {
                             'method': 'kFold',
                             'stratified': True,
                             'numRepeats': 0,
                             'splitsFile': 'dataSplits.csv'
                         }
                     })

    problem_doc_path = os.path.join(data_output_dir, dataset_name, 'TRAIN',
                                    'problem_TRAIN', 'problemDoc.json')
    with open(problem_doc_path, 'r') as problem_file:
        problem_doc = json.load(problem_file)

    problem_doc['searchOptions'] = {'timeBoundSearch': 10, 'solutionsLimit': 5}

    with open(problem_doc_path, 'w') as problem_file:
        json.dump(problem_doc, problem_file)
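

# Wrap the UCI Appliances Energy Prediction data (energydata_complete.csv) as
# a D3M time-series forecasting dataset with target 'Appliances'.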
def build_appliance():
    d3m_wrap_dataset(
        data_output_dir,
        dataPaths=[os.path.join(data_input_dir, 'energydata_complete.csv')],
        about={
            'datasetName': 'TR_TS_appliance',
            'sourceURI': 'http://archive.ics.uci.edu/ml/datasets/Appliances+energy+prediction',
            'description': """The data set is at 10-minute resolution and covers about 4.5 months. The house temperature and humidity conditions were monitored with a ZigBee wireless sensor network. Each wireless node transmitted the temperature and humidity conditions around every 3.3 minutes; the wireless data was then averaged over 10-minute periods. The energy data was logged every 10 minutes with m-bus energy meters. Weather from the nearest airport weather station (Chievres Airport, Belgium) was downloaded from a public data set from Reliable Prognosis (rp5.ru) and merged with the experimental data sets using the date and time column. Two random variables have been included in the data set for testing the regression models and for filtering out non-predictive attributes (parameters).""",
        },
        problem={
            'targets': ["Appliances"],
            'time': ['date'],
            'metrics': ['meanSquaredError'],
            'taskKeywords': ['forecasting', 'timeSeries']
        }
    )
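

# Wrap the UCI EEG Eye State data as a D3M time-series classification dataset
# with target 'eyeDetection'.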
def build_eeg_eye_state():
    d3m_wrap_dataset(
        data_output_dir,
        dataPaths=[os.path.join(data_input_dir, 'eeg_eye_state.csv')],
        about={
            'datasetName': 'TR_TS_eeg_eye_state',
            'sourceURI': 'http://archive.ics.uci.edu/ml/datasets/EEG+Eye+State#',
            'description': """All data is from one continuous EEG measurement with the Emotiv EEG Neuroheadset. The duration of the measurement was 117 seconds. The eye state was detected via a camera during the EEG measurement and added later manually to the file after analysing the video frames. '1' indicates the eye-closed and '0' the eye-open state. All values are in chronological order with the first measured value at the top of the data."""
        },
        problem={
            'targets': ["eyeDetection"],
            'time': [],
            'metrics': ['meanSquaredError'],
            'taskKeywords': ['classification', 'timeSeries']
        }
    )
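

# Wrap the UCI Daily Demand Forecasting Orders data as a D3M time-series
# forecasting dataset with target 'Target_(Total_orders)'.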
def build_order_demand():
    d3m_wrap_dataset(
        data_output_dir,
        dataPaths=[os.path.join(data_input_dir, 'Daily_Demand_Forecasting_Orders.csv')],
        about={
            'datasetName': 'TR_TS_order_demand',
            'sourceURI': 'http://archive.ics.uci.edu/ml/datasets/Daily+Demand+Forecasting+Orders',
            'description': """The database was collected over 60 days and is a real database from a large Brazilian logistics company. It has twelve predictive attributes and a target that is the total of daily orders."""
        },
        problem={
            'targets': ["Target_(Total_orders)"],
            'time': ['Week of the month', 'Day of the week (Monday to Friday)'],
            'metrics': ['meanSquaredError'],
            'taskKeywords': ['forecasting', 'timeSeries']
        }
    )
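

# Wrap the UCI Occupancy Detection data as a D3M time-series forecasting
# dataset with target 'Light' and time column 'Date'.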
def build_occupancy_forecasting():
    d3m_wrap_dataset(
        data_output_dir,
        dataPaths=[os.path.join(data_input_dir, 'occupancy.csv')],
        about={
            'datasetName': 'TR_TS_occupancy_forecasting',
            'sourceURI': 'http://archive.ics.uci.edu/ml/datasets/Occupancy+Detection+#',
            'description': """Experimental data used for binary classification (room occupancy) from Temperature, Humidity, Light and CO2. Ground-truth occupancy was obtained from time-stamped pictures that were taken every minute."""
        },
        problem={
            'targets': ["Light"],
            'time': ['Date'],
            'metrics': ['meanSquaredError'],
            'taskKeywords': ['forecasting', 'timeSeries']
        }
    )
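

# Wrap the UCI Air Quality data as a D3M time-series forecasting dataset with
# target 'RH' and time columns 'Date' and 'Time'.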
def build_air_quality():
    d3m_wrap_dataset(
        data_output_dir,
        dataPaths=[os.path.join(data_input_dir, 'AirQualityUCI.csv')],
        about={
            'datasetName': 'TR_TS_air_quality',
            'sourceURI': 'http://archive.ics.uci.edu/ml/datasets/Air+Quality',
            'description': """Contains the responses of a gas multisensor device deployed in the field in an Italian city. Hourly response averages are recorded along with gas concentration references from a certified analyzer. The dataset contains 9358 instances of hourly averaged responses from an array of 5 metal oxide chemical sensors embedded in an Air Quality Chemical Multisensor Device. The device was located in the field in a significantly polluted area, at road level, within an Italian city. Data were recorded from March 2004 to February 2005 (one year), representing the longest freely available recordings of responses from field-deployed air quality chemical sensor devices. Ground-truth hourly averaged concentrations for CO, Non-Methane Hydrocarbons, Benzene, Total Nitrogen Oxides (NOx) and Nitrogen Dioxide (NO2) were provided by a co-located reference certified analyzer. Evidence of cross-sensitivities as well as both concept and sensor drift is present, as described in De Vito et al., Sens. and Act. B, Vol. 129, 2, 2008 (citation required), eventually affecting the sensors' concentration estimation capabilities. Missing values are tagged with the value -200.""",
            'license': 'This dataset can be used exclusively for research purposes. Commercial purposes are fully excluded.'
        },
        problem={
            'targets': ["RH"],
            'time': ['Date', 'Time'],
            'metrics': ['meanSquaredError'],
            'taskKeywords': ['forecasting', 'timeSeries']
        }
    )
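

# Wrap the daily female births series (California, 1959) as a D3M time-series
# forecasting dataset with target 'Births'.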
def build_female_births():
    d3m_wrap_dataset(
        data_output_dir,
        dataPaths=[os.path.join(data_input_dir, 'births.csv')],
        about={
            'datasetName': 'TR_TS_births',
            'sourceURI': 'https://machinelearningmastery.com/time-series-datasets-for-machine-learning/',
            'description': """This dataset describes the number of daily female births in California in 1959.

The units are a count and there are 365 observations. The source of the dataset is credited to Newton (1988)."""
        },
        problem={
            'targets': ["Births"],
            'time': ['Date'],
            'metrics': ['meanSquaredError'],
            'taskKeywords': ['forecasting', 'timeSeries']
        }
    )
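

# Wrap the monthly sunspots series (1749-1983) as a D3M time-series
# forecasting dataset with target 'Sunspots'.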
def build_sunspots():
    d3m_wrap_dataset(
        data_output_dir,
        dataPaths=[os.path.join(data_input_dir, 'sunspots.csv')],
        about={
            'datasetName': 'TR_TS_sunspots',
            'sourceURI': 'https://machinelearningmastery.com/time-series-datasets-for-machine-learning/',
            'description': """This dataset describes the monthly count of observed sunspots for just over 230 years (1749-1983).

The units are a count and there are 2,820 observations. The source of the dataset is credited to Andrews & Herzberg (1985)."""
        },
        problem={
            'targets': ["Sunspots"],
            'time': ['Month'],
            'metrics': ['meanSquaredError'],
            'taskKeywords': ['forecasting', 'timeSeries']
        }
    )
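

# Wrap the minimum daily temperatures series (Melbourne, 1981-1990) as a D3M
# time-series forecasting dataset with target 'Temp'.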
def build_temperatures():
    d3m_wrap_dataset(
        data_output_dir,
        dataPaths=[os.path.join(data_input_dir, 'minimum_daily_temperatures.csv')],
        about={
            'datasetName': 'TR_TS_minimum_daily_temp',
            'sourceURI': 'https://machinelearningmastery.com/time-series-datasets-for-machine-learning/',
            'description': """This dataset describes the minimum daily temperatures over 10 years (1981-1990) in the city of Melbourne, Australia.

The units are in degrees Celsius and there are 3650 observations. The source of the data is credited as the Australian Bureau of Meteorology."""
        },
        problem={
            'targets': ["Temp"],
            'time': ['Date'],
            'metrics': ['meanSquaredError'],
            'taskKeywords': ['forecasting', 'timeSeries']
        }
    )
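

# Wrap the monthly shampoo sales series as a D3M time-series forecasting
# dataset with target 'Sales'.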
def build_shampoo():
    d3m_wrap_dataset(
        data_output_dir,
        dataPaths=[os.path.join(data_input_dir, 'shampoo.csv')],
        about={
            'datasetName': 'TR_TS_shampoo',
            'sourceURI': 'https://machinelearningmastery.com/time-series-datasets-for-machine-learning/',
            'description': """This dataset describes the monthly number of shampoo sales over a 3-year period.

The units are a sales count and there are 36 observations. The original dataset is credited to Makridakis, Wheelwright and Hyndman (1998)."""
        },
        problem={
            'targets': ["Sales"],
            'time': ['Month'],
            'metrics': ['meanSquaredError'],
            'taskKeywords': ['forecasting', 'timeSeries']
        }
    )
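

# Reformat the Chenoweth & Ulfelder figure-2 train/test tables into a single
# D3M binary-classification dataset (target 'nvc.start.1'), preserving the
# original train/test partition via a custom sampleSplits.csv.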
def reformat_chenowith_ulfelder(dataset_name, predictors_function):
    targets = ['nvc.start.1']

    dataframe_train_nas = pd.read_csv(
        os.path.join(data_input_dir, 'ChenowethUlfelder/cu_fig2_train.tsv'),
        delimiter='\t')
    dataframe_test_nas = pd.read_csv(
        os.path.join(data_input_dir, 'ChenowethUlfelder/cu_fig2_test.tsv'),
        delimiter='\t')

    dataframe_train = predictors_function(dataframe_train_nas)
    dataframe_test = predictors_function(dataframe_test_nas)

    dataframe_train[targets[0]] = dataframe_train_nas[[targets[0]]]
    dataframe_test[targets[0]] = dataframe_test_nas[[targets[0]]]

    dataframe_train.insert(0, 'd3mIndex', range(len(dataframe_train)))
    dataframe_test.insert(
        0, 'd3mIndex',
        [i + len(dataframe_train) for i in range(len(dataframe_test))])

    dataframe_train.dropna(inplace=True)
    dataframe_test.dropna(inplace=True)

    dataframe_train.reset_index(drop=True, inplace=True)
    dataframe_test.reset_index(drop=True, inplace=True)

    dataframe = pd.concat([dataframe_train, dataframe_test], ignore_index=True)

    # build intermediate data file
    intermediate_data_path = os.path.join(data_intermediate_dir, dataset_name,
                                          'learningData.csv')
    os.makedirs(os.path.dirname(intermediate_data_path), exist_ok=True)
    dataframe.to_csv(intermediate_data_path, index=False)

    dataset_dir = os.path.join(data_output_dir, dataset_name)
    if os.path.exists(dataset_dir):
        shutil.rmtree(dataset_dir)

    d3m_wrap_dataset(data_output_dir,
                     dataPaths=[intermediate_data_path],
                     about={
                         'datasetName': dataset_name,
                     },
                     problem={
                         'targets': targets,
                         'time': ['year'],
                         'metrics': ['rocAuc', 'accuracy', 'precision',
                                     'recall', 'f1'],
                         'taskType': 'classification',
                         'taskSubType': 'binary',
                         'dataSplits': {
                             'method': 'kFold',
                             'stratified': True,
                             'numRepeats': 0,
                             'splitsFile': 'dataSplits.csv'
                         }
                     })

    # load custom out-of-sample splits into TwoRavens/d3m
    # use the d3mIndex values that were written to learningData.csv, so the
    # splits stay aligned even if rows were dropped by dropna above
    dataframe_splits = pd.DataFrame({
        'd3mIndex': dataframe['d3mIndex'],
        'type': (['TRAIN'] * len(dataframe_train)
                 + ['TEST'] * len(dataframe_test)),
        'repeat': [0] * len(dataframe),
        'fold': [0] * len(dataframe)
    })
    sample_splits_path = os.path.join(data_output_dir, dataset_name, 'TRAIN',
                                      'problem_TRAIN', 'sampleSplits.csv')
    dataframe_splits.to_csv(sample_splits_path, index=False)
    problem_doc_path = os.path.join(data_output_dir, dataset_name, 'TRAIN',
                                    'problem_TRAIN', 'problemDoc.json')
    with open(problem_doc_path, 'r') as problem_file:
        problem_doc = json.load(problem_file)

    problem_doc['splitOptions'] = {
        'splitsFile': 'sampleSplits.csv',
        'splitsDir': os.path.dirname(sample_splits_path),
    }

    problem_doc['searchOptions'] = {'timeBoundSearch': 10, 'solutionsLimit': 5}

    with open(problem_doc_path, 'w') as problem_file:
        json.dump(problem_doc, problem_file)
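

# Reformat the Gelpi & Avdan (2018) data into a D3M binary-classification
# dataset (target 'incident'), holding out years from 2002 onward as the
# TEST split.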
def reformat_gelpi_avdan(dataset_name):
    predictors = [
        "polity2b", "polity2borigin", "loggdptarget", "logpop", "majpowhome",
        "majpoworigin", "coloniallink", "ethnictie", "ethnicPCW",
        "ethnicany911", "dyadalliance", "dyadalliancePCW", "rivalrydummy",
        "postCW", "post911", "lndyaddist", "dyadpcyear1", "dyadpcyear2",
        "dyadpcyear3", "dyadpcyear4"
    ]
    targets = ['incident']

    dataframe = pd.read_csv(os.path.join(data_input_dir,
                                         'GelpiAvdan2018/ga_TA2c.tsv'),
                            delimiter='\t')

    dataframe = dataframe[predictors + targets + ['year']]

    dataframe.insert(0, 'd3mIndex', range(len(dataframe)))
    dataframe.dropna(inplace=True)
    dataframe.reset_index(drop=True, inplace=True)

    # build intermediate data file
    intermediate_data_path = os.path.join(data_intermediate_dir, dataset_name,
                                          'learningData.csv')
    os.makedirs(os.path.dirname(intermediate_data_path), exist_ok=True)
    dataframe.to_csv(intermediate_data_path, index=False)

    dataset_dir = os.path.join(data_output_dir, dataset_name)
    if os.path.exists(dataset_dir):
        shutil.rmtree(dataset_dir)

    d3m_wrap_dataset(data_output_dir,
                     dataPaths=[intermediate_data_path],
                     about={
                         'datasetName': dataset_name,
                     },
                     problem={
                         'targets': targets,
                         'metrics': ['rocAuc', 'accuracy', 'precision',
                                     'recall', 'f1'],
                         'taskType': 'classification',
                         'taskSubType': 'binary',
                         'dataSplits': {
                             'method': 'kFold',
                             'stratified': True,
                             'numRepeats': 0,
                             'splitsFile': 'dataSplits.csv'
                         }
                     })

    # load custom out-of-sample splits into TwoRavens/d3m
    # use the d3mIndex values that were written to learningData.csv, so the
    # splits stay aligned even if rows were dropped by dropna above
    dataframe_splits = pd.DataFrame({
        'd3mIndex': dataframe['d3mIndex'],
        'type': ['TRAIN' if year < 2002 else 'TEST'
                 for year in dataframe['year']],
        'repeat': [0] * len(dataframe),
        'fold': [0] * len(dataframe)
    })
    sample_splits_path = os.path.join(data_output_dir, dataset_name, 'TRAIN',
                                      'problem_TRAIN', 'sampleSplits.csv')
    dataframe_splits.to_csv(sample_splits_path, index=False)
    problem_doc_path = os.path.join(data_output_dir, dataset_name, 'TRAIN',
                                    'problem_TRAIN', 'problemDoc.json')
    with open(problem_doc_path, 'r') as problem_file:
        problem_doc = json.load(problem_file)

    problem_doc['splitOptions'] = {
        'splitsFile': 'sampleSplits.csv',
        'splitsDir': os.path.dirname(sample_splits_path),
    }

    problem_doc['searchOptions'] = {'timeBoundSearch': 60, 'solutionsLimit': 5}

    with open(problem_doc_path, 'w') as problem_file:
        json.dump(problem_doc, problem_file)