def get_cleaned_dataset(ws):
    found = False
    ds_key = "machine-cpu"
    description_text = "CPU performance dataset (UCI)."

    if ds_key in ws.datasets.keys():
        found = True
        ds_cleaned = ws.datasets[ds_key]

    # Otherwise, create it from the file
    if not found:

        with zipfile.ZipFile("./data/machine.zip", "r") as zip_ref:
            zip_ref.extractall("data")

        # Read the extracted CSV file into a DataFrame
        data = pd.read_csv('./data/machine.csv')
        # DataFrame with cleaned data
        cleaned_data = clean_data(data)
        # Keep a local parquet copy; its file name doubles as the target path in the datastore
        exported_df = 'cleaned-machine-cpu.parquet'
        cleaned_data.to_parquet(exported_df)
        # Register the dataset in the workspace using the (experimental) functionality
        # that uploads and registers a pandas DataFrame in a single call
        ds_cleaned = TabularDatasetFactory.register_pandas_dataframe(
            dataframe=cleaned_data,
            target=(ws.get_default_datastore(), exported_df),
            name=ds_key,
            description=description_text,
            show_progress=True)
    return ds_cleaned
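For orientation, a minimal driver for the function above; the `Workspace.from_config()` connection, the imports, and the presence of `./data/machine.zip` and a `clean_data()` helper are assumptions, not part of the original snippet.

# Hypothetical usage sketch; requires azureml-core, a config.json describing the
# workspace, ./data/machine.zip on disk, and a clean_data() implementation.
import zipfile
import pandas as pd
from azureml.core import Workspace
from azureml.data.dataset_factory import TabularDatasetFactory

ws = Workspace.from_config()                    # connect to the AML workspace
ds_cleaned = get_cleaned_dataset(ws)            # registers on first call, reuses afterwards
print(ds_cleaned.to_pandas_dataframe().head())  # TabularDataset -> pandas for inspection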
Example #2
def infer_forecasting_dataset_tcn(X_test,
                                  y_test,
                                  model,
                                  output_path,
                                  output_dataset_name="results"):

    # Forecast on the test set; forecast() returns the predicted values and a
    # DataFrame combining the input rows with the forecasts
    y_pred, df_all = model.forecast(X_test, y_test)

    run = Run.get_context()

    # Register the forecast results as a tabular dataset in the default
    # datastore, under a unique date-plus-uuid path
    registered_results = TabularDatasetFactory.register_pandas_dataframe(
        df_all,
        target=(
            run.experiment.workspace.get_default_datastore(),
            datetime.now().strftime("%Y-%m-%d-") + str(uuid.uuid4())[:6],
        ),
        name=output_dataset_name,
    )

    # Also write the results to a local CSV in the requested output folder
    df_all.to_csv(os.path.join(output_path, output_dataset_name + ".csv"),
                  index=False)
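A hedged sketch of how this helper might be called from a batch inference script; the test-data files, the model file, and the output folder below are illustrative assumptions.

# Hypothetical invocation; assumes a fitted forecasting model saved with joblib
# that exposes .forecast(), and test data stored as local CSV files.
import os
import uuid
import joblib
import pandas as pd
from datetime import datetime
from azureml.core import Run
from azureml.data.dataset_factory import TabularDatasetFactory

X_test = pd.read_csv("X_test.csv")                  # features, including the time column
y_test = pd.read_csv("y_test.csv").values.ravel()   # actuals for the forecast horizon
model = joblib.load("model.pkl")                    # fitted model exposing .forecast()

output_path = "./outputs"
os.makedirs(output_path, exist_ok=True)
infer_forecasting_dataset_tcn(X_test, y_test, model, output_path)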
Example #3
def create_dataset(ws):
    # Download the raw CSV from Kaggle (assumes an authenticated kaggle_api client)
    kaggle_api.dataset_download_file('divg07/malware-analysis-dataset', 'data.csv')

    data = pd.read_csv(
            './data.csv.zip',
            compression='zip',
            sep='|'
        )

    # Clean dataset 
    data = clean_data(data)

    # Register Dataset in Workspace
    datastore = Datastore(ws)
    name = "Malware Dataset"
    description_text = "Malware DataSet for Udacity Capstone Project"
    dataset = TabularDatasetFactory.register_pandas_dataframe(data,
                               datastore,
                               name,
                               description=description_text)
    
    return dataset
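The snippet above relies on a module-level `kaggle_api` client; a minimal sketch of how that client could be set up with the official `kaggle` package (the `~/.kaggle/kaggle.json` token is an assumption):

# Hypothetical setup for the kaggle_api global used above; requires the
# `kaggle` package and a valid ~/.kaggle/kaggle.json API token.
from kaggle.api.kaggle_api_extended import KaggleApi

kaggle_api = KaggleApi()
kaggle_api.authenticate()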
Example #4
def main():
    run = Run.get_context()
    ws = run.experiment.workspace
    found = False
    key = "wine-quality"
    description_text = "Wine Quality Dataset for Udacity Course 3"

    if key in ws.datasets.keys():
        found = True
        input_data = ws.datasets[key]
        features = input_data.to_pandas_dataframe()

    if not found:
        # Create AML Dataset and register it into Workspace
        url_white = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
        url_red = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
        white_data = TabularDatasetFactory.from_delimited_files(url_white,
                                                                separator=";")
        red_data = TabularDatasetFactory.from_delimited_files(url_red,
                                                              separator=";")
        features, target = clean_data(white_data, red_data)
        features.loc[:, "quality"] = target
        ds = ws.get_default_datastore()
        input_data = TabularDatasetFactory.register_pandas_dataframe(
            dataframe=features,
            target=ds,
            name=key,
            description=description_text)

    target = features.pop("quality")
    target = target.replace({"BAD": -1, "MEDIUM": 0, "GOOD": 1})
    x_train, x_test, y_train, y_test = train_test_split(features,
                                                        target,
                                                        random_state=0)
    # Add arguments to script
    parser = argparse.ArgumentParser()

    parser.add_argument("--max_depth",
                        type=int,
                        default=6,
                        help="Maximum depth of tree")
    parser.add_argument("--alpha",
                        type=float,
                        default=0,
                        help="L1 regularization")
    parser.add_argument("--learning_rate",
                        type=float,
                        default=0.1,
                        help="learning rate")
    parser.add_argument("--gamma",
                        type=float,
                        default=0.0,
                        help="minimal loss")

    args = parser.parse_args()

    run.log("Max Depth:", np.int(args.max_depth))
    run.log("Alpha:", np.float(args.alpha))
    run.log("Learning rate:", np.float(args.learning_rate))
    run.log("Gamma:", np.float(args.gamma))

    model = XGBClassifier(booster="gbtree",
                          objective="multi:softmax",
                          subsample=0.8,
                          tree_method="auto",
                          n_estimators=500,
                          max_depth=args.max_depth,
                          reg_alpha=args.alpha,
                          learning_rate=args.learning_rate,
                          gamma=args.gamma)
    model.fit(x_train, y_train)
    y_pred = model.predict_proba(x_test)
    auc = roc_auc_score(y_test,
                        y_pred,
                        average="weighted",
                        multi_class="ovr",
                        labels=model.classes_)

    os.makedirs("./outputs", exist_ok=True)
    joblib.dump(model, filename="./outputs/wine-quality-model.pkl")
    run.log("AUC_weighted", np.float(auc))
Example #5
import pandas as pd
# In the original dataset the feature values and the labels were split into
# separate files; here we combine them again into a single pandas DataFrame
original_data = pd.read_csv('data/train_values.csv')
original_data_labels = pd.read_csv('data/train_labels.csv')
original_data['rate_spread'] = original_data_labels['rate_spread']
print(f"{len(original_data)} total rows")
print(original_data.sample(10))


# %%
# Upload dataset to Azure
uncleaned_dataset_name = "UncleanedMortgageSpread"
print(f"Uploading uncleaned dataset to {uncleaned_dataset_name}...")
datastore = ws.get_default_datastore()
registered_set = TabularDatasetFactory.register_pandas_dataframe(original_data, datastore, uncleaned_dataset_name)
print("Done")

# %%
print("Loading cleaned data sets...")
import zipfile
import io
with zipfile.ZipFile("data/cleanedEngineeredData.zip", "r") as dataset_zip:
    engineered_data = pd.read_csv(io.BytesIO(dataset_zip.read("train_cleaned.csv")))


# %%
print(f"{len(engineered_data)} total rows")
print(engineered_data.columns)
print(engineered_data.sample(10))
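A natural follow-up cell would register the engineered frame alongside the uncleaned one; the dataset name below is an assumption.

# %%
# Hypothetical follow-up cell: register the engineered data the same way.
# The dataset name is illustrative, not from the original notebook.
engineered_dataset_name = "CleanedMortgageSpread"
print(f"Uploading engineered dataset to {engineered_dataset_name}...")
TabularDatasetFactory.register_pandas_dataframe(engineered_data, datastore, engineered_dataset_name)
print("Done")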
Example #6
def register_dataset_to_store(ws, df, name):
    # Register a pandas DataFrame as a tabular dataset in the workspace's default datastore
    datastore = Datastore.get_default(ws)
    TabularDatasetFactory.register_pandas_dataframe(df, datastore, name=name)


def create_DDoS_datasets(ws):
    dtypes = {
        'Src IP': 'category',
        'Src Port': 'uint16',
        'Dst IP': 'category',
        'Dst Port': 'uint16',
        'Protocol': 'category',
        'Flow Duration': 'uint32',
        'Tot Fwd Pkts': 'uint32',
        'Tot Bwd Pkts': 'uint32',
        'TotLen Fwd Pkts': 'float32',
        'TotLen Bwd Pkts': 'float32',
        'Fwd Pkt Len Max': 'float32',
        'Fwd Pkt Len Min': 'float32',
        'Fwd Pkt Len Mean': 'float32',
        'Fwd Pkt Len Std': 'float32',
        'Bwd Pkt Len Max': 'float32',
        'Bwd Pkt Len Min': 'float32',
        'Bwd Pkt Len Mean': 'float32',
        'Bwd Pkt Len Std': 'float32',
        'Flow Byts/s': 'float32',
        'Flow Pkts/s': 'float32',
        'Flow IAT Mean': 'float32',
        'Flow IAT Std': 'float32',
        'Flow IAT Max': 'float32',
        'Flow IAT Min': 'float32',
        'Fwd IAT Tot': 'float32',
        'Fwd IAT Mean': 'float32',
        'Fwd IAT Std': 'float32',
        'Fwd IAT Max': 'float32',
        'Fwd IAT Min': 'float32',
        'Bwd IAT Tot': 'float32',
        'Bwd IAT Mean': 'float32',
        'Bwd IAT Std': 'float32',
        'Bwd IAT Max': 'float32',
        'Bwd IAT Min': 'float32',
        'Fwd PSH Flags': 'category',
        'Bwd PSH Flags': 'category',
        'Fwd URG Flags': 'category',
        'Bwd URG Flags': 'category',
        'Fwd Header Len': 'uint32',
        'Bwd Header Len': 'uint32',
        'Fwd Pkts/s': 'float32',
        'Bwd Pkts/s': 'float32',
        'Pkt Len Min': 'float32',
        'Pkt Len Max': 'float32',
        'Pkt Len Mean': 'float32',
        'Pkt Len Std': 'float32',
        'Pkt Len Var': 'float32',
        'FIN Flag Cnt': 'category',
        'SYN Flag Cnt': 'category',
        'RST Flag Cnt': 'category',
        'PSH Flag Cnt': 'category',
        'ACK Flag Cnt': 'category',
        'URG Flag Cnt': 'category',
        'CWE Flag Count': 'category',
        'ECE Flag Cnt': 'category',
        'Down/Up Ratio': 'float32',
        'Pkt Size Avg': 'float32',
        'Fwd Seg Size Avg': 'float32',
        'Bwd Seg Size Avg': 'float32',
        'Fwd Byts/b Avg': 'uint32',
        'Fwd Pkts/b Avg': 'uint32',
        'Fwd Blk Rate Avg': 'uint32',
        'Bwd Byts/b Avg': 'uint32',
        'Bwd Pkts/b Avg': 'uint32',
        'Bwd Blk Rate Avg': 'uint32',
        'Subflow Fwd Pkts': 'uint32',
        'Subflow Fwd Byts': 'uint32',
        'Subflow Bwd Pkts': 'uint32',
        'Subflow Bwd Byts': 'uint32',
        'Init Fwd Win Byts': 'uint32',
        'Init Bwd Win Byts': 'uint32',
        'Fwd Act Data Pkts': 'uint32',
        'Fwd Seg Size Min': 'uint32',
        'Active Mean': 'float32',
        'Active Std': 'float32',
        'Active Max': 'float32',
        'Active Min': 'float32',
        'Idle Mean': 'float32',
        'Idle Std': 'float32',
        'Idle Max': 'float32',
        'Idle Min': 'float32',
        'Label': 'category'
    }

    data = pd.read_csv(
            './final_dataset.csv',
            parse_dates=['Timestamp'],
            usecols=[*dtypes.keys(), 'Timestamp'],
            engine='c',
            low_memory=True,
            na_values=np.inf
        )

    # The original dataset has over 12 million rows, which is far too much data
    # for this project, so randomly sample only 0.5% of it
    data = data.sample(frac=0.005)

    # Register Base Dataset in Workspace
    datastore = Datastore(ws)
    name = "DDoS Dataset"
    description_text = "DDoS DataSet for Udacity Capstone Project"
    dataset = TabularDatasetFactory.register_pandas_dataframe(data,
                               datastore,
                               name,
                               description=description_text)
    
    # Clean dataset and register the clean version
    cleaned_data = clean_data(data)
    
    clean_dataset_name = "Clean DDoS Dataset"
    clean_description_text = description_text + " that has been cleaned"
    clean_dataset = TabularDatasetFactory.register_pandas_dataframe(cleaned_data,
                               datastore,
                               clean_dataset_name,
                               description=clean_description_text)
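Finally, a minimal driver sketch for `create_DDoS_datasets`; the workspace connection, the local `./final_dataset.csv`, the `clean_data()` helper, and the later retrieval by name are assumptions.

# Hypothetical end-to-end driver; assumes azureml-core, numpy/pandas, a local
# ./final_dataset.csv, and a clean_data() implementation are available.
import numpy as np
import pandas as pd
from azureml.core import Dataset, Workspace
from azureml.core.datastore import Datastore
from azureml.data.dataset_factory import TabularDatasetFactory

ws = Workspace.from_config()
create_DDoS_datasets(ws)

# The registered datasets can later be retrieved by name
clean_ds = Dataset.get_by_name(ws, name="Clean DDoS Dataset")
print(clean_ds.to_pandas_dataframe().shape)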