Example #1
def main():
    # Encode the training tweets with the Google News word2vec encoder,
    # train a kerasComplex model, and report precision, recall, and score.

    io_utils.std_flush("Initialized at %s" %
                       time_utils.readable_time("%H:%M:%S"))
    _encoder = w2vGoogleNews.w2vGoogleNews()
    _encoder.setup()
    io_utils.std_flush("Set up encoder at %s" %
                       time_utils.readable_time("%H:%M:%S"))

    trainingData = BatchedLocal.BatchedLocal(
        data_source='./data/pure_new_dataset.json',
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    trainingData.load()
    io_utils.std_flush("Loaded training data at %s" %
                       time_utils.readable_time("%H:%M:%S"))

    X_train = _encoder.batchEncode(trainingData.getData())
    y_train = trainingData.getLabels()
    io_utils.std_flush("Batch encoded data at %s" %
                       time_utils.readable_time("%H:%M:%S"))

    model = kerasComplex.kerasComplex()
    io_utils.std_flush("Generated model at %s" %
                       time_utils.readable_time("%H:%M:%S"))

    io_utils.std_flush("Starting training at %s" %
                       time_utils.readable_time("%H:%M:%S"))
    precision, recall, score = model.fit_and_test(X_train, y_train)
    io_utils.std_flush(
        "Completed training with precision: %f\trecall: %f\tscore: %f" %
        (precision, recall, score))

    # Drop into the interactive debugger to inspect the trained model and results
    pdb.set_trace()
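
These examples rely on project-local modules (io_utils, time_utils, BatchedLocal, StreamLocal, PseudoJsonTweets, and the encoder and model wrappers) whose imports are not shown. A minimal sketch of two of the helpers, assuming io_utils.std_flush simply prints and flushes its arguments and time_utils.readable_time wraps time.strftime, might look like this:

import sys
import time


def std_flush(*args):
    # Print all arguments space-separated and flush immediately,
    # mirroring how the examples call io_utils.std_flush
    print(" ".join(str(arg) for arg in args))
    sys.stdout.flush()


def readable_time(fmt="%Y-%m-%d %H:%M:%S"):
    # Return the current local time formatted with the given pattern,
    # mirroring how the examples call time_utils.readable_time
    return time.strftime(fmt)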
Example #2
def main(experimentname):
    # Open the per-experiment results file in append mode
    f_write = open(experimentname + ".txt", "a")

    # set up the base config
    mlepConfig = io_utils.load_json("./MLEPServer.json")

    # update as per experiment requires
    mlepConfig["config"]["weight_method"] = "unweighted"
    mlepConfig["config"]["select_method"] = "recent"
    mlepConfig["config"]["filter_select"] = "nearest"

    # We are not updating the internal timer in this experiment
    streamData = StreamLocal.StreamLocal(
        data_source="./data/realisticStreamComb_2013_feb19.json",
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)

    augmentation = BatchedLocal.BatchedLocal(
        data_source="./data/collectedIrrelevant.json",
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    augmentation.load_by_class()

    trainingData = BatchedLocal.BatchedLocal(
        data_source="./data/initialTrainingData.json",
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    trainingData.load()

    MLEPLearner = MLEPModelDriftAdaptor.MLEPModelDriftAdaptor(
        config_dict=mlepConfig)
    MLEPLearner.initialTrain(traindata=trainingData)
    io_utils.std_flush("Completed training at", time_utils.readable_time())
    MLEPLearner.addAugmentation(augmentation)
    io_utils.std_flush("Added augmentation at", time_utils.readable_time())

    totalCounter = 0
    implicit_mistakes = 0.0
    implicit_count = 0
    explicit_mistakes = 0.0
    explicit_count = 0
    implicit_error_rate = []
    explicit_error_rate = []
    while streamData.next():
        if streamData.getLabel() is None:
            classification = MLEPLearner.classify(streamData.getObject(),
                                                  classify_mode="implicit")
            if classification != streamData.getObject().getValue("true_label"):
                implicit_mistakes += 1.0
            implicit_count += 1
        else:
            classification = MLEPLearner.classify(streamData.getObject(),
                                                  classify_mode="explicit")
            if classification != streamData.getLabel():
                explicit_mistakes += 1.0
            explicit_count += 1

        totalCounter += 1

        if totalCounter % 100 == 0 and totalCounter > 0:
            # 2.00 is a sentinel (true error rates are at most 1.0) meaning no
            # samples of that type were seen in this window
            implicit_running_error = 2.00
            explicit_running_error = 2.00
            if implicit_count:
                implicit_running_error = implicit_mistakes / float(
                    implicit_count)
            if explicit_count:
                explicit_running_error = explicit_mistakes / float(
                    explicit_count)
            io_utils.std_flush(
                "Fin: %6i samples\t\texplicit error: %2.4f\t\t implicit error: %2.4f"
                %
                (totalCounter, explicit_running_error, implicit_running_error))
            implicit_error_rate.append(implicit_running_error)
            explicit_error_rate.append(explicit_running_error)
            implicit_mistakes = 0.0
            implicit_count = 0
            explicit_mistakes = 0.0
            explicit_count = 0
    f_write.write(experimentname + ",implicit," +
                  ",".join([str(item) for item in implicit_error_rate]) + "\n")
    f_write.write(experimentname + ",explicit," +
                  ",".join([str(item) for item in explicit_error_rate]) + "\n")
    f_write.close()
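
This experiment appends two comma-separated rows to <experimentname>.txt, one with the implicit error rates and one with the explicit error rates. A small sketch for reading those rows back, assuming the layout produced by the f_write calls above (read_error_rates is a hypothetical helper, not part of the original code):

def read_error_rates(path):
    # Each row has the form: experimentname,<implicit|explicit>,err1,err2,...
    rates = {}
    with open(path) as fin:
        for line in fin:
            parts = line.strip().split(",")
            if len(parts) < 3:
                continue
            name, mode, values = parts[0], parts[1], parts[2:]
            rates.setdefault(name, {})[mode] = [float(v) for v in values]
    return rates


# For example: rates = read_error_rates("myexperiment.txt")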
Example #3
def main(runname, expstatslog, mlflowlog, earlystop):
    # If mlflow logging is disabled, swap in the no-op dumbflow stand-in
    if not mlflowlog:
        global mlflow
        mlflow = dumbflow()
    if expstatslog:
        exp_status_write = open(EXP_STATUS, "a")
    else:
        exp_status_write = sys.stdout

    exp_status_write.write("\n\n\n\n")
    exp_status_write.write("--------------------------")
    exp_status_write.write("  BEGINNING NEW EXECUTION (" + runname + ") AT " +
                           str(time_utils.readable_time("%Y-%m-%d %H:%M:%S")))
    exp_status_write.write("  ------------------------" + "\n\n")
    # We are tracking drift adaptivity, namely labeled (explicit) drift detection

    # Set up explicit drift detection params
    explicit_drift_param_grid = {
        "allow_explicit_drift": [(True, "ExpDr")],
        "explicit_drift_class": [("LabeledDriftDetector", "LDD")],
        "explicit_drift_mode": [("PageHinkley", "PageHinkley"),
                                ("ADWIN", "ADWIN"), ("EDDM", "EDDM"),
                                ("DDM", "DDM")],
        "explicit_update_mode": [("all", "A"), ("errors", "E")],
        "allow_unlabeled_drift": [(False, "")],
        "allow_update_schedule": [(False, "")],
        "weight_method": [("unweighted", "U"), ("performance", "P")],
        "select_method": [("recent", "RR"), ("recent-new", "RN"),
                          ("recent-updates", "RU")],
        "filter_method": [("no-filter", "F"), ("top-k", "T"),
                          ("nearest", "N")],
        "kval": [(5, "5"), (10, "10")]
    }
    explicit_drift_params = ParameterGrid(explicit_drift_param_grid)

    for param_set in explicit_drift_params:
        # Each param_set defines one experiment; skip the "all" update mode
        # so only the "errors" update mode is run
        if param_set["explicit_update_mode"][0] == "all":
            continue
        # Load up configuration file
        mlepConfig = io_utils.load_json('./MLEPServer.json')

        # Update config file and generate an experiment name
        experiment_name = ''
        for _param in param_set:
            if param_set[_param][1] != "":
                experiment_name += param_set[_param][1] + '-'
            mlepConfig["config"][_param] = param_set[_param][0]
        experiment_name = experiment_name[:-1]

        # Now we have the experiment name and config we can use for this run
        exp_status_write.write("--STATUS-- " + experiment_name + "   ")
        exp_status_write.flush()
        try:
            runExperiment(runname, mlepConfig, experiment_name, expstatslog,
                          earlystop)
            exp_status_write.write("SUCCESS\n")
        except Exception as e:
            exp_status_write.write("FAILED\n")
            exp_status_write.write(traceback.format_exc())
            exp_status_write.write(str(e))
            exp_status_write.write("\n")
            exp_status_write.flush()
            mlflow.end_run()
        exp_status_write.flush()

    exp_status_write.write("\n\n")
    exp_status_write.write("--------------------------")
    exp_status_write.write("  FINISHED EXECUTION OF (" + runname + ") AT " +
                           str(time_utils.readable_time("%Y-%m-%d %H:%M:%S")))
    exp_status_write.write("  ------------------------" + "\n\n")
    exp_status_write.close()
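
The loop above iterates over sklearn's ParameterGrid, which expands a dict of candidate values into every combination. Each entry here is a (value, short_label) pair: the value is written into the config while the label is concatenated into the experiment name. A small self-contained illustration of that pattern (the grid values below are illustrative, not the full grid used above):

from sklearn.model_selection import ParameterGrid

grid = {
    "weight_method": [("unweighted", "U"), ("performance", "P")],
    "kval": [(5, "5"), (10, "10")],
}

for param_set in ParameterGrid(grid):
    # First element of each pair goes into the config, second builds the name
    config = {key: pair[0] for key, pair in param_set.items()}
    label = "-".join(pair[1] for pair in param_set.values())
    print(label, config)
# Prints the four combinations, e.g. "5-U {'kval': 5, 'weight_method': 'unweighted'}"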
Example #4
def runExperiment(runname, mlepConfig, experiment_name, expstatuslog,
                  earlystop):

    # Set up mlflow access
    # mlflow.set_tracking_uri -- if unset, defaults to the local mlruns directory
    # mlflow.create_experiment -- not used here; runs are identified by run_name instead
    if expstatuslog:
        sys.stdout = open(LOG_FILE, "w")
    else:
        sys.stdout = dumbwrite()

    mlflow.set_tracking_uri("mysql://*****:*****@127.0.0.1:3306/mlflow_runs")
    mlflow.start_run(run_name=runname)

    # Log relevant details
    for _key in mlepConfig["config"]:
        # Skip drift_metrics, which may not log cleanly as a flat mlflow param
        if _key != "drift_metrics":
            mlflow.log_param(_key, mlepConfig["config"][_key])
    mlflow.log_param("experiment_name", experiment_name)

    internalTimer = 0
    streamData = StreamLocal.StreamLocal(
        data_source="data/2014_to_dec2018.json",
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)

    augmentation = BatchedLocal.BatchedLocal(
        data_source='data/collectedIrrelevant.json',
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    augmentation.load_by_class()

    trainingData = BatchedLocal.BatchedLocal(
        data_source='data/initialTrainingData.json',
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    trainingData.load()

    # Now we have the data
    MLEPLearner = MLEPServer.MLEPLearningServer(config_dict=mlepConfig,
                                                safe_mode=False)

    # Perform initial training
    MLEPLearner.initialTrain(traindata=trainingData)
    io_utils.std_flush("Completed training at", time_utils.readable_time())
    MLEPLearner.addAugmentation(augmentation)
    io_utils.std_flush("Added augmentation at", time_utils.readable_time())

    totalCounter = 0.0
    mistakes = []
    _earlystopcond = False

    while streamData.next() and not _earlystopcond:
        if internalTimer < streamData.getObject().getValue("timestamp"):
            internalTimer = streamData.getObject().getValue("timestamp")
            MLEPLearner.updateTime(internalTimer)

        classification = MLEPLearner.classify(streamData.getObject())

        totalCounter += 1.0
        if classification != streamData.getLabel():
            mistakes.append(1.0)
        else:
            mistakes.append(0.0)
        if totalCounter % 1000 == 0 and totalCounter > 0.0:
            io_utils.std_flush("Completed", int(totalCounter),
                               " samples, with running error (past 100) of",
                               sum(mistakes[-100:]) / 100.0)
        if earlystop and totalCounter == earlystop:
            _earlystopcond = True
        if totalCounter % 100 == 0 and totalCounter > 0.0:
            running_error = sum(mistakes[-100:]) / 100.0
            mlflow.log_metric("running_err" + str(int(totalCounter / 100)),
                              running_error)

    MLEPLearner.shutdown()

    io_utils.std_flush(
        "\n-----------------------------\nCOMPLETED\n-----------------------------\n"
    )

    mlflow.log_param("total_samples", totalCounter)
    if expstatuslog:
        mlflow.log_artifact(LOG_FILE)
    mlflow.log_param("run_complete", True)
    mlflow.end_run()

    if expstatuslog:
        sys.stdout.close()
    sys.stdout = sys.__stdout__
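
Example #3 swaps in dumbflow when mlflow logging is disabled, and this example redirects sys.stdout to dumbwrite when the per-experiment log is disabled; neither class is shown. A minimal sketch of no-op stand-ins, assuming they only need to absorb the calls made in these examples:

class dumbflow:
    # No-op stand-in for the mlflow module when tracking is disabled
    def set_tracking_uri(self, uri):
        pass

    def start_run(self, run_name=None):
        pass

    def log_param(self, key, value):
        pass

    def log_metric(self, key, value):
        pass

    def log_artifact(self, path):
        pass

    def end_run(self):
        pass


class dumbwrite:
    # No-op stand-in for sys.stdout when per-experiment logging is disabled
    def write(self, text):
        return len(text)

    def flush(self):
        pass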
Example #5
def main(experimentname, allow_explicit_drift, explicit_drift_class,
         explicit_drift_mode, explicit_update_mode, allow_unlabeled_drift,
         unlabeled_drift_class, unlabeled_drift_mode, unlabeled_update_mode,
         allow_update_schedule, update_schedule, schedule_update_mode,
         weight_method, select_method, filter_method, kval, update_prune):

    # Tracking URI -- note that the credentials are stored inline, which is not secure
    # mlflow.set_tracking_uri("mysql://*****:*****@127.0.0.1:3306/mlflow_runs")
    # Where to save data:
    # mlflow.start_run(run_name=experimentname)

    # We'll load the config file, make changes, and write a secondary file for experiments
    mlepConfig = io_utils.load_json('./MLEPServer.json')

    # Use the option from the launch arguments if given, else keep the value from the config file
    for _item in mlepConfig["config"]:
        try:
            # If the option was not given during launch, eval(_item) raises NameError
            mlepConfig["config"][_item] = eval(_item)
        except NameError:
            pass  # Fall back to the value already in MLEPServer.json

    internalTimer = 0
    streamData = StreamLocal.StreamLocal(
        data_source="data/realisticStreamComb_2013_feb19.json",
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)

    augmentation = BatchedLocal.BatchedLocal(
        data_source='data/collectedIrrelevant.json',
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    augmentation.load_by_class()

    trainingData = BatchedLocal.BatchedLocal(
        data_source='data/initialTrainingData.json',
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    trainingData.load()

    # Now we have the data
    MLEPLearner = MLEPDriftAdaptor.MLEPDriftAdaptor(config_dict=mlepConfig,
                                                    safe_mode=False)

    # Perform initial training
    MLEPLearner.initialTrain(traindata=trainingData)
    io_utils.std_flush("Completed training at", time_utils.readable_time())
    MLEPLearner.addAugmentation(augmentation)
    io_utils.std_flush("Added augmentation at", time_utils.readable_time())

    totalCounter = 0
    implicit_mistakes = 0.0
    implicit_count = 0
    explicit_mistakes = 0.0
    explicit_count = 0

    while streamData.next():
        if internalTimer < streamData.getObject().getValue("timestamp"):
            internalTimer = streamData.getObject().getValue("timestamp")
            MLEPLearner.updateTime(internalTimer)

        if streamData.getLabel() is None:
            # Unlabeled sample: classify in implicit mode and score against the hidden true label
            classification = MLEPLearner.classify(streamData.getObject(),
                                                  "implicit")
            if classification != streamData.getObject().getValue("true_label"):
                implicit_mistakes += 1.0
            implicit_count += 1
        else:
            # Labeled sample: classify in explicit mode and score against the provided label
            classification = MLEPLearner.classify(streamData.getObject(),
                                                  "explicit")
            if classification != streamData.getLabel():
                explicit_mistakes += 1.0
            explicit_count += 1
        totalCounter += 1

        if totalCounter % 100 == 0 and totalCounter > 0:
            # 2.00 is a sentinel (true error rates are at most 1.0) meaning no
            # samples of that type were seen in this window
            implicit_running_error = 2.00
            explicit_running_error = 2.00
            if implicit_count:
                implicit_running_error = implicit_mistakes / float(
                    implicit_count)
            if explicit_count:
                explicit_running_error = explicit_mistakes / float(
                    explicit_count)
            io_utils.std_flush(
                "Fin: %6i samples\t\texplicit error: %2.4f\t\t implicit error: %2.4f"
                %
                (totalCounter, explicit_running_error, implicit_running_error))

            implicit_mistakes = 0.0
            implicit_count = 0
            explicit_mistakes = 0.0
            explicit_count = 0

    MLEPLearner.shutdown()

    io_utils.std_flush(
        "\n-----------------------------\nCOMPLETED\n-----------------------------\n"
    )
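
The eval(_item) loop near the top of this example overrides each config entry with the keyword argument of the same name whenever one was supplied at launch. A sketch of the same override pattern without eval, assuming the launch options arrive as a dict such as vars(args) from argparse (apply_overrides is a hypothetical helper, not part of the original code):

def apply_overrides(config, cli_args):
    # Replace config values with launch-time values of the same name,
    # skipping options the caller did not supply (represented as None here)
    for key in config:
        if cli_args.get(key) is not None:
            config[key] = cli_args[key]
    return config


# For example: mlepConfig["config"] = apply_overrides(mlepConfig["config"], vars(args))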
Example #6
def main(experimentname, allow_explicit_drift, explicit_drift_class,
         explicit_drift_mode, explicit_update_mode, allow_unlabeled_drift,
         unlabeled_drift_class, unlabeled_drift_mode, unlabeled_update_mode,
         allow_update_schedule, update_schedule, schedule_update_mode,
         weight_method, select_method, filter_method, kval, update_prune):

    # Tracking URI -- note that the credentials are stored inline, which is not secure
    # mlflow.set_tracking_uri("mysql://*****:*****@127.0.0.1:3306/mlflow_runs")
    # Where to save data:
    # mlflow.start_run(run_name=experimentname)

    # We'll load the config file, make changes, and write a secondary file for experiments
    mlepConfig = io_utils.load_json('./MLEPServer.json')

    # Use the option from the launch arguments if given, else keep the value from the config file
    for _item in mlepConfig["config"]:
        try:
            mlepConfig["config"][_item] = eval(_item)
        except NameError:
            pass  # Fall back to the value already in MLEPServer.json

    # Log relevant details (disabled; mlflow is not used in this variant)
    """
    for _key in mlepConfig["config"]:
        # possible error
        if _key != "drift_metrics":
            mlflow.log_param(_key, mlepConfig["config"][_key])
    """

    internalTimer = 0
    streamData = StreamLocal.StreamLocal(
        data_source="data/2014_to_dec2018.json",
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)

    augmentation = BatchedLocal.BatchedLocal(
        data_source='data/collectedIrrelevant.json',
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    augmentation.load_by_class()

    trainingData = BatchedLocal.BatchedLocal(
        data_source='data/initialTrainingData.json',
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    trainingData.load()

    # Now we have the data
    MLEPLearner = MLEPServer.MLEPLearningServer(config_dict=mlepConfig,
                                                safe_mode=False)

    # Perform initial training
    MLEPLearner.initialTrain(traindata=trainingData)
    io_utils.std_flush("Completed training at", time_utils.readable_time())
    MLEPLearner.addAugmentation(augmentation)
    io_utils.std_flush("Added augmentation at", time_utils.readable_time())

    totalCounter = 0.0
    mistakes = []
    while streamData.next():
        if internalTimer < streamData.getObject().getValue("timestamp"):
            internalTimer = streamData.getObject().getValue("timestamp")
            MLEPLearner.updateTime(internalTimer)

        classification = MLEPLearner.classify(streamData.getObject())
        totalCounter += 1.0
        if classification != streamData.getLabel():
            mistakes.append(1.0)
        else:
            mistakes.append(0.0)
        if totalCounter % 1000 == 0 and totalCounter > 0.0:
            io_utils.std_flush("Completed", int(totalCounter),
                               " samples, with running error (past 100) of",
                               sum(mistakes[-100:]) / 100.0)
        if totalCounter % 100 == 0 and totalCounter > 0.0:
            running_error = sum(mistakes[-100:]) / 100.0
            io_utils.std_flush("\tCompleted", int(totalCounter),
                               " samples, with running error (past 100) of",
                               running_error)
            #mlflow.log_metric("running_err"+str(int(totalCounter/100)), running_error)

    MLEPLearner.shutdown()

    io_utils.std_flush(
        "\n-----------------------------\nCOMPLETED\n-----------------------------\n"
    )
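
The running error in this example is computed as sum(mistakes[-100:]) / 100.0 over a list that grows with the stream. A sketch of the same windowed error using collections.deque with a fixed maxlen, which keeps memory bounded on long streams (a possible refinement, not part of the original code):

from collections import deque

window = deque(maxlen=100)  # keeps only the most recent 100 outcomes


def record(window, mistake):
    # mistake is 1.0 for a misclassification and 0.0 otherwise
    window.append(mistake)


def running_error(window):
    # Error over the current window; an empty window reports 0.0
    return sum(window) / len(window) if window else 0.0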