Example #1
# NOTE: imports assumed; these modules ship with the mlep package and may
# live under subpackages (e.g. mlep.text.*), so adjust paths as needed.
import pdb
import io_utils
import time_utils
import w2vGoogleNews
import BatchedLocal
import PseudoJsonTweets
import kerasComplex


def main():

    io_utils.std_flush("Initialized at %s" %
                       time_utils.readable_time("%H:%M:%S"))
    _encoder = w2vGoogleNews.w2vGoogleNews()
    _encoder.setup()
    io_utils.std_flush("Set up encoder at %s" %
                       time_utils.readable_time("%H:%M:%S"))

    trainingData = BatchedLocal.BatchedLocal(
        data_source='./data/pure_new_dataset.json',
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    trainingData.load()
    io_utils.std_flush("Loaded training data at %s" %
                       time_utils.readable_time("%H:%M:%S"))

    X_train = _encoder.batchEncode(trainingData.getData())
    y_train = trainingData.getLabels()
    io_utils.std_flush("Batch encoded data at %s" %
                       time_utils.readable_time("%H:%M:%S"))

    model = kerasComplex.kerasComplex()
    io_utils.std_flush("Generated model at %s" %
                       time_utils.readable_time("%H:%M:%S"))

    io_utils.std_flush("Starting training at %s" %
                       time_utils.readable_time("%H:%M:%S"))
    precision, recall, score = model.fit_and_test(X_train, y_train)
    io_utils.std_flush(
        "Completed training with precision: %f\trecall: %f\tscore: %f" %
        (precision, recall, score))

    # debug hook left in place: drop into pdb to inspect the trained model
    pdb.set_trace()
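For context, a minimal scikit-learn sketch of how a fit_and_test-style helper could produce its (precision, recall, score) triple; the split ratio and the use of F1 as the "score" are assumptions, and kerasComplex's actual internals may differ.

# Hypothetical sketch only: names, split ratio, and the choice of F1 as
# "score" are assumptions, not kerasComplex internals.
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

def fit_and_test_sketch(model, X, y, test_size=0.3):
    X_fit, X_eval, y_fit, y_eval = train_test_split(X, y, test_size=test_size)
    model.fit(X_fit, y_fit)           # any estimator with fit/predict
    y_pred = model.predict(X_eval)
    return (precision_score(y_eval, y_pred),
            recall_score(y_eval, y_pred),
            f1_score(y_eval, y_pred))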
Example #2
# NOTE: imports assumed; adjust paths to your mlep package layout.
import BatchedLocal
import PseudoJsonTweets
import StreamLocal
import w2vGeneric


def main():

    # Load the initial training batch and set up the word2vec encoder
    traindata = BatchedLocal.BatchedLocal(
        data_source="./data/initialTrainingData.json",
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    traindata.load()
    _encoder = w2vGeneric.w2vGeneric()
    _encoder.setup(modelPath="w2v-wiki-wikipedia-5000.bin", trainMode="python")

    X_train = _encoder.batchEncode(traindata.getData())
    X_centroid = _encoder.getCentroid(X_train)

    # Stream the realistic 2013 data and track each sample's distance
    # from the training centroid
    load_data = StreamLocal.StreamLocal(
        data_source="./data/realisticStreamComb_2013_feb19.json",
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)

    while load_data.next():
        _data = load_data.getData()
        _encoded = _encoder.encode(_data)

        _distance = _encoder.getDistance(_encoded, X_centroid)

        print(_distance)
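To make the centroid/distance step concrete, here is a hedged numpy sketch; it assumes getCentroid is a mean vector and getDistance is cosine distance, which may not match w2vGeneric exactly.

# Hedged sketch of the centroid/distance step; assumes mean-vector centroid
# and cosine distance, which the real encoder may not use.
import numpy as np

def centroid(vectors):
    # mean embedding of the batch
    return np.mean(vectors, axis=0)

def cosine_distance(a, b):
    return 1.0 - np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

X = np.random.rand(100, 300)       # 100 fake 300-d embeddings
c = centroid(X)
print(cosine_distance(X[0], c))    # distance of one sample from the centroid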
Example #3
# NOTE: imports assumed; adjust paths to your mlep package layout.
import io_utils
import time_utils
import StreamLocal
import BatchedLocal
import PseudoJsonTweets
import MLEPModelDriftAdaptor


def main(experimentname):
    # results are appended to <experimentname>.txt
    f_write = open(experimentname + ".txt", "a")

    # set up the base config
    mlepConfig = io_utils.load_json("./MLEPServer.json")

    # update as per experiment requires
    mlepConfig["config"]["weight_method"] = "unweighted"
    mlepConfig["config"]["select_method"] = "recent"
    mlepConfig["config"]["filter_select"] = "nearest"

    # we are not updating internal timer...
    streamData = StreamLocal.StreamLocal(
        data_source="./data/realisticStreamComb_2013_feb19.json",
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)

    augmentation = BatchedLocal.BatchedLocal(
        data_source="./data/collectedIrrelevant.json",
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    augmentation.load_by_class()

    trainingData = BatchedLocal.BatchedLocal(
        data_source="./data/initialTrainingData.json",
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    trainingData.load()

    MLEPLearner = MLEPModelDriftAdaptor.MLEPModelDriftAdaptor(
        config_dict=mlepConfig)
    MLEPLearner.initialTrain(traindata=trainingData)
    io_utils.std_flush("Completed training at", time_utils.readable_time())
    MLEPLearner.addAugmentation(augmentation)
    io_utils.std_flush("Added augmentation at", time_utils.readable_time())

    totalCounter = 0
    implicit_mistakes = 0.0
    implicit_count = 0
    explicit_mistakes = 0.0
    explicit_count = 0
    implicit_error_rate = []
    explicit_error_rate = []
    while streamData.next():
        if streamData.getLabel() is None:
            classification = MLEPLearner.classify(streamData.getObject(),
                                                  classify_mode="implicit")
            if classification != streamData.getObject().getValue("true_label"):
                implicit_mistakes += 1.0
            implicit_count += 1
        else:
            classification = MLEPLearner.classify(streamData.getObject(),
                                                  classify_mode="explicit")
            if classification != streamData.getLabel():
                explicit_mistakes += 1.0
            explicit_count += 1

        totalCounter += 1

        if totalCounter % 100 == 0 and totalCounter > 0.0:
            # 2.00 is an out-of-range sentinel (true error rates lie in [0, 1]),
            # reported when a window contains no samples of that kind
            implicit_running_error = 2.00
            explicit_running_error = 2.00
            if implicit_count:
                implicit_running_error = implicit_mistakes / float(
                    implicit_count)
            if explicit_count:
                explicit_running_error = explicit_mistakes / float(
                    explicit_count)
            io_utils.std_flush(
                "Fin: %6i samples\t\texplicit error: %2.4f\t\t implicit error: %2.4f"
                %
                (totalCounter, explicit_running_error, implicit_running_error))
            implicit_error_rate.append(implicit_running_error)
            explicit_error_rate.append(explicit_running_error)
            implicit_mistakes = 0.0
            implicit_count = 0
            explicit_mistakes = 0.0
            explicit_count = 0
    f_write.write(experimentname + ",implicit," +
                  ",".join([str(item) for item in implicit_error_rate]) + "\n")
    f_write.write(experimentname + ",explicit," +
                  ",".join([str(item) for item in explicit_error_rate]) + "\n")
    f_write.close()
Example #4
# NOTE: imports assumed; adjust paths to your mlep package layout.
import sys
import mlflow
import io_utils
import time_utils
import StreamLocal
import BatchedLocal
import PseudoJsonTweets
import MLEPServer


def runExperiment(runname, mlepConfig, experiment_name, expstatuslog,
                  earlystop):

    # set up mlflow access
    # mlflow.set_tracking_uri -- not needed, defaults to mlruns
    # mlflow.create_experiment -- needs an experiment name; create one
    # programmatically or go by timestamp?
    # LOG_FILE and dumbwrite are assumed module-level definitions; see the
    # placeholder sketch after this example.
    if expstatuslog:
        sys.stdout = open(LOG_FILE, "w")
    else:
        sys.stdout = dumbwrite()

    mlflow.set_tracking_uri("mysql://*****:*****@127.0.0.1:3306/mlflow_runs")
    mlflow.start_run(run_name=runname)

    # Log relevant details
    for _key in mlepConfig["config"]:
        # drift_metrics is skipped: its value is not a flat parameter and
        # may fail to log
        if _key != "drift_metrics":
            mlflow.log_param(_key, mlepConfig["config"][_key])
    mlflow.log_param("experiment_name", experiment_name)

    internalTimer = 0
    streamData = StreamLocal.StreamLocal(
        data_source="data/2014_to_dec2018.json",
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)

    augmentation = BatchedLocal.BatchedLocal(
        data_source='data/collectedIrrelevant.json',
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    augmentation.load_by_class()

    trainingData = BatchedLocal.BatchedLocal(
        data_source='data/initialTrainingData.json',
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    trainingData.load()

    # Now we have the data
    MLEPLearner = MLEPServer.MLEPLearningServer(config_dict=mlepConfig,
                                                safe_mode=False)

    # Perform initial training
    MLEPLearner.initialTrain(traindata=trainingData)
    io_utils.std_flush("Completed training at", time_utils.readable_time())
    MLEPLearner.addAugmentation(augmentation)
    io_utils.std_flush("Added augmentation at", time_utils.readable_time())

    totalCounter = 0.0
    mistakes = []
    _earlystopcond = False

    while streamData.next() and not _earlystopcond:
        if internalTimer < streamData.getObject().getValue("timestamp"):
            internalTimer = streamData.getObject().getValue("timestamp")
            MLEPLearner.updateTime(internalTimer)

        classification = MLEPLearner.classify(streamData.getObject())

        totalCounter += 1.0
        if classification != streamData.getLabel():
            mistakes.append(1.0)
        else:
            mistakes.append(0.0)
        if totalCounter % 1000 == 0 and totalCounter > 0.0:
            io_utils.std_flush("Completed", int(totalCounter),
                               " samples, with running error (past 100) of",
                               sum(mistakes[-100:]) / 100.0)
        if earlystop and totalCounter == earlystop:
            _earlystopcond = True
        if totalCounter % 100 == 0 and totalCounter > 0.0:
            running_error = sum(mistakes[-100:]) / 100.0
            mlflow.log_metric("running_err" + str(int(totalCounter / 100)),
                              running_error)

    MLEPLearner.shutdown()

    io_utils.std_flush(
        "\n-----------------------------\nCOMPLETED\n-----------------------------\n"
    )

    mlflow.log_param("total_samples", totalCounter)
    if expstatuslog:
        mlflow.log_artifact(LOG_FILE)
    mlflow.log_param("run_complete", True)
    mlflow.end_run()

    if expstatuslog:
        sys.stdout.close()
        sys.stdout = sys.__stdout__
    else:
        sys.stdout = sys.__stdout__
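dumbwrite and LOG_FILE are referenced above but not defined in this excerpt; as flagged in the comments, here is a minimal placeholder sketch of what they might look like (a log path and a discard-everything stdout sink), purely as an assumption.

# Placeholder definitions; the originals are not shown in the excerpt.
LOG_FILE = "experiment_status.log"   # hypothetical path

class dumbwrite:
    """Write-sink that discards output, used to silence stdout."""
    def write(self, text):
        pass
    def flush(self):
        pass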
Example #5
import mlep.text.DataCharacteristics.OnlineSimilarityDistribution as OnlineSimilarityDistribution
import mlep.text.DataCharacteristics.CosineSimilarityDataCharacteristics as CosineSimilarityDataCharacteristics
import mlep.text.DataCharacteristics.L2NormDataCharacteristics as L2NormDataCharacteristics

# NOTE: additional imports assumed; adjust paths to your mlep package layout.
import w2vGoogleNews
import BatchedLocal
import PseudoJsonTweets
import MemoryTracker

import warnings
# warnings.filterwarnings(action="ignore", category=FutureWarning)

import collections

_encoder = w2vGoogleNews.w2vGoogleNews()
_encoder.setup()

# Load the initial training batch
trainingData = BatchedLocal.BatchedLocal(
    data_source="./data/initialTrainingData.json",
    data_mode="single",
    data_set_class=PseudoJsonTweets.PseudoJsonTweets)
trainingData.load()

X_train = _encoder.batchEncode(trainingData.getData())
X_centroid = _encoder.getCentroid(X_train)

nBins = 40   # histogram bin count for the distance distribution
alpha = 0.6  # unused in this excerpt

# Set up initial Distribution
#charac = CosineSimilarityDataCharacteristics.CosineSimilarityDataCharacteristics()
charac = L2NormDataCharacteristics.L2NormDataCharacteristics()
charac.buildDistribution(X_centroid, X_train)

driftWindowTracker = MemoryTracker.MemoryTracker()
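As a rough picture of what buildDistribution likely computes, a hedged numpy sketch: histogram the L2 distances of each training vector from the centroid into nBins bins. The real L2NormDataCharacteristics class may track more state than this.

# Hedged sketch of the distribution-building step; the actual class may
# compute more than a normalized histogram.
import numpy as np

def build_l2_distribution(centroid, X, n_bins=40):
    # distance of every embedding from the centroid, binned into a histogram
    distances = np.linalg.norm(X - centroid, axis=1)
    counts, edges = np.histogram(distances, bins=n_bins)
    return counts / counts.sum(), edges   # normalized histogram + bin edges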
Example #6
# NOTE: imports assumed; adjust paths to your mlep package layout.
import collections
import pdb

import mlep.text.DataCharacteristics.OnlineSimilarityDistribution as OnlineSimilarityDistribution
import w2vGoogleNews
import BatchedLocal
import StreamLocal
import PseudoJsonTweets
import KullbackLeibler
import MemoryTracker


def main():
    # update as per experiment requires
    # Checking Kullback Leibler...
    _encoder = w2vGoogleNews.w2vGoogleNews()
    _encoder.setup()

    # we are not updating internal timer...
    #streamData = StreamLocal.StreamLocal(data_source="./data/realisticStreamComb_2013_feb19.json", data_mode="single", data_set_class=PseudoJsonTweets.PseudoJsonTweets)

    trainingData = BatchedLocal.BatchedLocal(
        data_source="./data/initialTrainingData.json",
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    trainingData.load()

    X_train = _encoder.batchEncode(trainingData.getData())
    X_centroid = _encoder.getCentroid(X_train)

    nBins = 40
    # Set up initial Distribution
    initialDistribution = OnlineSimilarityDistribution.OnlineSimilarityDistribution(
        nBins)
    for _data in trainingData.getData():
        initialDistribution.update(
            _encoder.getDistance(_encoder.encode(_data), X_centroid))
    kullback = KullbackLeibler.KullbackLeibler(initialDistribution)

    driftWindowTracker = MemoryTracker.MemoryTracker()
    driftWindowTracker.addNewMemory(memory_name="kullback",
                                    memory_store='memory')
    """
    totalCounter = 0
    implicit_mistakes = 0.0
    implicit_count = 0
    explicit_mistakes = 0.0
    explicit_count = 0
    """
    raw_vals = [0]
    dqlen = 100.0
    windowed_raw = collections.deque([], int(dqlen))

    #streamData = StreamLocal.StreamLocal(data_source="./data/realisticStreamComb_2013_feb19.json", data_mode="single", data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    streamData = StreamLocal.StreamLocal(
        data_source="./data/realisticStreamComb_2013_feb19.json",
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    secondary_distribution = OnlineSimilarityDistribution.OnlineSimilarityDistribution(
        nBins)
    processLength = 0
    #genCount = 0
    #axv = []
    while streamData.next():
        processLength += 1
        # add to memory
        #driftWindowTracker.addToMemory("kullbak", streamData.getObject())

        # Perform drift detection (but just tracking for now)
        _encoded = _encoder.encode(streamData.getData())
        _distance = _encoder.getDistance(_encoded, X_centroid)
        secondary_distribution.update(_distance)
        raw_val = kullback.detect(_distance, secondary_distribution)
        windowed_raw.append(raw_val)
        # debugging/averaging alternatives left commented for reference:
        #if streamData.streamLength() > 3000:
        #    pdb.set_trace()
        #raw_vals.append((raw_vals[-1] + raw_val) / streamData.streamLength())
        raw_vals.append(raw_val)
        """
        driftWindowTracker.addToMemory(memory_name="kullback", data=streamData.getObject())
        if processLength>dqlen:
            if raw_vals[-1] > .02:
                genCount += 1

                print("processed ",streamData.streamLength(), " and detected drift:  ", str(raw_vals[-1]))
                # transfer memory, etc etc
                trainingData = driftWindowTracker.transferMemory("kullback")
                driftWindowTracker.clearMemory("kullback")
                X_train = _encoder.batchEncode(trainingData.getData())
                X_centroid = _encoder.getCentroid(X_train)
                # update distribution
                kullback.reset()
                secondary_distribution = OnlineSimilarityDistribution.OnlineSimilarityDistribution(nBins)
                processLength = 0
        """
        """
        driftWindowTracker.addToMemory(memory_name="kullback", data=streamData.getObject())
        if driftWindowTracker.memorySize("kullback") > 3000:
            #print("processed ",streamData.streamLength())
            # transfer memory, etc etc
            trainingData = driftWindowTracker.transferMemory("kullback")
            driftWindowTracker.clearMemory("kullback")
            X_train = _encoder.batchEncode(trainingData.getData())
            X_centroid = _encoder.getCentroid(X_train)
            # update distribution
            kullback.reset()
            secondary_distribution = OnlineSimilarityDistribution.OnlineSimilarityDistribution(nBins)
        """
    # debug hook left in place: inspect raw_vals and the distributions here
    pdb.set_trace()
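The kullback.detect call above compares the initial and streaming distance distributions. A minimal sketch of a KL-divergence signal over two binned distributions, assuming both are plain histograms; the actual KullbackLeibler class may smooth or update incrementally.

# Minimal KL-divergence sketch between two binned distributions; assumes
# plain histograms, unlike the incremental library class.
import numpy as np

def kl_divergence(p, q, eps=1e-10):
    p = np.asarray(p, dtype=float) + eps   # avoid log(0) and division by zero
    q = np.asarray(q, dtype=float) + eps
    p, q = p / p.sum(), q / q.sum()
    return float(np.sum(p * np.log(p / q)))

reference = np.array([5, 20, 40, 25, 10])   # toy binned reference distribution
observed = np.array([2, 10, 30, 35, 23])    # toy streaming window
print(kl_divergence(reference, observed))   # larger value => more drift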
Example #7
# NOTE: imports assumed; adjust paths to your mlep package layout.
import io_utils
import time_utils
import StreamLocal
import BatchedLocal
import PseudoJsonTweets
import MLEPDriftAdaptor


def main(experimentname, allow_explicit_drift, explicit_drift_class,
         explicit_drift_mode, explicit_update_mode, allow_unlabeled_drift,
         unlabeled_drift_class, unlabeled_drift_mode, unlabeled_update_mode,
         allow_update_schedule, update_schedule, schedule_update_mode,
         weight_method, select_method, filter_method, kval, update_prune):

    # Tracking URI -- not secure (inline credentials), but fine for local runs
    # mlflow.set_tracking_uri("mysql://*****:*****@127.0.0.1:3306/mlflow_runs")
    # Where to save data:
    # mlflow.start_run(run_name=experimentname)

    # We'll load the config file, make changes, and write a secondary file for experiments
    mlepConfig = io_utils.load_json('./MLEPServer.json')

    # Use the CLI option if it was provided, else keep the config file's value
    for _item in mlepConfig["config"]:
        try:
            # If the option was not given during launch, eval(_item) raises
            # NameError and the MLEPServer.json value is kept
            mlepConfig["config"][_item] = eval(_item)
        except NameError:
            pass

    internalTimer = 0
    streamData = StreamLocal.StreamLocal(
        data_source="data/realisticStreamComb_2013_feb19.json",
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)

    augmentation = BatchedLocal.BatchedLocal(
        data_source='data/collectedIrrelevant.json',
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    augmentation.load_by_class()

    trainingData = BatchedLocal.BatchedLocal(
        data_source='data/initialTrainingData.json',
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    trainingData.load()

    # Now we have the data
    MLEPLearner = MLEPDriftAdaptor.MLEPDriftAdaptor(config_dict=mlepConfig,
                                                    safe_mode=False)

    # Perform initial training
    MLEPLearner.initialTrain(traindata=trainingData)
    io_utils.std_flush("Completed training at", time_utils.readable_time())
    MLEPLearner.addAugmentation(augmentation)
    io_utils.std_flush("Added augmentation at", time_utils.readable_time())

    totalCounter = 0
    implicit_mistakes = 0.0
    implicit_count = 0
    explicit_mistakes = 0.0
    explicit_count = 0

    while streamData.next():
        if internalTimer < streamData.getObject().getValue("timestamp"):
            internalTimer = streamData.getObject().getValue("timestamp")
            MLEPLearner.updateTime(internalTimer)

        if streamData.getLabel() is None:
            classification = MLEPLearner.classify(streamData.getObject(),
                                                  "implicit")
            if classification != streamData.getObject().getValue("true_label"):
                implicit_mistakes += 1.0
            implicit_count += 1
        else:
            classification = MLEPLearner.classify(streamData.getObject(),
                                                  "explicit")
            if classification != streamData.getLabel():
                explicit_mistakes += 1.0
            explicit_count += 1
        totalCounter += 1

        if totalCounter % 100 == 0 and totalCounter > 0.0:
            # 2.00 is an out-of-range sentinel (true error rates lie in [0, 1]),
            # reported when a window contains no samples of that kind
            implicit_running_error = 2.00
            explicit_running_error = 2.00
            if implicit_count:
                implicit_running_error = implicit_mistakes / float(
                    implicit_count)
            if explicit_count:
                explicit_running_error = explicit_mistakes / float(
                    explicit_count)
            io_utils.std_flush(
                "Fin: %6i samples\t\texplicit error: %2.4f\t\t implicit error: %2.4f"
                %
                (totalCounter, explicit_running_error, implicit_running_error))

            implicit_mistakes = 0.0
            implicit_count = 0
            explicit_mistakes = 0.0
            explicit_count = 0

    MLEPLearner.shutdown()

    io_utils.std_flush(
        "\n-----------------------------\nCOMPLETED\n-----------------------------\n"
    )
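The eval-based override loop above works but couples parameter names to config keys at runtime; an eval-free alternative using an explicit dict (hypothetical helper, not part of MLEP) would look like this.

# Hypothetical eval-free alternative to the override loop above.
def apply_overrides(config, overrides):
    for key, value in overrides.items():
        if key in config and value is not None:
            config[key] = value   # only overwrite keys the caller supplied
    return config

# Usage sketch:
# mlepConfig["config"] = apply_overrides(
#     mlepConfig["config"], {"weight_method": weight_method, "kval": kval})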
Example #8
# NOTE: imports assumed; adjust paths to your mlep package layout.
import io_utils
import time_utils
import StreamLocal
import BatchedLocal
import PseudoJsonTweets
import MLEPServer


def main(experimentname, allow_explicit_drift, explicit_drift_class,
         explicit_drift_mode, explicit_update_mode, allow_unlabeled_drift,
         unlabeled_drift_class, unlabeled_drift_mode, unlabeled_update_mode,
         allow_update_schedule, update_schedule, schedule_update_mode,
         weight_method, select_method, filter_method, kval, update_prune):

    # Tracking URI -- not secure (inline credentials), but fine for local runs
    # mlflow.set_tracking_uri("mysql://*****:*****@127.0.0.1:3306/mlflow_runs")
    # Where to save data:
    # mlflow.start_run(run_name=experimentname)

    # We'll load the config file, make changes, and write a secondary file for experiments
    mlepConfig = io_utils.load_json('./MLEPServer.json')

    # Use the CLI option if it was provided, else keep the config file's value
    for _item in mlepConfig["config"]:
        try:
            mlepConfig["config"][_item] = eval(_item)
        except NameError:
            pass  # keep the MLEPServer.json value

    # Log relevant details
    """
    for _key in mlepConfig["config"]:
        # drift_metrics is skipped: not a flat parameter
        if _key != "drift_metrics":
            mlflow.log_param(_key, mlepConfig["config"][_key])
    """

    internalTimer = 0
    streamData = StreamLocal.StreamLocal(
        data_source="data/2014_to_dec2018.json",
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)

    augmentation = BatchedLocal.BatchedLocal(
        data_source='data/collectedIrrelevant.json',
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    augmentation.load_by_class()

    trainingData = BatchedLocal.BatchedLocal(
        data_source='data/initialTrainingData.json',
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    trainingData.load()

    # Now we have the data
    MLEPLearner = MLEPServer.MLEPLearningServer(config_dict=mlepConfig,
                                                safe_mode=False)

    # Perform initial training
    MLEPLearner.initialTrain(traindata=trainingData)
    io_utils.std_flush("Completed training at", time_utils.readable_time())
    MLEPLearner.addAugmentation(augmentation)
    io_utils.std_flush("Added augmentation at", time_utils.readable_time())

    totalCounter = 0.0
    mistakes = []
    while streamData.next():
        if internalTimer < streamData.getObject().getValue("timestamp"):
            internalTimer = streamData.getObject().getValue("timestamp")
            MLEPLearner.updateTime(internalTimer)

        classification = MLEPLearner.classify(streamData.getObject())
        totalCounter += 1.0
        if classification != streamData.getLabel():
            mistakes.append(1.0)
        else:
            mistakes.append(0.0)
        if totalCounter % 1000 == 0 and totalCounter > 0.0:
            io_utils.std_flush("Completed", int(totalCounter),
                               " samples, with running error (past 100) of",
                               sum(mistakes[-100:]) / 100.0)
        if totalCounter % 100 == 0 and totalCounter > 0.0:
            running_error = sum(mistakes[-100:]) / 100.0
            io_utils.std_flush("\tCompleted", int(totalCounter),
                               " samples, with running error (past 100) of",
                               running_error)
            #mlflow.log_metric("running_err"+str(int(totalCounter/100)), running_error)

    MLEPLearner.shutdown()

    io_utils.std_flush(
        "\n-----------------------------\nCOMPLETED\n-----------------------------\n"
    )