# Shared imports for the experiment drivers in this section. The module paths
# are assumptions inferred from usage; adjust them to the project's actual
# package layout.
import sys
import collections
import pdb

import numpy as np
import mlflow

import BatchedLocal
import StreamLocal
import PseudoJsonTweets
import w2vGeneric
import w2vGoogleNews
import io_utils
import time_utils
import MLEPServer
import MLEPModelDriftAdaptor
import MLEPDriftAdaptor
import MemoryTracker
import L2NormDataCharacteristics
import OnlineSimilarityDistribution
import KullbackLeibler


def main():
    # Load the initial training batch and build its w2v representation.
    traindata = BatchedLocal.BatchedLocal(
        data_source="./data/initialTrainingData.json",
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    traindata.load()

    _encoder = w2vGeneric.w2vGeneric()
    _encoder.setup(modelPath="w2v-wiki-wikipedia-5000.bin", trainMode="python")

    X_train = _encoder.batchEncode(traindata.getData())
    X_centroid = _encoder.getCentroid(X_train)

    # Stream samples and print each one's distance from the training centroid.
    load_data = StreamLocal.StreamLocal(
        data_source="./data/realisticStreamComb_2013_feb19.json",
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    while load_data.next():
        _data = load_data.getData()
        _encoded = _encoder.encode(_data)
        _distance = _encoder.getDistance(_encoded, X_centroid)
        print(_distance)
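# --- Illustrative sketch (not part of the pipeline above) ---
# A minimal, self-contained example of the distance-to-centroid check the loop
# above performs, assuming the encoder produces numpy vectors and getDistance
# is cosine distance. Both assumptions are for illustration only.
import numpy as np

def cosine_distance(vec, centroid):
    """Cosine distance between a vector and a centroid: 1 - cos(theta)."""
    denom = np.linalg.norm(vec) * np.linalg.norm(centroid)
    if denom == 0.0:
        return 1.0  # treat zero vectors as maximally distant
    return 1.0 - float(np.dot(vec, centroid) / denom)

# Usage: distances near 0 mean the sample looks like the training data;
# distances near 1 suggest the sample sits far from the training centroid.
X_demo = np.random.rand(100, 50)        # stand-in for batchEncode output
centroid_demo = X_demo.mean(axis=0)     # stand-in for getCentroid
print(cosine_distance(X_demo[0], centroid_demo))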
def main(experimentname):
    # Results are appended to a per-experiment text file.
    f_write = open(experimentname + ".txt", "a")

    # Set up the base config, then update it as the experiment requires.
    mlepConfig = io_utils.load_json("./MLEPServer.json")
    mlepConfig["config"]["weight_method"] = "unweighted"
    mlepConfig["config"]["select_method"] = "recent"
    mlepConfig["config"]["filter_select"] = "nearest"

    # We are not updating the internal timer...
    streamData = StreamLocal.StreamLocal(
        data_source="./data/realisticStreamComb_2013_feb19.json",
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)

    augmentation = BatchedLocal.BatchedLocal(
        data_source="./data/collectedIrrelevant.json",
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    augmentation.load_by_class()

    trainingData = BatchedLocal.BatchedLocal(
        data_source="./data/initialTrainingData.json",
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    trainingData.load()

    MLEPLearner = MLEPModelDriftAdaptor.MLEPModelDriftAdaptor(
        config_dict=mlepConfig)
    MLEPLearner.initialTrain(traindata=trainingData)
    io_utils.std_flush("Completed training at", time_utils.readable_time())
    MLEPLearner.addAugmentation(augmentation)
    io_utils.std_flush("Added augmentation at", time_utils.readable_time())

    totalCounter = 0
    implicit_mistakes = 0.0
    implicit_count = 0
    explicit_mistakes = 0.0
    explicit_count = 0
    implicit_error_rate = []
    explicit_error_rate = []

    while streamData.next():
        if streamData.getLabel() is None:
            # Unlabeled sample: score against the embedded "true_label" field.
            classification = MLEPLearner.classify(streamData.getObject(),
                                                  classify_mode="implicit")
            if classification != streamData.getObject().getValue("true_label"):
                implicit_mistakes += 1.0
            implicit_count += 1
        else:
            classification = MLEPLearner.classify(streamData.getObject(),
                                                  classify_mode="explicit")
            if classification != streamData.getLabel():
                explicit_mistakes += 1.0
            explicit_count += 1
        totalCounter += 1

        if totalCounter % 100 == 0 and totalCounter > 0:
            # 2.00 is a sentinel meaning "no samples of this kind in the window".
            implicit_running_error = 2.00
            explicit_running_error = 2.00
            if implicit_count:
                implicit_running_error = implicit_mistakes / float(implicit_count)
            if explicit_count:
                explicit_running_error = explicit_mistakes / float(explicit_count)
            io_utils.std_flush(
                "Fin: %6i samples\t\texplicit error: %2.4f\t\t implicit error: %2.4f"
                % (totalCounter, explicit_running_error, implicit_running_error))
            implicit_error_rate.append(implicit_running_error)
            explicit_error_rate.append(explicit_running_error)
            # Reset the window.
            implicit_mistakes = 0.0
            implicit_count = 0
            explicit_mistakes = 0.0
            explicit_count = 0

    f_write.write(experimentname + ",implicit," +
                  ",".join([str(item) for item in implicit_error_rate]) + "\n")
    f_write.write(experimentname + ",explicit," +
                  ",".join([str(item) for item in explicit_error_rate]) + "\n")
    f_write.close()
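# --- Illustrative sketch (not part of the experiment above) ---
# The implicit/explicit bookkeeping above is a reset-every-100-samples window.
# A hypothetical helper that captures the same pattern, for clarity:

class WindowedErrorRate:
    """Tracks mistakes over a window, then resets when the rate is read."""
    SENTINEL = 2.00  # same sentinel as above: "no samples seen in this window"

    def __init__(self):
        self.mistakes = 0.0
        self.count = 0

    def record(self, prediction, truth):
        if prediction != truth:
            self.mistakes += 1.0
        self.count += 1

    def rate_and_reset(self):
        rate = self.mistakes / float(self.count) if self.count else self.SENTINEL
        self.mistakes, self.count = 0.0, 0
        return rate

# Usage: one tracker per label mode, read every 100 samples.
implicit_tracker = WindowedErrorRate()
implicit_tracker.record(prediction=1, truth=0)
print(implicit_tracker.rate_and_reset())  # 1.0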
def runExperiment(runname, mlepConfig, experiment_name, expstatuslog, earlystop):
    # Set up mlflow access.
    # mlflow.set_tracking_uri -- not needed, defaults to mlruns
    # mlflow.create_experiment -- needs an experiment name. Should I
    # programmatically create one, or go by timestamp?
    # LOG_FILE and dumbwrite (a no-op stdout sink) are assumed to be defined
    # elsewhere in this script.
    if expstatuslog:
        sys.stdout = open(LOG_FILE, "w")
    else:
        sys.stdout = dumbwrite()

    mlflow.set_tracking_uri("mysql://*****:*****@127.0.0.1:3306/mlflow_runs")
    mlflow.start_run(run_name=runname)

    # Log relevant config details; "drift_metrics" is skipped since logging it
    # may raise an error.
    for _key in mlepConfig["config"]:
        if _key != "drift_metrics":
            mlflow.log_param(_key, mlepConfig["config"][_key])
    mlflow.log_param("experiment_name", experiment_name)

    internalTimer = 0
    streamData = StreamLocal.StreamLocal(
        data_source="data/2014_to_dec2018.json",
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)

    augmentation = BatchedLocal.BatchedLocal(
        data_source='data/collectedIrrelevant.json',
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    augmentation.load_by_class()

    trainingData = BatchedLocal.BatchedLocal(
        data_source='data/initialTrainingData.json',
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    trainingData.load()

    # Now we have the data.
    MLEPLearner = MLEPServer.MLEPLearningServer(config_dict=mlepConfig,
                                                safe_mode=False)

    # Perform initial training.
    MLEPLearner.initialTrain(traindata=trainingData)
    io_utils.std_flush("Completed training at", time_utils.readable_time())
    MLEPLearner.addAugmentation(augmentation)
    io_utils.std_flush("Added augmentation at", time_utils.readable_time())

    totalCounter = 0.0
    mistakes = []
    _earlystopcond = False

    while streamData.next() and not _earlystopcond:
        if internalTimer < streamData.getObject().getValue("timestamp"):
            internalTimer = streamData.getObject().getValue("timestamp")
            MLEPLearner.updateTime(internalTimer)

        classification = MLEPLearner.classify(streamData.getObject())
        totalCounter += 1.0
        if classification != streamData.getLabel():
            mistakes.append(1.0)
        else:
            mistakes.append(0.0)

        if totalCounter % 1000 == 0 and totalCounter > 0.0:
            io_utils.std_flush("Completed", int(totalCounter),
                               " samples, with running error (past 100) of",
                               sum(mistakes[-100:]) / 100.0)
        if earlystop and totalCounter == earlystop:
            _earlystopcond = True
        if totalCounter % 100 == 0 and totalCounter > 0.0:
            running_error = sum(mistakes[-100:]) / 100.0
            mlflow.log_metric("running_err" + str(int(totalCounter / 100)),
                              running_error)

    MLEPLearner.shutdown()
    io_utils.std_flush(
        "\n-----------------------------\nCOMPLETED\n-----------------------------\n"
    )

    mlflow.log_param("total_samples", totalCounter)
    if expstatuslog:
        mlflow.log_artifact(LOG_FILE)
    mlflow.log_param("run_complete", True)
    mlflow.end_run()

    # Restore stdout regardless of the logging mode.
    if expstatuslog:
        sys.stdout.close()
    sys.stdout = sys.__stdout__
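# --- Illustrative sketch (not part of runExperiment above) ---
# The mlflow calls above need a reachable MySQL backend. For local testing,
# the same logging pattern works against a file store; this is a minimal,
# self-contained version using only documented mlflow APIs.
import mlflow

def demo_mlflow_logging():
    mlflow.set_tracking_uri("file:./mlruns")      # local file store instead of MySQL
    with mlflow.start_run(run_name="demo-run"):   # context manager ends the run
        mlflow.log_param("select_method", "recent")
        # Logging one metric name with a step index is an alternative to the
        # per-window key names ("running_err1", "running_err2", ...) used above.
        for step, err in enumerate([0.31, 0.27, 0.22]):
            mlflow.log_metric("running_err", err, step=step)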
# Fragment: continues the centroid setup above (_encoder, X_train, and
# X_centroid are assumed to be in scope from the preceding script).
nBins = 40
alpha = 0.6

# Set up initial distribution.
#charac = CosineSimilarityDataCharacteristics.CosineSimilarityDataCharacteristics()
charac = L2NormDataCharacteristics.L2NormDataCharacteristics()
charac.buildDistribution(X_centroid, X_train)

# Three in-memory stores; the names suggest general, core (near-centroid),
# and edge (far-from-centroid) zones.
driftWindowTracker = MemoryTracker.MemoryTracker()
driftWindowTracker.addNewMemory(memory_name="gen-mem", memory_store='memory')
driftWindowTracker.addNewMemory(memory_name="core-mem", memory_store='memory')
driftWindowTracker.addNewMemory(memory_name="edge-mem", memory_store='memory')

streamData = StreamLocal.StreamLocal(
    data_source="./data/realisticStreamComb_2013_feb19.json",
    data_mode="single",
    data_set_class=PseudoJsonTweets.PseudoJsonTweets)

# Running statistics for the edge zone.
edge_centroid = np.zeros(X_centroid.shape[0])
edge_sum = np.zeros(X_centroid.shape[0])
edge_seen = False

processLength = 0
zones = []
while streamData.next():
    processLength += 1
    # For each sample, check where it falls on the distribution.
    _encoded = _encoder.encode(streamData.getData())
    _distance = _encoder.getDistance(_encoded, X_centroid)
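# --- Illustrative sketch (not part of the fragment above) ---
# edge_sum and edge_centroid suggest a running mean over samples assigned to
# the edge zone. A minimal version of that incremental-centroid update,
# assuming encoded samples are 1-D numpy arrays (an assumption made here for
# illustration only):
import numpy as np

def update_centroid(running_sum, count, new_vec):
    """Incremental mean: centroid_n = (sum of the first n vectors) / n."""
    running_sum = running_sum + new_vec
    count += 1
    return running_sum, count, running_sum / count

s = np.zeros(3)
n = 0
for v in [np.array([1.0, 0.0, 0.0]), np.array([0.0, 1.0, 0.0])]:
    s, n, centroid = update_centroid(s, n, v)
print(centroid)  # [0.5 0.5 0. ]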
def main():
    # Update as per experiment requirements.
    # Checking Kullback-Leibler...
    _encoder = w2vGoogleNews.w2vGoogleNews()
    _encoder.setup()

    # We are not updating the internal timer...
    trainingData = BatchedLocal.BatchedLocal(
        data_source="./data/initialTrainingData.json",
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    trainingData.load()

    X_train = _encoder.batchEncode(trainingData.getData())
    X_centroid = _encoder.getCentroid(X_train)

    nBins = 40
    # Set up the initial distribution of distances to the training centroid.
    initialDistribution = OnlineSimilarityDistribution.OnlineSimilarityDistribution(nBins)
    for _data in trainingData.getData():
        initialDistribution.update(
            _encoder.getDistance(_encoder.encode(_data), X_centroid))

    kullback = KullbackLeibler.KullbackLeibler(initialDistribution)

    driftWindowTracker = MemoryTracker.MemoryTracker()
    driftWindowTracker.addNewMemory(memory_name="kullback", memory_store='memory')

    raw_vals = [0]
    dqlen = 100.0
    windowed_raw = collections.deque([], int(dqlen))

    streamData = StreamLocal.StreamLocal(
        data_source="./data/realisticStreamComb_2013_feb19.json",
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)

    secondary_distribution = OnlineSimilarityDistribution.OnlineSimilarityDistribution(nBins)
    processLength = 0
    while streamData.next():
        processLength += 1
        # Perform drift detection (just tracking the raw value for now).
        _encoded = _encoder.encode(streamData.getData())
        _distance = _encoder.getDistance(_encoded, X_centroid)
        secondary_distribution.update(_distance)
        raw_val = kullback.detect(_distance, secondary_distribution)
        windowed_raw.append(raw_val)
        raw_vals.append(raw_val)

        # Earlier variants of this loop, kept for reference: they transferred
        # memory and rebuilt the distributions once drift was detected or the
        # memory filled.
        """
        driftWindowTracker.addToMemory(memory_name="kullback", data=streamData.getObject())
        if processLength > dqlen:
            if raw_vals[-1] > .02:
                genCount += 1
                print("processed ", streamData.streamLength(),
                      " and detected drift: ", str(raw_vals[-1]))
                # transfer memory, etc.
                trainingData = driftWindowTracker.transferMemory("kullback")
                driftWindowTracker.clearMemory("kullback")
                X_train = _encoder.batchEncode(trainingData.getData())
                X_centroid = _encoder.getCentroid(X_train)
                # update distribution
                kullback.reset()
                secondary_distribution = OnlineSimilarityDistribution.OnlineSimilarityDistribution(nBins)
                processLength = 0
        """
        """
        driftWindowTracker.addToMemory(memory_name="kullback", data=streamData.getObject())
        if driftWindowTracker.memorySize("kullback") > 3000:
            # transfer memory, etc.
            trainingData = driftWindowTracker.transferMemory("kullback")
            driftWindowTracker.clearMemory("kullback")
            X_train = _encoder.batchEncode(trainingData.getData())
            X_centroid = _encoder.getCentroid(X_train)
            # update distribution
            kullback.reset()
            secondary_distribution = OnlineSimilarityDistribution.OnlineSimilarityDistribution(nBins)
        """

    pdb.set_trace()  # drop into the debugger to inspect raw_vals interactively
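# --- Illustrative sketch (not part of the experiment above) ---
# KullbackLeibler.detect above compares the training-time distance distribution
# against the streaming one. The underlying quantity is the discrete KL
# divergence between two binned distributions; a minimal numpy version follows
# (the smoothing constant is an assumption for illustration):
import numpy as np

def kl_divergence(p_counts, q_counts, eps=1e-9):
    """KL(P || Q) = sum_i p_i * log(p_i / q_i) over histogram bins."""
    p = np.asarray(p_counts, dtype=float) + eps
    q = np.asarray(q_counts, dtype=float) + eps
    p /= p.sum()
    q /= q.sum()
    return float(np.sum(p * np.log(p / q)))

# Identical histograms give ~0; diverging histograms give larger values.
print(kl_divergence([5, 3, 2], [5, 3, 2]))   # ~0.0
print(kl_divergence([5, 3, 2], [1, 1, 8]))   # > 0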
def main(experimentname, allow_explicit_drift, explicit_drift_class,
         explicit_drift_mode, explicit_update_mode, allow_unlabeled_drift,
         unlabeled_drift_class, unlabeled_drift_mode, unlabeled_update_mode,
         allow_update_schedule, update_schedule, schedule_update_mode,
         weight_method, select_method, filter_method, kval, update_prune):
    # Tracking URI -- yeah, it's not very secure, but w/e
    # mlflow.set_tracking_uri("mysql://*****:*****@127.0.0.1:3306/mlflow_runs")
    # Where to save data:
    # mlflow.start_run(run_name=experimentname)

    # Load the config file, make changes, and write a secondary file for experiments.
    mlepConfig = io_utils.load_json('./MLEPServer.json')

    # Use the option if given as an argument, else keep the one in the config.
    for _item in mlepConfig["config"]:
        try:
            # If the option was not given during launch, eval(_item) raises NameError.
            mlepConfig["config"][_item] = eval(_item)
        except NameError:
            pass  # keep the value from MLEPServer.json

    internalTimer = 0
    streamData = StreamLocal.StreamLocal(
        data_source="data/realisticStreamComb_2013_feb19.json",
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)

    augmentation = BatchedLocal.BatchedLocal(
        data_source='data/collectedIrrelevant.json',
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    augmentation.load_by_class()

    trainingData = BatchedLocal.BatchedLocal(
        data_source='data/initialTrainingData.json',
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    trainingData.load()

    # Now we have the data.
    MLEPLearner = MLEPDriftAdaptor.MLEPDriftAdaptor(config_dict=mlepConfig,
                                                    safe_mode=False)

    # Perform initial training.
    MLEPLearner.initialTrain(traindata=trainingData)
    io_utils.std_flush("Completed training at", time_utils.readable_time())
    MLEPLearner.addAugmentation(augmentation)
    io_utils.std_flush("Added augmentation at", time_utils.readable_time())

    totalCounter = 0
    implicit_mistakes = 0.0
    implicit_count = 0
    explicit_mistakes = 0.0
    explicit_count = 0

    while streamData.next():
        if internalTimer < streamData.getObject().getValue("timestamp"):
            internalTimer = streamData.getObject().getValue("timestamp")
            MLEPLearner.updateTime(internalTimer)

        if streamData.getLabel() is None:
            classification = MLEPLearner.classify(streamData.getObject(),
                                                  "implicit")
            if classification != streamData.getObject().getValue("true_label"):
                implicit_mistakes += 1.0
            implicit_count += 1
        else:
            classification = MLEPLearner.classify(streamData.getObject(),
                                                  "explicit")
            if classification != streamData.getLabel():
                explicit_mistakes += 1.0
            explicit_count += 1
        totalCounter += 1

        if totalCounter % 100 == 0 and totalCounter > 0:
            # 2.00 is a sentinel meaning "no samples of this kind in the window".
            implicit_running_error = 2.00
            explicit_running_error = 2.00
            if implicit_count:
                implicit_running_error = implicit_mistakes / float(implicit_count)
            if explicit_count:
                explicit_running_error = explicit_mistakes / float(explicit_count)
            io_utils.std_flush(
                "Fin: %6i samples\t\texplicit error: %2.4f\t\t implicit error: %2.4f"
                % (totalCounter, explicit_running_error, implicit_running_error))
            implicit_mistakes = 0.0
            implicit_count = 0
            explicit_mistakes = 0.0
            explicit_count = 0

    MLEPLearner.shutdown()
    io_utils.std_flush(
        "\n-----------------------------\nCOMPLETED\n-----------------------------\n"
    )
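# --- Illustrative sketch (not part of main above) ---
# The eval(_item) trick above resolves each config key against the function's
# same-named arguments. A hypothetical equivalent that avoids eval() by passing
# the overrides explicitly (here, None means "not given on the command line";
# note the eval version would instead keep the config value only when the name
# is undefined):

def override_config(config, **cli_args):
    """Overwrite config entries with same-named CLI arguments, when provided."""
    for key in config:
        if cli_args.get(key) is not None:
            config[key] = cli_args[key]
    return config

config_demo = {"weight_method": "performance", "select_method": "train"}
print(override_config(config_demo, weight_method="unweighted", select_method=None))
# {'weight_method': 'unweighted', 'select_method': 'train'}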
def main(experimentname, allow_explicit_drift, explicit_drift_class,
         explicit_drift_mode, explicit_update_mode, allow_unlabeled_drift,
         unlabeled_drift_class, unlabeled_drift_mode, unlabeled_update_mode,
         allow_update_schedule, update_schedule, schedule_update_mode,
         weight_method, select_method, filter_method, kval, update_prune):
    # Tracking URI -- yeah, it's not very secure, but w/e
    # mlflow.set_tracking_uri("mysql://*****:*****@127.0.0.1:3306/mlflow_runs")
    # Where to save data:
    # mlflow.start_run(run_name=experimentname)

    # Load the config file, make changes, and write a secondary file for experiments.
    mlepConfig = io_utils.load_json('./MLEPServer.json')
    for _item in mlepConfig["config"]:
        try:
            mlepConfig["config"][_item] = eval(_item)
        except NameError:
            pass

    # Log relevant details (disabled while mlflow is turned off).
    """
    for _key in mlepConfig["config"]:
        # possible error
        if _key != "drift_metrics":
            mlflow.log_param(_key, mlepConfig["config"][_key])
    """

    internalTimer = 0
    streamData = StreamLocal.StreamLocal(
        data_source="data/2014_to_dec2018.json",
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)

    augmentation = BatchedLocal.BatchedLocal(
        data_source='data/collectedIrrelevant.json',
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    augmentation.load_by_class()

    trainingData = BatchedLocal.BatchedLocal(
        data_source='data/initialTrainingData.json',
        data_mode="single",
        data_set_class=PseudoJsonTweets.PseudoJsonTweets)
    trainingData.load()

    # Now we have the data.
    MLEPLearner = MLEPServer.MLEPLearningServer(config_dict=mlepConfig,
                                                safe_mode=False)

    # Perform initial training.
    MLEPLearner.initialTrain(traindata=trainingData)
    io_utils.std_flush("Completed training at", time_utils.readable_time())
    MLEPLearner.addAugmentation(augmentation)
    io_utils.std_flush("Added augmentation at", time_utils.readable_time())

    totalCounter = 0.0
    mistakes = []
    while streamData.next():
        if internalTimer < streamData.getObject().getValue("timestamp"):
            internalTimer = streamData.getObject().getValue("timestamp")
            MLEPLearner.updateTime(internalTimer)

        classification = MLEPLearner.classify(streamData.getObject())
        totalCounter += 1.0
        if classification != streamData.getLabel():
            mistakes.append(1.0)
        else:
            mistakes.append(0.0)

        if totalCounter % 1000 == 0 and totalCounter > 0.0:
            io_utils.std_flush("Completed", int(totalCounter),
                               " samples, with running error (past 100) of",
                               sum(mistakes[-100:]) / 100.0)
        if totalCounter % 100 == 0 and totalCounter > 0.0:
            running_error = sum(mistakes[-100:]) / 100.0
            io_utils.std_flush("\tCompleted", int(totalCounter),
                               " samples, with running error (past 100) of",
                               running_error)
            #mlflow.log_metric("running_err" + str(int(totalCounter / 100)), running_error)

    MLEPLearner.shutdown()
    io_utils.std_flush(
        "\n-----------------------------\nCOMPLETED\n-----------------------------\n"
    )
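# --- Illustrative sketch (not part of main above) ---
# How a driver like main(...) might be wired to the command line. The flag
# names mirror a few of the function's parameters; argparse usage is standard,
# but this wiring is an assumption, not the project's actual entry point.
import argparse

def cli_demo(argv=None):
    parser = argparse.ArgumentParser(description="Run an MLEP drift experiment.")
    parser.add_argument("experimentname")
    parser.add_argument("--weight_method", default=None)
    parser.add_argument("--select_method", default=None)
    parser.add_argument("--kval", type=int, default=None)
    return vars(parser.parse_args(argv))

# Options left unset come back as None; the caller can then decide whether to
# forward them, letting the MLEPServer.json defaults win (see the eval/NameError
# pattern above).
print(cli_demo(["exp1", "--weight_method", "unweighted"]))
# {'experimentname': 'exp1', 'weight_method': 'unweighted', 'select_method': None, 'kval': None}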