Example #1
    def clusterTestDataUsingMultipleClusterings(self, eventlog, testData):
        num_clusters = self.parameters["num_clusters"]
        eventAttributes = eventlog.data["attributes"]["event"]

        activities = eventlog.getActivityOccurrences(testData)
        for activityId, activity in activities.items():
            numEvents = len(activity["occ"])
            writeLog("Clustering %d test events for activity: %s (id: %s)" %
                     (numEvents, activity["name"], activityId))
            model = self.model.get(activityId)

            if numEvents < 2 or model is None:
                # Too few events or no trained model: assign everything to cluster 0.
                for e in activity["occ"]:
                    e.append(0)
                continue

            events = [None] * numEvents
            maxLen = len(eventAttributes) + 2
            for i, e in enumerate(activity["occ"]):
                events[i] = e[2:maxLen]

            df = pd.DataFrame(events,
                              columns=eventlog.data["attributes"]["event"])
            labels = self.predict(df, model, self.vectorizer[activityId],
                                  self.known_values[activityId])

            # Attach the predicted cluster label to each event occurrence.
            for e, label in zip(activity["occ"], labels):
                e.append(label)
Example #2
    def predict(self, df, model, vectorizer, known_values):
        threshold = self.parameters["ignore_values_threshold"] * len(df)
        if threshold > 0:
            for col in df.columns:
                if col in known_values:
                    writeLog("Replacing unusual values in column %s." % col)
                    isin = df[col].isin(known_values[col])
                    # ~ inverts the boolean mask; .loc on the frame avoids
                    # chained-assignment pitfalls.
                    df.loc[~isin, col] = OTHER_TOKEN

        writeLog("Vectorizing data frame of shape: %s" % (str(df.shape)))
        XX = vectorizer.transform(df.to_dict(orient='records'))
        alg = algorithms[self.algorithm]
        return alg["predict"](XX, model)
Example #3
    def setTrainingSize(self, parameters, pTraining):
        cases = np.asarray(self.data["cases"])

        maxNumCases = parameters["max_num_cases_in_training"]
        if maxNumCases is not None and maxNumCases < len(cases):
            writeLog("Sampling down to %d cases out of %d" %
                     (maxNumCases, len(cases)))
            # Use the seeded instance RNG so the sampling is reproducible.
            cases = self.rng.choice(cases, maxNumCases, replace=False)
            self.data["cases"] = cases

        nTraining = int(len(cases) * pTraining)
        indexes = self.rng.permutation(len(cases))
        self.trainingData = cases[indexes[:nTraining]]
        self.testData = cases[indexes[nTraining:]]
        self.initializeDerivedData()
Example #4
def testPaused(parameters):
    wasPaused = False
    while True:
        base = parameters["pause_filename"]
        # Look for the pause file as given, then under the input and output paths.
        candidates = [base, getInputPath() + base, getOutputPath() + base]
        filename = next((f for f in candidates if isFile(f)), None)
        if filename is None:
            break
        if not wasPaused:
            writeLog("Tests paused until file is removed: %s" % filename)
            wasPaused = True
        sleep(1)
    if wasPaused:
        writeLog("Tests continued...")
Example #5
    def performCrossValidatedTestsForFullEventLog(self):
        parameters = self.parameters
        nSplits = parameters["cross-validation-splits"]
        writeLog("Performing cross-validation using %d splits" % (nSplits))

        fullTestData = np.asarray(self.data["cases"])

        self.initializationReport()

        kf = KFold(n_splits=nSplits, random_state=self.rng, shuffle=True)
        cvRunIndex = 0
        for trainIndex, testIndex in kf.split(fullTestData):
            cvRunIndex += 1
            parameters["cross-validation-run"] = cvRunIndex
            self.performCrossValidationRun(fullTestData, trainIndex, testIndex,
                                           parameters)
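
A toy illustration of the KFold loop above, using only scikit-learn and numpy (values are illustrative):

from sklearn.model_selection import KFold
import numpy as np

cases = np.arange(10)
kf = KFold(n_splits=5, shuffle=True, random_state=0)
for trainIndex, testIndex in kf.split(cases):
    print(len(trainIndex), len(testIndex))  # 8 train / 2 test per fold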
Example #6
def train_hashvalue(df, parameters):
    hashes = {}
    nextHashId = 0
    labels = []
    for row in df:
        hashValue = hash(tuple(row))
        if hashValue in hashes:
            hashId = hashes[hashValue]
        else:
            # Start ids at 0 so they match the label universe returned below.
            hashId = hashes[hashValue] = nextHashId
            nextHashId += 1
        labels.append(hashId)
    writeLog(
        "Hashvalue clustering resulted in %d unique hash values for %d rows."
        % (len(hashes), len(labels)))
    return hashes, labels, list(range(nextHashId))
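
A quick behavioral sketch of train_hashvalue (assuming it and a writeLog stub are in scope): identical rows share a label, and the returned label universe matches the ids that were handed out:

rows = [("a", 1), ("b", 2), ("a", 1)]
hashes, labels, all_labels = train_hashvalue(rows, {})
# labels == [0, 1, 0]; all_labels == [0, 1]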
Example #7
    def initializeDerivedData(self, forSplittedEventLog=False):
        self.activities = {}
        self.activitiesByLabel = {}
        if ("activities" in self.data):
            for a in self.data["activities"]:
                self.activities[a["id"]] = {"name": a["name"], "occ": []}
                self.activitiesByLabel[a["name"].replace(" ", "_")] = a

            if (not forSplittedEventLog):
                writeLog("Initializing activity counts for %d cases" %
                         (len(self.data["cases"])))
                for c in self.data["cases"]:
                    counters = collections.Counter(t[0] for t in c["t"])
                    c["occ"] = [
                        counters[act["id"]] for act in self.data["activities"]
                    ]

        self.flows = {}
Example #8
    def trainForCaseClustering(self, eventlog, cases):
        if self.parameters["disable_case_attributes"] and self.parameters[
                "disable_raw_case_attributes"]:
            writeLog("Case clustering not needed. Skipping it.")
            for t in cases:
                t["_cluster"] = 0
            return
        writeLog("Clustering %d cases" % (len(cases)))
        #        num_clusters = self.parameters["num_clusters"]
        #        if (num_clusters <= 1):
        #            for t in cases:
        #                t["_cluster"] = 0
        #            return

        t0 = time()

        data = []
        cols = []
        (ica_clustering, iao_clustering, ica_filtering,
         iao_filtering) = self.getCaseFeatureGroupsToInclude()
        ica_cols = []
        iao_cols = []
        if ica_filtering:
            data += [c["a"] + c["occ"] for c in cases
                     ] if iao_filtering else [c["a"] for c in cases]
            ica_cols = ["A_" + a for a in eventlog.data["attributes"]["case"]]
            cols += ica_cols
        if iao_filtering:
            if not ica_filtering:
                data += [c["occ"] for c in cases]
            iao_cols = ["O_" + a["name"] for a in eventlog.data["activities"]]
            cols += iao_cols
        df = pd.DataFrame(data, columns=cols)
        self.known_values = self.filterUnusualValues(df, self.parameters)
        if (ica_filtering and (not ica_clustering)):
            df = df.drop(ica_cols, axis=1)
        if (iao_filtering and (not iao_clustering)):
            df = df.drop(iao_cols, axis=1)

        if ("Cost" in df.columns):
            df = df.drop(["Cost"], axis=1)
        if ("_cluster" in df.columns):
            df = df.drop(["_cluster"], axis=1)

        if not self.parameters["disable_case_attributes"]:
            self.model, self.vectorizer, labels = self.train(
                df, self.parameters)
            for i, d in enumerate(labels):
                cases[i]["_cluster"] = d
            writeLog("Case clustering done in %0.3fs" % (time() - t0))
        else:
            self.model = None
            self.vectorizer = None
            writeLog("Case data filtering done in %0.3fs" % (time() - t0))
Example #9
    def __init__(self,
                 parameters,
                 rng,
                 filename=None,
                 pTraining=0.0,
                 modelCluster=None,
                 inputJson=None):
        writeLog("Initializing event log")

        self.rng = rng
        self.parameters = dict(parameters)
        self.trainingData = []
        self.testData = []

        if inputJson is not None:
            self.data = json.loads(inputJson)
            self.filename = "unnamed"
            self.filepath = ""
        elif filename is not None:
            path = Path(filename)
            if (not path.is_file()):
                filename = getInputDatasetFilename(filename)
            self.filepath = filename
            self.filename = ntpath.basename(filename)
            with open(filename) as f:
                self.data = json.load(f)
        else:
            return

        self.pTraining = pTraining
        if pTraining is None:
            return

        if modelCluster is not None:
            model = modelCluster.models[0]
            if "activities" not in self.data:
                self.data["activities"] = model.eventlogActivities
            if "attributes" not in self.data:
                self.data["attributes"] = model.eventlogAttributes
        self.setTrainingSize(parameters, pTraining)
        self.initializationReport()
Example #10
    def __init__(self,
                 algorithm=None,
                 globalParameters=None,
                 parameters=None,
                 copyFrom=None):
        if copyFrom is not None:
            self.algorithm = copyFrom.algorithm
            self.parameters = dict(copyFrom.parameters)
        else:
            self.algorithm = algorithm
            if globalParameters is not None:
                self.parameters = dict(globalParameters)
                self.parameters.update(parameters)
            else:
                self.parameters = parameters
        writeLog("Creating new clustering object for algorithm: " +
                 self.algorithm)
        self.model = None
        self.vectorizer = None
        self.known_values = None
        self.labels = []
Example #11
def waitForConfiguration(origFilename, parameters):
    wasPaused = False
    filename = None
    while True:
        filename = origFilename
        if isFile(filename):
            break

        filename = getInputPath() + origFilename
        if isFile(filename):
            break

        filename = getOutputPath() + origFilename
        if isFile(filename):
            break

        if not wasPaused:
            writeLog(
                "Tests paused until a new configuration file appears in: %s" %
                origFilename)
            wasPaused = True
        sleep(1)
    if wasPaused:
        writeLog("Got new configuration. Continuing...")
    writeLog("Reading new configuration from %a" % filename)
    result = loadConfiguration(filename, parameters)
    os.remove(filename)
    return result
Example #12
def train_xmeans(df, parameters):
    # create object of X-Means algorithm that uses CCORE for processing
    # initial centers - optional parameter, if it is None, then random centers will be used by the algorithm.
    # let's avoid random initial centers and initialize them using K-Means++ method:
    max_num_clusters = parameters["max_num_clusters"]
    num_clusters = parameters["num_clusters"]
    initial_centers = kmeans_plusplus_initializer(
        df, min(df.shape[0], num_clusters)).initialize()
    xmeans_instance = xmeans(df,
                             initial_centers,
                             ccore=True,
                             kmax=max_num_clusters)

    # run cluster analysis
    xmeans_instance.process()

    # obtain results of clustering
    clusters = xmeans_instance.get_clusters()
    writeLog(
        "X-means clustered using %d clusters (init: %d, max: %d). Using that as the desired number of clusters for k-means."
        % (len(clusters), num_clusters, max_num_clusters))
    return do_train_kmeans(df, len(clusters), xmeans_instance.get_centers())
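
A minimal usage sketch for train_xmeans, assuming writeLog, do_train_kmeans (Example #17), and the pyclustering imports above are in scope; the data and parameter values are illustrative, and some pyclustering versions may prefer plain lists over numpy arrays:

import numpy as np

X = np.vstack([np.random.randn(30, 2), np.random.randn(30, 2) + 8])
model, labels, all_labels = train_xmeans(
    X, {"num_clusters": 2, "max_num_clusters": 10})
# X-means chooses the cluster count (typically 2 here); k-means then refines it.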
Example #13
    def performCrossValidationRun(self, fullTestData, trainIndex, testIndex,
                                  parameters):
        maxNumCases = parameters["max_num_cases_in_training"]
        cvRunIndex = parameters["cross-validation-run"]
        nSplits = parameters["cross-validation-splits"]

        writeLog("Starting cross-validation run %d of %d" %
                 (cvRunIndex, nSplits))

        if maxNumCases is not None and maxNumCases < len(trainIndex):
            writeLog("Sampling down to %d training cases out of %d" %
                     (maxNumCases, len(trainIndex)))
            # Use the seeded instance RNG so the sampling is reproducible.
            trainIndex = self.rng.choice(trainIndex,
                                         maxNumCases,
                                         replace=False)

        runEventLog = self.createEmptyCopy()

        runEventLog.data["cases"] = fullTestData[trainIndex]
        runEventLog.pTraining = parameters["test_data_percentage"]
        runEventLog.setTrainingSize(parameters, runEventLog.pTraining)
        runEventLog.initializationReport()

        m = ModelCluster(runEventLog.rng)
        m.initialize(
            parameters=parameters,
            case_clustering=Clustering(
                parameters["case_clustering_method"], parameters, {
                    "num_clusters":
                    parameters["num_case_clusters"],
                    "max_num_clusters":
                    parameters["max_num_case_clusters"],
                    "ignore_values_threshold":
                    parameters["ignore_values_threshold_for_case_attributes"]
                }),
            event_clustering=Clustering(
                parameters["event_clustering_method"], parameters, {
                    "num_clusters":
                    parameters["num_event_clusters"],
                    "max_num_clusters":
                    parameters["max_num_event_clusters"],
                    "ignore_values_threshold":
                    parameters["ignore_values_threshold_for_event_attributes"]
                }),
            rng=runEventLog.rng)
        trainResult = m.train(runEventLog)

        writeLog("Starting cross-validation test for run %d" % (cvRunIndex))

        runEventLog = self.createEmptyCopy()
        runEventLog.data["cases"] = fullTestData[testIndex]
        runEventLog.testData = fullTestData[testIndex]
        runEventLog.trainingData = []
        runEventLog.pTraining = 0.0
        runEventLog.initializeDerivedData()
        runEventLog.initializationReport()
        maxNumTraces = parameters.get("max_num_traces_in_testing")
        m.test(runEventLog, 1.0, trainResult, maxNumTraces)
Example #14
    def splitLog(self, eventlog, onlyTest=False):
        self.eventlog = eventlog
        true_k = len(self.models)
        if (true_k == 1):
            return [self.eventlog]

        t0 = time()
        result = [
            self.eventlog.createEmptyCopy(self.parameters)
            for model in self.models
        ]

        if (not onlyTest):
            cases = np.array([c["occ"] for c in self.eventlog.trainingData])
            df = pd.DataFrame(
                cases,
                columns=[a["name"] for a in self.eventlog.data["activities"]])

            self.caseClusterVectorizer = DictVectorizer(sparse=False)
            X = self.caseClusterVectorizer.fit_transform(
                df.to_dict(orient='records'))

            writeLog("Event log splitting done in %fs" % (time() - t0))
            writeLog("n_samples: %d, n_features: %d" % X.shape)

            # #############################################################################
            # Do the actual clustering
            #        if opts.minibatch:
            self.caseClusterModel = MiniBatchKMeans(n_clusters=true_k,
                                                    init='k-means++',
                                                    n_init=1,
                                                    init_size=1000,
                                                    batch_size=1000,
                                                    verbose=False)
            #        else:
            #            self.caseClusterModel = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
            #                        verbose=opts.verbose)

            #        writeLog("Clustering sparse data with %s" % self.caseClusterModel)
            t0 = time()
            x = self.caseClusterModel.fit(X)
            writeLog("done in %0.3fs" % (time() - t0))

            for i, d in enumerate(x.labels_):
                result[d].addTrace(self.eventlog.trainingData[i], True)

        cases = np.array([c["occ"] for c in self.eventlog.testData])
        df = pd.DataFrame(
            cases,
            columns=[a["name"] for a in self.eventlog.data["activities"]])
        XX = self.caseClusterVectorizer.transform(df.to_dict(orient='records'))
        x = self.caseClusterModel.predict(XX)
        for i, d in enumerate(x):
            result[d].addTrace(self.eventlog.testData[i], False)

        for eventlog in result:
            eventlog.initializeDerivedData(True)

        return result
Example #15
    def load(self, filename, parameters):
        path = Path(filename)
        if not path.is_file():
            filename = getOutputPath() + filename
        with open(filename, 'rb') as f:
            # https://groups.google.com/d/msg/lasagne-users/w8safJOJYvI/SvdiuIHIDQAJ
            saved = pickle.load(f)
        self.parameters = dict(parameters)
        self.parameters.update(saved["parameters"])
        self.caseClusterModel = saved["case_cluster_model"]
        self.caseClusterVectorizer = saved["case_cluster_vectorizer"]
        self.case_clustering = saved["case_clustering"]
        self.event_clustering = saved["event_clustering"]
        nn_params = saved["nn_params"]
        self.algorithm = nn_params["algorithm"]
        self.num_layers = nn_params["num_layers"]
        self.optimizer = nn_params["optimizer"]
        self.learning_rate = nn_params["learning_rate"]
        self.batch_size = nn_params["batch_size"]
        self.num_callbacks = nn_params["num_callbacks"]
        self.case_name = nn_params["case_name"]
        self.hidden_dim_size = nn_params["hidden_dim_size"]
        self.num_iterations_between_reports = nn_params[
            "num_iterations_between_reports"]
        self.grad_clipping = nn_params["grad_clipping"]
        self.predict_only_outcome = nn_params["predict_only_outcome"]
        self.final_trace_only = nn_params["final_trace_only"]
        self.max_num_words = nn_params["max_num_words"]
        self.trace_length_modifier = nn_params["trace_length_modifier"]
        self.truncate_unknowns = nn_params["truncate_unknowns"]
        self.num_models = nn_params["num_models"]
        self.models = []
        for i in range(self.num_models):
            writeLog("Loading model %d of %d" % (i + 1, self.num_models))
            model = Model(self.parameters)
            self.models.append(model)
            model.load(saved["saved_models"][i])
Example #16
    def train(self, eventlog):
        self.eventlogs = self.splitLog(eventlog)

        writeLog("Trace distribution by models:")
        trainDatasetSize = 0
        for i, eventlog in enumerate(self.eventlogs):
            writeLog(
                "Model #%d: Train: %d traces, Test: %d traces" %
                (i + 1, len(eventlog.trainingData), len(eventlog.testData)))
            trainDatasetSize += len(eventlog.trainingData) + len(
                eventlog.testData)

        tutrain = 0
        numSuccess = 0
        numFail = 0
        titu = 0
        litu = 0
        numEpochs = []
        ivs = []
        bestIterations = []
        for i, eventlog in enumerate(self.eventlogs):
            model = self.models[i]
            writeLog("Training model %d of %d" % (i + 1, len(self.eventlogs)))
            ns, ne, tu = model.train(eventlog)
            numEpochs.append(model.epoch)
            ivs.append(len(model.word_to_index))
            bestIterations.append(model.best_iteration)
            tutrain += tu
            numSuccess += ns
            numFail += ne
            titu += model.train_initialization_time_used
            litu += model.layer_initialization_time_used
        total = numSuccess + numFail
        # Success rate = successes over all attempts; guard against a zero total.
        srtrain = numSuccess / total if total > 0 else 0.0
        writeLog("Total time used in training: %d (success rate = %f)" %
                 (tutrain, srtrain))
        return {
            "success_rate": srtrain,
            "train_dataset_size": trainDatasetSize,
            "train_time_used": tutrain,
            "train_init_time_used": titu,
            "layer_init_time_used": litu,
            "num_epochs": np.mean(np.asarray(numEpochs)),
            "test_iterations": self.parameters["num_callbacks"],
            "input_vector_size": np.mean(ivs),
            "best_iteration": np.mean(bestIterations)
        }
Example #17
def do_train_kmeans(df, num_clusters, centers=None):
    if (df.shape[1] == 0) or (num_clusters < 2):
        writeLog(
            "No feature columns to cluster or fewer than two clusters "
            "requested. Returning constant labels.")
        model = None
        labels = len(df) * [0]
        return model, labels, [0]

    if centers is None:
        model = MiniBatchKMeans(n_clusters=num_clusters,
                                init='k-means++',
                                n_init=1,
                                init_size=1000,
                                batch_size=1000,
                                verbose=False)
    else:
        model = KMeans(n_clusters=num_clusters,
                       init=np.asarray(centers),
                       n_init=1,
                       max_iter=1)
    x = model.fit(df)
    writeLog("K-means model created for %d clusters." % (model.n_clusters))
    return model, x.labels_, list(range(model.n_clusters))
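
A small sanity check for do_train_kmeans on two well-separated blobs, assuming the sklearn imports used above (MiniBatchKMeans, KMeans) and writeLog are in scope:

import numpy as np

X = np.vstack([np.random.randn(20, 2), np.random.randn(20, 2) + 10])
model, labels, all_labels = do_train_kmeans(X, 2)
# Each of the 40 rows receives one of two labels; all_labels == [0, 1]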
Example #18
    def filterUnusualValues(self, df, parameters):
        writeLog("Number of colums to filter unusual values from %d" %
                 (len(df.columns)))

        t0 = time()

        threshold = parameters["ignore_values_threshold"] * len(df)
        known_values = {}
        for col in df.columns:
            writeLog(
                "Replacing unusual values in column '%s' with minimum usage of %d rows."
                % (col, threshold))
            vc = df[col].value_counts()
            toRemove = vc[vc <= threshold].index
            toKeep = vc[vc > threshold].index
            known_values[col] = toKeep
            writeLog(
                "Remaining known values: %s (removed %d of %d distinct values)"
                % (str(list(toKeep)), len(toRemove), len(vc)))
            if len(toRemove) > 0:
                # Assign back rather than mutating a column view in place.
                df[col] = df[col].replace(toRemove, OTHER_TOKEN)
        writeLog("Unusual value filtering done in %f s" % (time() - t0))
        return known_values
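
The value_counts/threshold pattern used above, reduced to a standalone pandas sketch (threshold and token values are illustrative):

import pandas as pd

OTHER_TOKEN = "__OTHER__"
s = pd.Series(["a", "a", "a", "b"])
threshold = 0.3 * len(s)           # ignore_values_threshold = 0.3
vc = s.value_counts()              # a: 3, b: 1
rare = vc[vc <= threshold].index   # ["b"]
s = s.replace(rare, OTHER_TOKEN)   # ["a", "a", "a", "__OTHER__"]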
Example #19
    def preProcessForTraining(self, parameters):
        disableDurations = parameters["disable_durations"]
        if not disableDurations:
            numEvents = 0
            writeLog("Pre-processing %d cases" % (len(self.trainingData)))
            for c in self.trainingData:
                prev = None
                prevDate = None
                evts = c["t"]
                numEvents += len(evts)
                for e in evts:
                    eDate = parse_date(e[1])
                    if prev is not None:
                        key = "%s->%s" % (prev[0], e[0])
                        if (key in self.flows):
                            flow = self.flows[key]
                        else:
                            flow = self.flows[key] = {"name": key, "occ": []}
                        delta = eDate - prevDate
                        flow["occ"].append(delta)
                    prevDate = eDate
                    prev = e

            writeLog(
                "Total number of events in training data: %d (Average case length: %f)"
                % (numEvents, (numEvents / len(self.trainingData))))
            writeLog("Pre-processing %d flows" % (len(self.flows)))
            for key in self.flows:
                f = self.flows[key]
                nOcc = len(f["occ"])
                f["occ"].sort()
                if nOcc > 0:
                    f["min"] = f["occ"][0]
                    f["max"] = f["occ"][-1]
                    f["avg"] = np.mean(f["occ"])
                    f["med"] = np.median(f["occ"])
                    f["perc10"] = np.percentile(f["occ"], 10)
                    f["perc25"] = np.percentile(f["occ"], 25)
                    f["perc75"] = np.percentile(f["occ"], 75)
                    f["perc90"] = np.percentile(f["occ"], 90)
                    f["diff"] = f["max"] - f["min"]
                    f["fast"] = f["perc10"]
                    f["slow"] = f["perc90"]
Example #20

started_tests_filename = default_parameters[
    "output_directory"] + "current-tests.json"

parameters = dict(default_parameters)
configuration = None
if opts.configuration_from_standard_input:
    writeLog("Reading configuration from standard input")
    jsonConfig = sys.stdin.readline()
    configuration = json.loads(jsonConfig)
    writeLog("Standard input reading finished")

if opts.configuration_filename is not None:
    # An explicit configuration file takes precedence over standard input.
    configuration = loadConfiguration(opts.configuration_filename, parameters)
configure(parameters["input_directory"], parameters["output_directory"],
          opts.log_to_file_only)

writeLog(__doc__)

if __name__ == '__main__':
    main(configuration, parameters)
Example #21
def main(configuration, parameters):
    def saveConfigs(testConfigs):
        jsonConfig = json.dumps(testConfigs)
        with open(started_tests_filename, "w") as f:
            f.write(jsonConfig)

    if configuration is not None:
        tests = []
        if Path(started_tests_filename).is_file():
            tsts = None
            with open(started_tests_filename) as data:
                tsts = json.load(data)
            for t in tsts:
                ts = dict(default_parameters)
                ts.update(t)
                tests.append(ts)
            writeLog("Loaded remaining %d test configurations from %s." %
                     (len(tests), started_tests_filename))
        else:
            if (not collect(configuration, default_parameters, tests)):
                writeLog("Exit requested. Finishing tests...")
                return

            saveConfigs(tests)
            writeLog("Generated %d test configurations." % (len(tests)))

        if opts.skip_tests > 0:
            tests = tests[opts.skip_tests:]
            saveConfigs(tests)
            writeLog(
                "Skipping the first %d test configurations, leaving %d tests remaining."
                % (opts.skip_tests, len(tests)))

        testPaused(parameters)
        nTests = len(tests)
        i = 1
        while (len(tests) > 0):
            writeLog("Starting test %d of %d." % (i, nTests))
            try:
                run(tests[0])
            except Exception:
                writeLog("Exception: " + traceback.format_exc())
            tests = tests[1:]
            saveConfigs(tests)
            testPaused(parameters)
            i = i + 1

        os.remove(started_tests_filename)

    if ("test_config_filename" in default_parameters) and (
            default_parameters["test_config_filename"] != None):
        parameters = dict(default_parameters)
        configuration = waitForConfiguration(
            parameters["test_config_filename"], parameters)
        main(configuration, parameters)
    writeLog("Tests finished.")
Example #22
    def test(self,
             eventlog,
             tracePercentage=1.0,
             trainResult=None,
             maxNumTraces=None):
        self.eventlogs = self.splitLog(eventlog, True)

        writeLog("Trace distribution by models:")
        for i, eventlog in enumerate(self.eventlogs):
            writeLog(
                "Model #%d: Train: %d cases, Test: %d cases" %
                (i + 1, len(eventlog.trainingData), len(eventlog.testData)))

        traces = []
        predictions = []
        probs = []
        numSuccess = 0

        t0 = time()
        for i, model in enumerate(self.models):
            writeLog("Testing model %d of %d" % (i + 1, len(self.eventlogs)))
            t, pred, prob, ns = model.test(self.eventlogs[i], tracePercentage,
                                           maxNumTraces)
            traces += t
            predictions += pred
            probs += prob
            numSuccess += ns

        tutest = (time() - t0)
        sr_test = numSuccess / len(predictions)
        writeLog("Success rate for test data: %d/%d (=%f%%)" %
                 (numSuccess, len(predictions), 100 * sr_test))

        train_success_rate = ""
        train_time_used = ""
        train_init_time_used = ""
        train_layer_init_time_used = ""
        num_epochs = ""
        test_iterations = ""
        train_dataset_size = 0
        if trainResult != None:
            train_success_rate = trainResult["success_rate"]
            train_time_used = trainResult["train_time_used"]
            train_init_time_used = trainResult["train_init_time_used"]
            train_layer_init_time_used = trainResult["layer_init_time_used"]
            train_dataset_size = trainResult["train_dataset_size"]
            num_epochs = trainResult["num_epochs"]
            test_iterations = trainResult["test_iterations"]
            train_input_vector_size = trainResult["input_vector_size"]
            train_best_iteration = trainResult["best_iteration"]

        writeTestResultRow([
            datetime.now().replace(microsecond=0).isoformat(), "ok-test",
            self.parameters["test_name"], self.case_name,
            self.parameters["dataset_name"] if
            (("dataset_name" in self.parameters) and
             (self.parameters["dataset_name"] != None)) else
            self.eventlog.filename, self.parameters["cross-validation-run"] if
            (("cross-validation-run" in self.parameters) and
             (self.parameters["cross-validation-run"] != None)) else "",
            train_dataset_size,
            len(traces),
            len(traces), self.algorithm, self.num_layers, self.hidden_dim_size,
            self.optimizer, self.learning_rate, "", train_input_vector_size,
            self.batch_size, self.grad_clipping,
            self.num_iterations_between_reports, train_best_iteration,
            test_iterations, "", num_epochs, train_init_time_used,
            train_layer_init_time_used, train_time_used, train_time_used,
            train_time_used, tutest, tutest, train_success_rate, sr_test, "",
            "", "", "", "", "", "", "", "", "", self.predict_only_outcome,
            self.final_trace_only, self.trace_length_modifier,
            self.num_iterations_between_reports *
            self.num_callbacks == 100000 * 50, self.max_num_words,
            self.truncate_unknowns,
            not self.parameters["disable_activity_labels"],
            not self.parameters["disable_durations"],
            not self.parameters["disable_event_attributes"],
            not self.parameters["disable_case_attributes"],
            not self.parameters["disable_raw_event_attributes"],
            not self.parameters["disable_raw_case_attributes"],
            self.parameters["predict_next_activity"],
            self.parameters["use_single_event_clustering"],
            self.parameters["duration_split_method"],
            self.parameters["case_clustering_method"],
            self.parameters["event_clustering_method"],
            self.parameters["case_clustering_include_activity_occurrences"],
            self.parameters["case_clustering_include_case_attributes"], self.
            parameters["include_activity_occurrences_as_raw_case_attributes"],
            self.parameters["use_single_value_for_duration"],
            self.parameters["max_num_case_clusters"],
            self.parameters["max_num_event_clusters"],
            self.parameters["ignore_values_threshold_for_case_attributes"],
            self.parameters["ignore_values_threshold_for_event_attributes"]
        ])

        writeLog("Collecting results...")
        result = {}
        for i, trace in enumerate(traces):
            pred = predictions[i]
            outcome = (pred[len(OUTCOME_SELECTION_TOKEN_PREFIX):]
                       if pred.startswith(OUTCOME_SELECTION_TOKEN_PREFIX)
                       else pred)
            result[trace.traceId] = {
                "outcome": outcome,
                "p": probs[i],
                "expected": trace.outcome if trace.outcome is not None else ""
            }
        return result
Example #23
    def __init__(self, rng):
        lasagne.random.set_rng(rng)
        writeLog("Creating new model cluster object")
Example #24
    def train(self, df, parameters):
        writeLog("Number of colums to cluster %d" % (len(df.columns)))

        t0 = time()

        vectorizer = DictVectorizer(sparse=False)
        writeLog("Vectorizing data frame of shape: %s" % (str(df.shape)))
        X = vectorizer.fit_transform(df.to_dict(orient='records'))

        writeLog("Data vectorization done in %fs" % (time() - t0))
        writeLog("n_samples: %d, n_features: %d" % X.shape)

        t0 = time()
        alg = algorithms[self.algorithm]

        # #############################################################################
        # Do the actual clustering
        if df.shape[0] < 2:
            writeLog("One row or less to cluster. Returning constant labels.")
            model = None
            labels = len(df) * [0]
            allLabels = [0]
        elif df.shape[1] == 0:
            writeLog(
                "No columns in the table to be clustered. Returning constant labels."
            )
            model = None
            labels = len(df) * [0]
            allLabels = [0]
        else:
            model, labels, allLabels = alg["train"](X, parameters)

        if (len(allLabels) > len(self.labels)):
            self.labels = allLabels

        writeLog("Clustering using %s done in %fs" %
                 (self.algorithm, time() - t0))
        return model, vectorizer, labels
Example #25
    def trainForEventClustering(self, eventlog, cases):
        if self.parameters["disable_event_attributes"] and self.parameters[
                "disable_raw_event_attributes"]:
            writeLog("Event clustering not needed. Skipping it.")
            for c in cases:
                for e in c["t"]:
                    e.append(0)
            return
        writeLog("Clustering events in %d cases" % (len(cases)))
        #        num_clusters = self.parameters["num_clusters"]
        #        if (num_clusters <= 1):
        #            for c in cases:
        #                for e in c["t"]:
        #                    e.append(0)
        #            return

        t0 = time()

        if (self.parameters["use_single_event_clustering"]):
            events = []
            for c in cases:
                for e in c["t"]:
                    events.append(["" if i == None else i for i in e[2:]])
            df = pd.DataFrame(events,
                              columns=eventlog.data["attributes"]["event"])
            known_values = self.filterUnusualValues(df, self.parameters)
            if not self.parameters["disable_event_attributes"]:
                model, vectorizer, labels = self.train(df, self.parameters)
                i = 0
                for c in cases:
                    for e in c["t"]:
                        e.append(labels[i])
                        i += 1
                self.vectorizer = {"primary": vectorizer}
                self.model = {"primary": model}
            else:
                model = None
                vectorizer = None
            self.known_values = {"primary": known_values}
        else:
            self.model = {}
            self.vectorizer = {}
            self.known_values = {}

            eventAttributes = eventlog.data["attributes"]["event"]

            activities = eventlog.getActivityOccurrences(cases)
            for activityId, activity in activities.items():
                t0 = time()
                writeLog("Clustering %d events for activity: %s (id: %s)" %
                         (len(activity["occ"]), activity["name"], activityId))

                events = [None] * len(activity["occ"])
                maxLen = len(eventAttributes) + 2
                for i, e in enumerate(activity["occ"]):
                    events[i] = e[2:maxLen]

                if len(events) < 1:
                    for e in activity["occ"]:
                        e.append(0)
                    continue

                df = pd.DataFrame(events,
                                  columns=eventlog.data["attributes"]["event"])
                self.known_values[activityId] = self.filterUnusualValues(
                    df, self.parameters)

                if not self.parameters["disable_event_attributes"]:
                    self.model[activityId], self.vectorizer[
                        activityId], labels = self.train(df, self.parameters)
                    i = 0
                    if not self.parameters["disable_event_attributes"]:
                        for e in activity["occ"]:
                            e.append(labels[i])
                            i += 1
                else:
                    self.model[activityId] = None
                    self.vectorizer[activityId] = None

        writeLog("Event clustering done in %0.3fs" % (time() - t0))
Example #26
    def convertTracesFromInputData(self, data, parameters,
                                   trace_length_modifier):
        writeLog("Converting %d cases into event traces." % (len(data)))

        enableDurations = not parameters["disable_durations"]
        splitDurationsInto5Buckets = parameters[
            "duration_split_method"] == "5-buckets"
        addOnlyFullTraceForFinisher = not parameters["predict_next_activity"]
        useSingleValueForDuration = parameters["use_single_value_for_duration"]
        includeActivityOccurrencesAsRawCaseAttributes = parameters[
            "include_activity_occurrences_as_raw_case_attributes"]
        disableEventAttributes = parameters["disable_event_attributes"]
        splitTracesToPrefixes = parameters["split_traces_to_prefixes"]
        minPrefixLength = parameters["min_splitted_trace_prefix_length"]
        maxTraceLength = parameters["max_trace_length"]

        result = []
        numFilteredCases = 0
        numFilteredTraces = 0
        for c in data:
            traces = []
            l = len(c["t"])
            finisherTraceFiltered = False

            if l > minPrefixLength:
                if splitTracesToPrefixes:
                    if (l > maxTraceLength):
                        numFilteredCases += 1
                        numFilteredTraces += l - maxTraceLength - minPrefixLength
                        l = maxTraceLength
                        finisherTraceFiltered = True
                    for i in range(minPrefixLength, l):
                        traces.append(c["t"][:i])
                else:
                    if (l > maxTraceLength):
                        numFilteredCases += 1
                        numFilteredTraces += 1
                        finisherTraceFiltered = True
                        traces.append(c["t"][:maxTraceLength])
                    else:
                        traces.append(c["t"])

            if len(traces) == 0:
                continue

            lastTrace = traces[-1]
            for trace in traces:
                sentence = []
                durations = []
                cAttributes = (c["a"] + c["occ"]
                               if includeActivityOccurrencesAsRawCaseAttributes
                               else c["a"])
                prev = None
                prevDate = None
                eAttributes = []
                for e in trace:
                    eDate = parse_date(e[1])
                    durationPart = DURATION_TOKEN_PREFIX + "normal"
                    dp = 0.5
                    if enableDurations and prev is not None:
                        key = "%s->%s" % (prev[0], e[0])
                        flow = self.flows.get(key)
                        delta = eDate - prevDate
                        if flow is not None and "slow" in flow:
                            if splitDurationsInto5Buckets:
                                if (delta > flow["perc90"]):
                                    durationPart = DURATION_TOKEN_PREFIX + "perc90"
                                    dp = 0.0
                                elif (delta > flow["perc75"]):
                                    durationPart = DURATION_TOKEN_PREFIX + "perc75"
                                    dp = 0.25
                                elif (delta > flow["perc25"]):
                                    durationPart = DURATION_TOKEN_PREFIX + "perc25"
                                    dp = 0.5
                                elif (delta > flow["perc10"]):
                                    durationPart = DURATION_TOKEN_PREFIX + "perc10"
                                    dp = 0.75
                                else:
                                    durationPart = DURATION_TOKEN_PREFIX + "perc0"
                                    dp = 1.0
                            else:
                                if (delta > flow["slow"]):
                                    durationPart = DURATION_TOKEN_PREFIX + "slow"
                                    dp = 0.0
                                elif (delta < flow["fast"]):
                                    durationPart = DURATION_TOKEN_PREFIX + "fast"
                                    dp = 1.0
                    actPart = self.activities[e[0]]["name"]
                    eAttributes += [
                        e[2:(len(e) - 1) if disableEventAttributes else -1]
                    ]
                    clusterPart = EVENT_ATTRIBUTE_TOKEN_PREFIX + str(e[-1])
                    sentence.append(durationPart + WORD_PART_SEPARATOR +
                                    actPart.replace(WORD_PART_SEPARATOR, "_") +
                                    WORD_PART_SEPARATOR + clusterPart)
                    if useSingleValueForDuration:
                        durations.append(dp)
                    prevDate = eDate
                    prev = e
                finisher = c["f"] if "f" in c else (
                    (trace == lastTrace) and (not finisherTraceFiltered))
                cluster = c["_cluster"] if ("_cluster" in c) else None
                if (not (addOnlyFullTraceForFinisher and finisher)):
                    result.append(
                        TraceData(c["n"], c.get("s"), "s" in c,
                                  cAttributes, eAttributes, cluster,
                                  sentence, durations, parameters,
                                  trace_length_modifier, self.model, False))
                if finisher:
                    result.append(
                        TraceData(c["n"] + "_f", c.get("s"), "s" in c,
                                  cAttributes, eAttributes, cluster,
                                  sentence, durations, parameters,
                                  trace_length_modifier, self.model, True))
        writeLog("Generated %d event traces out of %d cases." %
                 (len(result), len(data)))
        if numFilteredTraces > 0:
            writeLog(
                "Filtered %d traces in %d cases because they exceed the maximum allowed number of events (%d)"
                % (numFilteredTraces, numFilteredCases, maxTraceLength))
        return result
Example #27
def train_skmeans(df, parameters):
    #    num_clusters = parameters["num_clusters"]
    # create instance of Elbow method using K value from 1 to 10.
    #    kmin, kmax = 1, 20
    #    elbow_instance = elbow(df, kmin, kmax)
    # process input data and obtain results of analysis
    #    elbow_instance.process()
    #    num_clusters = elbow_instance.get_amount()   # most probable amount of clusters
    # https://datascience.stackexchange.com/questions/34187/kmeans-using-silhouette-score

    max_num_clusters = parameters["max_num_clusters"]
    Ks = range(2, min(max_num_clusters, len(df)) + 1)
    kms = [
        MiniBatchKMeans(n_clusters=i,
                        init='k-means++',
                        n_init=1,
                        init_size=1000,
                        batch_size=1000,
                        verbose=False) for i in Ks
    ]
    writeLog("Performing K-means for cluster sizes 2 - %d" %
             (min(max_num_clusters, len(df))))
    sil_coeff = []
    all_labels = []
    distance_matrix = None
    max_num_samples_training_cluster = parameters[
        "max_num_samples_training_cluster"]
    if len(df) > max_num_samples_training_cluster:
        writeLog(
            "The number of samples to be clustered (%d) exceeds the configured maximum of %d. Taking random sample of the configured maximum size."
            % (len(df), max_num_samples_training_cluster))
        traindf = df[np.random.choice(
            df.shape[0], max_num_samples_training_cluster, replace=False), :]
    else:
        traindf = df
    for i, km in enumerate(kms):
        x = km.fit(traindf)
        if (i == 0):
            distance_matrix = pairwise_distances(traindf, metric="euclidean")
        score = 0.0
        try:
            score = silhouette_score(distance_matrix,
                                     x.labels_,
                                     metric='precomputed')
            writeLog("silhouette_score for cluster size %d = %f" %
                     (km.n_clusters, score))
        except Exception:
            writeLog(
                "Unable to calculate silhouette_score for cluster size %d. Using %f."
                % (km.n_clusters, score))
        if len(traindf) < len(df):
            labels = km.predict(df)
        else:
            labels = x.labels_
        sil_coeff.append(score)
        all_labels.append(labels)
        if score >= 1.0:
            writeLog(
                "Maximum silhouette score reached. No need to consider any more clusters."
            )
            break
    max_index = np.asarray(sil_coeff).argmax(axis=0)
    model = kms[max_index]
    labels = all_labels[max_index]
    writeLog("Optimum number of clusters: " + str(model.n_clusters))
    return model, labels, list(range(model.n_clusters))
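
The silhouette-based model selection above, reduced to a standalone sketch: compute the pairwise distance matrix once, score each candidate cluster count against it, and keep the best (toy data; names are illustrative):

import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import pairwise_distances, silhouette_score

X = np.vstack([np.random.randn(15, 2), np.random.randn(15, 2) + 6])
D = pairwise_distances(X, metric="euclidean")
scores = {
    k: silhouette_score(D,
                        MiniBatchKMeans(n_clusters=k, n_init=1).fit(X).labels_,
                        metric="precomputed")
    for k in (2, 3, 4)
}
best_k = max(scores, key=scores.get)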
Example #28
def run(parameters):
    rng = np.random.RandomState(random_seed)

    writeLog("Running test using parameters: " + json.dumps(parameters))

    inputJson = None
    if (opts.input_data_from_standard_input):
        writeLog("Reading from standard input")
        inputJson = sys.stdin.readline()
        writeLog("Standard input reading finished")
        if (parameters["write_input_to_file"]):
            filename = get_filename(
                "testdata_", "%s_%s_%s" % (parameters["file_handle"], "", ""),
                "json")
            with open(filename, "w") as f:
                f.write(inputJson)

    if (parameters["model_filename"] != None):
        m = ModelCluster(rng)
        m.load(parameters["model_filename"], parameters)
        inputFilename = None if parameters[
            "test_filename"] == None else parameters["test_filename"]
        if (inputFilename != None):
            writeLog("Reading test data from file: " + inputFilename)
        el = EventLog(parameters,
                      rng,
                      inputFilename,
                      modelCluster=m,
                      inputJson=inputJson)
        jsonResult = "{}"
        if (len(el.testData) > 0):
            writeLog("Test set contains %d cases." % (len(el.testData)))
            result = m.test(el)
            jsonResult = json.dumps(result)
            filename = get_filename(
                "predict_result", "%s_%s_%s" %
                (parameters["file_handle"], m.case_name, m.eventlog.filename),
                "json")
            with open(filename, "w") as f:
                f.write(jsonResult)
            writeLog("Generated results saved into file: %s" % filename)
        else:
            writeLog("Test set is empty. No results created.")
        print(jsonResult)
    elif ((parameters["input_filename"] != None) or (inputJson != None)):
        if parameters["cross-validation-splits"] != None:
            EventLog.performCrossValidatedTests(parameters, inputJson, rng)
            return
        e = EventLog(parameters,
                     rng,
                     parameters["input_filename"],
                     parameters["test_data_percentage"],
                     inputJson=inputJson)
        m = ModelCluster(rng)
        m.initialize(
            parameters=parameters,
            case_clustering=Clustering(
                parameters["case_clustering_method"], parameters, {
                    "num_clusters":
                    parameters["num_case_clusters"],
                    "max_num_clusters":
                    parameters["max_num_case_clusters"],
                    "ignore_values_threshold":
                    parameters["ignore_values_threshold_for_case_attributes"]
                }),
            event_clustering=Clustering(
                parameters["event_clustering_method"], parameters, {
                    "num_clusters":
                    parameters["num_event_clusters"],
                    "max_num_clusters":
                    parameters["max_num_event_clusters"],
                    "ignore_values_threshold":
                    parameters["ignore_values_threshold_for_event_attributes"]
                }),
            rng=rng)
        trainResult = m.train(e)
        filename = m.save(parameters["file_handle"], parameters)
        writeLog("Generated model saved into file: %s" % filename)
        print(filename)

        if (parameters["test_filename"] != None):
            m = ModelCluster(rng)
            m.load(filename, parameters)
            el = EventLog(parameters,
                          rng,
                          parameters["test_filename"],
                          modelCluster=m)
            result = m.test(el, 1.0, trainResult)
            jsonResult = json.dumps(result)
            filename = get_filename(
                "predict_result", "%s_%s_%s" %
                (parameters["file_handle"], m.case_name, m.eventlog.filename),
                "json")
            with open(filename, "w") as f:
                f.write(jsonResult)
            writeLog("Generated results saved into file: %s" % filename)
            print(jsonResult)
Example #29
    def __init__(self, num_layers, algorithm, num_units, hidden_dim_size,
                 grad_clipping, optimizer, learning_rate):
        self.traces_train = []
        self.traces_test = []
        self.num_layers = num_layers
        self.algorithm = algorithm
        self.num_units = num_units
        self.hidden_dim_size = hidden_dim_size
        self.grad_clipping = grad_clipping
        self.optimizer = optimizer
        self.learning_rate = learning_rate
        writeLog("Preparing " + str(self.num_layers) +
                 " layers for algorithm: " + self.algorithm)

        # First, we build the network, starting with an input layer
        # Recurrent layers expect input of shape
        # (batch size, SEQ_LENGTH, num_features)
        mask_var = T.matrix('mask')

        l_in = lasagne.layers.InputLayer(shape=(None, None, num_units))
        l_mask = lasagne.layers.InputLayer((None, None), mask_var)
        self.l_layers = [l_in]

        # We now build the LSTM layer which takes l_in as the input layer
        # We clip the gradients at GRAD_CLIP to prevent the problem of exploding gradients.
        if (self.algorithm == "gru"):
            layerCreatorFunc = lambda parentLayer, isFirstLayer, isLastLayer: lasagne.layers.GRULayer(
                parentLayer,
                self.hidden_dim_size,
                grad_clipping=self.grad_clipping,
                mask_input=l_mask if isFirstLayer else None,
                only_return_final=isLastLayer)
        else:
            # All gates have initializers for the input-to-gate and hidden state-to-gate
            # weight matrices, the cell-to-gate weight vector, the bias vector, and the nonlinearity.
            # The convention is that gates use the standard sigmoid nonlinearity,
            # which is the default for the Gate class.
            #            gate_parameters = lasagne.layers.recurrent.Gate(
            #                W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(),
            #                b=lasagne.init.Constant(0.))
            #            cell_parameters = lasagne.layers.recurrent.Gate(
            #                W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(),
            #                # Setting W_cell to None denotes that no cell connection will be used.
            #                W_cell=None, b=lasagne.init.Constant(0.),
            #                # By convention, the cell nonlinearity is tanh in an LSTM.
            #                nonlinearity=lasagne.nonlinearities.tanh)

            layerCreatorFunc = lambda parentLayer, isFirstLayer, isLastLayer: lasagne.layers.LSTMLayer(
                parentLayer,
                self.hidden_dim_size,
                grad_clipping=self.grad_clipping,
                mask_input=l_mask if isFirstLayer else None,
                nonlinearity=lasagne.nonlinearities.tanh,
                # Here, we supply the gate parameters for each gate
                #                    ingate=gate_parameters, forgetgate=gate_parameters,
                #                    cell=cell_parameters, outgate=gate_parameters,
                # We'll learn the initialization and use gradient clipping
                only_return_final=isLastLayer)

        for layerId in range(self.num_layers):
            self.l_layers.append(
                layerCreatorFunc(self.l_layers[layerId], layerId == 0,
                                 layerId == self.num_layers - 1))

        # The output of the last recurrent layer, of shape (batch_size, N_HIDDEN),
        # is passed through the softmax nonlinearity to create a probability
        # distribution over the prediction. The output of this stage is
        # (batch_size, vocab_size).
        self.l_out = lasagne.layers.DenseLayer(
            self.l_layers[-1],
            num_units=num_units,
            W=lasagne.init.Normal(),
            nonlinearity=lasagne.nonlinearities.softmax)
        self.l_layers.append(self.l_out)

        # Theano tensor for the targets
        target_values = T.ivector('target_output')
        #!        target_var = T.matrix('target_output')

        # lasagne.layers.get_output produces a variable for the output of the net
        network_output = lasagne.layers.get_output(self.l_out)

        # https://github.com/Lasagne/Lasagne/blob/master/examples/recurrent.py
        # The network output will have shape (n_batch, 1); let's flatten to get a
        # 1-dimensional vector of predicted values
        #        predicted_values = network_output.flatten()

        #        flat_target_values = target_values.flatten()

        # Our cost will be mean-squared error
        #        cost = T.mean((predicted_values - flat_target_values)**2)
        #        cost = T.mean((network_output - target_values)**2)
        # The loss function is calculated as the mean of the (categorical) cross-entropy between the prediction and target.
        #!        cost = T.nnet.categorical_crossentropy(network_output,target_var).mean()
        cost = T.nnet.categorical_crossentropy(network_output,
                                               target_values).mean()

        # Retrieve all parameters from the network
        all_params = lasagne.layers.get_all_params(self.l_out, trainable=True)

        # Compute AdaGrad updates for training
        writeLog("Computing updates...")
        writeLog("Using optimizer: " + self.optimizer)
        if (self.optimizer == "sgd"):
            updates = lasagne.updates.sgd(cost, all_params, self.learning_rate)
        elif (self.optimizer == "adagrad"):
            updates = lasagne.updates.adagrad(cost, all_params,
                                              self.learning_rate)
        elif (self.optimizer == "adadelta"):
            updates = lasagne.updates.adagrad(cost, all_params,
                                              self.learning_rate, 0.95)
        elif (self.optimizer == "momentum"):
            updates = lasagne.updates.momentum(cost, all_params,
                                               self.learning_rate, 0.9)
        elif (self.optimizer == "nesterov_momentum"):
            updates = lasagne.updates.nesterov_momentum(
                cost, all_params, self.learning_rate, 0.9)
        elif (self.optimizer == "rmsprop"):
            updates = lasagne.updates.rmsprop(cost, all_params,
                                              self.learning_rate, 0.9)
        else:
            updates = lasagne.updates.adam(cost,
                                           all_params,
                                           self.learning_rate,
                                           beta1=0.9,
                                           beta2=0.999)

        # Theano functions for training and computing cost
        writeLog("Compiling train function...")
        self.train = theano.function(
            [l_in.input_var, target_values, l_mask.input_var],
            cost,
            updates=updates,
            allow_input_downcast=True)
        #!        self.train = theano.function([l_in.input_var, target_var, l_mask.input_var], cost, updates=updates, allow_input_downcast=True)
        writeLog("Compiling train cost computing function...")
        #        self.compute_cost = theano.function([l_in.input_var, target_values, l_mask.input_var], cost, allow_input_downcast=True)

        # To make a prediction, we need the probability distribution of the
        # next token given the state of the network and the input, so we
        # compile a function that computes it.
        writeLog("Compiling probabilities computing function...")
        self.propabilities = theano.function(
            [l_in.input_var, l_mask.input_var],
            network_output,
            allow_input_downcast=True)
Example #30
    def initializationReport(self):
        writeLog("Initialized event log %s" % (self.filename))
        writeLog("  # cases: %d (train: %d, test: %d)" %
                 (len(self.data["cases"]), len(self.trainingData),
                  len(self.testData)))
        writeLog("  # activities: %d" % (len(self.data["activities"])))
        writeLog("  # case attributes: %d" %
                 (len(self.data["attributes"]["case"])))
        writeLog("  # event attributes: %d" %
                 (len(self.data["attributes"]["event"])))
        if self.pTraining is not None:
            writeLog("  Training set percentage: %d" %
                     (int(self.pTraining * 100)))