# Down-sample every gzipped XES log found in <datadir>/<subdir>/l5000 to
# `nb_traces` randomly chosen traces (without replacement) and write the
# reduced log, under the same file name, to <datadir>/<subdir>/l1000.
factory = XFactory()
print(os.listdir(datadir))

for subdir in os.listdir(datadir):
    # Only sub-directories of datadir hold logs; skip stray files.
    if not os.path.isdir(os.path.join(datadir, subdir)):
        continue

    indir = os.path.join(datadir, subdir, 'l5000')
    outdir = os.path.join(datadir, subdir, 'l1000')
    # exist_ok: re-running the script must not crash on an existing folder.
    os.makedirs(outdir, exist_ok=True)

    for xlog_filepath in os.listdir(indir):
        if '.xes.gz' not in xlog_filepath:
            continue

        print('Processing {}'.format(xlog_filepath))

        # The names come from listing `indir`, so the file has to be opened
        # from `indir` as well (not from its parent directory).
        with open(os.path.join(indir, xlog_filepath), 'r') as f:
            xlog = XUniversalParser().parse(f)[0]

        assert isinstance(xlog, XLog)

        new_xlog = factory.create_log(xlog.get_attributes())

        # Sample trace positions rather than the log object itself:
        # np.random.choice requires a 1-D input and would otherwise try to
        # coerce the traces into a numpy array.
        # Raises ValueError when nb_traces exceeds the number of traces.
        chosen = np.random.choice(len(xlog), nb_traces, replace=False)

        # Only the first classifier of the source log is carried over.
        new_xlog.get_classifiers().append(xlog.get_classifiers()[0])

        for idx in chosen:
            new_xlog.append(xlog[int(idx)])

        with open(os.path.join(outdir, xlog_filepath), 'w') as f:
            XesXmlGZIPSerializer().serialize(new_xlog, f)
CONVERGENCE_TOLERANCE = 0.001
NUM_THREADS = 8

# NOTE(review): `n_jobs` was deprecated in scikit-learn 0.23 and removed in
# 1.0; drop the argument if this has to run on a current scikit-learn.
kmeans = KMeans(n_clusters=NUM_CLUSTERS,
                max_iter=MAX_ITERATIONS,
                init=INITIALIZE_CLUSTERS,
                tol=CONVERGENCE_TOLERANCE,
                n_jobs=NUM_THREADS)

# Fit the clusters on the vector representation of the log.
kmeans.fit(log_vector)

# Create one empty log per cluster, carrying over the original log's
# attributes, extensions, classifiers and global attributes.
new_logs = {}
for i in range(len(kmeans.cluster_centers_)):
    new_log = XFactory.create_log(log.get_attributes().clone())
    for elem in log.get_extensions():
        new_log.get_extensions().add(elem)
    # Populate through the getters: assigning `new_log.__classifiers` from
    # outside the class only creates a new, unrelated attribute and never
    # reaches the state the getters return.
    # Assumes the getters return mutable lists — TODO confirm for the two
    # global-attribute getters.
    new_log.get_classifiers().extend(log.get_classifiers())
    new_log.get_global_trace_attributes().extend(
        log.get_global_trace_attributes())
    new_log.get_global_event_attributes().extend(
        log.get_global_event_attributes())
    new_logs[str(i)] = new_log

# Distribute each trace to the log of its cluster. All points are
# predicted in one batch call instead of one call per trace.
for cluster, trace in zip(kmeans.predict(log_vector), log):
    new_logs[str(cluster)].append(trace)