def use_experiment_with_kmeans(file):
    TimingLogger.start('kmeans', 'kmeans')
    x = read_matrix(file)
    X = np.array(x)
    logging.info('n_clusters max is %d', len(x))
    warning_count = 0
    scores = []
    for n_clusters in range(2, len(x)):
        if warning_count > 20:
            break
        with warnings.catch_warnings():
            # Promote ConvergenceWarning to an exception so non-converging
            # cluster counts can be skipped and counted.
            warnings.filterwarnings('error')
            try:
                kmeans = KMeans(n_clusters=n_clusters, random_state=10)
                cluster_labels = kmeans.fit_predict(X)
                silhouette_avg = silhouette_score(X, cluster_labels)
                logging.info('For n_clusters = %d the average silhouette_score is: %s',
                             n_clusters, silhouette_avg)
                scores.append((n_clusters, silhouette_avg))
            except ConvergenceWarning:
                logging.warning('For n_clusters = %d: no convergence', n_clusters)
                warning_count += 1
    if scores:
        best = max(scores, key=lambda p: p[1])[0]
        logging.info('Best choice is %d clusters', best)
    TimingLogger.stop('kmeans')
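# Hypothetical usage sketch (the file name below is an assumption, not a path
# from this project): the sweep logs one average silhouette score per k and
# finishes by logging the k that maximised it.
#
#   use_experiment_with_kmeans('out/distance_matrix.csv')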
def generate_result_json_files(source_dir):
    print('Tester Include =', TESTER_INCLUDE)
    TimingLogger.start('results', 'results')
    if os.path.isdir(source_dir):
        workers = multiprocessing.cpu_count()
        with ProcessPoolExecutor(max_workers=workers) as executor:
            file_list = [os.path.join(source_dir, f)
                         for f in os.listdir(source_dir)
                         if os.path.isfile(os.path.join(source_dir, f))]
            chunks = split_in(workers, file_list)
            for chunk in chunks:
                executor.submit(_generate_result_json_files_parallel, chunk)
    TimingLogger.stop('results')
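# `split_in` is defined elsewhere in this project; a minimal sketch of the
# behaviour assumed here (partition a list into n roughly equal chunks, so
# that each worker process receives one submission):
#
#   def split_in(n, items):
#       size = max(1, math.ceil(len(items) / n))
#       return [items[i:i + size] for i in range(0, len(items), size)]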
def use_kmeans(file, n_clusters, source, destination):
    TimingLogger.start('konly', 'konly')
    print('Entered with', file, n_clusters, source, destination)
    if not os.path.isdir(source) or not os.path.isdir(destination):
        raise ValueError('source and destination must be directories')
    x = read_matrix(file)
    X = np.array(x)
    kmeans = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = kmeans.fit_predict(X)
    _copy_clusters(cluster_labels, source, destination)
    TimingLogger.stop('konly')
def our_files_together(source_dir, destination_dir):
    TimingLogger.start('together' + str(source_dir), 'together')
    source_path = os.path.join(*source_dir)
    if os.path.isdir(source_path):
        for dir_item in os.listdir(source_path):
            dir_item_path = os.path.join(*(source_dir + [dir_item]))
            if os.path.isfile(dir_item_path) and dir_item.endswith('.json'):
                print('Copying', dir_item_path)
                shutil.copyfile(
                    dir_item_path,
                    os.path.join(destination_dir,
                                 '___'.join(source_dir[1:] + [dir_item])))
            elif os.path.isdir(dir_item_path):
                our_files_together(source_dir + [dir_item], destination_dir)
    TimingLogger.stop('together' + str(source_dir))
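# Illustrative example (paths hypothetical): our_files_together(['out'], 'flat')
# copies out/a/b.json to flat/a___b.json -- source_dir is a list of path
# segments, and every segment below the root is flattened into the destination
# file name with '___'.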
def use_experiment_with_pyclustering_kmedoids(file):
    TimingLogger.start('pyclustering.kmedoids', 'kmedoids')
    x = read_matrix(file)
    X = np.array(x)
    clusters = len(x)
    search_instance = silhouette_ksearch(
        X, 2, clusters, algorithm=silhouette_ksearch_type.KMEDOIDS).process()
    scores = search_instance.get_scores()
    # scores[i - 2] assumes get_scores() is indexable as a sequence whose
    # first entry corresponds to k == 2.
    for i in range(2, len(scores)):
        logging.info('For n_clusters = %d the average silhouette_score is: %s',
                     i, scores[i - 2])
    logging.info('Best choice is %d clusters', search_instance.get_amount())
    TimingLogger.stop('pyclustering.kmedoids')
def use_kmedoids(file, n_clusters, source, destination):
    TimingLogger.start('konly.kmedoids', 'konly')
    x = read_matrix(file)
    X = np.array(x)
    initial_medoids = random.sample(range(0, len(x)), n_clusters)
    kmedoids_instance = kmedoids(X, initial_medoids)
    kmedoids_instance.process()
    clusters = kmedoids_instance.get_clusters()
    # pyclustering returns clusters as lists of point indices; flatten them
    # into per-point labels so _copy_clusters can be shared with the KMeans flow.
    cluster_labels = [None] * len(x)
    for cluster_index, cluster in enumerate(clusters):
        for i in cluster:
            cluster_labels[int(i)] = cluster_index
    _copy_clusters(cluster_labels, source, destination)
    TimingLogger.stop('konly.kmedoids')
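# Worked example of the label flattening above (illustrative values only):
# get_clusters() == [[0, 3], [1, 2]] maps to cluster_labels == [0, 1, 1, 0],
# i.e. the same per-point label format that KMeans.fit_predict returns.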
def build_matrix_flow(source_dir, output_file, algorithm):
    print('One Value =', USING_ONE_VALUE)
    TimingLogger.start('matrix', 'matrix')
    if os.path.isdir(source_dir):
        files = [os.path.join(source_dir, x) for x in os.listdir(source_dir)
                 if x.endswith('.result.json')]
        files = [x for x in files if os.path.isfile(x)]
        print('Building distance matrix...')
        distance_matrix = build_distance_matrix(files, algorithm)
        print('Preparing distance matrix results...')
        distance_matrix, columns_names = prepare_distance_matrix_results(
            distance_matrix, reverse=_isReverseAlg(algorithm))
        print('Preparing 2-dimensional distance matrix...')
        distance_matrix, columns_statuses = prepare_2_dimensional_distance_matrix(
            distance_matrix)
        # Keep only the columns whose status flag is False (i.e. not dropped).
        columns_names_aux = [name for name, status
                             in zip(columns_names, columns_statuses)
                             if not status]
        print('Saving as CSV...')
        _save_matrix_as_csv(distance_matrix, columns_names_aux, output_file)
    TimingLogger.stop('matrix')
def reduce_amount_of_redundant_results_by_force(files, destination):
    TimingLogger.start('reduce', 'reduce')
    results = sorted(
        files,
        key=lambda _f: put_processes_on_one_value(FileHelper(_f).munch)['__one__'])
    os.makedirs(os.path.join(destination, 'chosen'))
    os.makedirs(os.path.join(destination, 'grouped'))
    head = None
    head_dir = None
    head_content = None
    print('Reducing...')
    path_resolution = PathResolution(files)
    # Walk the sorted results: whenever the one-value content changes, the file
    # starts a new group (copied to both 'grouped' and 'chosen'); otherwise it
    # is a duplicate of the current head and only lands in the head's group dir.
    # Note: the loop starts at 0 so the first sorted result is not skipped.
    for i in range(0, len(results)):
        other_content = put_processes_on_one_value(
            FileHelper(results[i]).munch)['__one__']
        if head_content != other_content:
            head = results[i]
            head_content = other_content
            head_dir = os.path.join(destination, 'grouped',
                                    path_resolution.get_hash(head))
            os.makedirs(head_dir)
            shutil.copyfile(head, os.path.join(head_dir, os.path.basename(head)))
            shutil.copyfile(head, os.path.join(destination, 'chosen',
                                               os.path.basename(head)))
        else:
            shutil.copyfile(results[i],
                            os.path.join(head_dir, os.path.basename(results[i])))
        sys.stdout.write('\r%04d/%04d analyzed' % (i + 1, len(results)))
    print()
    TimingLogger.stop('reduce')
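# Illustrative example of the grouping (file names hypothetical): if the sorted
# one-value contents are A, A, B, then result 0 becomes the head of group A
# (copied to chosen/ and grouped/<hash>/), result 1 joins grouped/<hash>/ as a
# duplicate, and result 2 starts group B.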
parser.add_argument('ext')
parser.add_argument('year', type=int)
parser.add_argument('--extra-log', type=str, default=None)
args = parser.parse_args()

handlers = [logging.StreamHandler()]
if args.extra_log:
    handlers.append(logging.FileHandler(args.extra_log, 'a'))
logging.basicConfig(format='%(asctime)s %(message)s',
                    level=logging.INFO, handlers=handlers)

TimingLogger.start('file_utils', 'file_utils')

path = args.path
ext = args.ext
year = args.year
print('You entered with path', path, 'and ext', ext)

paths = []
if os.path.isdir(path):
    print('Your path', path, 'is a dir')
    paths = [
        os.path.join(path, x) for x in os.listdir(path)
        if os.path.isfile(os.path.join(path, x))
        and (ext == 'any' or x.endswith(ext))
        and not x.endswith('.tmp')
    ]
    print('Found', len(paths), 'files there ending with', ext)
else:
]
ensure_that_objects_dir_exists()

parser = argparse.ArgumentParser()
parser.add_argument('path')
parser.add_argument('ext')
parser.add_argument('--parser', type=str, default='traceback',
                    choices=['traceback', 'full'])
parser.add_argument('--start-from', type=str, default=None)
parser.add_argument('--logging', type=str, default=None)
parser.add_argument('--threads', type=int, default=1)
parser.add_argument('--extra-log', type=str, default=None)

TimingLogger.start('parsers', 'parsers')
args = parser.parse_args()

path = args.path
ext = args.ext
THREADS = args.threads
start_from = args.start_from
logging_file = args.logging
if logging_file:
    logging_handlers.append(logging.FileHandler(logging_file, 'w'))
if args.extra_log:
    logging_handlers.append(logging.FileHandler(args.extra_log, 'a'))
logging.basicConfig(level=logging.DEBUG, format=LOG_FORMAT,
                    handlers=logging_handlers)