Example No. 1
def use_experiment_with_kmeans(file):
    TimingLogger.start('kmeans', 'kmeans')
    x = read_matrix(file)
    X = np.array(x)
    logging.info("n_clusters max is " + str(len(x)))
    warnings_counting = 0
    scores = []
    for n_clusters in range(2, len(x)):
        if warnings_counting > 20:
            break
        with warnings.catch_warnings():
            warnings.filterwarnings('error')
            try:
                kmeans = KMeans(n_clusters=n_clusters, random_state=10)
                cluster_labels = kmeans.fit_predict(X)
                silhouette_avg = silhouette_score(X, cluster_labels)
                logging.info("For n_clusters = " + str(n_clusters) +
                             " The average silhouette_score is : " +
                             str(silhouette_avg))
                scores.append((n_clusters, silhouette_avg))
            except ConvergenceWarning:
                logging.warning("For n_clusters = " + str(n_clusters) +
                                " no convergence")
                warnings_counting += 1
    if len(scores) > 0:
        logging.info(
            'Better choice is ' +
            str(sorted(scores, key=lambda p: p[1], reverse=True)[0][0]) +
            ' clusters')
    TimingLogger.stop('kmeans')
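The loop above promotes every warning to an error but only handles ConvergenceWarning, so any other warning raised by scikit-learn would abort the run. A minimal sketch of the same idea with the filter narrowed to ConvergenceWarning only (illustrative, not part of the original module):

import warnings

from sklearn.cluster import KMeans
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import silhouette_score

def silhouette_for_k(X, n_clusters):
    # Score a single k; return None when KMeans reports a convergence problem.
    with warnings.catch_warnings():
        # Promote only ConvergenceWarning to an exception; other warnings pass through.
        warnings.filterwarnings('error', category=ConvergenceWarning)
        try:
            labels = KMeans(n_clusters=n_clusters, random_state=10).fit_predict(X)
            return silhouette_score(X, labels)
        except ConvergenceWarning:
            return None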
Example No. 2
def generate_result_json_files(source_dir):
    print('Tester Include =', TESTER_INCLUDE)
    TimingLogger.start('results', 'results')
    if os.path.isdir(source_dir):
        workers = multiprocessing.cpu_count()
        with ProcessPoolExecutor(max_workers=workers) as executor:
            file_list = [os.path.join(source_dir, f) for f in os.listdir(source_dir) if os.path.isfile(os.path.join(source_dir, f))]
            chunks = split_in(workers, file_list)
            for chunk in chunks:
                executor.submit(_generate_result_json_files_parallel, chunk)
    TimingLogger.stop('results')
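split_in is defined elsewhere in the project and is not shown in this excerpt; assuming it only partitions a list into n roughly equal chunks, a stand-in with the same signature might look like this (hypothetical sketch):

import math

def split_in(n, items):
    # Hypothetical helper: partition `items` into at most `n` roughly equal slices.
    size = max(1, math.ceil(len(items) / n))
    return [items[i:i + size] for i in range(0, len(items), size)]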
Example No. 3
def use_kmeans(file, n_clusters, source, destination):
    TimingLogger.start('konly', 'konly')
    print('Entered with', file, n_clusters, source, destination)
    if not os.path.isdir(source) or not os.path.isdir(destination):
        raise ValueError('source and destination must be directories')
    x = read_matrix(file)
    X = np.array(x)
    kmeans = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = kmeans.fit_predict(X)
    _copy_clusters(cluster_labels, source, destination)
    TimingLogger.stop('konly')
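A hedged usage sketch (the file name, cluster count and directories below are placeholders, not values from the original project):

# Cluster the vectors read from matrix.csv into 8 groups and copy the files of
# each cluster from ./results into ./clustered.
use_kmeans('matrix.csv', 8, './results', './clustered')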
Example No. 4
def our_files_together(source_dir, destination_dir):
    TimingLogger.start('together' + str(source_dir), 'together')
    source_path = os.path.join(*source_dir)
    if os.path.isdir(source_path):
        for dir_item in os.listdir(source_path):
            dir_item_path = os.path.join(*(source_dir + [dir_item]))
            if os.path.isfile(dir_item_path) and dir_item.endswith('.json'):
                print("Copying", dir_item_path)
                shutil.copyfile(dir_item_path, os.path.join(destination_dir, "___".join(source_dir[1:] + [dir_item])))
            elif os.path.isdir(dir_item_path):
                our_files_together(source_dir + [dir_item], destination_dir)
    TimingLogger.stop('together' + str(source_dir))
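Note that source_dir is a list of path components rather than a single path string: it is splatted into os.path.join and sliced when the flattened file name is built, while destination_dir is an ordinary directory path. A hedged usage sketch (directory names are placeholders):

# Recursively copy every .json file found under ./grouped into ./flat,
# joining the intermediate directory names with '___' in the copied file names.
our_files_together(['grouped'], './flat')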
Example No. 5
def use_experiment_with_pyclustering_kmedoids(file):
    TimingLogger.start('pyclustering.kmedoids', 'kmedoids')
    x = read_matrix(file)
    X = np.array(x)
    clusters = len(x)
    search_instance = silhouette_ksearch(
        X, 2, clusters, algorithm=silhouette_ksearch_type.KMEDOIDS).process()
    scores = search_instance.get_scores()
    for i in range(2, len(scores)):
        logging.info('For n_clusters = ' + str(i) +
                     ' The average silhouette_score is : ' +
                     str(scores[i - 2]))
    logging.info('Better choice is ' + str(search_instance.get_amount()) +
                 ' clusters')
    TimingLogger.stop('pyclustering.kmedoids')
Example No. 6
def use_kmedoids(file, n_clusters, source, destination):
    TimingLogger.start('konly.kmedoids', 'konly')
    x = read_matrix(file)
    X = np.array(x)
    initial_medoids = random.sample(range(0, len(x)), n_clusters)
    kmedoids_instance = kmedoids(X, initial_medoids)
    kmedoids_instance.process()
    clusters = kmedoids_instance.get_clusters()
    cluster_labels = [None for _ in range(0, len(x))]
    cluster_index = 0
    for cluster in clusters:
        for i in cluster:
            cluster_labels[int(i)] = cluster_index
        cluster_index += 1
    _copy_clusters(cluster_labels, source, destination)
    TimingLogger.stop('konly.kmedoids')
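pyclustering returns each cluster as a list of row indices into X, so the label-reconstruction loop above can also be written with enumerate; this variation reuses x and clusters from the function above and is only an idiomatic sketch, not part of the original module:

cluster_labels = [None] * len(x)
for cluster_index, cluster in enumerate(clusters):
    # Assign the cluster's index to every row that belongs to it.
    for i in cluster:
        cluster_labels[int(i)] = cluster_index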
Example No. 7
def build_matrix_flow(source_dir, output_file, algorithm):
    print('One Value =', USING_ONE_VALUE)
    TimingLogger.start('matrix', 'matrix')
    if os.path.isdir(source_dir):
        files = [os.path.join(source_dir, x) for x in os.listdir(source_dir) if x.endswith('.result.json')]
        files = [x for x in files if os.path.isfile(x)]
        print('Building distance matrix...')
        distance_matrix = build_distance_matrix(files, algorithm)
        print('Preparing distance matrix results...')
        distance_matrix, columns_names = prepare_distance_matrix_results(distance_matrix,
                                                                         reverse=_isReverseAlg(algorithm))
        print('Preparing 2-dimensional distance matrix...')
        distance_matrix, columns_statuses = prepare_2_dimensional_distance_matrix(distance_matrix)
        columns_names_aux = []
        for i, column_status in enumerate(columns_statuses):
            if not column_status:
                columns_names_aux.append(columns_names[i])
        print('Saving as CSV...')
        _save_matrix_as_csv(distance_matrix, columns_names_aux, output_file)
    TimingLogger.stop('matrix')
Example No. 8
def reduce_amount_of_redundant_results_by_force(files, destination):
    TimingLogger.start('reduce', 'reduce')
    results = sorted(files, key=lambda _f: put_processes_on_one_value(FileHelper(_f).munch)['__one__'])
    os.makedirs(os.path.join(destination, 'chosen'))
    os.makedirs(os.path.join(destination, 'grouped'))
    head = None
    head_dir = None
    head_content = None
    print('Reducing...')
    path_resolution = PathResolution(files)
    for i in range(0, len(results)):  # include the first result so it is grouped and copied as well
        other_content = put_processes_on_one_value(FileHelper(results[i]).munch)['__one__']
        if head_content != other_content:
            head = results[i]
            head_content = put_processes_on_one_value(FileHelper(head).munch)['__one__']
            head_dir = os.path.join(os.path.join(destination, 'grouped'), path_resolution.get_hash(head))
            os.makedirs(head_dir)
            shutil.copyfile(head, os.path.join(head_dir, os.path.basename(head)))
            shutil.copyfile(head, os.path.join(os.path.join(destination, 'chosen'), os.path.basename(head)))
        else:
            shutil.copyfile(results[i], os.path.join(head_dir, os.path.basename(results[i])))
        sys.stdout.write('\r%04d/%04d analyzed' % (i + 1, len(results)))
    print()
    TimingLogger.stop('reduce')
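The reduction above works by sorting the files on a canonical representation (put_processes_on_one_value) and then grouping consecutive files whose representation is identical. A minimal sketch of that grouping idea with itertools.groupby, where canonical stands in for the project's key function and is hypothetical:

from itertools import groupby

def group_identical(files, canonical):
    # canonical(f) must return the same value for files that are considered redundant.
    ordered = sorted(files, key=canonical)
    return [list(group) for _, group in groupby(ordered, key=canonical)]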
Example No. 9
    parser.add_argument('ext')
    parser.add_argument('year', type=int)
    parser.add_argument('--extra-log', type=str, default=None)

    args = parser.parse_args()

    handlers = [logging.StreamHandler()]

    if args.extra_log:
        handlers.append(logging.FileHandler(args.extra_log, 'a'))

    logging.basicConfig(format='%(asctime)s %(message)s',
                        level=logging.INFO,
                        handlers=handlers)

    TimingLogger.start('file_utils', 'file_utils')

    path = args.path
    ext = args.ext
    year = args.year
    print('You entered with path', path, 'and ext', ext)
    paths = []
    if os.path.isdir(path):
        print('Your path', path, 'is a dir')
        paths = [
            path + '/' + x for x in os.listdir(path)
            if os.path.isfile(path + '/' + x) and (
                ext == 'any' or x.endswith(ext)) and not x.endswith('.tmp')
        ]
        print('Found', len(paths), 'files there that end with', ext)
    else:
Example No. 10
    ]

    ensure_that_objects_dir_exists()

    parser = argparse.ArgumentParser()
    parser.add_argument('path')
    parser.add_argument('ext')
    parser.add_argument('--parser', type=str, default='traceback', choices=[
        'traceback', 'full'
    ])
    parser.add_argument('--start-from', type=str, default=None)
    parser.add_argument('--logging', type=str, default=None)
    parser.add_argument('--threads', type=int, default=1)
    parser.add_argument('--extra-log', type=str, default=None)

    TimingLogger.start('parsers', 'parsers')

    args = parser.parse_args()
    path = args.path
    ext = args.ext
    THREADS = args.threads
    start_from = args.start_from
    logging_file = args.logging

    if logging_file:
        logging_handlers.append(logging.FileHandler(logging_file, 'w'))
    if args.extra_log:
        logging_handlers.append(logging.FileHandler(args.extra_log, 'a'))

    logging.basicConfig(level=logging.DEBUG, format=LOG_FORMAT, handlers=logging_handlers)