# Imports assumed by the snippets below; the project-local names (CONFIG,
# INPUT_FEATURES, INPUT_LABELS, Model, PsobDataset, train_bp, train_pso,
# configure_logger_by_default, MetricsCalculator) are defined elsewhere
# in the project.
import logging
from collections import defaultdict
from typing import Dict, List, Tuple

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn import preprocessing
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm


def fit_model(file_to_print):
    logger = logging.getLogger('one_split_fit_pso')
    configure_logger_by_default(logger)
    logger.info("START fit_model")

    def print_info(info):
        logger.info(info)
        print(info)
        file_to_print.write(info + "\n")

    CONFIG['pso_options']['print_info'] = print_info
    train_index, test_index = next(CONFIG['cv'].split(INPUT_FEATURES,
                                                      INPUT_LABELS))

    train_features, train_labels = INPUT_FEATURES[train_index], INPUT_LABELS[train_index]
    test_features, test_labels = INPUT_FEATURES[test_index], INPUT_LABELS[test_index]
    scaler = preprocessing.StandardScaler().fit(train_features)
    train_features = scaler.transform(train_features)
    test_features = scaler.transform(test_features)

    train_features = torch.from_numpy(train_features)
    train_labels = torch.from_numpy(train_labels)
    test_features = torch.from_numpy(test_features)
    test_labels = torch.from_numpy(test_labels)

    model = Model()
    train_pso(model, train_features, train_labels, test_features, test_labels,
              CONFIG)
    logger.info("END fit_model")
Example #2
def get_test_accuracy_by_epoch() -> Tuple[List[int], List[float], List[int]]:
    logger = logging.getLogger('early_stopping')
    configure_logger_by_default(logger)
    logger.info("START get_test_accuracy_by_epoch")
    train_index, test_index = next(CONFIG['cv'].split(INPUT_FEATURES, INPUT_LABELS))

    model = Model(INPUT_FEATURES.shape[1])
    criterion = CONFIG['criterion']()
    optimizer = CONFIG['optimizer'](model.parameters(), lr=CONFIG['lr'])
    train_features, train_labels = INPUT_FEATURES[train_index], INPUT_LABELS[train_index]
    test_features, test_labels = INPUT_FEATURES[test_index], INPUT_LABELS[test_index]
    scaler = preprocessing.StandardScaler().fit(train_features)
    train_features = scaler.transform(train_features)
    test_features = scaler.transform(test_features)
    trainloader = torch.utils.data.DataLoader(
        PsobDataset(train_features, train_labels),
        batch_size=CONFIG['batch_size'], shuffle=CONFIG['shuffle'], num_workers=2
    )
    testloader = torch.utils.data.DataLoader(
        PsobDataset(test_features, test_labels),
        batch_size=CONFIG['batch_size'], shuffle=CONFIG['shuffle'], num_workers=2
    )
    accuracies = []
    best_accuracy = -1
    durations = []
    current_duration = 0
    for epoch in tqdm(range(CONFIG['epochs'])):
        for i, data in enumerate(trainloader, 0):
            inputs, labels = data
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        correct = 0
        total = 0
        with torch.no_grad():
            for data in testloader:
                features, labels = data
                outputs = model(features)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        accuracy = correct / total
        if best_accuracy >= accuracy:
            current_duration += 1
        else:
            if current_duration != 0:
                durations.append(current_duration)
            current_duration = 0
        best_accuracy = max(best_accuracy, accuracy)
        accuracies.append(accuracy)
        logger.info(str(epoch) + ": " + str(accuracy))
    if current_duration != 0:
        durations.append(current_duration)
    logger.info("END get_test_accuracy_by_epoch")
    return list(range(CONFIG['epochs'])), accuracies, durations
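
The PsobDataset wrapper fed to every DataLoader in these examples is never shown. A plausible minimal implementation, assuming it simply pairs feature rows with labels and optionally restricts the features to a subset of metric columns (the real class is defined elsewhere in the project):

class PsobDataset(torch.utils.data.Dataset):
    # Hypothetical sketch; the optional `metrics` argument selects columns,
    # matching calls like PsobDataset(features, labels, metrics).
    def __init__(self, features, labels, metrics=None):
        features = torch.as_tensor(features, dtype=torch.float32)
        if metrics is not None:
            features = features[:, metrics]  # keep only the chosen metric columns
        self.features = features
        self.labels = torch.as_tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]
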
def get_accuracies_for_lr() -> Dict[float, float]:
    logger = logging.getLogger('learning_rate')
    configure_logger_by_default(logger)
    logger.info("START get_accuracies_for_lr")
    accuracies_by_lr = defaultdict(lambda: -1.0)
    for lr in CONFIG['params']['lr']:
        logger.info("lr = " + str(lr))
        skf = CONFIG['cv']
        train_index, test_index = next(skf.split(INPUT_FEATURES, INPUT_LABELS))

        model = Model(INPUT_FEATURES.shape[1])
        criterion = CONFIG['criterion']()
        optimizer = CONFIG['optimizer'](model.parameters(), lr=lr, momentum=CONFIG['momentum'])
        train_features, train_labels = INPUT_FEATURES[train_index], INPUT_LABELS[train_index]
        test_features, test_labels = INPUT_FEATURES[test_index], INPUT_LABELS[test_index]
        trainloader = torch.utils.data.DataLoader(
            PsobDataset(train_features, train_labels),
            batch_size=CONFIG['batch_size'], shuffle=CONFIG['shuffle'], num_workers=2
        )
        testloader = torch.utils.data.DataLoader(
            PsobDataset(test_features, test_labels),
            batch_size=CONFIG['batch_size'], shuffle=CONFIG['shuffle'], num_workers=2
        )
        current_duration = 0
        for epoch in tqdm(range(CONFIG['epochs'])):
            for i, data in enumerate(trainloader, 0):
                inputs, labels = data
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
            correct = 0
            total = 0
            with torch.no_grad():
                for data in testloader:
                    features, labels = data
                    outputs = model(features)
                    _, predicted = torch.max(outputs.data, 1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum().item()
            accuracy = correct / total
            if accuracies_by_lr[lr] >= accuracy:
                current_duration += 1
            else:
                current_duration = 0
            accuracies_by_lr[lr] = max(accuracies_by_lr[lr], accuracy)
            if current_duration > CONFIG['early_stopping_rounds']:
                break
            if epoch % 10 == 0:
                logger.info("CHECKPOINT EACH 10th EPOCH " + str(epoch) + ": " + str(accuracy))
            if epoch % 100 == 0:
                logger.info("CHECKPOINT EACH 100th EPOCH " + str(epoch) + ": " + str(accuracy))
            logger.info(str(epoch) + ": " + str(accuracy))
    logger.info("END get_accuracies_for_lr")
    return accuracies_by_lr
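
The returned dictionary maps each learning rate to the best test accuracy it reached, so selecting a winner is a one-liner. A hedged usage sketch (the sweep values are assumptions):

# Hypothetical usage: sweep a few candidate rates, then pick the best.
CONFIG['params'] = {'lr': [0.001, 0.01, 0.1]}  # assumed sweep values
accuracies_by_lr = get_accuracies_for_lr()
best_lr = max(accuracies_by_lr, key=accuracies_by_lr.get)
print("best lr:", best_lr, "accuracy:", accuracies_by_lr[best_lr])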
Example #4
def get_best_metrics_and_accuracy_from_metrics_set(metrics_sets) -> Tuple[List[int], float]:
    logger = logging.getLogger('finding_best_metrics_and_accuracy')
    configure_logger_by_default(logger)
    logger.info("STARTED FINDING BEST METRICS SET")
    loaded_features = torch.load("../calculated_features/split_each_file_features.tr")
    loaded_labels = torch.load("../calculated_features/split_each_file_labels.tr")
    skf = StratifiedKFold(n_splits=10, shuffle=True)
    train_index, test_index = next(skf.split(loaded_features, loaded_labels))

    best_metrics = None
    best_accuracy = -1
    for metrics in tqdm(metrics_sets):
        metrics = list(metrics)
        if len(metrics) == 0:
            continue

        model = Model(len(metrics))
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
        train_features, train_labels = loaded_features[train_index], loaded_labels[train_index]
        test_features, test_labels = loaded_features[test_index], loaded_labels[test_index]
        trainloader = torch.utils.data.DataLoader(
            PsobDataset(train_features, train_labels, metrics),
            batch_size=BATCH_SIZE, shuffle=True, num_workers=2
        )
        testloader = torch.utils.data.DataLoader(
            PsobDataset(test_features, test_labels, metrics),
            batch_size=BATCH_SIZE, shuffle=False, num_workers=2
        )
        for _ in range(EPOCHS):
            for i, data in enumerate(trainloader, 0):
                inputs, labels = data
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
        correct = 0
        total = 0
        with torch.no_grad():
            for data in testloader:
                features, labels = data
                outputs = model(features)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        accuracy = correct / total
        log_info = str(metrics) + ": " + str(accuracy)
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_metrics = metrics
            log_info += " NEW BEST"
        logger.info(log_info)
    logger.info("END FINDING BEST METRICS SET")
    return best_metrics, best_accuracy
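
The metrics_sets argument is any iterable of metric-index collections. A hedged driver sketch that exhaustively enumerates the non-empty subsets of a small index pool; exhaustive search grows as 2^n, so the pool must stay small:

from itertools import chain, combinations

metric_pool = range(5)  # hypothetical pool of metric column indices
metrics_sets = chain.from_iterable(
    combinations(metric_pool, k) for k in range(1, len(metric_pool) + 1))
best_metrics, best_accuracy = \
    get_best_metrics_and_accuracy_from_metrics_set(metrics_sets)
print(best_metrics, best_accuracy)
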
def get_accuracies(file_to_print):
    logger = logging.getLogger(CONFIG['experiment_name'])
    configure_logger_by_default(logger)
    logger.info("START " + CONFIG['experiment_name'])

    def print_info(info):
        logger.info(info)
        print(info)
        file_to_print.write(info + "\n")

    CONFIG['pso_options']['print_info'] = print_info

    train_index, test_index = next(CONFIG['cv'].split(INPUT_FEATURES,
                                                      INPUT_LABELS))
    train_features, train_labels = INPUT_FEATURES[train_index], INPUT_LABELS[train_index]
    test_features, test_labels = INPUT_FEATURES[test_index], INPUT_LABELS[test_index]
    scaler = preprocessing.StandardScaler().fit(train_features)
    train_features = scaler.transform(train_features)
    test_features = scaler.transform(test_features)

    train_features = torch.from_numpy(train_features)
    train_labels = torch.from_numpy(train_labels)
    test_features = torch.from_numpy(test_features)
    test_labels = torch.from_numpy(test_labels)

    train_accuracies = []
    test_accuracies = []
    hidden_dims = []
    for hidden_layer_percentage in CONFIG['hidden_layer_percentage']:
        hidden_dim = int(CONFIG['max_hidden_layer_size'] *
                         hidden_layer_percentage)
        model = Model(hidden_dim=hidden_dim)
        test_acc, train_acc = train_bp(model, train_features, train_labels,
                                       test_features, test_labels, CONFIG)
        train_accuracies.append(train_acc)
        test_accuracies.append(test_acc)
        hidden_dims.append(hidden_dim)
    print_info("======RESULTING ACCURACIES======")
    print_info("TRAIN: " + str(train_accuracies))
    print_info("TEST: " + str(test_accuracies))
    print_info("HIDDEN DIMS: " + str(hidden_dims))
    logger.info("END " + CONFIG['experiment_name'])
    return train_accuracies, test_accuracies, hidden_dims
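
Since the function returns aligned train/test accuracy lists per hidden size, plotting both curves makes under- and overfitting visible at a glance. A hedged follow-up sketch (the output file name is arbitrary):

import matplotlib.pyplot as plt

with open('hidden_layer_output.txt', 'w') as f:
    train_acc, test_acc, hidden_dims = get_accuracies(f)

plt.plot(hidden_dims, train_acc, label='train')
plt.plot(hidden_dims, test_acc, label='test')
plt.xlabel('hidden layer size')
plt.ylabel('accuracy')
plt.legend()
plt.show()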
Example #6
def get_accuracies(file_to_print):
    logger = logging.getLogger(CONFIG['experiment_name'])
    configure_logger_by_default(logger)
    logger.info("START " + CONFIG['experiment_name'])

    def print_info(info):
        logger.info(info)
        print(info)
        file_to_print.write(info + "\n")

    CONFIG['pso_options']['print_info'] = print_info
    full_train_index, test_index = next(CONFIG['cv'].split(
        INPUT_FEATURES, INPUT_LABELS))
    np.random.shuffle(full_train_index)
    train_accuracies = []
    test_accuracies = []
    for train_percentage in CONFIG['train_splits']:
        train_size = int(train_percentage * full_train_index.shape[0])
        train_index = full_train_index[0:train_size]
        train_features, train_labels = INPUT_FEATURES[train_index], INPUT_LABELS[train_index]
        test_features, test_labels = INPUT_FEATURES[test_index], INPUT_LABELS[test_index]
        scaler = preprocessing.StandardScaler().fit(train_features)
        train_features = scaler.transform(train_features)
        test_features = scaler.transform(test_features)

        train_features = torch.from_numpy(train_features)
        train_labels = torch.from_numpy(train_labels)
        test_features = torch.from_numpy(test_features)
        test_labels = torch.from_numpy(test_labels)

        model = Model()
        test_acc, train_acc = train_bp(model, train_features, train_labels,
                                       test_features, test_labels, CONFIG)
        train_accuracies.append(train_acc)
        test_accuracies.append(test_acc)
    print_info("======RESULTING ACCURACIES======")
    print_info("TRAIN: " + str(train_accuracies))
    print_info("TEST: " + str(test_accuracies))
    logger.info("END " + CONFIG['experiment_name'])
    return train_accuracies, test_accuracies
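
This variant draws a learning curve: the test fold stays fixed while the training set grows by the fractions in CONFIG['train_splits']. A hedged driver sketch (the split values are assumptions):

CONFIG['train_splits'] = [0.1, 0.25, 0.5, 0.75, 1.0]  # assumed fractions
with open('train_size_output.txt', 'w') as f:
    train_accuracies, test_accuracies = get_accuracies(f)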
Example #7
def fit_model(file_to_print):
    logger = logging.getLogger('one_split_fit')
    configure_logger_by_default(logger)
    logger.info("START fit_model")

    def print_info(info):
        logger.info(info)
        print(info)
        file_to_print.write(info + "\n")

    CONFIG['pso_options']['print_info'] = print_info
    train_index, test_index = next(CONFIG['cv'].split(INPUT_FEATURES,
                                                      INPUT_LABELS))
    train_features, train_labels = INPUT_FEATURES[train_index], INPUT_LABELS[train_index]
    test_features, test_labels = INPUT_FEATURES[test_index], INPUT_LABELS[test_index]
    scaler = preprocessing.StandardScaler().fit(train_features)
    train_features = scaler.transform(train_features)
    test_features = scaler.transform(test_features)

    train_features = torch.from_numpy(train_features)
    train_labels = torch.from_numpy(train_labels)
    test_features = torch.from_numpy(test_features)
    test_labels = torch.from_numpy(test_labels)

    model = Model()
    for trainer_type in CONFIG['trainers_to_use']:
        if trainer_type == 'bp':
            test_acc, train_acc, test_loss, train_loss = \
                train_bp(model, train_features, train_labels, test_features, test_labels, CONFIG)
            print_info("Best Test Accuracy: " + str(test_acc) +
                       ", Train Accuracy: " + str(train_acc) +
                       ", Test Loss: " + str(test_loss) + ", Train Loss: " +
                       str(train_loss))
        elif trainer_type == 'pso':
            train_pso(model, train_features, train_labels, test_features,
                      test_labels, CONFIG)
    logger.info("END fit_model")
Example #8
def get_features_mutual_info(file_to_print):
    logger = logging.getLogger(CONFIG['experiment_name'])
    configure_logger_by_default(logger)
    logger.info("START " + CONFIG['experiment_name'])

    def print_info(info):
        logger.info(info)
        print(info)
        file_to_print.write(info + "\n")

    mutual_info = mutual_info_classif(
        INPUT_FEATURES,
        INPUT_LABELS,
        discrete_features=CONFIG['discrete_features'],
        random_state=CONFIG['random_state'])
    print_info("Mutual info: ")
    print_info(
        str(
            list(
                zip(range(mutual_info.shape[0]),
                    zip(MetricsCalculator.ALL_METRICS, mutual_info)))))
    logger.info("END " + CONFIG['experiment_name'])
    return mutual_info, MetricsCalculator.ALL_METRICS
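
The returned scores are aligned with MetricsCalculator.ALL_METRICS, so ranking metrics by mutual information is a simple argsort. A hedged follow-up sketch:

with open('mi_output.txt', 'w') as f:
    mutual_info, metric_names = get_features_mutual_info(f)

k = 10  # hypothetical cutoff
for i in np.argsort(mutual_info)[::-1][:k]:
    print(metric_names[i], mutual_info[i])
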
def fit_model(file_to_print):
    logger = logging.getLogger('one_split_fit')
    configure_logger_by_default(logger)
    logger.info("START fit_model")

    def print_info(info):
        logger.info(info)
        print(info)
        file_to_print.write(info + "\n")

    train_index, test_index = next(CONFIG['cv'].split(INPUT_FEATURES,
                                                      INPUT_LABELS))
    model = Model(len(CONFIG['metrics']))
    criterion = CONFIG['criterion']()
    optimizer = CONFIG['optimizer'](model.parameters(),
                                    lr=CONFIG['lr'],
                                    momentum=CONFIG['momentum'])
    train_features, train_labels = INPUT_FEATURES[train_index], INPUT_LABELS[train_index]
    test_features, test_labels = INPUT_FEATURES[test_index], INPUT_LABELS[test_index]
    scaler = preprocessing.StandardScaler().fit(train_features)
    train_features = scaler.transform(train_features)
    test_features = scaler.transform(test_features)
    trainloader = torch.utils.data.DataLoader(
        PsobDataset(train_features, train_labels, CONFIG['metrics']),
        batch_size=CONFIG['batch_size'], shuffle=CONFIG['shuffle'], num_workers=2
    )
    testloader = torch.utils.data.DataLoader(
        PsobDataset(test_features, test_labels, CONFIG['metrics']),
        batch_size=CONFIG['batch_size'], shuffle=CONFIG['shuffle'], num_workers=2
    )
    best_accuracy = -1.0
    current_duration = 0
    for epoch in range(CONFIG['epochs']):
        for i, data in enumerate(trainloader, 0):
            inputs, labels = data
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        correct = 0
        total = 0
        with torch.no_grad():
            for data in testloader:
                features, labels = data
                outputs = model(features)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        accuracy = correct / total
        if best_accuracy >= accuracy:
            current_duration += 1
        else:
            current_duration = 0
        best_accuracy = max(best_accuracy, accuracy)
        if current_duration > CONFIG['early_stopping_rounds']:
            print_info("On epoch " + str(epoch) +
                       " training was early stopped")
            break
        if epoch % 10 == 0:
            logger.info("CHECKPOINT EACH 10th EPOCH " + str(epoch) + ": " +
                        str(accuracy))
        if epoch % 100 == 0:
            print_info("CHECKPOINT EACH 100th EPOCH " + str(epoch) +
                       ": current accuracy " + str(accuracy) + " , best " +
                       str(best_accuracy))
        logger.info(str(epoch) + ": " + str(accuracy))

    logger.info('Finished Training')

    correct = 0
    total = 0
    labels_dist = torch.zeros(CONFIG['number_of_authors'])
    labels_correct = torch.zeros(CONFIG['number_of_authors'])
    with torch.no_grad():
        for data in testloader:
            features, labels = data
            outputs = model(features)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            for i, label in enumerate(labels):
                labels_dist[label] += 1
                labels_correct[label] += predicted[i] == labels[i]
    print_info('Best accuracy: ' + str(max(best_accuracy, correct / total)))
    print_info('Final accuracy of the network: %d / %d = %d %%' %
               (correct, total, 100 * correct / total))
    print_info("Correct labels / labels for each author:\n" +
               str(torch.stack((labels_correct, labels_dist), dim=1)))
    logger.info("END fit_model")
Example #10
def run_cross_validation_psobp(file_to_print) -> torch.Tensor:
    logger = logging.getLogger('10_fold_cv')
    configure_logger_by_default(logger)
    logger.info("START run_cross_validation")

    def print_info(info):
        logger.info(info)
        print(info)
        file_to_print.write(info + "\n")

    accuracies = torch.zeros((CONFIG['n_splits'], CONFIG['n_repeats']))
    fold_number = -1
    repeat_number = -1
    CONFIG['pso_options']['print_info'] = print_info

    for train_index, test_index in CONFIG['cv'].split(INPUT_FEATURES,
                                                      INPUT_LABELS):
        fold_number += 1
        if fold_number % CONFIG['n_splits'] == 0:
            fold_number = 0
            repeat_number += 1

        print_info("New " + str(fold_number) + " fold. repeat = " +
                   str(repeat_number))
        train_features, train_labels = INPUT_FEATURES[
            train_index], INPUT_LABELS[train_index]
        test_features, test_labels = INPUT_FEATURES[test_index], INPUT_LABELS[
            test_index]

        scaler = preprocessing.StandardScaler().fit(train_features)
        train_features = scaler.transform(train_features)
        test_features = scaler.transform(test_features)

        train_features = torch.from_numpy(train_features)
        train_labels = torch.from_numpy(train_labels)
        test_features = torch.from_numpy(test_features)
        test_labels = torch.from_numpy(test_labels)

        model = Model().to(CONFIG['device'])

        best_accuracy_bp, best_accuracy_pso = None, None
        for trainer_type in CONFIG['trainers_to_use']:
            if trainer_type == 'bp':
                best_accuracy_bp, _ = train_bp(model, train_features,
                                               train_labels, test_features,
                                               test_labels, CONFIG)
            elif trainer_type == 'pso':
                best_accuracy_pso = train_pso(model, train_features,
                                              train_labels, test_features,
                                              test_labels, CONFIG)
        print_info('Best accuracy pso: ' + str(best_accuracy_pso) + ', bp: ' +
                   str(best_accuracy_bp))
        best_accuracy = max(acc for acc in (best_accuracy_bp, best_accuracy_pso)
                            if acc is not None)
        accuracies[fold_number][repeat_number] = best_accuracy
        print_info('Best final accuracy: ' + str(best_accuracy))
        print_info('END OF EVALUATION OF ' + str(fold_number) +
                   ' FOLD, REPEAT ' + str(repeat_number))
        print_info(
            '========================================================================='
        )
    logger.info("END run_cross_validation")
    return accuracies
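
The returned tensor holds one best accuracy per (fold, repeat) pair, so the usual summary is a mean plus a spread. A hedged usage sketch:

with open('cv_output.txt', 'w') as f:
    accuracies = run_cross_validation_psobp(f)
print("mean accuracy:", accuracies.mean().item())
print("std across folds per repeat:", accuracies.std(dim=0))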
Example #11
def __init__(self, language_config, dataset_path: str, ast_path: str) -> None:
    configure_logger_by_default(self.LOGGER)
    self.LOGGER.info("Started calculating metrics")
    super().__init__()
    self.language_config = language_config
    self.line_metrics_calculator = LineMetricsCalculator(language_config, dataset_path)
    self.character_metrics_calculator = CharacterMetricsCalculator(dataset_path)
    self.ast_metrics_calculator = AstMetricsCalculator(
        language_config, ast_path,
        self.character_metrics_calculator.character_number_for_file)
    self.LOGGER.info("End calculating metrics")
    # Local aliases keep the metric table below readable.
    lines = self.line_metrics_calculator
    chars = self.character_metrics_calculator
    variables = self.ast_metrics_calculator.variable_metrics_calculator
    statements = self.ast_metrics_calculator.statements_metrics_calculator
    self.metrics_functions = {
        "ratio_of_blank_lines_to_code_lines":
            lines.ratio_of_blank_lines_to_code_lines,
        "ratio_of_comment_lines_to_code_lines":
            lines.ratio_of_comment_lines_to_code_lines,
        "ratio_of_block_comments_to_all_comment_lines":
            lines.ratio_of_block_comments_to_all_comment_lines,
        "ratio_of_open_braces_alone_in_a_line":
            chars.ratio_of_open_braces_alone_in_a_line,
        "ratio_of_close_braces_alone_in_a_line":
            chars.ratio_of_close_braces_alone_in_a_line,
        "average_character_number_per_java_file":
            chars.average_character_number_per_java_file,
        "ratio_of_variable_naming_without_uppercase_letters":
            variables.ratio_of_variable_naming_without_uppercase_letters,
        "ratio_of_variable_naming_starting_with_lowercase_letters":
            variables.ratio_of_variable_naming_starting_with_lowercase_letters,
        "average_variable_name_length":
            variables.average_variable_name_length,
        "ratio_of_macro_variables":
            variables.ratio_of_macro_variables,
        "preference_for_cyclic_variables":
            variables.preference_for_cyclic_variables,
        "ratio_of_for_statements_to_all_loop_statements":
            statements.ratio_of_for_statements_to_all_loop_statements,
        "ratio_of_if_statements_to_all_conditional_statements":
            statements.ratio_of_if_statements_to_all_conditional_statements,
        "average_number_of_methods_per_class":
            statements.average_number_of_methods_per_class,
        "ratio_of_catch_statements_when_dealing_with_exceptions":
            statements.ratio_of_catch_statements_when_dealing_with_exceptions,
        "ratio_of_branch_statements":
            statements.ratio_of_branch_statements,
        "ratio_of_try_structure":
            statements.ratio_of_try_structure,
        "average_number_of_interfaces_per_class":
            statements.average_number_of_interfaces_per_class,
        "maximum_depth_of_an_ast":
            self.ast_metrics_calculator.maximum_depth_of_an_ast,
    }
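
Because metrics_functions maps each metric name to a bound method, all metrics can be computed in one pass. A hedged sketch, assuming this __init__ belongs to the MetricsCalculator class referenced earlier and that the metric callables take no arguments (the real methods may need per-file arguments):

calculator = MetricsCalculator(language_config, dataset_path, ast_path)
all_metrics = {name: fn() for name, fn in calculator.metrics_functions.items()}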