Example #1
def _process(kwargs):
    pid = os.getpid()
    mac = kwargs["mac"]
    log = kwargs["log"]
    syn_para = kwargs["syn_para"]
    acr_para = kwargs["acr_para"]
    img_para = kwargs["img_para"]
    ip_tv_para = kwargs["ip_tv"]
    stat_para = kwargs["stat_cfg"]

    acr_queue = aioqueue.Queue(max_size=acr_para.queue_cap)

    mkdir(log["path"])
    log_init(log)

    ip_tv_init(ip_tv_para.url, ip_tv_para.local)

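    # worker threads; both share the bounded acr_queue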
    threads = [
        SynThread(
            mac=mac,
            syn_para=syn_para,
            acr_para=acr_para,
            img_para=img_para,
            acr_queue=acr_queue,
            stat_para=stat_para,
        ),
        AcrLog(
            acr_para=acr_para,
            acr_queue=acr_queue,
            stat_para=stat_para,
        )
    ]

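    # shut the worker threads down cleanly when the process receives a signal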
    def _signal(sig, frame):
        logging.info(f"{os.getpid()} recv signal {sig} {frame}")
        for w in threads:
            w.close()

    signal.signal(signal.SIGINT, _signal)
    signal.signal(signal.SIGTERM, _signal)
    signal.signal(signal.SIGABRT, _signal)

    logging.info(f"process {pid} start... ")

    try:
        for task in threads:
            task.start()

        for task in threads:
            task.join()
    except Exception:
        logging.exception(f"process {pid} error... ")

    logging.info(f"process {pid} end... ")
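Example #2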
def main():
    log_init(path.join(RESULTS_DIR_PATH, "log", "classifier_training_.txt"))
    log("ML4Refactoring: Binary classification")
    refactorings = build_refactorings(Level)

    # build the models and run the binary-classification pipeline
    models = build_models()
    pipeline = BinaryClassificationPipeline(models, refactorings, DATASETS)
    results = pipeline.run()

    return results
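Example #3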
    if not path.exists(fig_path_box):
        combined_stable_metrics = pd.DataFrame()
        for level in STABLE_LEVELS:
            for k in STABLE_Ks:
                stable_metrics = get_metrics_stable_level_unique_metrics(level, k, metrics, samples=35000)
                stable_metrics['K'] = k
                stable_metrics = pd.melt(stable_metrics, id_vars="K", var_name="Metric", value_vars=metrics, value_name="values")
                stable_metrics["Metric"] = stable_metrics["Metric"].apply(lambda x: f"{x} {str(level)}")
                combined_stable_metrics = pd.concat([combined_stable_metrics, stable_metrics])
        # plot
        line_plot_seaborn(combined_stable_metrics, title, fig_path_box, xticks=STABLE_Ks, yticks=yticks, scale="log", custom_palette=custom_palette)
    else:
        log(f"--Skipped plot at {fig_path_box}, because it already exists.")


log_init(f"results/Distribution/class_metrics_distribution_{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.txt")
start_time = time.time()

Path(path.dirname("results/Distribution/Class_Metrics/K/")).mkdir(parents=True, exist_ok=True)
custom_palette = {"classCbo Level.Class":"red", "classCbo Level.Method":"brown", "classCbo Level.Variable":"orangered", "classCbo Level.Field":"maroon",
                  "classTCC Level.Class":"green", "classTCC Level.Method":"olive", "classTCC Level.Variable":"lime", "classTCC Level.Field":"yellowgreen",
                  "classWmc Level.Class":"blue", "classWmc Level.Method":"navy", "classWmc Level.Variable":"cyan", "classWmc Level.Field":"dodgerblue"}
level_merged_stable_k("Distribution/Class_Metrics/K", metrics=CLASS_METRICS_REDUCED_Fields, yticks=[1, 2.5, 3.5, 5, 7.5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 75, 90, 100, 125, 150, 200, 250, 300, 350],
                      title=f"Class Metrics: Stable K's",
                      file_descriptor=f"Class_Metrics_Reduced_K",
                      custom_palette=custom_palette)

custom_palette = {"classNumberOfMethods Level.Class":"red", "classNumberOfMethods Level.Method":"brown", "classNumberOfMethods Level.Variable":"orangered", "classNumberOfMethods Level.Field":"maroon",
                  "classNumberOfPublicFields Level.Class":"green", "classNumberOfPublicFields Level.Method":"olive", "classNumberOfPublicFields Level.Variable":"lime", "classNumberOfPublicFields Level.Field":"yellowgreen",
                  "classStringLiteralsQty Level.Class":"blue", "classStringLiteralsQty Level.Method":"navy", "classStringLiteralsQty Level.Variable":"cyan", "classStringLiteralsQty Level.Field":"dodgerblue",
                  "classUniqueWordsQty Level.Class":"black", "classUniqueWordsQty Level.Method":"grey", "classUniqueWordsQty Level.Variable":"lightgrey", "classUniqueWordsQty Level.Field":"snow",
Example #4
from configs import DATASETS, Level, VALIDATION_DATASETS
from db.QueryBuilder import get_all_level_stable, get_level_refactorings_count, get_level_refactorings
from db.DBConnector import execute_query
from utils.log import log_init, log_close, log
import time

log_init()
log('Begin cache warm-up')
start_time = time.time()

for dataset in (DATASETS + VALIDATION_DATASETS):
    log("\n**** dataset: " + dataset)
    for level in Level:
        log("-- non refactored instances for " + str(level))
        non_refactored = execute_query(
            get_all_level_stable(int(level), dataset))
        log(
            str(len(non_refactored)) +
            " non-refactored instances were found for level: " + str(level))

        log("-- " + str(level) + " refactoring types with count")
        refactorings = execute_query(
            get_level_refactorings_count(int(level), dataset))
        log(refactorings.to_string())
        for refactoring_name in refactorings['refactoring']:
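            # executing the query is enough to warm the cache; the result itself is not used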
            refactoring_instances = execute_query(
                get_level_refactorings(int(level), refactoring_name, dataset))

log('Cache warm-up took %s seconds.' % (time.time() - start_time))
log_close()
Example #5
                                             [db_ids_val])
    formatted_results = format_results_single_run(
        DATASET, refactoring_name, ["test set github"], model_name,
        val_scores["f1_score"], val_scores["precision"], val_scores["recall"],
        val_scores['accuracy'], val_scores['tn'], val_scores['fp'],
        val_scores['fn'], val_scores['tp'],
        val_scores["permutation_importance"], trained_model, features,
        json.dumps(trained_model.get_params()))
    save_validation_results(model_name, val_results[0], "test set github",
                            formatted_results)
    return formatted_results, trained_model.get_params()


# Start
log_init(
    f"{SAVE_DIRECTORY}classifier_evaluation_test-set_{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.txt"
)
log('Begin classifier evaluation')

refactorings = build_refactorings(Level)

for model_name in [
        "LogisticRegressionRefactoringModel", "RandomForestRefactoringModel"
]:
    evaluation_path = f"{SAVE_DIRECTORY}test_set_evaluation{model_name}.xlsx"
    params_path = f"{SAVE_DIRECTORY}{model_name}_parameter.xlsx"
    if not os.path.exists(evaluation_path):
        evaluation_results = pd.DataFrame()
        parameter_sets = pd.DataFrame()
        for refactoring in refactorings:
            refactoring_name = refactoring.name()
Example #6
# global path variable
local_base_path = '/data/snapshots/'
remote_base_path = '/data/snapshots/'

if __name__ == '__main__':
    logging.info('--------------begin performing all applications-------- ')

    # file = open('collect_ip.json', 'r', encoding='utf-8')
    with open(sys.argv[1], 'r', encoding='utf-8') as file:
        ci_array = json.load(file)

    # read log config file
    # file_log = open('loggin_conf.json', 'r', encoding='utf-8')
    with open(sys.argv[2], 'r', encoding='utf-8') as file_log:
        ci_array_log = json.load(file_log)
    log_init(ci_array_log['logging'])

    pool = mp.Pool(processes=5)  # process pool
    p_work = partial(transition, remote_base_path, local_base_path)  # rsync transfer function with the base paths pre-bound

    for item in ci_array:
        # multiple processes perform the synchronization tasks
        try:

            if not check_ssh(host=item.get("ip"), user=item.get("account"), port=item.get("port"),
                             passwd=item.get("pwd"), dest_path="/data/snapshots/"):
                logging.error('SSH connection failed!')
                exit(-1)
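            # dispatch the rsync task for this host to a pool worker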
            pool.map(p_work, (item, ))
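Example #7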
    plt.plot(x, y)
    plt.ylabel("Fraction unique classes")
    plt.xlabel("commit threshold")
    plt.title("Fraction of unique classes per commit threshold")
    plt.ylim(bottom=1)
    fig_path = "results/StableInstances/test.png"
    plt.savefig(fig_path)


def stable_instance_statistics():
    query = "SELECT * FROM stable_unique_classes_all_level;"
    data = execute_query(query)
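    # keep only the level-1 rows so both plotted series come from the same subset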
    level_one = data[data["level"] == 1]
    commitThresholds = level_one["commitThreshold"]
    plot_x_y(commitThresholds, level_one["unique_class_files_fraction"])
    data = data.groupby("level")
    return data


log_init("")
log('Begin Statistics')
start_time = time.time()

Path(path.dirname("results/StableInstances/")).mkdir(parents=True, exist_ok=True)

stable_instance_statistics()

log(f"Processing statistics took {time.time() - start_time:.2f} seconds.")
log_close()

exit()

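Example #8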
def plot_frequency(frequency_data, metric, var_name, level):
    frequency_data_melt = pd.melt(frequency_data,
                                  id_vars=metric,
                                  var_name=var_name,
                                  value_name="Frequency")
    line_plot_seaborn(frequency_data_melt,
                      x=metric,
                      y="Frequency",
                      hue=var_name,
                      level=level)


log_init(
    f"results/Evolution/statistics_{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.txt"
)
log('Begin Statistics')
start_time = time.time()

Path(path.dirname(SAVE_DIR)).mkdir(parents=True, exist_ok=True)
for metric in METRICS:
    for level in REFACTORING_LEVELS:
        frequency_data = get_frequency_data_refactorings(level,
                                                         metric).head(101)
        plot_frequency(frequency_data, metric, "refactoring", level)
        plot_cdf(frequency_data, metric, "refactoring", level)

    for level in STABLE_LEVELS:
        frequency_data_stable = get_frequency_data_stable(level,
                                                          metric).head(101)
Example #9
# the configs/db imports below are reconstructed from the companion examples; exact module locations are assumed
from configs import CACHE_DIR_PATH, DATASETS, VALIDATION_DATASETS, LEVEL_MAP, Level
from db.QueryBuilder import get_level_stable, get_level_refactorings_count
from db.DBConnector import execute_query
from utils.log import log_init, log_close, log
import time
import datetime
from os import path


"""
The number of refactoring and non-refactoring samples in the database is enormous, so caching the relevant data on your local machine can speed up the machine-learning process.

This script fetches the training data for refactoring and non-refactoring instances, as configured, from the database and stores the query results in cache files.

Note:
    To use this feature, ensure that USE_CACHE is enabled in the config.
"""

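# Illustrative sketch only, not the project's actual DBConnector: it shows how
# execute_query could cache query results on disk when USE_CACHE is enabled.
# The hashed file name, the CSV layout and the run_query callable are assumptions.
import hashlib
import os

import pandas as pd


def _cached_execute(query: str, cache_dir: str, run_query) -> pd.DataFrame:
    key = hashlib.sha1(query.encode("utf-8")).hexdigest()
    cache_file = os.path.join(cache_dir, key + ".csv")
    if os.path.exists(cache_file):
        return pd.read_csv(cache_file)       # cache hit: skip the database entirely
    result = run_query(query)                # cache miss: run the query against the DB
    result.to_csv(cache_file, index=False)   # persist the result for the next warm-up
    return result
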
log_init(path.join(CACHE_DIR_PATH, "results", f"warm-up_cache_{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.txt"))
log('Begin cache warm-up')
start_time = time.time()

for dataset in DATASETS + VALIDATION_DATASETS:
    for level in [Level.Class, Level.Method, Level.Variable, Level.Field, Level.Other]:
        log(f"-- non refactored instances for {level} for dataset: {dataset}")
        for k in [15, 20, 25, 30, 35, 40, 45, 50, 60, 70, 80, 90, 100]:
            log(f"---- non refactored instances with k {k} for {level} for dataset: {dataset}")
            non_refactored = execute_query(get_level_stable(int(level), k, dataset))
            log(str(len(non_refactored)) + " non-refactored instances were found for level: " + str(level))

        log(f"-- {level} refactoring types with count for dataset: {dataset}")
        refactorings = execute_query(get_level_refactorings_count(int(level), dataset))
        log(refactorings.to_string())
        for refactoring_name in LEVEL_MAP[level]:
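Example #10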
import copy
from os import path
from pathlib import Path

import joblib

from binary_classification import run
from configs import RESULTS_DIR_PATH
from utils import date_utils
from utils.log import log_close, log_init

if __name__ == "__main__":
    log_init(
        path.join(
            RESULTS_DIR_PATH,
            "log",
            f"classifier_training_{date_utils.windows_path_friendly_now()}.txt"
        ))
    projects = [
        'MaintenanceAPI',
        'AgreementPreferencesAPI',
        'AgreementsOverviewNLAPI',
        'mobile_backend',
        'EnrollmentAPI',
        'mobile-components',
        'paymentsapi',
        'mobile_tools',
        'security-proxy',
        'authentication-api',
        'registration-api',
        'ExperienceComponents',
Example #11
# -*- coding: utf-8 -*-
# @Time    : 2020/2/25 13:49
# @Author  : zbs
# @Site    :
# @File    : reptile_main.py
# @Software: PyCharm

import sys
import json
import os
# from common.syn_data import Reptile
from common.syn_tjj import Reptile
from utils.log import log_init

if __name__ == '__main__':
    config_file = sys.argv[1]
    with open(config_file, 'r', encoding='utf-8') as file:
        config = json.load(file)

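    # make the configured log path absolute, relative to this script's directory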
    project_dir = os.path.abspath(__file__)
    config['logging']['path'] = os.path.join(os.path.dirname(project_dir),
                                             config['logging']['path'])
    log_init(config['logging'])

    reptile = Reptile(config["mongodb"]["address"], config["mongodb"]["port"],
                      config["net_address"])
    reptile.run()
Example #12
    plot_refactoring_metrics(
        metrics,
        level,
        "test_set_github",
        hue,
        yticks=[
            0.1, 0.15, 0.25, 0.5, 0.75, 1, 1.5, 2.0, 2.5, 5, 6, 7.5, 10, 15,
            20, 25, 50, 75, 100, 125, 150
        ],
        metrics=PROCESS_METRICS_FIELDS,
        title=f"Process- and Ownership Metrics: {title} at {level.name}",
        file_descriptor=f"{file_descriptor}_Process_Ownership_Metrics")


log_init(
    f"{SAVE_DIRECTORY}evaluation_{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.txt"
)
start_time = time.time()
# data import and preparation
evaluation_data, prediction_data = import_evaluation(INPUT_DIRECTORY)

false_negatives_metrics_all = pd.DataFrame()
false_positives_metrics_all = pd.DataFrame()
true_negatives_metrics_all = pd.DataFrame()
true_positives_metrics_all = pd.DataFrame()

for index, row in prediction_data.iterrows():
    refactoring_name = row["refactoring_name"]
    level = get_refactoring_level(refactoring_name)
    false_negatives_metrics_all = false_negatives_metrics_all.append(
        extract_false_negatives(row, "test set github", level,
Example #13
            statistics.to_csv(statistics_path, index=False, header=True)
            log(f"Collected all statistics for {level} and stored them at: {statistics_path}.")
        else:
            statistics = pd.concat([statistics, pd.read_csv(statistics_path)],
                                   ignore_index=True)

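    # average each metric's statistics per level and export the summary to Excel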
    grouped = statistics.groupby(["metric", "level"], as_index=False).mean()
    excel_path = f"{save_dir}{file_descriptor}_{dataset}.xlsx"
    grouped.to_excel(excel_path, index=False)
    return statistics


SAVE_DIR = "results/Distribution/Statistics/"
log_init(
    f"{SAVE_DIR}feature_statistics_{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.txt"
)
start_time = time.time()

Path(path.dirname(SAVE_DIR)).mkdir(parents=True, exist_ok=True)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    # for metric_description, metrics in METRIC_SETS:
    #     statistics = pd.DataFrame()
    #     metrics_data = pd.DataFrame()
    #     for metric in metrics:
    #         metrics = get_last_refactored_instance_all([metric], REFACTORING_SAMPLES * 5)
    #         statistics_metric = compute_statistics(metrics, Level.NONE, metric, extra_field="all")
    #         statistics = statistics.append(statistics_metric, ignore_index=True)
    #         metrics_data = metrics_data.append(metrics)
    #         log(f"Extract {metric}")
    #