def print_smells_proprties_table():
    """Print a LaTeX table of smell/property counts per quality concept.

    One row per concept: number of potential, robust and almost-robust
    smells, followed by the count of features satisfying each individual
    causality property.

    NOTE(review): 'proprties' in the function name looks like a typo, but
    renaming would break external callers, so it is kept as-is.
    """
    rows = []
    for concept in CONCEPTS_DICT.keys():
        stats_path = join(DATA_PATH,
                          JOINT_STATS_TEMPLATE.format(concept=concept))
        stats = pd.read_csv(stats_path)
        rows.append([
            CONCEPT_NAMES[concept],
            len(potential_smells(stats, concept)),
            len(robust_smells(stats, concept)),
            len(almost_robust_smells(stats, concept)),
            len(predictive_features(stats, concept)),
            len(cochange_features(stats)),
            len(twins_features(stats)),
            len(monotonicity_features(stats)),
            len(length_features(stats)),
        ])
    table_df = pd.DataFrame(rows, columns=[
        'Concept', 'Potential', 'Robust', 'Almost', 'Predictive',
        'Cochange', 'Twins', 'Monotonicity', 'Length'
    ])
    table_df = table_df.sort_values(['Concept'], ascending=[False])
    title = '\label{tab:smells-properties} Smells Properties'
    print()
    df_to_latex_table(table_df, title, rounding_digits=0)
    print()
def run_print_influence_tables():
    """Print the influence and act-upon tables for every configured concept."""
    for concept in CONCEPTS_DICT.keys():
        stats_path = join(DATA_PATH,
                          JOINT_STATS_TEMPLATE.format(concept=concept))
        stats = pd.read_csv(stats_path)
        print_influence_tables(stats, concept=concept)
        print_act_upon_tables(stats, concept=concept)
def missing_propetries_distribution():
    """Report how many causality properties each 'file_ccp' feature satisfies.

    Builds one 0/1 indicator column per property, prints the distribution of
    properties-per-feature, and for features satisfying exactly 4 of the 5
    properties, prints which property is the one they miss.
    """
    concept = 'file_ccp'
    df = pd.read_csv(
        join(DATA_PATH, JOINT_STATS_TEMPLATE.format(concept=concept)))
    property_flags = [
        'has_predictive', 'has_cochange', 'has_monotonicity', 'has_twins',
        'has_length'
    ]
    # Predictive: positive lift on the concept mean with enough support
    # (at least 200 positive ground-truth cases).
    df['has_predictive'] = df.apply(
        lambda row: 1 if row[RELATIVE_MEAN_DIFF_PREFIX + concept] > 0
        and row.true_positives + row.false_negatives > 200 else 0,
        axis=1)
    # Vectorized forms of the remaining indicators; NaN comparisons yield
    # False, matching the original row-wise lambdas.
    df['has_cochange'] = (df.cochange_precision_lift > 0.0).astype(int)
    df['has_monotonicity'] = df.monotonicity.astype(bool).astype(int)
    df['has_twins'] = (df.twins_precision_lift > 0.0).astype(int)
    # Length: weak correlation with length AND a positive (or vacuous,
    # zero-hit-rate) lift within each length-control group.
    df['has_length'] = (
        (df.line_pearson < 0.5)
        & ((df.control_short_precision_lift > 0.0)
           | (df.control_short_hit_rate == 0.0))
        & ((df.control_medium_precision_lift > 0.0)
           | (df.control_medium_hit_rate == 0.0))
        & ((df.control_long_precision_lift > 0.0)
           | (df.control_long_hit_rate == 0.0))
    ).astype(int)
    df['properties_num'] = df[property_flags].sum(axis=1)
    distribution = df.groupby(['properties_num'], as_index=False).agg({
        'feature': 'count'
    }).sort_values('properties_num')
    print(distribution)
    has_4 = df[df.properties_num == 4][['feature'] + property_flags]
    # For each property, the share of 4-property features missing it.
    for flag in property_flags:
        print(flag, 1 - has_4[flag].mean())
    print(has_4.describe())
def multiple_smells():
    """Print potential smells that appear for more than one quality concept.

    Collects (concept, feature) pairs of potential smells across all
    concepts, then prints the rows whose feature occurs for at least two
    concepts.
    """
    rows = []
    for concept in CONCEPTS_DICT.keys():
        stats = pd.read_csv(
            join(DATA_PATH, JOINT_STATS_TEMPLATE.format(concept=concept)))
        for smell in potential_smells(stats, concept):
            rows.append((concept, smell))
    # Fixed: column was previously misspelled 'concpet', which forced a
    # fragile agg(['count'])/columns-overwrite workaround below.
    features_df = pd.DataFrame(
        rows, columns=['concept', 'feature']).sort_values('feature')
    # Each (concept, feature) pair is unique, so the per-feature row count
    # equals the number of concepts the feature is a smell for.
    counts = features_df['feature'].value_counts()
    repeated_features = counts[counts > 1].index
    print(features_df[features_df.feature.isin(repeated_features)])
def run_basic_models(concept):
    """Train, export, and visualize simple classifiers for a quality concept.

    Builds a binary target (file is in the best quartile of `concept`),
    restricts the dataset to the potential-smell features, then fits a set
    of decision trees and a random forest, saving each model, its
    performance stats, a tree/forest plot, and a SQL rendering.

    Parameters:
        concept: name of the quality-concept column in the per-year dataset.

    Returns:
        (regressor, df): the last classifier fitted and the feature frame.
    """
    start = time.time()
    df = get_per_year_dataset()
    # Quartile threshold is computed over all years, before filtering to
    # the last complete year (preserves the original semantics).
    q25 = df[concept].quantile(0.25)
    df = df[df.year == MAX_YEAR - 1]
    df = df.fillna(NUMERIC_NULL)
    # Binary target: True when the file is in the best (lowest) quartile.
    df[CONCEPT] = df[concept].map(lambda x: x <= q25)
    stats = pd.read_csv(
        join(DATA_PATH, JOINT_STATS_TEMPLATE.format(concept=concept)))
    smells = potential_smells(stats, concept)
    df = df[smells + [CONCEPT]]
    print(risk_predictive_columns(df))
    print("Load data time", time.time() - start)

    # Equal class weights; tweak here to bias precision vs. recall.
    class_weight = {1: 1, 0: 1}
    classifiers = {
        'Tree_ms50_md3': DecisionTreeClassifier(min_samples_leaf=200,
                                                max_depth=3,
                                                class_weight=class_weight),
        'Tree_default': DecisionTreeClassifier(class_weight=class_weight),
        'Tree_ms50': DecisionTreeClassifier(min_samples_leaf=200,
                                            class_weight=class_weight),
        'Tree_md3': DecisionTreeClassifier(max_depth=3,
                                           class_weight=class_weight),
        'RandomForest': RandomForestClassifier(n_estimators=10,
                                               min_samples_leaf=50)
    }
    regressor = None
    for model_name, classifier in classifiers.items():
        print(model_name)
        start = time.time()
        regressor, performance = build_basic_model(
            df,
            concept=CONCEPT,
            classifier=classifier,
            model_file_name=f'{model_name}.pkl',
            performance_file=os.path.join(PERFORMANCE_PATH,
                                          f'{model_name}.json'))
        if 'Tree' in model_name:
            plot_tree(regressor,
                      dot_file_path=os.path.join(FIGURES_PATH,
                                                 f'{model_name}.dot'),
                      png_file_path=os.path.join(FIGURES_PATH,
                                                 f'{model_name}.png'),
                      feature_names=smells)
            tree_to_sql(tree=regressor,
                        feature_names=smells,
                        function_name="tree",
                        output_file=os.path.join(MODELS_PATH,
                                                 f'{model_name}.sql'))
        else:
            plot_random_forest(
                regressor,
                dot_files_prefix=os.path.join(FIGURES_PATH, 'rf1'),
                png_files_prefix=os.path.join(FIGURES_PATH, 'rf1'),
                feature_names=smells)
            random_forest_to_sql(regressor,
                                 feature_names=smells,
                                 function_name_prefix="rf",
                                 output_file_prefix=os.path.join(
                                     MODELS_PATH, 'rf'))
        print("Model running time", time.time() - start)
    return regressor, df
def aggregate_stats(concept):
    """Join all per-feature statistics files for `concept` into one CSV.

    Reads the length, removal, monotonicity, co-change, predictive, twins,
    relative-mean, and length-group-controlled statistics, prefixes their
    columns so they don't collide, inner-merges them on 'feature', writes
    the result to the joint stats file, and returns the merged frame.
    """
    length_df = pd.read_csv(join(DATA_PATH, LENGTH_PEARSON_STATS))
    length_df = length_df[['feature', 'line_pearson']]

    removal_df = pd.read_csv(join(DATA_PATH, SMELL_REMOVAL_FILE))
    removal_df = rename_columns(
        removal_df,
        prefix='removal_',
        columns=set(removal_df.columns) - {'feature'})

    monotonicity_df = pd.read_csv(
        MONOTONE_PATH_TEMPLATE.format(monotone_column=concept))

    cochange_df = pd.read_csv(
        join(DATA_PATH, COHANGE_STATS_TEMPLATE.format(metric=concept)))
    cochange_df = rename_columns(
        cochange_df,
        prefix='cochange_',
        columns=set(cochange_df.columns) - {'feature'})

    features_df = pd.read_csv(
        join(DATA_PATH, PREDICTIVE_STATS_TEMPLATE.format(concept=concept)))
    features_df = features_df.rename(
        columns={'feature_name': 'feature'})  # TODO - change in original

    twins_df = pd.read_csv(
        join(DATA_PATH, AUTHOR_TWIN_CM_TEMPLATE.format(concept=concept)))
    twins_df = rename_columns(
        twins_df,
        prefix='twins_',
        columns=set(twins_df.columns) - {'feature'})

    relative_mean_df = pd.read_csv(join(DATA_PATH, RELATIVE_MEANS_FILE))
    relative_mean_df = relative_mean_df[[
        'feature', RELATIVE_MEAN_PREFIX + concept,
        RELATIVE_MEAN_DIFF_PREFIX + concept
    ]]

    # Inner-merge everything on 'feature', in the same order as before.
    joint_df = features_df
    for frame in (cochange_df, relative_mean_df, monotonicity_df,
                  length_df, removal_df, twins_df):
        joint_df = pd.merge(joint_df, frame, on='feature')

    # Merge in the per-length-group controlled predictive stats.
    file_codesmell_df = pd.read_csv(join(DATA_PATH, BINARY_DATASET_FILE))
    control_variable = 'length_group'
    for group in file_codesmell_df[control_variable].unique():
        template = ('pred_stats_ctl_{control_variable}_{control_val}_'
                    + concept + '.csv')
        file_name = template.format(control_variable=control_variable,
                                    control_val=group)
        control_df = pd.read_csv(join(DATA_PATH, file_name))
        control_df = control_df.rename(
            columns={'feature_name': 'feature'})  # TODO - change in original
        control_df = rename_columns(
            control_df,
            prefix='control_{val}_'.format(val=group),
            columns=set(control_df.columns) - {'feature'})
        joint_df = pd.merge(joint_df, control_df, on='feature')

    joint_df.to_csv(join(DATA_PATH,
                         JOINT_STATS_TEMPLATE.format(concept=concept)),
                    index=False)
    return joint_df
def model_groups_influence():
    """Print LaTeX tables on how the full smell group influences quality.

    For each length-group subset (short / medium / long / all), classifies a
    file as smell-free when it has zero potential smells, and reports the
    hit rate plus the precision lift of that classifier against both the
    high-quality (best quartile) and low-quality (worst quartile) labels.

    Fixes applied: removed a dead dataset load before the loop (it was
    immediately overwritten inside the loop), and moved `rows.append(row)`
    after the last value is added to `row` — the original relied on list
    aliasing to mutate the row after appending it.
    """
    SMELLS_COUNT = 'smells_count'
    CLASSIFIER = 'has_smells'
    LOW_QUALITY = 'low_quality'
    HIGH_QUALITY = 'high_quality'
    for length_groups in [['short'], ['medium'], ['long'],
                          ['short', 'medium', 'long']]:
        print(length_groups)
        df = get_per_year_dataset()
        df = df[df.year == MAX_YEAR - 1]
        df = df[df['length_group'].isin(length_groups)]
        rows = []
        for concept in CONCEPTS_DICT.keys():
            row = [CONCEPT_NAMES[concept]]
            stats = pd.read_csv(
                join(DATA_PATH, JOINT_STATS_TEMPLATE.format(concept=concept)))
            smells = potential_smells(stats, concept)
            df[SMELLS_COUNT] = df[smells].sum(axis=1)
            # "Smell-free" classifier: no potential smell present.
            df[CLASSIFIER] = df[SMELLS_COUNT].map(lambda x: x == 0)
            row.append(df[CLASSIFIER].mean())
            q25 = df[concept].quantile(0.25)
            df[HIGH_QUALITY] = df[concept].map(lambda x: x <= q25)
            cm = pair_analysis(df,
                               first_metric=CLASSIFIER,
                               second_metric=HIGH_QUALITY)
            row.append(cm['precision_lift'])
            q75 = df[concept].quantile(0.75)
            df[LOW_QUALITY] = df[concept].map(lambda x: x >= q75)
            cm = pair_analysis(df,
                               first_metric=CLASSIFIER,
                               second_metric=LOW_QUALITY)
            row.append(cm['precision_lift'])
            rows.append(row)
        features_df = pd.DataFrame(
            rows,
            columns=['Metric', 'Hit Rate', 'High Quality',
                     'Low Quality']).sort_values('Metric')
        print()
        df_to_latex_table(
            features_df,
            '\label{tab:group_smell_influence} Smells Groups Influence ')