def test_get_encoded_logs_Loaded_cache(self):
    """Encoded logs stay identical after each cached pickle is deleted.

    The cache is filled by a first call, then the pickle derived from the
    train log path and afterwards the one derived from the test log path are
    removed from disk. After every removal the logs are requested again and
    must equal the frames returned by the first call.
    """
    job = create_test_job()
    reference = get_encoded_logs(job, True)

    loaded_log = LoadedLog.objects.filter(split=job.split)[0]
    # Read both paths up front, before anything is deleted.
    pickled_sources = (loaded_log.train_log_path, loaded_log.test_log_path)

    for source_path in pickled_sources:
        os.remove('cache/loaded_log_cache/' + get_digested(source_path) + '.pickle')
        reloaded = get_encoded_logs(job, True)
        for position in (0, 1):
            assert_frame_equal(reference[position], reloaded[position])
def test_get_encoded_logs_cache(self):
    """Encoded logs are the same with the cache flag off and on.

    The first call (flag ``True``) provides the reference frames; a call with
    the flag ``False`` and a second call with the flag ``True`` must both
    return frames equal to that reference.
    """
    job = create_test_job()
    reference = get_encoded_logs(job, True)

    for use_cache in (False, True):
        candidate = get_encoded_logs(job, use_cache)
        for position in (0, 1):
            assert_frame_equal(reference[position], candidate[position])
def handle(self, *args, **kwargs):
    """Fit a RuleFit model on the encoded training log of a hard-coded job.

    The job with primary key ``TARGET_MODEL`` is loaded, its encoded training
    log is split into a feature frame and a target frame, and a ``RuleFit``
    instance is fitted on them. The extracted rules are restricted to
    non-linear rules with a positive coefficient and given an ``effect``
    column (``coef * support``). Nothing is printed or returned.

    :param args: positional command arguments (unused)
    :param kwargs: keyword command arguments (unused)
    """
    # get model
    TARGET_MODEL = 5
    job = Job.objects.filter(pk=TARGET_MODEL)[0]
    # The stored model is loaded (and its first element selected) but is not
    # used by the RuleFit computation below.
    model = joblib.load(job.predictive_model.model_path)
    model = model[0]

    training_df, test_df = get_encoded_logs(job)

    # `axis` is passed by keyword: the positional form was removed in pandas 2.0.
    X_train = training_df.drop(['trace_id', 'label'], axis=1)
    # NOTE(review): the target is whatever column(s) remain after this drop,
    # not 'label' -- confirm this is intended.
    Y_train = training_df.drop(
        ['trace_id', 'prefix_1', 'prefix_3', 'prefix_4', 'label'], axis=1)

    rf = RuleFit()
    columns = list(X_train.columns)
    # `.values` replaces `DataFrame.as_matrix()`, which was removed in pandas 1.0.
    rf.fit(X_train.values, Y_train.values.ravel(), feature_names=columns)

    rules = rf.get_rules()
    # rules = rules[rules.coef != 0].sort_values("support", ascending=False)
    rules = rules[(rules.coef > 0.) & (rules.type != 'linear')]
    rules['effect'] = rules['coef'] * rules['support']
    # NOTE(review): pandas >= 1.0 deprecates -1 here in favour of None; kept
    # because the rest of the file targets an older pandas -- confirm.
    pd.set_option('display.max_colwidth', -1)
    # print(rules.nlargest(10, 'effect'))
def handle(self, *args, **kwargs):
    """Audit a decision tree with FairML and save the dependence plot.

    A ``DecisionTreeClassifier`` is fitted on the encoded training log of the
    job with primary key ``TARGET_MODEL``, ``audit_model`` is run on its
    ``predict`` function, the resulting importances are printed, and the
    feature dependence plot is written to a PNG file.

    :param args: positional command arguments (unused)
    :param kwargs: keyword command arguments (unused)
    """
    plt.style.use('ggplot')
    plt.figure(figsize=(6, 6))

    TARGET_MODEL = 59
    job = Job.objects.filter(pk=TARGET_MODEL)[0]
    training_df, test_df = get_encoded_logs(job)

    # `axis` is passed by keyword: the positional form was removed in pandas 2.0.
    X_train = training_df.drop(['trace_id', 'label'], axis=1)
    Y_train = training_df['label'].values

    RF = DecisionTreeClassifier()
    RF.fit(X_train, Y_train)

    importancies, _ = audit_model(RF.predict, X_train)
    print(importancies)

    # generate feature dependence plot
    plot_dependencies(
        importancies.median(),
        reverse_values=False,
        title="FairML feature dependence plot"
    )

    file_name = "fairml_plot_train_1_3_decision_tree.png"
    plt.savefig(file_name, transparent=False, bbox_inches='tight', dpi=550)
def handle(self, *args, **kwargs):
    """Save a SHAP force plot for one row of a job's encoded training log.

    The stored model of the job with primary key ``TARGET_MODEL`` is wrapped
    in a ``shap.TreeExplainer``; SHAP values are computed on the training
    features and the force plot of row ``EXPLANATION_TARGET`` is saved to
    ``shap_plot_train_1_3.png``.

    :param args: positional command arguments (unused)
    :param kwargs: keyword command arguments (unused)
    """
    TARGET_MODEL = 68
    job = Job.objects.filter(pk=TARGET_MODEL)[0]
    model = joblib.load(job.predictive_model.model_path)
    model = model[0]
    training_df, test_df = get_encoded_logs(job)

    # Same value as the original `2_3300` literal, with conventional grouping.
    EXPLANATION_TARGET = 23_300

    shap.initjs()
    explainer = shap.TreeExplainer(model)

    # `axis` is passed by keyword: the positional form was removed in pandas 2.0.
    training_df = training_df.drop(['trace_id', 'label'], axis=1)
    shap_values = explainer.shap_values(training_df)

    # The return value is discarded, so `decode` presumably rewrites
    # `training_df` in place before the row is plotted -- TODO confirm.
    encoder = retrieve_proper_encoder(job)
    encoder.decode(training_df, job.encoding)

    shap.force_plot(
        explainer.expected_value,
        shap_values[EXPLANATION_TARGET, :],
        training_df.iloc[EXPLANATION_TARGET, :],
        show=False,
        matplotlib=True
    ).savefig('shap_plot_train_1_3.png')
def get_decoded_df(request, pk):
    """Respond with the training log of job ``pk`` after decoding.

    :param request: incoming request (not inspected)
    :param pk: primary key of the job whose training log is requested
    :return: ``Response`` with status 200 wrapping the training frame without
        its ``trace_id`` column
    :raises IndexError: if no job with primary key ``pk`` exists
    """
    job = Job.objects.filter(pk=pk)[0]
    training_df, test_df = get_encoded_logs(job)

    # `axis` is passed by keyword: the positional form was removed in pandas 2.0.
    training_df = training_df.drop(['trace_id'], axis=1)

    # The return value is discarded, so `decode` presumably rewrites
    # `training_df` in place -- TODO confirm.
    encoder = retrieve_proper_encoder(job)
    encoder.decode(training_df, job.encoding)

    return Response(training_df, status=200)
def test_get_labelled_logs(self):
    """``get_labelled_logs`` returns the frames ``get_encoded_logs`` produced.

    The encoded logs are computed first; the labelled logs fetched afterwards
    must match them frame by frame.
    """
    job = create_test_job()
    encoded = get_encoded_logs(job)
    labelled = get_labelled_logs(job)

    for position in (0, 1):
        assert_frame_equal(encoded[position], labelled[position])
def handle(self, *args, **kwargs):
    """Save a target plot for feature ``Age_1`` and print its summary.

    The encoded training log of the job with primary key ``TARGET_MODEL`` is
    plotted with ``info_plots.target_plot`` over an integer grid derived from
    ``_get_grids``. The figure is saved to a PNG file, the training frame is
    decoded, and a list of ``{'value', 'label', 'count'}`` dictionaries built
    from the plot summary is printed.

    :param args: positional command arguments (unused)
    :param kwargs: keyword command arguments (unused)
    :raises ValueError: if a grid value does not occur in the feature column
    """
    # get model
    TARGET_MODEL = 59
    job = Job.objects.filter(pk=TARGET_MODEL)[0]
    # The stored model is loaded but not used by the plot below.
    model = joblib.load(job.predictive_model.model_path)[0]

    # load data
    training_df, test_df = get_encoded_logs(job)
    training_df['label'] = training_df['label'].astype(bool).astype(int)
    # `axis` is passed by keyword: the positional form was removed in pandas 2.0.
    features = list(
        training_df.drop(['trace_id', 'label'], axis=1).columns.values)
    feature = 'Age_1'

    feature_grids, percentile_info = _get_grids(
        feature_values=training_df[feature].values,
        num_grid_points=10,
        grid_type=None,
        percentile_range='percentile',
        grid_range=None)

    # One integer grid is used both for the plot and for the index lookup.
    grid_values = range(int(feature_grids.min()),
                        int(feature_grids.max() - 1))
    custom_grids = list(grid_values)
    print(features)

    fig, axes, summary_df = info_plots.target_plot(
        df=training_df,
        feature=feature,
        feature_name='feature value',
        cust_grid_points=custom_grids,
        target='label',
        show_percentile=False)
    fig.savefig('ice_plot_train_1_3_CType.png')

    # Position of the first row holding each grid value, taken BEFORE decoding
    # so the decoded value can be read back at the same position afterwards.
    encoded_values = list(training_df[feature].values)
    indexs = [encoded_values.index(x) for x in grid_values]

    encoder = retrieve_proper_encoder(job)
    encoder.decode(training_df, job.encoding)
    values = training_df[feature].values

    print(summary_df)
    if job.encoding.value_encoding != ValueEncodings.BOOLEAN.value:
        # The last grid index is deliberately left out, as in the original loop.
        lst = [{
            'value': values[position],
            'label': summary_df['label'][x],
            'count': summary_df['count'][x],
        } for x, position in enumerate(indexs[:-1])]
    else:
        lst = [{
            'value': summary_df['display_column'][x],
            'label': summary_df['label'][x],
            'count': summary_df['count'][x],
        } for x in range(summary_df.shape[0])]
    print(lst)
def calculate_hyperopt(job: Job) -> (dict, dict, dict):
    """Main entry method for hyperopt calculations.

    Runs ``fmin`` over the search space of ``job``, picks the trial with the
    lowest loss, stores its predictive model on the job and saves the job.

    :param job: job configuration
    :return: tuple containing the results, config and predictive_model split
        of the best trial found by the search
    :raises ValueError: if ``fmin`` raises ``ValueError``, reported as all
        jobs having failed (the original error is chained as the cause)
    """
    # The optimizer settings are stored on the attribute named after the
    # lower-cased optimization method; resolve it once.
    optimizer = job.hyperparameter_optimizer
    optimizer_settings = getattr(optimizer,
                                 optimizer.optimization_method.lower())

    logger.info("Start hyperopt job {} with {}, performance_metric {}".format(
        job.type, get_run(job), optimizer_settings.performance_metric))

    # Published as module-level globals; presumably read by
    # `_calculate_and_evaluate` during the search -- TODO confirm.
    global training_df, test_df, global_job
    global_job = job
    training_df, test_df = get_encoded_logs(job)

    space = _get_space(job)
    max_evaluations = optimizer_settings.max_evaluations
    trials = Trials()
    algorithm = _choose_algorithm(job)

    try:
        fmin(_calculate_and_evaluate,
             space,
             algo=algorithm.suggest,
             max_evals=max_evaluations,
             trials=trials)
    except ValueError as err:
        raise ValueError(
            "All jobs failed, cannot find best configuration") from err

    # Sentinel with loss 100: only trials with a strictly lower loss replace it.
    current_best = {
        'loss': 100,
        'results': {},
        'predictive_model_id': {},
        'model_split': {},
        'config': {}
    }
    for trial in trials:
        candidate = trial['result']
        if current_best['loss'] > candidate['loss']:
            current_best = candidate

    job.predictive_model = PredictiveModel.objects.filter(
        pk=current_best['predictive_model_id'])[0]
    job.save()

    logger.info("End hyperopt job {}, {} . Results {}".format(
        job.type, get_run(job), current_best['results']))
    return (current_best['results'], current_best['config'],
            current_best['model_split'])
def explanation(exp_id: int, explanation_target: str = None):
    """Compute the explanation stored under ``exp_id``.

    :param exp_id: primary key of the ``Explanation`` to compute
    :param explanation_target: forwarded unchanged to the explainer
    :return: tuple of the string ``'False'`` and the explainer's result
    """
    exp = Explanation.objects.filter(pk=exp_id)[0]

    # load data
    train_frame, test_frame = get_encoded_logs(exp.job)

    # Pick the explainer registered for this explanation type.
    explain = EXPLANATION[exp.type][EXPLAIN]
    return 'False', explain(exp, train_frame, test_frame, explanation_target)
def explanation_temporal_stability(exp_id: int, explanation_target: str = None):
    """Compute the temporal stability of the explanation stored under ``exp_id``.

    :param exp_id: primary key of the ``Explanation`` to compute
    :param explanation_target: forwarded unchanged to the explainer
    :return: tuple of the string ``'False'`` and the explainer's result
    """
    exp = Explanation.objects.filter(pk=exp_id)[0]

    # load data
    train_frame, test_frame = get_encoded_logs(exp.job)

    # Pick the temporal-stability routine registered for this explanation type.
    compute_stability = EXPLANATION[exp.type][TEMPORAL_STABILITY]
    return 'False', compute_stability(exp, train_frame, test_frame,
                                      explanation_target)
def handle(self, *args, **kwargs):
    """Explain one test instance of a hard-coded job with Anchor.

    The stored model of the job with primary key ``TARGET_MODEL`` is
    explained on the first row of the encoded test log; the prediction, the
    anchor, its precision and coverage, and the anchor's coverage on the test
    set are printed.

    :param args: positional command arguments (unused)
    :param kwargs: keyword command arguments (unused)
    """
    # get model
    TARGET_MODEL = 20
    job = Job.objects.filter(pk=TARGET_MODEL)[0]
    model = joblib.load(job.predictive_model.model_path)
    model = model[0]

    # load data
    training_df, test_df = get_encoded_logs(job)

    # Feature frames are built once. `axis` is passed by keyword (the
    # positional form was removed in pandas 2.0) and `.values` replaces
    # `DataFrame.as_matrix()`, which was removed in pandas 1.0.
    train_features = training_df.drop(['trace_id', 'label'], axis=1)
    test_features = test_df.drop(['trace_id', 'label'], axis=1)
    train_matrix = train_features.values
    test_matrix = test_features.values

    # get the actual explanation
    # NOTE: this mutates the feature list held by the in-memory encoding.
    job.encoding.features.remove('label')
    explainer = anchor_tabular.AnchorTabularExplainer(
        class_names=[True, False],
        feature_names=job.encoding.features,
        data=train_features.T,
        categorical_names={
            job.encoding.features.index(item):
            list(range(max(training_df[item])))
            for item in job.encoding.features
        }
    )
    explainer.fit(
        train_matrix,
        [True, False],
        test_matrix,
        [True, False]
    )

    model_fn = model.predict

    # show plot
    idx = 0
    instance = test_matrix[idx]
    np.random.seed(1)
    print('Prediction: ',
          explainer.class_names[model_fn(instance.reshape(1, -1))[0]])
    exp = explainer.explain_instance(instance, model_fn, threshold=0.95)
    print('Anchor: %s' % (' AND '.join(exp.names())))
    print('Precision: %.2f' % exp.precision())
    print('Coverage: %.2f' % exp.coverage())

    # Rows of the test set agreeing with the instance on every anchor
    # feature. The comparison is done on the numpy matrix: the original
    # sliced the DataFrame with `[:, features]`, which DataFrames reject.
    anchor_features = exp.features()
    fit_anchor = np.where(
        np.all(test_matrix[:, anchor_features] == instance[anchor_features],
               axis=1))[0]
    print('Anchor test coverage: %.2f' %
          (fit_anchor.shape[0] / float(test_features.shape[0])))
    print('done')
def handle(self, *args, **kwargs):
    """Fit a Skater tree surrogate for a stored model and save it as SVG.

    The stored model of the job with primary key ``TARGET_MODEL`` is wrapped
    in an ``InMemoryModel``; a tree surrogate is fitted on the encoded
    training log and rendered with ``dtreeviz`` to
    ``skater_plot_train_2_2.svg``.

    :param args: positional command arguments (unused)
    :param kwargs: keyword command arguments (unused)
    """
    # get model
    TARGET_MODEL = 71
    job = Job.objects.filter(pk=TARGET_MODEL)[0]
    model = joblib.load(job.predictive_model.model_path)[0]

    # load data
    training_df, test_df = get_encoded_logs(job)

    # `axis` is passed by keyword: the positional form was removed in pandas
    # 2.0. The feature frame is built once and reused for the column names.
    X_train = training_df.drop(['trace_id', 'label'], axis=1)
    Y_train = training_df['label'].values
    features = list(X_train.columns.values)

    interpreter = Interpretation(training_df, feature_names=features)
    model_inst = InMemoryModel(
        model.predict,
        examples=X_train,
        model_type='classifier',
        unique_values=[1, 2],
        feature_names=features,
        target_names=['label'])

    surrogate_explainer = interpreter.tree_surrogate(model_inst, seed=5)
    surrogate_explainer.fit(X_train,
                            Y_train,
                            use_oracle=True,
                            prune='post',
                            scorer_type='default')
    # NOTE(review): class names are set to the feature names here, as in the
    # original -- confirm this is intended.
    surrogate_explainer.class_names = features

    viz = dtreeviz(surrogate_explainer.estimator_,
                   X_train,
                   Y_train,
                   target_name='label',
                   feature_names=features,
                   orientation="TD",
                   class_names=list(surrogate_explainer.class_names),
                   fancy=True,
                   X=None,
                   label_fontsize=12,
                   ticks_fontsize=8,
                   fontname="Arial")
    viz.save("skater_plot_train_2_2.svg")
def handle(self, *args, **kwargs):
    """Explain one test instance of a hard-coded job with LIME.

    A ``LimeTabularExplainer`` is built on the encoded training log of the
    job with primary key ``TARGET_MODEL``, row ``EXPLANATION_TARGET`` of the
    test log is explained against the stored model's ``predict_proba``, and
    the explanation is written to ``oi.html``.

    :param args: positional command arguments (unused)
    :param kwargs: keyword command arguments (unused)
    """
    # get model
    TARGET_MODEL = 5
    job = Job.objects.filter(pk=TARGET_MODEL)[0]
    model = joblib.load(job.predictive_model.model_path)

    # load data
    training_df, test_df = get_encoded_logs(job)

    # index of the test row to explain
    EXPLANATION_TARGET = 3

    # Feature frames are built once. `axis` is passed by keyword (the
    # positional form was removed in pandas 2.0).
    train_features = training_df.drop(['trace_id', 'label'], axis=1)
    test_features = test_df.drop(['trace_id', 'label'], axis=1)
    feature_names = list(train_features.columns.values)

    # get the actual explanation
    explainer = lime.lime_tabular.LimeTabularExplainer(
        # `.values` replaces `DataFrame.as_matrix()`, removed in pandas 1.0.
        train_features.values,
        feature_names=feature_names,
        # every feature column is declared categorical
        categorical_features=list(range(len(feature_names))),
        verbose=True,
        mode='classification',
    )
    exp = explainer.explain_instance(
        # TODO probably the opposite would be way less computationally intensive
        test_features.iloc[EXPLANATION_TARGET],
        model[0].predict_proba,
        num_features=5)

    # show plot
    # exp.show_in_notebook(show_table=True)
    # exp.as_pyplot_figure().show()
    exp.save_to_file('oi.html')
    print('done')
def handle(self, *args, **kwargs):
    """Mine frequent LIME feature patterns per confusion-matrix cell.

    For a hard-coded job and split the command:

    1. computes the temporal stability and the LIME temporal stability of
       the encoded test log;
    2. assigns every trace id to one of the cells ``tp``/``tn``/``fp``/``fn``
       by comparing the stored prediction with the gold label;
    3. collects, per trace, the ``(feature, value, importance)`` triples of
       the LIME explanation and filters them by the ``LIMEFEATS`` thresholds;
    4. counts how often each value occurs per cell (the "absence" report);
    5. mines frequent item sets (or sequences) from the filtered features and
       keeps the top-ranked ones.

    Intermediate results are printed, or written to files when the
    ``outputfile`` settings are not ``None``.

    :param args: positional command arguments (unused)
    :param kwargs: keyword command arguments (unused)
    :return: tuple ``(confusion_matrix, data, freq_seqs_after_filter,
        filtered_freq_seqs_after_filter)``
    """
    TARGET_JOB = 71
    SPLITID = 12
    job_obj = Job.objects.filter(pk=TARGET_JOB)[0]
    split_obj = Split.objects.filter(pk=SPLITID)[0]
    training_df, test_df = get_encoded_logs(job_obj)
    # Each consumer below receives its own copy of the test log.
    test_df1 = test_df.copy()
    test_df2 = test_df.copy()
    test_df3 = test_df.copy()

    # todo: retrieve lime explanation

    # RETRIEVE&SAVE TS
    ts_exp_job, _ = Explanation.objects.get_or_create(
        type=ExplanationTypes.TEMPORAL_STABILITY.value,
        split=split_obj,
        predictive_model=job_obj.predictive_model,
        job=job_obj)
    ts = temporal_stability(ts_exp_job,
                            training_df,
                            test_df1,
                            explanation_target=None)

    # RETRIEVE&SAVE LIMETS
    limets_exp_job, _ = Explanation.objects.get_or_create(
        type=ExplanationTypes.LIME.value,
        split=split_obj,
        predictive_model=job_obj.predictive_model,
        job=job_obj)
    lime_ts = lime_temporal_stability(limets_exp_job,
                                      training_df,
                                      test_df2,
                                      explanation_target=None)

    # SAVE GOLD
    gold = test_df3[['trace_id', 'label']]

    # todo: retrieve confusion matrix
    # Inner keys ending with '_' get a '1' appended; all others are kept as is.
    ts = {
        asdf: {
            uuu + '1' if uuu[-1:] == '_' else uuu: ts[asdf][uuu]
            for uuu in ts[asdf]
        }
        for asdf in ts
    }
    lime_ts = {
        asdf: {
            uuu + '1' if uuu[-1:] == '_' else uuu: lime_ts[asdf][uuu]
            for uuu in lime_ts[asdf]
        }
        for asdf in lime_ts
    }

    trace_ids = set(gold['trace_id'])
    # Each cell lists the (stringified) trace ids found in `ts`. The
    # prediction is read from the entry 'prefix_<n>' where n is the number of
    # entries stored for the trace (presumably the longest prefix -- TODO
    # confirm) and compared with the gold label rendered as 'true'/'false'.
    confusion_matrix = {
        'tp': [
            str(tid) for tid in trace_ids
            if str(tid) in ts and ts[str(tid)]['prefix_' + str(len(ts[str(tid)]))]
            ['predicted'] == 'true' and ts[str(tid)]
            ['prefix_' + str(len(ts[str(tid)]))]['predicted'] == ('true' if gold[
                gold['trace_id'] == tid]['label'].values[0] else 'false')
        ],
        'tn': [
            str(tid) for tid in trace_ids
            if str(tid) in ts and ts[str(tid)]['prefix_' + str(len(ts[str(tid)]))]
            ['predicted'] == 'false' and ts[str(tid)]
            ['prefix_' + str(len(ts[str(tid)]))]['predicted'] == ('true' if gold[
                gold['trace_id'] == tid]['label'].values[0] else 'false')
        ],
        'fp': [
            str(tid) for tid in trace_ids
            if str(tid) in ts and ts[str(tid)][
                'prefix_' + str(len(ts[str(tid)]))]['predicted'] == 'true' and
            ts[str(tid)]['prefix_' +
                         str(len(ts[str(tid)]))]
            ['predicted'] != ('true' if gold[
                gold['trace_id'] == tid]['label'].values[0] else 'false')
        ],
        'fn': [
            str(tid) for tid in trace_ids
            if str(tid) in ts and ts[str(tid)][
                'prefix_' + str(len(ts[str(tid)]))]['predicted'] == 'false' and
            ts[str(tid)]['prefix_' + str(len(ts[str(tid)]))]
            ['predicted'] != ('true' if gold[
                gold['trace_id'] == tid]['label'].values[0] else 'false')
        ]
    }

    # Per cell and trace: (feature, value, importance) triples read from the
    # LIME result at the job's prefix length, sorted by (importance, value).
    # Traces without an entry for that prefix length are skipped.
    limefeats = {
        k: {
            key: [
                element for element in sorted(
                    [(pref, lime_ts[key]
                      ['prefix_' +
                       str(job_obj.encoding.prefix_length)][pref]['value'],
                      lime_ts[key]['prefix_' +
                                   str(job_obj.encoding.prefix_length)]
                      [pref]['importance']) for pref in lime_ts[key]
                     ['prefix_' + str(job_obj.encoding.prefix_length)]],
                    key=lambda x: (x[2], x[1]),
                    reverse=True if k in ['tp', 'fp'] else
                    False  # descending for 'tp'/'fp' (predicted 'true'), ascending for 'tn'/'fn'
                )
            ]
            for key in confusion_matrix[k]
            if 'prefix_' + str(job_obj.encoding.prefix_length) in lime_ts[key]
        }
        for k in confusion_matrix
    }

    # No patterns are retrieved yet, so every cell starts empty.
    freq_seqs = {'tp': {}, 'tn': {}, 'fp': {}, 'fn': {}}

    # todo: retrieve patterns
    CONFUSION_MATRIX = ['tp', 'tn', 'fp', 'fn']
    # Per-cell importance thresholds, top-K cut-off and optional output file
    # for the LIME features.
    LIMEFEATS = {
        'abs_lime': False,
        'tp': 0.2,
        'tn': 0.2,
        'fp': 0.2,
        'fn': 0.2,
        'top': 10,
        'outputfile': None
    }
    # Per-cell weight thresholds, top-K cut-off and optional output files for
    # the frequent patterns.
    FREQ_SEQS = {
        'tp': 10,
        'tn': 10,
        'fp': 10,
        'fn': 10,
        'top': 15,
        'outputfile': None,
        'RECOMPUTEDoutputfile': None,
    }
    # NOTE(review): ABSENCE is never read in this function.
    ABSENCE = {
        'tp': 0.1,
        'tn': 0.1,
        'fp': 0.1,
        'fn': 0.1,
        'ABSENCEoutputfile': None
    }
    MINING_METHOD = 'item_mining'

    print(
        'Initial CONFUSION MATRIX:\n', *[
            '\tlimefeats ' + KEY + ':' + str(len(limefeats[KEY]))
            for KEY in CONFUSION_MATRIX
        ], '\n', *[
            '\tfreq_seqs ' + KEY + ':' + str(len(freq_seqs[KEY]))
            for KEY in CONFUSION_MATRIX
        ])

    # Per cell: values observed for each feature-name prefix (the part of the
    # feature name before the first '_').
    available_values = {}
    for KEY in CONFUSION_MATRIX:
        available_values[KEY] = {}
        for tid in limefeats[KEY]:
            for event in limefeats[KEY][tid]:
                if event[0].split('_')[0] not in available_values[KEY]:
                    available_values[KEY][event[0].split('_')[0]] = set()
                available_values[KEY][event[0].split('_')[0]].add(event[1])

    # Keep only features whose importance passes the cell threshold: signed
    # comparison when 'abs_lime' is False (>= threshold for 'tp'/'fp',
    # <= -threshold for 'tn'/'fn'), absolute value otherwise.
    filtered_limefeats = {
        KEY: {
            tid: [
                event for event in limefeats[KEY][tid]
                if ((not LIMEFEATS['abs_lime']) and (
                    (KEY in ['tp', 'fp'] and event[2] >= LIMEFEATS[KEY]) or
                    (KEY in ['tn', 'fn'] and event[2] <= -LIMEFEATS[KEY]))) or
                (LIMEFEATS['abs_lime'] and abs(event[2]) >= LIMEFEATS[KEY])
            ]
            for tid in limefeats[KEY]
        }
        for KEY in CONFUSION_MATRIX
    }
    # Same filter as `filtered_limefeats`, computed a second time.
    prefiltered_limefeats = {
        KEY: {
            tid: [
                event for event in limefeats[KEY][tid]
                if ((not LIMEFEATS['abs_lime']) and (
                    (KEY in ['tp', 'fp'] and event[2] >= LIMEFEATS[KEY]) or
                    (KEY in ['tn', 'fn'] and event[2] <= -LIMEFEATS[KEY]))) or
                (LIMEFEATS['abs_lime'] and abs(event[2]) >= LIMEFEATS[KEY])
            ]
            for tid in limefeats[KEY]
        }
        for KEY in CONFUSION_MATRIX
    }
    # NOTE(review): `filtered_limefeats_mine` (top-K per trace) is never read.
    filtered_limefeats_mine = {
        KEY: {
            tid: prefiltered_limefeats[KEY][tid][0:LIMEFEATS['top']]
            for tid in prefiltered_limefeats[KEY]
        }
        for KEY in CONFUSION_MATRIX
    }

    # Drop traces left without any feature; iterate over a key snapshot
    # because entries are deleted inside the loop.
    for KEY in CONFUSION_MATRIX:
        for k in list(filtered_limefeats[KEY]):
            if len(filtered_limefeats[KEY][k]) == 0:
                del filtered_limefeats[KEY][k]

    def tassellate_numbers(element):
        """Coarsen the string form of a decimal number.

        The value is converted to ``str``. If it contains a ``'.'`` and is at
        most 5 characters long, the first character of its integer part
        followed by ``'0'`` is returned; if it contains a ``'.'`` and is at
        least 10 characters long, the first four characters of its integer
        part are returned; otherwise the string is returned unchanged.
        """
        element = str(element)
        return str(element).split('.')[0][0] + '0' \
            if \
            '.' in str(element) \
            and \
            len(str(element)) <= 5 \
            else \
            str(element).split('.')[0][0:4] \
            if \
            '.' in str(element) \
            and \
            len(str(element)) >= 10 \
            else \
            element

    def retrieve_right_len(element, available_values):
        """Return how many distinct values the attribute of ``element`` has.

        With a ``'_'`` in ``element`` the attribute is the part before the
        first ``'_'``. Otherwise every attribute having a value whose
        coarsened form equals ``element`` is considered and the largest value
        count is returned.
        """
        if '_' in element:
            return len(available_values[element.split('_')[0]])
        else:
            retval = []
            for attribute in available_values:
                if any([
                        str(element) == str(tassellate_numbers(value))
                        for value in available_values[attribute]
                ]):
                    retval += [len(available_values[attribute])]
            # NOTE(review): raises ValueError when no attribute matched.
            return max(retval)

    def weight_freq_seqs(KEY, available_values, element, limefeats):
        """Print the weighting details of a pattern and return its weight.

        ``element`` is a ``(pattern, frequency)`` pair. The printed line shows
        a formula that also multiplies by the pattern length and the number
        of possible values; the returned weight is only the frequency divided
        by the number of traces in cell ``KEY`` (the other factors are
        commented out).
        """
        print(element[0])
        print(
            'frequency:', element[1], ' * ', 'len w/out absences: ',
            len([el for el in element[0] if 'absence' not in el]), ' * ',
            'sum of enumerator of possible values: ',
            sum([
                retrieve_right_len(el, available_values[KEY])
                for el in element[0] if 'absence' not in el
            ]), ' / ',
            'amount of examples in the field of confusion matrix: ',
            len(limefeats[KEY]), ' = ',
            (element[1] *
             len([el for el in element[0] if 'absence' not in el]) * sum([
                 retrieve_right_len(el, available_values[KEY])
                 for el in element[0] if 'absence' not in el
             ])) / len(limefeats[KEY]))
        return (
            element[1]
            # *
            # len([el for el in element[0] if 'absence' not in el]) *
            # sum([retrieve_right_len(el, available_values[KEY]) for el in element[0] if 'absence' not in el])
        ) / len(limefeats[KEY])

    # NOTE(review): `filtered_freq_seqs_old` is never read; it repeats the
    # computation of `prefiltered_freq_seqs` below.
    filtered_freq_seqs_old = {
        KEY: sorted([
            element for element in freq_seqs[KEY]
            if weight_freq_seqs(KEY, available_values, element, limefeats) >=
            FREQ_SEQS[KEY]
        ],
                    key=lambda x: x[1],
                    reverse=True)
        for KEY in CONFUSION_MATRIX
    }
    prefiltered_freq_seqs = {
        KEY: sorted([
            element for element in freq_seqs[KEY]
            if weight_freq_seqs(KEY, available_values, element, limefeats) >=
            FREQ_SEQS[KEY]
        ],
                    key=lambda x: x[1],
                    reverse=True)
        for KEY in CONFUSION_MATRIX
    }

    # todo: is this the actual topK?
    filtered_freq_seqs = {
        KEY: prefiltered_freq_seqs[KEY][0:FREQ_SEQS['top']]
        for KEY in CONFUSION_MATRIX
    }

    print(
        'CONFUSION MATRIX after filtering:\n', *[
            '\tlimefeats ' + KEY + ':' + str(len(filtered_limefeats[KEY]))
            for KEY in CONFUSION_MATRIX
        ], '\n', *[
            '\tfreq_seqs ' + KEY + ':' + str(len(filtered_freq_seqs[KEY]))
            for KEY in CONFUSION_MATRIX
        ])

    def printout_freq_seqs(output_obj, output_file, maxlinelength=5000):
        """Write ``output_obj`` to ``output_file`` rendered by ``prettyjson``."""
        with open(output_file, 'w+') as f:
            f.write(prettyjson(output_obj, maxlinelength=maxlinelength))

    # Write to the configured files when at least one is set; print otherwise.
    if (LIMEFEATS['outputfile'] is not None
            or FREQ_SEQS['outputfile'] is not None):
        print('Start saving results..')
        if (LIMEFEATS['outputfile'] is not None):
            printout_freq_seqs(filtered_limefeats,
                               LIMEFEATS['outputfile'],
                               maxlinelength=5000)
        if (FREQ_SEQS['outputfile'] is not None):
            printout_freq_seqs(filtered_freq_seqs,
                               FREQ_SEQS['outputfile'],
                               maxlinelength=200)
        print('Results saved.')
    else:
        print('FILTERED_LIMEFEATS:\n', filtered_limefeats)
        print('FILTERED_FREQ_SEQS:\n', filtered_freq_seqs)

    print('Computing absence...')
    # All values observed for each full feature name, across every cell.
    attributes = {}
    for KEY in CONFUSION_MATRIX:
        for tid in limefeats[KEY]:
            for event in limefeats[KEY][tid]:
                attribute_name = event[0]
                if attribute_name not in attributes:
                    attributes[attribute_name] = set()
                attributes[attribute_name].add(event[1])

    # Per cell: how often each coarsened value occurs.
    attributes_occurrences = {
        'tp': collections.Counter(),
        'fp': collections.Counter(),
        'tn': collections.Counter(),
        'fn': collections.Counter()
    }
    for KEY in CONFUSION_MATRIX:
        found_stuff = []
        for tid in limefeats[KEY]:
            for event in limefeats[KEY][tid]:
                found_stuff += [tassellate_numbers(event[1])]
        attributes_occurrences[KEY].update(found_stuff)

    # Per cell and feature: a zero-initialised counter for every coarsened
    # value the feature takes anywhere ...
    characterised_attributes_occurrences = {}
    for KEY in CONFUSION_MATRIX:
        characterised_attributes_occurrences[KEY] = {}
        for attribute in attributes:
            if attribute not in characterised_attributes_occurrences[KEY]:
                characterised_attributes_occurrences[KEY][
                    attribute] = dict()
            for attr in attributes[attribute]:
                characterised_attributes_occurrences[KEY][attribute][
                    tassellate_numbers(attr)] = 0
    # ... then filled with the occurrence counts of the cell and completed
    # with a 'Total' entry per feature.
    for KEY in CONFUSION_MATRIX:
        for occ in attributes_occurrences[KEY]:
            for attr in characterised_attributes_occurrences[KEY]:
                if occ in characterised_attributes_occurrences[KEY][attr]:
                    characterised_attributes_occurrences[KEY][attr][
                        occ] = attributes_occurrences[KEY][occ]
        for attr in characterised_attributes_occurrences[KEY]:
            characterised_attributes_occurrences[KEY][attr]['Total'] = sum(
                [
                    characterised_attributes_occurrences[KEY][attr]
                    [element] for element in
                    characterised_attributes_occurrences[KEY][attr]
                ])
    print('Absence computed.')
    print('The absence AFTER filtering is:\n',
          characterised_attributes_occurrences)

    print(
        'RE-computing the sequence pattern result after applying the thresholds...'
    )
    # Feature-name prefixes treated as static attributes; currently none, so
    # every filtered feature ends up in the 'dynamic' list.
    static_attr = [
        # 'Age',
        # 'ClaimValue',
        # 'CType',
        # 'ClType',
        # 'PClaims',
    ]
    limefeats_static_dinamic = {}
    for KEY in CONFUSION_MATRIX:
        limefeats_static_dinamic[KEY] = {}
        for tid in filtered_limefeats[KEY]:
            limefeats_static_dinamic[KEY][tid] = {
                'static': [],
                'dynamic': [
                    att for att in filtered_limefeats[KEY][tid] if not any([
                        att[0].startswith(static_att)
                        for static_att in static_attr
                    ])
                ]
            }
            current_static_attributes = [
                att for att in filtered_limefeats[KEY][tid] if any([
                    att[0].startswith(static_att)
                    for static_att in static_attr
                ])
            ]
            # Keep one feature per static attribute: the one with the highest
            # importance.
            for s_attr in static_attr:
                curr_attributes = [
                    att for att in current_static_attributes
                    if att[0].startswith(s_attr)
                ]
                if len(curr_attributes) > 0:
                    if KEY in ['tp', 'fp']:
                        limefeats_static_dinamic[KEY][tid]['static'] += [
                            max(curr_attributes, key=lambda x: x[2])
                        ]
                    elif KEY in ['tn', 'fn']:
                        # NOTE(review): identical to the 'tp'/'fp' branch
                        # (max); `min` may have been intended -- confirm.
                        limefeats_static_dinamic[KEY][tid]['static'] += [
                            max(curr_attributes, key=lambda x: x[2])
                        ]
                    else:
                        print('Something bad happened')

    # Per cell and trace: '<feature>_<value>' items sorted by feature name.
    dynamic_data = {
        KEY: {
            tid: [
                # (element[0].split('_')[0] + '_' + element[1])
                (element[0] + '_' + element[1]) for element in sorted(
                    [
                        k for k in limefeats_static_dinamic[KEY][tid]
                        ['dynamic']
                    ],
                    # key=lambda x: (x[0].split('_')[1], x[0].split('_')[0])
                    key=lambda x: x[0])
            ]
            for tid in limefeats_static_dinamic[KEY]
            if len(limefeats_static_dinamic[KEY][tid]['dynamic']) > 0
        }
        for KEY in CONFUSION_MATRIX
    }
    # Per cell and trace: '<feature prefix>_<coarsened value>' items for the
    # static attributes.
    static_data = {
        KEY: {
            tid: [
                (element[0].split('_')[0] + '_' +
                 tassellate_numbers(element[1]))
                # (element[0] + '_' + tassellate_numbers(element[1]))
                for element in sorted([
                    k for k in limefeats_static_dinamic[KEY][tid]['static']
                ],
                                      key=lambda x: (x[0].split('_')[1], x[
                                          0].split('_')[0]))
            ]
            for tid in limefeats_static_dinamic[KEY]
            if len(limefeats_static_dinamic[KEY][tid]['static']) > 0
        }
        for KEY in CONFUSION_MATRIX
    }

    # Items per trace: static items first, then dynamic ones; traces with
    # neither are left out.
    data = {}
    for KEY in CONFUSION_MATRIX:
        data[KEY] = {}
        for tid in limefeats[KEY]:
            if tid in static_data[KEY] and tid in dynamic_data[KEY]:
                data[KEY][
                    tid] = static_data[KEY][tid] + dynamic_data[KEY][tid]
            elif tid in static_data[KEY]:
                data[KEY][tid] = static_data[KEY][tid]
            elif tid in dynamic_data[KEY]:
                data[KEY][tid] = dynamic_data[KEY][tid]

    # NOTE(review): `freq_seqs_after_filter` stays unbound if MINING_METHOD is
    # neither 'seq_mining' nor 'item_mining'.
    if (MINING_METHOD == 'seq_mining'):
        freq_seqs_after_filter = {
            'tp':
            sorted(
                seqmining.freq_seq_enum(
                    [data['tp'][tid] for tid in data['tp']], 2)),
            'tn':
            sorted(
                seqmining.freq_seq_enum(
                    [data['tn'][tid] for tid in data['tn']], 2)),
            'fp':
            sorted(
                seqmining.freq_seq_enum(
                    [data['fp'][tid] for tid in data['fp']], 2)),
            'fn':
            sorted(
                seqmining.freq_seq_enum(
                    [data['fn'][tid] for tid in data['fn']], 2)),
        }
    if (MINING_METHOD == 'item_mining'):
        freq_seqs_after_filter = {
            'tp':
            itemmining.relim(itemmining.get_relim_input(
                [data['tp'][tid] for tid in data['tp']]),
                             min_support=2),
            'tn':
            itemmining.relim(itemmining.get_relim_input(
                [data['tn'][tid] for tid in data['tn']]),
                             min_support=2),
            'fp':
            itemmining.relim(itemmining.get_relim_input(
                [data['fp'][tid] for tid in data['fp']]),
                             min_support=2),
            'fn':
            itemmining.relim(itemmining.get_relim_input(
                [data['fn'][tid] for tid in data['fn']]),
                             min_support=2),
        }
        # Turn each mapping into a list of (pattern tuple, support) pairs,
        # the shape `weight_freq_seqs` indexes into.
        freq_seqs_after_filter = {
            KEY: [(tuple(element), freq_seqs_after_filter[KEY][element])
                  for element in freq_seqs_after_filter[KEY]]
            for KEY in CONFUSION_MATRIX
        }

    # Keep patterns whose weight reaches the cell threshold, as
    # [pattern, weight] pairs sorted by descending weight.
    filtered_freq_seqs_after_filter_old = {
        KEY: sorted([[
            element[0],
            weight_freq_seqs(KEY, available_values, element, limefeats)
        ] for element in freq_seqs_after_filter[KEY] if weight_freq_seqs(
            KEY, available_values, element, limefeats) >= FREQ_SEQS[KEY]],
                    key=lambda x: x[1],
                    reverse=True)
        for KEY in CONFUSION_MATRIX
    }

    # todo: filter topK
    filtered_freq_seqs_after_filter = {
        KEY: filtered_freq_seqs_after_filter_old[KEY][0:FREQ_SEQS['top']]
        for KEY in CONFUSION_MATRIX
    }
    print('Sequence pattern recomputed successfully.')

    # NOTE(review): the guard checks 'outputfile' but the file written is
    # 'RECOMPUTEDoutputfile' -- confirm which key should be tested.
    if (FREQ_SEQS['outputfile'] is not None):
        print('Start saving results..')
        printout_freq_seqs(filtered_freq_seqs_after_filter,
                           FREQ_SEQS['RECOMPUTEDoutputfile'],
                           maxlinelength=200)
        print('Results saved.')
    else:
        print('RECOMPUTED_FREQ_SEQS:\n', filtered_freq_seqs_after_filter)

    print('Done, cheers!')
    return confusion_matrix, data, freq_seqs_after_filter, filtered_freq_seqs_after_filter
def calculate_hyperopt(job: Job) -> (dict, dict, dict):
    """Main entry method for hyperopt calculations.

    Runs ``fmin`` over the search space of ``job``, picks the trial with the
    lowest loss, stores its predictive model and the elapsed time on the job.
    When the ``holdout`` flag is truthy, the last 20% of the training log is
    used as the test set during the search and the original test log serves
    as a validation set on which the best model is evaluated afterwards.

    :param job: job configuration
    :return: tuple containing the results, config and predictive_model split
        of the best trial; the results are those on the validation set when
        ``holdout`` is truthy, those of the best trial otherwise
    :raises ValueError: if ``fmin`` raises ``ValueError``, reported as all
        jobs having failed (the original error is chained as the cause)
    """
    # The optimizer settings are stored on the attribute named after the
    # lower-cased optimization method; resolve it once.
    optimizer = job.hyperparameter_optimizer
    optimizer_settings = getattr(optimizer,
                                 optimizer.optimization_method.lower())

    logger.info("Start hyperopt job {} with {}, performance_metric {}".format(
        job.type, get_run(job), optimizer_settings.performance_metric))

    # Published as module-level globals; presumably read by
    # `_calculate_and_evaluate` during the search -- TODO confirm.
    global training_df, test_df, global_job
    global_job = job
    training_df, test_df = get_encoded_logs(job)

    # TODO evaluate on validation set
    if holdout:
        validation_df = test_df
        # test_df = training_df.sample(frac=.2)
        # The tail (last 20% of rows) becomes the test set and is removed
        # from the training set.
        test_df = training_df.tail(int(len(training_df) * 20 / 100))
        training_df = training_df.drop(test_df.index)

    train_start_time = time.time()

    space = _get_space(job)
    max_evaluations = optimizer_settings.max_evaluations
    trials = Trials()
    algorithm = _choose_algorithm(job)

    try:
        fmin(_calculate_and_evaluate,
             space,
             algo=algorithm.suggest,
             max_evals=max_evaluations,
             trials=trials)
    except ValueError as err:
        raise ValueError(
            "All jobs failed, cannot find best configuration") from err

    # Sentinel with loss 100: only trials with a strictly lower loss replace it.
    current_best = {'loss': 100, 'results': {}, 'predictive_model_id': {},
                    'model_split': {}, 'config': {}}
    for trial in trials:
        candidate = trial['result']
        if current_best['loss'] > candidate['loss']:
            current_best = candidate

    job.predictive_model = PredictiveModel.objects.filter(
        pk=current_best['predictive_model_id'])[0]
    job.predictive_model.save()
    job.save()

    # todo find better place for this
    current_best['results']['elapsed_time'] = timedelta(
        seconds=time.time() - train_start_time)
    job.evaluation.elapsed_time = current_best['results']['elapsed_time']
    job.evaluation.save()

    # TODO evaluate on validation set
    if holdout:
        results_df, auc = _test(
            current_best['model_split'],
            # `axis` is passed by keyword: the positional form was removed
            # in pandas 2.0.
            validation_df.drop(['trace_id'], axis=1),
            evaluation=True,
            is_binary_classifier=_check_is_binary_classifier(job.labelling.type)
        )
        results = _prepare_results(results_df, auc)
        results['elapsed_time'] = job.evaluation.elapsed_time
        # The job's evaluation is replaced by one built from the validation
        # results.
        job.evaluation = Evaluation.init(
            job.predictive_model.predictive_model,
            results,
            len(set(test_df['label'])) <= 2
        )
        job.evaluation.save()
        job.save()

        logger.info("End hyperopt job {}, {}. \n\tResults on test {}. \n\tResults on validation {}.".format(job.type, get_run(job), current_best['results'], results))
        return results, current_best['config'], current_best['model_split']

    logger.info("End hyperopt job {}, {}. \n\tResults on test {}.".format(job.type, get_run(job), current_best['results']))
    return current_best['results'], current_best['config'], current_best['model_split']
def handle(self, *args, **kwargs):
    """Retrain a job on logs where selected value patterns are randomised.

    The encoded logs of the job with primary key ``TARGET_JOB`` are copied,
    every occurrence of the patterns listed in ``TARGETS`` is replaced by
    randomly drawn values, and a new prediction job is created on a
    duplicated split, fed with the modified logs and run. The classification
    metrics of the initial and of the retrained job are printed.

    :param args: positional command arguments (unused)
    :param kwargs: keyword command arguments (unused)
    :raises Exception: if an entry of ``TARGETS`` is empty
    """
    TARGET_JOB = 439
    initial_job_obj = Job.objects.filter(pk=TARGET_JOB)[0]

    # todo: return performances
    print('Initial Job:',
          initial_job_obj.evaluation.classificationmetrics)  # TODO future bug

    training_df_old, test_df_old = get_encoded_logs(initial_job_obj)
    training_df = training_df_old.copy()
    test_df = test_df_old.copy()

    # todo: what should I randomise?
    # Each entry is a list of (column, value) pairs that must all match.
    TARGETS = [
        [('prefix_1', 2)],  # <- simple pattern
        [('prefix_2', 3)],  # <- simple pattern
        [
            ('prefix_3', 2),
            ('prefix_4', 3),
        ]  # <- complex pattern
    ]

    for pattern in TARGETS:
        if len(pattern) == 1:
            column_name, trigger = pattern[0]
            for frame in [training_df, test_df]:
                original = frame[column_name]
                del frame[column_name]
                candidates = list(set(original.values))

                def randomise_value(value):
                    # Only the trigger value is replaced by a random draw.
                    if value != trigger:
                        return value
                    return random.choice(candidates)

                frame[column_name] = original.apply(randomise_value)
        elif len(pattern) > 1:
            for frame in [training_df, test_df]:
                pattern_columns = [column for column, _ in pattern]
                original = frame[pattern_columns]
                candidates = {}
                for column, _ in pattern:
                    candidates[column] = list(set(frame[column]))
                    del frame[column]

                def randomise_row(row):
                    # Rows missing at least one pattern value stay untouched.
                    for column, value in pattern:
                        if row[column] != value:
                            return row
                    return Series({
                        column: random.choice(candidates[column])
                        for column, value in pattern
                    })

                frame[pattern_columns] = original.apply(randomise_row, axis=1)
        else:
            raise Exception('target list with unexpected value')

    assert not training_df.equals(training_df_old)
    assert not test_df.equals(test_df_old)

    # todo: save new dataset in memory and create split to use it
    new_split = duplicate_orm_row(initial_job_obj.split)
    train_log = duplicate_orm_row(new_split.train_log)
    test_log = duplicate_orm_row(new_split.test_log)

    # TODO future bug creates shadows
    for log in (train_log, test_log):
        log.name = 'RETRAIN' + log.name
        log.path = 'cache/log_cache/' + log.name
        log.properties = {}

    new_split.train_log = train_log
    new_split.test_log = test_log
    new_split.additional_columns = None
    new_split.save()

    prediction_job = create_prediction_job(
        initial_job_obj, initial_job_obj.encoding.prefix_length)
    prediction_job.split = new_split
    prediction_job.split.save()
    prediction_job.save()

    put_labelled_logs(prediction_job, training_df, test_df)

    # todo: build model
    prediction_task(prediction_job.id, do_publish_result=False)
    prediction_job.refresh_from_db()

    # todo: return performances
    print('Retrain Job:', prediction_job.evaluation.classificationmetrics)
    print('Done, cheers!')