def get_dict(features, labels, percentage_words=0.1, iterations=50):
    """Return the best-scoring randomly drawn word dictionary.

    The words of ``features`` are scored per label with
    ``USES_MULTI.feature_score``. For every label, the first and the last
    ``number_selected_words_label`` entries of its ranking are pooled as
    candidates. Each round then draws ``number_selected_words`` candidates
    with ``USES_MULTI.random_items``, filters ``features`` down to them with
    ``NLP.filter_features`` and scores the result with
    ``USES_MULTI.classifier``.

    Args:
        features: Passed unchanged to ``USES_MULTI.words_label`` and
            ``NLP.filter_features``.
        labels: Iterable of hashable labels, one per sample (assumed to be
            aligned with ``features`` -- confirm against the helpers).
        percentage_words: Fraction of the distinct scored words that each
            drawn dictionary should contain.
        iterations: Number of random draws to evaluate; must be positive.

    Returns:
        The object returned by ``USES_MULTI.random_items`` in the round with
        the highest classifier score. When several rounds share the highest
        score, the latest of them wins.

    Raises:
        ValueError: If ``iterations`` is not positive, so no draw could be
            evaluated.
        ZeroDivisionError: If ``labels`` is empty.
    """
    # Function-scope import: the module's import block is not visible here.
    import logging

    if iterations <= 0:
        raise ValueError(
            "iterations must be positive, got %r" % (iterations,))

    logger = logging.getLogger(__name__)

    words_label = USES_MULTI.words_label(features, labels)
    number_selected_words = int(round(len(words_label) * percentage_words))
    number_selected_words_label = int(number_selected_words / len(set(labels)))
    features_score = USES_MULTI.feature_score(words_label, labels)

    candidates = {}
    for label in features_score:
        ranking = features_score[label]
        # Head of the ranking for this label.
        candidates.update(ranking[:number_selected_words_label])
        # Tail of the ranking for this label.
        # NOTE(review): when the per-label quota is 0, ``[-0:]`` is ``[0:]``,
        # so for list-like rankings the *whole* ranking becomes candidates.
        # Kept as-is because it may be acting as a fallback -- confirm.
        candidates.update(ranking[-number_selected_words_label:])

    best_score = None
    best_candidates = None
    # A counter-driven ``while`` (rather than ``range``) keeps non-integer
    # ``iterations`` values working.
    actual_iteration = 0
    while actual_iteration < iterations:
        actual_candidates = USES_MULTI.random_items(
            candidates, number_selected_words)
        filtered_features = NLP.filter_features(features, actual_candidates)
        try:
            score = USES_MULTI.classifier(filtered_features, labels)
        except Exception:
            # Best effort: a draw the classifier cannot handle counts as the
            # worst possible score instead of aborting the whole search.
            logger.debug(
                "classifier failed on iteration %s; scoring it as 0",
                actual_iteration, exc_info=True)
            score = 0
        # ``>=`` lets a later round replace an earlier one with an equal score.
        if best_score is None or score >= best_score:
            best_score = score
            best_candidates = actual_candidates
        actual_iteration += 1

    return best_candidates
def get_dict(features, labels, max_words=5):
    """Return the best-scoring dictionary among sizes ``1..max_words``.

    For every size from 1 up to ``max_words`` (capped at the number of
    entries returned by ``USES_MULTI.words_label``), a dictionary is built
    with ``ALTER_USES.build_dict``. ``features`` is filtered down to it with
    ``NLP.filter_features`` and the result is scored with
    ``USES_MULTI.classifier``.

    Args:
        features: Passed unchanged to ``USES_MULTI.words_label``,
            ``ALTER_USES.build_dict`` and ``NLP.filter_features``.
        labels: Labels passed alongside ``features`` to the helpers (assumed
            to be aligned with ``features`` -- confirm against the helpers).
        max_words: Largest dictionary size to try.

    Returns:
        The object returned by ``ALTER_USES.build_dict`` for the size with
        the highest classifier score. When several sizes share the highest
        score, the largest of them wins.

    Raises:
        ValueError: If no size can be tried, i.e. ``max_words`` is below 1
            or ``USES_MULTI.words_label`` returned nothing.
    """
    # Function-scope import: the module's import block is not visible here.
    import logging

    logger = logging.getLogger(__name__)

    words_label = USES_MULTI.words_label(features, labels)
    # Never ask for more words than were found.
    max_words = min(max_words, len(words_label))
    if max_words < 1:
        raise ValueError(
            "no dictionary size to evaluate (effective max_words=%r)"
            % (max_words,))

    features_score = ALTER_USES.feature_score(words_label, labels)

    best_score = None
    best_dict_words = None
    # A counter-driven ``while`` (rather than ``range``) keeps non-integer
    # ``max_words`` values working.
    number_words = 1
    while number_words <= max_words:
        dict_words = ALTER_USES.build_dict(
            features, labels, features_score, number_words)
        filtered_features = NLP.filter_features(features, dict_words)
        try:
            score = USES_MULTI.classifier(filtered_features, labels)
        except Exception:
            # Best effort: a size the classifier cannot handle counts as the
            # worst possible score instead of aborting the whole search.
            logger.debug(
                "classifier failed for %s word(s); scoring it as 0",
                number_words, exc_info=True)
            score = 0
        # ``>=`` lets a larger dictionary replace a smaller one with an equal
        # score.
        if best_score is None or score >= best_score:
            best_score = score
            best_dict_words = dict_words
        number_words += 1

    return best_dict_words
# Collapse the input lines into one lower-case, trimmed string with no newline
# characters.
defect = ' '.join(lines).lower().strip().replace("\n", "")

# Text pre-processing, applied in this order: tokenize, drop numbers, drop
# small words, drop Portuguese stop words, lemmatize, drop punctuation.
features = NLP.remove_numbers(NLP.tokenizer([defect]))
features = NLP.remove_stop_words(
    NLP.remove_small_words(features), 'portuguese')
features = NLP.remove_punctuation(NLP.lemmatizer(features, 'portuguese'))

# Only the first entry of the stored dictionary file is used. It restricts the
# features and is also handed to the numeric conversion.
dict_words = load('files/dict.joblib')[0]
features = NLP.text_to_numeric(
    NLP.filter_features(features, dict_words), [dict_words])

# Predict with the stored model; the label '1' is reported as high severity.
model = load('files/model.joblib')
labels = model.predict(features)
print('High Severity' if labels[0] == '1' else 'Low Severity')

# End timestamp. Presumably paired with a start timestamp taken before this
# excerpt to report the elapsed time -- confirm against the code above.
fim = time.time()