def _add_dict_to_pattern(self, regex, required_pwds, pwds):
    """Inserts the required dictionary lists into the pattern.
    E.g. "(He|She) is from {country}" -> "(He|She) is from Canada|France|..."

    Arguments:
        regex {String} -- Regex
        required_pwds {list} -- List of required pwd dictionary names
        pwds {dict} -- Personalized word dictionary {"dict_name" -> list}

    Returns:
        regex {String} -- Modified regex with dictionaries inserted
    """
    # str.format can't be used here because regexes may contain curly braces
    # of their own (e.g. \d{4}) and str.format would expect a value for every
    # brace pair, so we opt for a simple replace instead.
    try:
        regex_pwds = {key: "|".join(pwds[key]) for key in required_pwds}
        for key in regex_pwds:
            if regex_pwds[key]:
                regex = regex.replace("{{{}}}".format(key), regex_pwds[key])
    except KeyError as e:
        cause = e.args[0]
        raise SpecialException(
            "Could not find dictionary {}. Make sure it is included in "
            "the Required Dictionaries textfield of the Variable Settings tab."
            .format(cause))
    return regex
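# Hedged usage sketch (not from the original module): the enclosing class is
# not shown in this excerpt, so the standalone helper below reproduces only the
# substitution behaviour of _add_dict_to_pattern. The "country" dictionary and
# the sample pattern are illustrative.
def _demo_add_dict_to_pattern():
    pwds = {"country": ["Canada", "France", "Japan"]}
    pattern = r"(He|She) is from {country} in \d{4}"
    # str.format would require a value for the \d{4} brace pair too, which is
    # why the method above uses plain str.replace.
    expanded = pattern.replace("{country}", "|".join(pwds["country"]))
    assert expanded == r"(He|She) is from Canada|France|Japan in \d{4}"
    return expanded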
def get_labeled_data(ids_list,
                     data_list,
                     label_file,
                     l_id_col=1,
                     l_label_col=None,
                     l_first_row=3,
                     label_func=None,
                     encoding=None):
    """Matches in-memory data to the labels found in a separate label file.

    :param ids_list: list of ids, one per entry in data_list
    :param data_list: list of data entries
    :param label_file: path to the CSV or Excel file containing the labels
    :param l_id_col: id column number in the label file
    :param l_label_col: label column number in the label file
    :param l_first_row: first row of the label file to read
    :param label_func: optional function applied to the resulting label list
    :param encoding: encoding of the label file (see data_from_csv)
    :return: new_ids_list, new_data_list, new_labels_list, sorted by id
    """
    try:
        new_data_list = []
        new_labels_list = []
        new_ids_list = []
        local_data_loader = data_from_csv if label_file.endswith(
            '.csv') else data_from_excel
        # TODO: accordingly increment/decrement l_id_col, l_label_col,
        # l_first_row, check_col depending on filetype
        _, temp_labels, temp_ids = local_data_loader([label_file],
                                                     id_cols=l_id_col,
                                                     label_cols=l_label_col,
                                                     repeat_ids=False,
                                                     first_row=l_first_row,
                                                     check_col=1,
                                                     encoding=encoding)
        new_list = []
        for i, data_id in enumerate(ids_list):
            if data_id in temp_ids:  # temp_ids must be unique
                new_list.append([
                    temp_labels[temp_ids.index(data_id)], data_list[i], data_id
                ])
        new_list = sorted(new_list, key=lambda j: j[2])
        for each in new_list:
            new_labels_list.append(each[0])
            new_data_list.append(each[1])
            new_ids_list.append(each[2])
        if label_func:
            label_func(new_labels_list)
    except IndexError:
        raise SpecialException(
            "Specified label column does not appear in the file.")
    return new_ids_list, new_data_list, new_labels_list
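# Hedged usage sketch (assumed file name "labels.csv" and toy ids/data, not
# from the original repo): aligns in-memory notes with labels stored in a
# separate label file via get_labeled_data above.
def _demo_get_labeled_data():
    ids = ["p1", "p2", "p3"]
    notes = ["note one", "note two", "note three"]
    new_ids, new_data, new_labels = get_labeled_data(
        ids, notes, "labels.csv", l_id_col=1, l_label_col=2, l_first_row=3)
    # Only ids present in both the data and the label file survive, sorted by id.
    return new_ids, new_data, new_labels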
def __init__(self, pwds, categories_of_interest, to_lower=False):
    """Keeps only the personalized word dictionaries (pwds) for the given
    categories of interest, optionally lower-casing every term."""
    try:
        self.pwds = pwds
        self.categories = categories_of_interest
        self.to_lower = to_lower
        self._max_len = 4
        if self.to_lower:
            self.pwds = {
                category: {term.lower() for term in self.pwds[category]}
                for category in categories_of_interest
            }
        else:
            self.pwds = {
                category: {term for term in self.pwds[category]}
                for category in categories_of_interest
            }
    except KeyError as e:
        cause = e.args[0]
        raise SpecialException("Could not find dictionary {}".format(cause))
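# Hedged construction sketch: the class name is not visible in this excerpt, so
# "PwdsMatcher" below is a hypothetical stand-in. It illustrates the expected
# shape of pwds: a dict mapping dictionary names to collections of terms.
#
#   pwds = {"country": {"Canada", "France"}, "city": {"Paris", "Toronto"}}
#   matcher = PwdsMatcher(pwds, categories_of_interest=["country"],
#                         to_lower=True)
#   # matcher.pwds == {"country": {"canada", "france"}}; the "city" dictionary
#   # is dropped because it is not a category of interest.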
def get_data(data_col, label_col, id_col, data, labels, ids, repeat_ids,
             row_process_func):
    """Gets data, label and id from a row

    Arguments:
        data_col {list of int} -- data column numbers
        label_col {list of int} -- label column numbers
        id_col {int} -- id column number
        data {list} -- list of data so far
        labels {list} -- list of labels so far
        ids {list} -- list of ids so far
        repeat_ids {boolean} -- if False, data for an already-seen id is
            concatenated onto the existing entry
        row_process_func {function} -- function that extracts a value from a
            row given a column number

    Returns:
        data {list} -- list of data
        labels {list} -- list of labels
        ids {list} -- list of ids
    """
    try:
        concat_index = None
        if id_col is not None:
            cur_id = row_process_func(id_col)
            # If repeat_ids is False and the id has been seen before, remember
            # its index so this row is concatenated onto the existing entry
            if not repeat_ids and cur_id in ids:
                concat_index = ids.index(cur_id)
            else:
                ids.append(cur_id)
        if label_col is not None:
            # If we are concatenating data (i.e. repeat_ids=False), keep all
            # the diagnoses
            cur_label = []
            for actual_label_col in label_col:
                val = row_process_func(actual_label_col)
                val = "None" if not val else val
                cur_label.append(val)
            if concat_index is not None:
                if type(labels[concat_index]) == list:
                    labels[concat_index].extend(cur_label)
                else:
                    # list.extend returns None, so build the new list rather
                    # than assigning the result of extend
                    labels[concat_index] = [labels[concat_index]] + cur_label
            else:
                if len(cur_label) == 1:
                    labels.append(cur_label[0])
                else:
                    labels.append(cur_label)
        if data_col is not None:
            data_string = row_process_func(data_col[0])
            for i in range(1, len(data_col)):
                data_string += "\n{}".format(row_process_func(data_col[i]))
            if concat_index is not None:
                data[concat_index] += "{}\n".format(
                    preprocess_data(data_string))
            else:
                data.append(preprocess_data(data_string))
    except IndexError:
        raise SpecialException("Specified column does not appear in the file.")
    return data, labels, ids
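# Hedged usage sketch: get_data is normally driven row-by-row by the CSV/Excel
# readers below; this standalone call simulates a single row with the id in
# column 0, the label in column 1 and the note text in column 2.
def _demo_get_data():
    row = ["p1", "positive", "clinical note text"]
    data, labels, ids = get_data(data_col=[2],
                                 label_col=[1],
                                 id_col=0,
                                 data=[],
                                 labels=[],
                                 ids=[],
                                 repeat_ids=True,
                                 row_process_func=lambda col: str(row[col]))
    # ids == ["p1"], labels == ["positive"], and data holds the note text
    # after preprocess_data has been applied to it.
    return data, labels, ids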
def regexes_from_json(filename,
                      use_custom_score=False,
                      all_matches=False,
                      flags=[re.IGNORECASE]):
    """Builds the Regex objects described by a JSON rule file.

    Returns the label name declared in the file and the list of compiled
    primary regexes, each carrying its secondary (Replace/Ignore/Add) regexes.
    """
    regexes = []
    class_name = None
    with open(filename, 'r') as f:
        data = json.load(f)
        all_matches = data[
            "All Matches"] if "All Matches" in data else all_matches
        if "Case Sensitive" in data:
            flags = [re.IGNORECASE] if not data["Case Sensitive"] else None
        if "Name" not in data:
            raise SpecialException("Rule file requires a label name.")
        class_name = data["Name"]
        rule_types = {"Replace": "r", "Ignore": "i", "Add": "a"}
        if "Rules" in data:
            for rule in data["Rules"]:
                score = None if not use_custom_score else rule["Primary"][
                    "Score"]
                primary_pattern = _compile_tags_to_regex(
                    rule["Primary"]["Rule"])
                secondary_regexes = []
                for rule_type in rule_types:
                    for secondary_rule in rule["Secondary"][rule_type]:
                        secondary_score = None if not use_custom_score \
                            else secondary_rule["Score"]
                        secondary_pattern = _compile_tags_to_regex(
                            secondary_rule["Rule"])
                        effect = rule_types[rule_type]
                        if "Modifier" in secondary_rule and secondary_rule[
                                "Modifier"] != "None":
                            effect += secondary_rule["Modifier"]
                        secondary_regex = Regex(
                            name="sec_reg{}-{}-{}".format(
                                len(regexes) + 1,
                                len(secondary_regexes) + 1, class_name),
                            regex=secondary_pattern,
                            effect=effect,
                            score=secondary_score,
                            all_matches=all_matches,
                            flags=flags,
                            secondary_regexes=[])
                        secondary_regexes.append(secondary_regex)
                primary_regex = Regex(
                    name="reg{}-{}".format(len(regexes) + 1, class_name),
                    regex=primary_pattern,
                    score=score,
                    effect='p',
                    secondary_regexes=secondary_regexes,
                    all_matches=all_matches,
                    flags=flags)
                regexes.append(primary_regex)
    return class_name, regexes
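# Hedged usage sketch (assumed file "rules.json"; the schema below is inferred
# from the keys the parser reads above and is not a published spec):
#
#   {
#     "Name": "MyLabel",
#     "Case Sensitive": false,
#     "Rules": [{
#       "Primary": {"Rule": "<tagged rule>", "Score": 1},
#       "Secondary": {"Replace": [], "Ignore": [], "Add": []}
#     }]
#   }
#
#   class_name, regexes = regexes_from_json("rules.json", use_custom_score=True)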
def data_from_csv(filenames,
                  data_cols=None,
                  label_cols=None,
                  id_cols=None,
                  repeat_ids=True,
                  first_row=1,
                  limit=None,
                  preprocess_func=None,
                  check_col=0,
                  encoding=None):
    """Reads data from CSV files

    Arguments:
        filenames {list of string} -- List of CSV filenames

    Keyword Arguments:
        data_cols {list of int or int} -- List of locations of data columns
            in each file (default: {None})
        label_cols {list of int or int} -- List of locations of label columns
            in each file (default: {None})
        id_cols {list of int or int} -- List of locations of id columns in
            each file (default: {None})
        repeat_ids {bool} -- If False, data corresponding to already existing
            ids are concatenated (default: {True})
        first_row {int} -- Starts reading from specified row number
            (default: {1})
        limit {int} -- Stops reading after specified number of rows have been
            read (default: {None})
        preprocess_func {function} -- Applies preprocess function to each row
            in a file (default: {None})
        check_col {int} -- Data column to check whether to continue evaluation
            (default: {0})
        encoding {str} -- Encoding of file - if not specified, try "utf8" and
            then "latin-1"

    Returns:
        data {list} -- list of data
        labels {list} -- list of labels
        ids {list} -- list of ids
    """
    # If encoding is not specified, try reading the file with UTF-8 encoding.
    # If that fails, fall back to latin-1.
    if not encoding:
        try:
            encoding = "utf8"
            pd.read_csv(filenames[0], encoding=encoding)
        except UnicodeDecodeError:
            encoding = "latin-1"
    try:
        data, labels, ids, data_cols, label_cols, id_cols = _data_helper(
            len(filenames), data_cols, label_cols, id_cols)
        print("Reading data from csv file...")
        count = 0
        for file_num, filename in enumerate(filenames):
            if limit is not None and count == limit:
                break
            with open(filename, "r", encoding=encoding) as csv_file:
                rows = csv.reader(csv_file, delimiter=",", quotechar='"')
                for i, row in enumerate(rows):
                    if i >= first_row:
                        # If the check column is empty, skip the row
                        if row[check_col] == '':
                            continue
                        count += 1
                        # getting data, label and ids from each row and
                        # concatenating it
                        data, labels, ids = get_data(
                            data_cols[file_num], label_cols[file_num],
                            id_cols[file_num], data, labels, ids, repeat_ids,
                            lambda col: str(row[col]))
        if preprocess_func is not None:
            for i in range(len(data)):
                data[i] = preprocess_func(data[i])
    except SpecialException as e:
        raise SpecialException(str(e))
    return data, labels, ids
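# Hedged usage sketch (assumed file "reports.csv" with the id in column 0, the
# label in column 1 and the free text in column 2; header on row 0):
def _demo_data_from_csv():
    data, labels, ids = data_from_csv(["reports.csv"],
                                      data_cols=2,
                                      label_cols=1,
                                      id_cols=0,
                                      repeat_ids=False,
                                      first_row=1)
    # With repeat_ids=False, rows sharing an id are concatenated into a single
    # data entry by the concat logic in get_data.
    return data, labels, ids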
def data_from_excel(filenames,
                    data_cols=None,
                    label_cols=None,
                    id_cols=None,
                    repeat_ids=True,
                    first_row=1,
                    limit=None,
                    preprocess_func=None,
                    check_col=0,
                    encoding=None):
    """Reads data from Excel files

    Arguments:
        filenames {list of string} -- List of Excel filenames

    Keyword Arguments:
        data_cols {list of int or int} -- List of locations of data columns
            in each file (default: {None})
        label_cols {list of int or int} -- List of locations of label columns
            in each file (default: {None})
        id_cols {list of int or int} -- List of locations of id columns in
            each file (default: {None})
        repeat_ids {bool} -- If False, data corresponding to already existing
            ids are concatenated (default: {True})
        first_row {int} -- Starts reading from specified row number
            (default: {1})
        limit {int} -- Stops reading after specified number of rows have been
            read (default: {None})
        preprocess_func {function} -- Applies preprocess function to each row
            in a file (default: {None})
        check_col {int} -- Data column to check whether to continue evaluation
            (default: {0})
        encoding {str} -- Encoding of file - if not specified, try "utf8" and
            then "latin-1"

    Returns:
        data {list} -- list of data
        labels {list} -- list of labels
        ids {list} -- list of ids
    """
    # If encoding is not specified, try reading the file with UTF-8 encoding.
    # If that fails, fall back to latin-1.
    if not encoding:
        try:
            encoding = "utf8"
            pd.read_excel(filenames[0])
        except UnicodeDecodeError:
            encoding = "latin-1"
    try:
        data, labels, ids, data_cols, label_cols, id_cols = _data_helper(
            len(filenames), data_cols, label_cols, id_cols)
        print("Reading data from excel file...")
        count = 0
        for file_num, filename in enumerate(filenames):
            if limit is not None and count == limit:
                break
            workbook = openpyxl.load_workbook(filename,
                                              data_only=True,
                                              read_only=True)
            workbook.encoding = encoding
            # workbook.get_sheet_names() is deprecated in openpyxl; use the
            # sheetnames property instead
            for sheet_name in workbook.sheetnames:
                # getting rows in worksheet
                cur_ws = workbook[sheet_name].rows
                for i, row in enumerate(cur_ws):
                    if i >= first_row:
                        # If the check column is empty, skip the row
                        if row[check_col].value is None:
                            continue
                        count += 1
                        # getting data, label and ids from each row and
                        # concatenating it
                        data, labels, ids = get_data(
                            data_cols[file_num], label_cols[file_num],
                            id_cols[file_num], data, labels, ids, repeat_ids,
                            lambda col: str(row[col].value))
        if preprocess_func is not None:
            for i in range(len(data)):
                data[i] = preprocess_func(data[i])
    except SpecialException as e:
        raise SpecialException(str(e))
    return data, labels, ids
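# Hedged usage sketch (assumed workbook "reports.xlsx" with the same column
# layout as the CSV example above). Note that every sheet in the workbook is
# scanned, so ids should be unique across sheets when repeat_ids=False.
def _demo_data_from_excel():
    data, labels, ids = data_from_excel(["reports.xlsx"],
                                        data_cols=2,
                                        label_cols=1,
                                        id_cols=0,
                                        first_row=1)
    return data, labels, ids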
def get_failures(classifier_runner, cur_dataset, conf_path, print_output=True):
    """Collects misclassified cases and summary statistics for a dataset,
    returning (failures_dict, error_data) and optionally plotting the
    confusion matrix to conf_path."""
    try:
        failures_dict = {}
        try:
            cur_labels_list = sorted(
                list(
                    set(classifier_runner.classifier.dataset[cur_dataset]
                        ["preds"].tolist())
                    | set(classifier_runner.classifier.dataset[cur_dataset]
                          ["labels"].tolist())))
        except TypeError:
            raise SpecialException(
                "Error occurred when generating stats. Check label file to "
                "make sure all ids are unique.")

        accuracy, incorrect_indices, num_correct = calculate_accuracy(
            classifier_runner.classifier.dataset[cur_dataset]["preds"],
            classifier_runner.classifier.dataset[cur_dataset]["labels"])

        if print_output:
            print("\nAccuracy: ", accuracy)
            print("\nIds: ",
                  classifier_runner.classifier.dataset[cur_dataset]["ids"])
            print("Predictions: ",
                  classifier_runner.classifier.dataset[cur_dataset]["preds"])
            print("Labels: ",
                  classifier_runner.classifier.dataset[cur_dataset]["labels"])
            print("\nIncorrect Ids: ",
                  classifier_runner.classifier.dataset[cur_dataset]["ids"]
                  [incorrect_indices])
            print("Incorrect Predictions: ",
                  classifier_runner.classifier.dataset[cur_dataset]["preds"]
                  [incorrect_indices])
            print("Incorrect Labels: ",
                  classifier_runner.classifier.dataset[cur_dataset]["labels"]
                  [incorrect_indices])

        classifier_type = classifier_runner.classifier_type.__name__
        classifier_classes = sorted(
            list(classifier_runner.classifier_parameters["regexes"]))

        if classifier_type == "CaptureClassifier":
            cnf_matrix = None
            ppv_and_accuracy = compute_ppv_accuracy_capture(
                classifier_runner.classifier.dataset[cur_dataset]["labels"],
                classifier_runner.classifier.dataset[cur_dataset]["preds"],
                classifier_classes,
                classifier_runner.classifier.negative_label)
            predicted_positive, positive_cases, predicted_negative_cases, \
                negative_cases, false_positives, false_negatives = \
                get_classification_stats_capture(
                    classifier_runner.classifier.dataset[cur_dataset]["labels"],
                    classifier_runner.classifier.dataset[cur_dataset]["preds"],
                    classifier_classes,
                    classifier_runner.classifier.negative_label)
            cur_labels_list = classifier_classes
        else:
            cnf_matrix = confusion_matrix(
                classifier_runner.classifier.dataset[cur_dataset]["labels"],
                classifier_runner.classifier.dataset[cur_dataset]["preds"])
            ppv_and_accuracy = compute_ppv_accuracy_ova(
                cnf_matrix, cur_labels_list)
            predicted_positive, positive_cases, predicted_negative_cases, \
                negative_cases, false_positives, false_negatives = \
                get_classification_stats(cnf_matrix, cur_labels_list)

        if print_output:
            print("Confusion Matrix: ")
            print(cnf_matrix)
            print("OVA PPV and Accuracy: ", ppv_and_accuracy)
            print("Number of Positive Predictions: ", predicted_positive)
            print("Actual Number of Positive Cases: ", positive_cases)
            print("Number of Predicted Negative Cases: ",
                  predicted_negative_cases)
            print("Actual Number of Negative Cases: ", negative_cases)

        for index in incorrect_indices:
            cur_patient_id = classifier_runner.classifier.dataset[
                cur_dataset]["ids"][index]
            cur_pred = classifier_runner.classifier.dataset[
                cur_dataset]["preds"][index]
            cur_label = classifier_runner.classifier.dataset[
                cur_dataset]["labels"][index]
            cur_match_obj = classifier_runner.classifier.dataset[
                cur_dataset]["matches"][index]
            cur_score = classifier_runner.classifier.dataset[
                cur_dataset]["scores"][index]
            cur_text = classifier_runner.classifier.dataset[
                cur_dataset]["data"][index]
            failures_dict[cur_patient_id] = {
                "label": cur_label,
                "data": cur_text,
                "pred": cur_pred,
                "matches": cur_match_obj,
                "score": cur_score
            }

        error_data = {
            "Predicted Positive": predicted_positive,
            "Positive Cases": positive_cases,
            "Predicted Negative": predicted_negative_cases,
            "Negative Cases": negative_cases,
            "False Positives": false_positives,
            "False Negatives": false_negatives,
            "Confusion Matrix":
                cnf_matrix.tolist() if cnf_matrix is not None else [],
            "OVA PPV and Accuracy": ppv_and_accuracy,
            "Ordered Labels": cur_labels_list,
            "Negative Label": classifier_runner.classifier.negative_label,
            "Classifier Type": classifier_type,
            "Accuracy": accuracy,
            "Num Correct": num_correct,
            "Total Cases": num_correct + len(incorrect_indices)
        }
        if cnf_matrix is not None:
            plot_confusion_matrix(cnf_matrix, cur_labels_list, conf_path)
    except SpecialException as e:
        raise SpecialException(str(e))
    return failures_dict, error_data