Example #1
    def _add_dict_to_pattern(self, regex, required_pwds, pwds):
        """Inserts the required dictionary lists into the pattern. E.g "(He|She) is from {country}" -> country_list
        
        Arguments:
            regex {String} -- Regex
            required_pwds {list} -- List of required pwds
            pwds {dict} -- Personalized word dictionary {"dict_name" -> list}
        
        Returns:
            regex {String} -- Modified regex with dictionary inserted
        """

        # str.format can't be used here because regexes may themselves contain
        # curly braces (e.g. \d{4}) that str.format would treat as fields,
        # so we opt for a simple replace instead.

        try:
            regex_pwds = {key: "|".join(pwds[key]) for key in required_pwds}

            for key in regex_pwds:
                if regex_pwds[key]:
                    regex = regex.replace("{{{}}}".format(key),
                                          regex_pwds[key])
        except KeyError as e:
            cause = e.args[0]
            raise SpecialException(
                "Could not find dictionary {}. Make sure it is included in "
                "the Required Dictionaries textfield of the Variable Settings tab."
                .format(cause))
        return regex
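
# A minimal, self-contained sketch of the substitution above. The sample
# dictionary and pattern are illustrative assumptions, not from the source;
# the real method also raises SpecialException for missing dictionaries.
pwds = {"country": ["Canada", "France", "Japan"]}
regex = r"(He|She) is from ({country})"
for key in ["country"]:
    regex = regex.replace("{{{}}}".format(key), "|".join(pwds[key]))
print(regex)  # (He|She) is from (Canada|France|Japan)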
Example #2
def get_labeled_data(ids_list,
                     data_list,
                     label_file,
                     l_id_col=1,
                     l_label_col=None,
                     l_first_row=3,
                     label_func=None,
                     encoding=None):
    """
    :param ids_list:
    :param data_list:
    :param label_file:
    :param l_id_col:
    :param l_label_col:
    :param l_first_row:
    :param label_func:
    :return:
    """

    try:
        new_data_list = []
        new_labels_list = []
        new_ids_list = []

        local_data_loader = (data_from_csv if label_file.endswith('.csv')
                             else data_from_excel)
        # TODO: accordingly increment/decrement l_id_col, l_label_col, l_first_row, check_col depending on filetype
        _, temp_labels, temp_ids = local_data_loader([label_file],
                                                     id_cols=l_id_col,
                                                     label_cols=l_label_col,
                                                     repeat_ids=False,
                                                     first_row=l_first_row,
                                                     check_col=1,
                                                     encoding=encoding)

        new_list = []
        for i, data_id in enumerate(ids_list):
            if data_id in temp_ids:
                # temp_ids are unique (repeat_ids=False above), so index()
                # finds the single matching label
                new_list.append([
                    temp_labels[temp_ids.index(data_id)], data_list[i], data_id
                ])

        new_list = sorted(new_list, key=lambda j: j[2])

        for each in new_list:
            new_labels_list.append(each[0])
            new_data_list.append(each[1])
            new_ids_list.append(each[2])

        if label_func:
            label_func(new_labels_list)

    except IndexError:
        raise SpecialException(
            "Specified label column does not appear in the file.")

    return new_ids_list, new_data_list, new_labels_list
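
# Hedged usage sketch: attach labels from a spreadsheet to already-loaded
# notes. The file name and column numbers are illustrative assumptions.
# ids, data, labels = get_labeled_data(ids_list=[101, 102],
#                                      data_list=["note a", "note b"],
#                                      label_file="labels.csv",
#                                      l_id_col=1, l_label_col=2,
#                                      l_first_row=3)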
Example #3
    def __init__(self, pwds, categories_of_interest, to_lower=False):
        try:
            self.pwds = pwds
            self.categories = categories_of_interest
            self.to_lower = to_lower
            self._max_len = 4

            if self.to_lower:
                self.pwds = {category: {term.lower() for term in self.pwds[category]}
                             for category in categories_of_interest}
            else:
                self.pwds = {category: {term for term in self.pwds[category]}
                             for category in categories_of_interest}
        except KeyError as e:
            cause = e.args[0]
            raise SpecialException("Could not find dictionary {}".format(cause))
Example #4
def get_data(data_col, label_col, id_col, data, labels, ids, repeat_ids,
             row_process_func):
    """Gets data, label and id from a row

    Arguments:
        data_col {int} -- data column number
        label_col {int} -- label column number
        id_col {int} -- id column number
        data {list} -- list of data so far
        labels {list} -- list of labels so far
        ids {list} -- list of ids so far
        repeat_ids {boolean} -- if False, data is concatenated
        row_process_func {function} -- function that extracts a value from a row

    Returns:
        data {list} -- list of data
        labels {list} -- list of labels
        ids {list} -- list of ids
    """
    try:
        concat_index = None
        # id_col is a single column index here
        if id_col is not None:
            cur_id = row_process_func(id_col)

            # if repeat_ids is False and the id was already seen, concatenate
            # onto the existing entry instead of appending a new one
            if not repeat_ids and cur_id in ids:
                concat_index = ids.index(cur_id)
            else:
                ids.append(cur_id)

        if label_col is not None:
            # If we are concatenating data (i.e. repeat_ids=False), use all
            # the diagnoses
            cur_label = []
            for actual_label_col in label_col:
                val = row_process_func(actual_label_col)
                val = "None" if not val else val
                cur_label.append(val)
            if concat_index is not None:
                if isinstance(labels[concat_index], list):
                    labels[concat_index].extend(cur_label)
                else:
                    # list.extend returns None, so build the new list instead
                    # of assigning the result of extend
                    labels[concat_index] = [labels[concat_index]] + cur_label
            else:
                if len(cur_label) == 1:
                    labels.append(cur_label[0])
                else:
                    labels.append(cur_label)

        if data_col is not None:
            data_string = row_process_func(data_col[0])
            for i in range(1, len(data_col)):
                data_string += "\n{}".format(row_process_func(data_col[i]))
            # print("-"*100)
            # print('NoneType has been found' if datum is None else datum)

            if concat_index is not None:
                data[concat_index] += "{}\n".format(
                    preprocess_data(data_string))
            else:
                data.append(preprocess_data(data_string))

    except IndexError:
        raise SpecialException("Specified column does not appear in the file.")

    return data, labels, ids
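
# Hedged usage sketch: rows are fed one at a time; with repeat_ids=False a
# second row sharing an id concatenates onto the existing entry. The row
# below is an illustrative assumption, and the function relies on the
# module's preprocess_data and SpecialException.
# row = ["PT-1", "positive", "free text note"]
# data, labels, ids = get_data(data_col=[2], label_col=[1], id_col=0,
#                              data=[], labels=[], ids=[], repeat_ids=False,
#                              row_process_func=lambda col: str(row[col]))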
Example #5
def regexes_from_json(filename,
                      use_custom_score=False,
                      all_matches=False,
                      flags=[re.IGNORECASE]):
    regexes = []
    class_name = None

    with open(filename, 'r') as f:
        data = json.load(f)
        all_matches = data.get("All Matches", all_matches)

        if "Case Sensitive" in data:
            flags = None if data["Case Sensitive"] else [re.IGNORECASE]

        if "Name" not in data:
            raise SpecialException("Rule file requires a label name.")
        else:
            class_name = data["Name"]

        rule_types = {"Replace": "r", "Ignore": "i", "Add": "a"}

        if "Rules" in data:
            for rule in data["Rules"]:
                score = rule["Primary"]["Score"] if use_custom_score else None
                primary_pattern = _compile_tags_to_regex(
                    rule["Primary"]["Rule"])

                secondary_regexes = []

                for rule_type in rule_types:
                    for secondary_rule in rule["Secondary"][rule_type]:
                        secondary_score = (secondary_rule["Score"]
                                           if use_custom_score else None)
                        secondary_pattern = _compile_tags_to_regex(
                            secondary_rule["Rule"])

                        effect = rule_types[rule_type]

                        if "Modifier" in secondary_rule and secondary_rule[
                                "Modifier"] != "None":
                            effect += secondary_rule["Modifier"]


                        secondary_regex = Regex(
                            name="sec_reg{}-{}-{}".format(
                                len(regexes) + 1,
                                len(secondary_regexes) + 1, class_name),
                            regex=secondary_pattern,
                            effect=effect,
                            score=secondary_score,
                            all_matches=all_matches,
                            flags=flags,
                            secondary_regexes=[])

                        secondary_regexes.append(secondary_regex)

                primary_regex = Regex(
                    name="reg{}-{}".format(len(regexes) + 1, class_name),
                    regex=primary_pattern,
                    score=score,
                    effect='p',
                    secondary_regexes=secondary_regexes,
                    all_matches=all_matches,
                    flags=flags)

                regexes.append(primary_regex)

    return class_name, regexes
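
# Shape of a rule file as consumed above; the keys are taken from this
# function, the values are illustrative assumptions:
# {
#     "Name": "label_name",
#     "All Matches": false,
#     "Case Sensitive": false,
#     "Rules": [{
#         "Primary": {"Rule": "...", "Score": 1},
#         "Secondary": {
#             "Replace": [],
#             "Ignore": [],
#             "Add": [{"Rule": "...", "Score": 1, "Modifier": "None"}]
#         }
#     }]
# }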
Example #6
def data_from_csv(filenames,
                  data_cols=None,
                  label_cols=None,
                  id_cols=None,
                  repeat_ids=True,
                  first_row=1,
                  limit=None,
                  preprocess_func=None,
                  check_col=0,
                  encoding=None):
    """Reads data from CSV files

    Arguments:
        filenames {list of string} -- List of CSV filenames

    Keyword Arguments:
        data_cols {list of int or int} -- List of location of data
            columns in each file (default: {None})
        label_cols {list of int or int} -- List of location of label
            columns in each file (default: {None})
        id_cols {list of int or int} -- List of location of id columns
            in each file (default: {None})
        repeat_ids {bool} -- If False, data corresponding to already
            existing ids are concatenated (default: {True})
        first_row {int} -- Starts reading from specified row number
            (default: {1})
        limit {int} -- Stops reading after specified number of lines
            have been read (default: {None})
        preprocess_func {function} -- Applies preprocess function to
            each row in a file (default: {None})
        check_col {int} -- Data column to check whether to continue
            evaluation (default: {0})
        encoding {str} -- Encoding of file - if not specified,
            try "utf8" and then "latin-1"

    Returns:
        data {list} -- list of data
        labels {list} -- list of labels
        ids {list} -- list of ids
    """

    # If encoding is not specified, probe the first file with UTF-8 (the read
    # result is discarded). If decoding fails, fall back to latin-1.
    if not encoding:
        try:
            encoding = "utf8"
            pd.read_csv(filenames[0], encoding="utf8")
        except UnicodeDecodeError:
            encoding = "latin-1"

    try:
        data, labels, ids, data_cols, \
            label_cols, id_cols = _data_helper(len(filenames),
                                               data_cols,
                                               label_cols,
                                               id_cols)

        print("Reading data from csv file...")

        count = 0
        for file_num, filename in enumerate(filenames):
            if limit is not None and count == limit:
                break
            with open(filename, "r", encoding=encoding) as csv_file:
                rows = csv.reader(csv_file, delimiter=",", quotechar='"')
                for i, row in enumerate(rows):
                    if i >= first_row:
                        # If the check column is empty, skip the row
                        if row[check_col] == '':
                            continue

                        count += 1

                        # getting data, label and ids from each row and
                        # concatenating it
                        data, labels, ids = get_data(data_cols[file_num],
                                                     label_cols[file_num],
                                                     id_cols[file_num], data,
                                                     labels, ids, repeat_ids,
                                                     lambda col: str(row[col]))

        if preprocess_func is not None:
            for i in range(len(data)):
                data[i] = preprocess_func(data[i])

    except SpecialException as e:
        raise SpecialException(str(e))

    return data, labels, ids
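
# Hedged usage sketch (file name and zero-based column indices are
# illustrative assumptions):
# data, labels, ids = data_from_csv(["notes.csv"], data_cols=2,
#                                   label_cols=1, id_cols=0,
#                                   repeat_ids=False, first_row=1)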
Example #7
def data_from_excel(filenames,
                    data_cols=None,
                    label_cols=None,
                    id_cols=None,
                    repeat_ids=True,
                    first_row=1,
                    limit=None,
                    preprocess_func=None,
                    check_col=0,
                    encoding=None):
    """Reads data from excel files

    Arguments:
        filenames {list of string} -- List of Excel filenames

    Keyword Arguments:
        data_cols {list of int or int} -- List of location of data columns in each file (default: {None})
        label_cols {list of int or int} -- List of location of label columns in each file (default: {None})
        id_cols {list of int or int} -- List of location of id columns in each file (default: {None})
        repeat_ids {bool} -- If False, data corresponding to already existing ids are concatenated (default: {True})
        first_row {int} -- Starts reading from specified row number (default: {1})
        limit {int} -- Stops reading after specified number of lines have been read (default: {None})
        preprocess_func {function} -- Applies preprocess function to each row in a file (default: {None})
        check_col {int} -- Data column to check whether to continue evaluation (default: {0})
        encoding {str} -- Encoding of file - if not specified, try "utf8" and then "latin-1"

    Returns:
        data {list} -- list of data
        labels {list} -- list of labels
        ids {list} -- list of ids
    """

    # If encoding is not specified, try reading file with UTF-8
    # encoding. If that fails, use latin-1 encoding
    if not encoding:
        try:
            encoding = "utf8"
            pd.read_excel(filenames[0])
        except UnicodeDecodeError:
            encoding = "latin-1"

    try:
        data, labels, ids, data_cols, label_cols, id_cols = _data_helper(
            len(filenames), data_cols, label_cols, id_cols)

        print("Reading data from excel file...")

        count = 0
        for file_num, filename in enumerate(filenames):
            if limit is not None and count == limit:
                break

            workbook = openpyxl.load_workbook(filename,
                                              data_only=True,
                                              read_only=True)
            workbook.encoding = encoding
            sheet_names = workbook.sheetnames
            for sheet_name in sheet_names:
                # getting rows in worksheet
                cur_ws = workbook[sheet_name].rows
                for i, row in enumerate(cur_ws):
                    if i >= first_row:
                        # If the check column is empty, skip the row
                        if row[check_col].value is None:
                            continue
                        count += 1
                        # getting data, label and ids from each row and concatenating it
                        data, labels, ids = get_data(
                            data_cols[file_num], label_cols[file_num],
                            id_cols[file_num], data, labels, ids, repeat_ids,
                            lambda col: str(row[col].value))
        if preprocess_func is not None:
            for i in range(len(data)):
                data[i] = preprocess_func(data[i])

    except SpecialException as e:
        raise SpecialException(str(e))

    return data, labels, ids
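
# Hedged usage sketch (file name and column indices are illustrative
# assumptions; note that every sheet in each workbook is read):
# data, labels, ids = data_from_excel(["notes.xlsx"], data_cols=2,
#                                     label_cols=1, id_cols=0,
#                                     first_row=1)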
Example #8
def get_failures(classifier_runner, cur_dataset, conf_path, print_output=True):
    try:
        failures_dict = {}
        try:
            cur_labels_list = sorted(
                list(
                    set(classifier_runner.classifier.dataset[cur_dataset]
                        ["preds"].tolist())
                    | set(classifier_runner.classifier.dataset[cur_dataset]
                          ["labels"].tolist())))
        except TypeError:
            raise SpecialException(
                "Error occurred when generating stats. Check label file to make sure all ids are unique."
            )
        accuracy, incorrect_indices, num_correct = calculate_accuracy(
            classifier_runner.classifier.dataset[cur_dataset]["preds"],
            classifier_runner.classifier.dataset[cur_dataset]["labels"])

        if print_output:
            print("\nAccuracy: ", accuracy)
            print("\nIds: ",
                  classifier_runner.classifier.dataset[cur_dataset]["ids"])
            print("Predictions: ",
                  classifier_runner.classifier.dataset[cur_dataset]["preds"])
            print("Labels: ",
                  classifier_runner.classifier.dataset[cur_dataset]["labels"])

            print(
                "\nIncorrect Ids: ",
                classifier_runner.classifier.dataset[cur_dataset]["ids"]
                [incorrect_indices])
            print(
                "Incorrect Predictions: ",
                classifier_runner.classifier.dataset[cur_dataset]["preds"]
                [incorrect_indices])
            print(
                "Incorrect Labels: ",
                classifier_runner.classifier.dataset[cur_dataset]["labels"]
                [incorrect_indices])

        classifier_type = classifier_runner.classifier_type.__name__
        classifier_classes = sorted(
            list(classifier_runner.classifier_parameters["regexes"]))

        if classifier_type == "CaptureClassifier":
            cnf_matrix = None

            ppv_and_accuracy = compute_ppv_accuracy_capture(
                classifier_runner.classifier.dataset[cur_dataset]["labels"],
                classifier_runner.classifier.dataset[cur_dataset]["preds"],
                classifier_classes,
                classifier_runner.classifier.negative_label)

            predicted_positive, positive_cases, predicted_negative_cases, negative_cases, \
            false_positives, false_negatives = get_classification_stats_capture(
                classifier_runner.classifier.dataset[cur_dataset]["labels"],
                classifier_runner.classifier.dataset[cur_dataset]["preds"],
                classifier_classes, classifier_runner.classifier.negative_label)

            cur_labels_list = classifier_classes

        else:

            cnf_matrix = confusion_matrix(
                classifier_runner.classifier.dataset[cur_dataset]["labels"],
                classifier_runner.classifier.dataset[cur_dataset]["preds"])
            ppv_and_accuracy = compute_ppv_accuracy_ova(
                cnf_matrix, cur_labels_list)
            predicted_positive, positive_cases, predicted_negative_cases, negative_cases, \
            false_positives, false_negatives = get_classification_stats(cnf_matrix, cur_labels_list)

        if print_output:
            if cnf_matrix is not None:
                print("Confusion Matrix: \n", cnf_matrix)

            print("OVA PPV and Accuracy: ", ppv_and_accuracy)

            print("Number of Positive Predictions: ", predicted_positive)
            print("Actual number of Positive Cases: ", positive_cases)
            print("Number of Predicted Negative Cases: ",
                  predicted_negative_cases)
            print("Actual Number of Negative Cases: ", negative_cases)

        cur_data = classifier_runner.classifier.dataset[cur_dataset]
        for index in incorrect_indices:
            failures_dict[cur_data["ids"][index]] = {
                "label": cur_data["labels"][index],
                "data": cur_data["data"][index],
                "pred": cur_data["preds"][index],
                "matches": cur_data["matches"][index],
                "score": cur_data["scores"][index]
            }

        error_data = {
            "Predicted Positive": predicted_positive,
            "Positive Cases": positive_cases,
            "Predicted Negative": predicted_negative_cases,
            "Negative Cases": negative_cases,
            "False Positives": false_positives,
            "False Negatives": false_negatives,
            "Confusion Matrix": cnf_matrix.tolist() if cnf_matrix is not None else [],
            "OVA PPV and Accuracy": ppv_and_accuracy,
            "Ordered Labels": cur_labels_list,
            "Negative Label": classifier_runner.classifier.negative_label,
            "Classifier Type": classifier_type,
            "Accuracy": accuracy,
            "Num Correct": num_correct,
            "Total Cases": num_correct + len(incorrect_indices)
        }

        if cnf_matrix is not None:
            plot_confusion_matrix(cnf_matrix, cur_labels_list, conf_path)
    except SpecialException as e:
        raise SpecialException(str(e))

    return failures_dict, error_data
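
# Hedged usage sketch (a configured runner is assumed; "valid" and the
# confusion-matrix path are illustrative):
# failures, error_data = get_failures(runner, "valid", "conf_matrix.png",
#                                     print_output=False)
# failures maps each misclassified id to its label, data, pred, matches and
# score; error_data aggregates the statistics listed above.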