예제 #1
0
    def eval(self, json_path, batch_size=100, lowercase=True, ignore_cpos=False, cui_filter=None, score_average='weighted',
            replace_center=None):
        r''' Evaluate the current model on the annotations stored in a json file.

        Args:
            json_path: Path of the json file; its parsed content is handed to `prepare_from_json`.
            batch_size: Forwarded to `eval_network`.
            lowercase: Forwarded to `prepare_from_json`.
            ignore_cpos: Forwarded to `eval_network`.
            cui_filter: Forwarded to `prepare_from_json`.
            score_average: Forwarded to `eval_network`.
            replace_center: Forwarded to `prepare_from_json`.

        Returns:
            Whatever `eval_network` returns for the prepared data.

        Raises:
            Exception: If `self.category_name` is not a key of the prepared data.
        '''
        # Context manager so the file handle is closed even if parsing fails
        with open(json_path, 'r') as f:
            data = json.load(f)

        # Prepare the data
        data = prepare_from_json(data, self.cntx_left, self.cntx_right, self.tokenizer, lowercase=lowercase, cui_filter=cui_filter,
                replace_center=replace_center)

        # Check is the name there
        if self.category_name not in data:
            raise Exception("The category name does not exist in this json file.")

        data = data[self.category_name]

        # We already have everything, just get the data. The existing category
        # values are reused, the returned mapping is not needed.
        data, _ = encode_category_values(data, vals=self.category_values)

        # Convert data tkns to ids
        data = tkns_to_ids(data, self.tokenizer)

        # Run evaluation
        result = eval_network(self.model, data, max_seq_len=(self.cntx_left+self.cntx_right+1), pad_id=self.pad_id,
                batch_size=batch_size, device=self.device, ignore_cpos=ignore_cpos, score_average=score_average)

        return result
예제 #2
0
파일: meta_cat.py 프로젝트: halloju/MedCAT
    def train(self,
              json_path,
              category_name=None,
              model_name='BERT_GRU',
              Bio_BERT_PATH=None,
              lr=0.01,
              test_size=0.1,
              batch_size=100,
              nepochs=20,
              lowercase=True,
              class_weights=None,
              cv=0,
              ignore_cpos=False,
              model_config=None,
              tui_filter=None,
              fine_tune=False,
              auto_save_model=True,
              score_average='weighted',
              replace_center=None,
              seed=11):
        r''' Train (or fine-tune) the model on the annotations stored in a json file.

        Args:
            json_path: Path of the json file; its parsed content is handed to `prepare_from_json`.
            category_name: If not None, stored in `self.category_name` before the lookup.
            model_name: 'lstm' or 'bert_gru' (matched case-insensitively). Any other
                value leaves `self.model` untouched. Passed unchanged to
                `self.load_model` when fine-tuning with cross-validation.
            Bio_BERT_PATH: First positional argument of the `BERT_GRU` constructor.
            lr, test_size, batch_size, nepochs, class_weights, ignore_cpos,
            score_average: Forwarded to `train_network`.
            lowercase, tui_filter, replace_center: Forwarded to `prepare_from_json`.
            cv: 0 runs `train_network` once; a positive value runs that many
                rounds with a freshly reset model and averages the scores.
            model_config: Optional dict with the keys "bid", "num_layers", "input_size",
                "hidden_size" and "dropout" overriding the per-model defaults.
            fine_tune: If True the existing `self.category_values` and model are
                reused instead of being rebuilt.
            auto_save_model: Forwarded to `train_network` in the single-run (cv == 0) case.
            seed: Passed to `set_all_seeds`.

        Returns:
            dict with the keys 'f1', 'p', 'r' and 'cls_report'.

        Raises:
            ValueError: If `cv` is negative.
            Exception: If the category name is not a key of the prepared data.
        '''
        # Fail early: a negative cv would skip both training branches and only
        # blow up later on the undefined scores.
        if cv < 0:
            raise ValueError("cv must be >= 0, got {}".format(cv))
        # Avoid a shared mutable default argument
        if model_config is None:
            model_config = {}
        # The default is spelled 'BERT_GRU', so compare names case-insensitively
        model_key = model_name.lower() if isinstance(model_name, str) else model_name

        def build_model():
            # Return a fresh model for `model_key`, or None for an unknown name
            nclasses = len(self.category_values)
            bid = model_config.get("bid", True)
            dropout = model_config.get("dropout", 0.5)
            if model_key == 'lstm':
                from medcat.utils.models import LSTM
                return LSTM(self.embeddings,
                            self.pad_id,
                            nclasses=nclasses,
                            bid=bid,
                            num_layers=model_config.get("num_layers", 2),
                            input_size=model_config.get("input_size", 300),
                            hidden_size=model_config.get("hidden_size", 300),
                            dropout=dropout)
            if model_key == 'bert_gru':
                from medcat.utils.models import BERT_GRU
                return BERT_GRU(Bio_BERT_PATH,
                                nclasses=nclasses,
                                bid=bid,
                                num_layers=model_config.get("num_layers", 5),
                                input_size=model_config.get("input_size", 768),
                                hidden_size=model_config.get("hidden_size", 768),
                                dropout=dropout)
            return None

        set_all_seeds(seed)
        # Context manager so the file handle is closed even if parsing fails
        with open(json_path, 'r') as f:
            data = json.load(f)

        # Create directories if they don't exist
        if not os.path.exists(self.save_dir):
            os.makedirs(self.save_dir)

        # Prepare the data
        data = prepare_from_json(data,
                                 self.cntx_left,
                                 self.cntx_right,
                                 self.tokenizer,
                                 lowercase=lowercase,
                                 tui_filter=tui_filter,
                                 replace_center=replace_center)

        if category_name is not None:
            self.category_name = category_name

        # Check is the name there
        if self.category_name not in data:
            raise Exception(
                "The category name does not exist in this json file. You've provided '{}', while the possible options are: {}"
                .format(self.category_name, " | ".join(list(data.keys()))))

        data = data[self.category_name]

        if not fine_tune:
            # Encode the category values
            data, self.category_values = encode_category_values(data)
            self.i_category_values = {
                v: k
                for k, v in self.category_values.items()
            }
        else:
            # We already have everything, just get the data
            data, _ = encode_category_values(data, vals=self.category_values)

        # Convert data tkns to ids
        data = tkns_to_ids(data, self.tokenizer)

        if not fine_tune:
            model = build_model()
            if model is not None:
                self.model = model

        # Arguments shared by every call to train_network
        train_kwargs = dict(
            max_seq_len=(self.cntx_left + self.cntx_right + 1),
            lr=lr,
            test_size=test_size,
            pad_id=self.pad_id,
            batch_size=batch_size,
            nepochs=nepochs,
            device=self.device,
            class_weights=class_weights,
            ignore_cpos=ignore_cpos,
            save_dir=self.save_dir,
            score_average=score_average)

        if cv == 0:
            (f1, p, r, cls_report) = train_network(
                self.model,
                data,
                auto_save_model=auto_save_model,
                **train_kwargs)
        else:
            # Mainly for testing, not really used in a normal workflow
            f1s = []
            ps = []
            rs = []
            cls_reports = []
            for _ in range(cv):
                # Reset the model so that no weights carry over between rounds
                if fine_tune:
                    self.load_model(model=model_name)
                else:
                    model = build_model()
                    if model is not None:
                        self.model = model

                (_f1, _p, _r, _cls_report) = train_network(
                    self.model,
                    data,
                    **train_kwargs)
                f1s.append(_f1)
                ps.append(_p)
                rs.append(_r)
                cls_reports.append(_cls_report)
            f1 = np.average(f1s)
            p = np.average(ps)
            r = np.average(rs)

            # Average cls reports, using the first report for the set of labels.
            # Entries that are not dicts are kept as an empty dict.
            cls_report = {}
            for label, entry in cls_reports[0].items():
                cls_report[label] = {}
                if isinstance(entry, dict):
                    for score in entry:
                        cls_report[label][score] = sum(
                            report[label][score]
                            for report in cls_reports) / len(cls_reports)

        print("Best/Average scores: F1: {}, P: {}, R: {}".format(f1, p, r))

        return {'f1': f1, 'p': p, 'r': r, 'cls_report': cls_report}
예제 #3
0
    def train(self,
              json_path,
              category_name=None,
              model_name='lstm',
              lr=0.01,
              test_size=0.1,
              batch_size=100,
              nepochs=20,
              lowercase=True,
              class_weights=None,
              cv=0,
              ignore_cpos=False,
              model_config=None,
              tui_filter=None,
              fine_tune=False,
              auto_save_model=True):
        r''' Train (or fine-tune) the model on the annotations stored in a json file.

        Args:
            json_path: Path of the json file; its parsed content is handed to `prepare_from_json`.
            category_name: If not None, stored in `self.category_name` before the lookup.
            model_name: Only 'lstm' builds a new model; any other value leaves
                `self.model` untouched. Passed to `self.load_model` when
                fine-tuning with cross-validation.
            lr, test_size, batch_size, nepochs, class_weights, ignore_cpos:
                Forwarded to `train_network`.
            lowercase, tui_filter: Forwarded to `prepare_from_json`.
            cv: 0 runs `train_network` once; a positive value runs that many
                rounds with a freshly reset model and averages the scores.
            model_config: Optional dict with the keys "bid", "num_layers", "input_size",
                "hidden_size" and "dropout" overriding the LSTM defaults.
            fine_tune: If True the existing `self.category_values` and model are
                reused instead of being rebuilt.
            auto_save_model: Forwarded to `train_network` in the single-run (cv == 0) case.

        Returns:
            dict with the keys 'f1', 'p' and 'r'.

        Raises:
            ValueError: If `cv` is negative.
            Exception: If the category name is not a key of the prepared data.
        '''
        # Fail early: a negative cv would skip both training branches and only
        # blow up later on the undefined scores.
        if cv < 0:
            raise ValueError("cv must be >= 0, got {}".format(cv))
        # Avoid a shared mutable default argument
        if model_config is None:
            model_config = {}

        def build_model():
            # Return a fresh model for `model_name`, or None for an unknown name
            if model_name != 'lstm':
                return None
            from medcat.utils.models import LSTM
            return LSTM(self.embeddings,
                        self.pad_id,
                        nclasses=len(self.category_values),
                        bid=model_config.get("bid", True),
                        num_layers=model_config.get("num_layers", 2),
                        input_size=model_config.get("input_size", 300),
                        hidden_size=model_config.get("hidden_size", 300),
                        dropout=model_config.get("dropout", 0.5))

        # Context manager so the file handle is closed even if parsing fails
        with open(json_path, 'r') as f:
            data = json.load(f)

        # Create directories if they don't exist
        if not os.path.exists(self.save_dir):
            os.makedirs(self.save_dir)

        # Prepare the data
        data = prepare_from_json(data,
                                 self.cntx_left,
                                 self.cntx_right,
                                 self.tokenizer,
                                 lowercase=lowercase,
                                 tui_filter=tui_filter)

        if category_name is not None:
            self.category_name = category_name

        # Check is the name there
        if self.category_name not in data:
            raise Exception(
                "The category name does not exist in this json file.")

        data = data[self.category_name]

        if not fine_tune:
            # Encode the category values
            data, self.category_values = encode_category_values(data)
            self.i_category_values = {
                v: k
                for k, v in self.category_values.items()
            }
        else:
            # We already have everything, just get the data
            data, _ = encode_category_values(data, vals=self.category_values)

        # Convert data tkns to ids
        data = tkns_to_ids(data, self.tokenizer)

        if not fine_tune:
            model = build_model()
            if model is not None:
                self.model = model

        # Arguments shared by every call to train_network
        train_kwargs = dict(
            max_seq_len=(self.cntx_left + self.cntx_right + 1),
            lr=lr,
            test_size=test_size,
            pad_id=self.pad_id,
            batch_size=batch_size,
            nepochs=nepochs,
            device=self.device,
            class_weights=class_weights,
            ignore_cpos=ignore_cpos,
            save_dir=self.save_dir)

        if cv == 0:
            (f1, p, r) = train_network(self.model,
                                       data,
                                       auto_save_model=auto_save_model,
                                       **train_kwargs)
        else:
            # Mainly for testing, not really used in a normal workflow
            f1s = []
            ps = []
            rs = []
            for _ in range(cv):
                # Reset the model so that no weights carry over between rounds
                if fine_tune:
                    self.load_model(model=model_name)
                else:
                    model = build_model()
                    if model is not None:
                        self.model = model

                (_f1, _p, _r) = train_network(self.model, data, **train_kwargs)
                f1s.append(_f1)
                ps.append(_p)
                rs.append(_r)
            f1 = np.average(f1s)
            p = np.average(ps)
            r = np.average(rs)

        print("Best/Average scores: F1: {}, P: {}, R: {}".format(f1, p, r))

        return {'f1': f1, 'p': p, 'r': r}
예제 #4
0
    def train(self,
              json_path,
              category_name,
              model_name='lstm',
              lr=0.01,
              test_size=0.1,
              batch_size=100,
              nepochs=20,
              device='cpu',
              lowercase=True,
              class_weights=None,
              cv=0):
        r''' Train a new model on the annotations stored in a json file.

        The trained model is stored in `self.model`; the category name and its
        encoded values are stored in `self.category_name`, `self.category_values`
        and `self.i_category_values`.

        Args:
            json_path: Path of the json file; its parsed content is handed to `prepare_from_json`.
            category_name: Key of the prepared data to train on.
            model_name: Only 'lstm' is supported.
            lr, test_size, batch_size, nepochs, device, class_weights:
                Forwarded to `train_network`.
            lowercase: Forwarded to `prepare_from_json`.
            cv: 0 runs `train_network` once; a positive value runs that many
                rounds with a freshly built model and averages the scores.

        Raises:
            ValueError: If `cv` is negative or `model_name` is not supported.
            Exception: If `category_name` is not a key of the prepared data.
        '''
        # Fail early with a clear message; previously these cases only surfaced
        # later as an unbound local variable.
        if cv < 0:
            raise ValueError("cv must be >= 0, got {}".format(cv))
        if model_name != 'lstm':
            raise ValueError(
                "Unsupported model_name '{}', only 'lstm' is available".format(
                    model_name))

        def build_model():
            # Return a fresh, untrained LSTM for the current category values
            from medcat.utils.models import LSTM
            return LSTM(self.embeddings,
                        self.pad_id,
                        nclasses=len(self.category_values))

        # Context manager so the file handle is closed even if parsing fails
        with open(json_path, 'r') as f:
            data = json.load(f)

        # Prepare the data
        data = prepare_from_json(data,
                                 self.cntx_left,
                                 self.cntx_right,
                                 self.tokenizer,
                                 lowercase=lowercase)

        # Check is the name there
        if category_name not in data:
            raise Exception(
                "The category name does not exist in this json file")

        data = data[category_name]

        # Encode the category values
        self.category_name = category_name
        data, self.category_values = encode_category_values(data)
        self.i_category_values = {
            v: k
            for k, v in self.category_values.items()
        }

        # Convert data tkns to ids
        data = tkns_to_ids(data, self.tokenizer)

        model = build_model()

        # Arguments shared by every call to train_network
        train_kwargs = dict(
            max_seq_len=(self.cntx_left + self.cntx_right + 1),
            lr=lr,
            test_size=test_size,
            pad_id=self.pad_id,
            batch_size=batch_size,
            nepochs=nepochs,
            device=device,
            class_weights=class_weights)

        if cv == 0:
            (f1, p, r) = train_network(model, data, **train_kwargs)
        else:
            # Mainly for testing, not really used in a normal workflow
            f1s = []
            ps = []
            rs = []
            for _ in range(cv):
                # Reset the model so that no weights carry over between rounds
                model = build_model()

                (_f1, _p, _r) = train_network(model, data, **train_kwargs)
                f1s.append(_f1)
                ps.append(_p)
                rs.append(_r)
            f1 = np.average(f1s)
            p = np.average(ps)
            r = np.average(rs)

        print("Best/Average scores: F1: {}, P: {}, R: {}".format(f1, p, r))

        self.model = model