Exemplo n.º 1
0
def predict(filename, model_id='latest'):
    '''Runs the named entity recognition model against a filename.

    The stored model is loaded from ``ner_mdl.pickle`` under the path
    resolved for ``model_id`` and fed into a fresh model obtained from
    ``ner.get_model()`` before the prediction is made.

    Args:
        filename: The filename to evaluate.
        model_id (string): The id of the model to use.

    Returns:
        The result of ``EntityRecognizer.predict`` for ``filename``.
    '''
    model_dir = __get_model_path(model_id)
    pipeline, _unused = ner.get_model()
    stored = persistence.bin_to_obj(model_dir + 'ner_mdl.pickle')
    pipeline.from_bytes(stored)
    return EntityRecognizer(pipeline).predict(filename)
Exemplo n.º 2
0
    def build(self, *args):
        '''Builds the classifier, recognizer and label list from serialized parts.

        Args:
            args: Positional serialized inputs, read by index:
                args[0]: pickled object passed as the second ``Classifier``
                    argument.
                args[1]: pickled object passed as the first ``Classifier``
                    argument.
                args[2]: JSON object mapping label id to label name.
                args[3]: pickled bytes loaded into the NER model through
                    ``from_bytes``.

        Returns:
            dict: ``labels`` (list of ``{'id': ..., 'name': ...}`` dicts),
            ``classifier`` and ``recognizer``.
        '''
        # Parse the label mapping once; the list is derived before the dict is
        # handed to the classifier so both views start from the same content.
        label_dict = json.loads(args[2])
        label_list = [{'id': key, 'name': value}
                      for key, value in label_dict.items()]

        # NOTE(review): pickle.loads can execute arbitrary code on crafted
        # payloads -- confirm that args only ever come from a trusted source.
        classifier = Classifier(pickle.loads(args[1]), pickle.loads(args[0]),
                                label_dict)

        nlp, _ = ner.get_model()
        nlp.from_bytes(pickle.loads(args[3]))
        recognizer = EntityRecognizer(nlp)

        return {
            "labels": label_list,
            "classifier": classifier,
            "recognizer": recognizer
        }
    def run(self):
        '''Runs the pipeline step: trains the NER pipe and stores the model.

        Training data is loaded from ``self.input['train_data']``, every entity
        label found in it is added to the NER pipe, and the model is updated
        for a fixed number of shuffled passes with all other pipes disabled.
        The resulting model bytes are written to ``self.output['model']``.

        Side effects:
            Creates the output directory (formatted with a timestamp) when it
            is missing, and replaces ``self.output['model']`` with the path
            formatted with that directory.
        '''
        iterations = 10
        train_data = persistence.bin_to_obj(self.input['train_data'])
        nlp, ner_pipe = ner.get_model()

        # Add labels. A missing or empty "entities" entry is skipped instead
        # of failing on iteration over None.
        # NOTE(review): ent[2] is presumably the label of a
        # (start, end, label) triple -- confirm against the training data.
        for _, annotations in train_data:
            for ent in annotations.get("entities") or ():
                ner_pipe.add_label(ent[2])

        # get names of other pipes to disable them during training
        other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
        with nlp.disable_pipes(*other_pipes):  # only train NER
            nlp.begin_training()
            for _ in range(iterations):
                random.shuffle(train_data)
                losses = {}
                # batch up the examples using spaCy's minibatch
                batches = minibatch(train_data,
                                    size=compounding(4.0, 32.0, 1.001))
                for batch in batches:
                    texts, annotations = zip(*batch)
                    nlp.update(texts, annotations, drop=0.5, losses=losses)

        output_dir = self.output['output_dir'].format(
            timestamp=self.__get_timestamp())
        # makedirs with exist_ok avoids the check-then-create race and also
        # creates missing parent directories.
        os.makedirs(output_dir, exist_ok=True)

        self.output['model'] = self.output['model'].format(
            output_dir=output_dir)

        persistence.obj_to_bin(nlp.to_bytes(), self.output['model'])
Exemplo n.º 4
0
    def run(self):
        '''Runs the pipeline step.

        Loads the model referenced by ``self.input['TrainNerModel_model']``,
        predicts on the ``name`` column of the predictions CSV, and prints
        every row whose prediction differs from its ``expected`` column.
        '''
        model, _unused = ner.get_model()
        stored = persistence.bin_to_obj(self.input['TrainNerModel_model'])
        model.from_bytes(stored)
        recognizer = EntityRecognizer(model)

        frame = pd.read_csv(self.input['predictions'])
        frame['actual'] = frame.apply(
            lambda row: recognizer.predict(row['name']), axis=1)

        def report_mismatches(row):
            predicted = list(row['actual'])
            wanted = list(ast.literal_eval(row['expected']))

            # Different entity counts: report the whole row once and stop.
            if len(predicted) != len(wanted):
                self.print(
                    '\'{name}\' [ expected: {expected}, actual: {actual} ]',
                    name=row['name'],
                    expected=row['expected'],
                    actual=row['actual'])
                return

            # Same counts: compare the first two fields of each pair and
            # report every pair that disagrees.
            for got, want in zip(predicted, wanted):
                if got[0] != want[0] or got[1] != want[1]:
                    self.print(
                        '\'{name}\' [ expected: {expected}, actual: {actual} ]',
                        name=row['name'],
                        expected=want,
                        actual=got)

        frame.apply(report_mismatches, axis=1)
def fixture_model():
    '''Builds an EntityRecognizer backed by the stored model file.

    The content of 'models/ner_mdl.pickle' is loaded through
    ``persistence.bin_to_obj`` and passed to ``from_bytes`` on a model
    obtained from ``ner.get_model()``.
    '''
    model, _unused = ner.get_model()
    stored = persistence.bin_to_obj('models/ner_mdl.pickle')
    model.from_bytes(stored)
    return EntityRecognizer(model)