Example #1
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
          seed=0, gold_preproc=False, n_sents=0, corruption_level=0,
          beam_width=1, verbose=False,
          use_orig_arc_eager=False):
    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]
   
    templates = Tagger.default_templates()
    nlp = Language(data_dir=model_dir, tagger=False)
    nlp.tagger = Tagger.blank(nlp.vocab, templates)

    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for raw_text, sents in gold_tuples:
            if gold_preproc:
                raw_text = None
            else:
                sents = _merge_sents(sents)
            for annot_tuples, ctnt in sents:
                words = annot_tuples[1]
                gold_tags = annot_tuples[2]
                score_model(scorer, nlp, raw_text, annot_tuples)
                if raw_text is None:
                    tokens = nlp.tokenizer.tokens_from_list(words)
                else:
                    tokens = nlp.tokenizer(raw_text)
                loss += nlp.tagger.train(tokens, gold_tags)
        random.shuffle(gold_tuples)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
                                                   scorer.tags_acc,
                                                   scorer.token_acc))
    nlp.end_training(model_dir)
Example #2
def main(output_dir=None):
    if output_dir is not None:
        output_dir = Path(output_dir)
        ensure_dir(output_dir)
        ensure_dir(output_dir / "pos")
        ensure_dir(output_dir / "vocab")

    vocab = Vocab(tag_map=TAG_MAP)
    # The default_templates argument is where features are specified. See
    # spacy/tagger.pyx for the defaults.
    tagger = Tagger(vocab)
    for i in range(25):
        for words, tags in DATA:
            doc = Doc(vocab, words=words)
            gold = GoldParse(doc, tags=tags)
            tagger.update(doc, gold)
        random.shuffle(DATA)
    tagger.model.end_training()
    doc = Doc(vocab,
              orths_and_spaces=zip(["I", "like", "blue", "eggs"], [True] * 4))
    tagger(doc)
    for word in doc:
        print(word.text, word.tag_, word.pos_)
    if output_dir is not None:
        tagger.model.dump(str(output_dir / 'pos' / 'model'))
        with (output_dir / 'vocab' / 'strings.json').open('w') as file_:
            tagger.vocab.strings.dump(file_)
Example #3
def main(output_dir=None):
    if output_dir is not None:
        output_dir = Path(output_dir)
        ensure_dir(output_dir)
        ensure_dir(output_dir / "pos")
        ensure_dir(output_dir / "vocab")

    vocab = Vocab(tag_map=TAG_MAP)
    # The default_templates argument is where features are specified. See
    # spacy/tagger.pyx for the defaults.
    tagger = Tagger(vocab)
    for i in range(25):
        for words, tags in DATA:
            doc = Doc(vocab, words=words)
            gold = GoldParse(doc, tags=tags)
            tagger.update(doc, gold)
        random.shuffle(DATA)
    tagger.model.end_training()
    doc = Doc(vocab, orths_and_spaces=zip(["I", "like", "blue", "eggs"], [True] * 4))
    tagger(doc)
    for word in doc:
        print(word.text, word.tag_, word.pos_)
    if output_dir is not None:
        tagger.model.dump(str(output_dir / 'pos' / 'model'))
        with (output_dir / 'vocab' / 'strings.json').open('wb') as file_:
            tagger.vocab.strings.dump(file_)
Example #4
def train(Language,
          gold_tuples,
          model_dir,
          n_iter=15,
          feat_set=u'basic',
          seed=0,
          gold_preproc=False,
          force_gold=False):
    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(pos_model_dir)

    Config.write(dep_model_dir,
                 'config',
                 features=feat_set,
                 seed=seed,
                 labels=ArcEager.get_labels(gold_tuples))

    nlp = Language(data_dir=model_dir,
                   tagger=False,
                   parser=False,
                   entity=False)
    nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
    nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)

    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for _, sents in gold_tuples:
            for annot_tuples, _ in sents:
                if len(annot_tuples[1]) == 1:
                    continue

                score_model(scorer, nlp, None, annot_tuples, verbose=False)

                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples, make_projective=True)
                if not gold.is_projective:
                    raise Exception(
                        "Non-projective sentence in training, after we should "
                        "have enforced projectivity: %s" % annot_tuples)

                loss += nlp.parser.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        random.shuffle(gold_tuples)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f' %
              (itn, loss, scorer.uas, scorer.tags_acc, scorer.token_acc))
    print('end training')
    nlp.end_training(model_dir)
    print('done')
Example #5
def main(train_loc, dev_loc, model_dir, tag_map_loc=None):
    if tag_map_loc:
        with open(tag_map_loc) as file_:
            tag_map = json.loads(file_.read())
    else:
        tag_map = DEFAULT_TAG_MAP
    train_sents = list(read_conllx(train_loc))
    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)

    actions = ArcEager.get_actions(gold_parses=train_sents)
    features = get_templates('basic')

    model_dir = pathlib.Path(model_dir)
    if not (model_dir / 'deps').exists():
        (model_dir / 'deps').mkdir()
    with (model_dir / 'deps' / 'config.json').open('wb') as file_:
        file_.write(
            json.dumps(
                {'pseudoprojective': True, 'labels': actions, 'features': features}).encode('utf8'))
    vocab = Vocab(lex_attr_getters=Language.Defaults.lex_attr_getters, tag_map=tag_map)
    # Populate vocab
    for _, doc_sents in train_sents:
        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
            for word in words:
                _ = vocab[word]
            for dep in deps:
                _ = vocab[dep]
            for tag in tags:
                _ = vocab[tag]
            if tag_map:
                for tag in tags:
                    assert tag in tag_map, repr(tag)
    tagger = Tagger(vocab, tag_map=tag_map)
    parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0)

    for itn in range(15):
        loss = 0.
        for _, doc_sents in train_sents:
            for (ids, words, tags, heads, deps, ner), _ in doc_sents:
                doc = Doc(vocab, words=words)
                gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
                tagger(doc)
                loss += parser.update(doc, gold, itn=itn)
                doc = Doc(vocab, words=words)
                tagger.update(doc, gold)
        random.shuffle(train_sents)
        scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
        print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc))
    nlp = Language(vocab=vocab, tagger=tagger, parser=parser)
    nlp.end_training(model_dir)
    scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
Example #6
def main(train_loc, dev_loc, output_dir=None):
    if output_dir is not None:
        output_dir = Path(output_dir)
        ensure_dir(output_dir)
        ensure_dir(output_dir / "pos")
        ensure_dir(output_dir / "vocab")

    train_data = read_ud_data(train_loc)
    vocab = Vocab(tag_map=TAG_MAP, lex_attr_getters=LEX_ATTR_GETTERS)
    # Populate vocab
    for words, _ in train_data:
        for word in words:
            _ = vocab[word]
    
    model = spacy.tagger.TaggerModel(spacy.tagger.Tagger.feature_templates)
    tagger = Tagger(vocab, model)
    print(tagger.tag_names)
    for i in range(30):
        print("training model (iteration " + str(i) + ")...")
        score = 0.
        num_samples = 0.
        for words, tags in train_data:
            doc = Doc(vocab, words=words)
            gold = GoldParse(doc, tags=tags)
            cost = tagger.update(doc, gold)
            for i, word in enumerate(doc):
                num_samples += 1
                if word.tag_ == tags[i]:
                    score += 1
        print('Train acc', score/num_samples) 
        random.shuffle(train_data)
    tagger.model.end_training()

    score = 0.0
    test_data = read_ud_data(dev_loc)
    num_samples = 0
    for words, tags in test_data:
        doc = Doc(vocab, words)
        tagger(doc)
        for i, word in enumerate(doc):
            num_samples += 1
            if word.tag_ == tags[i]:
                score += 1
    print("score: " + str(score / num_samples * 100.0))
    
    if output_dir is not None:
        tagger.model.dump(str(output_dir / 'pos' / 'model'))
        with (output_dir / 'vocab' / 'strings.json').open('w') as file_:
            tagger.vocab.strings.dump(file_)
Example #7
def main(model_dir=None):
    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)

    # v1.1.2 onwards
    if nlp.tagger is None:
        print('---- WARNING ----')
        print('Data directory not found')
        print(
            'please run: `python -m spacy.en.download --force all` for better performance'
        )
        print('Using feature templates for tagging')
        print('-----------------')
        nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)

    train_data = json.load(open('train_ner.json'))
    ner = train_ner(nlp, train_data, ['Event_Time'])

    doc = nlp.make_doc('how about coffee tomorrow at 5pm?')
    nlp.tagger(doc)
    ner(doc)
    for word in doc:
        print(word.text, word.orth, word.lower, word.tag_, word.ent_type_,
              word.ent_iob)

    if model_dir is not None:
        save_model(ner, model_dir)
Example #8
def main(model_dir=None):
    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)

    # v1.1.2 onwards
    if nlp.tagger is None:
        print('---- WARNING ----')
        print('Data directory not found')
        print(
            'please run: `python -m spacy.en.download --force all` for better performance'
        )
        print('Using feature templates for tagging')
        print('-----------------')
        nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)

    train_data = [('Who is Shaka Khan?',
                   [(len('Who is '), len('Who is Shaka Khan'), 'PERSON')]),
                  ('I like London and Berlin.', [
                      (len('I like '), len('I like London'), 'LOC'),
                      (len('I like London and '),
                       len('I like London and Berlin'), 'LOC')
                  ])]
    ner = train_ner(nlp, train_data, ['PERSON', 'LOC'])

    doc = nlp.make_doc('Who is Shaka Khan?')
    nlp.tagger(doc)
    ner(doc)
    for word in doc:
        print(word.text, word.orth, word.lower, word.tag_, word.ent_type_,
              word.ent_iob)

    if model_dir is not None:
        save_model(ner, model_dir)
Example #9
def main(model_dir=None):
    nlp = spacy.get_lang_class('pt')(path=None)

    # v1.1.2 onwards
    if nlp.tagger is None:
        print('Setting tagger')
        nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)

    if (len(sys.argv) > 4):
        filetrain = sys.argv[1]
        model_dir = sys.argv[2]
        level = sys.argv[3]
        n_iter = sys.argv[4]
    else:
        print("Usage: python " + sys.argv[0] +
              " <input filename train> <model_dir> <level> <n_iterations>\n")
        sys.exit()

    train_data = get_training_data(filetrain)

    nlp = create_vocab(nlp, train_data)

    categories = [
        'Pessoa', 'Organizacao', 'Localizacao', 'Curso', 'Data', 'Hora',
        'Evento', 'UnidadeOrganica'
    ]

    ner = train_ner(nlp, train_data, categories, int(n_iter))

    if model_dir is not None:
        save_model(ner, model_dir + '/' + level)
Example #10
def main(tagged_output,
         traindata,
         testdata,
         traindataformat="",
         testdataformat="",
         model_dir=None):
    nlp = spacy.load('de', parser=False, entity=False, add_vectors=False)

    # v1.1.2 onwards
    if nlp.tagger is None:
        print('---- WARNING ----')
        print('Data directory not found')
        print(
            'please run: `python -m spacy.en.download --force all` for better performance'
        )
        print('Using feature templates for tagging')
        print('-----------------')
        nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)

    if traindataformat == 'germeval':
        Cv = convert_germaeval2spacy(traindata)
        train_data = Cv.convert()[0]
        ner = train_ner(nlp, train_data, [
            'B-OTH', 'I-OTH', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-PER',
            'I-PER', 'O'
        ])

    elif traindataformat == 'conll':
        Cv = convert_conll2spacy(traindata)
        train_data = Cv.convert()[0]
        ner = train_ner(nlp, train_data, [
            'B-MISC', 'I-MISC', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-PER',
            'I-PER', 'O'
        ])

    if testdataformat == 'germeval':
        Cv_test = convert_germaeval2spacy(testdata)
        test_data = Cv_test.convert()[2]

    elif testdataformat == 'conll':
        Cv_test = convert_conll2spacy(testdata)
        test_data = Cv_test.convert()[2]

    #doc = nlp.make_doc(test_data)
    doc = Doc(nlp.vocab, words=test_data)
    ner(doc)
    nlp.tagger(doc)
    i = 0
    testfilespacygermeval = open(tagged_output, "w")
    for word in doc:
        print(word.text, word.orth, word.lower, word.tag_, word.ent_type_,
              word.ent_iob)
        line = word.text + "\t" + word.ent_type_ + "\n"
        testfilespacygermeval.write(line)
        i += 1
    print(i)

    if model_dir is not None:
        save_model(ner, model_dir)
Example #11
    def from_dir(cls, tag_map, model_dir):
        vocab = Vocab(tag_map=tag_map, get_lex_attr=Language.default_lex_attrs())
        tokenizer = Tokenizer(vocab, {}, None, None, None)
        tagger = Tagger.blank(vocab, TAGGER_TEMPLATES)

        cfg = Config.read(path.join(model_dir, 'deps'), 'config')
        parser = Parser.from_dir(path.join(model_dir, 'deps'), vocab.strings, ArcEager)
        return cls(vocab, tokenizer, tagger, parser)
Example #12
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', seed=0,
          gold_preproc=False, force_gold=False):
    dep_model_dir = path.join(model_dir, 'deps')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(pos_model_dir)

    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                 labels=ArcEager.get_labels(gold_tuples))

    nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
    nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
    nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
 
    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for _, sents in gold_tuples:
            for annot_tuples, _ in sents:
                if len(annot_tuples[1]) == 1:
                    continue

                score_model(scorer, nlp, None, annot_tuples, verbose=False)

                tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples, make_projective=True)
                if not gold.is_projective:
                    raise Exception(
                        "Non-projective sentence in training, after we should "
                        "have enforced projectivity: %s" % annot_tuples
                    )
 
                loss += nlp.parser.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        random.shuffle(gold_tuples)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas,
                                             scorer.tags_acc, scorer.token_acc))
    print('end training')
    nlp.end_training(model_dir)
    print('done')
Example #13
def main(output_dir):
    ensure_dir(output_dir)
    ensure_dir(output_dir, "pos")
    ensure_dir(output_dir, "vocab")

    vocab = Vocab(tag_map=TAG_MAP)
    tokenizer = Tokenizer(vocab, {}, None, None, None)
    # The default_templates argument is where features are specified. See
    # spacy/tagger.pyx for the defaults.
    tagger = Tagger.blank(vocab, Tagger.default_templates())

    for i in range(5):
        for words, tags in DATA:
            tokens = tokenizer.tokens_from_list(words)
            tagger.train(tokens, tags)
        random.shuffle(DATA)
    tagger.model.end_training(path.join(output_dir, 'pos', 'model'))
    vocab.strings.dump(path.join(output_dir, 'vocab', 'strings.txt'))
Example #14
def main(output_dir):
    ensure_dir(output_dir)
    ensure_dir(output_dir, "pos")
    ensure_dir(output_dir, "vocab")
    
    vocab = Vocab(tag_map=TAG_MAP)
    tokenizer = Tokenizer(vocab, {}, None, None, None)
    # The default_templates argument is where features are specified. See
    # spacy/tagger.pyx for the defaults.
    tagger = Tagger.blank(vocab, Tagger.default_templates())

    for i in range(5):
        for words, tags in DATA:
            tokens = tokenizer.tokens_from_list(words)
            tagger.train(tokens, tags)
        random.shuffle(DATA)
    tagger.model.end_training(path.join(output_dir, 'pos', 'model'))
    vocab.strings.dump(path.join(output_dir, 'vocab', 'strings.txt'))
Example #15
def main(train_loc, dev_loc, model_dir, tag_map_loc):
    with open(tag_map_loc) as file_:
        tag_map = json.loads(file_.read())
    train_sents = list(read_conllx(train_loc))
    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)

    actions = ArcEager.get_actions(gold_parses=train_sents)
    features = get_templates('basic')
    
    model_dir = pathlib.Path(model_dir)
    with (model_dir / 'deps' / 'config.json').open('w') as file_:
        json.dump({'pseudoprojective': True, 'labels': actions, 'features': features}, file_)

    vocab = Vocab(lex_attr_getters=Language.Defaults.lex_attr_getters, tag_map=tag_map)
    # Populate vocab
    for _, doc_sents in train_sents:
        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
            for word in words:
                _ = vocab[word]
            for dep in deps:
                _ = vocab[dep]
            for tag in tags:
                _ = vocab[tag]
            for tag in tags:
                assert tag in tag_map, repr(tag)
    tagger = Tagger(vocab, tag_map=tag_map)
    parser = DependencyParser(vocab, actions=actions, features=features)
    
    for itn in range(15):
        for _, doc_sents in train_sents:
            for (ids, words, tags, heads, deps, ner), _ in doc_sents:
                doc = Doc(vocab, words=words)
                gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
                tagger(doc)
                parser.update(doc, gold)
                doc = Doc(vocab, words=words)
                tagger.update(doc, gold)
        random.shuffle(train_sents)
        scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
        print('%d:\t%.3f\t%.3f' % (itn, scorer.uas, scorer.tags_acc))
    nlp = Language(vocab=vocab, tagger=tagger, parser=parser)
    nlp.end_training(model_dir)
    scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
Example #16
    def from_dir(cls, tag_map, model_dir):
        vocab = Vocab(tag_map=tag_map,
                      get_lex_attr=Language.default_lex_attrs())
        tokenizer = Tokenizer(vocab, {}, None, None, None)
        tagger = Tagger.blank(vocab, TAGGER_TEMPLATES)

        cfg = Config.read(path.join(model_dir, 'deps'), 'config')
        parser = Parser.from_dir(path.join(model_dir, 'deps'), vocab.strings,
                                 ArcEager)
        return cls(vocab, tokenizer, tagger, parser)
Example #17
def train(Language,
          gold_tuples,
          model_dir,
          n_iter=15,
          feat_set=u'basic',
          seed=0,
          gold_preproc=False,
          n_sents=0,
          corruption_level=0,
          beam_width=1,
          verbose=False,
          use_orig_arc_eager=False):
    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]

    templates = Tagger.default_templates()
    nlp = Language(data_dir=model_dir, tagger=False)
    nlp.tagger = Tagger.blank(nlp.vocab, templates)

    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for raw_text, sents in gold_tuples:
            if gold_preproc:
                raw_text = None
            else:
                sents = _merge_sents(sents)
            for annot_tuples, ctnt in sents:
                words = annot_tuples[1]
                gold_tags = annot_tuples[2]
                score_model(scorer, nlp, raw_text, annot_tuples)
                if raw_text is None:
                    tokens = nlp.tokenizer.tokens_from_list(words)
                else:
                    tokens = nlp.tokenizer(raw_text)
                loss += nlp.tagger.train(tokens, gold_tags)
        random.shuffle(gold_tuples)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' %
              (itn, loss, scorer.uas, scorer.ents_f, scorer.tags_acc,
               scorer.token_acc))
    nlp.end_training(model_dir)
Example #18
def get_model(model_name):
    if model_name not in _models:
        model = spacy.load(model_name)
        if model.tagger is None:
            model.tagger = Tagger(model.vocab, features=Tagger.feature_templates)
        if model.entity is None:
            model.entity = EntityRecognizer(model.vocab, entity_types=['PERSON', 'NORP', 'FACILITY', 'ORG', 'GPE',
                                                                       'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART',
                                                                       'LANGUAGE', 'DATE', 'TIME', 'PERCENT',
                                                                       'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL'])
        model.pipeline = [model.tagger, model.entity, model.parser]
        _models[model_name] = model
    return _models[model_name]
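A hedged usage sketch for the cached loader above. It assumes the module-level `_models = {}` dict and the spaCy 1.x imports (`spacy`, `Tagger`, `EntityRecognizer`) from the surrounding example; the sample sentence is invented.

_models = {}  # module-level cache that get_model() expects

nlp = get_model('en')                        # first call loads and patches the model
doc = nlp(u'Apple is opening an office in London.')
print([(ent.text, ent.label_) for ent in doc.ents])

assert get_model('en') is nlp                # later calls return the cached model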
Example #19
def main(model_dir=None):
    #, parser=False, entity=False, add_vectors=False)
    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)
    # v1.1.2 onwards
    if nlp.tagger is None:
        print('---- WARNING ----')
        print('Data directory not found')
        print(
            'please run: `python -m spacy.en.download --force all` for better performance'
        )
        print('Using feature templates for tagging')
        print('-----------------')
        nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)
    train_data = [
        ('I want to buy a Boxer', [(len('I want to buy a '),
                                    len('I want to buy a Boxer'), 'PRODUCT')]),
        ('Do you have a Blanket', [(len('Do you have a '),
                                    len('Do you have a Blanket'), 'PRODUCT')]),
        ('Can you show me some Pants', [(len('Can you show me some '),
                                         len('Can you show me some Pants'),
                                         'PRODUCT')]),
        ('Show me some tops', [(len('Show me some '), len('Show me some tops'),
                                'PRODUCT')]),
    ]
    ner = train_ner(nlp, train_data, ['PRODUCT'])

    #     doc = nlp.make_doc('I want a Blanket')
    #     nlp.tagger(doc)
    #     ner(doc)
    #     for word in doc:
    #         print(word.text, word.orth, word.lower, word.tag_, word.ent_type_, word.ent_iob)
    #     train_data = [
    #         (
    #             'Radha',
    #             [(0, len('Radha'), 'PRODUCT')]
    #         )
    #         ]
    #     ner = train_ner(nlp, train_data, ['PRODUCT'])
    #     doc = nlp.make_doc('where is London?')
    #     nlp.tagger(doc)
    #     ner(doc)
    #     for word in doc:
    #         print(word.text,word.ent_type_)

    if model_dir is not None:
        save_model(ner, model_dir)
Example #20
def main(data_dir, model_dir=None, exclude_normalize_tags=None, keys={}):
    '''
    data_dir -> path to brat annotation data; searched recursively.
    model_dir -> path to save the spaCy training model.
    exclude_normalize_tags -> list of tags to exclude from normalization. If None, no normalization is performed.
    keys -> dict translating brat tags to training tags; keys not in the dict are preserved.
    '''

    r = RepoModel(data_dir, recursive=True, cached=False)

    nlp = spacy.load('en')

    # v1.1.2 onwards
    if nlp.tagger is None:
        print('---- WARNING ----')
        print('Data directory not found')
        print(
            'please run: `python -m spacy.en.download --force all` for better performance'
        )
        print('Using feature templates for tagging')
        print('-----------------')
        nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)

    normalized_train_data = []
    excludes = exclude_normalize_tags  #we have manually tagged all instances of these

    for key, data in r.documents.items():
        if exclude_normalize_tags:
            normalized_train_data.extend(
                normalize_tags(nlp, get_annotated_sents(data, keys), excludes))
        else:
            normalized_train_data.extend(get_annotated_sents(data, keys))

    nlp = train_ner(nlp, normalized_train_data, keys.values())

    doc = nlp(
        u"Hi Adam,\nSounds great to me. I'll send through the QA department. In the invite you through Skype, and we can discuss if Applause is right for you.\nI look forward to it!\nRegards,\nAndrew"
    )
    for word in doc:
        print(word.text, word.tag_, word.ent_type_)

    if model_dir is not None:
        save_model(nlp, model_dir)
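Given the docstring above, a hypothetical invocation of this `main` might look as follows; the paths and the brat-to-spaCy tag mapping are invented for illustration.

main('data/brat_corpus',                           # searched recursively for brat annotations
     model_dir='models/email_ner',                 # where the trained pipeline is saved
     exclude_normalize_tags=['Person'],            # tags that were tagged exhaustively by hand
     keys={'Org': 'ORG', 'Person': 'PERSON'})      # map brat tags to training labels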
Example #21
def main(model_dir=None):
    if model_dir is not None:
        model_dir = pathlib.Path(model_dir)
        if not model_dir.exists():
            model_dir.mkdir()
        assert model_dir.is_dir()

    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)

    # v1.1.2 onwards
    if nlp.tagger is None:
        print('---- WARNING ----')
        print('Data directory not found')
        print('please run: `python -m spacy.en.download --force all` for better performance')
        print('Using feature templates for tagging')
        print('-----------------')
        nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)

    train_data = [
        (
            'Who is Shaka Khan?',
            [(len('Who is '), len('Who is Shaka Khan'), 'PERSON')]
        ),
        (
            'I like London and Berlin.',
            [(len('I like '), len('I like London'), 'LOC'),
            (len('I like London and '), len('I like London and Berlin'), 'LOC')]
        )
    ]
    ner = train_ner(nlp, train_data, ['PERSON', 'LOC'])

    doc = nlp.make_doc('Who is Shaka Khan?')
    nlp.tagger(doc)
    ner(doc)
    for word in doc:
        print(word.text, word.tag_, word.ent_type_, word.ent_iob)

    if model_dir is not None:
        with (model_dir / 'config.json').open('w') as file_:
            json.dump(ner.cfg, file_)
        ner.model.dump(str(model_dir / 'model'))
Example #22
def main(model_dir=None):
    train_data = make_train_data()
    entity_types = load_entyty_types()

    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)
    if nlp.tagger is None:
        nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)

    ner = train(nlp, train_data, entity_types, 20)

    # small test
    doc = nlp.make_doc(u'is there a delta flight from denver to san francisco')
    nlp.tagger(doc)
    ner(doc)
    for word in doc:
        print(word.text, word.ent_type_)
    #

    test_model(nlp, ner)

    if model_dir is not None:
        save_model(ner, model_dir)
Example #23
def main(model_dir=None):
    if model_dir is not None:
        model_dir = pathlib.Path(model_dir)
        if not model_dir.exists():
            model_dir.mkdir()
        assert model_dir.is_dir()

    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)

    # v1.1.2 onwards
    if nlp.tagger is None:
        print('---- WARNING ----')
        print('Data directory not found')
        print('please run: `python -m spacy.en.download --force all` for better performance')
        print('Using feature templates for tagging')
        print('-----------------')
        nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)

    train_data = getData()
    ner = train_ner(nlp, train_data, ['ACC', 'EMAIL'])

    doc = nlp.make_doc('update email of account peps to [email protected]')
    #nlp.tagger(doc)
    ner(doc)
    for word in doc:
        print(word.text, word.orth, word.lower, word.tag_, word.ent_type_, word.ent_iob)

    if model_dir is not None:
        with (model_dir / 'config.json').open('wb') as file_:
            json.dump(ner.cfg, file_)
        ner.model.dump(str(model_dir / 'model'))
        if not (model_dir / 'vocab').exists():
            (model_dir / 'vocab').mkdir()
        ner.vocab.dump(str(model_dir / 'vocab' / 'lexemes.bin'))
        with (model_dir / 'vocab' / 'strings.json').open('w',encoding='utf8') as file_:
            ner.vocab.strings.dump(file_)
Example #24
    def test_load(self):
        data_dir = English.default_data_dir()
        vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))
        tagger = Tagger.from_dir(path.join(data_dir, 'tagger'), vocab)
Example #25
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
          seed=0, gold_preproc=False, n_sents=0, corruption_level=0,
          beam_width=1, verbose=False,
          use_orig_arc_eager=False):
    dep_model_dir = path.join(model_dir, 'deps')
    ner_model_dir = path.join(model_dir, 'ner')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(ner_model_dir):
        shutil.rmtree(ner_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(ner_model_dir)
    os.mkdir(pos_model_dir)

    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                 labels=ArcEager.get_labels(gold_tuples),
                 beam_width=beam_width)
    Config.write(ner_model_dir, 'config', features='ner', seed=seed,
                 labels=BiluoPushDown.get_labels(gold_tuples),
                 beam_width=0)

    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]

    nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
    nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
    nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
    nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown)
    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for raw_text, sents in gold_tuples:
            if gold_preproc:
                raw_text = None
            else:
                sents = _merge_sents(sents)
            for annot_tuples, ctnt in sents:
                if len(annot_tuples[1]) == 1:
                    continue
                score_model(scorer, nlp, raw_text, annot_tuples,
                            verbose=verbose if itn >= 2 else False)
                if raw_text is None:
                    words = add_noise(annot_tuples[1], corruption_level)
                    tokens = nlp.tokenizer.tokens_from_list(words)
                else:
                    raw_text = add_noise(raw_text, corruption_level)
                    tokens = nlp.tokenizer(raw_text)
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples, make_projective=True)
                if not gold.is_projective:
                    raise Exception(
                        "Non-projective sentence in training, after we should "
                        "have enforced projectivity: %s" % annot_tuples
                    )
                loss += nlp.parser.train(tokens, gold)
                nlp.entity.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        random.shuffle(gold_tuples)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
                                                   scorer.tags_acc,
                                                   scorer.token_acc))
    print('end training')
    nlp.end_training(model_dir)
    print('done')
Example #26
    def test_create(self):
        vocab = Vocab()
        templates = ((1, ), )
        model = Model(vocab.morphology.n_tags, templates, model_loc=None)
        tagger = Tagger(vocab, model)
Example #27
def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
    LangClass = spacy.util.get_lang_class(lang_name)
    train_sents = list(read_conllx(train_loc))
    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)

    actions = ArcEager.get_actions(gold_parses=train_sents)
    features = get_templates('basic')

    model_dir = pathlib.Path(model_dir)
    if not model_dir.exists():
        model_dir.mkdir()
    if not (model_dir / 'deps').exists():
        (model_dir / 'deps').mkdir()
    if not (model_dir / 'pos').exists():
        (model_dir / 'pos').mkdir()
    with (model_dir / 'deps' / 'config.json').open('wb') as file_:
        file_.write(
            json.dumps(
                {'pseudoprojective': True, 'labels': actions, 'features': features}).encode('utf8'))

    vocab = LangClass.Defaults.create_vocab()
    if not (model_dir / 'vocab').exists():
        (model_dir / 'vocab').mkdir()
    else:
        if (model_dir / 'vocab' / 'strings.json').exists():
            with (model_dir / 'vocab' / 'strings.json').open() as file_:
                vocab.strings.load(file_)
            if (model_dir / 'vocab' / 'lexemes.bin').exists():
                vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin')

    if clusters_loc is not None:
        clusters_loc = pathlib.Path(clusters_loc)
        with clusters_loc.open() as file_:
            for line in file_:
                try:
                    cluster, word, freq = line.split()
                except ValueError:
                    continue
                lex = vocab[word]
                lex.cluster = int(cluster[::-1], 2)
    # Populate vocab
    for _, doc_sents in train_sents:
        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
            for word in words:
                _ = vocab[word]
            for dep in deps:
                _ = vocab[dep]
            for tag in tags:
                _ = vocab[tag]
            if vocab.morphology.tag_map:
                for tag in tags:
                    assert tag in vocab.morphology.tag_map, repr(tag)
    tagger = Tagger(vocab)
    parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0)

    for itn in range(30):
        loss = 0.
        for _, doc_sents in train_sents:
            for (ids, words, tags, heads, deps, ner), _ in doc_sents:
                doc = Doc(vocab, words=words)
                gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
                tagger(doc)
                loss += parser.update(doc, gold, itn=itn)
                doc = Doc(vocab, words=words)
                tagger.update(doc, gold)
        random.shuffle(train_sents)
        scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
        print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc))
    nlp = LangClass(vocab=vocab, tagger=tagger, parser=parser)
    nlp.end_training(model_dir)
    scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
Example #28
import os
import random

import spacy
from django.conf import settings
from spacy.gold import GoldParse
from spacy.pipeline import EntityRecognizer
from spacy.tagger import Tagger

# Load up our Data dir
# NLP Module
nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)

# Quick and easy if you don't have the data installed
if nlp.tagger is None:
    nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)


# Trains our query object
def train_query(queryObj):
    global nlp

    # Our query string
    story = queryObj.story
    querystring = queryObj.querystring
    parsed_ner = queryObj.parsed_ner

    # Where our model is located
    model_path = os.path.normpath(
        os.path.join(settings.SPACYMODEL_DIR, str(story.name)))
Example #29
            if g == p:
                right += 1
            else:
                wrong += 1
    acc = 100 * right / (right + wrong)
    print(f"Accuracy: {acc:.2f}")


tag_map = {t: {'pos': 'X'} for t in alltags}
#tag_map.update(TAG_MAP)
vocab = Vocab(tag_map=tag_map)

# Add all train words to vocab!
for (ws, _) in trainset + testset:
    for w in ws:
        _ = vocab[w]

tagger = Tagger(vocab)

for i in range(50):
    print(f"Epoch {i}:")
    for (ws, ts) in trainset:
        doc = Doc(vocab, words=ws)
        gold = GoldParse(doc, tags=ts)
        tagger.update(doc, gold)
    eval(tagger)
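    # Note: end_training() below presumably finalises the averaged-perceptron
    # weights, so the second eval() scores the averaged model; resume_training()
    # then switches back so the next epoch keeps updating the raw weights.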
    tagger.model.end_training()
    eval(tagger)
    tagger.model.resume_training()
    shuffle(trainset)
Example #30
    def test_load(self):
        data_dir = English.default_data_dir()

        if path.exists(path.join(data_dir, 'vocab')):
            vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))
            tagger = Tagger.from_dir(path.join(data_dir, 'tagger'), vocab)
Example #31
def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
          seed=0, gold_preproc=False, n_sents=0, corruption_level=0,
          beam_width=1, verbose=False,
          use_orig_arc_eager=False, pseudoprojective=False):
    dep_model_dir = path.join(model_dir, 'deps')
    ner_model_dir = path.join(model_dir, 'ner')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(ner_model_dir):
        shutil.rmtree(ner_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(ner_model_dir)
    os.mkdir(pos_model_dir)

    if pseudoprojective:
        # preprocess training data here before ArcEager.get_labels() is called
        gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples)

    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                 labels=ArcEager.get_labels(gold_tuples),
                 beam_width=beam_width,projectivize=pseudoprojective)
    Config.write(ner_model_dir, 'config', features='ner', seed=seed,
                 labels=BiluoPushDown.get_labels(gold_tuples),
                 beam_width=0)

    if n_sents > 0:
        gold_tuples = gold_tuples[:n_sents]

    nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False)
    if nlp.lang == 'de':
        nlp.vocab.morphology.lemmatizer = lambda string,pos: set([string])
    nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())
    nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager)
    nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown)
    print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
    for itn in range(n_iter):
        scorer = Scorer()
        loss = 0
        for raw_text, sents in gold_tuples:
            if gold_preproc:
                raw_text = None
            else:
                sents = _merge_sents(sents)
            for annot_tuples, ctnt in sents:
                if len(annot_tuples[1]) == 1:
                    continue
                score_model(scorer, nlp, raw_text, annot_tuples,
                            verbose=verbose if itn >= 2 else False)
                if raw_text is None:
                    words = add_noise(annot_tuples[1], corruption_level)
                    tokens = nlp.tokenizer.tokens_from_list(words)
                else:
                    raw_text = add_noise(raw_text, corruption_level)
                    tokens = nlp.tokenizer(raw_text)
                nlp.tagger(tokens)
                gold = GoldParse(tokens, annot_tuples)
                if not gold.is_projective:
                    raise Exception("Non-projective sentence in training: %s" % annot_tuples[1])
                loss += nlp.parser.train(tokens, gold)
                nlp.entity.train(tokens, gold)
                nlp.tagger.train(tokens, gold.tags)
        random.shuffle(gold_tuples)
        print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
                                                   scorer.tags_acc,
                                                   scorer.token_acc))
    print('end training')
    nlp.end_training(model_dir)
    print('done')
Example #32
def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
    LangClass = spacy.util.get_lang_class(lang_name)
    train_sents = list(read_conllx(train_loc))
    train_sents = PseudoProjectivity.preprocess_training_data(train_sents)

    actions = ArcEager.get_actions(gold_parses=train_sents)
    features = get_templates('basic')

    model_dir = pathlib.Path(model_dir)
    if not model_dir.exists():
        model_dir.mkdir()
    if not (model_dir / 'deps').exists():
        (model_dir / 'deps').mkdir()
    if not (model_dir / 'pos').exists():
        (model_dir / 'pos').mkdir()
    with (model_dir / 'deps' / 'config.json').open('wb') as file_:
        file_.write(
            json.dumps(
                {'pseudoprojective': True, 'labels': actions, 'features': features}).encode('utf8'))

    vocab = LangClass.Defaults.create_vocab()
    if not (model_dir / 'vocab').exists():
        (model_dir / 'vocab').mkdir()
    else:
        if (model_dir / 'vocab' / 'strings.json').exists():
            with (model_dir / 'vocab' / 'strings.json').open() as file_:
                vocab.strings.load(file_)
            if (model_dir / 'vocab' / 'lexemes.bin').exists():
                vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin')

    if clusters_loc is not None:
        clusters_loc = pathlib.Path(clusters_loc)
        with clusters_loc.open() as file_:
            for line in file_:
                try:
                    cluster, word, freq = line.split()
                except ValueError:
                    continue
                lex = vocab[word]
                lex.cluster = int(cluster[::-1], 2)
    # Populate vocab
    for _, doc_sents in train_sents:
        for (ids, words, tags, heads, deps, ner), _ in doc_sents:
            for word in words:
                _ = vocab[word]
            for dep in deps:
                _ = vocab[dep]
            for tag in tags:
                _ = vocab[tag]
            if vocab.morphology.tag_map:
                for tag in tags:
                    assert tag in vocab.morphology.tag_map, repr(tag)
    tagger = Tagger(vocab)
    parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0)

    for itn in range(30):
        loss = 0.
        for _, doc_sents in train_sents:
            for (ids, words, tags, heads, deps, ner), _ in doc_sents:
                doc = Doc(vocab, words=words)
                gold = GoldParse(doc, tags=tags, heads=heads, deps=deps)
                tagger(doc)
                loss += parser.update(doc, gold, itn=itn)
                doc = Doc(vocab, words=words)
                tagger.update(doc, gold)
        random.shuffle(train_sents)
        scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
        print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc))
    nlp = Language(vocab=vocab, tagger=tagger, parser=parser)
    nlp.end_training(model_dir)
    scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
    print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
Example #33
def main(model_dir=None):
    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)

    # v1.1.2 onwards
    if nlp.tagger is None:
        print('---- WARNING ----')
        print('Data directory not found')
        print(
            'please run: `python -m spacy.en.download --force all` for better performance'
        )
        print('Using feature templates for tagging')
        print('-----------------')
        nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)

    train_data = [
        ("Masheesh Ikram\nLEAD SOFTWARE ENGINEER\nSupply Chain | Research & Development\nIFS R&D International, \nNo 501, Galle Road, Colombo 06, SRI LANKA\nTel +94 (0) 11 2364 400. Fax +94 (0) 11 2364401. Mobile +94 (0) 779050954\[email protected] | www.IFSWORLD.com \nIFS World Operations AB is a limited liability company registered in Sweden. \nCorporate identity number: 556040-6042. \nRegistered office: Teknikringen 5, Box 1545, SE-581 15 Linköping.",
         [(len('Masheesh Ikram\n'),
           len('Masheesh Ikram\nLEAD SOFTWARE ENGINEER'), 'POS')]),
        ("Asanka Gallege\nSecretary | IFS Welfare\n501, Galle Road, Colombo 06,   SRI LANKA\nTel +94 11 236 4400 (ext. 1722). Fax +94 11 236 4401. Mobile +94 71 563 9556\[email protected] | www.IFSWORLD.com \nIFS World Operations AB is a limited liability company registered in Sweden. \nCorporate identity number: 556040-6042. \nRegistered office: Teknikringen 5, Box 1545, SE-581 15 Linköping.",
         [(len('Asanka Gallege\n'), len('Asanka Gallege\nSecretary'), 'POS')]),
        ("David Anderson\nEmail: [email protected]\nChief Executive Officer\nOffice  800-555-5555 \nBroadlook Technologies	\nCell :  414-555-5555 \n21140 Capitol Drive\nFax   : 262-754-8081\nPewaukee WI 53072\nBlog www.idanato.com\nhttp://www.broadlook.com",
         [(len('David Anderson\nEmail: [email protected]\n'),
           len('David Anderson\nEmail: [email protected]\nChief Executive Officer'
               ), 'POS')]),
        ("Valerie Richardson \nAccountant\n2906 N. Glenwood Terrace, Atlanta, GA 30310\n(404) 555-0789\[email protected]\n501, Galle Road, , Colombo 06,  SRI LANKA\nTel +94 11 236 44 00. Fax +94 11 236 44 01\[email protected] | www.IFSWORLD.com ",
         [(len('Valerie Richardson \n'),
           len('Valerie Richardson \nAccountant'), 'POS')]),
        ('Kandasamy Yogendirakumar (Yogi)\nMSc, MBCS, MIET | DIRECTOR IFS ACADEMY \n501, Galle Road, Colombo 06,   SRI LANKA\nTel +94 (0)112 364 440. Fax +94 (0)112 364 441. Mobile +94 (0)714 039 089 \[email protected]|www.IFSWORLD.com \nIFS World Operations AB is a limited liability company registered in Sweden. \nCorporate identity number: 556040-6042. \nRegistered office: Teknikringen 5, Box 1545, SE-581 15 Linköping.',
         [(len('Kandasamy Yogendirakumar (Yogi)\nMSc, MBCS, MIET | '),
           len('Kandasamy Yogendirakumar (Yogi)\nMSc, MBCS, MIET | DIRECTOR'),
           'POS')]),
        ('He was a  Software Engineer.',
         [(len('He was a '), len('He was a Lead Software Engineer'), 'POS')]),
        ('I am an Engineer', [(len('I am an '), len('I am an Engineer'), 'POS')
                              ]),
        ('I am an Lead Engineer as well as Software Engineer in IFS.',
         [(len('I am an '), len('I am an Lead Engineer'), 'POS'),
          (len('I am an Lead Engineer as well as '),
           len('I am an Lead Engineer as well as Software Engineer'), 'POS')]),
        ('Secretary', [(0, len('Secretary'), 'POS')]),
        ('Chief Executive Officer', [(0, len('Chief Executive Officer'), 'POS')
                                     ]),
        ('David Anderson Secretary',
         [(len('David Anderson'), len('David Anderson Secretary'), 'POS')]),
        ('David Anderson\nSecretary',
         [(len('David Anderson\n'), len('David Anderson Secretary'), 'POS')]),
        ('Asanka Gallege\nSecretary | IFS Welfare\n501, Galle Road, Colombo 06, SRI LANKA\nTel +94 11 236 4400 (ext. 1722). Fax +94 11 236 4401. Mobile +94 71 563 9556',
         [(len('Asanka Gallege\n'), len('Asanka Gallege\nSecretary'), 'POS')]),
        ('Fredrik Vom\nGROUP SENIOR VICE PRESIDENT\nBusiness Development\nGullbergs Strandgata 15, SE-411 04 Goteborg,SWEDEN\nTel +46 31 726 3046. Fax +46 31726 3001. Mobile +46 733 453046\[email protected] | www.IFSWORLD.com\nIFS World Operations AB is a limited liability company registered in Sweden.',
         [(len('Fredrik Vom\n'),
           len('Fredrik Vom\nGROUP SENIOR VICE PRESIDENT'), 'POS')]),
        ('Fredrik Vom\nGROUP SENIOR VICE PRESIDENT\nBusiness Development\nGullbergs Strandgata 15, SE-411 04 Goteborg,SWEDEN\nTel +46 31 726 3046. Fax +46 31726 3001. Mobile +46 733 453046\[email protected] | www.IFSWORLD.com\nIFS World Operations AB is a limited liability company registered in Sweden.',
         [(len('Fredrik Vom\n'),
           len('Fredrik Vom\nGROUP SENIOR VICE PRESIDENT'), 'POS')]),
        ('Dr. Ashok Padhye\nGeneral Physician\nA-205, Natasha Apartments\n2, Inner Ring Road\nDomlur\nBANGALORE - 560071\nKarnataka',
         [(len('Dr. Ashok Padhye\n'),
           len('Dr. Ashok Padhye\nGeneral Physician'), 'POS')]),
        ('Dr. Ashok Padhye\nGeneral Physician\nA-205, Natasha Apartments\n2, Inner Ring Road\nDomlur\nBANGALORE - 560071\nKarnataka',
         [(len('Dr. Ashok Padhye\n'),
           len('Dr. Ashok Padhye\nGeneral Physician'), 'POS')]),
    ]
    # file_name = 'D:\PYTHON\Input\input.txt'
    # train_data = open(file_name, "r")
    ner = train_ner(nlp, train_data, ['POS'])

    doc = nlp.make_doc("""  
I am an Lead Engineer as well as Software Engineer in IFS.
""")
    doc1 = unicode(doc)
    nlp.tagger(doc)
    ner(doc)

    position = []
    for word in doc:
        if word.ent_type_ == 'POS':
            position.append(word.text)
            # print(word.text,word.ent_type_, word.ent_iob)
    # print(position)
    # position=[word for word in doc if word.ent_type =='POS']
    # print (position)
    i = 0
    pos = []
    new_pos = []
    pos = position
    for x in pos:
        if word.ent_iob == 3 and i != 0:
            new_pos.append(pos[:i])
            pos = position[i:]
        elif i == len(position) - 1:
            new_pos.append(pos)

        i += 1
    for y in new_pos:
        string = " ".join(str(x) for x in y)
        print(string)
    # print (doc1)

    if model_dir is not None:
        save_model(ner, model_dir)
Example #34
def main(model_dir=None):
    nlp = spacy.get_lang_class('pt')(path=None)

    # v1.1.2 onwards
    if nlp.tagger is None:
        print('Setting tagger')
        nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)

    if (len(sys.argv) > 3):
        filetrain = sys.argv[1]
        model_dir = sys.argv[2]
        level = sys.argv[3]
    else:
        print("Usage: python " + sys.argv[0] +
              " <input filename train> <model_dir> <level>\n")
        sys.exit()

    train_data = get_training_data(filetrain)

    nlp = create_vocab(nlp, train_data)

    cat = [
        'ABSTRACCAO', 'OUTRO', 'LOCAL', 'ACONTECIMENTO', 'TEMPO', 'PESSOA',
        'OBRA', 'ORGANIZACAO', 'VALOR', 'COISA'
    ]
    types = [
        'ABSTRACCAO_IDEIA', 'LOCAL_HUMANO', 'ACONTECIMENTO_EVENTO',
        'ACONTECIMENTO_EFEMERIDE', 'TEMPO_TEMPO_CALEND', 'PESSOA_POVO',
        'PESSOA_INDIVIDUAL', 'OBRA_REPRODUZIDA', 'ABSTRACCAO_DISCIPLINA',
        'PESSOA_GRUPOMEMBRO', 'ORGANIZACAO_INSTITUICAO', 'PESSOA_CARGO',
        'OBRA_PLANO', 'ORGANIZACAO_ADMINISTRACAO', 'TEMPO_GENERICO',
        'ABSTRACCAO_NOME', 'TEMPO_FREQUENCIA', 'LOCAL_FISICO',
        'VALOR_QUANTIDADE', 'COISA_SUBSTANCIA', 'LOCAL_VIRTUAL',
        'COISA_OBJECTO', 'PESSOA_GRUPOIND', 'ORGANIZACAO_EMPRESA',
        'PESSOA_MEMBRO', 'COISA_CLASSE', 'ACONTECIMENTO_ORGANIZADO',
        'TEMPO_DURACAO', 'VALOR_MOEDA', 'VALOR_CLASSIFICACAO', 'OBRA_ARTE',
        'PESSOA_GRUPOCARGO', 'COISA_MEMBROCLASSE', 'ABSTRACCAO_ESTADO',
        'ABSTRACCAO_', 'ORGANIZACAO_', 'OUTRO_', 'ACONTECIMENTO_',
        'LOCAL_OUTRO', 'COISA_OUTRO'
    ]
    subtypes = [
        'LOCAL_HUMANO_DIVISAO', 'TEMPO_TEMPO_CALEND_DATA', 'LOCAL_HUMANO_PAIS',
        'OBRA_REPRODUZIDA_LIVRO', 'PESSOA_POVO_', 'LOCAL_HUMANO_REGIAO',
        'TEMPO_TEMPO_CALEND_INTERVALO', 'LOCAL_FISICO_AGUACURSO',
        'LOCAL_FISICO_AGUAMASSA', 'TEMPO_TEMPO_CALEND_HORA',
        'LOCAL_FISICO_PLANETA', 'LOCAL_HUMANO_RUA', 'LOCAL_HUMANO_CONSTRUCAO',
        'LOCAL_FISICO_OUTRO', 'LOCAL_VIRTUAL_SITIO',
        'OBRA_REPRODUZIDA_PROGRAMA', 'ORGANIZACAO_INSTITUICAO_',
        'LOCAL_HUMANO_OUTRO', 'OBRA_REPRODUZIDA_MUSICA',
        'OBRA_REPRODUZIDA_OUTRO', 'ORGANIZACAO_INSTITUICAO_SUB',
        'ORGANIZACAO_ADMINISTRACAO_', 'LOCAL_FISICO_REGIAO',
        'ABSTRACCAO_IDEIA_', 'OBRA_ARTE_CONSTRUCAO', 'OBRA_ARTE_OUTRO',
        'LOCAL_FISICO_RELEVO', 'ORGANIZACAO_ADMINISTRACAO_SUB',
        'LOCAL_VIRTUAL_COMSOCIAL', 'ACONTECIMENTO_EFEMERIDE_',
        'ACONTECIMENTO_EVENTO_', 'COISA_OBJECTO_', 'LOCAL_FISICO_ILHA',
        'OBRA_PLANO_', 'OBRA_REPRODUZIDA_FILME', 'ORGANIZACAO_EMPRESA_',
        'LOCAL_VIRTUAL_OBRA', 'ORGANIZACAO_EMPRESA_SUB',
        'ACONTECIMENTO_ORGANIZADO_', 'OBRA_REPRODUZIDA_',
        'LOCAL_VIRTUAL_OUTRO', 'OBRA_ARTE_', 'ABSTRACCAO_NOME_',
        'TEMPO_DURACAO_', 'OBRA_REPRODUZIDA_TEATRO', 'OBRA_ARTE_PINTURA',
        'OBRA_ARTE_EDIFICIO'
    ]
    filtered = [
        'LOCAL', 'ACONTECIMENTO', 'TEMPO', 'PESSOA', 'ORGANIZACAO', 'VALOR'
    ]

    if level == 'cat':
        categories = cat
    elif level == 'types':
        categories = types
    elif level == 'subtypes':
        categories = subtypes
    else:
        categories = filtered

    ner = train_ner(nlp, train_data, categories)

    if model_dir is not None:
        save_model(ner, model_dir + '/' + level)