def test_pipe_factories_language_specific():
    """Test that language sub-classes can have their own factories, with
    fallbacks to the base factories."""
    name1 = "specific_component1"
    name2 = "specific_component2"
    Language.component(name1, func=lambda: "base")
    English.component(name1, func=lambda: "en")
    German.component(name2, func=lambda: "de")

    assert Language.has_factory(name1)
    assert not Language.has_factory(name2)
    assert English.has_factory(name1)
    assert not English.has_factory(name2)
    assert German.has_factory(name1)
    assert German.has_factory(name2)

    nlp = Language()
    assert nlp.create_pipe(name1)() == "base"
    with pytest.raises(ValueError):
        nlp.create_pipe(name2)
    nlp_en = English()
    assert nlp_en.create_pipe(name1)() == "en"
    with pytest.raises(ValueError):
        nlp_en.create_pipe(name2)
    nlp_de = German()
    assert nlp_de.create_pipe(name1)() == "base"
    assert nlp_de.create_pipe(name2)() == "de"
Example #2
def test_issue3289():
    """Test that Language.to_bytes handles serializing a pipeline component
    with an uninitialized model."""
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("textcat"))
    bytes_data = nlp.to_bytes()
    new_nlp = English()
    new_nlp.add_pipe(nlp.create_pipe("textcat"))
    new_nlp.from_bytes(bytes_data)
Example #4
class SpacyTokenizer():
    def __init__(self, lang='en'):
        """ Construct a spaCy-based tokenizer by loading the spaCy pipeline.
        """
        if lang != 'en' and lang != "ru2":
            raise Exception(
                "spaCy tokenizer is currently only allowed in English pipeline."
            )

        try:
            import spacy
            from spacy.lang.en import English
        except ImportError:
            raise ImportError(
                "spaCy 2.0+ is used but not installed on your machine. Go to https://spacy.io/usage for installation instructions."
            )

        # Create a Tokenizer with the default settings for English
        # including punctuation rules and exceptions
        if lang == "ru2":
            self.nlp = spacy.load('ru2')
        else:
            self.nlp = English()
        # by default spaCy uses the dependency parser for sentence splitting;
        # add a sentencizer for fast rule-based sentence splitting instead
        sentencizer = self.nlp.create_pipe('sentencizer')
        if lang == "ru2":
            # run the sentencizer before the components of the loaded ru2 pipeline
            self.nlp.add_pipe(sentencizer, first=True)
        else:
            self.nlp.add_pipe(sentencizer)

    def tokenize(self, text):
        """ Tokenize a document with the spaCy tokenizer and wrap the results into a Doc object.
        """
        if not isinstance(text, str):
            raise Exception("Must supply a string to the spaCy tokenizer.")
        spacy_doc = self.nlp(text)

        sentences = []
        for sent in spacy_doc.sents:
            tokens = []
            for tok in sent:
                token_entry = {
                    doc.TEXT: tok.text,
                    doc.MISC: f"{doc.START_CHAR}={tok.idx}|{doc.END_CHAR}={tok.idx + len(tok.text)}",
                }
                tokens.append(token_entry)
            sentences.append(tokens)

        return doc.Document(sentences, text)
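For reference, a minimal self-contained sketch of the same idea outside the class, using the spaCy v2 API shown above: rule-based sentence splitting with the sentencizer plus per-token character offsets from `tok.idx` (the sample text is illustrative only).

from spacy.lang.en import English

nlp = English()
nlp.add_pipe(nlp.create_pipe('sentencizer'))
spacy_doc = nlp("Hello world. This is another sentence.")
for sent in spacy_doc.sents:
    for tok in sent:
        # start/end character offsets, as encoded in the MISC field above
        print(tok.text, tok.idx, tok.idx + len(tok.text))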
Example #5
def test_issue4707():
    """Tests that disabled component names are also excluded from nlp.from_disk
    by default when loading a model.
    """
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    nlp.add_pipe(nlp.create_pipe("entity_ruler"))
    assert nlp.pipe_names == ["sentencizer", "entity_ruler"]
    exclude = ["tokenizer", "sentencizer"]
    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir, exclude=exclude)
        new_nlp = load_model_from_path(tmpdir, disable=exclude)
    assert "sentencizer" not in new_nlp.pipe_names
    assert "entity_ruler" in new_nlp.pipe_names
def test_issue4267():
    """ Test that running an entity_ruler after ner gives consistent results"""
    nlp = English()
    ner = nlp.create_pipe("ner")
    ner.add_label("PEOPLE")
    nlp.add_pipe(ner)
    nlp.begin_training()

    assert "ner" in nlp.pipe_names

    # assert that we have correct IOB annotations
    doc1 = nlp("hi")
    assert doc1.is_nered
    for token in doc1:
        assert token.ent_iob == 2

    # add entity ruler and run again
    ruler = EntityRuler(nlp)
    patterns = [{"label": "SOFTWARE", "pattern": "spacy"}]

    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    assert "entity_ruler" in nlp.pipe_names
    assert "ner" in nlp.pipe_names

    # assert that we still have correct IOB annotations
    doc2 = nlp("hi")
    assert doc2.is_nered
    for token in doc2:
        assert token.ent_iob == 2
Example #7
 def get_sentences(self):
     # Returns sentences from text
     nlp = English()
     nlp.add_pipe(nlp.create_pipe('sentencizer'))  # updated
     doc = nlp(self.text)
     sentences = [sent.string.strip() for sent in doc.sents]
     return sentences
Example #8
 def sentencize(self, input_string):
     """Produces a list of sentences"""
     nlp = English()
     nlp.add_pipe(nlp.create_pipe('sentencizer'))
     doc = nlp(input_string)
     sentences = [s.text.strip() for s in doc.sents if s.text.strip() != '']
     return sentences
    def createTextChunks(self, longString):
        import spacy
        from spacy.lang.en import English

        # Break into sentences after coref
        nlp = English()
        nlp.add_pipe(nlp.create_pipe("sentencizer"))

        # Chunk into sentences
        doc = nlp(longString, disable=['ner'])

        # ************* COREF BASE ********************
        # Load your usual SpaCy model (one of SpaCy English models)
        # nlp = spacy.load('en_core_web_sm')

        # Add COREF
        # neuralcoref.add_to_pipe(nlp)

        # Perform parallel COREF for each sentence from above
        # coref_sentences = nlp.pipe([s.text[:4999] for s in doc.sents], disable=['ner'])

        # limiting to 4900 - after testing I find that
        # rows and rows of table data that are not sentences are what doesn't get chunked
        # Hence forcing a manual chunk - there will be loss of information (TODO)
        #return [s._.coref_resolved[:4999] for s in coref_sentences]
        return [s.text[:4999] for s in doc.sents]
Example #10
def test_issue5137():
    class MyComponent(object):
        name = "my_component"

        def __init__(self, nlp, **cfg):
            self.nlp = nlp
            self.categories = cfg.get("categories", "all_categories")

        def __call__(self, doc):
            pass

        def to_disk(self, path, **kwargs):
            pass

        def from_disk(self, path, **cfg):
            pass

    Language.factories["my_component"] = lambda nlp, **cfg: MyComponent(nlp, **cfg)

    nlp = English()
    nlp.add_pipe(nlp.create_pipe("my_component"))
    assert nlp.get_pipe("my_component").categories == "all_categories"

    with make_tempdir() as tmpdir:
        nlp.to_disk(tmpdir)
        nlp2 = spacy.load(tmpdir, categories="my_categories")
        assert nlp2.get_pipe("my_component").categories == "my_categories"
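A minimal, self-contained sketch of the same spaCy v2 factory registry used in the test above; the component name "my_marker" and its "flag" setting are hypothetical.

from spacy.lang.en import English
from spacy.language import Language


def create_marker(nlp, **cfg):
    flag = cfg.get("flag", False)

    def marker(doc):
        # stash the configured value so it can be inspected downstream
        doc.user_data["marked"] = flag
        return doc

    return marker


Language.factories["my_marker"] = create_marker

nlp = English()
nlp.add_pipe(nlp.create_pipe("my_marker", config={"flag": True}))
doc = nlp("hello")
print(doc.user_data["marked"])  # True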
Example #11
 def split_into_sentences(text: str) -> List[Span]:
     nlp = English()
     nlp.add_pipe(nlp.create_pipe('sentencizer'))
     document = nlp(text)
     return list(
         filter(lambda s: not TextProcessingUtil.is_empty(s.text.strip()),
                document.sents))
def train(data_dir, save_dir):

    # load the Huggingface config, tokenizer, and model
    model_name = "clulab/roberta-timex-semeval"
    config = AutoConfig.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              config=config,
                                              use_fast=True)
    model = AutoModelForTokenClassification.from_pretrained(model_name,
                                                            config=config)

    # load the spacy sentence segmenter
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))

    # create a torch dataset from a directory of Anafora XML annotations and text files
    dataset = TimexDataset.from_texts(data_dir, nlp, tokenizer, config)

    # train and save the torch model
    trainer = Trainer(
        model=model,
        args=TrainingArguments(save_dir),
        train_dataset=dataset,
        data_collator=lambda features: dict(
            input_ids=torch.stack([f.input_ids for f in features]),
            attention_mask=torch.stack([f.attention_mask for f in features]),
            labels=torch.stack([f.label for f in features])))
    trainer.train()
    trainer.save_model()
    tokenizer.save_pretrained(save_dir)
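For clarity, a small sketch of what the data_collator lambda above produces, using dummy feature objects (assumes torch is installed; the attribute names mirror those used above).

import torch
from types import SimpleNamespace

features = [
    SimpleNamespace(input_ids=torch.tensor([101, 102]),
                    attention_mask=torch.tensor([1, 1]),
                    label=torch.tensor([0, 0]))
    for _ in range(4)
]
# stack the per-feature tensors into a single batch dict, as the Trainer expects
batch = dict(
    input_ids=torch.stack([f.input_ids for f in features]),
    attention_mask=torch.stack([f.attention_mask for f in features]),
    labels=torch.stack([f.label for f in features]))
print(batch["input_ids"].shape)  # torch.Size([4, 2])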
def split_text_to_sentences(raw_text):
    nlp = English()
    nlp.max_length = 12306482
    nlp.add_pipe(nlp.create_pipe('sentencizer'))  # updated
    doc = nlp(raw_text)
    sentences = [sent.string.strip() for sent in doc.sents]
    return sentences
Example #14
def predict(predict_dir, output_dir):

    # load the Huggingface config, tokenizer, and model
    model_name = "clulab/roberta-timex-semeval"
    config = AutoConfig.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              config=config,
                                              use_fast=True)
    model = AutoModelForTokenClassification.from_pretrained(model_name,
                                                            config=config)

    # load the spacy sentence segmenter
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))

    # create a torch dataset from a directory of text files
    dataset = TimexDataset.from_texts(predict_dir, nlp, tokenizer)

    # get predictions from the torch model
    trainer = Trainer(
        model=model,
        args=TrainingArguments("save_run/"),
        data_collator=lambda features: dict(
            input_ids=torch.stack([f.input_ids for f in features]),
            attention_mask=torch.stack([f.attention_mask for f in features])))
    predictions, _, _ = trainer.predict(dataset)
    # write the predictions in Anafora XML format
    write_anafora(output_dir, dataset, predictions, tokenizer, config)
 def autocorrect_line(self, line):
     ''' Takes in string as input, tokenizes and sentence
     segments it with spacy, then returns the concatenated
     result of calling autocorrect_sentence on all of the 
     resulting sentence objects '''
     nlp = English()
     nlp.add_pipe(nlp.create_pipe('sentencizer'))
     doc = nlp(line)
     sents = list(doc.sents)
     punc = [s[-1] for s in sents]  # save end of sentence punctuation
     sents = [s[:-1]
              for s in sents]  # get rid of end of sentence punctuation
     text = []
     for i in range(len(sents)):
         if len(sents[i]) > 0:
             wordList = [t.text for t in sents[i]]
             wordList = [w.lower()
                         for w in wordList]  # get rid of capitalization
             wordList = [
                 ''.join(ch for ch in word
                         if ch not in set(string.punctuation))
                 for word in wordList
             ]
             wordList = list(filter(
                 lambda x: x != "",
                 wordList))  # get rid of things that only consisted of punc
             checked = self.autocorrect_sentence(wordList)
             checked[-1] += str(punc[i])  # replace punctuation at end
             checked[0] = checked[0][0].upper() + checked[0][
                 1:]  # capitalize first character
             text.extend(checked)
     return text
Example #16
def test_issue5458():
    # Test that the noun chunker does not generate overlapping spans
    # fmt: off
    words = [
        "In", "an", "era", "where", "markets", "have", "brought", "prosperity",
        "and", "empowerment", "."
    ]
    vocab = Vocab(strings=words)
    deps = [
        "ROOT", "det", "pobj", "advmod", "nsubj", "aux", "relcl", "dobj", "cc",
        "conj", "punct"
    ]
    pos = [
        "ADP", "DET", "NOUN", "ADV", "NOUN", "AUX", "VERB", "NOUN", "CCONJ",
        "NOUN", "PUNCT"
    ]
    heads = [0, 2, 0, 9, 6, 6, 2, 6, 7, 7, 0]
    # fmt: on
    en_doc = Doc(vocab, words=words, pos=pos, heads=heads, deps=deps)
    en_doc.noun_chunks_iterator = noun_chunks

    # if there are overlapping spans, this will fail with an E102 error "Can't merge non-disjoint spans"
    nlp = English()
    merge_nps = nlp.create_pipe("merge_noun_chunks")
    merge_nps(en_doc)
Example #17
def test_issue6177():
    """Test that after fixing the random seed, the results of the pipeline are truly identical"""

    # NOTE: no need to transform this code to v3 when 'master' is merged into 'develop'.
    # A similar test exists already for v3: test_issue5551
    # This is just a backport

    results = []
    for i in range(3):
        fix_random_seed(0)
        nlp = English()
        example = (
            "Once hot, form ping-pong-ball-sized balls of the mixture, each weighing roughly 25 g.",
            {
                "cats": {
                    "Labe1": 1.0,
                    "Label2": 0.0,
                    "Label3": 0.0
                }
            },
        )
        textcat = nlp.create_pipe("textcat")
        nlp.add_pipe(textcat)
        for label in set(example[1]["cats"]):
            textcat.add_label(label)
        nlp.begin_training()
        # Store the result of each iteration
        result = textcat.model.predict([nlp.make_doc(example[0])])
        results.append(list(result[0]))

    # All results should be the same because of the fixed seed
    assert len(results) == 3
    assert results[0] == results[1]
    assert results[0] == results[2]
 def get_sentences(self):
     original_review = nltk.tokenize.treebank.TreebankWordDetokenizer().detokenize(self.X)
     nlp = English()
     nlp.add_pipe(nlp.create_pipe('sentencizer'))
     doc = nlp(original_review)
     sentences = [sent.string.strip() for sent in doc.sents]
     return sentences
Example #19
def bulk_make_annotation_json(fns=[]):
    archive = load_archive(get_srl_model())
    predictor = SemanticRoleLabelerPredictor.\
                from_archive(archive, "semantic-role-labeling")
    jsons = []
    for fn in fns:
        lines = [l.strip() for l in open(fn)]
        # Look for a year in the first line
        date = get_date_from_string(lines[0])
        if date:
            # If the first line is a year,
            # then ignore that line and get
            # the rest of the doc as a string
            text = ' '.join(lines[1:])
        else:
            text = ' '.join(lines)

        # Get spacy doc
        nlp = English()
        nlp.add_pipe(nlp.create_pipe('sentencizer'))
        doc = nlp(text)

        # Get sentencized text, and json format for AllenNLP
        sentences, json_sentences = doc2json(doc)
        srl_sents = predictor.predict_batch_json(json_sentences)

        #print(annotations2json(fn, sentences, srl_sents))
        jsons.append(annotations2json(fn, date, sentences, srl_sents))

    return jsons
Example #20
    def sent_segment(self, txt):
        """ sentence tokenization
    
        Parameters:
        txt : tex to tokenize into sentences
        Returns: list of sentences
        
        """

        # Load English tokenizer, tagger, parser, NER and word vectors
        nlp = English()

        # A simple pipeline component, to allow custom sentence boundary detection logic
        # that doesn’t require the dependency parse. It splits on punctuation by default
        sbd = nlp.create_pipe('sentencizer')

        # Add the component to the pipeline
        nlp.add_pipe(sbd)

        #nlp is used to create documents with linguistic annotations.
        doc = nlp(txt)

        # create list of sentence tokens
        sents_list = []
        for sent in doc.sents:
            sents_list.append(sent.text)

        return sents_list
Example #21
class SpacyService(object):
    def __init__(self):
        spacy_model = Env.get_value(Env.SPACY_MODEL)
        if spacy_model == 'english':
            self.nlp = English()
        else:
            self.nlp = spacy.load(spacy_model)
        self.nlp.add_pipe(self.hashtag_pipe)
        self.nlp.add_pipe(self.nlp.create_pipe('sentencizer'))

    def hashtag_pipe(self, doc):
        '''Inspired by https://github.com/explosion/spaCy/issues/503
        '''
        i = 0
        while i < len(doc) - 1:
            token = doc[i]
            if token.text == '#':
                if re.match(r'^\w+$', str(doc[i + 1])):
                    with doc.retokenize() as retokenizer:
                        retokenizer.merge(doc[i:i + 2])
            i += 1
        return doc

    def tokenizer(self, text: str):
        return self.nlp(text)

    def sentencizer(self, text: str):
        return self.nlp(text).sents
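A standalone sketch of the hashtag-merging idea from hashtag_pipe above, without the Env-driven model loading (spaCy v2 API); the sample text is illustrative, and the expected output assumes spaCy's default tokenizer splits '#' off as a prefix.

import re
from spacy.lang.en import English


def hashtag_pipe(doc):
    i = 0
    while i < len(doc) - 1:
        if doc[i].text == '#' and re.match(r'^\w+$', str(doc[i + 1])):
            with doc.retokenize() as retokenizer:
                # merge '#' with the following word into a single token
                retokenizer.merge(doc[i:i + 2])
        i += 1
    return doc


nlp = English()
nlp.add_pipe(hashtag_pipe)
print([t.text for t in nlp("I love #spacy")])  # expected: ['I', 'love', '#spacy']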
Example #22
class SentenceHandler(object):
    def __init__(self, lang='en'):
        if lang == "fr":
            self.nlp = French()
        else:
            self.nlp = English()
        self.nlp.add_pipe(self.nlp.create_pipe('sentencizer'))

    def process(self,
                body: str,
                min_length: int = 40,
                max_length: int = 600) -> List[str]:
        """
        Processes the content sentences.

        :param body: The raw string body to process
        :param min_length: Minimum length that the sentences must be
        :param max_length: Max length that the sentences must fall under
        :return: Returns a list of sentences.
        """
        doc = self.nlp(body)
        return [
            c.string.strip() for c in doc.sents
            if max_length > len(c.string.strip()) > min_length
        ]

    def __call__(self,
                 body: str,
                 min_length: int = 40,
                 max_length: int = 600) -> List[str]:
        return self.process(body, min_length, max_length)
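A self-contained sketch of the same length-filtered sentence splitting on a bare English() pipeline (spaCy v2 API); the thresholds are lowered so the short example text passes the filter.

from spacy.lang.en import English

nlp = English()
nlp.add_pipe(nlp.create_pipe('sentencizer'))
body = "Too short. This sentence is comfortably long enough to clear a ten character minimum."
doc = nlp(body)
# keep only sentences whose stripped length falls between the two thresholds
print([s.text.strip() for s in doc.sents if 600 > len(s.text.strip()) > 10])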
Example #23
def test_issue3880():
    """Test that `nlp.pipe()` works when an empty string ends the batch.

    Fixed in v7.0.5 of Thinc.
    """
    texts = ["hello", "world", "", ""]
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("parser"))
    nlp.add_pipe(nlp.create_pipe("ner"))
    nlp.add_pipe(nlp.create_pipe("tagger"))
    nlp.get_pipe("parser").add_label("dep")
    nlp.get_pipe("ner").add_label("PERSON")
    nlp.get_pipe("tagger").add_label("NN")
    nlp.begin_training()
    for doc in nlp.pipe(texts):
        pass
Example #24
class Sentencizer:
    def __init__(self):
        self.nlp = English()
        self.nlp.add_pipe(self.nlp.create_pipe('sentencizer'))

    def split(self, text: str) -> List[str]:
        return [str(sent) for sent in self.nlp(text).sents]
Example #25
def test_train_empty():
    """Test that training an empty text does not throw errors."""
    train_data = [
        ("Who is Shaka Khan?", {
            "entities": [(7, 17, "PERSON")]
        }),
        ("", {
            "entities": []
        }),
    ]

    nlp = English()
    ner = nlp.create_pipe("ner")
    ner.add_label("PERSON")
    nlp.add_pipe(ner, last=True)

    nlp.begin_training()
    for itn in range(2):
        losses = {}
        batches = minibatch(train_data)
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(
                texts,  # batch of texts
                annotations,  # batch of annotations
                losses=losses,
            )
def predict(predict_dir, output_dir):

    model_name = "clulab/roberta-timex-semeval"
    config = AutoConfig.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              config=config,
                                              use_fast=True)
    model = AutoModelForTokenClassification.from_pretrained(model_name,
                                                            config=config)

    tokenizer = AutoTokenizer.from_pretrained(
        '/content/drive/My Drive/SFDA/Time/model/task2_model_APM/1/',
        use_fast=True)
    model = AutoModelForTokenClassification.from_pretrained(
        '/content/drive/My Drive/SFDA/Time/model/task2_model_APM/1/')
    print(3)
    # model.load_state_dict(torch.load('/content/drive/My Drive/SFDA/Time/model/model_wl_r0_e2.pt'))

    # load the spacy sentence segmenter
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))

    # create a torch dataset from a directory of text files
    dataset = TimexDataset.from_texts(predict_dir, nlp, tokenizer)

    # get predictions from the torch model
    trainer = Trainer(
        model=model,
        args=TrainingArguments("save_run/"),
        data_collator=lambda features: dict(
            input_ids=torch.stack([f.input_ids for f in features]),
            attention_mask=torch.stack([f.attention_mask for f in features])))
    predictions, _, _ = trainer.predict(dataset)
    # write the predictions in Anafora XML format
    write_anafora(output_dir, dataset, predictions, tokenizer, config)
Example #27
 def test_sentence_tokenization(self):
     nlp = English()
     nlp.add_pipe(nlp.create_pipe('sentencizer'))
     doc = nlp("This is a sentence. This is another one.")
     sentences = [sent.text for sent in doc.sents]
     self.assertEqual(len(sentences), 2)
     self.assertEqual(sentences[0], "This is a sentence.")
def preprocessing(raw_clinical_note_file,
                  sent_parsing=True,
                  num_of_sen=100,
                  num_of_sen_len=25):

    with open(raw_clinical_note_file, 'r') as file:
        raw_clinical_note = file.read()

    #set the tokenizer: retain only alphanumeric
    tokenizer = RegexpTokenizer(r'\w+')  # original

    if sent_parsing:
        ##First: sentence tokenisation
        nlp = English()  # just the language with no model
        sentencizer = nlp.create_pipe("sentencizer")
        nlp.add_pipe(sentencizer)  #rule-based sentencizer: .?!
        nlp.add_pipe(set_custom_boundaries)  #add custom rules: \n\n
        #see https://spacy.io/usage/linguistic-features#sbd

        doc = nlp(raw_clinical_note)
        tokens = []
        for i, sent_tokens in enumerate(doc.sents):
            ##Second: tokenisation same as in the original CAML-MIMIC step for tokens in each sentence
            list_token_str = [
                t.lower() for t in tokenizer.tokenize(sent_tokens.text)
                if not t.isnumeric()
            ]

            ##Third: add all the tokens in all sentences together with sentence sign as dot
            if len(list_token_str) != 0:
                tokens = tokens + list_token_str + [
                    '.'
                ]  # add tokens of sentences all together with sentence split sign as dot.
        clinical_note_tokenised = ' '.join(tokens)

        ##Fourth: combine short sentences (below the length threshold)
        clinical_note_tokenised_combined = short_sentence_combined_with_previous_one(
            clinical_note_tokenised, length_threshold=10)

        ##Fifth: padding to 100 sentences and 25 tokens per sentence
        sentences = clinical_note_tokenised_combined.split(".")
        sen_n = len(sentences)
        padded_clinical_note = ""
        for i in range(num_of_sen):
            if i + 1 <= sen_n:  # i starts from 0
                padded_clinical_note = padded_clinical_note.strip(
                ) + " " + pad(sentences[i], num_of_sen_len)
            else:
                padded_clinical_note = padded_clinical_note.strip(
                ) + " " + pad("", num_of_sen_len)
        return padded_clinical_note
    else:
        #directly tokenise each word in the document
        #tokenize, lowercase and remove numerics
        tokens = [
            t.lower() for t in tokenizer.tokenize(raw_clinical_note)
            if not t.isnumeric()
        ]
        preprocessed_clinical_note = '"' + ' '.join(tokens) + '"'
        return preprocessed_clinical_note
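A hypothetical sketch of the fixed-length padding used in the fifth step above; the real pad helper is not shown in this snippet, and the "<pad>" token name is an assumption.

def pad_sentence(sentence, length, pad_token="<pad>"):
    tokens = sentence.split()[:length]               # truncate to at most `length` tokens
    tokens += [pad_token] * (length - len(tokens))   # right-pad up to `length`
    return " ".join(tokens)


print(pad_sentence("patient denies chest pain", 6))
# patient denies chest pain <pad> <pad>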
class ModelProcessor(object):

    def __init__(self, model='bert-large-uncased',
                 hidden: int=-2,
                 reduce_option: str = 'mean',
                 greedyness: float=0.45):
        self.model = BertParent(model)
        self.hidden = hidden
        self.reduce_option = reduce_option
        self.nlp = English()
        self.nlp.add_pipe(self.nlp.create_pipe('sentencizer'))
        neuralcoref.add_to_pipe(self.nlp, greedyness=greedyness)

    def process_content_sentences(self, body: str, min_length=40, max_length=600) -> List[str]:
        doc = self.nlp(body)._.coref_resolved
        doc = self.nlp(doc)
        return [c.string.strip() for c in doc.sents
                if len(c.string.strip()) > min_length and len(c.string.strip()) < max_length]

    @abstractmethod
    def run_clusters(self, content: List[str], ratio=0.2, algorithm='kmeans', use_first: bool=True) -> List[str]:
        raise NotImplementedError("Must Implement run_clusters")

    def run(self, body: str, ratio: float=0.2, min_length: int=40, max_length: int=600,
            use_first: bool=True, algorithm='kmeans') -> str:
        sentences = self.process_content_sentences(body, min_length, max_length)
        if sentences:
            sentences = self.run_clusters(sentences, ratio, algorithm, use_first)
        return ' '.join(sentences)

    def __call__(self, body: str, ratio: float=0.2, min_length: int=40, max_length: int=600,
                 use_first: bool=True, algorithm='kmeans') -> str:
        return self.run(body, ratio, min_length, max_length, use_first, algorithm)
Example #30
def test_issue4042_bug2():
    """
    Test that serialization of an NER works fine when new labels were added.
    This is the second bug of two bugs underlying the issue 4042.
    """
    nlp1 = English()
    # add ner pipe
    ner1 = nlp1.add_pipe("ner")
    ner1.add_label("SOME_LABEL")
    nlp1.initialize()
    # add a new label to the doc
    doc1 = nlp1("What do you think about Apple ?")
    assert len(ner1.labels) == 1
    assert "SOME_LABEL" in ner1.labels
    apple_ent = Span(doc1, 5, 6, label="MY_ORG")
    doc1.ents = list(doc1.ents) + [apple_ent]
    # Add the label explicitly. Previously we didn't require this.
    ner1.add_label("MY_ORG")
    ner1(doc1)
    assert len(ner1.labels) == 2
    assert "SOME_LABEL" in ner1.labels
    assert "MY_ORG" in ner1.labels
    with make_tempdir() as d:
        # assert IO goes fine
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        ner1.to_disk(output_dir)
        config = {}
        ner2 = nlp1.create_pipe("ner", config=config)
        ner2.from_disk(output_dir)
        assert len(ner2.labels) == 2
Example #31
def get_sentences(text):

  nlp = English()
  sentencizer = nlp.create_pipe("sentencizer")
  nlp.add_pipe(sentencizer)
  doc = nlp(text)
  return list(doc.sents)
Example #32
def test_issue3209():
    """Test issue that occurred in spaCy nightly where NER labels were being
    mapped to classes incorrectly after loading the model, when the labels
    were added using ner.add_label().
    """
    nlp = English()
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)

    ner.add_label("ANIMAL")
    nlp.begin_training()
    move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"]
    assert ner.move_names == move_names
    nlp2 = English()
    nlp2.add_pipe(nlp2.create_pipe("ner"))
    nlp2.from_bytes(nlp.to_bytes())
    assert nlp2.get_pipe("ner").move_names == move_names
Example #33
def test_issue3449():
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    text1 = "He gave the ball to I. Do you want to go to the movies with I?"
    text2 = "He gave the ball to I.  Do you want to go to the movies with I?"
    text3 = "He gave the ball to I.\nDo you want to go to the movies with I?"
    t1 = nlp(text1)
    t2 = nlp(text2)
    t3 = nlp(text3)
    assert t1[5].text == "I"
    assert t2[5].text == "I"
    assert t3[5].text == "I"
Example #34
def test_issue3468():
    """Test that sentence boundaries are set correctly so Doc.is_sentenced can
    be restored after serialization."""
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    doc = nlp("Hello world")
    assert doc[0].is_sent_start
    assert doc.is_sentenced
    assert len(list(doc.sents)) == 1
    doc_bytes = doc.to_bytes()
    new_doc = Doc(nlp.vocab).from_bytes(doc_bytes)
    assert new_doc[0].is_sent_start
    assert new_doc.is_sentenced
    assert len(list(new_doc.sents)) == 1