def setUpClass(cls):
    cls.dataset = Dataset()

    doc1 = Document()
    cls.dataset.documents['TEST_SENTENCES_SINGLE_ROOT'] = doc1
    for s in TEST_SENTENCES_SINGLE_ROOT:
        part = Part(s)
        doc1.parts[s] = part

    doc2 = Document()
    cls.dataset.documents['TEST_SENTENCES_MULTI_ROOT'] = doc2
    for s in TEST_SENTENCES_MULTI_ROOT:
        part = Part(s)
        doc2.parts[s] = part

    cls.nlp = get_spacy_nlp_english(load_parser=True)
    cls.parser = SpacyParser(cls.nlp)
    cls.splitter = NLTKSplitter()
    cls.tokenizer = GenericTokenizer(
        lambda string: (tok.text for tok in cls.nlp.tokenizer(string)))

    cls.splitter.split(cls.dataset)
    cls.tokenizer.tokenize(cls.dataset)
    cls.parser.parse(cls.dataset)

    cls.computed_sentences = []
    for sentence in cls.dataset.sentences():
        dist, then = compute_shortest_paths(sentence)
        cls.computed_sentences.append((dist, then, sentence))
def setUpClass(cls):
    cls.dataset = Dataset()
    cls.doc = Document()
    cls.dataset.documents['testid'] = cls.doc

    # TEXT = "123 45678"
    # POS  = "012345678"
    # ANN1 = " X "
    # ANN2 = " XXX "
    # PAR1 = "XXX "
    # PAR2 = " XXXXX"

    cls.part = Part(
        'Here is a random sentence for the benefit of your mamma')
    cls.entity = Entity(
        class_id=STUB_ENTITY_CLASS_ID,
        offset=10,
        text='random sentence',
        confidence=0)
    cls.part.annotations.append(cls.entity)
    cls.doc.parts['s1h1'] = cls.part

    # Apply through pipeline
    NLTKSplitter().split(cls.dataset)
    NLTK_TOKENIZER.tokenize(cls.dataset)

    nlp = get_spacy_nlp_english(load_parser=True)
    cls.parser = SpacyParser(nlp)
    cls.parser.parse(cls.dataset)

    # cls.part.percolate_tokens_to_entities()

    cls.sentence = cls.part.sentences[0]
def __init__(self, edge_generator, use_spacy_pipelines=False):
    super().__init__(
        edge_generator.entity1_class,
        edge_generator.entity2_class,
        edge_generator.relation_type)

    if use_spacy_pipelines:
        nlp = get_spacy_nlp_english(load_parser=True)
        self.sentence_splitter = GenericSplitter(
            lambda string: (sent.text for sent in nlp(string).sents))
        self.tokenizer = GenericTokenizer(
            lambda string: (tok.text for tok in nlp.tokenizer(string)))
    else:
        self.sentence_splitter = NLTK_SPLITTER
        self.tokenizer = NLTK_TOKENIZER

    self.edge_generator = edge_generator
def __init__(self, class1, class2, rel_type,
             parser=None, splitter=None, tokenizer=None,
             edge_generator=None, feature_set=None, feature_generators=None):
    self.class1 = class1
    self.class2 = class2
    self.rel_type = rel_type

    nlp = None  # set only when we create the default spaCy parser below
    if not parser:
        nlp = get_spacy_nlp_english(load_parser=True)
        parser = SpacyParser(nlp)
    self.parser = parser

    if not splitter:
        # if nlp:  # Spacy parser is used, which includes a sentence splitter
        #     splitter = GenericSplitter(lambda string: (sent.text for sent in nlp(string).sents))
        # else:
        #     splitter = NLTK_SPLITTER
        splitter = NLTK_SPLITTER
    self.splitter = splitter

    if not tokenizer:
        if nlp:  # Spacy parser is used, which includes a tokenizer
            tokenizer = GenericTokenizer(
                lambda string: (tok.text for tok in nlp.tokenizer(string)))
        else:
            tokenizer = NLTK_TOKENIZER
    self.tokenizer = tokenizer

    self.edge_generator = SentenceDistanceEdgeGenerator(
        self.class1, self.class2, self.rel_type, distance=0) \
        if edge_generator is None else edge_generator

    self.feature_set = FeatureDictionary() if feature_set is None else feature_set

    self.feature_generators = self._verify_feature_generators(feature_generators) \
        if feature_generators else [
            SentenceFeatureGenerator(f_counts_individual=1),
        ]
def __init__(self, nlp=None, constituency_parser=False):
    if nlp is None:
        nlp = get_spacy_nlp_english(load_parser=True)
    elif not isinstance(nlp, English):
        raise TypeError('Not an instance of spacy.en.English')

    self.nlp = nlp
    """an instance of spacy.en.English"""

    self.constituency_parser = constituency_parser
    """the type of constituency parser to use; currently only bllip is supported"""
    # NOTE: SpaCy may soon have its own constituency parser: https://github.com/explosion/spaCy/issues/59

    if self.constituency_parser is True:
        self.parser = BllipParser(only_parse=True)
def __init__(self):
    self.nlp = get_spacy_nlp_english()
def setUpClass(cls):
    nlp = get_spacy_nlp_english(load_parser=True)
    cls.parser = SpacyParser(nlp)