Example #1
    @classmethod
    def setUpClass(cls):
        cls.dataset = Dataset()

        doc1 = Document()
        cls.dataset.documents['TEST_SENTENCES_SINGLE_ROOT'] = doc1

        for s in TEST_SENTENCES_SINGLE_ROOT:
            part = Part(s)
            doc1.parts[s] = part

        doc2 = Document()
        cls.dataset.documents['TEST_SENTENCES_MULTI_ROOT'] = doc2

        for s in TEST_SENTENCES_MULTI_ROOT:
            part = Part(s)
            doc2.parts[s] = part

        cls.nlp = get_spacy_nlp_english(load_parser=True)
        cls.parser = SpacyParser(cls.nlp)
        cls.splitter = NLTKSplitter()
        cls.tokenizer = GenericTokenizer(
            lambda string: (tok.text for tok in cls.nlp.tokenizer(string)))

        cls.splitter.split(cls.dataset)
        cls.tokenizer.tokenize(cls.dataset)
        cls.parser.parse(cls.dataset)

        cls.computed_sentences = []

        for sentence in cls.dataset.sentences():
            dist, then = compute_shortest_paths(sentence)
            cls.computed_sentences.append((dist, then, sentence))
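The names dist and then suggest the classic Floyd-Warshall output pair: a distance matrix plus a "next hop" matrix used to reconstruct the actual shortest paths. nalaf's exact return types are not visible in this snippet, so the following is only a generic sketch of how such a pair is typically consumed:

    def reconstruct_path(u, v, then):
        # Standard Floyd-Warshall path reconstruction: walk the "next hop"
        # matrix from u towards v. This assumes then[u][v] is None when v
        # is unreachable; nalaf's real structures may differ.
        if then[u][v] is None:
            return []
        path = [u]
        while u != v:
            u = then[u][v]
            path.append(u)
        return path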
Example #2
File: test_data.py Project: zxsted/nalaf
    @classmethod
    def setUpClass(cls):
        cls.dataset = Dataset()
        cls.doc = Document()
        cls.dataset.documents['testid'] = cls.doc

        # TEXT = "123 45678"
        # POS  = "012345678"
        # ANN1 = " X       "
        # ANN2 = "     XXX "
        # PAR1 = "XXX      "
        # PAR2 = "    XXXXX"

        cls.part = Part(
            'Here is a random sentence for the benefit of your mamma')
        cls.entity = Entity(class_id=STUB_ENTITY_CLASS_ID,
                            offset=10,
                            text='random sentence',
                            confidence=0)
        cls.part.annotations.append(cls.entity)
        cls.doc.parts['s1h1'] = cls.part

        # Apply through pipeline

        NLTKSplitter().split(cls.dataset)
        NLTK_TOKENIZER.tokenize(cls.dataset)

        nlp = get_spacy_nlp_english(load_parser=True)
        cls.parser = SpacyParser(nlp)
        cls.parser.parse(cls.dataset)
        # cls.part.percolate_tokens_to_entities()

        cls.sentence = cls.part.sentences[0]
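The Entity above is anchored purely by character offset into its part's text. Assuming Part keeps the raw string in a text attribute and Entity exposes its constructor arguments as attributes (plausible from these snippets, but not shown), the relationship can be sanity-checked like this:

    text = 'Here is a random sentence for the benefit of your mamma'
    part = Part(text)
    entity = Entity(class_id=STUB_ENTITY_CLASS_ID, offset=10,
                    text='random sentence', confidence=0)
    # Offset 10 points at "random"; the slice must reproduce the entity text.
    assert text[entity.offset:entity.offset + len(entity.text)] == entity.text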
Example #3
    def __init__(self, edge_generator, use_spacy_pipelines=False):
        super().__init__(edge_generator.entity1_class, edge_generator.entity2_class, edge_generator.relation_type)

        if use_spacy_pipelines:
            nlp = get_spacy_nlp_english(load_parser=True)
            self.sentence_splitter = GenericSplitter(lambda string: (sent.text for sent in nlp(string).sents))
            self.tokenizer = GenericTokenizer(lambda string: (tok.text for tok in nlp.tokenizer(string)))
        else:
            self.sentence_splitter = NLTK_SPLITTER
            self.tokenizer = NLTK_TOKENIZER

        self.edge_generator = edge_generator
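Note that the spaCy branch needs load_parser=True even though it only splits and tokenizes: in the spaCy versions nalaf targets, sentence boundaries (.sents) are produced by the dependency parser, so a parser-less pipeline would have nothing to split on. A quick hedged illustration:

    # Without the parser loaded, iterating doc.sents would fail; with it,
    # each sent is a span whose .text feeds GenericSplitter above.
    nlp = get_spacy_nlp_english(load_parser=True)
    doc = nlp('First sentence. Second one.')
    print([sent.text for sent in doc.sents])  # expected: two sentences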
Example #4
    def __init__(self,
                 class1,
                 class2,
                 rel_type,
                 parser=None,
                 splitter=None,
                 tokenizer=None,
                 edge_generator=None,
                 feature_set=None,
                 feature_generators=None):
        self.class1 = class1
        self.class2 = class2
        self.rel_type = rel_type

        nlp = None

        if not parser:
            nlp = get_spacy_nlp_english(load_parser=True)
            parser = SpacyParser(nlp)

        self.parser = parser

        if not splitter:
            # if nlp:  # Spacy parser is used, which includes a sentence splitter
            #     splitter = GenericSplitter(lambda string: (sent.text for sent in nlp(string).sents))
            # else:
            #     splitter = NLTK_SPLITTER
            splitter = NLTK_SPLITTER

        self.splitter = splitter

        if not tokenizer:
            if nlp:  # Spacy parser is used, which includes a tokenizer
                tokenizer = GenericTokenizer(
                    lambda string: (tok.text for tok in nlp.tokenizer(string)))
            else:
                tokenizer = NLTK_TOKENIZER

        self.tokenizer = tokenizer

        if edge_generator is None:
            edge_generator = SentenceDistanceEdgeGenerator(
                self.class1, self.class2, self.rel_type, distance=0)
        self.edge_generator = edge_generator

        self.feature_set = FeatureDictionary() if feature_set is None else feature_set

        if feature_generators:
            self.feature_generators = self._verify_feature_generators(feature_generators)
        else:
            self.feature_generators = [SentenceFeatureGenerator(f_counts_individual=1)]
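Note the nlp = None initialization before the parser block: if a caller supplies their own parser, the spaCy pipeline is never built, and the later if nlp: tokenizer check must then fall back to NLTK_TOKENIZER instead of raising UnboundLocalError. With every argument left at its default, construction reduces to the following (the class name is hypothetical, since the snippet omits it):

    # Hypothetical usage; RelationPipeline stands in for the unshown class
    # name, and the three ids are placeholders.
    pipeline = RelationPipeline(class1='e_1', class2='e_2', rel_type='r_1')
    # Defaults: SpacyParser, NLTK_SPLITTER, a spaCy-backed GenericTokenizer,
    # a distance-0 SentenceDistanceEdgeGenerator, an empty FeatureDictionary,
    # and a single SentenceFeatureGenerator(f_counts_individual=1).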
Example #5
    def __init__(self, nlp=None, constituency_parser=False):
        if nlp is None:
            nlp = get_spacy_nlp_english(load_parser=True)
        elif not isinstance(nlp, English):
            raise TypeError('Not an instance of spacy.en.English')

        self.nlp = nlp
        """an instance of spacy.en.English"""

        self.constituency_parser = constituency_parser
        """the type of constituency parser to use, current supports only bllip"""
        # NOTE: SpaCy may soon have its own constituency parser: https://github.com/explosion/spaCy/issues/59

        if self.constituency_parser is True:
            self.parser = BllipParser(only_parse=True)
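Since both arguments are optional, the usual construction is simply SpacyParser() with the English model built on the spot. A sketch of the two constructor paths shown above:

    parser = SpacyParser()                          # builds spaCy English itself
    bllip_backed = SpacyParser(constituency_parser=True)  # also creates BllipParser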
Example #6
    def __init__(self):
        self.nlp = get_spacy_nlp_english()
Example #7
File: test_data.py Project: zxsted/nalaf
    @classmethod
    def setUpClass(cls):
        nlp = get_spacy_nlp_english(load_parser=True)
        cls.parser = SpacyParser(nlp)