Пример #1
0
    def __init__(self, class1, class2, rel_type, splitter=None, tokenizer=None,
            parser=None):
        """
        Store the entity classes and wire up the processing modules.

        :param class1: the class of entity1; stored and passed to the edge generator
        :param class2: the class of entity2; stored and passed to the edge generator
        :param rel_type: the relation type, passed to the edge generator
        :param splitter: a Splitter instance; NLTKSplitter() is used when falsy
        :param tokenizer: a Tokenizer instance; TmVarTokenizer() is used when falsy
        :param parser: a Parser instance; BllipParser() is used when falsy
        :raises TypeError: if a supplied module does not implement the expected base class
        """
        self.class1, self.class2 = class1, class2

        # Each module falls back to its default when the argument is falsy and
        # is then type-checked before being stored on the instance.
        chosen_splitter = splitter or NLTKSplitter()
        if not isinstance(chosen_splitter, Splitter):
            raise TypeError('not an instance that implements Splitter')
        self.splitter = chosen_splitter

        chosen_tokenizer = tokenizer or TmVarTokenizer()
        if not isinstance(chosen_tokenizer, Tokenizer):
            raise TypeError('not an instance that implements Tokenizer')
        self.tokenizer = chosen_tokenizer

        # Starts out empty; this method only creates the container.
        self.graphs = dict()

        chosen_parser = parser or BllipParser()
        if not isinstance(chosen_parser, Parser):
            raise TypeError('not an instance that implements Parser')
        self.parser = chosen_parser

        self.edge_generator = SimpleEdgeGenerator(class1, class2, rel_type)
Пример #2
0
class RelationExtractionPipeline:
    """
    Prepares an instance of a dataset by executing modules in fixed order.

    :meth:`execute` performs the following steps:
        * Sets the mode and validates the feature generators (see :meth:`set_mode`)
        * Splits and tokenizes the dataset, only if it does not yield any token yet
        * Generates the edges with the edge generator
        * Parses the dataset with the parser
        * Labels the edges of the dataset
        * Finally executes each feature generator in the order they were provided

    :param class1: the class of entity1
    :type class1: str
    :param class2: the class of entity2
    :type class2: str
    :param rel_type: the relation type between the two entities
    :type rel_type: str
    :param splitter: module used to split the dataset; NLTKSplitter() when omitted
    :type splitter: Splitter
    :param tokenizer: module used to tokenize the dataset; TmVarTokenizer() when omitted
    :type tokenizer: Tokenizer
    :param parser: module used to parse the dataset; BllipParser() when omitted
    :type parser: Parser
    :raises TypeError: if splitter, tokenizer or parser does not implement its base class
    """

    def __init__(self, class1, class2, rel_type, splitter=None, tokenizer=None,
            parser=None):
        self.class1 = class1
        self.class2 = class2

        # NOTE: the defaults are applied on any falsy argument (not only None);
        # kept that way so existing callers see no change.
        if not splitter:
            splitter = NLTKSplitter()
        if not isinstance(splitter, Splitter):
            raise TypeError('not an instance that implements Splitter')
        self.splitter = splitter

        if not tokenizer:
            tokenizer = TmVarTokenizer()
        if not isinstance(tokenizer, Tokenizer):
            raise TypeError('not an instance that implements Tokenizer')
        self.tokenizer = tokenizer

        # Shared with the default Path/ProteinWord feature generators built in
        # _default_feature_generators.
        self.graphs = {}

        if not parser:
            parser = BllipParser()
        if not isinstance(parser, Parser):
            raise TypeError('not an instance that implements Parser')
        self.parser = parser

        self.edge_generator = SimpleEdgeGenerator(class1, class2, rel_type)

    def _default_feature_generators(self, train, feature_set):
        """Build the default list of feature generators for the given mode."""
        return [
            NamedEntityCountFeatureGenerator(self.class1, feature_set, training_mode=train),
            NamedEntityCountFeatureGenerator(self.class2, feature_set, training_mode=train),
            BagOfWordsFeatureGenerator(feature_set, training_mode=train),
            StemmedBagOfWordsFeatureGenerator(feature_set, training_mode=train),
            SentenceFeatureGenerator(feature_set, training_mode=train),
            WordFilterFeatureGenerator(feature_set, ['interact', 'bind', 'colocalize'], training_mode=train),
            EntityHeadTokenFeatureGenerator(feature_set, training_mode=train),
            EntityHeadTokenUpperCaseFeatureGenerator(feature_set, training_mode=train),
            EntityHeadTokenDigitsFeatureGenerator(feature_set, training_mode=train),
            EntityHeadTokenLetterPrefixesFeatureGenerator(feature_set, training_mode=train),
            EntityHeadTokenPunctuationFeatureGenerator(feature_set, training_mode=train),
            EntityHeadTokenChainFeatureGenerator(feature_set, training_mode=train),
            LinearContextFeatureGenerator(feature_set, training_mode=train),
            EntityOrderFeatureGenerator(feature_set, training_mode=train),
            LinearDistanceFeatureGenerator(feature_set, training_mode=train),
            IntermediateTokensFeatureGenerator(feature_set, training_mode=train),
            PathFeatureGenerator(feature_set, self.graphs, training_mode=train),
            ProteinWordFeatureGenerator(feature_set, self.graphs, training_mode=train),
            LocationWordFeatureGenerator(feature_set, training_mode=train),
            FoundInFeatureGenerator(feature_set, training_mode=train),
            BiGramFeatureGenerator(feature_set, training_mode=train),
            TriGramFeatureGenerator(feature_set, training_mode=train),
        ]

    def set_mode(self, train, feature_set, feature_generators=None):
        """
        Validate and store the feature generators to run in :meth:`execute`.

        :param train: if the mode is training or testing
        :type train: bool
        :param feature_set: the feature set handed to the default feature generators
        :param feature_generators: one or more modules responsible for generating
            features; the default list is built when None
        :type feature_generators: collections.Iterable[FeatureGenerator]
        :raises TypeError: if an item (or the single argument) is not a FeatureGenerator
        :raises ValueError: if a feature generator's training_mode differs from train
        """
        if feature_generators is None:
            feature_generators = self._default_feature_generators(train, feature_set)

        if hasattr(feature_generators, '__iter__'):
            # Materialize first: validating a one-shot iterator would exhaust
            # it and leave nothing to run later in execute().
            feature_generators = list(feature_generators)
            for index, feature_generator in enumerate(feature_generators):
                if not isinstance(feature_generator, FeatureGenerator):
                    raise TypeError('not an instance that implements FeatureGenerator at index {}'.format(index))
                if feature_generator.training_mode != train:
                    raise ValueError('FeatureGenerator at index {} not set in the correct mode'.format(index))
            self.feature_generators = feature_generators
        elif isinstance(feature_generators, FeatureGenerator):
            # A single generator is accepted and wrapped into a list.
            if feature_generators.training_mode != train:
                raise ValueError('FeatureGenerator not set in the correct mode')
            self.feature_generators = [feature_generators]
        else:
            raise TypeError('not an instance or iterable of instances that implements FeatureGenerator')

    def execute(self, dataset, train=False, feature_set=None, feature_generators=None):
        """
        Run every module of the pipeline on the dataset, in fixed order.

        :param dataset: the dataset handed to each module in turn
        :param train: if the mode is training or testing
        :type train: bool
        :param feature_set: the feature set to use; a new FeatureDictionary is
            created when None. It is stored in ``self.feature_set``
        :param feature_generators: forwarded to :meth:`set_mode`
        :type feature_generators: collections.Iterable[FeatureGenerator]
        :raises TypeError: see :meth:`set_mode`
        :raises ValueError: see :meth:`set_mode`
        """
        if feature_set is None:
            self.feature_set = FeatureDictionary()
        else:
            self.feature_set = feature_set
        self.set_mode(train, feature_set=self.feature_set, feature_generators=feature_generators)

        # Probe for a first token: a dataset that already yields tokens is
        # not split and tokenized again.
        try:
            next(dataset.tokens())
        except StopIteration:
            has_tokens = False
        else:
            has_tokens = True

        if not has_tokens:
            self.splitter.split(dataset)
            self.tokenizer.tokenize(dataset)

        self.edge_generator.generate(dataset)
        self.parser.parse(dataset)
        dataset.label_edges()
        for feature_generator in self.feature_generators:
            feature_generator.generate(dataset)