Example #1
File: tagger.py  Project: Roger/quepy
def get_tagger():
    """
    Return a tagging function given some app settings.
    `settings` is the settings module of an app.
    The returned value is a function that receives a unicode string and returns
    a list of `Word` instances.
    """
    if settings.USE_FREELING:
        from quepy.freeling import run_freeling

        tagger_function = lambda x: run_freeling(x, settings.FREELING_CMD)
    else:
        from quepy.nltktagger import run_nltktagger

        tagger_function = lambda x: run_nltktagger(x, settings.NLTK_DATA_PATH)

    def wrapper(string):
        assert_valid_encoding(string)
        words = tagger_function(string)
        for word in words:
            if word.pos not in PENN_TAGSET:
                logger.warning("Tagger emmited a non-penn " "POS tag {!r}".format(word.pos))
        return words

    return wrapper
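
Usage sketch (not part of the original listing): the wrapper returned above is a plain callable. This assumes quepy's layout as shown, with `settings.USE_FREELING` and `settings.NLTK_DATA_PATH` taken from the code above; the path value is hypothetical.

# Minimal, hedged usage sketch; the settings values are illustrative only.
from quepy import settings
from quepy.tagger import get_tagger

settings.USE_FREELING = False            # take the NLTK branch
settings.NLTK_DATA_PATH = ["nltk_data"]  # hypothetical local path

tagger = get_tagger()
for word in tagger(u"this is a test"):
    print(word.token, word.lemma, word.pos)  # assumes Word exposes these fields
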
Example #2
def get_tagger():
    """
    Return a tagging function given some app settings.
    `settings` is the settings module of an app.
    The returned value is a function that receives a unicode string and returns
    a list of `Word` instances.
    """
    from quepy.nltktagger import run_nltktagger
    tagger_function = lambda x: run_nltktagger(x, settings.NLTK_DATA_PATH)

    def wrapper(string):
        assert_valid_encoding(string)
        words = tagger_function(string)
        for word in words:
            if word.pos not in PENN_TAGSET:
                logger.warning("Tagger emmited a non-penn "
                               "POS tag {!r}".format(word.pos))
        return words
    return wrapper
Example #3
    def test_word_output(self):
        output = nltktagger.run_nltktagger(u"this is a test case «¢ðßæŋħħ")

        self.assertIsInstance(output, list)
        for word in output:
            self.assertIsInstance(word, Word)
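
A slightly stronger variant of the same test could also check the fields that the `get_tagger` wrapper relies on. A sketch: `word.pos` is used by the wrapper above; the `token` attribute is an additional assumption about `Word`.

    def test_word_fields(self):
        # Sketch only: assumes Word exposes .pos (used by get_tagger's wrapper)
        # and .token (an assumption, not confirmed by this listing).
        output = nltktagger.run_nltktagger(u"this is a test case")
        for word in output:
            self.assertTrue(hasattr(word, "token"))
            self.assertTrue(hasattr(word, "pos"))
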
Example #4
# Module-level imports assumed by this excerpt (not shown in the listing):
#   import importlib, re
#   import numpy as np
#   import spacy
#   from spacy.attrs import LOWER, ORTH
#   from spacy.matcher import Matcher
#   from spacy.util import filter_spans
def get_tagger(app_name):
    """
    Return a tagging function given some app settings.
    `settings` is the settings module of an app.
    The returned value is a function that receives a unicode string and returns
    a list of `Word` instances.
    """

    # TODO: Decide approach
    #parser = getattr(settings, 'PARSER', None)
    #if parser == "spaCy":
    if settings.PARSER == 'spaCy':
        from neuroarch_nlp.data import colors_values
        defaults = importlib.import_module('{}.defaults'.format(app_name))
        neuropils = getattr(defaults, 'neuropils')
        arborization_regions = getattr(defaults, 'arborization_regions')
        subregions = getattr(defaults, 'subregions')
        neuron_types = getattr(defaults, 'neuron_types')

        nlp = spacy.load('en_core_web_sm')
        matcher = Matcher(nlp.vocab)
        vocab = nlp.vocab

        # Placeholder 300-d vectors for domain terms missing from the vocab.
        vector_data = {}

        # Multi-word names become lowercase Matcher patterns so the matched
        # spans can later be merged into single tokens.
        neuron_type_patterns = []
        for n in neuron_types:
            lexes = n.split()
            for lex in lexes:
                if lex not in vocab:
                    vector_data[lex] = np.random.uniform(
                        -1, 1, (300, )).astype(np.float32)
            if len(lexes) > 1:
                neuron_type_patterns.append([{
                    LOWER: lex.lower()
                } for lex in lexes])
        matcher.add('NEURONTYPE', neuron_type_patterns)

        neuropil_patterns = []
        for db_rep, string_reps in neuropils:
            for string in string_reps:
                lexes = string.split()
                for lex in lexes:
                    if lex not in vocab:
                        vector_data[lex] = np.random.uniform(
                            -1, 1, (300, )).astype(np.float32)

                if len(lexes) > 1:
                    neuropil_patterns.append([{
                        LOWER: lex.lower()
                    } for lex in lexes])

        matcher.add('NEUROPIL', neuropil_patterns)

        subregion_patterns = []
        for db_rep, string_reps in arborization_regions:
            for string in string_reps:
                lexes = string.split()
                for lex in lexes:
                    if lex not in vocab:
                        vector_data[lex] = np.random.uniform(
                            -1, 1, (300, )).astype(np.float32)

                if len(lexes) > 1:
                    subregion_patterns.append([{
                        LOWER: lex.lower()
                    } for lex in lexes])

        for phrase in subregions:
            phrase = phrase.split()
            for lex in phrase:
                if lex not in vocab:
                    vector_data[lex] = np.random.uniform(
                        -1, 1, (300, )).astype(np.float32)
            if len(phrase) > 1:
                subregion_patterns.append([{
                    LOWER: lex.lower()
                } for lex in phrase])
        matcher.add('SUBREGION', subregion_patterns)

        color_patterns = []
        for phrase in colors_values:
            phrase = phrase.split()
            for lex in phrase:
                if lex not in vocab:
                    vector_data[lex] = np.random.uniform(
                        -1, 1, (300, )).astype(np.float32)
            if len(phrase) > 1:
                color_patterns.append([{LOWER: lex.lower()} for lex in phrase])
        matcher.add('COLOR', color_patterns)

        # Register the placeholder vectors and add each term as a tokenizer
        # special case so it survives tokenization as a single token.
        for word, vector in vector_data.items():
            vocab.set_vector(word, vector)
            special_case = [{ORTH: word}]
            nlp.tokenizer.add_special_case(word, special_case)

        compilers = [
            ('presynaptic',
             re.compile(
                 "pre-synaptic|pre- synaptic|pre -synaptic|pre synaptic|pre - synaptic",
                 re.IGNORECASE)),
            ('postsynaptic',
             re.compile(
                 "post-synaptic|post- synaptic|post -synaptic|post synaptic|post - synaptic",
                 re.IGNORECASE))
        ]

        def collapse(string):
            for substr, compiler in compilers:
                string = compiler.sub(substr, string)
            return string

        def run_spacytagger(string):
            """
            Runs spacy on `string` and returns a list of
            :class:`quepy.tagger.Word` objects.
            """
            assert_valid_encoding(string)

            # For now, at least, perform our own pre-processing
            # --to ensure terms like "presynaptic" are easily found later.
            string = ' '.join(string.split())
            string = collapse(string)

            doc = nlp(string)  # NOTE: spaCy expects and returns unicode

            # Merge multi-token domain-term matches into single tokens.
            spans = [doc[start:end] for match_id, start, end in matcher(doc)]
            filtered = filter_spans(spans)
            with doc.retokenize() as retokenizer:
                for span in filtered:
                    retokenizer.merge(span)
            # tag_ is the "fine-grained" POS
            words = [Word(x.text, x.lemma_, x.tag_) for x in doc]

            # The following is only for logging purposes; if necessary, it could be removed for production
            logger.info(' '.join([t.text + '[' + str(t.i) + ']' for t in doc]))
            indent = "  "
            longest = max(len(t.text) for t in doc)
            column = (len(doc) - 1) * len(indent) + longest + 2
            wout = '{:' + str(column) + '}| '

            def trav_tree(indents, node):
                logger.info(wout.format((indent * indents) + node.text) + ', '.join(
                    str(x) for x in [node.i, node.is_oov, node.lemma_, node.tag_,
                                     "<-" + str(node.left_edge), str(node.right_edge) + "->"]))
                for el in node.children:
                    # NOTE: Could also change display based on node.lefts and node.rights
                    trav_tree(indents + 1, el)

            for sent in doc.sents:
                trav_tree(0, sent.root)
            logger.info('Ents:  ' + str(doc.ents))
            logger.info('NPs:   ' + str(list(doc.noun_chunks)))

            return words

        tagger_function = run_spacytagger
    else:
        from quepy.nltktagger import run_nltktagger
        tagger_function = lambda x: run_nltktagger(x, settings.NLTK_DATA_PATH)

    def wrapper(string):
        assert_valid_encoding(string)
        words = tagger_function(string)
        for word in words:
            if word.pos not in PENN_TAGSET:
                logger.warning("Tagger emmited a non-penn "
                               "POS tag {!r}".format(word.pos))
        return words

    return wrapper
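
Usage sketch for the spaCy branch (hypothetical, not from the original project): `my_app` is a made-up package whose `defaults` module would need to define `neuropils`, `arborization_regions`, `subregions`, and `neuron_types`, as the code above requires.

# Hedged driver sketch; 'my_app' and the settings assignment are illustrative.
settings.PARSER = 'spaCy'
tagger = get_tagger('my_app')
for w in tagger(u"show the presynaptic neurons"):
    print(w.token, w.lemma, w.pos)  # pos carries spaCy's fine-grained tag_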