Пример #1
0
    def _create_unigrams_for_parseset_n(self, parseset_index):
        """Rebuild the word unigram collection for one parse set.

        Parses ``parseset<parseset_index>.xml`` from the test resources,
        prints the sentence / word / parsable-word totals, empties
        ``self.db['wordUnigrams<parseset_index>']`` and refills it with one
        document per unigram yielded by ``WordNGramGenerator(1)``. Documents
        are written in batches of ``self.BULK_INSERT_SIZE``. Finishes by
        calling ``self._inspect_unigrams_for_parseset_n``.

        :param parseset_index: value substituted into both the parse set file
            name and the target collection name.
        """
        print("Parsing parse set {} and generating unigrams with occurrence counts".format(parseset_index))

        parseset_path = os.path.join(
            os.path.dirname(__file__),
            '../../testresources/parsesets/parseset{}.xml'.format(parseset_index))
        dom = parse(parseset_path)
        parseset = ParseSetBinding.build(dom.getElementsByTagName("parseset")[0])

        print("Found {} sentences".format(len(parseset.sentences)))
        words = [word for sentence in parseset.sentences for word in sentence.words]
        print("Found {} words".format(len(words)))
        # Count lazily instead of len(filter(...)): no throwaway list, and it
        # keeps working where filter() returns an iterator.
        parsable_word_count = sum(
            1 for word in words if not isinstance(word, UnparsableWordBinding))
        print("Found {} parsable words".format(parsable_word_count))

        generator = WordNGramGenerator(1)

        collection = self.db['wordUnigrams{}'.format(parseset_index)]

        # delete everything in the collection
        collection.remove({})

        batch = []
        for unigram in generator.iter_ngrams(words):
            batch.append({'item_0': unigram})
            if len(batch) >= self.BULK_INSERT_SIZE:
                collection.insert(batch)
                batch = []

        # Flush the remainder only when there is one. The buffer is empty here
        # when the number of unigrams is zero or an exact multiple of the
        # batch size. NOTE(review): assuming self.db is a PyMongo database,
        # whose legacy insert() rejects an empty document list -- confirm.
        if batch:
            collection.insert(batch)

        self._inspect_unigrams_for_parseset_n(parseset_index)
Пример #2
0
    def _create_trigrams_for_parseset_n(self, parseset_index):
        """Rebuild the word trigram collection for one parse set.

        Parses ``parseset<parseset_index>.xml`` from the test resources,
        prints the sentence / word / parsable-word totals, empties
        ``self.db['wordTrigrams<parseset_index>']`` and refills it with one
        document per trigram yielded by ``WordNGramGenerator(3)``. The three
        members of each trigram are stored under ``item_0`` .. ``item_2``.
        Documents are written in batches of ``self.BULK_INSERT_SIZE``, and the
        resulting collection size is printed at the end.

        :param parseset_index: value substituted into both the parse set file
            name and the target collection name.
        """
        print("Parsing parse set {} and generating trigrams with occurrence counts".format(
            parseset_index))

        parseset_path = os.path.join(
            os.path.dirname(__file__),
            '../../testresources/parsesets/parseset{}.xml'.format(parseset_index))
        dom = parse(parseset_path)
        parseset = ParseSetBinding.build(
            dom.getElementsByTagName("parseset")[0])

        print("Found {} sentences".format(len(parseset.sentences)))
        words = [
            word for sentence in parseset.sentences for word in sentence.words
        ]
        print("Found {} words".format(len(words)))
        # Count lazily instead of len(filter(...)): no throwaway list, and it
        # keeps working where filter() returns an iterator.
        parsable_word_count = sum(
            1 for word in words if not isinstance(word, UnparsableWordBinding))
        print("Found {} parsable words".format(parsable_word_count))

        generator = WordNGramGenerator(3)

        collection = self.db['wordTrigrams{}'.format(parseset_index)]

        # delete everything in the collection
        collection.remove({})

        batch = []
        for trigram in generator.iter_ngrams(words):
            batch.append({
                'item_0': trigram[0],
                'item_1': trigram[1],
                'item_2': trigram[2],
            })
            if len(batch) >= self.BULK_INSERT_SIZE:
                collection.insert(batch)
                batch = []

        # Flush the remainder only when there is one. The buffer is empty here
        # when the number of trigrams is zero or an exact multiple of the
        # batch size. NOTE(review): assuming self.db is a PyMongo database,
        # whose legacy insert() rejects an empty document list -- confirm.
        if batch:
            collection.insert(batch)

        trigram_count = collection.count()
        print("Generated {} trigrams".format(trigram_count))
    def _create_mock_container(self, word):
        """Build a mock morpheme container describing ``word``.

        Returns ``None`` (after printing a notice) when ``word`` is an
        ``UnparsableWordBinding``. Otherwise the surface, stem and lemma-root
        strings are paired with their syntactic categories; whenever a
        secondary category is present it is appended to the primary one after
        an underscore.

        :param word: word binding to convert.
        :return: result of the ``MockMorphemeContainerBuilder`` chain, or
            ``None`` for an unparsable word.
        """
        if isinstance(word, UnparsableWordBinding):
            print(u'Previous word is unparsable, skipped : {}'.format(word.str))
            return None

        surface_text = word.str
        surface_category = word.syntactic_category
        stem_text, stem_category, stem_secondary = WordNGramGenerator._get_stem(word)
        root = word.root
        root_text = root.lemma_root
        root_category = root.syntactic_category

        # Fold each secondary category into its primary as "<primary>_<secondary>".
        if word.secondary_syntactic_category:
            surface_category = surface_category + u'_' + word.secondary_syntactic_category
        if stem_secondary:
            stem_category = stem_category + u'_' + stem_secondary
        if root.secondary_syntactic_category:
            root_category = root_category + u'_' + root.secondary_syntactic_category

        container_builder = MockMorphemeContainerBuilder.builder(
            word.format(), surface_text, surface_category)
        container_builder = container_builder.stem(stem_text, stem_category)
        container_builder = container_builder.lexeme(root_text, root_category)
        return container_builder.build()
    def _create_mock_container(self, word):
        """Translate a word binding into a mock morpheme container.

        An ``UnparsableWordBinding`` is reported on stdout and yields ``None``.
        For any other word, the surface form, the stem (obtained from
        ``WordNGramGenerator._get_stem``) and the lemma root are each tagged
        with their syntactic category, extended with ``_<secondary>`` when a
        secondary category exists, and handed to
        ``MockMorphemeContainerBuilder``.

        :param word: word binding to translate.
        :return: the built container, or ``None`` for an unparsable word.
        """
        if isinstance(word, UnparsableWordBinding):
            print(u'Previous word is unparsable, skipped : {}'.format(word.str))
            return None

        def with_secondary(primary, secondary):
            # "<primary>_<secondary>" when a secondary category is set,
            # otherwise the primary category unchanged.
            if secondary:
                return primary + u'_' + secondary
            return primary

        surface = word.str
        surface_primary = word.syntactic_category
        stem, stem_primary, stem_secondary = WordNGramGenerator._get_stem(word)
        lemma_root = word.root.lemma_root
        lemma_root_primary = word.root.syntactic_category

        surface_tag = with_secondary(
            surface_primary, word.secondary_syntactic_category)
        stem_tag = with_secondary(stem_primary, stem_secondary)
        lemma_root_tag = with_secondary(
            lemma_root_primary, word.root.secondary_syntactic_category)

        return (MockMorphemeContainerBuilder
                .builder(word.format(), surface, surface_tag)
                .stem(stem, stem_tag)
                .lexeme(lemma_root, lemma_root_tag)
                .build())