    def _create_parseset_n(self, set_number):
        source_file_path = os.path.join(os.path.dirname(__file__), '../../testresources/simpleparsesets/simpleparseset{}.txt'.format(set_number))
        destination_file_path = os.path.join(os.path.dirname(__file__), '../../testresources/parsesets/parseset{}.xml'.format(set_number))

        line_index = 0
        sentences = []
        with codecs.open(source_file_path, mode='r', encoding='utf-8') as src:
            entries_for_sentence = []
            for line in src:
                print u'Processing line {}'.format(line_index)
                line_index += 1
                if not line.strip():  # skip blank lines (iterating the file keeps the trailing '\n')
                    continue
                elif line.startswith(END_OF_SENTENCE_MARKER):
                    sentence_binding = self.parseset_creator.create_sentence_binding_from_morpheme_containers(entries_for_sentence)
                    sentences.append(sentence_binding)
                    entries_for_sentence = []
                elif line.startswith("#"):
                    continue
                else:
                    word_part = line[:line.find('=')].strip()
                    parse_result_part = line[line.find('=')+1:].strip()

                    parse_result_matching_simple_parseset = self._find_parse_result_matching_simple_parseset(word_part, parse_result_part)

                    entries_for_sentence.append((word_part, parse_result_matching_simple_parseset))

        parseset_binding = ParseSetBinding()
        parseset_binding.sentences = sentences
        parseset_dom = parseset_binding.to_dom()
        parseset_dom.setAttribute("xmlns", xmlbindings.NAMESPACE)
        with codecs.open(destination_file_path, mode='w', encoding='utf-8') as output:
            output.write(PARSESET_HEADER)
            output.write('\n')
            output.write(parseset_dom.toprettyxml())

    def _test_calculate_with_parseset_n(self, parseset_index,
                                        leading_context_size,
                                        following_context_size):
        start_time = datetime.datetime.today()

        self.calculator = self.create_calculator(parseset_index)

        dom = parse(
            os.path.join(
                os.path.dirname(__file__),
                '../../../../testresources/parsesets/parseset{}.xml'.format(
                    parseset_index)))
        parseset = ParseSetBinding.build(
            dom.getElementsByTagName("parseset")[0])
        self.parse_set_word_list = []
        for sentence in parseset.sentences:
            self.parse_set_word_list.extend(sentence.words)

        self._test_generate_likelihoods(leading_context_size,
                                        following_context_size)

        end_time = datetime.datetime.today()
        print u'Done in {} seconds for {} words'.format(
            (end_time - start_time).total_seconds(),
            len(self.parse_set_word_list) - 1)
        print u'Average in {} seconds'.format(
            (end_time - start_time).total_seconds() /
            (len(self.parse_set_word_list) - 1))
Example #3
    def _create_unigrams_for_parseset_n(self, parseset_index):
        print "Parsing parse set {} and generating unigrams with occurrence counts".format(parseset_index)

        dom = parse(os.path.join(os.path.dirname(__file__), '../../testresources/parsesets/parseset{}.xml'.format(parseset_index)))
        parseset = ParseSetBinding.build(dom.getElementsByTagName("parseset")[0])

        print "Found {} sentences".format(len(parseset.sentences))
        words = [word for sentence in parseset.sentences for word in sentence.words]
        print "Found {} words".format(len(words))
        print "Found {} parsable words".format(
            len(filter(lambda word: not isinstance(word, UnparsableWordBinding), words)))

        generator = WordNGramGenerator(1)

        collection = self.db['wordUnigrams{}'.format(parseset_index)]

        # delete everything in the collection
        collection.remove({})

        bulk_insert_buffer = []
        for unigram in generator.iter_ngrams(words):
            entity = {
                'item_0': unigram
            }
            bulk_insert_buffer.append(entity)
            if len(bulk_insert_buffer) % self.BULK_INSERT_SIZE == 0:
                collection.insert(bulk_insert_buffer)
                bulk_insert_buffer = []

        if bulk_insert_buffer:
            # flush the final partial batch; skip it if it is empty
            collection.insert(bulk_insert_buffer)

        self._inspect_unigrams_for_parseset_n(parseset_index)

    @classmethod
    def setUpClass(cls):
        dom = parse(os.path.join(os.path.dirname(__file__), 'concordance_sample_parseset.xml'))
        parseset = ParseSetBinding.build(dom.getElementsByTagName("parseset")[0])
        word_list = []
        for sentence in parseset.sentences:
            word_list.extend(sentence.words)

        cls.word_list = word_list

    def _create_parseset_n(self, set_number):
        source_file_path = os.path.join(
            os.path.dirname(__file__),
            '../../testresources/simpleparsesets/simpleparseset{}.txt'.format(
                set_number))
        destination_file_path = os.path.join(
            os.path.dirname(__file__),
            '../../testresources/parsesets/parseset{}.xml'.format(set_number))

        line_index = 0
        sentences = []
        with codecs.open(source_file_path, mode='r', encoding='utf-8') as src:
            entries_for_sentence = []
            for line in src:
                print u'Processing line {}'.format(line_index)
                line_index += 1
                if not line.strip():  # skip blank lines (iterating the file keeps the trailing '\n')
                    continue
                elif line.startswith(END_OF_SENTENCE_MARKER):
                    sentence_binding = self.parseset_creator.create_sentence_binding_from_morpheme_containers(
                        entries_for_sentence)
                    sentences.append(sentence_binding)
                    entries_for_sentence = []
                elif line.startswith("#"):
                    continue
                else:
                    word_part = line[:line.find('=')].strip()
                    parse_result_part = line[line.find('=') + 1:].strip()

                    parse_result_matching_simple_parseset = self._find_parse_result_matching_simple_parseset(
                        word_part, parse_result_part)

                    entries_for_sentence.append(
                        (word_part, parse_result_matching_simple_parseset))

        parseset_binding = ParseSetBinding()
        parseset_binding.sentences = sentences
        parseset_dom = parseset_binding.to_dom()
        parseset_dom.setAttribute("xmlns", xmlbindings.NAMESPACE)
        with codecs.open(destination_file_path, mode='w',
                         encoding='utf-8') as output:
            output.write(PARSESET_HEADER)
            output.write('\n')
            output.write(parseset_dom.toprettyxml())
Example #6
    @classmethod
    def setUpClass(cls):
        dom = parse(
            os.path.join(os.path.dirname(__file__),
                         'concordance_sample_parseset.xml'))
        parseset = ParseSetBinding.build(
            dom.getElementsByTagName("parseset")[0])
        word_list = []
        for sentence in parseset.sentences:
            word_list.extend(sentence.words)

        cls.word_list = word_list
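
Nearly every example above repeats the same load-and-flatten steps: parse a parse-set XML file, build a ParseSetBinding, and collect the words of all sentences into one flat list. Below is a minimal sketch of a helper that factors this out; the name load_parse_set_words and its module-level placement are assumptions for illustration and not part of trnltk.

import os
from xml.dom.minidom import parse

from trnltk.parseset.xmlbindings import ParseSetBinding


def load_parse_set_words(xml_path):
    # parse the XML document and bind the root <parseset> element
    dom = parse(xml_path)
    parseset = ParseSetBinding.build(dom.getElementsByTagName("parseset")[0])
    # flatten all sentences into a single list of word bindings
    return [word for sentence in parseset.sentences for word in sentence.words]


# Hypothetical usage, mirroring the setUpClass above:
# word_list = load_parse_set_words(
#     os.path.join(os.path.dirname(__file__), 'concordance_sample_parseset.xml'))
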
Example #7
    @classmethod
    def setUpClass(cls):
        super(StatisticalParserTest, cls).setUpClass()
        all_roots = []

        lexemes = LexiconLoader.load_from_file(
            os.path.join(os.path.dirname(__file__),
                         '../../resources/master_dictionary.txt'))
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))

        root_map_generator = RootMapGenerator()
        cls.root_map = root_map_generator.generate(all_roots)

        suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(
            BasicSuffixGraph()))
        suffix_graph.initialize()

        predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
        predefined_paths.create_predefined_paths()

        word_root_finder = WordRootFinder(cls.root_map)
        digit_numeral_root_finder = DigitNumeralRootFinder()
        text_numeral_root_finder = TextNumeralRootFinder(cls.root_map)
        proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
        proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

        contextless_parser = ContextlessMorphologicalParser(
            suffix_graph, predefined_paths, [
                word_root_finder, digit_numeral_root_finder,
                text_numeral_root_finder,
                proper_noun_from_apostrophe_root_finder,
                proper_noun_without_apostrophe_root_finder
            ])

        parseset_index = "001"
        dom = parse(
            os.path.join(
                os.path.dirname(__file__),
                '../../testresources/parsesets/parseset{}.xml'.format(
                    parseset_index)))
        parseset = ParseSetBinding.build(
            dom.getElementsByTagName("parseset")[0])
        parse_set_word_list = []
        for sentence in parseset.sentences:
            parse_set_word_list.extend(sentence.words)

        complete_word_concordance_index = CompleteWordConcordanceIndex(
            parse_set_word_list)

        cls.parser = StatisticalParser(contextless_parser,
                                       complete_word_concordance_index)
Example #8
    def _validate_concordances_for_parse_set_n(self, parseset_index):
        dom = parse(os.path.join(os.path.dirname(__file__), '../../testresources/parsesets/parseset{}.xml'.format(parseset_index)))
        parseset = ParseSetBinding.build(dom.getElementsByTagName("parseset")[0])
        word_list = []
        for sentence in parseset.sentences:
            word_list.extend(sentence.words)

        self._validate_complete_word_concordance_indexes(word_list)
        self._validate_root_concordance_indexes(word_list)
        self._validate_lemma_concordance_indexes(word_list)
        self._validate_transition_word_concordance_indexes(word_list)
        self._validate_transition_matched_word_concordance_indexes(word_list)
Example #9
    def _test_calculate_with_parseset_n(self, parseset_index, leading_context_size, following_context_size):
        start_time = datetime.datetime.today()

        self.contextful_morphological_parser = self.create_contextful_morphological_parser(parseset_index)

        dom = parse(os.path.join(os.path.dirname(__file__), '../../../../testresources/parsesets/parseset{}.xml'.format(parseset_index)))
        parseset = ParseSetBinding.build(dom.getElementsByTagName("parseset")[0])
        self.parse_set_word_list = []
        for sentence in parseset.sentences:
            self.parse_set_word_list.extend(sentence.words)

        self._test_generate_likelihoods(leading_context_size, following_context_size)

        end_time = datetime.datetime.today()
        print u'Done in {} seconds for {} words'.format((end_time - start_time).total_seconds(), len(self.parse_set_word_list) - 1)
        print u'Average in {} seconds'.format((end_time - start_time).total_seconds() / (len(self.parse_set_word_list) - 1))
Example #10
    def _create_trigrams_for_parseset_n(self, parseset_index):
        print "Parsing parse set {} and generating trigrams with occurrence counts".format(
            parseset_index)

        dom = parse(
            os.path.join(
                os.path.dirname(__file__),
                '../../testresources/parsesets/parseset{}.xml'.format(
                    parseset_index)))
        parseset = ParseSetBinding.build(
            dom.getElementsByTagName("parseset")[0])

        print "Found {} sentences".format(len(parseset.sentences))
        words = [
            word for sentence in parseset.sentences for word in sentence.words
        ]
        print "Found {} words".format(len(words))
        print "Found {} parsable words".format(
            len(
                filter(
                    lambda word: not isinstance(word, UnparsableWordBinding),
                    words)))

        generator = WordNGramGenerator(3)

        collection = self.db['wordTrigrams{}'.format(parseset_index)]

        # delete everything in the collection
        collection.remove({})

        bulk_insert_buffer = []
        for trigram in generator.iter_ngrams(words):
            entity = {
                'item_0': trigram[0],
                'item_1': trigram[1],
                'item_2': trigram[2]
            }
            bulk_insert_buffer.append(entity)
            if len(bulk_insert_buffer) % self.BULK_INSERT_SIZE == 0:
                collection.insert(bulk_insert_buffer)
                bulk_insert_buffer = []

        if bulk_insert_buffer:
            # flush the final partial batch; skip it if it is empty
            collection.insert(bulk_insert_buffer)

        trigram_count = collection.count()
        print "Generated {} trigrams".format(trigram_count)
Example #11
    def _validate_concordances_for_parse_set_n(self, parseset_index):
        dom = parse(
            os.path.join(
                os.path.dirname(__file__),
                '../../testresources/parsesets/parseset{}.xml'.format(
                    parseset_index)))
        parseset = ParseSetBinding.build(
            dom.getElementsByTagName("parseset")[0])
        word_list = []
        for sentence in parseset.sentences:
            word_list.extend(sentence.words)

        self._validate_complete_word_concordance_indexes(word_list)
        self._validate_root_concordance_indexes(word_list)
        self._validate_lemma_concordance_indexes(word_list)
        self._validate_transition_word_concordance_indexes(word_list)
        self._validate_transition_matched_word_concordance_indexes(word_list)
Example #12
    @classmethod
    def setUpClass(cls):
        super(StatisticalParserTest, cls).setUpClass()
        all_roots = []

        lexemes = LexiconLoader.load_from_file(os.path.join(os.path.dirname(__file__), '../../resources/master_dictionary.txt'))
        for di in lexemes:
            all_roots.extend(RootGenerator.generate(di))


        root_map_generator = RootMapGenerator()
        cls.root_map = root_map_generator.generate(all_roots)

        suffix_graph = CopulaSuffixGraph(NumeralSuffixGraph(BasicSuffixGraph()))
        suffix_graph.initialize()

        predefined_paths = PredefinedPaths(cls.root_map, suffix_graph)
        predefined_paths.create_predefined_paths()

        word_root_finder = WordRootFinder(cls.root_map)
        digit_numeral_root_finder = DigitNumeralRootFinder()
        text_numeral_root_finder = TextNumeralRootFinder(cls.root_map)
        proper_noun_from_apostrophe_root_finder = ProperNounFromApostropheRootFinder()
        proper_noun_without_apostrophe_root_finder = ProperNounWithoutApostropheRootFinder()

        contextless_parser = ContextlessMorphologicalParser(suffix_graph, predefined_paths,
            [word_root_finder, digit_numeral_root_finder, text_numeral_root_finder, proper_noun_from_apostrophe_root_finder, proper_noun_without_apostrophe_root_finder])

        parseset_index = "001"
        dom = parse(os.path.join(os.path.dirname(__file__), '../../testresources/parsesets/parseset{}.xml'.format(parseset_index)))
        parseset = ParseSetBinding.build(dom.getElementsByTagName("parseset")[0])
        parse_set_word_list = []
        for sentence in parseset.sentences:
            parse_set_word_list.extend(sentence.words)

        complete_word_concordance_index = CompleteWordConcordanceIndex(parse_set_word_list)

        cls.parser = StatisticalParser(contextless_parser, complete_word_concordance_index)
Example #13
"""
import os
import unittest
from xml.dom.minidom import parse
from hamcrest import *
from trnltk.morphology.contextless.parser.parser import ContextlessMorphologicalParser
from trnltk.morphology.contextless.parser.rootfinder import WordRootFinder
from trnltk.statistics.morphemecontainerstats import MorphemeContainerContextlessProbabilityGenerator
from trnltk.morphology.lexicon.lexiconloader import LexiconLoader
from trnltk.morphology.lexicon.rootgenerator import RootGenerator, RootMapGenerator
from trnltk.morphology.morphotactics.basicsuffixgraph import BasicSuffixGraph
from trnltk.parseset.xmlbindings import ParseSetBinding
from trnltk.statistics.suffixtransitionstats import SuffixTransitionProbabilityGenerator

dom = parse(os.path.join(os.path.dirname(__file__), '../../morphology/contextful/likelihoodmetrics/wordformcollocation/test/morphology_contextless_statistics_sample_parseset.xml'))
parseset = ParseSetBinding.build(dom.getElementsByTagName("parseset")[0])
parse_set_word_list = []
for sentence in parseset.sentences:
    parse_set_word_list.extend(sentence.words)

class MorphemeContainerContextlessProbabilityGeneratorWithContainersTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        super(MorphemeContainerContextlessProbabilityGeneratorWithContainersTest, cls).setUpClass()
        all_roots = []

        lexicon_lines = u'''
            duvar
            tutku
            saç
            oğul [A:LastVowelDrop]
Example #14
        verify(wrapped_calculator).calculate_oneway_likelihood(
            target, [mock_context_item_2], True, calculation_context)
        verify(wrapped_calculator).calculate_oneway_likelihood(
            target, [mock_context_item_1, mock_context_item_2], True,
            calculation_context)
        verify(wrapped_calculator).calculate_oneway_likelihood(
            target,
            [mock_context_item_0, mock_context_item_1, mock_context_item_2],
            True, calculation_context)
        verifyNoMoreInteractions(wrapped_calculator)


dom = parse(
    os.path.join(os.path.dirname(__file__),
                 'morphology_contextless_statistics_sample_parseset.xml'))
parseset = ParseSetBinding.build(dom.getElementsByTagName("parseset")[0])
parse_set_word_list = []
for sentence in parseset.sentences:
    parse_set_word_list.extend(sentence.words)


class InterpolatingLikelihoodCalculatorCalculationContextTest(
        unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        super(InterpolatingLikelihoodCalculatorCalculationContextTest,
              cls).setUpClass()
        all_roots = []

        lexemes = LexiconLoader.load_from_file(
            os.path.join(os.path.dirname(__file__),