Example #1
 def test_doc_write(self):
     import sys, os
     # Create LMF objects
     lexical_entry = LexicalEntry()
     lexical_entry.lemma = Lemma()
     lexical_entry.partOfSpeech = "toto"
     lexical_entry.status = "draft"
     lexical_entry.lemma.lexeme = "hello"
     lexicon = Lexicon()
     lexicon.add_lexical_entry(lexical_entry)
     lexical_resource = LexicalResource()
     lexical_resource.add_lexicon(lexicon)
     # Write document file and test result
     utest_path = sys.path[0] + '/'
     doc_filename = utest_path + "output.docx"
     doc_write(lexical_resource, doc_filename)
     doc_file = open(doc_filename, "r")
     doc_file.readlines()
     doc_file.close()
     # Customize mapping
     def lmf2doc(lexicon, document, items, sort_order, paradigms, reverse):
         return "test"
     # Write document file and test result
     doc_write(lexical_resource, doc_filename, None, lmf2doc)
     doc_file = open(doc_filename, "r")
     doc_file.readlines()
     doc_file.close()
     del lexical_entry.lemma
     lexical_entry.lemma = None
     del lexical_entry, lexicon
     lexicon = None
     del lexical_resource
     # Remove document file
     os.remove(doc_filename)
Example #2
 def test_get_lexicons(self):
     # Create lexicons
     lexicon1 = Lexicon()
     lexicon2 = Lexicon()
     # Add lexicons to the lexical resource
     self.lexical_resource.lexicon = [lexicon1, lexicon2]
     # Test get lexicons
     self.assertListEqual(self.lexical_resource.get_lexicons(),
                          [lexicon1, lexicon2])
     # Release Lexicon instances
     del self.lexical_resource.lexicon[:]
     del lexicon1, lexicon2
Example #3
 def test_get_lexicon(self):
     # Create lexicons
     lexicon1 = Lexicon("lexicon1")
     lexicon2 = Lexicon("lexicon2")
     # Add lexicons to the lexical resource
     self.lexical_resource.lexicon = [lexicon1, lexicon2]
     # Test get lexicon
     self.assertIsNone(
         self.lexical_resource.get_lexicon("unknown identifier"))
     self.assertEqual(self.lexical_resource.get_lexicon("lexicon2"),
                      lexicon2)
     # Release Lexicon instances
     del lexicon1, lexicon2
Example #4
 def test_remove_lexicon(self):
     # Create lexicons
     lexicon1 = Lexicon()
     lexicon2 = Lexicon()
     # Add lexicons to the lexical resource
     self.lexical_resource.lexicon = [lexicon1, lexicon2]
     # Test remove lexicons
     self.assertEqual(self.lexical_resource.remove_lexicon(lexicon1),
                      self.lexical_resource)
     self.assertListEqual(self.lexical_resource.lexicon, [lexicon2])
     self.assertEqual(self.lexical_resource.remove_lexicon(lexicon2),
                      self.lexical_resource)
     self.assertListEqual(self.lexical_resource.lexicon, [])
     # Release Lexicon instances
     del lexicon1, lexicon2
Example #5
 def test_add_lexicon(self):
     # Create lexicons
     lexicon1 = Lexicon()
     lexicon2 = Lexicon()
     # Test add lexicons to the lexical resource
     self.assertEqual(self.lexical_resource.add_lexicon(lexicon1),
                      self.lexical_resource)
     self.assertListEqual(self.lexical_resource.lexicon, [lexicon1])
     self.assertEqual(self.lexical_resource.add_lexicon(lexicon2),
                      self.lexical_resource)
     self.assertListEqual(self.lexical_resource.lexicon,
                          [lexicon1, lexicon2])
     # Release Lexicon instances
     del self.lexical_resource.lexicon[:]
     del lexicon1, lexicon2
Example #6
def get_data_generator(args, model_args, schema, test=False):
    from cocoa.core.scenario_db import ScenarioDB
    from cocoa.core.dataset import read_dataset
    from cocoa.core.util import read_json

    from core.scenario import Scenario
    from core.lexicon import Lexicon
    from preprocess import DataGenerator, Preprocessor
    import os.path

    # TODO: move this to dataset
    dataset = read_dataset(args, Scenario)

    mappings_path = model_args.mappings

    lexicon = Lexicon(schema.values['item'])
    preprocessor = Preprocessor(schema, lexicon, model_args.entity_encoding_form,
        model_args.entity_decoding_form, model_args.entity_target_form,
        model=model_args.model)

    if test:
        model_args.dropout = 0
        train, dev, test = None, None, dataset.test_examples
    else:
        train, dev, test = dataset.train_examples, dataset.test_examples, None
    data_generator = DataGenerator(train, dev, test, preprocessor, args, schema, mappings_path,
        cache=args.cache, ignore_cache=args.ignore_cache,
        num_context=model_args.num_context,
        batch_size=args.batch_size,
        model=model_args.model)

    return data_generator
Example #7
def get_system(name, args, schema=None, timed=False, model_path=None):
    if name in ('rulebased', 'neural'):
        lexicon = Lexicon(schema,
                          args.learned_lex,
                          stop_words=args.stop_words,
                          lexicon_path=args.lexicon)
        if args.inverse_lexicon:
            realizer = InverseLexicon.from_file(args.inverse_lexicon)
        else:
            realizer = DefaultInverseLexicon()
    if name == 'rulebased':
        templates = Templates.from_pickle(args.templates)
        generator = Generator(templates)
        manager = Manager.from_pickle(args.policy)
        return RulebasedSystem(lexicon, generator, manager, timed)
    elif name == 'neural':
        assert args.model_path
        return NeuralSystem(schema,
                            lexicon,
                            args.model_path,
                            args.fact_check,
                            args.decoding,
                            realizer=realizer)
    elif name == 'cmd':
        return CmdSystem()
    else:
        raise ValueError('Unknown system %s' % name)
Example #8
def get_system(name, args, schema, model_path=None, timed=False):
    lexicon = Lexicon.from_pickle(args.lexicon)
    templates = Templates.from_pickle(args.templates)
    if name == 'rulebased':
        templates = Templates.from_pickle(args.templates)
        generator = Generator(templates)
        manager = Manager.from_pickle(args.policy)
        return RulebasedSystem(lexicon, generator, manager, timed)
    elif name == 'cmd':
        return CmdSystem()
    else:
        raise ValueError('Unknown system %s' % name)
Example #9
def get_system(name, args, schema=None, timed=False):
    lexicon = Lexicon(schema.values['owner'])
    if name == 'rulebased':
        templates = Templates.from_pickle(args.templates)
        generator = Generator(templates)
        manager = Manager.from_pickle(args.policy)
        return RulebasedSystem(lexicon, generator, manager, timed)
    elif name == 'cmd':
        return CmdSystem()
    # elif name == 'neural':
    #     return NeuralSystem(args.model_file, args.temperature, timed_session=timed, gpu=args.gpu)
    else:
        raise ValueError('Unknown system %s' % name)
Example #10
 def test_mdf_write(self):
     import sys, os
     # Create LMF objects
     lexical_entry = LexicalEntry()
     lexical_entry.lemma = Lemma()
     lexical_entry.partOfSpeech = "toto"
     lexical_entry.status = "draft"
     lexical_entry.lemma.lexeme = "hello"
     lexicon = Lexicon()
     lexicon.add_lexical_entry(lexical_entry)
     # Write MDF file and test result
     utest_path = sys.path[0] + '/'
     mdf_filename = utest_path + "output.txt"
     mdf_write(lexicon, mdf_filename)
     mdf_file = open(mdf_filename, "r")
     expected_lines = ["\\lx hello" + EOL, "\\ps toto" + EOL, "\\st draft" + EOL, EOL]
     self.assertListEqual(expected_lines, mdf_file.readlines())
     mdf_file.close()
     # Customize mapping
     lmf2mdf = dict({
         "lx" : lambda lexical_entry: lexical_entry.get_status(),
         "ps" : lambda lexical_entry: lexical_entry.get_partOfSpeech(),
         "st" : lambda lexical_entry: lexical_entry.get_lexeme()
     })
     order = ["st", "lx", "ps"]
     # Write MDF file and test result
     mdf_write(lexicon, mdf_filename, lmf2mdf, order)
     mdf_file = open(mdf_filename, "r")
     expected_lines = ["\\st hello" + EOL, "\\lx draft" + EOL, "\\ps toto" + EOL, EOL]
     self.assertListEqual(expected_lines, mdf_file.readlines())
     mdf_file.close()
     del lexical_entry.lemma
     lexical_entry.lemma = None
     del lexical_entry, lexicon
     # Remove MDF file
     os.remove(mdf_filename)
Example #11
    def test_odt_write(self):
        import sys, os
        # Create LMF objects
        lexical_entry = LexicalEntry()
        lexical_entry.lemma = Lemma()
        lexical_entry.partOfSpeech = "toto"
        lexical_entry.status = "draft"
        lexical_entry.lemma.lexeme = "hello"
        lexicon = Lexicon()
        lexicon.add_lexical_entry(lexical_entry)
        lexical_resource = LexicalResource()
        lexical_resource.add_lexicon(lexicon)
        # Write document file and test result
        utest_path = sys.path[0] + '/'
        odt_filename = utest_path + "output.odt"
        odt_write(lexical_resource, odt_filename)
        odt_file = open(odt_filename, "r")
        odt_file.readlines()
        odt_file.close()

        # Customize mapping
        def lmf2odt(lexicon, document, items, sort_order, paradigms, reverse):
            return "test"

        # Write document file and test result
        odt_write(lexical_resource, odt_filename, None, lmf2odt)
        odt_file = open(odt_filename, "r")
        odt_file.readlines()
        odt_file.close()
        del lexical_entry.lemma
        lexical_entry.lemma = None
        del lexical_entry, lexicon
        lexicon = None
        del lexical_resource
        # Remove document file
        os.remove(odt_filename)
Example #12
def get_system(name, args, schema=None, timed=False, model_path=None):
    lexicon = Lexicon(schema.values['item'])
    if name == 'rulebased':
        templates = Templates.from_pickle(args.templates)
        generator = Generator(templates)
        manager = Manager.from_pickle(args.policy)
        return RulebasedSystem(lexicon, generator, manager, timed)
    elif name == 'hybrid':
        assert model_path
        templates = Templates.from_pickle(args.templates)
        manager = PytorchNeuralSystem(args, schema, lexicon, model_path, timed)
        generator = Generator(templates)
        return HybridSystem(lexicon, generator, manager, timed)
    elif name == 'cmd':
        return CmdSystem()
    elif name == 'fb-neural':
        assert model_path
        return FBNeuralSystem(model_path, args.temperature, timed_session=timed, gpu=False)
    elif name == 'pt-neural':
        assert model_path
        return PytorchNeuralSystem(args, schema, lexicon, model_path, timed)
    else:
        raise ValueError('Unknown system %s' % name)
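
Each get_system variant in these examples dispatches on the name string and returns one of the *System classes. A minimal, hypothetical call, assuming args comes from argparse and schema is already loaded (both names are illustrative here):

# Hedged sketch: 'args' and 'schema' are assumed to exist, as in the
# surrounding code; this is not a definitive invocation.
system = get_system('rulebased', args, schema=schema, timed=False)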
Example #13
    parser = argparse.ArgumentParser()
    parser.add_argument('--transcripts',
                        nargs='*',
                        help='JSON transcripts to extract templates')
    parser.add_argument('--max-examples', default=-1, type=int)
    parser.add_argument('--templates', help='Path to load templates')
    parser.add_argument('--policy', help='Path to load model')
    parser.add_argument('--schema-path', help='Path to schema')
    parser.add_argument(
        '--agent', help='Only consider examples with the given type of agent')
    add_lexicon_arguments(parser)
    args = parser.parse_args()

    schema = Schema(args.schema_path)
    lexicon = Lexicon(schema,
                      False,
                      stop_words=args.stop_words,
                      lexicon_path=args.lexicon)
    #templates = Templates.from_pickle(args.templates)
    templates = Templates()
    manager = Manager.from_pickle(args.policy)
    analyzer = Analyzer(lexicon)

    examples = read_examples(args.transcripts, args.max_examples, Scenario)
    agent = args.agent
    if agent is not None:
        examples = [e for e in examples if agent in e.agents.values()]
    analyzer.example_stats(examples, agent=agent)
    #import sys; sys.exit()

    parsed_dialogues = []
    for example in examples:
Example #14
    def test_tex_write(self):
        import sys, os
        # Create LMF objects
        lexical_entry = LexicalEntry()
        lexical_entry.lemma = Lemma()
        lexical_entry.partOfSpeech = "toto"
        lexical_entry.status = "draft"
        lexical_entry.lemma.lexeme = "hello"
        lexicon = Lexicon()
        lexicon.add_lexical_entry(lexical_entry)
        lexical_resource = LexicalResource()
        lexical_resource.add_lexicon(lexicon)
        # Write LaTeX file and test result
        utest_path = sys.path[0] + '/'
        tex_filename = utest_path + "output.tex"
        tex_write(lexical_resource, tex_filename)
        tex_file = open(tex_filename, "r")
        begin_lines = [
            EOL,
            "\\begin{document}" + EOL,
            "\\maketitle" + EOL,
            "\\newpage" + EOL,
            EOL,
            "\\def\\mytextsc{\\bgroup\\obeyspaces\\mytextscaux}" + EOL,
            "\\def\\mytextscaux#1{\\mytextscauxii #1\\relax\\relax\\egroup}" + EOL,
            "\\def\\mytextscauxii#1{%" + EOL,
            "\\ifx\\relax#1\\else \\ifcat#1\\@sptoken{} \\expandafter\\expandafter\\expandafter\\mytextscauxii\\else" + EOL,
            "\\ifnum`#1=\\uccode`#1 {\\normalsize #1}\\else {\\footnotesize \\uppercase{#1}}\\fi \\expandafter\\expandafter\\expandafter\\mytextscauxii\\expandafter\\fi\\fi}" + EOL,
            EOL,
            "\\setlength\\parindent{0cm}" + EOL,
            EOL,
            "\\addmediapath{.}" + EOL,
            "\\addmediapath{./mp3}" + EOL,
            "\\addmediapath{./wav}" + EOL,
            "\\graphicspath{{" + os.path.abspath('.') + "/pylmflib/output/img/}}" + EOL,
            EOL,
            "\\newpage" + EOL,
            "\\begin{multicols}{2}" + EOL,
            EOL
        ]
        end_lines = ["\end{multicols}" + EOL, "\end{document}" + EOL]
        expected_lines = [
            "\\newpage" + EOL,
            "\\section*{\\centering- \\textbf{\ipa{H}} \\textbf{\ipa{h}} -}" + EOL,
            #"\\pdfbookmark[1]{\ipa{ H h }}{ H h }" + EOL,
            "\\paragraph{\\hspace{-0.5cm} \\textbf{\ipa{hello}}} \\hypertarget{01}{}" + EOL,
            "\markboth{\\textbf{\\ipa{hello}}}{}" + EOL,
            "\\textit{Status:} draft" + EOL,
            "\lhead{\\firstmark}" + EOL,
            "\\rhead{\\botmark}" + EOL,
            EOL
        ]
        self.assertListEqual(begin_lines + expected_lines + end_lines,
                             tex_file.readlines())
        tex_file.close()
        # Customize mapping
        my_lmf_tex = dict({
            "Lemma.lexeme": lambda lexical_entry: "is " + lexical_entry.get_lexeme() + "." + EOL,
            "LexicalEntry.id": lambda lexical_entry: "The lexical entry " + str(lexical_entry.get_id()) + " ",
            "LexicalEntry.partOfSpeech": lambda lexical_entry: "Its grammatical category is " + lexical_entry.get_partOfSpeech() + "." + EOL,
            "LexicalEntry.status": lambda lexical_entry: "Warning: " + lexical_entry.get_status() + " version!" + EOL
        })
        my_order = ["LexicalEntry.id", "Lemma.lexeme", "LexicalEntry.partOfSpeech", "LexicalEntry.status"]

        def lmf2tex(entry, font):
            result = ""
            for attribute in my_order:
                result += my_lmf_tex[attribute](entry)
            return result

        # Write LaTeX file and test result
        tex_write(lexical_resource, tex_filename, None, None, lmf2tex, font)
        tex_file = open(tex_filename, "r")
        expected_lines = [
            "\\newpage" + EOL,
            "\\section*{\\centering- \\textbf{\ipa{H}} \\textbf{\ipa{h}} -}" + EOL,
            #"\\pdfbookmark[1]{\ipa{ H h }}{ H h }" + EOL,
            "The lexical entry 01 is hello." + EOL,
            "Its grammatical category is toto." + EOL,
            "Warning: draft version!" + EOL,
            "\lhead{\\firstmark}" + EOL,
            "\\rhead{\\botmark}" + EOL,
            EOL
        ]
        self.assertListEqual(begin_lines + expected_lines + end_lines,
                             tex_file.readlines())
        tex_file.close()
        del lexical_entry.lemma
        lexical_entry.lemma = None
        del lexical_entry, lexicon
        lexicon = None
        del lexical_resource
        # Remove LaTeX file
        os.remove(tex_filename)
Example #15
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--transcripts',
                        nargs='*',
                        help='JSON transcripts to extract templates')
    parser.add_argument('--max-examples', default=-1, type=int)
    parser.add_argument('--templates', help='Path to load templates')
    parser.add_argument('--policy', help='Path to load model')
    parser.add_argument('--schema-path', help='Path to schema')
    parser.add_argument(
        '--agent', help='Only consider examples with the given type of agent')
    args = parser.parse_args()

    schema = Schema(args.schema_path)
    lexicon = Lexicon(schema.values['item'])
    #templates = Templates.from_pickle(args.templates)
    templates = Templates()
    manager = Manager.from_pickle(args.policy)
    analyzer = Analyzer(lexicon)

    # TODO: skip examples
    examples = read_examples(args.transcripts, args.max_examples, Scenario)
    agent = args.agent
    if agent is not None:
        examples = [e for e in examples if agent in e.agents.values()]
    analyzer.example_stats(examples, agent=agent)
    #import sys; sys.exit()

    parsed_dialogues = []
    for example in examples:
Example #16
File: mdf.py Project: yuhsianglin/HimalCo
def mdf_read(filename=None,
             mdf2lmf=mdf_lmf,
             lexicon=None,
             id=None,
             encoding=ENCODING):
    """! @brief Read an MDF file.
    @param filename The name of the MDF file to read with full path, for instance 'user/input.txt'.
    @param mdf2lmf A Python dictionary describing the mapping between MDF markers and LMF representation. Default value is 'mdf_lmf' dictionary defined in 'pylmflib/config/mdf.py'. Please refer to it as an example.
    @param lexicon An existing Lexicon to fill with lexical entries to read.
    @param id A Python string identifying the lexicon to create.
    @param encoding Use 'utf-8' encoding by default. Otherwise, the user has to specify the native encoding of the document.
    @return A Lexicon instance containing all lexical entries.
    """
    import re
    # If not provided, create a Lexicon instance to contain all lexical entries
    if lexicon is None:
        lexicon = Lexicon(id)
    # Read in unicode
    if filename is None:
        filename = lexicon.get_entrySource()
    else:
        # Set lexicon attribute
        lexicon.set_entrySource(filename)
    # Read in unicode
    mdf_file = open_read(filename, encoding=encoding)
    # MDF syntax is the following: '\marker value'
    mdf_pattern = """^\\\(\w*) (<(.*)>)? ?(.*)$"""
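    # e.g. the line '\lx hello' yields marker 'lx' and value 'hello';
    # an optional '<...>' group after the marker carries attributes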
    # Add each lexical entry to the lexicon
    current_entry = None
    sub_entry = None
    component = None
    main_entry = None
    for line in mdf_file.readlines():
        # Do not parse empty lines
        if line != EOL:
            result = re.match(mdf_pattern, line)
            if result is None:
                # Line is empty => continue parsing next line
                continue
            marker = result.group(1)
            attrs = result.group(3)
            value = result.group(4)
            # Do not consider markers starting with an underscore character (e.g. '_sh' and '_DateStampHasFourDigitYear')
            if marker[0] == '_':
                continue
            # Remove trailing spaces and end-of-line characters
            value = value.rstrip(' \r\n')
            # Do not consider empty fields
            if value == "":
                continue
            # Check if the current entry is a multiword expression
            is_mwe = False
            if marker == "lf":
                lf = value.split(" = ")
                if lf[0].startswith("Component"):
                    component_nb = lf[0].lstrip("Component")
                    value = lf[1]
                    is_mwe = True
            # 'lx' and 'se' markers indicate a new entry
            if marker == "lx" or marker == "se" or is_mwe:
                # Compute a unique identifier
                uid = uni2sampa(value)
                if marker == "se":
                    # Create a subentry
                    sub_entry = LexicalEntry(uid)
                    # An MDF subentry corresponds to an LMF lexical entry
                    mdf2lmf["lx"](value, sub_entry)
                    # Add it to the lexicon
                    lexicon.add_lexical_entry(sub_entry)
                    # Manage main entry
                    if main_entry is None:
                        main_entry = current_entry
                    else:
                        current_entry = main_entry
                    # Set main entry
                    homonym_nb = current_entry.get_homonymNumber()
                    if homonym_nb is None:
                        homonym_nb = ""
                    sub_entry.create_and_add_related_form(
                        current_entry.get_lexeme() + homonym_nb, "main entry")
                elif is_mwe:
                    # Create a subentry
                    component = LexicalEntry(uid)
                    # An MDF subentry corresponds to an LMF lexical entry
                    mdf2lmf["lx"](value, component)
                    # Add it to the lexicon
                    lexicon.add_lexical_entry(component)
                    # Manage current entry
                    if sub_entry is not None:
                        current_entry = sub_entry
                    # Set component
                    homonym_nb = current_entry.get_homonymNumber()
                    if homonym_nb is None:
                        homonym_nb = ""
                    current_entry.create_and_add_component(component_nb, value)
                    component.create_and_add_related_form(
                        current_entry.get_lexeme() + homonym_nb,
                        "complex predicate")
                    component.set_independentWord(False)
                else:
                    # Create a new entry
                    current_entry = LexicalEntry(uid)
                    # Add it to the lexicon
                    lexicon.add_lexical_entry(current_entry)
                    # Reset main entry
                    main_entry = None
            # Map MDF marker and value to LMF representation
            try:
                if attrs is not None:
                    # There are attributes
                    attributes = {}
                    # Remove quotation marks from attributes if any
                    attrs = attrs.replace('"', '')
                    for attr in attrs.split(' '):
                        attributes.update(
                            {attr.split('=')[0]: attr.split('=')[1]})
                    # A customized marker starts with '__' characters
                    mdf2lmf["__" + marker](attributes, value, current_entry)
                else:
                    mdf2lmf[marker](value, current_entry)
                if sub_entry is not None:
                    current_entry = sub_entry
                    sub_entry = None
                if component is not None:
                    sub_entry = current_entry
                    current_entry = component
                    component = None
            except KeyError:
                # When printing, we need to convert 'unicode' into 'str' using 'utf-8' encoding:
                print Warning(
                    "MDF marker '%s' encountered for lexeme '%s' is not defined in configuration"
                    % (marker.encode(ENCODING),
                       current_entry.get_lexeme().encode(ENCODING)))
            except Error as exception:
                exception.handle()
    mdf_file.close()
    return lexicon
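
A minimal usage sketch for mdf_read; the file name and lexicon id below are illustrative only, and it assumes the pylmflib configuration (mdf_lmf, ENCODING) is loaded as in the module above:

# Hypothetical usage: 'dict.txt' and 'my_lexicon' are illustrative names.
lexicon = mdf_read("dict.txt", id="my_lexicon")
print lexicon.count_lexical_entries()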
Example #17
    return parsed_utterances

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--transcripts', nargs='*', help='JSON transcripts to extract templates')
    parser.add_argument('--max-examples', default=-1, type=int)
    parser.add_argument('--templates', help='Path to load templates')
    parser.add_argument('--templates-output', help='Path to save templates')
    parser.add_argument('--model', help='Path to load model')
    parser.add_argument('--model-output', help='Path to save the dialogue manager model')
    parser.add_argument('--schema-path', help='Path to schema')
    add_lexicon_arguments(parser)
    args = parser.parse_args()

    schema = Schema(args.schema_path)
    lexicon = Lexicon(schema, args.learned_lex, stop_words=args.stop_words, lexicon_path=args.lexicon)
    examples = read_examples(args.transcripts, args.max_examples, Scenario)
    parsed_dialogues = []
    templates = Templates()

    for idx, example in enumerate(examples):
        utterances = parse_example(example, lexicon, templates)
        parsed_dialogues.append(utterances)

    #sample_intents(parsed_dialogues, "unknown", 30)
    #intent_breakdown(parsed_dialogues)

    templates.finalize()
    templates.save(args.templates_output)
    templates.dump(n=10)
Example #18
    parser.add_argument('--transcripts',
                        nargs='*',
                        help='JSON transcripts to extract templates')
    parser.add_argument('--max-examples', default=-1, type=int)
    parser.add_argument('--templates', help='Path to load templates')
    parser.add_argument('--templates-output', help='Path to save templates')
    parser.add_argument('--model', help='Path to load model')
    parser.add_argument('--model-output',
                        help='Path to save the dialogue manager model')
    args = parser.parse_args()

    examples = read_examples(args.transcripts, args.max_examples, Scenario)
    parsed_dialogues = []
    templates = Templates()

    lexicon = Lexicon(['ball', 'hat', 'book'])
    for example in examples:
        utterances = parse_example(example, lexicon, templates)
        parsed_dialogues.append(utterances)

    templates.finalize()
    templates.save(args.templates_output)
    templates.dump(n=10)

    # Train n-gram model
    sequences = []
    for d in parsed_dialogues:
        sequences.append([u.lf.intent for u in d])

    manager = Manager.from_train(sequences)
    manager.save(args.model_output)
Example #19
 def test_tex_write(self):
     import sys, os
     # Create LMF objects
     lexical_entry = LexicalEntry()
     lexical_entry.lemma = Lemma()
     lexical_entry.partOfSpeech = "toto"
     lexical_entry.status = "draft"
     lexical_entry.lemma.lexeme = "hello"
     lexicon = Lexicon()
     lexicon.add_lexical_entry(lexical_entry)
     lexical_resource = LexicalResource()
     lexical_resource.add_lexicon(lexicon)
     # Write LaTeX file and test result
     utest_path = sys.path[0] + '/'
     tex_filename = utest_path + "output.tex"
     tex_write(lexical_resource, tex_filename)
     tex_file = open(tex_filename, "r")
     begin_lines = [EOL,
         "\\begin{document}" + EOL,
         "\\maketitle" + EOL,
         "\\newpage" + EOL,
         EOL,
         "\\def\\mytextsc{\\bgroup\\obeyspaces\\mytextscaux}" + EOL,
         "\\def\\mytextscaux#1{\\mytextscauxii #1\\relax\\relax\\egroup}" + EOL,
         "\\def\\mytextscauxii#1{%" + EOL,
         "\\ifx\\relax#1\\else \\ifcat#1\\@sptoken{} \\expandafter\\expandafter\\expandafter\\mytextscauxii\\else" + EOL,
         "\\ifnum`#1=\\uccode`#1 {\\normalsize #1}\\else {\\footnotesize \\uppercase{#1}}\\fi \\expandafter\\expandafter\\expandafter\\mytextscauxii\\expandafter\\fi\\fi}" + EOL,
         EOL,
         "\\setlength\\parindent{0cm}" + EOL,
         EOL,
         "\\addmediapath{.}" + EOL,
         "\\addmediapath{./mp3}" + EOL,
         "\\addmediapath{./wav}" + EOL,
         "\\graphicspath{{" + os.path.abspath('.') + "/pylmflib/output/img/}}" + EOL,
         EOL,
         "\\newpage" + EOL,
         "\\begin{multicols}{2}" + EOL,
         EOL
     ]
     end_lines = [
         "\end{multicols}" + EOL,
         "\end{document}" + EOL
     ]
     expected_lines = [
         "\\newpage" + EOL,
         "\\section*{\\centering- \\textbf{\ipa{H}} \\textbf{\ipa{h}} -}" + EOL,
         #"\\pdfbookmark[1]{\ipa{ H h }}{ H h }" + EOL,
         "\\paragraph{\\hspace{-0.5cm} \\textbf{\ipa{hello}}} \\hypertarget{01}{}" + EOL,
         "\markboth{\\textbf{\\ipa{hello}}}{}" + EOL,
         "\\textit{Status:} draft" + EOL,
         "\lhead{\\firstmark}" + EOL,
         "\\rhead{\\botmark}" + EOL,
         EOL
     ]
     self.assertListEqual(begin_lines + expected_lines + end_lines, tex_file.readlines())
     tex_file.close()
     # Customize mapping
     my_lmf_tex = dict({
         "Lemma.lexeme" : lambda lexical_entry: "is " + lexical_entry.get_lexeme() + "." + EOL,
         "LexicalEntry.id" : lambda lexical_entry: "The lexical entry " + str(lexical_entry.get_id()) + " ",
         "LexicalEntry.partOfSpeech" : lambda lexical_entry: "Its grammatical category is " + lexical_entry.get_partOfSpeech() + "." + EOL,
         "LexicalEntry.status" : lambda lexical_entry: "Warning: " + lexical_entry.get_status() + " version!" + EOL
     })
     my_order = ["LexicalEntry.id", "Lemma.lexeme", "LexicalEntry.partOfSpeech", "LexicalEntry.status"]
     def lmf2tex(entry, font):
         result = ""
         for attribute in my_order:
             result += my_lmf_tex[attribute](entry)
         return result
     # Write LaTeX file and test result
     tex_write(lexical_resource, tex_filename, None, None, lmf2tex, font)
     tex_file = open(tex_filename, "r")
     expected_lines = [
         "\\newpage" + EOL,
         "\\section*{\\centering- \\textbf{\ipa{H}} \\textbf{\ipa{h}} -}" + EOL,
         #"\\pdfbookmark[1]{\ipa{ H h }}{ H h }" + EOL,
         "The lexical entry 01 is hello." + EOL,
         "Its grammatical category is toto." + EOL,
         "Warning: draft version!" + EOL,
         "\lhead{\\firstmark}" + EOL,
         "\\rhead{\\botmark}" + EOL,
         EOL
         ]
     self.assertListEqual(begin_lines + expected_lines + end_lines, tex_file.readlines())
     tex_file.close()
     del lexical_entry.lemma
     lexical_entry.lemma = None
     del lexical_entry, lexicon
     lexicon = None
     del lexical_resource
     # Remove LaTeX file
     os.remove(tex_filename)
Example #20
class TestLexiconFunctions(unittest.TestCase):
    def setUp(self):
        # Instantiate a Lexicon object
        self.lexicon = Lexicon()

    def tearDown(self):
        # Release instantiated objects
        del self.lexicon

    def test_init(self):
        self.assertIsNone(self.lexicon.language)
        self.assertIsNone(self.lexicon.languageScript)
        self.assertIsNone(self.lexicon.id)
        self.assertIsNone(self.lexicon.label)
        self.assertIsNone(self.lexicon.lexiconType)
        self.assertIsNone(self.lexicon.entrySource)
        self.assertIsNone(self.lexicon.vowelHarmony)
        self.assertListEqual(self.lexicon.lexical_entry, [])
        self.assertIsNone(self.lexicon.localPath)

    def test_set_id(self):
        id = "English lexicon"
        self.assertEqual(self.lexicon.set_id(id), self.lexicon)
        self.assertEqual(self.lexicon.id, id)

    def test_get_id(self):
        self.assertIs(self.lexicon.get_id(), self.lexicon.id)

    def test_set_language(self):
        language = "eng"
        self.assertEqual(self.lexicon.set_language(language), self.lexicon)
        self.assertEqual(self.lexicon.language, language)

    def test_get_language(self):
        self.assertIs(self.lexicon.get_language(), self.lexicon.language)

    def test_set_languageScript(self):
        script = "latn"
        self.assertEqual(self.lexicon.set_languageScript(script), self.lexicon)
        self.assertEqual(self.lexicon.languageScript, script)

    def test_get_languageScript(self):
        self.assertIs(self.lexicon.get_languageScript(),
                      self.lexicon.languageScript)

    def test_set_label(self):
        label = "online dictionary"
        self.assertEqual(self.lexicon.set_label(label), self.lexicon)
        self.assertEqual(self.lexicon.label, label)

    def test_get_label(self):
        self.assertIs(self.lexicon.get_label(), self.lexicon.label)

    def test_set_lexiconType(self):
        type = "bilingual dictionary"
        self.assertEqual(self.lexicon.set_lexiconType(type), self.lexicon)
        self.assertEqual(self.lexicon.lexiconType, type)

    def test_get_lexiconType(self):
        self.assertIs(self.lexicon.get_lexiconType(), self.lexicon.lexiconType)

    def test_set_entrySource(self):
        source = "test.txt"
        self.assertEqual(self.lexicon.set_entrySource(source), self.lexicon)
        self.assertEqual(self.lexicon.entrySource, source)

    def test_get_entrySource(self):
        self.assertIs(self.lexicon.get_entrySource(), self.lexicon.entrySource)

    def test_set_vowelHarmony(self):
        test = False
        try:
            self.lexicon.set_vowelHarmony(None)
        except NotImplementedError:
            test = True
        self.assertTrue(test)

    def test_get_vowelHarmony(self):
        test = False
        try:
            self.lexicon.get_vowelHarmony()
        except NotImplementedError:
            test = True
        self.assertTrue(test)

    def test_set_localPath(self):
        path = "/full/local/path/to/audio/files/"
        self.assertEqual(self.lexicon.set_localPath(path), self.lexicon)
        self.assertEqual(self.lexicon.localPath, path)

    def test_get_localPath(self):
        self.assertIs(self.lexicon.get_localPath(), self.lexicon.localPath)

    def test_get_lexical_entries(self):
        # Create lexical entries
        entry1 = LexicalEntry()
        entry2 = LexicalEntry()
        # Add entries to the lexicon
        self.lexicon.lexical_entry = [entry1, entry2]
        # Test get lexical entries
        self.assertListEqual(self.lexicon.get_lexical_entries(),
                             [entry1, entry2])
        self.lexicon.lexical_entry.append(entry1)
        self.assertListEqual(self.lexicon.get_lexical_entries(),
                             [entry1, entry2, entry1])
        # Release LexicalEntry instances
        del self.lexicon.lexical_entry[:]
        del entry1, entry2

    def test_add_lexical_entry(self):
        # Create lexical entries
        entry1 = LexicalEntry()
        entry2 = LexicalEntry()
        # Test add entries to the lexicon
        self.assertEqual(self.lexicon.add_lexical_entry(entry1), self.lexicon)
        self.assertListEqual(self.lexicon.lexical_entry, [entry1])
        self.assertEqual(self.lexicon.add_lexical_entry(entry2), self.lexicon)
        self.assertListEqual(self.lexicon.lexical_entry, [entry1, entry2])
        # Release LexicalEntry instances
        del self.lexicon.lexical_entry[:]
        del entry1, entry2

    def test_remove_lexical_entry(self):
        # Create lexical entries
        entry1 = LexicalEntry()
        entry2 = LexicalEntry()
        # Add entries to the lexicon
        self.lexicon.lexical_entry = [entry1, entry2]
        # Test remove lexical entries
        self.assertEqual(self.lexicon.remove_lexical_entry(entry1),
                         self.lexicon)
        self.assertListEqual(self.lexicon.lexical_entry, [entry2])
        self.assertEqual(self.lexicon.remove_lexical_entry(entry2),
                         self.lexicon)
        self.assertListEqual(self.lexicon.lexical_entry, [])
        # Release LexicalEntry instances
        del entry1, entry2

    def test_count_lexical_entries(self):
        # Create lexical entries
        entry1 = LexicalEntry()
        entry2 = LexicalEntry()
        # Add entries to the lexicon
        self.lexicon.lexical_entry = [entry1]
        # Test count lexical entries
        self.assertEqual(self.lexicon.count_lexical_entries(), 1)
        self.lexicon.lexical_entry.append(entry2)
        self.assertEqual(self.lexicon.count_lexical_entries(), 2)
        self.lexicon.lexical_entry.append(entry1)
        self.assertEqual(self.lexicon.count_lexical_entries(), 3)
        # Release LexicalEntry instances
        del self.lexicon.lexical_entry[:]
        del entry1, entry2

    def test_sort_homonym_numbers(self):
        # Create several lexical entries
        entry1 = LexicalEntry().set_lexeme("aa").set_homonymNumber("2")
        entry2 = LexicalEntry().set_lexeme("aa").set_homonymNumber("1")
        entry3 = LexicalEntry().set_lexeme("ab")
        entry4 = LexicalEntry().set_lexeme("ba")
        entry5 = LexicalEntry().set_lexeme("bb").set_homonymNumber("6")
        entry6 = LexicalEntry().set_lexeme("bb").set_homonymNumber("5")
        # Add entries to the lexicon
        self.lexicon.lexical_entry = [
            entry1, entry2, entry3, entry4, entry5, entry6
        ]
        # Test sort homonym numbers
        self.assertListEqual(self.lexicon.sort_homonym_numbers(),
                             [entry2, entry1, entry3, entry4, entry6, entry5])
        self.assertListEqual(self.lexicon.lexical_entry,
                             [entry2, entry1, entry3, entry4, entry6, entry5])
        # Release LexicalEntry instances
        del self.lexicon.lexical_entry[:]
        del entry1, entry2, entry3, entry4, entry5, entry6

    def test_sort_lexical_entries(self):
        # Create several lexical entries with different lexemes
        entry1 = LexicalEntry().set_lexeme("aa")
        entry2 = LexicalEntry().set_lexeme("ab")
        entry3 = LexicalEntry().set_lexeme("ba")
        entry4 = LexicalEntry().set_lexeme("bb")
        # Add entries to the lexicon
        self.lexicon.lexical_entry = [entry4, entry1, entry2, entry3]
        # Test sort lexical entries
        self.assertListEqual(self.lexicon.sort_lexical_entries(),
                             [entry1, entry2, entry3, entry4])
        self.assertListEqual(self.lexicon.lexical_entry,
                             [entry1, entry2, entry3, entry4])
        # Provide a sort order
        my_order = dict({'A': 1.1, 'a': 1.2, 'B': 2.1, 'b': 2.2})
        my_unicode_order = ({})
        for key in my_order.keys():
            my_unicode_order.update(
                {key.decode(encoding='utf8'): my_order[key]})
        entry5 = LexicalEntry().set_lexeme("Aa")
        entry6 = LexicalEntry().set_lexeme("bB")
        self.lexicon.lexical_entry.append(entry5)
        self.lexicon.lexical_entry.append(entry6)
        self.assertListEqual(
            self.lexicon.sort_lexical_entries(sort_order=my_order),
            [entry5, entry1, entry2, entry3, entry6, entry4])
        self.assertListEqual(self.lexicon.lexical_entry,
                             [entry5, entry1, entry2, entry3, entry6, entry4])
        # Release LexicalEntry instances
        del self.lexicon.lexical_entry[:]
        del entry1, entry2, entry3, entry4, entry5, entry6

    def test_find_lexical_entries(self):
        # Create several lexical entries with different lexemes
        entry1 = LexicalEntry().set_lexeme("Hello")
        entry2 = LexicalEntry().set_lexeme("world!")
        entry3 = LexicalEntry().set_lexeme("hello")
        entry4 = LexicalEntry().set_lexeme("world")
        # Add entries to the lexicon
        self.lexicon.lexical_entry = [entry1, entry2, entry3, entry4]
        # Test find lexical entries
        self.assertListEqual(
            self.lexicon.find_lexical_entries(
                lambda entry: entry.get_lexeme() == "Hello"), [entry1])

        def test_filter(entry):
            return entry.get_lexeme().lower() == "hello"

        # List is randomly ordered => create a set to avoid random results
        self.assertEqual(set(self.lexicon.find_lexical_entries(test_filter)),
                         set([entry1, entry3]))
        # Release LexicalEntry instances
        del self.lexicon.lexical_entry[:]
        del entry1, entry2, entry3, entry4

    def test_check_cross_references(self):
        # Create lexical entries with lexemes and related lexemes
        entry1 = LexicalEntry().set_lexeme(
            "Hello").create_and_add_related_form("world!", "main entry")
        entry2 = LexicalEntry().set_lexeme(
            "world!").create_and_add_related_form("Hello", "subentry")
        # Add entries to the lexicon
        self.lexicon.lexical_entry = [entry1, entry2]
        # Test check cross references
        self.assertIs(self.lexicon.check_cross_references(), self.lexicon)
        self.assertIs(entry1.related_form[0].get_lexical_entry(), entry2)
        self.assertIs(entry2.related_form[0].get_lexical_entry(), entry1)
        # Test warning case: entry not found
        entry3 = LexicalEntry().set_lexeme(
            "hello").create_and_add_related_form("world", "main entry")
        self.lexicon.lexical_entry.append(entry3)
        self.lexicon.reset_check()
        self.lexicon.check_cross_references()
        # Retrieve nominal case
        entry4 = LexicalEntry().set_lexeme("world")
        self.lexicon.lexical_entry.append(entry4)
        self.lexicon.reset_check()
        self.assertIs(self.lexicon.check_cross_references(), self.lexicon)
        self.assertIs(entry3.related_form[0].get_lexical_entry(), entry4)
        # Test warning case: several entries found
        entry5 = LexicalEntry().set_lexeme("world")
        self.lexicon.lexical_entry.append(entry5)
        self.lexicon.reset_check()
        self.lexicon.check_cross_references()
        # Test check cross references with homonym number
        entry3.related_form[0].set_lexical_entry(None)
        entry3.related_form[0].targets = "world2"
        entry4.homonymNumber = "1"
        entry5.homonymNumber = "2"
        self.lexicon.reset_check()
        self.assertIs(self.lexicon.check_cross_references(), self.lexicon)
        self.assertIs(entry3.related_form[0].get_lexical_entry(), entry5)
        # Release LexicalEntry instances
        del self.lexicon.lexical_entry[:]
        del entry1, entry2, entry3, entry4, entry5

    def test_convert_to_latex(self):
        pass
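
The setter tests above each assert that set_* returns the Lexicon itself, so calls chain fluently. A minimal sketch using only setters exercised by these tests (the values are illustrative):

# Sketch only: relies on the chaining behavior asserted in the tests above.
lexicon = Lexicon().set_id("English lexicon").set_language("eng").set_label("online dictionary")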
Example #21
 def setUp(self):
     # Instantiate a Lexicon object
     self.lexicon = Lexicon()
Example #22
def config_read(filename):
    """! @brief Read an XML file giving the user configuration.
    @param filename The name of the XML file to read with full path, for instance 'pylmflib/pylmflib/config/default/config.xml'.
    @return A Lexical Resource.
    """
    import os
    import config.xml
    configuration = parse_xml(filename)
    # Parse XML elements
    for format in configuration:
        if format.tag == "Language":
            # XML element "Language" has several XML subelements "lang"
            for lang in format:
                # XML elements "lang" have 2 XML attributes: one for the nature of the language ("att"), a second for the language code ("val")
                exec("config.xml." + lang.attrib["att"] + " = '" + lang.attrib["val"] + "'")
        elif format.tag == "Font":
            config.xml.font = dict()
            # XML element "Font" has several XML subelements "font"
            for font in format:
                # XML elements "font" have 2 XML attributes: one for the nature of the language ("att"), a second for the variable name ("var")
                exec("l = lambda " + font.attrib['var'] + ": " + font.text)
                config.xml.font.update({font.attrib['att']: l})
        elif format.tag == "LMF":
            # Create lexical resource and set DTD version
            lexical_resource = LexicalResource(format[0].attrib["dtdVersion"])
            for object in format[0]:
                if object.tag == "GlobalInformation":
                    # Set global information
                    for feat in object:
                        if feat.attrib["att"] == "languageCode":
                            lexical_resource.set_language_code(feat.attrib["val"])
                        elif feat.attrib["att"] == "author":
                            lexical_resource.set_author(feat.attrib["val"])
                        elif feat.attrib["att"] == "version":
                            lexical_resource.set_version(feat.attrib["val"])
                        elif feat.attrib["att"] == "lastUpdate":
                            lexical_resource.set_last_update(feat.attrib["val"])
                        elif feat.attrib["att"] == "license":
                            lexical_resource.set_license(feat.attrib["val"])
                        elif feat.attrib["att"] == "characterEncoding":
                            lexical_resource.set_character_encoding(feat.attrib["val"])
                        elif feat.attrib["att"] == "dateCoding":
                            lexical_resource.set_date_coding(feat.attrib["val"])
                        elif feat.attrib["att"] == "creationDate":
                            lexical_resource.set_creation_date(feat.attrib["val"])
                        elif feat.attrib["att"] == "projectName":
                            lexical_resource.set_project_name(feat.attrib["val"])
                        elif feat.attrib["att"] == "description":
                            lexical_resource.set_description(feat.attrib["val"])
                elif object.tag == "Lexicon":
                    # Create lexicon and set identifier
                    lexicon = Lexicon(object.attrib["id"])
                    # Set lexicon attributes
                    for feat in object:
                        if feat.attrib["att"] == "language":
                            lexicon.set_language(feat.attrib["val"])
                        elif feat.attrib["att"] == "languageScript":
                            lexicon.set_languageScript(feat.attrib["val"])
                        elif feat.attrib["att"] == "label":
                            lexicon.set_label(feat.attrib["val"])
                        elif feat.attrib["att"] == "lexiconType":
                            lexicon.set_lexiconType(feat.attrib["val"])
                        elif feat.attrib["att"] == "entrySource":
                            lexicon.set_entrySource(feat.attrib["val"])
                        elif feat.attrib["att"] == "localPath":
                            lexicon.set_localPath(feat.attrib["val"])
                            # Set absolute path to audio files
                            config.xml.audio_path = os.path.abspath(os.path.abspath('.') + "/" + feat.attrib["val"]) + "/"
                    # Attach lexicon to the lexical resource
                    lexical_resource.add_lexicon(lexicon)
        elif format.tag == "MDF":
            for mdf in format:
                if mdf.tag == "mdf_lmf":
                    # XML elements "mdf_lmf" have 2 XML attributes: one for the name of the marker ("marker"), a second for the variable name ("var")
                    exec("l = lambda " + mdf.attrib['var'] + ": " + mdf.text)
                    mdf_lmf.update({mdf.attrib['marker']: l})
                elif mdf.tag == "ps_partOfSpeech":
                    # XML elements "ps_partOfSpeech" have 2 XML attributes: one for the MDF value ("ps"), a second for the LMF value ("partOfSpeech")
                    ps_partOfSpeech.update({mdf.attrib['ps']: mdf.attrib['partOfSpeech']})
                    # Also automatically update range of possible values allowed for LMF part of speech LexicalEntry attribute
                    partOfSpeech_range.add(mdf.attrib['partOfSpeech'])
                    # And automatically update the reverse operation
                    partOfSpeech_tex.update({mdf.attrib['partOfSpeech']: mdf.attrib['ps']})
                elif mdf.tag == "pdl_paradigmLabel":
                    # XML elements "pdl_paradigmLabel" have 2 XML attributes: one for the MDF value ("pdl"), a second for the LMF value ("paradigmLabel")
                    pdl_paradigmLabel.update({mdf.attrib['pdl']: mdf.attrib['paradigmLabel']})
                    # Also automatically update range of possible values allowed for LMF paradigm label Paradigm attribute
                    paradigmLabel_range.add(mdf.attrib['paradigmLabel'])
                    # And automatically update the reverse operation
                    paradigmLabel_tex.update({mdf.attrib['paradigmLabel']: mdf.attrib['pdl']})
                elif mdf.tag == "lmf_mdf":
                    # XML elements "lmf_mdf" have 2 XML attributes: one for the name of the marker ("marker"), a second for the variable name ("var")
                    exec("l = lambda " + mdf.attrib['var'] + ": " + mdf.text)
                    lmf_mdf.update({mdf.attrib['marker']: l})
                elif mdf.tag == "mdf_order":
                    mdf_order = []
                    for element in mdf:
                        mdf_order.append(element.tag)
                        list1 = []
                        for level1 in element:
                            list1.append(level1.tag)
                            list2 = []
                            for level2 in level1:
                                list2.append(level2.tag)
                            if len(list2) != 0:
                                list1.append(list2)
                        if len(list1) != 0:
                            mdf_order.append(list1)
        elif format.tag == "LaTeX":
            for param in format:
                if param.tag == "partOfSpeech_tex":
                    # XML elements "partOfSpeech_tex" have 2 or 3 XML attributes: one for the LMF value ("partOfSpeech"), a second for the LaTeX value ("tex"), and an optional one to define language
                    try:
                        partOfSpeech_tex.update({(param.attrib['lang'], param.attrib['partOfSpeech']): param.attrib['tex']})
                    except KeyError:
                        partOfSpeech_tex.update({param.attrib['partOfSpeech']: param.attrib['tex']})
                    # Also automatically update range of possible values allowed for LMF part of speech LexicalEntry attribute
                    partOfSpeech_range.add(param.attrib['partOfSpeech'])
                elif param.tag == "paradigmLabel_tex":
                    # XML elements "paradigmLabel_tex" have 2 XML attributes: one for the LMF value ("paradigmLabel"), a second for the LaTeX value ("tex")
                    paradigmLabel_tex.update({param.attrib['paradigmLabel']: param.attrib['tex']})
                    # Also automatically update range of possible values allowed for LMF paradigm label Paradigm attribute
                    paradigmLabel_range.add(param.attrib['paradigmLabel'])
        else:
            raise InputError(module_name + ".py", "XML file '%s' is not well-formatted." % filename)
    return lexical_resource
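
A hedged usage sketch for config_read, reusing the example path from its docstring; get_lexicons() and get_id() behave as shown in Examples #2 and #20:

# Sketch only: the configuration path comes from the docstring above.
lexical_resource = config_read("pylmflib/pylmflib/config/default/config.xml")
for lexicon in lexical_resource.get_lexicons():
    print lexicon.get_id()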
Example #23
 def setUp(self):
     # Instantiate a Lexicon object
     self.lexicon = Lexicon()
Example #24
File: mdf.py Project: buret/pylmflib
def mdf_read(filename=None, mdf2lmf=mdf_lmf, lexicon=None, id=None, encoding=ENCODING):
    """! @brief Read an MDF file.
    @param filename The name of the MDF file to read with full path, for instance 'user/input.txt'.
    @param mdf2lmf A Python dictionary describing the mapping between MDF markers and LMF representation. Default value is 'mdf_lmf' dictionary defined in 'pylmflib/config/mdf.py'. Please refer to it as an example.
    @param lexicon An existing Lexicon to fill with lexical entries to read.
    @param id A Python string identifying the lexicon to create.
    @param encoding Use 'utf-8' encoding by default. Otherwise, the user has to specify the native encoding of the document.
    @return A Lexicon instance containing all lexical entries.
    """
    import re
    # If not provided, create a Lexicon instance to contain all lexical entries
    if lexicon is None:
        lexicon = Lexicon(id)
    # Read in unicode
    if filename is None:
        filename = lexicon.get_entrySource()
    else:
        # Set lexicon attribute
        lexicon.set_entrySource(filename)
    # Read in unicode
    mdf_file = open_read(filename, encoding=encoding)
    # MDF syntax is the following: '\marker value'
    mdf_pattern = """^\\\(\w*) (<(.*)>)? ?(.*)$"""
    # Add each lexical entry to the lexicon
    current_entry = None
    sub_entry = None
    component = None
    main_entry = None
    for line in mdf_file.readlines():
        # Do not parse empty lines
        if line != EOL:
            result = re.match(mdf_pattern, line)
            if result is None:
                # Line is empty => continue parsing next line
                continue
            marker = result.group(1)
            attrs = result.group(3)
            value = result.group(4)
            # Do not consider markers starting with an underscore character (e.g. '_sh' and '_DateStampHasFourDigitYear')
            if marker[0] == '_':
                continue
            # Remove trailing spaces and end-of-line characters
            value = value.rstrip(' \r\n')
            # Do not consider empty fields
            if value == "":
                continue
            # Check if the current entry is a multiword expression
            is_mwe = False
            if marker == "lf":
                lf = value.split(" = ")
                if lf[0].startswith("Component"):
                    component_nb = lf[0].lstrip("Component")
                    value = lf[1]
                    is_mwe = True
            # 'lx' and 'se' markers indicate a new entry
            if marker == "lx" or marker == "se" or is_mwe:
                # Compute a unique identifier
                uid = uni2sampa(value)
                if marker == "se":
                    # Create a subentry
                    sub_entry = LexicalEntry(uid)
                    # An MDF subentry corresponds to an LMF lexical entry
                    mdf2lmf["lx"](value, sub_entry)
                    # Add it to the lexicon
                    lexicon.add_lexical_entry(sub_entry)
                    # Manage main entry
                    if main_entry is None:
                        main_entry = current_entry
                    else:
                        current_entry = main_entry
                    # Set main entry
                    homonym_nb = current_entry.get_homonymNumber()
                    if homonym_nb is None:
                        homonym_nb = ""
                    sub_entry.create_and_add_related_form(current_entry.get_lexeme() + homonym_nb, "main entry")
                elif is_mwe:
                    # Create a subentry
                    component = LexicalEntry(uid)
                    # An MDF subentry corresponds to an LMF lexical entry
                    mdf2lmf["lx"](value, component)
                    # Add it to the lexicon
                    lexicon.add_lexical_entry(component)
                    # Manage current entry
                    if sub_entry is not None:
                        current_entry = sub_entry
                    # Set component
                    homonym_nb = current_entry.get_homonymNumber()
                    if homonym_nb is None:
                        homonym_nb = ""
                    current_entry.create_and_add_component(component_nb, value)
                    component.create_and_add_related_form(current_entry.get_lexeme() + homonym_nb, "complex predicate")
                    component.set_independentWord(False)
                else:
                    # Create a new entry
                    current_entry = LexicalEntry(uid)
                    # Add it to the lexicon
                    lexicon.add_lexical_entry(current_entry)
                    # Reset main entry
                    main_entry = None
            # Map MDF marker and value to LMF representation
            try:
                if attrs is not None:
                    # There are attributes
                    attributes = {}
                    # Remove quotation marks from attributes if any
                    attrs = attrs.replace('"', '')
                    for attr in attrs.split(' '):
                        attributes.update({attr.split('=')[0] : attr.split('=')[1]})
                    # A customized marker starts with '__' characters
                    mdf2lmf["__" + marker](attributes, value, current_entry)
                else:
                    mdf2lmf[marker](value, current_entry)
                if sub_entry is not None:
                    current_entry = sub_entry
                    sub_entry = None
                if component is not None:
                    sub_entry = current_entry
                    current_entry = component
                    component = None
            except KeyError:
                # When printing, we need to convert 'unicode' into 'str' using 'utf-8' encoding:
                print Warning("MDF marker '%s' encountered for lexeme '%s' is not defined in configuration" % (marker.encode(ENCODING), current_entry.get_lexeme().encode(ENCODING)))
            except Error as exception:
                exception.handle()
    mdf_file.close()
    return lexicon
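
For orientation, here is a minimal sketch of the MDF input this loop consumes, with a hypothetical call to the enclosing reader (the name mdf_read and its signature are assumptions, since the fragment above starts mid-function):

# Sample MDF fragment; markers are backslash-prefixed, one field per line:
#   \lx hello               -> opens a new main entry
#   \se hello there         -> opens a subentry linked back to "hello"
#   \lf Component1 = hello  -> records component 1 of a multiword expression
lexicon = mdf_read("dictionary.mdf")
print lexicon.count_lexical_entries()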
Example #25
    # (The original example is truncated here; the parser construction and the
    # '--transcripts' and '--lexicon' arguments are reconstructed assumptions,
    # '--lexicon' because args.lexicon is read below.)
    parser = argparse.ArgumentParser()
    parser.add_argument('--transcripts',
                        nargs='*',
                        help='JSON transcripts to extract templates')
    parser.add_argument('--max-examples', default=-1, type=int)
    parser.add_argument('--lexicon', help='Path to load lexicon')
    parser.add_argument('--templates', help='Path to load templates')
    parser.add_argument('--reviews', help='Path to load reviews')
    parser.add_argument('--templates-output', help='Path to save templates')
    parser.add_argument('--model', help='Path to load model')
    parser.add_argument('--model-output',
                        help='Path to save the dialogue manager model')
    args = parser.parse_args()

    examples = read_examples(args.transcripts, args.max_examples, Scenario)
    parsed_dialogues = []
    templates = Templates()

    lexicon = Lexicon.from_pickle(args.lexicon)

    for example in examples:
        utterances = parse_example(example, lexicon, templates)
        parsed_dialogues.append(utterances)

    # Train n-gram model
    sequences = []
    for d in parsed_dialogues:
        sequences.append([u.lf.intent for u in d])
    manager = Manager.from_train(sequences)
    manager.save(args.model_output)

    if args.reviews:
        print 'read reviews from', args.reviews
        templates.read_reviews(args.reviews)
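
A plausible shell invocation of this script (the script name and file paths are hypothetical; the flags are those defined above):

python parse_dialogue.py --transcripts transcripts.json --lexicon lexicon.pkl \
    --templates-output templates.pkl --model-output manager.pkl --reviews reviews.json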
Example #26
class TestLexiconFunctions(unittest.TestCase):

    def setUp(self):
        # Instantiate a Lexicon object
        self.lexicon = Lexicon()

    def tearDown(self):
        # Release instantiated objects
        del self.lexicon

    def test_init(self):
        self.assertIsNone(self.lexicon.language)
        self.assertIsNone(self.lexicon.languageScript)
        self.assertIsNone(self.lexicon.id)
        self.assertIsNone(self.lexicon.label)
        self.assertIsNone(self.lexicon.lexiconType)
        self.assertIsNone(self.lexicon.entrySource)
        self.assertIsNone(self.lexicon.vowelHarmony)
        self.assertListEqual(self.lexicon.lexical_entry, [])
        self.assertIsNone(self.lexicon.localPath)

    def test_set_id(self):
        id = "English lexicon"
        self.assertEqual(self.lexicon.set_id(id), self.lexicon)
        self.assertEqual(self.lexicon.id, id)
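
    # Note: setters in this API return self (asserted throughout these tests),
    # which enables the chained construction style used further down, e.g.
    #   LexicalEntry().set_lexeme("aa").set_homonymNumber("2")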

    def test_get_id(self):
        self.assertIs(self.lexicon.get_id(), self.lexicon.id)

    def test_set_language(self):
        language = "eng"
        self.assertEqual(self.lexicon.set_language(language), self.lexicon)
        self.assertEqual(self.lexicon.language, language)

    def test_get_language(self):
        self.assertIs(self.lexicon.get_language(), self.lexicon.language)

    def test_set_languageScript(self):
        script = "latn"
        self.assertEqual(self.lexicon.set_languageScript(script), self.lexicon)
        self.assertEqual(self.lexicon.languageScript, script)

    def test_get_languageScript(self):
        self.assertIs(self.lexicon.get_languageScript(), self.lexicon.languageScript)

    def test_set_label(self):
        label = "online dictionary"
        self.assertEqual(self.lexicon.set_label(label), self.lexicon)
        self.assertEqual(self.lexicon.label, label)

    def test_get_label(self):
        self.assertIs(self.lexicon.get_label(), self.lexicon.label)

    def test_set_lexiconType(self):
        type = "bilingual dictionary"
        self.assertEqual(self.lexicon.set_lexiconType(type), self.lexicon)
        self.assertEqual(self.lexicon.lexiconType, type)

    def test_get_lexiconType(self):
        self.assertIs(self.lexicon.get_lexiconType(), self.lexicon.lexiconType)

    def test_set_entrySource(self):
        source = "test.txt"
        self.assertEqual(self.lexicon.set_entrySource(source), self.lexicon)
        self.assertEqual(self.lexicon.entrySource, source)

    def test_get_entrySource(self):
        self.assertIs(self.lexicon.get_entrySource(), self.lexicon.entrySource)

    def test_set_vowelHarmony(self):
        # 'vowelHarmony' is not handled yet => the setter must raise
        self.assertRaises(NotImplementedError, self.lexicon.set_vowelHarmony, None)

    def test_get_vowelHarmony(self):
        # 'vowelHarmony' is not handled yet => the getter must raise
        self.assertRaises(NotImplementedError, self.lexicon.get_vowelHarmony)

    def test_set_localPath(self):
        path = "/full/local/path/to/audio/files/"
        self.assertEqual(self.lexicon.set_localPath(path), self.lexicon)
        self.assertEqual(self.lexicon.localPath, path)

    def test_get_localPath(self):
        self.assertIs(self.lexicon.get_localPath(), self.lexicon.localPath)

    def test_get_lexical_entries(self):
        # Create lexical entries
        entry1 = LexicalEntry()
        entry2 = LexicalEntry()
        # Add entries to the lexicon
        self.lexicon.lexical_entry = [entry1, entry2]
        # Test get lexical entries
        self.assertListEqual(self.lexicon.get_lexical_entries(), [entry1, entry2])
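        # Duplicates are not filtered: the same entry may be listed twice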
        self.lexicon.lexical_entry.append(entry1)
        self.assertListEqual(self.lexicon.get_lexical_entries(), [entry1, entry2, entry1])
        # Release LexicalEntry instances
        del self.lexicon.lexical_entry[:]
        del entry1, entry2

    def test_add_lexical_entry(self):
        # Create lexical entries
        entry1 = LexicalEntry()
        entry2 = LexicalEntry()
        # Test add entries to the lexicon
        self.assertEqual(self.lexicon.add_lexical_entry(entry1), self.lexicon)
        self.assertListEqual(self.lexicon.lexical_entry, [entry1])
        self.assertEqual(self.lexicon.add_lexical_entry(entry2), self.lexicon)
        self.assertListEqual(self.lexicon.lexical_entry, [entry1, entry2])
        # Release LexicalEntry instances
        del self.lexicon.lexical_entry[:]
        del entry1, entry2

    def test_remove_lexical_entry(self):
        # Create lexical entries
        entry1 = LexicalEntry()
        entry2 = LexicalEntry()
        # Add entries to the lexicon
        self.lexicon.lexical_entry = [entry1, entry2]
        # Test remove lexical entries
        self.assertEqual(self.lexicon.remove_lexical_entry(entry1), self.lexicon)
        self.assertListEqual(self.lexicon.lexical_entry, [entry2])
        self.assertEqual(self.lexicon.remove_lexical_entry(entry2), self.lexicon)
        self.assertListEqual(self.lexicon.lexical_entry, [])
        # Release LexicalEntry instances
        del entry1, entry2

    def test_count_lexical_entries(self):
        # Create lexical entries
        entry1 = LexicalEntry()
        entry2 = LexicalEntry()
        # Add entries to the lexicon
        self.lexicon.lexical_entry = [entry1]
        # Test count lexical entries
        self.assertEqual(self.lexicon.count_lexical_entries(), 1)
        self.lexicon.lexical_entry.append(entry2)
        self.assertEqual(self.lexicon.count_lexical_entries(), 2)
        self.lexicon.lexical_entry.append(entry1)
        self.assertEqual(self.lexicon.count_lexical_entries(), 3)
        # Release LexicalEntry instances
        del self.lexicon.lexical_entry[:]
        del entry1, entry2

    def test_sort_homonym_numbers(self):
        # Create several lexical entries
        entry1 = LexicalEntry().set_lexeme("aa").set_homonymNumber("2")
        entry2 = LexicalEntry().set_lexeme("aa").set_homonymNumber("1")
        entry3 = LexicalEntry().set_lexeme("ab")
        entry4 = LexicalEntry().set_lexeme("ba")
        entry5 = LexicalEntry().set_lexeme("bb").set_homonymNumber("6")
        entry6 = LexicalEntry().set_lexeme("bb").set_homonymNumber("5")
        # Add entries to the lexicon
        self.lexicon.lexical_entry = [entry1, entry2, entry3, entry4, entry5, entry6]
        # Test sort homonym numbers
        self.assertListEqual(self.lexicon.sort_homonym_numbers(), [entry2, entry1, entry3, entry4, entry6, entry5])
        self.assertListEqual(self.lexicon.lexical_entry, [entry2, entry1, entry3, entry4, entry6, entry5])
        # Release LexicalEntry instances
        del self.lexicon.lexical_entry[:]
        del entry1, entry2, entry3, entry4, entry5, entry6

    def test_sort_lexical_entries(self):
        # Create several lexical entries with different lexemes
        entry1 = LexicalEntry().set_lexeme("aa")
        entry2 = LexicalEntry().set_lexeme("ab")
        entry3 = LexicalEntry().set_lexeme("ba")
        entry4 = LexicalEntry().set_lexeme("bb")
        # Add entries to the lexicon
        self.lexicon.lexical_entry = [entry4, entry1, entry2, entry3]
        # Test sort lexical entries
        self.assertListEqual(self.lexicon.sort_lexical_entries(), [entry1, entry2, entry3, entry4])
        self.assertListEqual(self.lexicon.lexical_entry, [entry1, entry2, entry3, entry4])
        # Provide a custom sort order: lower weight sorts first
        my_order = {'A': 1.1, 'a': 1.2, 'B': 2.1, 'b': 2.2}
        # Build the same order with unicode keys (lexemes may be unicode in Python 2)
        my_unicode_order = {}
        for key in my_order.keys():
            my_unicode_order[key.decode(encoding='utf8')] = my_order[key]
        entry5 = LexicalEntry().set_lexeme("Aa")
        entry6 = LexicalEntry().set_lexeme("bB")
        self.lexicon.lexical_entry.append(entry5)
        self.lexicon.lexical_entry.append(entry6)
        self.assertListEqual(self.lexicon.sort_lexical_entries(sort_order=my_order), [entry5, entry1, entry2, entry3, entry6, entry4])
        self.assertListEqual(self.lexicon.lexical_entry, [entry5, entry1, entry2, entry3, entry6, entry4])
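        # With these weights a capital sorts just before its lowercase pair
        # ('A' 1.1 < 'a' 1.2 < 'B' 2.1 < 'b' 2.2), so "Aa" comes first and
        # "bB" lands between "ba" and "bb"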
        # Release LexicalEntry instances
        del self.lexicon.lexical_entry[:]
        del entry1, entry2, entry3, entry4, entry5, entry6

    def test_find_lexical_entries(self):
        # Create several lexical entries with different lexemes
        entry1 = LexicalEntry().set_lexeme("Hello")
        entry2 = LexicalEntry().set_lexeme("world!")
        entry3 = LexicalEntry().set_lexeme("hello")
        entry4 = LexicalEntry().set_lexeme("world")
        # Add entries to the lexicon
        self.lexicon.lexical_entry = [entry1, entry2, entry3, entry4]
        # Test find lexical entries
        self.assertListEqual(self.lexicon.find_lexical_entries(lambda entry: entry.get_lexeme() == "Hello"), [entry1])
        def test_filter(entry):
            return entry.get_lexeme().lower() == "hello"
        # The result order is unspecified => compare as sets to avoid flaky results
        self.assertEqual(set(self.lexicon.find_lexical_entries(test_filter)), set([entry1, entry3]))
        # Release LexicalEntry instances
        del self.lexicon.lexical_entry[:]
        del entry1, entry2, entry3, entry4

    def test_check_cross_references(self):
        # Create lexical entries with lexemes and related lexemes
        entry1 = LexicalEntry().set_lexeme("Hello").create_and_add_related_form("world!", "main entry")
        entry2 = LexicalEntry().set_lexeme("world!").create_and_add_related_form("Hello", "subentry")
        # Add entries to the lexicon
        self.lexicon.lexical_entry = [entry1, entry2]
        # Test check cross references
        self.assertIs(self.lexicon.check_cross_references(), self.lexicon)
        self.assertIs(entry1.related_form[0].get_lexical_entry(), entry2)
        self.assertIs(entry2.related_form[0].get_lexical_entry(), entry1)
        # Test warning case: entry not found
        entry3 = LexicalEntry().set_lexeme("hello").create_and_add_related_form("world", "main entry")
        self.lexicon.lexical_entry.append(entry3)
        self.lexicon.reset_check()
        self.lexicon.check_cross_references()
        # Back to the nominal case: add the missing target entry
        entry4 = LexicalEntry().set_lexeme("world")
        self.lexicon.lexical_entry.append(entry4)
        self.lexicon.reset_check()
        self.assertIs(self.lexicon.check_cross_references(), self.lexicon)
        self.assertIs(entry3.related_form[0].get_lexical_entry(), entry4)
        # Test warning case: several entries found
        entry5 = LexicalEntry().set_lexeme("world")
        self.lexicon.lexical_entry.append(entry5)
        self.lexicon.reset_check()
        self.lexicon.check_cross_references()
        # Test check cross references with homonym number
        entry3.related_form[0].set_lexical_entry(None)
        entry3.related_form[0].targets = "world2"
        entry4.homonymNumber = "1"
        entry5.homonymNumber = "2"
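        # The target "world2" denotes lexeme "world" with homonym number "2",
        # so the cross reference must now resolve unambiguously to entry5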
        self.lexicon.reset_check()
        self.assertIs(self.lexicon.check_cross_references(), self.lexicon)
        self.assertIs(entry3.related_form[0].get_lexical_entry(), entry5)
        # Release LexicalEntry instances
        del self.lexicon.lexical_entry[:]
        del entry1, entry2, entry3, entry4, entry5

    def test_convert_to_latex(self):
        pass
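
Taken together, the accessors exercised above support a fluent construction style; a minimal sketch that reuses only methods asserted in these tests:

lexicon = Lexicon("English lexicon").set_language("eng").set_languageScript("latn")
lexicon.add_lexical_entry(LexicalEntry().set_lexeme("hello"))
print lexicon.count_lexical_entries()  # => 1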