Exemplo n.º 1
0
def step_3(training_file):
    """Build a word -> set-of-concepts mapping from a training file.

    Each line is stripped and split on runs of spaces. Lines with fewer
    than three fields are ignored. The second field is taken as the word
    and the third as its concept; a concept of 'null' is replaced by the
    word itself.

    After the file is read, two entries are set unconditionally (replacing
    anything collected from the file under the same key):
      * 'null'  -> {'null'}
      * '<unk>' -> whatever def_con.all_possible_concepts() returns, with
                   'null' added to it.

    Returns the resulting dict.
    """
    mapping = dict()

    with open(training_file) as handle:
        for raw_line in handle:
            fields = re.split('[ ]+', raw_line.strip())
            if len(fields) < 3:
                continue

            token = fields[1]
            # a 'null' tag means the token stands for itself
            tag = token if fields[2] == 'null' else fields[2]
            mapping.setdefault(token, set()).add(tag)

    # special case: 'null' only ever maps to 'null'
    mapping['null'] = {'null'}

    # unknown words may correspond to any concept, or to 'null'
    # NOTE(review): add() mutates the object returned by
    # all_possible_concepts() in place -- presumably a fresh set; confirm.
    unknown_targets = def_con.all_possible_concepts()
    unknown_targets.add('null')
    mapping['<unk>'] = unknown_targets

    return mapping
Exemplo n.º 2
0
def write_lexicon_to_file(filename, lexicon):
    """Write a symbol table for ``lexicon`` to ``filename``.

    Every output line holds a symbol, a tab, and an integer id. Ids start
    at 0 and increase by one per line. '<epsilon>' is always written first
    (id 0); the remaining symbols follow in sorted order.

    The symbols written are the union of:
      * ``lexicon`` with the empty string removed,
      * the fixed entries '<unk>', '<s>', '</s>' and 'null',
      * everything returned by def_con.all_possible_concepts().

    Args:
        filename: path of the file to create or overwrite.
        lexicon: a set of symbols. The empty string, if present, is
            removed from this set in place.
    """
    # discard() is a no-op when '' is absent, so nothing needs to be caught
    # (the previous bare ``except`` hid every error, not just a missing key).
    lexicon.discard('')

    special_lex = ['<epsilon>']
    irregular_lex = ['<unk>', '<s>', '</s>', 'null']
    symbols = lexicon.union(irregular_lex, def_con.all_possible_concepts())

    # Set iteration order is not stable between runs, so sort to make the
    # symbol -> id assignment reproducible.
    with open(filename, 'w') as output:
        for index, lex in enumerate(special_lex + sorted(symbols)):
            output.write('%s\t%d\n' % (lex, index))