def step_3(training_file):
    """Build a word -> set-of-concepts mapping from a training file.

    Each line is stripped and split on runs of spaces.  Lines with fewer
    than three fields are ignored.  For the remaining lines the second
    field is the word and the third field is its concept; a concept of
    'null' is replaced by the word itself.

    Two extra entries are always set on the result:
      * 'null'  -> {'null'}
      * '<unk>' -> the set returned by def_con.all_possible_concepts(),
                   with 'null' added to it (the returned set is modified
                   in place).

    Returns the resulting dict.
    """
    mapping = {}
    with open(training_file) as handle:
        for raw_line in handle:
            fields = re.split('[ ]+', raw_line.strip())
            if len(fields) < 3:
                continue
            token = fields[1]
            # A 'null' concept means the token stands for itself.
            label = token if fields[2] == 'null' else fields[2]
            mapping.setdefault(token, set()).add(label)

    # Special case: 'null' always maps only to itself, overriding anything
    # collected from the file.
    mapping['null'] = {'null'}

    # Unknown words may correspond to any concept, including 'null'.
    unknown_targets = def_con.all_possible_concepts()
    unknown_targets.add('null')
    mapping['<unk>'] = unknown_targets
    return mapping
def write_lexicon_to_file(filename, lexicon):
    """Write a symbol table with one ``symbol<TAB>index`` entry per line.

    Args:
        filename: Path of the file to create or overwrite.
        lexicon: Iterable (normally a set) of symbol strings.  It is not
            modified; a copy is taken before any clean-up.

    The written table always starts with '<epsilon>' at index 0.  It is
    followed by the union of ``lexicon``, the irregular symbols
    '<unk>', '<s>', '</s>' and 'null', and everything returned by
    def_con.all_possible_concepts().  The empty string is never written.
    Symbols after '<epsilon>' are emitted in sorted order so that the
    assigned indices are reproducible from run to run.
    """
    # Work on a copy so the caller's collection is left untouched.
    symbols = set(lexicon)
    symbols.update(('<unk>', '<s>', '</s>', 'null'))
    symbols.update(def_con.all_possible_concepts())

    # The empty string is not a valid symbol.
    symbols.discard('')
    # '<epsilon>' is written explicitly as the first entry; drop any copy
    # here so it cannot appear a second time with a different index.
    symbols.discard('<epsilon>')

    with open(filename, 'w') as output:
        for index, symbol in enumerate(['<epsilon>'] + sorted(symbols)):
            output.write('%s\t%d\n' % (symbol, index))