Exemplo n.º 1
0
def prepare_set_of_lexicon():
    """Generate the base, cutoff and refined-cutoff lexicons and write each to disk.

    The base lexicon is generated from ``TRAINING_FILE`` and written to
    ``LEX_FILE``. The two derived lexicons are both generated from the base
    lexicon plus the same external word-list file, and are written to
    ``CUTOFF_LEX_FILE`` and ``REFINED_CUTOFF_LEX_FILE`` respectively.

    NOTE(review): ``english.stop.txt`` is presumably a stop-word list, and
    the exact cutoff/refinement rules live in the ``lex`` helpers -- confirm
    there.
    """
    # Both derived lexicons read the same external word-list file.
    stop_list_path = './files_from_outside/english.stop.txt'

    # Base lexicon: generated from the training text, written first.
    base_lexicon = lex.generate_lexicon_from_textfile(TRAINING_FILE)
    lex.write_lexicon_to_file(LEX_FILE, base_lexicon)

    # Cutoff lexicon, generated from the base lexicon.
    lex.write_lexicon_to_file(
        CUTOFF_LEX_FILE,
        lex.generate_cutoff_lexicon(base_lexicon, stop_list_path))

    # Refined cutoff lexicon, also generated from the base lexicon.
    lex.write_lexicon_to_file(
        REFINED_CUTOFF_LEX_FILE,
        lex.generate_refined_cutoff_lexicon(base_lexicon, stop_list_path))
Exemplo n.º 2
0
def read_base_lex(filename):
    """Read ``filename`` and return its lines as a list of stripped strings.

    Every line of the file yields exactly one entry, in file order, with
    leading and trailing whitespace (including the newline) removed. Blank
    lines are kept and appear as empty strings.
    """
    with open(filename) as source:
        return [raw_line.strip() for raw_line in source]

def write_lexicon_to_file(filename, lexicon):
    """Write a lexicon to ``filename`` as a two-column symbol table.

    Each output line holds an entry and its integer index, separated by a
    tab character. The special symbols ``<epsilon>`` and ``<unk>`` always
    come first (indices 0 and 1); the lexicon entries follow, numbered
    from 2 in iteration order.

    Args:
        filename: Path of the file to create or overwrite.
        lexicon: Any iterable of strings (list, set, tuple, ...). It is
            not modified. For an unordered container such as a set, the
            order of the entries in the output is the container's
            iteration order.

    An empty-string entry is dropped before writing; if a list contains
    several, only the first one is removed.
    """
    # Work on a private list copy: this accepts sets as well as lists
    # (a set cannot be concatenated to a list) and leaves the caller's
    # container untouched.
    entries = list(lexicon)

    # Remove the empty-string entry, if there is one.
    if '' in entries:
        entries.remove('')

    special_lex = ['<epsilon>', '<unk>']

    with open(filename, 'w') as output:
        for index, entry in enumerate(special_lex + entries):
            output.write('%s\t%d\n' % (entry, index))

if __name__ == '__main__':
    # Script entry point: generate the lexicon from the training text ...
    training_lexicon = lex.generate_lexicon_from_textfile(TRAINING_FILE)
    # ... and write it out to the lexicon file.
    lex.write_lexicon_to_file(LEX_FILE, training_lexicon)