def merge(in_fname, new_fname, out_fname): in_file = codecs.open(in_fname, 'r', 'utf-8') existing_words = set() for line in in_file: try: word = line.split(' ', 1)[0] except: print line existing_words.add(word) new_file = codecs.open(new_fname, 'r', 'utf-8') new_words = set() for line in new_file: word = line.strip() new_words.add(word) new_words -= existing_words out_file = codecs.open(out_fname, 'w', 'utf-8') get_word_freq = get_search_engine('baidu') for w in new_words: py = word2pinyin(w) while True: try: freq = get_word_freq(w) except: continue else: break print w, py, freq print >> out_file, w, py, freq
def merge(in_fname, new_fname, out_fname): in_file = codecs.open(in_fname, 'r', 'utf-8') existing_words = set() for line in in_file: word, py, freq = line.split() existing_words.add(word) new_file = codecs.open(new_fname, 'r', 'utf-8') new_words = set() for line in new_file: word = line.strip() new_words.add(word) new_words -= existing_words out_file = codecs.open(out_fname, 'w', 'utf-8') get_word_freq = get_search_engine('baidu') for w in new_words: py = word2pinyin(w) while True: try: freq = get_word_freq(w) except: continue else: break print w, py, freq print >> out_file, w, py, freq
def extract_using_crf():
    """Command-line driver: segment input files with a CRF model and
    funnel the resulting words through a WordExtractor into a database.
    """
    data_dir = '../data'
    model_path = os.path.join(data_dir, 'model', 'pku-6-tags.model')

    parser = OptionParser()
    parser.add_option("-s", "--search-engine", dest="search_engine")
    parser.add_option("-i", "--input")
    parser.add_option("-o", "--output", default="./newords.db")
    parser.add_option("-m", "--model", default=model_path)
    parser.add_option("-v", "--verbose", action="store_true", default=False)
    opts, args = parser.parse_args()

    # -i takes precedence; otherwise every positional argument is an input.
    input_files = [opts.input] if opts.input else args

    if opts.verbose:
        logging.basicConfig(level=logging.INFO,
                            format="%(levelname)s: %(message)s")

    # Only build a frequency-lookup callable when an engine was requested.
    freq_lookup = None
    if opts.search_engine is not None:
        freq_lookup = get_search_engine(opts.search_engine)

    extractor = WordExtractor(get_word_freq=freq_lookup,
                              output_file=opts.output)
    baseseg.process(opts.model, input_files=input_files,
                    dump_func=extractor)
    logging.info("%d words added" % extractor.n_added)
    logging.info("%d words killed" % extractor.n_killed)
#!/usr/bin/env python # -*- coding: utf-8 -*- from __future__ import with_statement import codecs import operator from optparse import OptionParser from search_filter import get_search_engine from pinyin import word2pinyin get_word_freq = get_search_engine('baidu') def merge(in_fname, new_fname, out_fname): in_file = codecs.open(in_fname, 'r', 'utf-8') existing_words = set() for line in in_file: try: word = line.split(' ', 1)[0] except: print line existing_words.add(word) new_file = codecs.open(new_fname, 'r', 'utf-8') new_words = set() for line in new_file: word = line.strip() new_words.add(word) new_words -= existing_words out_file = codecs.open(out_fname, 'w', 'utf-8')
#!/usr/bin/env python # -*- coding: utf-8 -*- from __future__ import with_statement import codecs import operator from optparse import OptionParser from search_filter import get_search_engine from pinyin import word2pinyin get_word_freq = get_search_engine('baidu') def merge(in_fname, new_fname, out_fname): in_file = codecs.open(in_fname, 'r', 'utf-8') existing_words = set() for line in in_file: word, py, freq = line.split() existing_words.add(word) new_file = codecs.open(new_fname, 'r', 'utf-8') new_words = set() for line in new_file: word = line.strip() new_words.add(word) new_words -= existing_words out_file = codecs.open(out_fname, 'w', 'utf-8') get_word_freq = get_search_engine('baidu') for w in new_words:  # NOTE(review): chunk is cut off mid-definition -- the `for w in new_words:` loop has no body here; see the complete merge() elsewhere in this file before editing