示例#1
0
def merge(in_fname, new_fname, out_fname):
    in_file = codecs.open(in_fname, 'r', 'utf-8')
    existing_words = set()
    for line in in_file:
        try:
            word = line.split(' ', 1)[0]
        except:
            print line
        existing_words.add(word)
        
    new_file = codecs.open(new_fname, 'r', 'utf-8')
    new_words = set()
    for line in new_file:
        word = line.strip()
        new_words.add(word)

    new_words -= existing_words

    out_file = codecs.open(out_fname, 'w', 'utf-8')
    get_word_freq = get_search_engine('baidu')
    for w in new_words:
        py = word2pinyin(w)
        while True:
            try:
                freq = get_word_freq(w)
            except:
                continue
            else:
                break
        print w, py, freq
        print >> out_file, w, py, freq
示例#2
0
def merge(in_fname, new_fname, out_fname):
    in_file = codecs.open(in_fname, 'r', 'utf-8')
    existing_words = set()
    for line in in_file:
        word, py, freq = line.split()
        existing_words.add(word)

    new_file = codecs.open(new_fname, 'r', 'utf-8')
    new_words = set()
    for line in new_file:
        word = line.strip()
        new_words.add(word)

    new_words -= existing_words

    out_file = codecs.open(out_fname, 'w', 'utf-8')
    get_word_freq = get_search_engine('baidu')
    for w in new_words:
        py = word2pinyin(w)
        while True:
            try:
                freq = get_word_freq(w)
            except:
                continue
            else:
                break
        print w, py, freq
        print >> out_file, w, py, freq
示例#3
0
def extract_using_crf():
    """Command-line driver: run ``baseseg.process`` with a ``WordExtractor``.

    Options are parsed from the command line with ``OptionParser``:

      -s, --search-engine  name handed to ``get_search_engine``; when the
                           option is absent the extractor receives
                           ``get_word_freq=None``.
      -i, --input          a single input file; when absent, the positional
                           arguments are used as the input files.
      -o, --output         passed to ``WordExtractor`` as ``output_file``
                           (default ``./newords.db``).
      -m, --model          first argument of ``baseseg.process``; the default
                           is ``os.path.join('../data', 'model',
                           'pku-6-tags.model')``.
      -v, --verbose        configure logging at INFO level.

    After processing, the extractor's ``n_added`` and ``n_killed`` counters
    are reported through ``logging.info``.
    """
    data_root = '../data'
    model_default = os.path.join(data_root, 'model', 'pku-6-tags.model')

    cli = OptionParser()
    cli.add_option("-s", "--search-engine", dest="search_engine")
    cli.add_option("-i", "--input")
    cli.add_option("-o", "--output", default="./newords.db")
    cli.add_option("-m", "--model", default=model_default)
    cli.add_option("-v", "--verbose", action="store_true", default=False)
    options, positional = cli.parse_args()

    # An explicit -i takes precedence over positional file names.
    sources = positional
    if options.input:
        sources = [options.input]

    if options.verbose:
        logging.basicConfig(format="%(levelname)s: %(message)s",
                            level=logging.INFO)

    # The frequency lookup is optional; None is passed through when no
    # search engine was requested.
    freq_lookup = None
    if options.search_engine is not None:
        freq_lookup = get_search_engine(options.search_engine)

    extractor = WordExtractor(get_word_freq=freq_lookup,
                              output_file=options.output)
    baseseg.process(options.model,
                    input_files=sources,
                    dump_func=extractor)

    added_msg = "%d words added" % extractor.n_added
    logging.info(added_msg)
    killed_msg = "%d words killed" % extractor.n_killed
    logging.info(killed_msg)
示例#4
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import with_statement
import codecs
import operator
from optparse import OptionParser
from search_filter import get_search_engine
from pinyin import word2pinyin

# Lookup object for the 'baidu' search engine, created once at import time.
# NOTE(review): presumably a callable that maps a word to its frequency --
# confirm against search_filter.get_search_engine.
get_word_freq = get_search_engine('baidu')

def merge(in_fname, new_fname, out_fname):
    """Collect the words of ``new_fname`` that are absent from ``in_fname``.

    in_fname  -- UTF-8 text file; the text before the first space of each
                 line is recorded as an existing word.
    new_fname -- UTF-8 text file; each stripped line is a candidate word.
    out_fname -- UTF-8 file opened for writing (truncated on open).

    NOTE(review): the visible body stops right after opening ``out_fname``
    and writes nothing to it; the rest of the function may lie outside this
    excerpt -- confirm against the full file.
    """
    in_file = codecs.open(in_fname, 'r', 'utf-8')
    existing_words = set()
    for line in in_file:
        try:
            # Text before the first space. For a line without any space this
            # is the whole line, including its trailing newline.
            word = line.split(' ', 1)[0]
        except:
            # NOTE(review): split() always returns at least one element, so
            # this handler looks unreachable; if it ever ran, `word` below
            # would be stale or unbound.
            print line
        existing_words.add(word)
        
    new_file = codecs.open(new_fname, 'r', 'utf-8')
    new_words = set()
    for line in new_file:
        # Candidates are stripped of surrounding whitespace; a blank line
        # therefore contributes the empty string.
        word = line.strip()
        new_words.add(word)

    # Keep only the candidates that are not already known.
    new_words -= existing_words

    # NOTE(review): none of the three files is closed in the visible code.
    out_file = codecs.open(out_fname, 'w', 'utf-8')
示例#5
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import with_statement
import codecs
import operator
from optparse import OptionParser
from search_filter import get_search_engine
from pinyin import word2pinyin

# Lookup object for the 'baidu' search engine, created once at import time.
# NOTE(review): merge() below assigns a local of the same name from the same
# call, so this module-level binding is shadowed there -- confirm whether
# both are needed.
get_word_freq = get_search_engine('baidu')


def merge(in_fname, new_fname, out_fname):
    in_file = codecs.open(in_fname, 'r', 'utf-8')
    existing_words = set()
    for line in in_file:
        word, py, freq = line.split()
        existing_words.add(word)

    new_file = codecs.open(new_fname, 'r', 'utf-8')
    new_words = set()
    for line in new_file:
        word = line.strip()
        new_words.add(word)

    new_words -= existing_words

    out_file = codecs.open(out_fname, 'w', 'utf-8')
    get_word_freq = get_search_engine('baidu')
    for w in new_words: