示例#1
0
def train(filename, parser):
    """Accumulate log-scaled word/tag weights for one file and cache them.

    ``parser(filename)`` is iterated for ``(tag_id_list, txt)`` pairs.  Every
    pair with non-blank text and at least one tag adds ``1 + log(count)`` to
    each (word, tag) cell, where ``count`` is the number of times the word
    occurs in that text after segmentation.  The resulting
    ``{word: {tag_id: weight}}`` mapping is written under ``CACHE_PATH``
    using the base name of ``filename``.  Nothing is done when that cache
    file already exists.
    """
    cache_path = join(CACHE_PATH, basename(filename))
    if exists(cache_path):
        return

    weights_by_word = {}
    for tag_ids, text in parser(filename):
        if not text.strip():
            continue

        tags = set(tag_ids)
        if not tags:
            continue

        # Add the PTAG entries of each directly assigned tag.  Only the
        # original tags are looked up (one level), hence the snapshot.
        for tag in list(tags):
            tags.update(PTAG.get(tag, ()))

        occurrences = defaultdict(int)
        for word in seg_txt(utf8_ftoj(str(text))):
            occurrences[word] += 1

        for word, count in occurrences.iteritems():
            # The weight depends only on the word's count in this text, so
            # it is computed once and shared by all tags.
            weight = 1 + log(float(count))
            per_tag = weights_by_word.setdefault(word, {})
            for tag in tags:
                per_tag[tag] = per_tag.get(tag, 0) + weight

    tofromfile.tofile(cache_path, weights_by_word)
示例#2
0
def train(filename, parser):
    """Build and cache the word -> tag weight table for ``filename``.

    For each ``(tag_id_list, txt)`` pair produced by ``parser(filename)``
    that has non-blank text and a non-empty tag list, every segmented word
    contributes ``1 + log(n)`` to each of the pair's tags (``n`` being the
    word's occurrence count in that text).  The tag set is first extended
    with the ``PTAG`` entries of the listed tags.  The table is saved with
    ``tofromfile.tofile`` at ``CACHE_PATH/<basename of filename>``; if that
    file already exists the function returns immediately.
    """
    target = join(CACHE_PATH, basename(filename))
    if exists(target):
        return

    table = {}
    for raw_tag_ids, raw_text in parser(filename):
        if not raw_text.strip():
            continue

        tag_ids = set(raw_tag_ids)
        if not tag_ids:
            continue

        # Iterate over a snapshot so the set can grow while we walk it;
        # entries added here are not themselves looked up in PTAG.
        for listed in list(tag_ids):
            tag_ids.update(PTAG.get(listed, ()))

        words = list(seg_txt(utf8_ftoj(str(raw_text))))
        freq = defaultdict(int)
        for word in words:
            freq[word] += 1

        for word, n in freq.iteritems():
            row = table.get(word)
            if row is None:
                row = table[word] = {}
            gain = 1 + log(float(n))
            for tag_id in tag_ids:
                row[tag_id] = row.get(tag_id, 0) + gain

    tofromfile.tofile(target, table)
示例#3
0
def main():
    """Merge the available ``.idf`` files into one ``Df`` and write the result.

    The candidates are ``zhihu.js``, ``review.txt`` and everything matching
    ``wanfang/Periodical_*`` under ``ZDATA_PATH_TRAIN_IDF``.  A candidate
    whose ``.idf`` file does not exist is skipped.  The merged data is
    serialised with ``idf_dumps`` and written to ``ZDATA_PATH/data/idf``,
    creating the ``data`` directory when it is missing.
    """
    df = Df()

    def absorb(name):
        # Fold "<name>.idf" into df when that file exists; otherwise do nothing.
        idf_path = join(ZDATA_PATH_TRAIN_IDF, "%s.idf" % name)
        if exists(idf_path):
            # Parenthesised form prints the same text under Python 2.
            print(idf_path)
            df.extend_by_file(idf_path)

    for name in ("zhihu.js", "review.txt"):
        absorb(name)

    # NOTE(review): glob() yields paths that include the directory part of
    # the pattern, so absorb() joins a path rather than a bare name here.
    # Confirm that "<match>.idf" is the intended location.
    for match in glob(join(ZDATA_PATH_TRAIN_IDF, "wanfang", "Periodical_*")):
        absorb(match)

    out_dir = join(ZDATA_PATH, "data")
    if not exists(out_dir):
        makedirs(out_dir)

    destination = join(out_dir, "idf")
    tofile(destination, idf_dumps(df._count, df._df))
示例#4
0
def main():
    """Combine the per-corpus ``.idf`` files and store the merged table.

    ``zhihu.js`` and ``review.txt`` are tried first, then every match of
    ``wanfang/Periodical_*`` below ``ZDATA_PATH_TRAIN_IDF``.  Missing
    ``.idf`` files are ignored.  The output of ``idf_dumps`` for the merged
    ``Df`` is written to the ``idf`` file inside ``ZDATA_PATH/data``, which
    is created if absent.
    """
    df = Df()

    def merge_one(source):
        # Resolve the .idf file for this source; skip it when absent.
        candidate = join(ZDATA_PATH_TRAIN_IDF, "%s.idf" % source)
        if not exists(candidate):
            return
        # print(x) with a single argument behaves like the Python 2 statement.
        print(candidate)
        df.extend_by_file(candidate)

    merge_one("zhihu.js")
    merge_one("review.txt")

    # NOTE(review): the glob matches already carry their directory, and
    # merge_one() joins them onto ZDATA_PATH_TRAIN_IDF again before adding
    # ".idf".  Verify that this resolves to the files that were intended.
    periodical_pattern = join(ZDATA_PATH_TRAIN_IDF, "wanfang", "Periodical_*")
    for hit in glob(periodical_pattern):
        merge_one(hit)

    data_dir = join(ZDATA_PATH, "data")
    if not exists(data_dir):
        makedirs(data_dir)

    idf_file = join(data_dir, "idf")
    serialised = idf_dumps(df._count, df._df)
    tofile(idf_file, serialised)
示例#5
0
 def tofile(self, path):
     """Write ``self._dict`` to ``path``.

     The bare ``tofile`` called here is the helper visible from the
     enclosing module scope, not this method.
     """
     payload = self._dict
     tofile(path, payload)
示例#6
0
 def tofile(self):
     """Write ``tag2id``, ``word2id`` and the generated pairs under ``DATA_DIR``.

     Three files are produced, named ``tag2id``, ``word2id`` and
     ``word_id2tag_id``.
     """
     # Materialise the generator before anything is written.
     # NOTE(review): presumably it fills tag2id / word2id as it runs, so
     # this must stay ahead of the writes below - confirm.
     pairs = list(self.txt_tag_generator())

     out_dir = DATA_DIR
     # Each mapping is stored in a file named after its attribute.
     for attr in ("tag2id", "word2id"):
         getattr(self, attr).tofile(join(out_dir, attr))

     tofile(join(out_dir, "word_id2tag_id"), pairs)
示例#7
0
 def tofile(self, f):
     """Write ``(self._count, dict copy of self._df)`` to ``f``.

     ``f`` is handed unchanged to the ``tofile`` helper.
     """
     count = self._count
     # Copy the items into a built-in dict before writing.
     # NOTE(review): presumably _df is a dict subclass or custom mapping
     # that the serialiser cannot handle directly - confirm.
     df_items = dict(self._df.iteritems())
     tofile(f, (count, df_items))
示例#8
0
 def tofile(self, path):
     """Save the wrapped ``_dict`` at ``path`` using the ``tofile`` helper."""
     # Inside a method body the bare name resolves to the enclosing
     # module scope, so this is not a recursive call.
     contents = self._dict
     tofile(path, contents)
示例#9
0
 def tofile(self):
     """Dump ``tag2id``, ``word2id`` and the generated pair list into ``DATA_DIR``.

     Output file names: ``tag2id``, ``word2id`` and ``word_id2tag_id``.
     """
     # The generator is consumed in full before any file is written.
     # NOTE(review): it looks like it populates the two id mappings as a
     # side effect - verify before reordering.
     generated = list(self.txt_tag_generator())

     base = DATA_DIR
     tag_file = join(base, 'tag2id')
     self.tag2id.tofile(tag_file)
     word_file = join(base, 'word2id')
     self.word2id.tofile(word_file)

     pairs_file = join(base, 'word_id2tag_id')
     tofile(pairs_file, generated)
示例#10
0
 def tofile(self, f):
     """Write the pair ``(_count, plain-dict copy of _df)`` to ``f``."""
     total = self._count
     # Rebuild _df as a built-in dict from its items before writing it.
     plain_df = dict(self._df.iteritems())
     record = (total, plain_df)
     tofile(f, record)