def main():
    """Train the text tagger on tagged items and dump the model to disk.

    Steps, in order:

    1. Iterate over everything returned by ``get_mongo('item').find()``.
       Items whose ``'tag'`` value is empty are skipped; for the rest the
       rendered description and its labels are collected, and every label
       occurrence is counted.
    2. Keep only labels whose occurrence count lies strictly between 0.15%
       and 10% of the total number of label occurrences.
    3. Shuffle the collected documents, truncate them to ``MAX_DATA_SIZE``
       and pass them through ``filter_data_label`` with the kept labels.
    4. Fit a ``TextTagger`` on the result and dump it to
       ``ukconfig.tagger_path``.

    Progress information is written to stdout.  Every ``print`` is a
    single-argument call so the output is the same on Python 2 and 3.
    """
    label_cnt = defaultdict(int)
    data = []
    for item in get_mongo('item').find():
        desc = ItemDescBase.deserialize(item['desc'])
        labels = item['tag']
        if not labels:
            continue
        for label in labels:
            label_cnt[label] += 1
        data.append((desc.render_content(), labels))

    total_cnt = sum(label_cnt.values())
    print(total_cnt)

    # Both bounds are exclusive and relative to the total number of label
    # occurrences (not to the number of documents).
    min_cnt = total_cnt * 0.0015
    max_cnt = total_cnt * 0.1
    available_labels = {
        label for label, cnt in label_cnt.items()
        if min_cnt < cnt < max_cnt
    }

    # The two spaces after the colon reproduce what the former Python 2
    # statement `print 'remaining labels: ', n` wrote.
    print('remaining labels:  {}'.format(len(available_labels)))
    print("#documents: {}".format(len(data)))
    print("training ...\n")

    random.shuffle(data)
    data = data[:MAX_DATA_SIZE]
    data = filter_data_label(data, available_labels)

    tagger = TextTagger(nr_min_word_count=3)
    tagger.fit(data)

    print("writing model...\n")
    tagger.dump(ukconfig.tagger_path)
def auto_tagging(ctx, doc):
    """Merge automatically predicted tags into ``doc['tag']``.

    The tagger model is loaded from ``ukconfig.tagger_path`` on first use
    and cached in the module-level ``_tagger``, so the model has to be
    trained and dumped before this function can do anything.  If loading
    raises ``IOError`` the document is left untouched, and loading is
    attempted again on the next call.

    :param ctx: not used by this function.
    :param doc: mapping whose ``'desc'`` entry provides
        ``render_content()`` and whose ``'tag'`` entry holds the current
        tags (presumably a list, since it is concatenated with the
        predicted tags).  ``doc['tag']`` is replaced by the de-duplicated
        union of both; the order of the resulting list is unspecified.
    """
    global _tagger
    if _tagger is None:
        try:
            log_info('loading tagger ...')
            _tagger = TextTagger.load(ukconfig.tagger_path)
        except IOError:
            log_info('tagger model not found.')
            return

    content = doc['desc'].render_content()
    predicted = _tagger.predict_one(content)
    declare_tag(predicted)

    current = doc['tag']
    log_info('original tag: ' + str(current))
    log_info('autotagging: ' + str(predicted))
    doc['tag'] = list(set(current + predicted))


# Bare string literal kept verbatim from the original source; it is a
# no-op expression.
"""auto tag """