def main():
    """Extract text features (top-k keywords) from comments.

    Reads comments either from a database (via a SQL script) or from a
    file, cuts them into words, computes TF-IDF weights, and writes the
    top-k keywords to "top_k_<name>" (weights go to "tfidf_<name>").
    All real work is delegated to the project-local ``preprocess`` and
    ``utils`` modules.
    """
    parser = argparse.ArgumentParser(description="take text feature")
    parser.add_argument("-t", "--type", type=str, choices=("db", "file"),
                        default="file", help="db/file")
    parser.add_argument("-s", "--source", type=str,
                        help="file path/sql script")
    parser.add_argument("-n", "--name", type=str, help="output file name")
    parser.add_argument("-k", "--topk", type=int, default=500,
                        help="top k words")
    parser.add_argument("-w", "--word_category",
                        default="v,vd,vn,vf,a,ad,an,ag,al", type=str,
                        help="word category")
    args = parser.parse_args()

    source_from = args.type
    source = args.source
    name = args.name
    k_num = args.topk
    # Comma-separated part-of-speech tags to keep (verbs/adjectives by default).
    word_category = args.word_category.split(",")
    # NOTE: was a Python 2 `print` statement; `print(...)` works on both 2 and 3.
    print(word_category)

    if source_from == "db":
        comments_df = preprocess.get_data_from_db(source)
    elif source_from == "file":
        comments_df = preprocess.read_comment_from_file(source)
    else:
        # Unreachable in practice: argparse `choices` restricts --type,
        # but kept as a defensive guard.
        return

    comments_list = list(comments_df["comment"].values)
    cutted, word_category_list = utils.word_cut(comments_list)
    word_weight_flag = utils.tfidf(cutted, word_category_list,
                                   "tfidf_" + name)
    key_word = utils.get_topK(word_weight_flag, "top_k_" + name,
                              k=k_num, category_list=word_category)
def main():
    """Take text features (top-k keywords) from a comment corpus.

    Source is either a DB (SQL script passed via --source) or a file
    path, selected by --type. The pipeline: load comments -> word-cut ->
    TF-IDF ('tfidf_<name>') -> top-k keyword extraction ('top_k_<name>'),
    all implemented in the project-local ``preprocess``/``utils`` modules.
    """
    parser = argparse.ArgumentParser(description='take text feature')
    parser.add_argument('-t', '--type', type=str, choices=('db', 'file'),
                        default='file', help='db/file')
    parser.add_argument('-s', '--source', type=str,
                        help='file path/sql script')
    parser.add_argument('-n', '--name', type=str, help='output file name')
    parser.add_argument('-k', '--topk', type=int, default=500,
                        help='top k words')
    parser.add_argument('-w', '--word_category',
                        default='v,vd,vn,vf,a,ad,an,ag,al', type=str,
                        help='word category')
    args = parser.parse_args()

    source_from = args.type
    source = args.source
    name = args.name
    k_num = args.topk
    # Part-of-speech whitelist, e.g. ['v', 'vd', 'vn', ...].
    word_category = args.word_category.split(',')
    # Fixed: Python 2 `print` statement -> portable print() call.
    print(word_category)

    if source_from == 'db':
        comments_df = preprocess.get_data_from_db(source)
    elif source_from == 'file':
        comments_df = preprocess.read_comment_from_file(source)
    else:
        # Defensive: argparse `choices` already rejects other values.
        return

    comments_list = list(comments_df['comment'].values)
    cutted, word_category_list = utils.word_cut(comments_list)
    word_weight_flag = utils.tfidf(cutted, word_category_list,
                                   'tfidf_' + name)
    key_word = utils.get_topK(word_weight_flag, 'top_k_' + name,
                              k=k_num, category_list=word_category)
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Tag order-review comments with extracted keywords.

Loads comments from 'data/order_review', tags each comment via
``match_comments.tag_comments`` using the keyword list in "top_k_'test'",
writes the tagged comments to 'comment_with_tag', and dumps the distinct
tag vocabulary to 'tags' (one tag per line).
"""
__author__ = 'Administrator'
import sys
import numpy as np
sys.path.append('./lib')
import utils
import preprocess
import match_comments as mc
import string

comments_df = preprocess.read_comment_from_file('data/order_review')
comments = comments_df['comment'].iloc[:]
keys = utils.read_in_keys("top_k_'test'")
# Tag every comment; result is a Series of strings (one tagged line each).
result = comments.apply(mc.tag_comments, args=(keys, ))
result.to_csv('comment_with_tag', sep='\t', encoding='utf-8')

# Collect the distinct set of tags across all tagged comments.
# Tagged-line format appears to be "...>>tag1\ttag2\t..." — the tags
# follow the last '>>' separator (TODO confirm against mc.tag_comments).
bag_of_tags = set()
for line in result:
    tmp = line.split('>>')[-1]
    tags = tmp.split('\t')
    # Fixed: `string.strip` was removed in Python 3; `str.strip` works
    # on both Python 2 and 3.
    tags = map(str.strip, tags)
    bag_of_tags = bag_of_tags.union(set(tags))

# Fixed: use a context manager so the file is closed even on error.
with open('tags', 'w') as ff:
    for t in bag_of_tags:
        ff.write(t + '\n')
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Tag comments with top-k keywords and export the tag vocabulary.

Pipeline: read comments from 'data/order_review' -> apply
``match_comments.tag_comments`` with the keys loaded from "top_k_'test'"
-> save tagged comments to 'comment_with_tag' -> write the union of all
tags to the file 'tags'.
"""
__author__ = 'Administrator'
import sys
import numpy as np
sys.path.append('./lib')
import utils
import preprocess
import match_comments as mc
import string

comments_df = preprocess.read_comment_from_file('data/order_review')
comments = comments_df['comment'].iloc[:]
keys = utils.read_in_keys("top_k_'test'")
result = comments.apply(mc.tag_comments, args=(keys,))
result.to_csv('comment_with_tag', sep='\t', encoding='utf-8')

# Build the set of unique tags. Each tagged line seems to carry its tags
# after the final '>>', tab-separated (TODO confirm in mc.tag_comments).
bag_of_tags = set()
for line in result:
    tmp = line.split('>>')[-1]
    tags = tmp.split('\t')
    # Fixed: Python-2-only `string.strip` -> `str.strip` (works on 2 and 3).
    tags = map(str.strip, tags)
    bag_of_tags = bag_of_tags.union(set(tags))

# Fixed: context manager guarantees the handle is closed on any exit path.
with open('tags', 'w') as ff:
    for t in bag_of_tags:
        ff.write(t + '\n')