def chinese():
    """Return the combined stopword set: HIT + Lagou + English lists plus whitespace tokens."""
    hit_words = {w.strip() for w in read_lines('stopwords_zh.txt')}
    lagou_words = {w.strip() for w in read_lines('stopwords_lagou.txt')}
    english_words = {w.strip() for w in read_lines('stopwords_en.txt')}
    # Whitespace tokens count as stopwords as well.
    whitespace = {u'', u' ', u'\t', u'\n'}
    return english_words | hit_words | lagou_words | whitespace
def from_file(file_path):
    """Parse ``(rating, comment)`` tuples from a review file.

    Everything up to and including the first ``title_sep`` line is skipped.
    Subsequent stripped lines are accumulated until a ``review_sep`` line,
    at which point line 1 of the accumulated review is mapped through
    ``rating_dict`` and lines 2+ are joined with '.' into the comment.
    """
    lines = read_lines(file_path)
    # Skip the header block: advance past the title separator.
    start = 0
    while lines[start].strip() != title_sep:
        start += 1
    parsed = []
    current = []
    for raw in lines[start + 1:]:
        if raw.strip() == review_sep:
            # Separator reached: flush the accumulated review.
            # NOTE(review): assumes each review has >= 2 lines — TODO confirm.
            parsed.append((rating_dict[current[1]], '.'.join(current[2:])))
            current = []
        else:
            current.append(raw.strip())
    return parsed
def save_degree_dict():
    """Build the degree-adverb dictionary from degree_zh.txt and pickle it.

    The source file groups words under numeric header lines of the form
    ``<idx>-<degree>``; every word following a header is assigned that
    degree value.
    """
    source = './data/degree_zh.txt'
    current_degree = 0
    degree_dict = defaultdict(int)
    for raw in chinese.read_lines(source):
        entry = raw.strip()
        # Skip blank and comment lines.
        if not entry or is_comment(entry):
            continue
        if entry[0].isdigit():
            # Header line such as "3-2": the part after '-' is the degree.
            parts = entry.split('-')
            assert len(parts) == 2
            current_degree = int(parts[1])
        else:
            degree_dict[entry] = current_degree
    for word in degree_dict:
        print(word, degree_dict[word])
    to_pickle(degree_dict, './data/degree_zh.pkl')
def save_chinese_stopwords():
    """Pickle the HIT Chinese stopword list as a plain list of stripped words."""
    source = './data/stopwords_zh_hit.txt'
    words = [line.strip() for line in chinese.read_lines(source) if line.strip()]
    to_pickle(words, './data/stopwords_zh.pkl')
def save_inverse_dict():
    """Pickle the negation ("inverse") word list, skipping comment lines."""
    source = './data/inverse_zh.txt'
    words = []
    for raw in chinese.read_lines(source):
        # Keep non-empty, non-comment entries (comment check uses the raw line,
        # matching the original behavior).
        if raw.strip() and not is_comment(raw):
            words.append(raw.strip())
    to_pickle(words, './data/inverse_zh.pkl')
def save_neg_sentiment_dict():
    """Pickle the negative-sentiment word list.

    NOTE(review): unlike the positive counterpart, this writes into
    ``../douban/movies/dicts/`` rather than ``./data/`` — confirm that the
    asymmetric output path is intentional.
    """
    source = './data/neg_sent_zh.txt'
    words = [w.strip() for w in chinese.read_lines(source) if w.strip()]
    to_pickle(words, '../douban/movies/dicts/neg_sent_zh.pkl')
def save_pos_sentiment_dict():
    """Pickle the positive-sentiment word list as stripped, non-empty words."""
    source = './data/pos_sent_zh.txt'
    words = [w.strip() for w in chinese.read_lines(source) if w.strip()]
    to_pickle(words, './data/pos_sent_zh.pkl')
# coding=utf-8
"""Segment every *.txt review file in the working directory with jieba,
drop stopwords, and write the space-joined tokens to a matching *.seg file.

Fix: the original used Python-2-only print statements (``print len(...)``,
``print fname + ' started'``), which are syntax errors on Python 3.
Parenthesizing each single-expression print is valid and produces identical
output on both Python 2 and Python 3.
"""
import glob
import os
import path

import jieba

from common.chinese import read_lines, write
from common.persistence import from_pickle

stopwords = set(from_pickle('stopwords.pkl'))
print(len(stopwords))

for fname in glob.glob('*.txt'):
    print(fname + ' started')
    name_without_ext = os.path.splitext(fname)[0]
    segmented = []
    for line in read_lines(fname):
        parts = line.strip().split('\t')
        # Expect at least 3 tab-separated fields; the 3rd holds the review text.
        if len(parts) < 3:
            continue
        seg_list = jieba.cut(parts[2], cut_all=False)
        # Drop stopword tokens before joining.
        seg_list = [seg for seg in seg_list if seg not in stopwords]
        segmented.append(' '.join(seg_list))
    write(name_without_ext + '.seg', '\n'.join(segmented))
    print(fname + ' done')
def chinese():
    """Load the Chinese stopword list, one stripped token per line."""
    words = set()
    for raw in read_lines('../dicts/data/stopwords_zh.txt'):
        words.add(raw.strip())
    return words