示例#1
0
def chinese():
    """Return the combined stopword set used for Chinese text.

    The set is the union of the stripped lines of the three stopword files
    ('stopwords_zh.txt', 'stopwords_lagou.txt', 'stopwords_en.txt') plus a
    few whitespace-only tokens.
    """
    # Files are read in the same order as before: zh, lagou, en.
    zh_words = {entry.strip() for entry in read_lines('stopwords_zh.txt')}
    lagou_words = {entry.strip() for entry in read_lines('stopwords_lagou.txt')}
    en_words = {entry.strip() for entry in read_lines('stopwords_en.txt')}

    # The empty string and bare whitespace characters count as stopwords too.
    whitespace_tokens = {u'', u' ', u'\t', u'\n'}

    return en_words.union(zh_words, lagou_words, whitespace_tokens)
示例#2
0
def from_file(file_path):
    """Parse a review file into a list of ``(rating, comment)`` tuples.

    Everything up to and including the first line equal to ``title_sep``
    is skipped. The remaining lines are grouped into reviews, each one
    terminated by a line equal to ``review_sep``. For every review the
    second collected line is looked up in ``rating_dict`` and the lines
    from the third onward are joined with '.' to form the comment.

    No validation is performed: a file without a ``title_sep`` line, or a
    review with fewer than two lines before its separator, raises
    IndexError. Lines after the last ``review_sep`` are discarded.
    """
    lines = read_lines(file_path)

    # Locate the title separator; the review section starts right after it.
    start = 0
    while lines[start].strip() != title_sep:
        start += 1

    result = []
    pending = []
    for raw in lines[start + 1:]:
        text = raw.strip()
        if text != review_sep:
            pending.append(text)
            continue

        # Separator reached: the collected lines form one complete review.
        rating = rating_dict[pending[1]]
        comment = '.'.join(pending[2:])
        result.append((rating, comment))
        pending = []

    return result
示例#3
0
def from_file(file_path):
    """Read reviews from ``file_path`` as ``(rating, comment)`` tuples.

    The header (all lines through the first one matching ``title_sep``)
    is dropped. Each following run of lines ending at a ``review_sep``
    line is one review: its second line is mapped through ``rating_dict``
    to get the rating, and its third and later lines are joined with '.'
    to form the comment.

    Input is not validated. A missing ``title_sep`` line or a review with
    fewer than two lines raises IndexError. Any trailing lines without a
    closing ``review_sep`` are ignored.
    """
    all_lines = read_lines(file_path)

    # Walk forward until the title separator is found.
    header_end = 0
    while True:
        if all_lines[header_end].strip() == title_sep:
            break
        header_end += 1
    body = all_lines[header_end + 1:]

    parsed = []
    block = []
    for entry in body:
        stripped = entry.strip()
        if stripped == review_sep:
            # End of the current review: emit it and start a new one.
            rating = rating_dict[block[1]]
            parsed.append((rating, '.'.join(block[2:])))
            block = []
        else:
            block.append(stripped)

    return parsed
示例#4
0
def save_degree_dict():
    """Build the word -> degree mapping from the degree file and pickle it.

    Blank lines and lines accepted by ``is_comment`` are skipped. A line
    starting with a digit is a section header of the form ``<x>-<degree>``
    and sets the degree for the words that follow it. Every other line is
    a word, stored with the current degree. The mapping is printed and
    then written to './data/degree_zh.pkl'.
    """
    source_path = './data/degree_zh.txt'
    raw_lines = chinese.read_lines(source_path)

    current_degree = 0  # applies to words that appear before any header
    degree_dict = defaultdict(int)
    for raw in raw_lines:
        entry = raw.strip()

        if not entry or is_comment(entry):
            continue

        if not entry[0].isdigit():
            degree_dict[entry] = current_degree
            continue

        # Header line: the part after the single '-' is the new degree.
        pieces = entry.split('-')
        assert len(pieces) == 2
        current_degree = int(pieces[1])

    for word, level in degree_dict.items():
        print(word, level)

    to_pickle(degree_dict, './data/degree_zh.pkl')
示例#5
0
def save_chinese_stopwords():
    """Pickle the stripped, non-empty lines of the stopword text file.

    Reads './data/stopwords_zh_hit.txt' and writes the resulting list to
    './data/stopwords_zh.pkl'.
    """
    source_path = './data/stopwords_zh_hit.txt'

    words = []
    for raw in chinese.read_lines(source_path):
        word = raw.strip()
        if word:
            words.append(word)

    to_pickle(words, './data/stopwords_zh.pkl')
示例#6
0
def save_inverse_dict():
    """Pickle the entries of the inverse-word file.

    Reads './data/inverse_zh.txt', keeps the stripped form of every line
    that is non-blank and not accepted by ``is_comment``, and writes the
    list to './data/inverse_zh.pkl'.
    """
    source_path = './data/inverse_zh.txt'

    entries = []
    for raw in chinese.read_lines(source_path):
        word = raw.strip()
        # NOTE(review): is_comment receives the unstripped line here --
        # confirm that is intended for comment lines with leading whitespace.
        if word and not is_comment(raw):
            entries.append(word)

    to_pickle(entries, './data/inverse_zh.pkl')
示例#7
0
def save_neg_sentiment_dict():
    """Pickle the stripped, non-empty lines of the negative-sentiment file.

    Reads './data/neg_sent_zh.txt' and writes the resulting list to
    '../douban/movies/dicts/neg_sent_zh.pkl'.
    """
    source_path = './data/neg_sent_zh.txt'

    words = []
    for raw in chinese.read_lines(source_path):
        word = raw.strip()
        if word:
            words.append(word)

    # NOTE(review): the output goes outside './data/' -- confirm this
    # destination is intentional.
    to_pickle(words, '../douban/movies/dicts/neg_sent_zh.pkl')
示例#8
0
def save_pos_sentiment_dict():
    """Pickle the stripped, non-empty lines of the positive-sentiment file.

    Reads './data/pos_sent_zh.txt' and writes the resulting list to
    './data/pos_sent_zh.pkl'.
    """
    source_path = './data/pos_sent_zh.txt'

    words = []
    for raw in chinese.read_lines(source_path):
        word = raw.strip()
        if word:
            words.append(word)

    to_pickle(words, './data/pos_sent_zh.pkl')
示例#9
0
# coding=utf-8
import glob, os, path
import jieba
from common.chinese import read_lines, write
from common.persistence import from_pickle

# Segment every '*.txt' file in the current directory with jieba, drop
# stopwords, and write one space-joined line per input row to '<name>.seg'.
#
# print is called in single-argument function form so the script parses on
# Python 3 as well as Python 2; the printed text is unchanged.

# Loaded once as a set so the per-token membership test below is cheap.
stopwords = set(from_pickle('stopwords.pkl'))
print(len(stopwords))

for fname in glob.glob('*.txt'):
    print(fname + ' started')

    name_without_ext = os.path.splitext(fname)[0]

    segmented = []

    for line in read_lines(fname):
        parts = line.strip().split('\t')

        # Rows with fewer than three tab-separated fields are skipped;
        # only the third field is segmented.
        if len(parts) < 3:
            continue

        seg_list = [
            seg
            for seg in jieba.cut(parts[2], cut_all=False)
            if seg not in stopwords
        ]
        segmented.append(' '.join(seg_list))

    write(name_without_ext + '.seg', '\n'.join(segmented))

    print(fname + ' done')
示例#10
0
def chinese():
    """Return the set of stripped lines from '../dicts/data/stopwords_zh.txt'."""
    words = set()
    for entry in read_lines('../dicts/data/stopwords_zh.txt'):
        words.add(entry.strip())
    return words