예제 #1
0
def _parse_rate(raw, default=2.0):
    """Return *raw* as a float star rating, or *default* if it is unusable."""
    if not re.match(r'\d*\.?\d+', raw):
        # Rating is missing or non-numeric: use the stand-in average rating.
        return default
    try:
        return float(raw)
    except ValueError:
        # re.match only anchors at the start of the string, so a value such
        # as "5 stars" passes the check above but is not a valid float
        # literal. Treat it like a missing rating instead of crashing.
        return default


def extract_review():
    """Parse the review file of the current app into structured records.

    The names ``app``, ``app_files`` and ``StoreNum`` are not parameters;
    they are read from the enclosing scope. Each line of ``app_files`` is
    expected to contain six fields separated by ``******``. Lines with a
    different field count are logged as errors and skipped.

    For every valid line the review text (field 1) is split by
    ``extractSentenceWords``, each resulting item is passed through
    ``pycorrector.en_correct``, and the result is post-processed with
    ``build_phrase`` and ``replace_digit``.

    Returns:
        dict: ``{app: [record, ...]}`` where each record is a dict with the
        keys ``"review"``, ``"date"``, ``"rate"`` and ``"version"``.
    """
    timed_reviews = {app: []}
    num_docs = 0
    num_words = 0

    # The date/version columns sit at different positions per store:
    # a falsy StoreNum means iOS, anything else means Android.
    if not StoreNum:  # for ios
        date_idx, version_idx = 3, 4
    else:  # for android
        date_idx, version_idx = 2, 3

    with open(app_files) as fin:
        # Stream the file instead of loading every line into memory first.
        for l_id, line in enumerate(fin):
            terms = line.strip().split("******")
            if len(terms) != 6:
                logging.error("review format error at %s in %s", app, line.strip())
                continue

            date = terms[date_idx]
            version = terms[version_idx]

            review_o = terms[1]
            review_p, wc = extractSentenceWords(review_o, repeat=True)

            # Spell-correct every item in place.
            # NOTE(review): this stores the return value of en_correct as-is;
            # confirm the installed pycorrector version returns a plain
            # string here rather than a (text, details) pair.
            for list_text in review_p:
                for index, value in enumerate(list_text):
                    list_text[index] = pycorrector.en_correct(value)

            review = [list(replace_digit(s)) for s in build_phrase(review_p)]

            timed_reviews[app].append({
                "review": review,
                "date": date,
                "rate": _parse_rate(terms[0]),
                "version": version,
            })
            num_docs += 1
            num_words += wc
            if l_id % 1000 == 0:
                logging.info("processed %d docs of %s", l_id, app)

    logging.info("total read %d reviews, %d words.", num_docs, num_words)
    return timed_reviews
예제 #2
0
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description: 
"""

import sys

sys.path.append("..")

import pycorrector

if __name__ == '__main__':
    # Correct a whole sentence in a single call.
    sentence = "what happending ? how to speling it can you gorrect it"
    print(sentence, '=>', pycorrector.en_correct(sentence))

    # Correct each word on its own and show the result next to the input.
    words = "what hapenning how to speling it you can gorrect it".split()
    for word in words:
        corrected = pycorrector.en_correct(word)
        print(word, '=>', corrected)
예제 #3
0
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description: 
"""

import sys

sys.path.append("..")

import pycorrector

if __name__ == '__main__':
    # 1. Demonstrate correction of a single English sentence.
    text = "what happending? how to speling it, can you gorrect it?"
    fixed, info = pycorrector.en_correct(text)
    print(text, '=>', fixed)
    print(info)
    print()

    # 2. Demonstrate correction of a list of English sentences; only
    #    sentences whose details are non-empty are printed.
    samples = ['what hapenning?', 'how to speling it', 'gorrect', 'i know']
    for text in samples:
        fixed, info = pycorrector.en_correct(text)
        if not info:
            continue
        print('[error] ', text, '=>', fixed, info)
    print()

    # 3. Demonstrate a custom English dictionary.
    from pycorrector.en_spell import EnSpell
예제 #4
0
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description: 
"""

import sys

sys.path.append("..")

import pycorrector

if __name__ == '__main__':
    # Run the English corrector on each word separately and print
    # the input next to whatever en_correct returns for it.
    words = "what hapenning how to speling it you can gorrect it".split()
    for word in words:
        corrected = pycorrector.en_correct(word)
        print(word, '=>', corrected)