def get_limit_appchina_comments(limit, json_path):
    """Load up to ``limit`` de-duplicated AppChina comments from a JSON file.

    The file at ``json_path`` must contain a JSON object whose ``'comments'``
    entry is a list of comment objects.  Only the first ``limit`` entries are
    examined; each one is serialized, passed through
    ``remove_illegal_characters`` and parsed again.  Entries whose comment
    text and nickname match an earlier entry are skipped.

    Args:
        limit: Maximum number of entries to examine.  Values larger than the
            number of available entries are clamped; values <= 0 yield an
            empty list.
        json_path: Path of the JSON file to read.

    Returns:
        A list of comment dicts.  Each dict is the cleaned comment with an
        additional ``'datetime'`` key holding ``arrow.get(comment['date'])``.

    Raises:
        KeyError: If ``'comments'`` is missing from the file, or an entry
            lacks ``'comment'``, ``'nickname'`` or ``'date'``.
    """
    # The context manager closes the handle even if parsing fails.
    with open(json_path, 'r') as json_file:
        comments = json.load(json_file)['comments']

    # Clamp the request so that asking for more comments than the file holds
    # returns what is available instead of raising IndexError.
    limit = max(0, min(limit, len(comments)))

    result = []
    seen_md5 = set()
    for raw_comment in comments[:limit]:
        refined_str = remove_illegal_characters(json.dumps(raw_comment))
        refined_comment = json.loads(refined_str)

        # Deduplicate using an MD5 fingerprint of comment text + nickname.
        # The text is encoded explicitly as UTF-8 because hashing must not
        # depend on an implicit text-to-bytes conversion, which breaks on
        # non-ASCII comments.
        fingerprint_src = (refined_comment['comment'] +
                           refined_comment['nickname'])
        if not isinstance(fingerprint_src, bytes):
            fingerprint_src = fingerprint_src.encode('utf-8')
        comment_md5 = hashlib.md5(fingerprint_src).hexdigest()

        if comment_md5 in seen_md5:
            continue
        seen_md5.add(comment_md5)

        refined_comment[u'datetime'] = arrow.get(refined_comment['date'])
        result.append(refined_comment)
    return result
def _taobao_utf8_bytes(value):
    """Return ``value`` as UTF-8 bytes, stringifying non-text values first."""
    if isinstance(value, bytes):
        return value
    if not isinstance(value, type(u'')):
        value = u'%s' % (value,)
    return value.encode('utf-8')


def get_limit_taobao_comments(limit, json_path, is_datetime=True):
    """Load up to ``limit`` de-duplicated Taobao comments from a JSON file.

    The file at ``json_path`` must contain a JSON object whose ``'rateList'``
    entry is a list of comment objects.  Only the first ``limit`` entries are
    examined; each one is serialized, passed through
    ``remove_illegal_characters`` and parsed again.  Entries whose content
    and user nick match an earlier entry are skipped.

    Args:
        limit: Maximum number of entries to examine; clamped to the number
            of entries available.
        json_path: Path of the JSON file to read.
        is_datetime: When true, ``'datetime'`` in each result is the
            ``'date'`` string parsed by ``arrow.get`` with the format
            ``YYYY年MM月DD日 HH:mm`` in the local time zone.  When false, it
            is the raw ``'date'`` value.

    Returns:
        A list of dicts with the keys ``'comment'`` (the content, ``''`` when
        missing or empty), ``'datetime'`` and ``'isBelievable'`` (a random
        flag that is true with probability of roughly 0.3).

    Raises:
        KeyError: If ``'rateList'`` is missing from the file, or an entry
            lacks ``'user'``/``'nick'`` or ``'date'``.
    """
    # The context manager closes the handle even if parsing fails.
    with open(json_path, 'r') as json_file:
        comments = json.load(json_file)['rateList']

    limit = max(0, min(limit, len(comments)))

    result = []
    seen_md5 = set()
    for raw_comment in comments[:limit]:
        refined_str = remove_illegal_characters(json.dumps(raw_comment))
        refined_comment = json.loads(refined_str)

        # Normalize a missing, null or empty content to the empty string.
        com_content = refined_comment.get('content') or ''

        # Deduplicate using an MD5 fingerprint of content + user nick.  Both
        # parts are encoded explicitly as UTF-8 so that non-ASCII text does
        # not depend on an implicit text-to-bytes conversion.
        comment_md5 = hashlib.md5(
            _taobao_utf8_bytes(com_content) +
            _taobao_utf8_bytes(refined_comment['user']['nick'])).hexdigest()

        if comment_md5 in seen_md5:
            continue
        seen_md5.add(comment_md5)

        if is_datetime:
            stamp = arrow.get(refined_comment['date'],
                              u'YYYY年MM月DD日 HH:mm',
                              tzinfo=tz.tzlocal())
        else:
            stamp = refined_comment['date']

        result.append({
            'comment': com_content,
            'datetime': stamp,
            'isBelievable': random.random() > 0.7,
        })

        # Debug aid: dump the cleaned JSON of entries without any content.
        # Testing the normalized value avoids a KeyError when the 'content'
        # key is absent altogether.
        if not com_content:
            print(refined_str)
    return result