def preprocess(fin_path):
    """Group labelled sentences from a CSV file into one SentenceNodeManager per product id."""
    manager_groups = {}
    with open(fin_path, 'r') as fin:
        for line in fin:
            splits = line.split(',')
            # the product id is the first run of digits in the first column
            pid = int(re.findall(r'\d+', splits[0])[0])
            try:
                label = splits[3].replace('\n', '')
                if pid in manager_groups:
                    manager_groups[pid].add_node(SentenceNode(splits[4], extra=int(label)))
                else:
                    snm = SentenceNodeManager()
                    snm.add_node(SentenceNode(splits[4], extra=int(label)))
                    manager_groups[pid] = snm
            except Exception as e:
                print e
                print line
    return manager_groups
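

# Minimal usage sketch for preprocess (hypothetical path; the CSV columns follow the
# pid,creationTime,nickname,label,content layout written by save_csv below):
#     groups = preprocess('../data/csv/some_product.csv')
#     for pid, manager in groups.items():
#         print pid, len(manager.node_list)

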
def text_rank(sentences, num=10, sim_func=get_similarity, pagerank_config={'alpha': 0.85, }):
    """将句子按照关键程度从大到小排序

    Keyword arguments:
    sentences         --  列表,元素是句子
    words             --  二维列表,子列表和sentences中的句子对应,子列表由单词组成
    sim_func          --  计算两个句子的相似性,参数是两个由单词组成的列表
    pagerank_config   --  pagerank的设置
    """

    def cal_cos(vec1, vec2):
        """Cosine similarity between two equal-length vectors."""
        def __cal_vecmod(vec):
            mod_result = 0.0
            for i in vec:
                mod_result += i ** 2
            return math.sqrt(mod_result)

        # numerator: dot product of the two vectors
        fenzi = 0.0
        for i in xrange(len(vec1)):
            fenzi += vec1[i] * vec2[i]
        # denominator: product of the two vector norms
        fenmu = __cal_vecmod(vec1) * __cal_vecmod(vec2)
        if fenmu == 0:
            return 0
        else:
            return fenzi / fenmu
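
    # Equivalent vectorised form of cal_cos (sketch, reusing the numpy import aliased as np):
    #     v1 = np.asarray(vec1, dtype=float)
    #     v2 = np.asarray(vec2, dtype=float)
    #     denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    #     return float(v1.dot(v2)) / denom if denom else 0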

    sorted_sentences = []
    snm = SentenceNodeManager()
    for sent in sentences:
        snode = SentenceNode(sent)
        snm.add_node(snode)
    snm.normalize_all_sentnodes()
    vlist = snm.get_vec_list()
    # _source = words
    sentences_num = len(vlist)
    graph = np.zeros((sentences_num, sentences_num))

    for x in xrange(sentences_num):
        for y in xrange(x, sentences_num):
            similarity = cal_cos(vlist[x], vlist[y])
            graph[x, y] = similarity
            graph[y, x] = similarity

    nx_graph = nx.from_numpy_matrix(graph)
    scores = nx.pagerank(nx_graph, **pagerank_config)  # this is a dict
    sorted_scores = sorted(scores.items(), key=lambda item: item[1], reverse=True)

    for index, score in sorted_scores:
        item = AttrDict(sentence=sentences[index], weight=score)
        sorted_sentences.append(item)

    return sorted_sentences[:num]
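

# Minimal usage sketch for text_rank (hypothetical sentence list; assumes the SentenceNode
# pipeline can vectorise the raw sentence strings):
#     ranked = text_rank([u'sentence one', u'sentence two', u'sentence three'], num=2)
#     for item in ranked:
#         print item.weight, item.sentence

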
def save_csv(jd_result, fout_path=None):
    """Dump crawled JD comments to CSV rows of pid,creationTime,nickname,label(0),content."""
    if not fout_path:
        fout_path = jd_result['title'][:10]
    with open('../data/csv/%s-%s.csv' % (arrow.utcnow().timestamp, fout_path), 'w') as fout:
        fout.write(codecs.BOM_UTF8)  # write a BOM so spreadsheet tools detect UTF-8
        for item in jd_result['comments']:
            fout.write('%s,%s,%s,0,%s\n' % (
                jd_result['pid'], item['creationTime'], item['nickname'], item['content'].replace('\n', '')))
        print 'saved! path = %s' % fout_path


if __name__ == '__main__':
    ITEM_ID = 411082
    MAXPAGE = 20
    # RES = get_jd_rate(ITEM_ID,MAXPAGE)
    jd_res = jd_parser(ITEM_ID, MAXPAGE)
    with open('%s-%s-parser_result.json' % (arrow.utcnow().timestamp, jd_res['title'][:10]), 'w') as fjson:
        # ensure_ascii=False keeps the Chinese text readable; encode explicitly before writing under Python 2
        fjson.write(json.dumps(jd_res, ensure_ascii=False).encode('utf-8'))
    save_csv(jd_res)
    # add sentence
    # st = open('parser_result.json', 'r').read()
    # jd_res = json.loads(st)
    snm = SentenceNodeManager()
    for item in jd_res['comments']:
        snm.add_node(SentenceNode(item['content']))
    import utils.node_vec_utils.node_cluster_utils as CU

    CU.APcluster(snm, '../data/clusters/%s-APresult.json' % jd_res['title'][:10])
    CU.DBSCANcluster(snm, '../data/clusters/%s-DBSCANresult.json' % jd_res['title'][:10])
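
    # Possible next step (sketch): rank the crawled comments with text_rank defined above, e.g.
    #     ranked = text_rank([item['content'] for item in jd_res['comments']], num=5)
    #     for r in ranked:
    #         print r.weight, r.sentence

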
def amazon_preprocess(start=0, end=10, label_rate=0.65, min_vote=0):
    """

    :param start:
    :param end:
    :param label_rate:
    :return:
    """
    # prepare train set
    db_inst = get_db_inst('AmazonReviews', 'AndroidAPP')
    # print len(db_inst.distinct('asin'))
    manager_groups = {}
    asin_file = open('%s/process/data/asin.list' % PROJECT_PATH, 'r')
    # for asin in db_inst.distinct('asin'):
    #     asin_file.write('%s\n' % asin)
    lines = asin_file.readlines()
    shuffle(lines)
    # for asin in db_inst.distinct('asin'):
    tlines = lines[start:end]
    review_dicts = {}
    asin_list = []
    for asin in tlines:
        asin = asin.replace('\n', '')
        asin_list.append(asin)
        print 'loading %s' % asin

        # collect the reviews of this app
        a_reviews = []
        max_vote = 0  # running maximum of total_vote across this app's reviews
        for find_item in db_inst.find({"asin": asin, 'total_vote': {"$gt": min_vote}}):
            max_vote = max(find_item['total_vote'], max_vote)
            a_reviews.append(find_item)
        # process item reviews VOTE RANK
        review_rank = []
        print '%s has %s reviews' % (asin, len(a_reviews))
        snm = SentenceNodeManager()
        for review in a_reviews:
            alpha_const = 0
            # T: vote volume relative to the most-voted review of the same app
            T = float(review['total_vote']) / max_vote
            # V: logistic function of the net vote margin (2*up_vote - total_vote == up votes minus down votes)
            V = 1 / (1.0 + math.exp(-0.01 * (2 * review['up_vote'] - review['total_vote'])))
            # V = float(review['up_vote']) / review['total_vote']
            # harmonic mean of (T + alpha) and (V + alpha): an F1-style combination of the two signals
            vote_rank_value = 2 * (T + alpha_const) * (V + alpha_const) / (T + V + 2 * alpha_const)
            # label the review 1 if it clears the vote-rank threshold, otherwise 0
            label = 1 if vote_rank_value >= label_rate else 0
            snm.add_node(
                SentenceNode(review['reviewText'].lower(), extra=(label, vote_rank_value, review['reviewerID']),
                             get_pos_func=tag_sents,
                             get_keywords_func=cal_en_tfidf))
            review_rank.append((review, vote_rank_value))
        manager_groups[asin] = snm
        review_dicts[asin] = review_rank
        # else:
        #     break
    veclist = []
    sentlist = []
    labellist = []
    tokenlist = []
    nodelist = []
    group_nodelist = []
    print 'start normalizing vecs'
    for pid in manager_groups.keys():
        manager = manager_groups[pid]
        # DBSCANcluster(manager, '%s_DBSCANcluster.json' % pid)
        # APcluster(manager, '%s_APcluster.json' % pid)
        manager.normalize_all_sentnodes(tfidf_func=tag_sents)
        veclist.extend(manager.get_vec_list())
        sentlist.extend(manager.get_sent_list())
        gnodelist = []
        for node in manager.node_list:
            labellist.append(node.extra[0])
            tokenlist.append(node.feature2token())
            nodelist.append(node)
            gnodelist.append(node)
        group_nodelist.append(gnodelist)
    print 'end normalizing vecs'
    return veclist, sentlist, labellist, tokenlist, nodelist, manager_groups
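

# Usage sketch for amazon_preprocess (it prepares a train set, so a typical follow-up could
# look roughly like this; scikit-learn's LogisticRegression is an assumed example classifier):
#     from collections import Counter
#     from sklearn.linear_model import LogisticRegression
#     vecs, sents, labels, tokens, nodes, groups = amazon_preprocess(start=0, end=5)
#     print 'label distribution:', Counter(labels)
#     clf = LogisticRegression().fit(vecs, labels)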