Exemplo n.º 1
0
def return_data(union_index_list, corpus, min_sample):
    """
    Summarise each cluster of corpus positions as a ClusterObj.

    Clusters with fewer than ``min_sample`` members, and clusters with no
    members at all, are left out of the result.

    :param union_index_list: iterable of clusters; each cluster is an iterable
        of positions into ``corpus``.
    :param corpus: indexable collection of message objects exposing
        ``messageId``, ``messageTitle``, ``messagePublishtime`` and
        ``site_name``.
    :param min_sample: minimum number of members a cluster must have to be
        returned.
    :return: list of ClusterObj in the order of ``union_index_list``; a
        cluster's position in that list is used as its clusterId.
    """
    cluster_result = []
    for cluster_id, cluster_index in enumerate(union_index_list):
        member_ids = []
        title_counts = {}
        site_names = set()
        publish_times = []

        for index in cluster_index:
            message = corpus[index]
            member_ids.append(str(message.messageId))
            site_names.add(message.site_name)
            publish_times.append(message.messagePublishtime)
            title = message.messageTitle
            title_counts[title] = title_counts.get(title, 0) + 1

        # Filter before looking at titles/times: an empty cluster has neither,
        # and indexing into them would raise IndexError.
        member_count = len(member_ids)
        if member_count == 0 or member_count < min_sample:
            continue

        # Cluster topic: the title that occurs most often among the members.
        cluster_topic = max(title_counts, key=title_counts.get)

        # Publish-time range: earliest and latest after an ascending sort.
        publish_times.sort()
        cluster_publish_beginTime = publish_times[0]
        cluster_publish_endTime = publish_times[-1]

        # NOTE(review): "^A" is the two-character string caret + "A", not the
        # \x01 control character - confirm which one consumers expect.
        cluster_member = "^A".join(member_ids)

        # The keyword spelling ``cluserMemeberCount`` is ClusterObj's own.
        cluster_result.append(
            ClusterObj(clusterId=cluster_id, clusterTopic=cluster_topic,
                       clusterPublishBeginTime=cluster_publish_beginTime,
                       clusterPublishEndTime=cluster_publish_endTime,
                       clusterMember=cluster_member,
                       cluserMemeberCount=member_count,
                       siteCount=len(site_names)))
    return cluster_result
Exemplo n.º 2
0
def return_data(cluster_ids_list, first_cluster_results, min_sample):
    """
    Merge first-pass clusters into final clusters and wrap them as ClusterObj.

    Each entry of ``cluster_ids_list`` names the first-pass clusters that are
    merged into one final cluster. The merged cluster takes its topic from the
    first-pass cluster with the most ids. Merged clusters with fewer than
    ``min_sample`` ids, or with no ids at all, are left out.

    :param cluster_ids_list: iterable of groups; each group is an iterable of
        keys into ``first_cluster_results``.
    :param first_cluster_results: mapping from first-pass cluster key to a
        dict with the keys ``"ids"``, ``"cluster_topic"``, ``"site_names"``
        and ``"publish_times"``.
    :param min_sample: minimum number of ids a merged cluster must have.
    :return: list of ClusterObj; a group's position in ``cluster_ids_list``
        is used as its clusterId.
    """
    cObjArray = []
    for idx, cluster_ids in enumerate(cluster_ids_list):
        cluster_topic = ''
        max_ids_count = 0

        publishtimes = []
        ids = []
        sitenames = set()

        for cluster_id in cluster_ids:
            first_result = first_cluster_results[cluster_id]
            # A real list is required: len() is taken below, and map() only
            # returns a list on Python 2.
            _ids = [str(member_id) for member_id in first_result["ids"]]
            _cluster_topic = first_result["cluster_topic"]
            _sitenames = first_result["site_names"]
            _publishtimes = first_result["publish_times"]

            # The topic of the largest first-pass cluster wins.
            if len(_ids) > max_ids_count:
                max_ids_count = len(_ids)
                cluster_topic = _cluster_topic

            publishtimes.extend(_publishtimes)
            ids.extend(_ids)
            sitenames.update(_sitenames)

        # Filter before indexing into publishtimes: an empty merged cluster
        # has no begin/end time to read.
        member_count = len(ids)
        if member_count == 0 or member_count < min_sample:
            continue

        # Publish-time range: earliest and latest after an ascending sort.
        publishtimes.sort()

        # NOTE(review): "^A" is the two-character string caret + "A", not the
        # \x01 control character - confirm which one consumers expect.
        cObj = ClusterObj(clusterId=idx,
                          clusterTopic=cluster_topic,
                          clusterPublishBeginTime=publishtimes[0],
                          clusterPublishEndTime=publishtimes[-1],
                          clusterMember="^A".join(ids),
                          cluserMemeberCount=member_count,
                          siteCount=len(sitenames))
        cObjArray.append(cObj)
    return cObjArray
Exemplo n.º 3
0
def get_result_corpus(corpus, is_manual,
                      cluster_type, manual_id, subtopic_id, language_type, save_group_id):
    """
    Wrap up to 10 messages as single-member clusters.

    Per the original note, this is the fallback for when
    first_cluster_results is empty. If ``corpus`` holds more than 10 messages,
    10 of them are picked at random; otherwise all of them are used.

    :param corpus: sequence of message objects exposing ``messageTitle``,
        ``messagePublishtime`` and ``messageId``.
    :return: list of ClusterObj, one per selected message, after it has been
        passed through ``sort_bywords``.
    """
    chosen = corpus if len(corpus) <= 10 else random.sample(corpus, 10)

    single_clusters = []
    for position, message in enumerate(chosen):
        # A fresh result id is requested for every cluster.
        result_id = get_cluster_result_id()

        topic = message.messageTitle
        # A single message is both the start and the end of its time range.
        published = message.messagePublishtime
        member = str(message.messageId)

        # Positional order follows the ClusterObj constructor: member count
        # and site count are both 1 for a single-message cluster.
        single_clusters.append(
            ClusterObj(result_id, position, topic,
                       published, published,
                       member, 1,
                       1, cluster_type,
                       language_type, save_group_id,
                       is_manual, manual_id, subtopic_id))

    # sort_bywords is called for its effect on the list; its return value is
    # not used.
    sort_bywords(single_clusters)

    return single_clusters
Exemplo n.º 4
0
def get_result_corpus(corpus):
    """
    Wrap up to 10 messages as single-member clusters.

    Per the original note, this is the fallback for when
    first_cluster_results is empty. If ``corpus`` holds more than 10 messages,
    10 of them are picked at random; otherwise all of them are used.

    :param corpus: sequence of message objects exposing ``messageTitle``,
        ``messagePublishtime`` and ``messageId``.
    :return: list of ClusterObj, one per selected message; the message's
        position in the selection is used as its clusterId.
    """
    chosen = corpus if len(corpus) <= 10 else random.sample(corpus, 10)

    # One cluster per message: the message supplies the topic, is both ends
    # of the publish-time range, and is the only member (one site).
    return [
        ClusterObj(clusterId=position,
                   clusterTopic=message.messageTitle,
                   clusterPublishBeginTime=message.messagePublishtime,
                   clusterPublishEndTime=message.messagePublishtime,
                   clusterMember=str(message.messageId),
                   cluserMemeberCount=1,
                   siteCount=1)
        for position, message in enumerate(chosen)
    ]
Exemplo n.º 5
0
def return_data(cluster_ids_list, first_cluster_results, is_manual,
                cluster_type, manual_id, subtopic_id, language_type, save_group_id):
    """
    Merge first-pass clusters into final clusters and wrap them as ClusterObj.

    Each entry of ``cluster_ids_list`` names the first-pass clusters that are
    merged into one final cluster. The merged cluster takes its topic from the
    first-pass cluster with the most ids. Groups that end up with no ids are
    skipped, and no result id is requested for them.

    :param cluster_ids_list: iterable of groups; each group is an iterable of
        keys into ``first_cluster_results``.
    :param first_cluster_results: mapping from first-pass cluster key to a
        dict with the keys ``"ids"``, ``"cluster_topic"``, ``"site_names"``
        and ``"publish_times"``.
    :return: list of ClusterObj after it has been passed through
        ``sort_bywords``; a group's position in ``cluster_ids_list`` is used
        as its clusterId.
    """
    cObjArray = []
    for idx, cluster_ids in enumerate(cluster_ids_list):
        cluster_topic = ''
        max_ids_count = 0

        publishtimes = []
        ids = []
        sitenames = set()

        for cluster_id in cluster_ids:
            first_result = first_cluster_results[cluster_id]
            # A real list is required: len() is taken below, and map() only
            # returns a list on Python 2.
            _ids = [str(member_id) for member_id in first_result["ids"]]
            _cluster_topic = first_result["cluster_topic"]
            _sitenames = first_result["site_names"]
            _publishtimes = first_result["publish_times"]

            # The topic of the largest first-pass cluster wins.
            if len(_ids) > max_ids_count:
                max_ids_count = len(_ids)
                cluster_topic = _cluster_topic

            publishtimes.extend(_publishtimes)
            ids.extend(_ids)
            sitenames.update(_sitenames)

        # A group without members has no publish-time range to report;
        # skip it instead of failing on publishtimes[0].
        if not ids:
            continue

        # A fresh result id is requested for every cluster that is emitted.
        id = get_cluster_result_id()

        # Publish-time range: earliest and latest after an ascending sort.
        publishtimes.sort()
        clusterPublishBeginTime = publishtimes[0]
        clusterPublishEndTime = publishtimes[-1]

        # NOTE(review): "^A" is the two-character string caret + "A", not the
        # \x01 control character - confirm which one consumers expect.
        clusterMember = "^A".join(ids)

        cObj = ClusterObj(id, idx, cluster_topic,
                          clusterPublishBeginTime, clusterPublishEndTime,
                          clusterMember, len(ids),
                          len(sitenames), cluster_type,
                          language_type, save_group_id,
                          is_manual, manual_id, subtopic_id)
        cObjArray.append(cObj)

    # sort_bywords is called for its effect on the list; its return value is
    # not used.
    sort_bywords(cObjArray)

    return cObjArray
Exemplo n.º 6
0
def getReturnData(label, messageObjList):
    """
    Group messages by cluster label and summarise each group as a ClusterObj.

    ``label`` and ``messageObjList`` are paired element by element. Pairs
    whose label is -1 are dropped. For every remaining label the summary holds
    the most frequent title, the earliest and latest publish time, the joined
    member ids, the member count and the number of distinct site names.

    :param label: iterable of cluster labels, one per message.
    :param messageObjList: iterable of message objects exposing ``messageId``,
        ``messageTitle``, ``messagePublishtime`` and ``site_name``.
    :return: list of ClusterObj, one per label other than -1; the label is
        used as the clusterId.
    """
    groups = {}
    for cluster_label, message in zip(label, messageObjList):
        # -1 marks messages that are not part of any cluster.
        if cluster_label == -1:
            continue

        group = groups.get(cluster_label)
        if group is None:
            group = groups[cluster_label] = {
                "id": [],
                "title": {},         # title -> number of occurrences
                "publishtime": [],
                "sitename": set(),
            }

        group["id"].append(str(message.messageId))
        title = message.messageTitle
        group["title"][title] = group["title"].get(title, 0) + 1
        group["publishtime"].append(message.messagePublishtime)
        group["sitename"].add(message.site_name)

    if not groups:
        return []

    # Imported here, as in the original, and only once there is something
    # to build.
    from entity.ClusterObj import ClusterObj

    cObjArray = []
    for cluster_label, group in groups.items():
        # Cluster topic: the title that occurs most often in the group.
        title_counts = group["title"]
        clusterTopic = max(title_counts, key=title_counts.get)

        # Publish-time range: earliest and latest after an ascending sort.
        publish_times = sorted(group["publishtime"])

        # NOTE(review): "^A" is the two-character string caret + "A", not the
        # \x01 control character - confirm which one consumers expect.
        member_ids = group["id"]
        cObj = ClusterObj(clusterId=cluster_label,
                          clusterTopic=clusterTopic,
                          clusterPublishBeginTime=publish_times[0],
                          clusterPublishEndTime=publish_times[-1],
                          clusterMember="^A".join(member_ids),
                          cluserMemeberCount=len(member_ids),
                          siteCount=len(group["sitename"]))
        cObjArray.append(cObj)
    return cObjArray