Пример #1
0
def clustering_txt(app, tag):
    """Cluster the difference-sentence candidates of (app, tag) and store them.

    Fetches the sentence records via ``get_all_sentence_records``, assigns a
    cluster label to every record (single-linkage hierarchical clustering cut
    at ``T_THRESHOLD``), and inserts one row per record into ``cluster_txt``.

    Args:
        app: Application identifier, forwarded to ``get_all_sentence_records``.
        tag: Duplicate tag, forwarded to ``get_all_sentence_records``.

    Returns:
        None. Returns before opening a database connection when there are no
        records.

    A failing INSERT is reported with ``traceback.print_exc()`` and skipped so
    the remaining records are still written. The connection is always closed,
    even when an unexpected error escapes the loop.
    """
    all_records = get_all_sentence_records(app, tag)
    # Index 2 of a record is the sentence text (third column of the INSERT).
    all_sentences = [record[2] for record in all_records]

    record_count = len(all_records)
    if record_count == 0:  # there is no difference item candidate
        return
    if record_count == 1:  # only one candidate: it forms its own cluster
        clusters = [1]
    else:
        distance_matrix = calculate_distance_matrix(all_sentences)
        # NOTE(review): ssd/sch are presumably scipy.spatial.distance and
        # scipy.cluster.hierarchy; squareform then yields the condensed
        # distance vector that linkage() consumes -- confirm the imports.
        condensed = ssd.squareform(distance_matrix)
        linkage = sch.linkage(condensed, method='single')
        clusters = sch.fcluster(linkage, T_THRESHOLD, criterion='distance')

    # The statement is identical for every row, so build it once.
    sql = (
        "INSERT INTO cluster_txt "
        "(app, duplicate_tag, diff_sentence, diff_sentence_index, report_id, cluster_id) "
        "VALUES (%s, %s, %s, %s, %s, %s)"
    )

    # save result to database
    db = connect_db()
    try:
        cur = db.cursor()
        for record, cluster_id in zip(all_records, clusters):
            # Stored cluster ids carry a type prefix -- txt: 100xxx, img: 200xxx
            row = list(record) + [int('100' + str(cluster_id))]
            try:
                cur.execute(sql, row)
                db.commit()
            except Exception:
                # Best effort: report the failure and continue with the rest.
                traceback.print_exc()
    finally:
        close_db(db)
Пример #2
0
def clustering_txt(app, tag):
    """Cluster the difference-sentence candidates of one group and save them.

    Retrieves all difference-sentence candidates of a group through
    ``get_all_sentence_records`` (per the original author's note, ``tag`` is
    the group id and is stored as ``duplicate_tag``), labels each candidate
    with a cluster id, and inserts one row per candidate into ``cluster_txt``.

    Args:
        app: Application identifier, forwarded to ``get_all_sentence_records``.
        tag: Group id / duplicate tag of the group to cluster.

    Returns:
        None. Nothing is written when the group has no candidates.

    Insert errors are printed with ``traceback.print_exc()`` and do not stop
    the remaining rows; the database connection is closed in all cases.
    """
    all_records = get_all_sentence_records(app, tag)
    # Element 2 of each record is the sentence (diff_sentence column).
    all_sentences = [rec[2] for rec in all_records]

    candidate_count = len(all_records)
    if candidate_count == 0:  # there is no difference item candidate
        return
    if candidate_count == 1:  # there is only one difference item candidate
        clusters = [1]
    else:  # more than one difference item candidate
        distance_matrix = calculate_distance_matrix(all_sentences)
        # numpy.ndarray: the array formed from the upper triangle
        dist_array = ssd.squareform(distance_matrix)
        # The linkage result can also be plotted, see
        # https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
        tree = sch.linkage(dist_array, method='single')
        # Cluster label for every diff-sentence candidate of the group.
        clusters = sch.fcluster(tree, T_THRESHOLD, criterion='distance')

    # Same statement for every row; note duplicate_tag is the group id.
    sql = (
        "INSERT INTO cluster_txt "
        "(app, duplicate_tag, diff_sentence, diff_sentence_index, report_id, cluster_id) "
        "VALUES (?,?, ?, ?, ?, ?)"
    )

    # save result to database
    db = connect_db()
    try:
        cur = db.cursor()
        for rec, cluster_id in zip(all_records, clusters):
            # The stored id is the cluster label prefixed with a type marker
            # -- txt: 100xxx, img: 200xxx.
            # NOTE(review): the original author questioned why the prefix is
            # needed; the rationale is not visible in this function.
            row = list(rec) + [int('100' + str(cluster_id))]
            try:
                cur.execute(sql, row)
                db.commit()
            except Exception:
                # Best effort: log the failed insert and keep going.
                traceback.print_exc()
    finally:
        close_db(db)
Пример #3
0
def join(app, tag, all_txt_clusters, all_img_clusters, link_matrix):
    """Store the text/image candidate-cluster relationships in ``cluster_combine``.

    Every inserted row receives its own ``cluster_tag``, counting up from 0:

    * a text cluster with no linked image cluster produces one row holding
      only ``cluster_id_txt``;
    * every (text cluster, linked image cluster) pair produces one row holding
      both ids;
    * every image cluster that was not linked to any text cluster produces one
      row holding only ``cluster_id_img``.

    Args:
        app: Application identifier written to the ``app`` column.
        tag: Value written to the ``duplicate_tag`` column.
        all_txt_clusters: Iterable of text clusters exposing ``get_cluster_id()``.
        all_img_clusters: Iterable of image clusters exposing ``get_cluster_id()``.
        link_matrix: Link information passed to ``get_all_linked_img_clusters``
            together with the index of the text cluster.

    Returns:
        None.

    Database errors propagate to the caller, but the connection is closed
    first. Rows committed before the error remain in the table.
    """
    sql_txt_only = (
        "INSERT INTO cluster_combine (app, duplicate_tag, cluster_tag, "
        "cluster_id_txt) VALUES (%s, %s, %s, %s)"
    )
    sql_txt_img = (
        "INSERT INTO cluster_combine (app, duplicate_tag, cluster_tag, "
        "cluster_id_txt, cluster_id_img) VALUES (%s, %s, %s, %s, %s)"
    )
    sql_img_only = (
        "INSERT INTO cluster_combine (app, duplicate_tag, cluster_tag, "
        "cluster_id_img) VALUES (%s, %s, %s, %s)"
    )

    db = connect_db()
    try:
        cur = db.cursor()

        next_cluster_tag = 0
        linked_img_cluster_ids = set()  # ids only, not cluster objects

        for i, txt_cluster in enumerate(all_txt_clusters):
            txt_cluster_id = txt_cluster.get_cluster_id()
            linked_img_clusters_i = get_all_linked_img_clusters(
                link_matrix, i, all_img_clusters)

            # No image candidate cluster is linked with this text cluster.
            if len(linked_img_clusters_i) == 0:
                cur.execute(
                    sql_txt_only,
                    (app, tag, next_cluster_tag, txt_cluster_id))
                db.commit()
                next_cluster_tag += 1

            # Save each text/image relationship to the database.
            for img_cluster in linked_img_clusters_i:
                img_cluster_id = img_cluster.get_cluster_id()
                cur.execute(
                    sql_txt_img,
                    (app, tag, next_cluster_tag, txt_cluster_id,
                     img_cluster_id))
                db.commit()
                next_cluster_tag += 1
                linked_img_cluster_ids.add(img_cluster_id)

        # Save the remaining (never linked) image candidate clusters.
        all_img_cluster_ids = {
            img_cluster.get_cluster_id() for img_cluster in all_img_clusters
        }
        for img_cluster_id in all_img_cluster_ids - linked_img_cluster_ids:
            cur.execute(
                sql_img_only,
                (app, tag, next_cluster_tag, img_cluster_id))
            db.commit()
            next_cluster_tag += 1
    finally:
        # Release the connection even when an insert or commit fails.
        close_db(db)