Example #1
    def print_feature_eval(self):
        '''
        For each feature type, calculate the means and the std dev for each cluster.
        Then take the mean and std dev of those quantities.
        '''
        print "FEATURE EVAL"
        print

        stats = dict()

        for name in self.clusters[0].members[0].get_feature_set_names():
            stats[name + "_global"] = self.feature_eval_metrics(
                lambda cluster, _doc: cluster.center.global_sim(_doc, name))
            stats[name +
                  "_region_uniform_weights"] = self.feature_eval_metrics(
                      lambda cluster, _doc: utils.avg_val_mat(
                          cluster.center.region_sim(_doc, name)))
            stats[name + "_region_fixed_weights"] = self.feature_eval_metrics(
                lambda cluster, _doc: utils.avg_val_mat(
                    utils.mult_mats(
                        cluster.center.region_sim_weights(_doc, name))))

        stats['confirm'] = self.feature_eval_metrics(
            lambda cluster, _doc: self.confirm.cluster_doc_similarity(
                cluster, _doc))

        padding_len = 1 + max(map(len, stats.keys()))
        print "Columns are in order:"
        print "1) Mean Similarity between Document and assigned Clusters"
        print "2) Std. Dev of Document-Cluster similarities"
        print "3) Mean Average Cluster similarity (cluster members to cluster)"
        print "4) Std Dev of Average Cluster similarity"
        print "5) Mean of Std Dev of Cluster similarity (cluster members to cluster)"
        print "6) Std Dev of the Std Dev of Cluster similarity"
        print
        for name in sorted(list(stats.keys())):
            print utils.pad_to_len(name, padding_len), "\t", "\t".join(
                map(lambda x: "%.4f" % x, stats[name]))
            if "_uniform" in name or "overall" in name:
                print

        print
        print
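The feature_eval_metrics helper is not included in this listing. Judging from the docstring and the six column descriptions printed above, it plausibly returns six numbers per feature: the mean and std dev of each document's similarity to its assigned cluster, then the mean and std dev of the per-cluster averages, then the mean and std dev of the per-cluster std devs. A hedged reconstruction along those lines (the numpy usage and the exact aggregation are assumptions, not the original code):

    def feature_eval_metrics(self, sim_fn):
        # Hypothetical reconstruction -- the original helper is not shown here.
        # Assumes `import numpy as np` at module level.
        # sim_fn(cluster, _doc) scores one document against one cluster.
        doc_sims = []        # every member's similarity to its assigned cluster
        cluster_means = []   # per-cluster mean of member similarities
        cluster_stds = []    # per-cluster std dev of member similarities
        for cluster in self.clusters:
            member_sims = [sim_fn(cluster, _doc) for _doc in cluster.members]
            doc_sims.extend(member_sims)
            cluster_means.append(np.mean(member_sims))
            cluster_stds.append(np.std(member_sims))
        # Order matches the six printed columns.
        return (np.mean(doc_sims), np.std(doc_sims),
                np.mean(cluster_means), np.std(cluster_means),
                np.mean(cluster_stds), np.std(cluster_stds))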
Example #2
 def collate_fn(self, datas):
     batch = {}
     batch['length'] = torch.LongTensor([data['length'] for data in datas])
     padded_len = min(self.max_seq_len, max(batch['length']))
     batch['context'] = torch.tensor([
         pad_to_len(data['context'], padded_len, self.padding)
         for data in datas
     ])
     batch['label'] = torch.LongTensor([data['label'] for data in datas])
     return batch
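The pad_to_len helper these snippets rely on is not part of the listings, and it appears in more than one flavor: utils.pad_to_len pads a string with spaces in Examples #1, #5, #8 and #9, Example #2 applies it to a single sequence, and Examples #3-#7 below pad a whole batch at once. A minimal sketch of the batch flavor, assuming right-padding (and truncation) to the target length:

def pad_to_len(seqs, to_len, padding):
    # Hypothetical sketch -- the project-specific helper is not shown here.
    # Right-pad (or truncate) each sequence in the batch to exactly `to_len`.
    return [list(seq[:to_len]) + [padding] * max(0, to_len - len(seq))
            for seq in seqs]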
Example #3
    def collate_fn(self, samples):
        batch = {}
        for key in ['id', 'len_text']:
            batch[key] = [sample[key] for sample in samples]

        for key in ['text', 'attention_mask']:
            to_len = max([len(sample[key]) for sample in samples])
            padded = pad_to_len([sample[key] for sample in samples], to_len,
                                self.padding)
            batch[key] = torch.tensor(padded)

        return batch
Example #4
    def collate_fn(self, samples):
        batch = {}
        samples.sort(key=lambda x: x['len_text'], reverse=True)  # sort samples by descending text length
        for key in ['id', 'len_text', 'len_summary']:
            batch[key] = [sample[key] for sample in samples]

        for key in ['text', 'summary', 'attention_mask']:
            to_len = max([len(sample[key]) for sample in samples])
            padded = pad_to_len([sample[key] for sample in samples], to_len,
                                self.padding)
            batch[key] = torch.tensor(padded)

        return batch
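These collate_fn methods are written to be handed to torch.utils.data.DataLoader, which calls them with the list of samples drawn for each batch and returns whatever dict they build. A usage sketch, assuming a dataset instance of the class above (the surrounding training code is not part of these listings):

from torch.utils.data import DataLoader

# `dataset` is a hypothetical instance of the class defining collate_fn above.
loader = DataLoader(dataset, batch_size=32, shuffle=True,
                    collate_fn=dataset.collate_fn)
for batch in loader:
    texts, lengths = batch['text'], batch['len_text']  # keys built in collate_fn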
Example #5
	def print_feature_eval(self):
		'''
		For each feature type, calculate the means and the std dev for each cluster.
		Then take the mean and std dev of those quantities.
		'''
		print "FEATURE EVAL"
		print

		stats = dict()

		for name in self.clusters[0].members[0].get_feature_set_names():
			stats[name + "_global"] = self.feature_eval_metrics(
				lambda cluster, _doc: cluster.center.global_sim(_doc, name))
			stats[name + "_region_uniform_weights"] = self.feature_eval_metrics(
				lambda cluster, _doc: utils.avg_val_mat(cluster.center.region_sim(_doc, name)))
			stats[name + "_region_fixed_weights"] = self.feature_eval_metrics(
				lambda cluster, _doc: utils.avg_val_mat(utils.mult_mats(cluster.center.region_sim_weights(_doc, name))))


		stats['confirm'] = self.feature_eval_metrics(
			lambda cluster, _doc: self.confirm.cluster_doc_similarity(cluster, _doc))

		padding_len = 1 + max(map(len, stats.keys()))
		print "Columns are in order:"
		print "1) Mean Similarity between Document and assigned Clusters"
		print "2) Std. Dev of Document-Cluster similarities"
		print "3) Mean Average Cluster similarity (cluster members to cluster)"
		print "4) Std Dev of Average Cluster similarity"
		print "5) Mean of Std Dev of Cluster similarity (cluster members to cluster)"
		print "6) Std Dev of the Std Dev of Cluster similarity"
		print
		for name in sorted(list(stats.keys())):
			print utils.pad_to_len(name, padding_len), "\t", "\t".join(map(lambda x: "%.4f" % x, stats[name]))
			if "_uniform" in name or "overall" in name:
				print

		print
		print
Example #6
    def collate_fn(self, samples):
        batch = {}
        for key in ['id', 'sent_range']:
            batch[key] = [sample[key] for sample in samples]

        for key in ['text', 'label']:
            if any(key not in sample for sample in samples):
                continue
            to_len = max([len(sample[key]) for sample in samples])
            padded = pad_to_len([sample[key] for sample in samples], to_len,
                                self.padding if key != 'label' else
                                SeqTaggingDataset.ignore_idx)
            batch[key] = torch.tensor(padded)

        return batch
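Example #6 pads the 'label' sequences with SeqTaggingDataset.ignore_idx rather than the regular padding value. A common reason, assumed here since the training loop is not shown, is so the tagging loss can skip padded positions via CrossEntropyLoss's ignore_index:

import torch.nn as nn

# Assumed pairing, not shown in the listing: padded label positions carry
# SeqTaggingDataset.ignore_idx, so the loss ignores them.
criterion = nn.CrossEntropyLoss(ignore_index=SeqTaggingDataset.ignore_idx)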
Example #7
    def collate_fn(self, samples):
        batch = {}
        samples.sort(key=lambda x: len(x['text']), reverse=True)  # sort samples by descending text length
        for key in ['id', 'len_text', 'len_summary']:
            if any(key not in sample for sample in samples):
                continue
            batch[key] = [sample[key] for sample in samples]

        for key in ['text', 'summary', 'attention_mask']:
            if any(key not in sample for sample in samples):
                continue
            to_len = max([len(sample[key]) for sample in samples])
            padded = pad_to_len([sample[key] for sample in samples], to_len,
                                self.padding)
            batch[key] = torch.tensor(padded)
        batch['padding_len'] = [len(sample['text']) for sample in samples]
        return batch
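Examples #4 and #7 sort the samples by descending text length before padding. One common motive, assumed here because the model code is not part of the listing, is packing the padded batch for an RNN, which by default expects lengths in decreasing order:

from torch.nn.utils.rnn import pack_padded_sequence

# `embedded_text` is a hypothetical (batch, max_len, emb_dim) tensor built from
# batch['text']; the descending sort above satisfies the default
# enforce_sorted=True requirement.
packed = pack_padded_sequence(embedded_text, batch['len_text'],
                              batch_first=True)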
Example #8
	def print_doc_cluster_sim_mat(self):
		print "CLUSTER-DOC SIM MAT"
		print
		for x, cluster in enumerate(self.clusters):
			print "%d:\t%s" % (x, cluster.label)


		print
		print "documents labeled with # indicate that their most similar cluster has a different true label"
		print "documents labeled with ^ indicate that their assigned cluster is not the most similar cluster"
		print "cluster sim scores labeled with * indicate that the cluster shares the label with the document"
		print

		print (" " * 50) + "\t\t".join(map(str, xrange(len(self.clusters))))
		print

		num_closest_to_incorrect_cluster = 0
		doc_cluster_sim_mat = self.confirm.get_doc_cluster_sim_mat()
		for doc_idx in xrange(len(self.docs)):
			_doc = self.docs[doc_idx]
			to_print = list()
			best_cluster = None
			best_sim_score = -1
			post = ""
			for cluster_idx in xrange(len(self.clusters)):
				cluster = self.clusters[cluster_idx]
				sim_score = doc_cluster_sim_mat[doc_idx][cluster_idx]
				if sim_score > best_sim_score:
					best_cluster = cluster
					best_sim_score = sim_score
				to_print.append("%3.2f" % sim_score)
				if (cluster.label == _doc.label):
					to_print[-1] += '*'
			if _doc.label != best_cluster.label:	
				num_closest_to_incorrect_cluster += 1
				post += "#"
			if _doc not in best_cluster.members:
				post += "^"
			print "%s%s" % (utils.pad_to_len("%s %s" % (_doc._id, (_doc.label + post)), 50), "\t".join(to_print))

		#for _doc in self.docs:
		#	to_print = list()
		#	best_cluster = None
		#	best_sim_score = -1
		#	post = ""
		#	for cluster in self.clusters:
		#		sim_score = self.confirm.cluster_doc_similarity(cluster, _doc)
		#		if sim_score > best_sim_score:
		#			best_cluster = cluster
		#			best_sim_score = sim_score
		#		to_print.append("%3.2f" % sim_score)
		#		if (cluster.label == _doc.label):
		#			to_print[-1] += '*'
		#	if _doc.label != best_cluster.label:	
		#		num_closest_to_incorrect_cluster += 1
		#		post += "#"
		#	if _doc not in best_cluster.members:
		#		post += "^"
		#	print "%s%s" % (utils.pad_to_len("%s %s" % (_doc._id, (_doc.label + post)), 50), "\t".join(to_print))

		print
		print "Number of docs most similar to a wrong cluster: %d / %d = %2.1f%%" % (
			num_closest_to_incorrect_cluster, len(self.docs), 100.0 * num_closest_to_incorrect_cluster / len(self.docs))
		print
		print
Example #9
    def print_doc_cluster_sim_mat(self):
        print "CLUSTER-DOC SIM MAT"
        print
        for x, cluster in enumerate(self.clusters):
            print "%d:\t%s" % (x, cluster.label)

        print
        print "documents labeled with # indicate that their most similar cluster has a different true label"
        print "documents labeled with ^ indicate that their assigned cluster is not the most similar cluster"
        print "cluster sim scores labeled with * indicate that the cluster shares the label with the document"
        print

        print(" " * 50) + "\t\t".join(map(str, xrange(len(self.clusters))))
        print

        num_closest_to_incorrect_cluster = 0
        doc_cluster_sim_mat = self.confirm.get_doc_cluster_sim_mat()
        for doc_idx in xrange(len(self.docs)):
            _doc = self.docs[doc_idx]
            to_print = list()
            best_cluster = None
            best_sim_score = -1
            post = ""
            for cluster_idx in xrange(len(self.clusters)):
                cluster = self.clusters[cluster_idx]
                sim_score = doc_cluster_sim_mat[doc_idx][cluster_idx]
                if sim_score > best_sim_score:
                    best_cluster = cluster
                    best_sim_score = sim_score
                to_print.append("%3.2f" % sim_score)
                if (cluster.label == _doc.label):
                    to_print[-1] += '*'
            if _doc.label != best_cluster.label:
                num_closest_to_incorrect_cluster += 1
                post += "#"
            if _doc not in best_cluster.members:
                post += "^"
            print "%s%s" % (utils.pad_to_len(
                "%s %s" % (_doc._id,
                           (_doc.label + post)), 50), "\t".join(to_print))

        #for _doc in self.docs:
        #	to_print = list()
        #	best_cluster = None
        #	best_sim_score = -1
        #	post = ""
        #	for cluster in self.clusters:
        #		sim_score = self.confirm.cluster_doc_similarity(cluster, _doc)
        #		if sim_score > best_sim_score:
        #			best_cluster = cluster
        #			best_sim_score = sim_score
        #		to_print.append("%3.2f" % sim_score)
        #		if (cluster.label == _doc.label):
        #			to_print[-1] += '*'
        #	if _doc.label != best_cluster.label:
        #		num_closest_to_incorrect_cluster += 1
        #		post += "#"
        #	if _doc not in best_cluster.members:
        #		post += "^"
        #	print "%s%s" % (utils.pad_to_len("%s %s" % (_doc._id, (_doc.label + post)), 50), "\t".join(to_print))

        print
        print "Number of docs most similar to a wrong cluster: %d / %d = %2.1f%%" % (
            num_closest_to_incorrect_cluster, len(self.docs),
            100.0 * num_closest_to_incorrect_cluster / len(self.docs))
        print
        print