Example No. 1
class ClusterSparql:
    """Cluster sparql"""

    def __init__(self, config_file=CONFIG_FILE):

        self.X = None
        self.config = ConfigParser.RawConfigParser()
        self.config.read(config_file)

        # self.total_query = self.config.getint('DBpedia','TotalQuery')
        # self.query_file = self.config.get('DBpedia','QueryFile')
        self.dbp_prefix_file = self.config.get("DBpedia", "Namespaces")
        # self.f_extractor = FeatureExtractor()
        self.sp_util = SarqlUtil(self.dbp_prefix_file)
        self.queries = list()
        self.distance_hungarian_script = self.config.get("QueryClustering", "DistanceHungarian")
        self.K = self.config.getint("QueryClustering", "K")
        self.random_shuffel_max_iters = self.config.getint("QueryClustering", "RandomShuffelMaxIters")
        self.kmediods_max_iters = self.config.getint("QueryClustering", "KmediodsMaxIters")

        self.cluster_cach_file = self.config.get("QueryClustering", "HungarianClusterCach")
        self.center_cach_file = self.config.get("QueryClustering", "HungarianCenterCach")

        self.training_query_file = self.config.get("Query", "TrainingQuery")
        self.training_algebra_feature_file = self.config.get("Query", "TrainingAlgebraFeatures")
        self.training_execution_times_file = self.config.get("Query", "TrainingQueryExecutionTimes")
        self.validation_query_file = self.config.get("Query", "ValidationQuery")
        self.test_query_file = self.config.get("Query", "TestQuery")

        self.distance_matrix_file = self.config.get("QueryClustering", "TrainingDistanceHungarianMatrix")

        self.center_idxs = None
        self.idx = None

        # print self.total_query
        # print self.query_file, type(self.query_file)
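        # A minimal sketch of the INI file this constructor expects; the
        # section and option names mirror the config.get() calls above, while
        # every value shown here is only an assumed placeholder:
        #
        #   [DBpedia]
        #   Namespaces = data/dbpedia_namespaces.txt
        #
        #   [QueryClustering]
        #   DistanceHungarian = ./distance_hungarian.sh
        #   K = 10
        #   RandomShuffelMaxIters = 20
        #   KmediodsMaxIters = 100
        #   HungarianClusterCach = cache/hungarian_clusters.txt
        #   HungarianCenterCach = cache/hungarian_centers.txt
        #   TrainingDistanceHungarianMatrix = data/training_distances.txt
        #
        #   [Query]
        #   TrainingQuery = data/training_queries.txt
        #   TrainingAlgebraFeatures = data/training_algebra_features.txt
        #   TrainingQueryExecutionTimes = data/training_execution_times.txt
        #   ValidationQuery = data/validation_queries.txt
        #   TestQuery = data/test_queries.txt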

    def load_queries_dbp_log(self):

        # NOTE: self.query_file and self.total_query come from the [DBpedia]
        # section, but those config reads are commented out in __init__; they
        # must be set before this method is called.
        f = open(self.query_file, "rb")
        count = 0

        for line in f:
            # print line
            if count >= self.total_query:
                break
            try:
                sparql_query = self.sp_util.dbp_log_to_sparql(line)
                # sparql_query = self.sp_util.get_dbp_sparql(sparql_query)
                # print sparql_query
                count += 1
                self.queries.append(sparql_query)
            except Exception:
                # skip log lines that cannot be parsed into a SPARQL query
                pass
        self.X = np.array(self.queries).transpose()

    def load_training_queries(self, limit=None):
        # if limit == None:
        #     limit = int(self.total_query*0.6)
        print "loading training queries:", self.training_query_file
        f = open(self.training_query_file, "rb")
        count = 0

        for line in f:
            # print line
            # if count >= limit:
            #     break
            try:
                # sparql_query = self.sp_util.url_to_sparql(line)
                # for the dbpsb queries, had to add /sparql/? to make it valid for url parsing
                sparql_query = self.sp_util.url_to_sparql("/sparql/?" + line)
                # print sparql_query
                count += 1
                self.queries.append(sparql_query)
            except Exception as inst:
                print "Exception", inst
        self.X = np.array(self.queries).transpose()

    def distance_hungarian(self, q1, q2):

        tmp_q1_file = "tmp_q1_file~"
        # print query_str
        dbp_query1 = self.sp_util.get_dbp_sparql(q1)
        tq1 = open(tmp_q1_file, "w")
        tq1.write(dbp_query1)
        tq1.close()

        tmp_q2_file = "tmp_q2_file~"
        # print query_str
        dbp_query2 = self.sp_util.get_dbp_sparql(q2)
        tq2 = open(tmp_q2_file, "w")
        tq2.write(dbp_query2)
        tq2.close()

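        # The external DistanceHungarian script is invoked as
        #   <script> --file <q1_file> <q2_file>
        # and is expected to print a single floating-point graph-edit distance
        # on stdout. Note that the `commands` module used below exists only on
        # Python 2; subprocess.getstatusoutput() is the Python 3 equivalent.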
        cmd = self.distance_hungarian_script + " --file" + " " + tmp_q1_file + " " + tmp_q2_file
        (status, abs_query_str) = commands.getstatusoutput(cmd)
        # print abs_query_str
        if status != 0:
            # print "ged error", (status,abs_query_str)
            raise Exception("GED error status: " + str(status) + " " + abs_query_str)
        # print abs_query_str
        return float(abs_query_str)

    def compute_distance_matrix_real_time(self, distance_function=distance_hungarian):
        self.distance_matrix = k_mediods.compute_symmetric_distance(self.X, distance_function)

        distance_filename = self.distance_cach_filename(distance_function)
        np.save(distance_filename, self.distance_matrix)
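        # np.save appends a ".npy" suffix to the filename automatically, which
        # is why compute_distance_matrix_from_cach below reloads the matrix
        # from distance_filename + ".npy"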

    def compute_distance_matrix_from_cach(self, distance_function=distance_hungarian):

        distance_filename = self.distance_cach_filename(distance_function)
        self.distance_matrix = np.load(distance_filename + ".npy")

    def distance_cach_filename(self, distance_function):
        file_name = self.distance_function_name(distance_function) + "_hungarian"
        file_name = file_name + "_cach"
        return file_name

    def distance_function_name(self, distance_function):
        """
        MODIFY THIS WHEN NEW distance_function is added
        """

        if distance_function == self.distance_hungarian:
            return "_hungarian"
        return ""

    def save_clusters(self, distance_function):
        # df_name = self.distance_function_name(distance_function)

        # np.save(self.cluster_cach_file,self.idx)
        # np.save(self.center_cach_file,self.center_idxs)
        np.savetxt(self.cluster_cach_file, self.idx, fmt="%d")
        np.savetxt(self.center_cach_file, self.center_idxs, fmt="%d")

    def predict_cluster(self, Xi, distance_function, url_to_sparql=False):
        if url_to_sparql:
            Xi = self.sp_util.url_to_sparql(Xi)

        # if self.idx == None:
        #    self.idx = np.load(elf.cluster_cach_file+'.npy')
        if self.center_idxs is None:
            # save_clusters writes the center indices with np.savetxt,
            # so read them back with np.loadtxt
            self.center_idxs = np.loadtxt(self.center_cach_file, dtype=int)

        min_dist = np.inf
        min_k = -1

        for k in self.center_idxs:

            k_Xi = self.X[k]
            # print k_Xi
            d = distance_function(Xi, k_Xi)
            if min_dist > d:
                min_dist = d
                min_k = k

        # print "original:", Xi
        # print "prediction:",self.X[min_k]

        return min_k

    def cluster_queries(self, distance_function):

        # (min_center_idxs,min_cost) = k_mediods.initial_random_centers_cost_minimization(self.X ,self.K,self.distance_matrix,self.random_shuffel_max_iters,self.kmediods_max_iters)
        # print "min model cost: ", min_cost

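        # Assumed flow of the k_mediods helpers (standard k-medoids): pick K
        # queries at random as the initial medoids, then alternate between
        # assigning every query to its nearest medoid and re-picking each
        # cluster's medoid, using the precomputed pairwise distance matrix.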
        (initial_centers, min_center_idxs) = k_mediods.initial_random_centers(self.X, self.K)

        (self.center_idxs, self.idx) = k_mediods.k_mediods(
            self.X, min_center_idxs, self.kmediods_max_iters, self.distance_matrix
        )

        # k_mediods.print_clusters(self.X, self.idx, self.center_idxs)

        total_cost = k_mediods.model_cost(self.X, self.idx, self.center_idxs, self.distance_matrix)
        print "model cost: ", total_cost

        self.save_clusters(distance_function)

    def load_distaince_hungarian_matrix(self):
        """
        Must be called after loading training queries
        """
        m = np.size(self.X, 0)
        print "m:", m
        self.distance_matrix = np.zeros((m, m), dtype=float)
        f = open(self.distance_matrix_file)
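        # each line of the matrix file holds one precomputed pairwise distance
        # as three whitespace-separated fields: "<i> <j> <distance>"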
        for line in f:
            row = line.split()
            i = int(row[0])
            j = int(row[1])
            d = float(row[2])
            # print i,j,d
            self.distance_matrix[i, j] = d
            self.distance_matrix[j, i] = d
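
A minimal driver sketch for the class above, assuming the k_mediods module and the configured query and matrix files are available; the SPARQL string passed to predict_cluster is a made-up placeholder, not a query from the project's data.

cs = ClusterSparql()
cs.load_training_queries()                  # fills cs.X from the configured TrainingQuery file
cs.load_distaince_hungarian_matrix()        # loads the precomputed pairwise distance matrix
cs.cluster_queries(cs.distance_hungarian)   # runs k-medoids and caches centers and assignments
# assign a new query to its nearest cluster center (placeholder query string)
center = cs.predict_cluster("SELECT ?s WHERE { ?s ?p ?o } LIMIT 10", cs.distance_hungarian)
print "nearest center index:", center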