Example #1
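This example reads a DBpedia HTTP query log, extracts the SPARQL query from each logged request URL, keeps only SELECT queries, computes a feature vector for each one, executes it against the DBpedia endpoint while timing it, and writes the queries, execution times, and feature vectors out in a 60/20/20 train/validation/test split. The code is Python 2 (print statements, the urlparse module).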
    def load_queries(self):
        # Create the output directory if it does not already exist
        if not os.path.exists(DIRECTORY):
            os.makedirs(DIRECTORY)

        # 60/20/20 split: training / validation / test
        data_split = int(TOTAL_QUERY * 0.6)
        validation_split = int(TOTAL_QUERY * 0.2)
        test_split = int(TOTAL_QUERY * 0.2)
        print "data_split", data_split
        print "validation_split", validation_split
        print "test_split", test_split

        f = open(DBPEDIA_QUERY_LOG, 'rb')
        # Training-set output files: raw query URLs, execution times,
        # and feature vectors
        fq = open(DIRECTORY + "x_query.txt", 'w')
        ft = open(DIRECTORY + "y_time.txt", 'w')
        ff = open(DIRECTORY + "x_features.txt", 'w')
        x_f_csv = csv.writer(ff)
        sparql = SPARQLWrapper(DBPEDIA_ENDPOINT)
        f_extractor = FeatureExtractor()

        sw1 = StopWatch()  # times each individual SPARQL query
        sw2 = StopWatch()  # tracks overall processing time
        print_log_split = int(TOTAL_QUERY / 10)  # report progress every 10%

        count = 0
        for line in f:
            # Periodic progress report
            if count % print_log_split == 0:
                print count, " queries processed in ", sw2.elapsed_seconds(), " seconds"

            if count >= TOTAL_QUERY:
                break

            if count == data_split:
                # Training set complete: switch output files to the
                # validation set
                fq.close()
                ft.close()
                ff.close()
                fq = open(DIRECTORY + "xval_query.txt", 'w')
                ft = open(DIRECTORY + "yval_time.txt", 'w')
                ff = open(DIRECTORY + "xval_features.txt", 'w')
                x_f_csv = csv.writer(ff)
            elif count == (data_split + validation_split):
                # Validation set complete: switch output files to the
                # test set
                fq.close()
                ft.close()
                ff.close()
                fq = open(DIRECTORY + "xtest_query.txt", 'w')
                ft = open(DIRECTORY + "ytest_time.txt", 'w')
                ff = open(DIRECTORY + "xtest_features.txt", 'w')
                x_f_csv = csv.writer(ff)

            try:
                # row[6] holds the requested URL; strip its surrounding
                # delimiter characters, then decode the SPARQL text from
                # the 'query' URL parameter
                row = line.split()
                query_log = row[6][1:-1]
                par = urlparse.parse_qs(urlparse.urlparse(query_log).query)
                sparql_query = par['query'][0]

                # Keep only SELECT queries (_parseQueryType is a private
                # SPARQLWrapper helper that classifies the query string)
                if sparql._parseQueryType(sparql_query) != SELECT:
                    continue

                # Preprocess the query for DBpedia, then derive its
                # feature vector
                sparql_query = f_extractor.get_dbp_sparql(sparql_query)
                feature_vector = f_extractor.get_features(sparql_query)

                if feature_vector is None:
                    print "feature vector not found"
                    continue

                # Execute the query against the endpoint and time it;
                # elapsed is in milliseconds
                sparql.setQuery(sparql_query)
                sparql.setReturnFormat(JSON)

                sw1.reset()
                results = sparql.query().convert()
                elapsed = sw1.elapsed_milliseconds()

                result_rows = len(results["results"]["bindings"])

                # Optionally skip queries that return no rows:
                # if result_rows == 0:
                #     continue

                # Record the raw query URL, the elapsed time, and the
                # feature vector
                fq.write(query_log + '\n')
                ft.write(str(elapsed) + '\n')
                x_f_csv.writerow(feature_vector)
                count += 1
            except Exception as inst:
                # Skip lines that fail to parse and queries that error out
                print "Exception", inst

        # Close the log file and the final (test-set) output files
        f.close()
        fq.close()
        ft.close()
        ff.close()
        print count, "queries processed"
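
The StopWatch helper used above is not part of the snippet; only its interface is visible from the calls: reset(), elapsed_seconds(), elapsed_milliseconds(), and timing that starts at construction (sw2 is never explicitly started, only read). A minimal sketch under those assumptions, using a plain wall-clock timer:

    import time

    class StopWatch(object):
        # Minimal wall-clock timer matching the interface used above.
        # Only the class and method names come from the snippet; the
        # implementation is an assumption.

        def __init__(self):
            self.start = time.time()  # timing begins at construction

        def reset(self):
            # Restart the timer
            self.start = time.time()

        def elapsed_seconds(self):
            # Seconds since construction or the last reset()
            return time.time() - self.start

        def elapsed_milliseconds(self):
            # Milliseconds since construction or the last reset()
            return (time.time() - self.start) * 1000.0

Starting the timer in __init__ matches how sw2 is used in the loop: it is constructed once and then polled via elapsed_seconds() for progress reports, while sw1 is reset() before each query to time it individually.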