def GetQueryFeatures(mask):
    """Get query features for one dataset."""
    # data_source = ['lsr', 'mesh', 'ncbigene', 'ndc', 'omim', 'orphanet', 'sabiork', 'sgd', 'sider', 'swdf',
    #                'affymetrix', 'dbsnp', 'gendr', 'goa', 'linkedgeodata', 'linkedspl', 'dbpedia']
    data_source = [
        'ncbigene', 'ndc', 'orphanet', 'sgd', 'sider', 'swdf', 'affymetrix',
        'goa', 'linkedgeodata'
    ]
    feature_file = 'docs/feature/%s_feature.pkl' % data_source[mask]
    conn = GetConn(data_source[mask])
    valuedSession = read_valuedSession(data_source[mask], filter_users=False)

    # collect the distinct query texts across all sessions
    queries = []
    for sessi in valuedSession:
        sess = sessi['session']
        queries.extend(sess['query'].tolist())
    queries = list(set(queries))

    # map each query text to the features it uses
    query2feature = {}
    for queryi in queries:
        features = GetQueryUsedFeature(conn, queryi)
        if queryi not in query2feature:
            query2feature[queryi] = []
        query2feature[queryi].extend(features)

    write_pkl(feature_file, query2feature)
    return query2feature
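# The scripts in this section rely on small serialization helpers (write_pkl, read_pkl)
# whose definitions are not shown. The sketch below is an assumption, not the repository's
# implementation: it treats them as thin wrappers around Python's pickle module that
# create parent directories as needed.
import os
import pickle


def write_pkl(path, obj):
    """Serialize `obj` to `path` with pickle, creating parent directories if needed."""
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


def read_pkl(path):
    """Load and return the object pickled at `path`."""
    with open(path, 'rb') as f:
        return pickle.load(f)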
def writeSessionToDirectory(output_dir, name, data, dbpedia):
    """
    Output format: json
    {
        dataset: 'swdf'
        sessions: [
            {
                session_id: 0
                session_length: 10
                user: xxxx
                queries: [
                    {
                        query_id: 0
                        query_content: SPARQL query
                        time_stamp:
                        index_in_file: for dbpedia, the original index in file;
                                       for others, the original IRI
                    }
                ]
            }
        ]
    }
    """
    if not dbpedia:
        conn, repo = GetConn(name, if_return_repo=True)
    else:
        conn = None

    output = {'dataset': name, 'sessions': []}
    index_in_file_key = 'idxInFile' if dbpedia else 'query'
    for session_idx, sessi in tqdm(enumerate(data), total=len(data)):
        sessioni = {
            'session_id': session_idx,
            'session_length': len(sessi['session']),
            'user': sessi['agent'],
            'queries': []
        }
        texts = getTexts(sessi['session'], dbpedia, conn)
        for i in range(len(sessi['session'])):
            query_temp = {
                'query_id': i,
                'query_content': texts[i],
                'time_stamp': sessi['session'].iloc[i]['time'],
                'index_in_file': sessi['session'].iloc[i][index_in_file_key]
            }
            sessioni['queries'].append(query_temp)
        output['sessions'].append(sessioni)

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    # note: the export is written with write_pkl even though the file name ends in .json
    file_name = os.path.join(output_dir, f'{name}_session_data.json')
    write_pkl(file_name, output)

    if not dbpedia:
        conn.close()
        repo.shutDown()
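# Illustrative usage only: load an export back and inspect a few sessions. This assumes
# output_dir='exportSession' and the read_pkl sketch above (matching how
# writeSessionToDirectory saved the data); the actual paths in the repository may differ.
data = read_pkl('exportSession/swdf_session_data.json')
print(data['dataset'], len(data['sessions']))
for sess in data['sessions'][:3]:
    first = sess['queries'][0]
    print(sess['session_id'], sess['user'], first['time_stamp'])
    print(first['query_content'][:80])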
def getQueryFeature(data, name, dbpedia=False, output_dir='results/operator'):
    feature_file = os.path.join(output_dir, f'{name}_feature.pkl')
    query2text = sessions2Query2Text(data)

    # map each index_in_file to its query text, deduplicating across sessions
    idx2query = {}
    for sessi in data:
        sess = sessi['queries']
        for idxi in range(len(sess)):
            index = sess[idxi]['index_in_file']
            if index not in idx2query:
                idx2query[index] = query2text[index]

    # parse each query and extract its operator features; skip unparsable queries
    query2feature = {}
    for idx, query in idx2query.items():
        try:
            pq = parse_spl(query)
        except Exception:
            continue
        res = GetFeatureDBpedia(pq)
        query2feature[idx] = res

    write_pkl(feature_file, query2feature)
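# sessions2Query2Text is used here and in vector_ana but not defined in this section.
# A plausible sketch based on the export format above: map each query's index_in_file
# to its text. The repository's actual helper may differ.
def sessions2Query2Text(data):
    query2text = {}
    for sessi in data:
        for query in sessi['queries']:
            query2text[query['index_in_file']] = query['query_content']
    return query2text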
        from ipdb import set_trace
        set_trace()  # debug breakpoint left in the original code
        temp = res[i][key]
        if key not in res_all:
            res_all[key] = {
                'add_count': 0,
                'delete_count': 0,
                'change_count': 0
            }
        ite = ['add_count', 'delete_count', 'change_count']
        for itei in ite:
            res_all[key][itei] += temp[itei]
    # print(res_all)
    res['all'] = res_all
    write_pkl('docs/compare_count.pkl', res)


def count_changes_block(name, dict_key=['Triple', 'Filter'], list_key=[], pkl=None):
    """
    name -> data source name

    count_changes for a given operator: is this operator new or old?
    Inside this block, have any triples been added or deleted?
    In the mappings of these triples, what changed, where, and how?

    Triple/Filter/Other: {
        block_name: {new_count: xx, old_count: xx, add_count: xx, delete_count: xx,
                     change_count: xx, change_type_count: {'type1': xx ...}}
            if queryi.projectionNum == -1:
                from ipdb import set_trace; set_trace()  # debug breakpoint
            query2vector[queryidx] = queryi.featureVec
            if debug:
                queryi.print_query()
                from ipdb import set_trace; set_trace()  # debug breakpoint
        except Exception:
            continue
    return query2vector


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--mask", '-m', type=int, help="choose which file to run")
    parser.add_argument("--data_dir", '-d', type=str, default='docs/exportSession',
                        help="the directory of data")
    parser.add_argument("--output_dir", '-o', type=str, default='ISWC-extension/results',
                        help="output directory")
    args = parser.parse_args()

    mask = args.mask
    data_source = [
        'ncbigene', 'ndc', 'orphanet', 'sgd', 'sider', 'swdf', 'affymetrix', 'goa',
        'linkedgeodata', 'dbpedia.3.5.1.log', 'access.log-20151025',
        'access.log-20151124', 'access.log-20151126', 'access.log-20151213',
        'access.log-20151230', 'access.log-20160117', 'access.log-20160212',
        'access.log-20160222', 'access.log-20160301', 'access.log-20160303',
        'access.log-20160304', 'access.log-20160314', 'access.log-20160411'
    ]
    data = readExportedData(args.data_dir, data_source[args.mask])
    # the first 9 entries are non-DBpedia datasets; the rest are DBpedia log files
    dbpedia = args.mask > 8
    query2vector = GetFeatureVectors(data['sessions'], data_source[args.mask],
                                     dbpedia=dbpedia)
    write_pkl(f'results/hyper_featureVector/{data_source[mask]}_Vector.pkl',
              query2vector)
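# readExportedData is assumed by this and the following scripts but not shown here.
# A minimal sketch, assuming it simply loads the export produced by
# writeSessionToDirectory (which writes via write_pkl under a .json file name);
# the real helper may resolve file names differently.
import os


def readExportedData(data_dir, name):
    return read_pkl(os.path.join(data_dir, f'{name}_session_data.json'))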
                        '-d', type=str, default='exportSession/',
                        help="the directory of data")
    parser.add_argument("--output_dir", '-o', type=str, default='results/',
                        help="output directory")
    args = parser.parse_args()

    i = args.mask
    subdir = 'hypergraph' if args.hypergraph else 'normal_graph'
    output_directory = os.path.join(args.output_dir, subdir)
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)

    data = readExportedData(args.data_dir, data_source[args.mask])
    simi_conti = compGraphSimilarity_Session(data['sessions'], begin=args.begin,
                                             end=args.end, hyper=args.hypergraph)
    write_pkl(f'{output_directory}/{data_source[i]}_simi_conti.pkl', simi_conti)
    simi_first = compGraphSimilarity_Session_First(data['sessions'], begin=args.begin,
                                                   end=args.end, hyper=args.hypergraph)
    write_pkl(f'{output_directory}/{data_source[i]}_simi_first.pkl', simi_first)
parser.add_argument("--output_dir", '-o', type=str, default='results', help="output directory") args = parser.parse_args() # repo_names = ['lsr', 'mesh', 'ncbigene', 'ndc', 'omim', 'orphanet', 'sabiork', 'sgd', 'sider', 'swdf', # 'affymetrix', 'dbsnp', 'gendr', 'goa', 'linkedgeodata', 'linkedspl', 'dbpedia'] # data_source = ['lsr', 'mesh', 'ncbigene', 'ndc', 'omim', 'orphanet', 'sabiork', 'sgd', 'sider', 'swdf', # 'affymetrix', 'dbsnp', 'gendr', 'goa', 'linkedgeodata', 'linkedspl', 'dbpedia'] data_source = [ 'ncbigene', 'ndc', 'orphanet', 'sgd', 'sider', 'swdf', 'affymetrix', 'goa', 'linkedgeodata', 'dbpedia.3.5.1.log', 'access.log-20151025', 'access.log-20151124', 'access.log-20151126', 'access.log-20151213', 'access.log-20151230', 'access.log-20160117', 'access.log-20160212', 'access.log-20160222', 'access.log-20160301', 'access.log-20160303', 'access.log-20160304', 'access.log-20160314', 'access.log-20160411' ] # mask = args.mask for mask in range(len(data_source)): dbpedia = False if mask <= 8 else True data = readExportedData(args.data_dir, data_source[mask]) query2vector, IRI_table = getAllVector(data['sessions'], data_source[mask], dbpedia=dbpedia) write_pkl(f'results/IRI_vector/{data_source[mask]}_IRI_table.pkl', IRI_table) write_pkl(f'results/IRI_vector/{data_source[mask]}_Vector.pkl', query2vector)
def vector_ana(data, mask, dir_='vector', normalize=False, debug=False):
    # data_source = ['lsr', 'mesh', 'ncbigene', 'ndc', 'omim', 'orphanet', 'sabiork', 'sgd', 'sider', 'swdf',
    #                'affymetrix', 'dbsnp', 'gendr', 'goa', 'linkedgeodata', 'linkedspl', 'dbpedia']
    data_source = [
        'ncbigene', 'ndc', 'orphanet', 'sgd', 'sider', 'swdf', 'affymetrix', 'goa',
        'linkedgeodata', 'dbpedia.3.5.1.log', 'access.log-20151025',
        'access.log-20151124', 'access.log-20151126', 'access.log-20151213',
        'access.log-20151230', 'access.log-20160117', 'access.log-20160212',
        'access.log-20160222', 'access.log-20160301', 'access.log-20160303',
        'access.log-20160304', 'access.log-20160314', 'access.log-20160411'
    ]
    # the first 9 entries are non-DBpedia datasets; the rest are DBpedia log files
    dbpedia = mask > 8
    query2text = sessions2Query2Text(data)
    query2vector = read_pkl(os.path.join(dir_, f'{data_source[mask]}_Vector.pkl'))

    confusionMtrix_dataset = []
    for index, sess in tqdm(enumerate(data), total=len(data), leave=True):
        session = sess['queries']
        session_len = sess['session_length']

        # skip the whole session if any of its queries cannot be parsed
        flag = 0
        infos = []
        for ith in range(session_len):
            queryi = session[ith]['index_in_file']
            texti = session[ith]['query_content']
            try:
                infoi = GetInfo(texti)
                infos.append(infoi)
            except Exception:
                flag = 1
                break
        if flag:
            continue

        if normalize:
            # per-session, per-dimension maximum used to scale vectors into [0, 1]
            maximum = np.zeros(10)
            for ith1 in range(session_len):
                query1 = session[ith1]['index_in_file']
                vector1 = query2vector[query1]
                for i, num in enumerate(vector1):
                    if num > maximum[i]:
                        maximum[i] = num
                if debug:
                    print(vector1)
            maximum = np.where(maximum == 0, 1, maximum)  # avoid division by zero
            if debug:
                print(maximum)
                from ipdb import set_trace; set_trace()  # debug breakpoint

        # pairwise KL divergence and cosine distance between the session's query vectors
        mat_kl = np.zeros((session_len, session_len))
        mat_cos = np.zeros((session_len, session_len))
        for ith1 in range(session_len):
            for ith2 in range(session_len):
                key = 'index_in_file'
                query1 = session[ith1][key]
                query2 = session[ith2][key]
                vector1 = query2vector[query1]
                vector2 = query2vector[query2]
                if debug:
                    print('before normalize')
                    print(vector1)
                    print(vector2)
                if normalize:
                    vector1 = vector1 / maximum
                    vector2 = vector2 / maximum
                    if debug:
                        print('after')
                        print(vector1)
                        print(vector2)
                        from ipdb import set_trace; set_trace()  # debug breakpoint
                mat_kl[ith1][ith2] = kl_divergence(vector1, vector2)
                mat_cos[ith1][ith2] = cosine_distance(vector1, vector2)

        confusionMtrix_dataset.append({
            'index': index,
            'mat_kl': mat_kl,
            'mat_cos': mat_cos
        })

    marker = '_normalized' if normalize else ''
    write_pkl(
        os.path.join(dir_, f'{data_source[mask]}_confusionMat{marker}.pkl'),
        confusionMtrix_dataset)
    return confusionMtrix_dataset
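# kl_divergence and cosine_distance are assumed by vector_ana but not defined in this
# section. The sketches below are illustrative only: they treat the feature vectors as
# non-negative count vectors, and the repository's versions may smooth or normalize
# differently.
import numpy as np


def kl_divergence(p, q, eps=1e-10):
    """KL divergence between two non-negative vectors, renormalized as distributions."""
    p = np.asarray(p, dtype=float) + eps
    q = np.asarray(q, dtype=float) + eps
    p = p / p.sum()
    q = q / q.sum()
    return float(np.sum(p * np.log(p / q)))


def cosine_distance(u, v):
    """1 - cosine similarity; returns 1.0 when either vector is all zeros."""
    u = np.asarray(u, dtype=float)
    v = np.asarray(v, dtype=float)
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    if denom == 0:
        return 1.0
    return float(1.0 - np.dot(u, v) / denom)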