def compute_proxEmbedBySubgraph(
        wordsEmbeddings=None,
        wordsEmbeddings_path=None,
        word_dimension=0,
        dimension=0,
        wordsSize=0,
        subpaths_map=None,
        subpaths_file=None,
        subgraphs_file='',
        maxlen_subpaths=1000,
        maxlen=100,
        test_data_file='',
        top_num=10,
        ideal_data_file='',
        func=None,
        ):
    """Rank the candidates of every test query with ``func`` and return (MAP, MnDCG).

    Each line of ``test_data_file`` is whitespace separated integers: the
    first is the query id, the rest are candidate ids.  Each line of
    ``ideal_data_file`` has the same layout; everything after the first
    integer is stored as the ground truth for that line number.

    Args:
        wordsEmbeddings: embeddings handed to ``func``.  When ``None`` they
            are loaded from ``wordsEmbeddings_path``.
        wordsEmbeddings_path: file passed to
            ``dataProcessTools.getWordsEmbeddings`` when ``wordsEmbeddings``
            is ``None``.
        dimension: forwarded unchanged to the data preparation helper.
        subgraphs_file: file passed to
            ``dataProcessTools.readAllSubgraphDependencyAndSequencesWithLengths``.
        test_data_file: queries and candidates to score.
        top_num: forwarded to ``toolsFunction.mapSortByValueDESC`` and to the
            two metric functions.
        ideal_data_file: ground truth rankings.
        func: scoring callable; called once per (query, candidate) pair that
            has data.
        word_dimension, wordsSize, subpaths_map, subpaths_file,
        maxlen_subpaths, maxlen: accepted for interface compatibility; this
            function does not read them.

    Returns:
        ``(MAP, MnDCG)`` as produced by ``evaluateTools.get_MAP`` and
        ``evaluateTools.get_MnDCG``.

    Raises:
        SystemExit: neither ``wordsEmbeddings`` nor ``wordsEmbeddings_path``
            was supplied.
        IndexError, ValueError: a line of the test file is empty or holds a
            non-integer token.
    """
    if wordsEmbeddings is None:
        if wordsEmbeddings_path is None:
            # The interactive ``exit`` helper with status 0 used to end the
            # process silently and "successfully"; fail loudly instead.
            raise SystemExit(
                'compute_proxEmbedBySubgraph: neither wordsEmbeddings nor '
                'wordsEmbeddings_path was given')
        # Only the embeddings themselves are needed; the reported dimension
        # and vocabulary size are not used below.
        wordsEmbeddings, _, _ = dataProcessTools.getWordsEmbeddings(
            wordsEmbeddings_path)

    subgraphs_map = dataProcessTools.readAllSubgraphDependencyAndSequencesWithLengths(
        subgraphs_file)

    # Single-argument print() behaves the same under Python 2 and Python 3.
    print('%s %s' % ('Compute MAP and nDCG for file ', test_data_file))

    test_map = {}
    with open(test_data_file) as f:
        for line_count, l in enumerate(f):
            arr = l.strip().split()
            query = int(arr[0])
            candidate_scores = {}
            for token in arr[1:]:
                candidate = int(token)
                (sequences_data, mask_data, lens_data, subgraph_lens_data,
                 buffer_tensor_data, nodesLens_data) = (
                    dataProcessTools.prepareDataForTestForSubgraphSingleSequenceWithLengthsAsymmetric(
                        query, candidate, subgraphs_map, dimension))
                if sequences_data is None and mask_data is None and lens_data is None:
                    # No data for this pair: use a sentinel score.
                    candidate_scores[candidate] = -1000.
                else:
                    candidate_scores[candidate] = func(
                        sequences_data, mask_data, lens_data,
                        subgraph_lens_data, wordsEmbeddings,
                        buffer_tensor_data, nodesLens_data)
            test_map[line_count] = toolsFunction.mapSortByValueDESC(
                candidate_scores, top_num)

    ideal_map = {}
    with open(ideal_data_file) as f:
        for line_count, l in enumerate(f):
            # Every token is converted (so a bad query id still raises);
            # only the tokens after the query id are kept.
            ideal_map[line_count] = [int(x) for x in l.strip().split()][1:]

    MAP = evaluateTools.get_MAP(top_num, ideal_map, test_map)
    MnDCG = evaluateTools.get_MnDCG(top_num, ideal_map, test_map)
    return MAP, MnDCG
def compute_proxEmbed(
        wordsEmbeddings=None,  # words embeddings
        wordsEmbeddings_path=None,  # the file path of words embeddings
        word_dimension=0,  # dimension of words embeddings
        dimension=0,  # the dimension of paths embeddings
        wordsSize=0,  # the size of words vocabulary
        subpaths_map=None,  # contains sub-paths
        subpaths_file=None,  # the file which contains sub-paths
        maxlen_subpaths=1000,  # the max length for sub-paths
        maxlen=100,  # not read by this function
        test_data_file='',  # the file path of test data
        top_num=10,  # the top num to predict
        ideal_data_file='',  # ground truth
        func=None,  # model function
        ):
    """Compute the result of the model: rank test candidates, return (MAP, MnDCG).

    Each line of ``test_data_file`` is whitespace separated integers: the
    first is the query id, the rest are candidate ids.  Each line of
    ``ideal_data_file`` has the same layout; everything after the first
    integer is stored as the ground truth for that line number.

    Args:
        wordsEmbeddings: embeddings handed to ``func``; loaded from
            ``wordsEmbeddings_path`` when ``None``.
        subpaths_map: sub-path data handed to
            ``dataProcessTools.prepareDataForTest``; loaded from
            ``subpaths_file`` with ``dataProcessTools.loadAllSubPaths`` when
            ``None``.
        maxlen_subpaths: forwarded to ``loadAllSubPaths``.
        top_num: forwarded to ``toolsFunction.mapSortByValueDESC`` and to the
            two metric functions.
        func: scoring callable, called as
            ``func(matrix, mask, lens, wordsEmbeddings)``.
        word_dimension, dimension, wordsSize, maxlen: accepted for interface
            compatibility; their incoming values are not read.

    Returns:
        ``(MAP, MnDCG)`` as produced by ``evaluateTools.get_MAP`` and
        ``evaluateTools.get_MnDCG``.

    Raises:
        SystemExit: a required input (embeddings or sub-paths) has neither a
            value nor a file path.
        IndexError, ValueError: a line of the test file is empty or holds a
            non-integer token.
    """
    if wordsEmbeddings is None:
        if wordsEmbeddings_path is None:
            print('There is not path for wordsEmbeddings, exit!!!')
            # Non-zero status: this is a fatal configuration error, and the
            # interactive ``exit`` helper is not meant for programs.
            raise SystemExit(1)
        wordsEmbeddings, _, _ = dataProcessTools.getWordsEmbeddings(
            wordsEmbeddings_path)

    if subpaths_map is None:
        if subpaths_file is None:
            print('There is not path for sub-paths, exit!!!')
            raise SystemExit(1)
        subpaths_map = dataProcessTools.loadAllSubPaths(
            subpaths_file, maxlen_subpaths)

    # Single-argument print() behaves the same under Python 2 and Python 3.
    print('%s %s' % ('Compute MAP and nDCG for file ', test_data_file))

    test_map = {}
    with open(test_data_file) as f:
        for line_count, l in enumerate(f):
            arr = l.strip().split()
            query = int(arr[0])
            candidate_scores = {}
            for token in arr[1:]:
                candidate = int(token)
                (subPaths_matrix_data, subPaths_mask_data,
                 subPaths_lens_data) = dataProcessTools.prepareDataForTest(
                    query, candidate, subpaths_map)
                if (subPaths_matrix_data is None
                        and subPaths_mask_data is None
                        and subPaths_lens_data is None):
                    # No data for this pair: use a sentinel score.
                    candidate_scores[candidate] = -1000.
                else:
                    candidate_scores[candidate] = func(
                        subPaths_matrix_data, subPaths_mask_data,
                        subPaths_lens_data, wordsEmbeddings)
            test_map[line_count] = toolsFunction.mapSortByValueDESC(
                candidate_scores, top_num)

    ideal_map = {}
    with open(ideal_data_file) as f:
        for line_count, l in enumerate(f):
            # Every token is converted (so a bad query id still raises);
            # only the tokens after the query id are kept.
            ideal_map[line_count] = [int(x) for x in l.strip().split()][1:]

    MAP = evaluateTools.get_MAP(top_num, ideal_map, test_map)
    MnDCG = evaluateTools.get_MnDCG(top_num, ideal_map, test_map)
    return MAP, MnDCG
def compute_path2vec(
        wordsEmbeddings=None,
        wordsEmbeddings_path='None',
        typesEmbeddings=None,
        typesEmbeddings_path='None',
        word_dimension=0,
        type_dimension=0,
        dimension=0,
        attention_dimension=0,
        wordsSize=0,
        subpaths_map=None,
        subpaths_file='',
        sequences_map=None,
        sequences_file='',
        maxlen_subpaths=1000,
        maxlen=100,
        alpha=0,
        beta=0,
        gamma=0,
        test_data_file='',
        top_num=10,
        ideal_data_file='',
        func=None,
        ):
    """Score each query's candidates in one batch and return (MAP, MnDCG).

    Each line of ``test_data_file`` is whitespace separated integers: the
    first is the query id, the rest are candidate ids.  A candidate is put
    in the batch only when ``"<query>-<candidate>"`` or
    ``"<candidate>-<query>"`` is a key of the loaded sequences; the others
    get the sentinel score ``-1000.`` and are counted in ``errCount``.  When
    the prepared batch is empty every candidate of the line is scored
    ``-1.`` instead.

    NOTE(review): the path defaults are the *string* ``'None'``, so the
    ``is not None`` guards below only trigger when a caller passes ``None``
    explicitly.  Confirm whether the helpers treat ``'None'`` specially.
    NOTE(review): this file holds another definition named
    ``compute_path2vec``; the later one replaces the earlier at import time.
    Confirm which is intended.

    Args:
        wordsEmbeddings, typesEmbeddings: embeddings handed to ``func``;
            loaded from the matching ``*_path`` when ``None``.
        sequences_file: file passed to
            ``dataProcessTools.readAllSequencesFromFile``.
        alpha, beta, gamma: forwarded to
            ``dataProcessTools.prepareDataForTestBatch``.
        top_num: forwarded to ``toolsFunction.mapSortByValueDESC`` and to the
            two metric functions.
        func: scoring callable; its result is indexed by batch position.
        word_dimension, type_dimension, dimension, attention_dimension,
        wordsSize, subpaths_map, subpaths_file, sequences_map,
        maxlen_subpaths, maxlen: accepted for interface compatibility; their
            incoming values are not read.

    Returns:
        ``(MAP, MnDCG)`` as produced by ``evaluateTools.get_MAP`` and
        ``evaluateTools.get_MnDCG``.

    Raises:
        SystemExit: an embeddings input is ``None`` and so is its path.
        IndexError, ValueError: a line of the test file is empty or holds a
            non-integer token.
    """
    import sys  # local import: only needed for the progress marker below

    if wordsEmbeddings is None:
        if wordsEmbeddings_path is None:
            print('Exit...')
            # Non-zero status: this is a fatal configuration error, and the
            # interactive ``exit`` helper is not meant for programs.
            raise SystemExit(1)
        wordsEmbeddings, _, _ = dataProcessTools.getWordsEmbeddings(
            wordsEmbeddings_path)

    if typesEmbeddings is None:
        if typesEmbeddings_path is None:
            print('Exit...')
            raise SystemExit(1)
        typesEmbeddings, _, _ = dataProcessTools.getTypesEmbeddings(
            typesEmbeddings_path)

    sequences_data = dataProcessTools.readAllSequencesFromFile(sequences_file)

    errCount = 0
    test_map = {}
    # Single-argument print() behaves the same under Python 2 and Python 3.
    print('%s %s' % ('Compute MAP and nDCG for file ', test_data_file))
    with open(test_data_file) as f:
        for line_count, l in enumerate(f):
            arr = l.strip().split()
            query = int(arr[0])
            candidate_scores = {}
            candidates = []
            for token in arr[1:]:
                # Sequences may be stored under either orientation of the pair.
                forward_key = arr[0] + '-' + token
                backward_key = token + '-' + arr[0]
                if forward_key in sequences_data or backward_key in sequences_data:
                    candidates.append(int(token))
                else:
                    candidate_scores[int(token)] = -1000.
                    errCount += 1

            (sequences_matrix, dependency_matrix, dependWeight_matrix,
             sequencesLen_vector, discountSeq_matrix,
             discountForEachNode_matrix, masks_matrix,
             group_tensor) = dataProcessTools.prepareDataForTestBatch(
                query, candidates, sequences_data, alpha, beta, gamma)

            if len(sequences_matrix) > 0:
                scores = func(
                    sequences_matrix, dependency_matrix, dependWeight_matrix,
                    sequencesLen_vector, discountSeq_matrix,
                    discountForEachNode_matrix, wordsEmbeddings,
                    typesEmbeddings, masks_matrix, group_tensor)
                # Indexing (not zip) so a short result still raises.
                for index, candidate in enumerate(candidates):
                    candidate_scores[candidate] = scores[index]
            else:
                # Empty batch: every candidate, including those already
                # given -1000., is scored -1.
                for token in arr[1:]:
                    candidate_scores[int(token)] = -1.

            test_map[line_count] = toolsFunction.mapSortByValueDESC(
                candidate_scores, top_num)

            processed = line_count + 1
            if processed % 500 == 0:
                # Same visible output as the Python 2 ``print '+',`` form:
                # markers separated by single spaces on one line.
                sys.stdout.write('+ ')
            if processed % 5000 == 0:
                print('%s %s' % (' time ==', time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))))

    ideal_map = {}
    with open(ideal_data_file) as f:
        for line_count, l in enumerate(f):
            # Every token is converted (so a bad query id still raises);
            # only the tokens after the query id are kept.
            ideal_map[line_count] = [int(x) for x in l.strip().split()][1:]

    MAP = evaluateTools.get_MAP(top_num, ideal_map, test_map)
    MnDCG = evaluateTools.get_MnDCG(top_num, ideal_map, test_map)
    print('%s %s' % ('errCount =', errCount))
    return MAP, MnDCG
def compute_metagraphAttention(
        wordsEmbeddings=None,  # words embeddings
        wordsEmbeddings_path=None,  # the file path of words embeddings
        metagraphEmbeddings_path=None,  # the file path of metagraph embeddings
        wordsSize=0,  # the size of words vocabulary
        subpaths_map=None,  # contains sub-paths
        subpaths_file=None,  # the file which contains sub-paths
        maxlen_subpaths=1000,  # the max length for sub-paths
        test_data_file='',  # test data file
        top_num=10,  # top num in experiments
        ideal_data_file='',  # ideal data file
        func=None,  # the MPE process model
        ):
    """Evaluate the MPE model: rank test candidates and return (MAP, MnDCG).

    Each line of ``test_data_file`` is whitespace separated integers: the
    first is the query id, the rest are candidate ids.  Each line of
    ``ideal_data_file`` has the same layout; everything after the first
    integer is stored as the ground truth for that line number.

    Args:
        wordsEmbeddings: embeddings handed to ``func``; loaded from
            ``wordsEmbeddings_path`` when ``None``.
        metagraphEmbeddings_path: file passed to
            ``dataProcessTools.getMetagraphEmbeddings``; it is always loaded
            and is not checked for ``None`` here.
        subpaths_map: sub-path data handed to
            ``dataProcessTools.prepareDataForTest``; loaded from
            ``subpaths_file`` with
            ``dataProcessTools.loadAllSubPathsRomove0Path`` when ``None``.
        maxlen_subpaths: forwarded to ``loadAllSubPathsRomove0Path``.
        top_num: forwarded to ``toolsFunction.mapSortByValueDESC`` and to the
            two metric functions.
        func: scoring callable, called as
            ``func(metagraph_embeddings, matrix, mask, wordsEmbeddings)``.
        wordsSize: accepted for interface compatibility; its incoming value
            is not read.

    Returns:
        ``(MAP, MnDCG)`` as produced by ``evaluateTools.get_MAP`` and
        ``evaluateTools.get_MnDCG``.

    Raises:
        SystemExit: a required input (embeddings or sub-paths) has neither a
            value nor a file path.
        IndexError, ValueError: a line of the test file is empty or holds a
            non-integer token.
    """
    if wordsEmbeddings is None:
        if wordsEmbeddings_path is None:
            print('There is not path for wordsEmbeddings, exit!!!')
            # Non-zero status: this is a fatal configuration error, and the
            # interactive ``exit`` helper is not meant for programs.
            raise SystemExit(1)
        wordsEmbeddings, _, _ = dataProcessTools.getWordsEmbeddings(
            wordsEmbeddings_path)

    if subpaths_map is None:
        if subpaths_file is None:
            print('There is not path for sub-paths, exit!!!')
            raise SystemExit(1)
        subpaths_map = dataProcessTools.loadAllSubPathsRomove0Path(
            subpaths_file, maxlen_subpaths, wordsEmbeddings)

    # Only the embedding data is used; the reported dimension and size are not.
    metagraphEmbedding_data, _, _ = dataProcessTools.getMetagraphEmbeddings(
        metagraphEmbeddings_path)

    # Single-argument print() behaves the same under Python 2 and Python 3.
    print('%s %s' % ('Compute MAP and nDCG for file ', test_data_file))

    test_map = {}
    with open(test_data_file) as f:
        for line_count, l in enumerate(f):
            arr = l.strip().split()
            query = int(arr[0])
            candidate_scores = {}
            for token in arr[1:]:
                candidate = int(token)
                (subPaths_matrix_data, subPaths_mask_data,
                 subPaths_lens_data) = dataProcessTools.prepareDataForTest(
                    query, candidate, subpaths_map)
                if (subPaths_matrix_data is None
                        and subPaths_mask_data is None
                        and subPaths_lens_data is None):
                    # No data for this pair: use a sentinel score.
                    candidate_scores[candidate] = -1000.
                else:
                    candidate_scores[candidate] = func(
                        metagraphEmbedding_data, subPaths_matrix_data,
                        subPaths_mask_data, wordsEmbeddings)
                # Drop the per-pair data before the next pair is prepared.
                del subPaths_matrix_data
                del subPaths_mask_data
                del subPaths_lens_data
            test_map[line_count] = toolsFunction.mapSortByValueDESC(
                candidate_scores, top_num)
            # Release the per-line scores and force a collection after
            # every line, as the original did.
            candidate_scores = None
            gc.collect()

    ideal_map = {}
    with open(ideal_data_file) as f:
        for line_count, l in enumerate(f):
            # Every token is converted (so a bad query id still raises);
            # only the tokens after the query id are kept.
            ideal_map[line_count] = [int(x) for x in l.strip().split()][1:]

    MAP = evaluateTools.get_MAP(top_num, ideal_map, test_map)
    MnDCG = evaluateTools.get_MnDCG(top_num, ideal_map, test_map)
    return MAP, MnDCG
def compute_path2vec(
        wordsEmbeddings=None,
        wordsEmbeddings_path='None',
        typesEmbeddings=None,
        typesEmbeddings_path='None',
        word_dimension=0,
        type_dimension=0,
        dimension=0,
        attention_dimension=0,
        wordsSize=0,
        subpaths_map=None,
        subpaths_file='',
        sequences_map=None,
        sequences_file='',
        maxlen_subpaths=1000,
        maxlen=100,
        alpha=0,
        beta=0,
        gamma=0,
        test_data_file='',
        top_num=10,
        ideal_data_file='',
        func=None,
        ):
    """Score each (query, candidate) pair one at a time and return (MAP, MnDCG).

    Each line of ``test_data_file`` is whitespace separated integers: the
    first is the query id, the rest are candidate ids.  A pair whose
    prepared ``sequences_matrix`` is ``None`` or empty gets the sentinel
    score ``-1000.`` and is counted in ``errCount``.

    NOTE(review): the path defaults are the *string* ``'None'``, so the
    ``is not None`` guards below only trigger when a caller passes ``None``
    explicitly.  Confirm whether the helpers treat ``'None'`` specially.
    NOTE(review): this file holds another definition named
    ``compute_path2vec``; the later one replaces the earlier at import time.
    Confirm which is intended.
    NOTE(review): ``dataProcessTools.prepareDataForTest`` is called here
    with six arguments and unpacked into six values, while other functions
    in this file call it with three and unpack three.  Verify against the
    helper's actual signature.

    Args:
        wordsEmbeddings, typesEmbeddings: embeddings handed to ``func``;
            loaded from the matching ``*_path`` when ``None``.
        sequences_file: file passed to
            ``dataProcessTools.readAllSequencesFromFile``.
        alpha, beta, gamma: forwarded to
            ``dataProcessTools.prepareDataForTest``.
        top_num: forwarded to ``toolsFunction.mapSortByValueDESC`` and to the
            two metric functions.
        func: scoring callable, called once per pair that has data.
        word_dimension, type_dimension, dimension, attention_dimension,
        wordsSize, subpaths_map, subpaths_file, sequences_map,
        maxlen_subpaths, maxlen: accepted for interface compatibility; their
            incoming values are not read.

    Returns:
        ``(MAP, MnDCG)`` as produced by ``evaluateTools.get_MAP`` and
        ``evaluateTools.get_MnDCG``.

    Raises:
        SystemExit: an embeddings input is ``None`` and so is its path.
        IndexError, ValueError: a line of the test file is empty or holds a
            non-integer token.
    """
    if wordsEmbeddings is None:
        if wordsEmbeddings_path is None:
            # The interactive ``exit`` helper with status 0 used to end the
            # process silently and "successfully"; fail loudly instead.
            raise SystemExit(
                'compute_path2vec: neither wordsEmbeddings nor '
                'wordsEmbeddings_path was given')
        wordsEmbeddings, _, _ = dataProcessTools.getWordsEmbeddings(
            wordsEmbeddings_path)

    if typesEmbeddings is None:
        if typesEmbeddings_path is None:
            raise SystemExit(
                'compute_path2vec: neither typesEmbeddings nor '
                'typesEmbeddings_path was given')
        typesEmbeddings, _, _ = dataProcessTools.getTypesEmbeddings(
            typesEmbeddings_path)

    sequences_data = dataProcessTools.readAllSequencesFromFile(sequences_file)

    errCount = 0
    test_map = {}
    # Single-argument print() behaves the same under Python 2 and Python 3.
    print('%s %s' % ('Compute MAP and nDCG for file ', test_data_file))
    with open(test_data_file) as f:
        for line_count, l in enumerate(f):
            arr = l.strip().split()
            query = int(arr[0])
            candidate_scores = {}
            for token in arr[1:]:
                candidate = int(token)
                (sequences_matrix, dependency_matrix, dependWeight_matrix,
                 sequencesLen_vector, discountSeq_matrix,
                 discountForEachNode_matrix) = dataProcessTools.prepareDataForTest(
                    query, candidate, sequences_data, alpha, beta, gamma)
                if sequences_matrix is None or len(sequences_matrix) == 0:
                    # No data for this pair: sentinel score, counted as an error.
                    candidate_scores[candidate] = -1000.
                    errCount += 1
                else:
                    candidate_scores[candidate] = func(
                        sequences_matrix, dependency_matrix,
                        dependWeight_matrix, sequencesLen_vector,
                        discountSeq_matrix, discountForEachNode_matrix,
                        wordsEmbeddings, typesEmbeddings)
            test_map[line_count] = toolsFunction.mapSortByValueDESC(
                candidate_scores, top_num)

    ideal_map = {}
    with open(ideal_data_file) as f:
        for line_count, l in enumerate(f):
            # Every token is converted (so a bad query id still raises);
            # only the tokens after the query id are kept.
            ideal_map[line_count] = [int(x) for x in l.strip().split()][1:]

    MAP = evaluateTools.get_MAP(top_num, ideal_map, test_map)
    MnDCG = evaluateTools.get_MnDCG(top_num, ideal_map, test_map)
    print('%s %s' % ('errCount =', errCount))
    return MAP, MnDCG