def trainListFile(self, listTrainFile, listmanualfiles):
    if len(listmanualfiles) != len(listTrainFile):
        print("Error: the training file list and the manual file list differ in length")
        sys.exit()
    self.reset()
    queries = dlib.ranking_pairs()
    for index in range(len(listTrainFile)):
        self.reset()
        data = dlib.ranking_pair()
        # The plain file holds every sentence of the document; the manual
        # file holds the human-selected (relevant) sentences.
        with open(listTrainFile[index], 'r') as f:
            inputNonRelevant = " ".join(f.readlines())
        tpAllSent = myTokenizer(inputNonRelevant)
        self.inputFromString(inputNonRelevant)
        with open(listmanualfiles[index], 'r') as f:
            inputRelevant = " ".join(f.readlines())
        tpRelevant = myTokenizer(inputRelevant)
        # Every sentence not in the manual summary counts as non-relevant.
        tpNonRelevant = list(set(tpAllSent).difference(set(tpRelevant)))
        self.genAllVector()
        for sent in tpRelevant:
            data.relevant.append(dlib.vector(self.dicVector.get(sent.strip())))
        for sent in tpNonRelevant:
            data.nonrelevant.append(dlib.vector(self.dicVector.get(sent.strip())))
        queries.append(data)
    trainer = dlib.svm_rank_trainer()
    trainer.c = 10
    rank = trainer.train(queries)
    _weight = [rank.weights[i] for i in range(len(rank.weights))]
    return _weight
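# Since trainListFile() returns the raw weight list rather than the dlib rank
# object, scoring a new sentence amounts to a dot product between those
# weights and the sentence's feature vector. A minimal usage sketch, assuming
# a hypothetical `Summarizer` class hosting these methods and placeholder
# file names:
#
#     summarizer = Summarizer()  # hypothetical class name
#     weights = summarizer.trainListFile(
#         ["doc1_plain.txt", "doc2_plain.txt"],    # full documents (all sentences)
#         ["doc1_manual.txt", "doc2_manual.txt"])  # matching manual summaries
#
#     def rank_score(weights, feature_vector):
#         # An svm_rank ranking function scores a sample as the dot product
#         # of the learned weights with the sample's feature vector.
#         return sum(w * x for w, x in zip(weights, feature_vector))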
def train(self, directoryPlain, directoryManual):
    self.reset()
    listFile = []
    listPlainFile = listAllFileInFolder(directoryPlain)
    listManualFile = listAllFileInFolder(directoryManual)
    dicPlainFile = {}
    dicManualFile = {}
    # Index both directories by bare file name so the plain and manual
    # versions of the same document can be paired up.
    for file in listPlainFile:
        fname = file.strip().split('/')[-1]
        listFile.append(fname)
        dicPlainFile[fname] = file
    for file in listManualFile:
        fname = file.strip().split('/')[-1]
        listFile.append(fname)
        dicManualFile[fname] = file
    listFile = list(set(listFile))
    queries = dlib.ranking_pairs()
    countt = 0
    outfile = open("completefile.txt", 'w')
    for file in listFile:
        # Skip documents missing from either directory; otherwise the .get()
        # lookups below return None and crash.
        if file not in dicPlainFile or file not in dicManualFile:
            continue
        outvecfile = open("/home/hien/Data/Work/Wordnet_naiscorp/test/valuevector/" + file.strip().split('/')[-1], 'w')
        countt += 1
        outfile.write(file + '\n')
        print(file, countt)
        self.reset()
        data = dlib.ranking_pair()
        with open(dicPlainFile.get(file), 'r') as f:
            inputNonRelevant = " ".join(f.readlines())
        tpAllSent = myTokenizer(inputNonRelevant)
        self.inputFromString(inputNonRelevant)
        with open(dicManualFile.get(file), 'r') as f:
            inputRelevant = " ".join(f.readlines())
        tpRelevant = myTokenizer(inputRelevant)
        tpNonRelevant = list(set(tpAllSent).difference(set(tpRelevant)))
        self.genAllVector()
        # Dump every sentence vector alongside its sentence for inspection.
        for sent in tpAllSent:
            outvecfile.write(str(self.dicVector.get(sent.strip())) + "\t" + sent.strip() + '\n')
        outvecfile.close()
        for sent in tpRelevant:
            data.relevant.append(dlib.vector(self.dicVector.get(sent.strip())))
        for sent in tpNonRelevant:
            data.nonrelevant.append(dlib.vector(self.dicVector.get(sent.strip())))
        queries.append(data)
    outfile.close()
    trainer = dlib.svm_rank_trainer()
    trainer.c = 10
    rank = trainer.train(queries)
    _weight = [rank.weights[i] for i in range(len(rank.weights))]
    return _weight
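# train() depends on a listAllFileInFolder() helper defined elsewhere in the
# project. A minimal sketch of what it presumably returns, assuming a flat
# directory of files; this reconstruction is an assumption, not the project's
# actual implementation:
#
#     import os
#
#     def listAllFileInFolder(folder):
#         # Assumed behavior: return the folder-qualified path of every
#         # regular file directly inside `folder`. train() later splits each
#         # path on '/' to recover the bare file name, so returning full
#         # paths matches its usage.
#         return [os.path.join(folder, name)
#                 for name in os.listdir(folder)
#                 if os.path.isfile(os.path.join(folder, name))]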
# In the example above, each ranking_pair contained a single
# relevant set and non-relevant set. The trainer is attempting to find a
# ranking function that gives every relevant vector a higher score than every
# non-relevant vector. Sometimes what you want to do is a little more complex
# than this.
#
# For example, in the web page ranking example we have to rank pages based on a
# user's query. In this case, each query will have its own set of relevant and
# non-relevant documents. What might be relevant to one query may well be
# non-relevant to another. So in this case we don't have a single global set of
# relevant web pages and another set of non-relevant web pages.
#
# To handle cases like this, we can simply give multiple ranking_pair instances
# to the trainer. Therefore, each ranking_pair would represent the
# relevant/non-relevant sets for a particular query. An example is shown below
# (for simplicity, we reuse our data from above to make 4 identical "queries").
queries = dlib.ranking_pairs()
queries.append(data)
queries.append(data)
queries.append(data)
queries.append(data)

# We can train just as before.
rank = trainer.train(queries)

# Now that we have multiple ranking_pair instances, we can also use
# cross_validate_ranking_trainer(). This performs cross-validation by splitting
# the queries up into folds. That is, it lets the trainer train on a subset of
# ranking_pair instances and tests on the rest. It does this over 4 different
# splits and returns the overall ranking accuracy based on the held out data.
# Just like test_ranking_function(), it reports both the ordering accuracy and
# mean average precision.
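# Completing the thought above, the cross-validation call itself is a
# one-liner in dlib's Python API; `trainer` and `queries` are the objects
# built just above.

# Perform 4-fold cross-validation over the ranking_pair instances and print
# the resulting ordering accuracy and mean average precision.
print("Cross validation results: {}".format(
    dlib.cross_validate_ranking_trainer(trainer, queries, 4)))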