def computeScoresOverride(self, query, docs):
    """Score each candidate document against the query using ``self.model``.

    The query and every document are converted to text with
    ``self.concatTextEntryWords``, arranged into a left/right pair table,
    run through ``pack`` and ``self.prep.transform``, and finally passed to
    ``self.model.predict``.

    Args:
        query: the query entry; only its ``concatTextEntryWords`` text is used.
        docs: a sized, indexable sequence of document entries, each exposing
            an ``id`` attribute.

    Returns:
        A dict mapping each document's ``id`` to the prediction stored at the
        same position in the model output. An empty ``docs`` yields ``{}``
        without invoking the preprocessing/model pipeline.
    """
    queryText = self.concatTextEntryWords(query)
    if self.debugPrint:
        print('getScores', queryText)

    docQty = len(docs)
    # Nothing to rank: avoid feeding a zero-row frame through
    # pack/transform/predict, which have no useful work to do on it.
    if docQty == 0:
        return {}

    # One row per (query, document) pair. The query id and the label are
    # placeholders: at scoring time there is no real query id or relevance
    # label, but the columns are still supplied to `pack`.
    dataRaw = pd.DataFrame({
        'id_left': ['fake_qid'] * docQty,
        'text_left': [queryText] * docQty,
        'id_right': [e.id for e in docs],
        'text_right': [self.concatTextEntryWords(e) for e in docs],
        'label': [0] * docQty,
    })

    dataTestPacked = pack(dataRaw)
    dataTestProc = self.prep.transform(dataTestPacked)
    # unpack() returns a pair; only the first element is fed to the model.
    dataForModel, _ = dataTestProc.unpack()

    preds = self.model.predict(dataForModel)

    # NOTE(review): assumes predictions come back in the same row order as
    # dataRaw (i.e. preds[k] belongs to docs[k]) -- confirm against the
    # preprocessor/model in use.
    sampleRet = {}
    for k, e in enumerate(docs):
        score = preds[k]
        if self.debugPrint:
            print(score, self.textEntryToStr(e))
        sampleRet[e.id] = score

    return sampleRet
dataFileTrain = os.path.join('collections', 'derived_data', colName, 'match_zoo_train', 'tran_neg10.tsv') dataFileTest = os.path.join('collections', 'derived_data', colName, 'match_zoo_train', 'dev1_allCand.tsv') print( f'Collection: {colName} # of epochs: {epochQty} model file: {modelFile} data transform file: {dataTranFile}' ) if os.path.exists(modelFile): # Stupid hack for now, b/c save will fail if the model exists print('Model already exists, exiting!') sys.exit(1) # Note dtype! don't let Pandas guess column data types! dataTrainPacked = pack(readWhiteSpacedMatchZooData(dataFileTrain)) dataTestPacked = pack(readWhiteSpacedMatchZooData(dataFileTest)) #prep = mz.preprocessors.BasicPreprocessor() prep = WhiteSpacePreprocessor() import pdb, sys #try: if True: rankingTask = mz.tasks.Ranking(loss=mz.losses.RankHingeLoss()) rankingTask.metrics = [ mz.metrics.NormalizedDiscountedCumulativeGain(k=3), mz.metrics.NormalizedDiscountedCumulativeGain(k=5), mz.metrics.MeanAveragePrecision()