Exemplo n.º 1
0
    def computeScoresOverride(self, query, docs):
        """Score every candidate document against the query using ``self.model``.

        One (query, document) pair is built per candidate. The pairs are
        packed, passed through ``self.prep.transform``, unpacked, and fed to
        ``self.model.predict``.

        :param query: query entry; its text is produced by
                      ``self.concatTextEntryWords``.
        :param docs:  sequence of candidate entries, each exposing ``id``;
                      their text is produced by ``self.concatTextEntryWords``.
        :return: dict mapping each document id to ``preds[k]``, the model
                 output at the document's position ``k`` in ``docs``.
                 An empty ``docs`` yields an empty dict.
        """
        queryText = self.concatTextEntryWords(query)
        if self.debugPrint:
            print('getScores', queryText)

        docQty = len(docs)
        # Nothing to rank: do not push an empty batch through the
        # packing/preprocessing/prediction pipeline.
        # len() is used rather than truthiness so any sequence type works.
        if docQty == 0:
            return {}

        dataRaw = pd.DataFrame({
            # Every row refers to the same query, hence one placeholder id.
            'id_left': ['fake_qid'] * docQty,
            'text_left': [queryText] * docQty,
            'id_right': [e.id for e in docs],
            'text_right': [self.concatTextEntryWords(e) for e in docs],
            # Dummy label; presumably only needed to satisfy the schema
            # expected by pack() -- TODO confirm.
            'label': [0] * docQty,
        })

        dataTestProc = self.prep.transform(pack(dataRaw))

        # Only the first element of the unpacked pair is used as model input.
        dataForModel, _ = dataTestProc.unpack()

        preds = self.model.predict(dataForModel)

        # NOTE(review): assumes predictions come back in the same order as
        # docs -- confirm against pack()/unpack() behavior.
        sampleRet = {}
        for k, e in enumerate(docs):
            score = preds[k]
            if self.debugPrint:
                print(score, self.textEntryToStr(e))
            sampleRet[e.id] = score

        return sampleRet
Exemplo n.º 2
0
# Training and evaluation inputs live side by side in the collection's
# match_zoo_train directory.
dataFileTrain, dataFileTest = (
    os.path.join('collections', 'derived_data', colName, 'match_zoo_train', fileName)
    for fileName in ('tran_neg10.tsv', 'dev1_allCand.tsv')
)

print(f'Collection: {colName} # of epochs: {epochQty} model file: {modelFile} data transform file: {dataTranFile}')

# Temporary workaround: saving fails when the model file is already present,
# so stop before doing any work.
if os.path.exists(modelFile):
    print('Model already exists, exiting!')
    sys.exit(1)

# NOTE: mind the dtypes -- Pandas must not be left to guess column data types
# (presumably handled inside readWhiteSpacedMatchZooData; verify).
dataTrainPacked, dataTestPacked = (
    pack(readWhiteSpacedMatchZooData(path))
    for path in (dataFileTrain, dataFileTest)
)

# Used in place of mz.preprocessors.BasicPreprocessor().
prep = WhiteSpacePreprocessor()

import pdb, sys

#try:
if True:

    rankingTask = mz.tasks.Ranking(loss=mz.losses.RankHingeLoss())
    rankingTask.metrics = [
        mz.metrics.NormalizedDiscountedCumulativeGain(k=3),
        mz.metrics.NormalizedDiscountedCumulativeGain(k=5),
        mz.metrics.MeanAveragePrecision()