# Ranking tests and evaluation scripts for the IR models (vector space,
# Jelinek-Mercer, Okapi BM25, plus pagerank reranking) on the CACM/CISI corpora.
import numpy as np

import myParser
import queryParser
import indexerSimple
import weighter
import vectoriel
import jelinekMercer
import okapiBM25
import pagerank
import averagePrecision
import precisionAtK
import fMesureK
import rappelAtK
import NDCG
import reciprocalRank


def test_ranking_short():
    parsed = None
    file = "data/cacmShort.txt"
    # The tests may be launched from the project root or from a subdirectory.
    for path in ["./", "../"]:
        try:
            parsed = myParser.buildDocCollectionSimple(path + file, ".T")
            break
        except FileNotFoundError:
            pass
    assert parsed
    indexer = indexerSimple.IndexerSimple(parsed.docs)
    requete = "home computer microphotographi"
    weighters = [weighter.c1, weighter.c2, weighter.c3, weighter.c4, weighter.c5]
    weighters = [clas(indexer) for clas in weighters]
    models = [vectoriel.Vectoriel(indexer, weight, False) for weight in weighters]
    models.append(jelinekMercer.JelinekMercer(indexer, 1))
    models.append(okapiBM25.OkapiBM25(indexer, 1.2, .75))
    rankings = [m.getRanking(requete) for m in models]
    # 5 docs have a nonzero score whatever the model
    for ranking in rankings:
        assert len(ranking) == 5
    # result order
    for ranking in rankings[:-1]:
        assert [x[0] for x in ranking] == ["7", "6", "4", "2", "10"]
    assert [x[0] for x in rankings[-1]] == ["6", "7", "4", "10", "2"]
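# Hedged reference, not the project's actual implementation: the classic Okapi
# BM25 per-term score that okapiBM25.OkapiBM25(indexer, 1.2, .75) is assumed to
# compute, with k1 = 1.2 and b = 0.75. All argument names here (tf, df, N,
# doc_len, avg_doc_len) are hypothetical inputs for illustration only.
def bm25_term_score_sketch(tf, df, N, doc_len, avg_doc_len, k1=1.2, b=0.75):
    import math
    # Robertson-Sparck Jones IDF; the +1 inside the log keeps the score positive.
    idf = math.log((N - df + 0.5) / (df + 0.5) + 1)
    # b controls how strongly long documents are penalized.
    norm = k1 * (1 - b + b * doc_len / avg_doc_len)
    return idf * tf * (k1 + 1) / (tf + norm)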
def optimisationOkapi(debut1, fin1, nbPoint1, debut2, fin2, nbPoint2,
                      donnees, labels, indexer):
    """Grid search over the BM25 hyperparameters k1 and b, maximizing the
    mean average precision (MAP) over the training queries."""
    rangeK = np.linspace(debut1, fin1, nbPoint1)
    rangeB = np.linspace(debut2, fin2, nbPoint2)
    # The model needs an indexer, now passed as a parameter (the original
    # built OkapiBM25() with no argument, which cannot work).
    m = okapiBM25.OkapiBM25(indexer)
    mapKB = []
    param = []
    for k1 in rangeK:
        for b in rangeB:
            s = 0
            for k in range(len(donnees)):
                predictionModele = m.getRanking(donnees[k], [k1, b])
                # avgPrec is a model method elsewhere in this file; the
                # original called it as a free function.
                s += m.avgPrec(predictionModele, labels[k])
            mapKB.append(s / len(donnees))
            param.append([k1, b])
    # "the best k1/b in the world": the pair with the highest MAP
    leMeilleurKBDuMonde = param[np.argmax(mapKB)]
    return leMeilleurKBDuMonde
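# Hedged reference for the metric optimized above: a minimal standalone average
# precision, mirroring how m.avgPrec(prediction, labels) is used. `ranking` is
# assumed to be a list of (doc_id, score) pairs sorted by decreasing score and
# `relevant_ids` the ids of the relevant documents; both names are hypothetical.
def average_precision_sketch(ranking, relevant_ids):
    relevant_ids = set(relevant_ids)
    if not relevant_ids:
        return 0.0
    hits, precision_sum = 0, 0.0
    for rank, (doc_id, _score) in enumerate(ranking, start=1):
        if doc_id in relevant_ids:
            hits += 1
            precision_sum += hits / rank  # precision at this cut-off
    return precision_sum / len(relevant_ids)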
def test_ranking_long():
    parsed = None
    file = "data/cacm/cacm.txt"
    for path in ["./", "../"]:
        try:
            parsed = myParser.buildDocCollectionSimple(path + file, ".T")
            break
        except FileNotFoundError:
            pass
    assert parsed
    indexer = indexerSimple.IndexerSimple(parsed.docs)
    requete = "home computer microphotographi"
    weighters = [weighter.c1, weighter.c2, weighter.c3, weighter.c4, weighter.c5]
    weighters = [clas(indexer) for clas in weighters]
    models = [vectoriel.Vectoriel(indexer, weight, False) for weight in weighters]
    models.append(jelinekMercer.JelinekMercer(indexer, .2))
    models.append(okapiBM25.OkapiBM25(indexer, 1.2, .75))
    rankings = [m.getRanking(requete) for m in models]
    # model 0: the document at rank 0 has a score of 2
    assert rankings[0][0][1] == 2
    # model 0: the document at rank 9 has a score of 1
    assert rankings[0][9][1] == 1
    # model 1
    assert rankings[1][0][1] == 2
    assert rankings[1][9][1] == 1
    # best document of each model
    assert rankings[0][0][0] == "80"
    assert rankings[1][0][0] == "80"
    assert rankings[2][0][0] == "3646"
    assert rankings[3][0][0] == "3646"
    assert rankings[4][0][0] == "80"
    assert rankings[5][0][0] == "866"
    assert rankings[6][0][0] == "3156"
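# Hedged reference: Jelinek-Mercer smoothing as jelinekMercer.JelinekMercer(
# indexer, lambda_) is assumed to apply it, interpolating the document language
# model with the collection model. Conventions differ on which component lambda
# weights; here it weights the document model, which is consistent with
# lambda_=1 still discriminating between documents in test_ranking_short above.
# All names are hypothetical.
def jelinek_mercer_term_prob_sketch(tf_doc, doc_len, tf_coll, coll_len, lambda_):
    p_doc = tf_doc / doc_len if doc_len else 0.0  # maximum-likelihood P(t | d)
    p_coll = tf_coll / coll_len                   # collection model P(t | C)
    return lambda_ * p_doc + (1 - lambda_) * p_coll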
def testLong():
    parsedQuery = None
    parsedText = None
    file = "data/cisi/cisi"
    for path in ["./", "../"]:
        try:
            parsedQuery = queryParser.parse(path + file)
            parsedText = myParser.buildDocCollectionSimple(
                path + file + ".txt", ".W")
            break
        except FileNotFoundError:
            pass
    assert parsedQuery
    assert parsedText
    assert len(parsedQuery.queries) == 112
    assert len(parsedQuery.queries["1"].pertient_list_id) == 46
    # number of queries with at least one relevant document
    assert sum(len(q.pertient_list_id) > 0
               for q in parsedQuery.queries.values()) == 76
    print("building the index")
    indexer = indexerSimple.IndexerSimple(parsedText.docs)
    weighters = [weighter.c1, weighter.c2, weighter.c3, weighter.c4, weighter.c5]
    weighters = [clas(indexer) for clas in weighters]
    models = [vectoriel.Vectoriel(indexer, weight, False) for weight in weighters]
    jelinek = jelinekMercer.JelinekMercer(indexer)
    models.append(jelinek)
    okapi = okapiBM25.OkapiBM25(indexer)
    models.append(okapi)
    data_fit = [q.T for q in parsedQuery.queries.values()]
    labels = [q.pertient_list_id for q in parsedQuery.queries.values()]
    print("fit")
    # jelinek.fit(np.linspace(0, 2, 2), data_fit, labels)
    # okapi.fit((np.linspace(0, 2, 2), np.linspace(0, 2, 2)), data_fit, labels)
    # train/test split: fit the hyperparameters on the first n queries,
    # evaluate on the rest
    print(len(data_fit))
    n = 100
    jelinek.fit(np.linspace(.2, .7, 3), data_fit[:n], labels[:n])
    okapi.fit((np.linspace(0, 2, 2), np.linspace(0, 2, 2)),
              data_fit[:n], labels[:n])
    # wrap every base model in a pagerank reranker (len(models) is evaluated
    # once, so the loop only covers the original models)
    for i in range(len(models)):
        models.append(pagerank.PagerankMarcheAlea(indexer, models[i]))
    # models[-2] is the pagerank wrapper around jelinek
    models[-2].fit(np.linspace(.2, .7, 3), data_fit[:n], labels[:n])
    print("precision")
    for m in models:
        pred = [m.getRanking(d) for d in data_fit[n:]]
        avgPrec = 0
        for p, l in zip(pred, labels[n:]):
            avgPrec += m.avgPrec(p, l)
        print(m, avgPrec / len(pred))
def eval():
    """Evaluate every model on every metric and plot the score grid."""
    parsedQuery = None
    parsedText = None
    file = "data/cisi/cisi"
    for path in ["./", "../"]:
        try:
            parsedQuery = queryParser.parse(path + file)
            parsedText = myParser.buildDocCollectionSimple(
                path + file + ".txt", ".W")
            break
        except FileNotFoundError:
            pass
    assert parsedQuery and parsedText
    indexer = indexerSimple.IndexerSimple(parsedText.docs)
    weighters = [weighter.c1, weighter.c2, weighter.c3, weighter.c4, weighter.c5]
    weighters = [clas(indexer) for clas in weighters]
    models = [vectoriel.Vectoriel(indexer, weight, False) for weight in weighters]
    jelinek = jelinekMercer.JelinekMercer(indexer)
    models.append(jelinek)
    okapi = okapiBM25.OkapiBM25(indexer)
    models.append(okapi)
    data_fit = [q.T for q in parsedQuery.queries.values()]
    labels = [q.pertient_list_id for q in parsedQuery.queries.values()]
    print("fit")
    # jelinek.fit(np.linspace(0, 2, 2), data_fit, labels)
    # okapi.fit((np.linspace(0, 2, 2), np.linspace(0, 2, 2)), data_fit, labels)
    # wrap every base model in a pagerank reranker
    for i in range(len(models)):
        models.append(pagerank.PagerankMarcheAlea(indexer, models[i]))
    k = 9
    metrics = [averagePrecision.AveragePrecision(),
               precisionAtK.PrecisionAtK(k),
               fMesureK.FMesureK(1, k),
               rappelAtK.RappelAtK(k),
               NDCG.NDCG(k),
               reciprocalRank.ReciprocalRank()]
    perf = []
    print(models)
    print(metrics)
    for i, model in enumerate(models):
        print(i, "/", len(models))
        perf.append([])
        pred = [model.getRanking(q) for q in data_fit]
        for metric in metrics:
            score, std = metric.eval_list_query(pred, labels)
            perf[-1].append(score)
        print([round(x, 4) for x in perf[-1]])
    # imported here so the tests above can run without matplotlib installed
    import matplotlib.pyplot as plt
    plt.imshow(perf)
    plt.colorbar()
    plt.xlabel("Metric")
    plt.ylabel("Model")
    plt.figtext(0, 0,
                "Metrics: 0: averagePrecision, 1: precisionAtK, 2: fMesureK, "
                "3: rappelAtK, 4: NDCG, 5: reciprocalRank; "
                "Models: 0-4: Vectoriel, 5: jelinekMercer, "
                "6: okapiBM25, 7-12: with pagerank")
    plt.show()
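# Hypothetical entry point (not in the original file): run the two ranking
# tests, then the long train/test run and the full evaluation, when this
# script is executed directly rather than through pytest.
if __name__ == "__main__":
    test_ranking_short()
    test_ranking_long()
    testLong()
    eval()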