def test_ranking_short():
    """Check the document order every model returns on the short CACM sample.

    The collection ``data/cacmShort.txt`` is searched for in the current
    directory first and in the parent directory second.
    """
    relative_path = "data/cacmShort.txt"
    parsed = None
    for prefix in ("./", "../"):
        try:
            parsed = myParser.buildDocCollectionSimple(prefix + relative_path, ".T")
        except FileNotFoundError:
            continue
        break
    assert parsed

    indexer = indexerSimple.IndexerSimple(parsed.docs)
    query = "home computer microphotographi"

    # Five vector-space models (one per weighting scheme), then the
    # Jelinek-Mercer language model, then Okapi BM25 in last position.
    weighter_classes = (weighter.c1, weighter.c2, weighter.c3, weighter.c4,
                        weighter.c5)
    weighters = [weighter_class(indexer) for weighter_class in weighter_classes]
    models = [vectoriel.Vectoriel(indexer, w, False) for w in weighters]
    models += [
        jelinekMercer.JelinekMercer(indexer, 1),
        okapiBM25.OkapiBM25(indexer, 1.2, .75),
    ]

    rankings = [model.getRanking(query) for model in models]

    # Whatever the model, exactly 5 documents get a non-zero score.
    for ranking in rankings:
        assert len(ranking) == 5

    # Order of the results: every model but the last one (Okapi BM25) agrees.
    *other_rankings, okapi_ranking = rankings
    for ranking in other_rankings:
        assert [entry[0] for entry in ranking] == ["7", "6", "4", "2", "10"]
    assert [entry[0] for entry in okapi_ranking] == ["6", "7", "4", "10", "2"]
def testVeryShort():
    """Index a four-sentence toy corpus and print every table of the index."""
    sentences = [
        "the new home has been saled on top forecasts",
        "the home sales rise in july",
        "there is an increase in home sales in july",
        "july encounter a new home sales rise",
    ]
    collection = myParser.loadCollection(sentences)
    indexer = indexerSimple.IndexerSimple(collection.docs)

    tables = (indexer.ind, indexer.inv, indexer.ind_n, indexer.inv_n)
    for table in tables:
        for entry in table.items():
            print(entry)
        # Separator printed after each table.
        print("\n")
def testShort():
    """Cross-check both parsers on the short CACM sample, then check the index.

    The collection ``data/cacmShort.txt`` is searched for in the current
    directory first and in the parent directory second.
    """
    relative_path = "data/cacmShort.txt"
    parsed = None
    parsed2 = None
    for prefix in ("./", "../"):
        try:
            parsed = myParser.buildDocCollectionSimple(prefix + relative_path)
            parsed2 = myParser.buildDocumentCollectionRegex(prefix + relative_path)
        except FileNotFoundError:
            continue
        break
    assert parsed and parsed2

    def doc_id(doc):
        return doc.I

    # Both parsing methods must be equivalent: same identifiers, same titles.
    simple_docs = sorted(parsed.docs.values(), key=doc_id)
    regex_docs = sorted(parsed2.docs.values(), key=doc_id)
    for simple_doc, regex_doc in zip(simple_docs, regex_docs):
        assert simple_doc.I == regex_doc.I
        assert simple_doc.T == regex_doc.T

    indexer = indexerSimple.IndexerSimple(parsed2.docs)

    # Raw and normalised forward index.
    assert "algebra" in indexer.ind["1"]
    assert len(indexer.ind["2"]) == 6
    assert sum(indexer.ind["11"].values()) == 8
    assert "algebra" in indexer.ind_n["1"]
    assert abs(sum(indexer.ind_n["2"].values()) - 1) < 1e-4

    # Raw and normalised inverted index.
    assert indexer.inv["matrix"] == {"3": 1}
    assert len(indexer.inv["comput"]) == 5
    assert indexer.inv_n["matrix"] == {"3": .2}
    assert len(indexer.inv_n["comput"]) == 5

    tf_idf = indexer.create_tf_idf()
    # tf_idf has the same structure as ind.
    assert tf_idf.keys() == indexer.ind.keys()
    for doc_key, weights in tf_idf.items():
        assert weights.keys() == indexer.ind[doc_key].keys()
    # Content of tf_idf.
    assert abs(tf_idf["4"]["programm"] - 0.875) < 1e-3
def testLong():
    """Index the CISI collection and check the index and TF-IDF table sizes.

    The collection ``data/cisi/cisi.txt`` is searched for in the current
    directory first and in the parent directory second.
    """
    print("test long")
    print("lecture")
    relative_path = "data/cisi/cisi.txt"
    parsed = None
    for prefix in ("./", "../"):
        try:
            parsed = myParser.buildDocCollectionSimple(prefix + relative_path, '.W')
        except FileNotFoundError:
            continue
        break
    assert parsed

    expected_size = 2459

    print("création index")
    indexer = indexerSimple.IndexerSimple(parsed.docs)
    assert len(indexer.ind) == expected_size

    print("création tfidf")
    tf_idf = indexer.create_tf_idf()
    assert len(tf_idf) == expected_size
def main():
    """Run the three ranking tests, then the two PageRank tests on CACM.

    The collection ``data/cacm/cacm.txt`` is searched for in the current
    directory first and in the parent directory second.
    """
    test_ranking_veryshort()
    test_ranking_short()
    test_ranking_long()

    # Run a simple model: build one index over CACM and share it between
    # the PageRank tests.
    relative_path = "data/cacm/cacm.txt"
    parsed = None
    for prefix in ("./", "../"):
        try:
            parsed = myParser.buildDocCollectionSimple(
                prefix + relative_path, ".T", balise2=".X")
        except FileNotFoundError:
            continue
        break
    assert parsed

    indexer = indexerSimple.IndexerSimple(parsed.docs)
    test_full_pagerank(indexer)
    test_alea_pagerank(indexer)
def test_ranking_veryshort():
    """Check that all five weighting schemes order a toy corpus identically.

    The query is ``"home sales top"``; documents are compared by ascending
    score under each weighter.
    """
    sentences = [
        "the new home has home been saled on top forecasts",
        "the home sales rise in july",
        "there is an increase in sales in july",
        "july encounter a new home sales rise",
    ]
    collection = myParser.loadCollection(sentences)
    indexer = indexerSimple.IndexerSimple(collection.docs)

    def ids_by_score(score_map):
        return sorted(score_map.keys(), key=lambda doc_id: score_map[doc_id])

    # weights_doc is never read afterwards; it only keeps the results of the
    # getWeightsForDoc calls.
    weights_doc = []
    scores = []
    weighter_classes = (weighter.c1, weighter.c2, weighter.c3, weighter.c4,
                        weighter.c5)
    for weighter_class in weighter_classes:
        current_weighter = weighter_class(indexer)
        weights_doc.append(current_weighter.getWeightsForDoc(0))
        model = vectoriel.Vectoriel(indexer, current_weighter, False)
        scores.append(model.getScores("home sales top"))

    # Documents come out in the same order whatever the weighter.
    reference_scores, *other_scores = scores
    reference_order = ids_by_score(reference_scores)
    for score_map in other_scores:
        assert reference_order == ids_by_score(score_map)
def test_ranking_long():
    """Check top scores and best documents of every model on the CACM corpus.

    The collection ``data/cacm/cacm.txt`` is searched for in the current
    directory first and in the parent directory second.
    """
    relative_path = "data/cacm/cacm.txt"
    parsed = None
    for prefix in ("./", "../"):
        try:
            parsed = myParser.buildDocCollectionSimple(prefix + relative_path, ".T")
        except FileNotFoundError:
            continue
        break
    assert parsed

    indexer = indexerSimple.IndexerSimple(parsed.docs)
    query = "home computer microphotographi"

    # Models 0-4: vector-space model with each weighting scheme;
    # model 5: Jelinek-Mercer; model 6: Okapi BM25.
    weighter_classes = (weighter.c1, weighter.c2, weighter.c3, weighter.c4,
                        weighter.c5)
    weighters = [weighter_class(indexer) for weighter_class in weighter_classes]
    models = [vectoriel.Vectoriel(indexer, w, False) for w in weighters]
    models += [
        jelinekMercer.JelinekMercer(indexer, .2),
        okapiBM25.OkapiBM25(indexer, 1.2, .75),
    ]

    rankings = [model.getRanking(query) for model in models]

    # Models 0 and 1: the rank-0 entry scores 2 and the rank-9 entry scores 1.
    for ranking in rankings[:2]:
        assert ranking[0][1] == 2
        assert ranking[9][1] == 1

    # Best document of each model, in model order.
    expected_best = ["80", "80", "3646", "3646", "80", "866", "3156"]
    for ranking, best_doc in zip(rankings, expected_best):
        assert ranking[0][0] == best_doc
def testLong():
    """Fit the models on CISI queries and print their mean average precision.

    The CISI files (base name ``data/cisi/cisi``) are searched for in the
    current directory first and in the parent directory second. The first
    ``n`` queries are used for fitting and the remaining ones for scoring.
    """
    base_name = "data/cisi/cisi"
    parsedQuery = None
    parsedText = None
    for prefix in ("./", "../"):
        try:
            parsedQuery = queryParser.parse(prefix + base_name)
            parsedText = myParser.buildDocCollectionSimple(
                prefix + base_name + ".txt", ".W")
        except FileNotFoundError:
            continue
        break
    assert parsedQuery
    assert parsedText

    assert len(parsedQuery.queries) == 112
    assert len(parsedQuery.queries["1"].pertient_list_id) == 46
    # Number of queries that have at least one relevant document.
    with_relevant = sum(
        len(query.pertient_list_id) > 0
        for query in parsedQuery.queries.values())
    assert with_relevant == 76

    print("calcul indexer")
    indexer = indexerSimple.IndexerSimple(parsedText.docs)

    weighter_classes = (weighter.c1, weighter.c2, weighter.c3, weighter.c4,
                        weighter.c5)
    weighters = [weighter_class(indexer) for weighter_class in weighter_classes]
    models = [vectoriel.Vectoriel(indexer, w, False) for w in weighters]
    jelinek = jelinekMercer.JelinekMercer(indexer)
    models.append(jelinek)
    okapi = okapiBM25.OkapiBM25(indexer)
    models.append(okapi)

    data_fit = [query.T for query in parsedQuery.queries.values()]
    labels = [query.pertient_list_id for query in parsedQuery.queries.values()]

    print("fit")
    print(len(data_fit))
    # Train / test split: the first n queries are used for fitting.
    n = 100
    train_queries, test_queries = data_fit[:n], data_fit[n:]
    train_labels, test_labels = labels[:n], labels[n:]

    jelinek.fit(np.linspace(.2, .7, 3), train_queries, train_labels)
    okapi.fit((np.linspace(0, 2, 2), np.linspace(0, 2, 2)),
              train_queries, train_labels)

    # Wrap every base model in a random-walk PageRank model.
    models += [pagerank.PagerankMarcheAlea(indexer, base) for base in models]
    # NOTE(review): the original layout is ambiguous; this fit is assumed to
    # run once, after all the PageRank wrappers are appended - confirm.
    models[-2].fit(np.linspace(.2, .7, 3), train_queries, train_labels)

    print("précisions")
    for model in models:
        predictions = [model.getRanking(query) for query in test_queries]
        total_precision = sum(
            model.avgPrec(ranking, relevant)
            for ranking, relevant in zip(predictions, test_labels))
        print(model, total_precision / len(predictions))
def eval():
    """Score every retrieval model on CISI with six metrics and plot a heat map.

    The CISI files (base name ``data/cisi/cisi``) are searched for in the
    current directory first and in the parent directory second. Seven base
    models are built (five vector-space models, Jelinek-Mercer, Okapi BM25)
    and each one is additionally wrapped in a random-walk PageRank model,
    giving fourteen rows in the plotted matrix.

    NOTE: the function name shadows the built-in ``eval`` inside this module;
    it is left unchanged so existing callers keep working.

    Raises:
        AssertionError: if the CISI files are found in neither directory.
    """
    base_name = "data/cisi/cisi"
    parsedQuery = None
    parsedText = None
    for prefix in ("./", "../"):
        try:
            parsedQuery = queryParser.parse(prefix + base_name)
            parsedText = myParser.buildDocCollectionSimple(
                prefix + base_name + ".txt", ".W")
        except FileNotFoundError:
            continue
        break
    # Fail with a clear message instead of an AttributeError on None below.
    assert parsedQuery, "CISI queries not found in ./ or ../"
    assert parsedText, "CISI documents not found in ./ or ../"

    indexer = indexerSimple.IndexerSimple(parsedText.docs)

    weighter_classes = (weighter.c1, weighter.c2, weighter.c3, weighter.c4,
                        weighter.c5)
    weighters = [weighter_class(indexer) for weighter_class in weighter_classes]
    models = [vectoriel.Vectoriel(indexer, w, False) for w in weighters]
    jelinek = jelinekMercer.JelinekMercer(indexer)
    models.append(jelinek)
    okapi = okapiBM25.OkapiBM25(indexer)
    models.append(okapi)

    data_fit = [query.T for query in parsedQuery.queries.values()]
    labels = [query.pertient_list_id for query in parsedQuery.queries.values()]

    print("fit")
    # Wrap every base model in a random-walk PageRank model (rows 7-13).
    models += [pagerank.PagerankMarcheAlea(indexer, base) for base in models]

    # Cut-off rank shared by all the "at k" metrics.
    k = 9
    metrics = [
        averagePrecision.AveragePrecision(),
        precisionAtK.PrecisionAtK(k),
        fMesureK.FMesureK(1, k),
        rappelAtK.RappelAtK(k),
        NDCG.NDCG(k),
        reciprocalRank.ReciprocalRank(),
    ]

    print(models)
    print(metrics)

    # perf[i][j] is the score of model i under metric j.
    perf = []
    for i, model in enumerate(models):
        print(i, "/", len(models))
        pred = [model.getRanking(query) for query in data_fit]
        model_scores = []
        for metric in metrics:
            # eval_list_query returns a pair; only the first element is plotted.
            score, _std = metric.eval_list_query(pred, labels)
            model_scores.append(score)
        perf.append(model_scores)
        print([round(x, 4) for x in model_scores])

    # Imported here, right before plotting, as in the original code.
    import matplotlib.pyplot as plt
    plt.imshow(perf)
    plt.colorbar()
    plt.xlabel("Metrique")
    plt.ylabel("Modèle")
    # Caption fixed: seven PageRank-wrapped models occupy rows 7-13, not 7-12.
    plt.figtext(
        0, 0,
        "Metriques : 0 : averagePrecision,1 : precisionAtK,2 : fMesureK,"
        "3 : rappelAtK,4 : NDCG,5 : reciprocalRank;"
        "Modèles : 0-4: Vectoriel, 5 : jelinekMercer,6 : okapiBM25, "
        "7-13 : avec pagerank")
    plt.show()