Example #1
File: test.py  Project: keyber/RITAL
def test_ranking_short():
    parsed = None
    file = "data/cacmShort.txt"
    for path in ["./", "../"]:
        try:
            parsed = myParser.buildDocCollectionSimple(path + file, ".T")
            break
        except FileNotFoundError:
            pass
    assert parsed

    indexer = indexerSimple.IndexerSimple(parsed.docs)
    requete = "home computer microphotographi"

    models = [weighter.c1, weighter.c2, weighter.c3, weighter.c4, weighter.c5]
    models = [clas(indexer) for clas in models]
    models = [vectoriel.Vectoriel(indexer, weight, False) for weight in models]
    models.append(jelinekMercer.JelinekMercer(indexer, 1))
    models.append(okapiBM25.OkapiBM25(indexer, 1.2, .75))

    rankings = [m.getRanking(requete) for m in models]

    # 5 docs have a non-zero score regardless of the model
    for ranking in rankings:
        assert len(ranking) == 5

    # order of the results
    for ranking in rankings[:-1]:
        assert [x[0] for x in ranking] == ["7", "6", "4", "2", "10"]
    assert [x[0] for x in rankings[-1]] == ["6", "7", "4", "10", "2"]
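
Note: the constants passed to okapiBM25.OkapiBM25(indexer, 1.2, .75) are the usual BM25 parameters k1 and b. For reference, here is a minimal sketch of the standard BM25 score for one query-document pair, written with plain dicts instead of this project's IndexerSimple API (the helper below is illustrative and not part of the project):

import math

def bm25_score(query_terms, doc_tf, doc_len, avg_doc_len, df, n_docs,
               k1=1.2, b=0.75):
    # doc_tf: term -> frequency in this document; df: term -> document frequency.
    score = 0.0
    for term in query_terms:
        tf = doc_tf.get(term, 0)
        if tf == 0 or term not in df:
            continue
        idf = math.log((n_docs - df[term] + 0.5) / (df[term] + 0.5) + 1)
        score += idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * doc_len / avg_doc_len))
    return score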
Example #2
def optimisationOkapi(debut1, fin1, nbPoint1, debut2, fin2, nbPoint2, donnees,
                      labels):
    # Grid search over the two BM25 parameters:
    # nbPoint1 values of k1 in [debut1, fin1] and nbPoint2 values of b in [debut2, fin2].
    rangeK = np.linspace(debut1, fin1, nbPoint1)
    rangeB = np.linspace(debut2, fin2, nbPoint2)
    m = okapiBM25.OkapiBM25()
    mapKB = []
    param = []
    for k1 in rangeK:
        for b in rangeB:
            # Mean average precision of the model with parameters (k1, b)
            # over all queries in donnees.
            s = 0
            for k in range(len(donnees)):
                predictionModele = m.getRanking(donnees[k], [k1, b])
                s += avgPrec(predictionModele, labels[k])
            mapKB.append(s / len(donnees))
            param.append([k1, b])
    # Return the (k1, b) pair with the highest mean average precision.
    leMeilleurKBDuMonde = param[np.argmax(mapKB)]
    return leMeilleurKBDuMonde
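
A hypothetical call, assuming queries and relevant_ids hold the parsed query texts and their relevant-document id lists (both names are placeholders chosen here for illustration):

# Search 5 values of k1 in [1.0, 2.0] and 5 values of b in [0.5, 1.0];
# queries / relevant_ids are assumed to come from queryParser as in the other examples.
best_k1_b = optimisationOkapi(1.0, 2.0, 5, 0.5, 1.0, 5, queries, relevant_ids)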
Example #3
File: test.py  Project: keyber/RITAL
def test_ranking_long():
    parsed = None
    file = "data/cacm/cacm.txt"
    for path in ["./", "../"]:
        try:
            parsed = myParser.buildDocCollectionSimple(path + file, ".T")
            break
        except FileNotFoundError:
            pass
    assert parsed

    indexer = indexerSimple.IndexerSimple(parsed.docs)
    requete = "home computer microphotographi"

    models = [weighter.c1, weighter.c2, weighter.c3, weighter.c4, weighter.c5]
    models = [clas(indexer) for clas in models]
    models = [vectoriel.Vectoriel(indexer, weight, False) for weight in models]
    models.append(jelinekMercer.JelinekMercer(indexer, .2))
    models.append(okapiBM25.OkapiBM25(indexer, 1.2, .75))

    rankings = [m.getRanking(requete) for m in models]

    # model 0, rank 0 has a score of 2
    assert rankings[0][0][1] == 2
    # model 0, rank 9 has a score of 1
    assert rankings[0][9][1] == 1
    # model 1
    assert rankings[1][0][1] == 2
    assert rankings[1][9][1] == 1

    # best document for each model
    assert rankings[0][0][0] == "80"
    assert rankings[1][0][0] == "80"
    assert rankings[2][0][0] == "3646"
    assert rankings[3][0][0] == "3646"
    assert rankings[4][0][0] == "80"
    assert rankings[5][0][0] == "866"
    assert rankings[6][0][0] == "3156"
Example #4
File: test.py  Project: keyber/RITAL
def testLong():
    parsedQuery = None
    parsedText = None
    file = "data/cisi/cisi"
    for path in ["./", "../"]:
        try:
            parsedQuery = queryParser.parse(path + file)
            parsedText = myParser.buildDocCollectionSimple(
                path + file + ".txt", ".W")
            break
        except FileNotFoundError:
            pass
    assert parsedQuery
    assert parsedText

    assert len(parsedQuery.queries) == 112
    assert len(parsedQuery.queries["1"].pertient_list_id) == 46

    # number of queries that have at least one relevant document
    assert sum(
        len(q.pertient_list_id) > 0
        for q in parsedQuery.queries.values()) == 76

    print("calcul indexer")
    indexer = indexerSimple.IndexerSimple(parsedText.docs)

    models = [weighter.c1, weighter.c2, weighter.c3, weighter.c4, weighter.c5]
    models = [clas(indexer) for clas in models]
    models = [vectoriel.Vectoriel(indexer, weight, False) for weight in models]
    jelinek = jelinekMercer.JelinekMercer(indexer)
    models.append(jelinek)

    okapi = okapiBM25.OkapiBM25(indexer)
    models.append(okapi)

    data_fit = [q.T for q in parsedQuery.queries.values()]
    labels = [q.pertient_list_id for q in parsedQuery.queries.values()]

    print("fit")
    # jelinek.fit(np.linspace(0, 2, 2), data_fit, labels)
    # okapi.fit((np.linspace(0, 2, 2), np.linspace(0, 2, 2)), data_fit, labels)

    # train/test split
    print(len(data_fit))
    n = 100
    jelinek.fit(np.linspace(.2, .7, 3), data_fit[:n], labels[:n])
    okapi.fit((np.linspace(0, 2, 2), np.linspace(0, 2, 2)), data_fit[:n],
              labels[:n])

    for i in range(len(models)):
        models.append(pagerank.PagerankMarcheAlea(indexer, models[i]))

    models[-2].fit(np.linspace(.2, .7, 3), data_fit[:n], labels[:n])

    print("précisions")
    for m in models:
        pred = [m.getRanking(d) for d in data_fit[n:]]
        avgPrec = 0
        for p, l in zip(pred, labels[n:]):
            avgPrec += m.avgPrec(p, l)
        print(m, avgPrec / len(pred))
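
For reference, a minimal sketch of the average-precision metric that avgPrec / m.avgPrec appear to compute here, assuming a ranking is a list of (doc_id, score) pairs and the labels are lists of relevant doc ids, as in the asserts above (this helper is illustrative, not the project's implementation):

def average_precision(ranking, relevant_ids):
    # Mean of the precision values at each rank where a relevant document appears,
    # normalised by the total number of relevant documents.
    relevant = set(relevant_ids)
    if not relevant:
        return 0.0
    hits = 0
    precision_sum = 0.0
    for rank, (doc_id, _score) in enumerate(ranking, start=1):
        if doc_id in relevant:
            hits += 1
            precision_sum += hits / rank
    return precision_sum / len(relevant)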
Example #5
def eval():
    parsedQuery = None
    parsedText = None
    file = "data/cisi/cisi"
    for path in ["./", "../"]:
        try:
            parsedQuery = queryParser.parse(path + file)
            parsedText = myParser.buildDocCollectionSimple(path + file + ".txt", ".W")
            break
        except FileNotFoundError:
            pass

    indexer = indexerSimple.IndexerSimple(parsedText.docs)

    models = [weighter.c1, weighter.c2, weighter.c3, weighter.c4, weighter.c5]
    models = [clas(indexer) for clas in models]
    models = [vectoriel.Vectoriel(indexer, weight, False) for weight in models]
    jelinek = jelinekMercer.JelinekMercer(indexer)
    models.append(jelinek)

    okapi = okapiBM25.OkapiBM25(indexer)
    models.append(okapi)

    data_fit = [q.T for q in parsedQuery.queries.values()]
    labels = [q.pertient_list_id for q in parsedQuery.queries.values()]

    print("fit")
    # jelinek.fit(np.linspace(0, 2, 2), data_fit, labels)
    # okapi.fit((np.linspace(0, 2, 2), np.linspace(0, 2, 2)), data_fit, labels)

    for i in range(len(models)):
        models.append(pagerank.PagerankMarcheAlea(indexer, models[i]))

    k = 9
    metrics = [
        averagePrecision.AveragePrecision(),
        precisionAtK.PrecisionAtK(k),
        fMesureK.FMesureK(1, k),
        rappelAtK.RappelAtK(k),
        NDCG.NDCG(k),
        reciprocalRank.ReciprocalRank()]

    perf = []
    print(models)
    print(metrics)
    for i, model in enumerate(models):
        print(i,"/", len(models))
        perf.append([])
        pred = [model.getRanking(data_fit[k]) for k in range(len(data_fit))]

        for metric in metrics:
            score, std = metric.eval_list_query(pred, labels)
            perf[-1].append(score)
        print([round(x, 4) for x in perf[-1]])

    import matplotlib.pyplot as plt
    plt.imshow(perf)
    plt.colorbar()
    plt.xlabel("Metrique")
    plt.ylabel("Modèle")
    plt.figtext(0, 0,
                "Metriques : 0 : averagePrecision,1 : precisionAtK,2 : fMesureK,"
                "3 : rappelAtK,4 : NDCG,5 : reciprocalRank;"
                "Modèles : 0-4: Vectoriel, 5 : jelinekMercer,"
                "6 : okapiBM25, 7-12 : avec pagerank")
    plt.show()