Пример #1
0
def test_ranking_short():
    """Check ranking size and document order on the short CACM sample.

    Builds five vector-space models (one per weighter ``c1``..``c5``), a
    Jelinek-Mercer model and an Okapi BM25 model over the same index, ranks
    one query with each of them and asserts that:

    * every model returns exactly five documents;
    * every model except the last one (Okapi BM25) returns the ids in the
      order 7, 6, 4, 2, 10;
    * the last model returns them in the order 6, 7, 4, 10, 2.
    """
    corpus_rel = "data/cacmShort.txt"
    parsed = None
    # The data directory is searched in the current directory, then its parent.
    for prefix in ("./", "../"):
        try:
            parsed = myParser.buildDocCollectionSimple(prefix + corpus_rel, ".T")
        except FileNotFoundError:
            continue
        else:
            break
    assert parsed

    indexer = indexerSimple.IndexerSimple(parsed.docs)
    query = "home computer microphotographi"

    # All weighters are instantiated first, then wrapped in vector models.
    weighter_classes = [
        weighter.c1, weighter.c2, weighter.c3, weighter.c4, weighter.c5
    ]
    weighters = [weighter_cls(indexer) for weighter_cls in weighter_classes]
    models = [vectoriel.Vectoriel(indexer, w, False) for w in weighters]
    models.append(jelinekMercer.JelinekMercer(indexer, 1))
    models.append(okapiBM25.OkapiBM25(indexer, 1.2, .75))

    rankings = [model.getRanking(query) for model in models]

    # Five documents have a non-zero score whatever the model.
    for ranking in rankings:
        assert len(ranking) == 5

    # Order of the results: shared by all models but the last one.
    shared_order = ["7", "6", "4", "2", "10"]
    for ranking in rankings[:-1]:
        assert [entry[0] for entry in ranking] == shared_order

    last_ranking = rankings[-1]
    assert [entry[0] for entry in last_ranking] == ["6", "7", "4", "10", "2"]
Пример #2
0
def testVeryShort():
    """Index a four-sentence toy corpus and print its four index tables.

    The tables ``ind``, ``inv``, ``ind_n`` and ``inv_n`` of the indexer are
    printed one ``(key, value)`` pair per line, each table followed by a
    ``"\\n"`` print. Nothing is asserted; this is a visual smoke test.
    """
    corpus = [
        "the new home has been saled on top forecasts",
        "the home sales rise in july",
        "there is an increase in home sales in july",
        "july encounter a new home sales rise"
    ]
    collection = myParser.loadCollection(corpus)
    indexer = indexerSimple.IndexerSimple(collection.docs)

    tables = (indexer.ind, indexer.inv, indexer.ind_n, indexer.inv_n)
    for table in tables:
        for entry in table.items():
            print(entry)
        print("\n")
Пример #3
0
def testShort():
    """Check parsing, indexing and tf-idf on the short CACM sample.

    The sample file is parsed with both ``buildDocCollectionSimple`` and
    ``buildDocumentCollectionRegex``; the two results must contain the same
    number of documents with matching ``I`` and ``T`` attributes. An index is
    then built from the regex-parsed collection and spot-checked (``ind``,
    ``ind_n``, ``inv``, ``inv_n``), followed by the structure and one value of
    the table returned by ``create_tf_idf``.
    """
    parsed = None
    parsed2 = None
    file = "data/cacmShort.txt"
    # The data directory is searched in the current directory, then its parent.
    for path in ["./", "../"]:
        try:
            parsed = myParser.buildDocCollectionSimple(path + file)
            parsed2 = myParser.buildDocumentCollectionRegex(path + file)
            break
        except FileNotFoundError:
            pass
    assert parsed and parsed2

    # Equivalence of the two parsing methods.
    # zip() stops at the shorter input, so without this size check a parser
    # that silently drops documents would still pass the pairwise comparison.
    assert len(parsed.docs) == len(parsed2.docs)
    for d1, d2 in zip(sorted(parsed.docs.values(), key=lambda x: x.I),
                      sorted(parsed2.docs.values(), key=lambda x: x.I)):
        assert d1.I == d2.I
        assert d1.T == d2.T

    indexer = indexerSimple.IndexerSimple(parsed2.docs)

    # Forward index: spot checks on documents '1', '2' and '11'.
    assert 'algebra' in indexer.ind['1']
    assert len(indexer.ind['2']) == 6
    assert sum(indexer.ind['11'].values()) == 8

    # ind_n: the values of document '2' must sum to 1.
    assert 'algebra' in indexer.ind_n['1']
    assert abs(sum(indexer.ind_n['2'].values()) - 1) < 1e-4

    # Inverted index.
    assert indexer.inv['matrix'] == {'3': 1}
    assert len(indexer.inv['comput']) == 5

    assert indexer.inv_n['matrix'] == {'3': .2}
    assert len(indexer.inv_n['comput']) == 5

    tf_idf = indexer.create_tf_idf()

    # tf-idf has the same structure as ind (same documents, same terms).
    assert tf_idf.keys() == indexer.ind.keys()
    for i_doc in tf_idf.keys():
        assert tf_idf[i_doc].keys() == indexer.ind[i_doc].keys()

    # Content of tf-idf.
    assert abs(tf_idf['4']['programm'] - 0.875) < 1e-3
Пример #4
0
def testLong():
    """Index the full CISI collection and check the document counts.

    Both the index (``indexer.ind``) and the table returned by
    ``create_tf_idf`` must contain 2459 entries. Progress messages are
    printed before each stage.
    """
    print("test long")
    print("lecture")

    corpus_rel = "data/cisi/cisi.txt"
    expected_docs = 2459

    parsed = None
    # The data directory is searched in the current directory, then its parent.
    for prefix in ("./", "../"):
        try:
            parsed = myParser.buildDocCollectionSimple(prefix + corpus_rel, '.W')
        except FileNotFoundError:
            continue
        else:
            break
    assert parsed

    print("création index")
    indexer = indexerSimple.IndexerSimple(parsed.docs)
    assert len(indexer.ind) == expected_docs

    print("création tfidf")
    tf_idf = indexer.create_tf_idf()
    assert len(tf_idf) == expected_docs
Пример #5
0
def main():
    """Run the ranking tests, then the PageRank tests on the full CACM index.

    The three ranking tests run first. The full CACM file is then parsed
    with ``".T"`` and ``balise2=".X"``, indexed, and the resulting indexer is
    handed to ``test_full_pagerank`` and ``test_alea_pagerank``.
    """
    test_ranking_veryshort()
    test_ranking_short()
    test_ranking_long()

    # Build the index used by a simple model run.
    corpus_rel = "data/cacm/cacm.txt"
    parsed = None
    # The data directory is searched in the current directory, then its parent.
    for prefix in ("./", "../"):
        try:
            parsed = myParser.buildDocCollectionSimple(
                prefix + corpus_rel, ".T", balise2=".X")
        except FileNotFoundError:
            continue
        else:
            break
    assert parsed

    indexer = indexerSimple.IndexerSimple(parsed.docs)

    test_full_pagerank(indexer)
    test_alea_pagerank(indexer)
Пример #6
0
def test_ranking_veryshort():
    """Check that all five weighters order a toy corpus identically.

    For each weighter ``c1``..``c5`` a vector model scores the query
    ``"home sales top"``. The document ids, sorted by ascending score, must
    be the same list for every weighter.
    """
    corpus = [
        "the new home has home been saled on top forecasts",
        "the home sales rise in july", "there is an increase in sales in july",
        "july encounter a new home sales rise"
    ]
    collection = myParser.loadCollection(corpus)
    indexer = indexerSimple.IndexerSimple(collection.docs)

    def ids_by_score(score_map):
        # Document ids ordered by their score in ``score_map``.
        return sorted(score_map.keys(), key=lambda doc_id: score_map[doc_id])

    weighter_classes = [
        weighter.c1, weighter.c2, weighter.c3, weighter.c4, weighter.c5
    ]
    doc_weights = []
    all_scores = []
    for weighter_cls in weighter_classes:
        scheme = weighter_cls(indexer)
        # The weights are collected but never inspected; the call is kept so
        # that getWeightsForDoc is still exercised for every weighter.
        doc_weights.append(scheme.getWeightsForDoc(0))
        model = vectoriel.Vectoriel(indexer, scheme, False)
        all_scores.append(model.getScores("home sales top"))

    # Documents come out in the same order whatever the weighter.
    reference_order = ids_by_score(all_scores[0])
    for other_scores in all_scores[1:]:
        assert reference_order == ids_by_score(other_scores)
Пример #7
0
def test_ranking_long():
    """Check top scores and best documents on the full CACM collection.

    Seven models are built over the same index (five vector models, a
    Jelinek-Mercer model and an Okapi BM25 model) and rank one query. For
    the first two models the score at rank 0 must be 2 and the score at
    rank 9 must be 1; for every model the id of the best document is checked.
    """
    corpus_rel = "data/cacm/cacm.txt"
    parsed = None
    # The data directory is searched in the current directory, then its parent.
    for prefix in ("./", "../"):
        try:
            parsed = myParser.buildDocCollectionSimple(prefix + corpus_rel, ".T")
        except FileNotFoundError:
            continue
        else:
            break
    assert parsed

    indexer = indexerSimple.IndexerSimple(parsed.docs)
    query = "home computer microphotographi"

    # All weighters are instantiated first, then wrapped in vector models.
    weighter_classes = [
        weighter.c1, weighter.c2, weighter.c3, weighter.c4, weighter.c5
    ]
    weighters = [weighter_cls(indexer) for weighter_cls in weighter_classes]
    models = [vectoriel.Vectoriel(indexer, w, False) for w in weighters]
    models.append(jelinekMercer.JelinekMercer(indexer, .2))
    models.append(okapiBM25.OkapiBM25(indexer, 1.2, .75))

    rankings = [model.getRanking(query) for model in models]

    # Models 0 and 1: score 2 at rank 0 and score 1 at rank 9.
    for model_idx in (0, 1):
        assert rankings[model_idx][0][1] == 2
        assert rankings[model_idx][9][1] == 1

    # Best document of each model, in model order.
    expected_best = ["80", "80", "3646", "3646", "80", "866", "3156"]
    for ranking, best_id in zip(rankings, expected_best):
        assert ranking[0][0] == best_id
Пример #8
0
def testLong():
    """Parse CISI queries, fit models on 100 queries and print precisions.

    Steps:

    1. Parse the CISI queries and documents and check the query counts.
    2. Build five vector models, a Jelinek-Mercer model and an Okapi BM25
       model over one index.
    3. Fit the Jelinek-Mercer and Okapi models on the first 100 queries.
    4. Wrap every model in a ``PagerankMarcheAlea`` model and fit the
       second-to-last wrapper on the same 100 queries.
    5. For every model, print the mean of ``avgPrec`` over the remaining
       queries.
    """
    base = "data/cisi/cisi"
    parsedQuery = None
    parsedText = None
    # The data directory is searched in the current directory, then its parent.
    for prefix in ("./", "../"):
        try:
            parsedQuery = queryParser.parse(prefix + base)
            parsedText = myParser.buildDocCollectionSimple(
                prefix + base + ".txt", ".W")
        except FileNotFoundError:
            continue
        else:
            break
    assert parsedQuery
    assert parsedText

    assert len(parsedQuery.queries) == 112
    assert len(parsedQuery.queries["1"].pertient_list_id) == 46

    # Number of queries having at least one relevant document.
    with_relevant = sum(
        1 for q in parsedQuery.queries.values()
        if len(q.pertient_list_id) > 0)
    assert with_relevant == 76

    print("calcul indexer")
    indexer = indexerSimple.IndexerSimple(parsedText.docs)

    # All weighters are instantiated first, then wrapped in vector models.
    weighter_classes = [
        weighter.c1, weighter.c2, weighter.c3, weighter.c4, weighter.c5
    ]
    weighters = [weighter_cls(indexer) for weighter_cls in weighter_classes]
    models = [vectoriel.Vectoriel(indexer, w, False) for w in weighters]

    jelinek = jelinekMercer.JelinekMercer(indexer)
    models.append(jelinek)
    okapi = okapiBM25.OkapiBM25(indexer)
    models.append(okapi)

    data_fit = [q.T for q in parsedQuery.queries.values()]
    labels = [q.pertient_list_id for q in parsedQuery.queries.values()]

    print("fit")

    # Train/test split: the first n queries are used for fitting, the rest
    # for the precision report below.
    print(len(data_fit))
    n = 100
    train = slice(None, n)
    held_out = slice(n, None)

    jelinek.fit(np.linspace(.2, .7, 3), data_fit[train], labels[train])
    okapi.fit((np.linspace(0, 2, 2), np.linspace(0, 2, 2)), data_fit[train],
              labels[train])

    # Iterate over a snapshot so that the wrappers appended here are not
    # themselves wrapped.
    for base_model in models[:]:
        models.append(pagerank.PagerankMarcheAlea(indexer, base_model))

    models[-2].fit(np.linspace(.2, .7, 3), data_fit[train], labels[train])

    print("précisions")
    for model in models:
        predictions = [model.getRanking(query) for query in data_fit[held_out]]
        total = 0
        for ranking, relevant in zip(predictions, labels[held_out]):
            total += model.avgPrec(ranking, relevant)
        print(model, total / len(predictions))
Пример #9
0
def eval():
    """Evaluate every model on every metric over CISI and plot a heat map.

    Seven base models are built over one index (five vector models, a
    Jelinek-Mercer model and an Okapi BM25 model), then each is wrapped in a
    ``PagerankMarcheAlea`` model, giving fourteen models in total. Every
    model ranks every query; each metric's ``eval_list_query`` score is
    stored in a models-by-metrics table that is printed row by row and
    finally displayed with matplotlib.

    Raises:
        FileNotFoundError: if the CISI files are found under neither
            ``./`` nor ``../``.
    """
    parsedQuery = None
    parsedText = None
    file = "data/cisi/cisi"
    # The data directory is searched in the current directory, then its parent.
    for path in ["./", "../"]:
        try:
            parsedQuery = queryParser.parse(path + file)
            parsedText = myParser.buildDocCollectionSimple(path + file + ".txt", ".W")
            break
        except FileNotFoundError:
            pass

    # Fail with a clear message instead of an AttributeError on None below.
    if parsedQuery is None or parsedText is None:
        raise FileNotFoundError(
            "CISI data not found: looked for " + file + " under ./ and ../")

    indexer = indexerSimple.IndexerSimple(parsedText.docs)

    models = [weighter.c1, weighter.c2, weighter.c3, weighter.c4, weighter.c5]
    models = [clas(indexer) for clas in models]
    models = [vectoriel.Vectoriel(indexer, weight, False) for weight in models]
    jelinek = jelinekMercer.JelinekMercer(indexer)
    models.append(jelinek)

    okapi = okapiBM25.OkapiBM25(indexer)
    models.append(okapi)

    data_fit = [q.T for q in parsedQuery.queries.values()]
    labels = [q.pertient_list_id for q in parsedQuery.queries.values()]

    # NOTE(review): no model is fitted here; default parameters are used.
    print("fit")

    # Iterate over a snapshot so that the wrappers appended here are not
    # themselves wrapped.
    for base_model in list(models):
        models.append(pagerank.PagerankMarcheAlea(indexer, base_model))

    k = 9
    metrics = [
        averagePrecision.AveragePrecision(),
        precisionAtK.PrecisionAtK(k),
        fMesureK.FMesureK(1, k),
        rappelAtK.RappelAtK(k),
        NDCG.NDCG(k),
        reciprocalRank.ReciprocalRank()]

    perf = []
    print(models)
    print(metrics)
    for i, model in enumerate(models):
        print(i,"/", len(models))
        # Iterate the queries directly; the original index variable shadowed
        # the metric cutoff ``k``.
        pred = [model.getRanking(query) for query in data_fit]

        row = []
        for metric in metrics:
            # Only the score is kept; the second returned value is not used.
            score, _std = metric.eval_list_query(pred, labels)
            row.append(score)
        perf.append(row)
        print([round(x, 4) for x in row])

    import matplotlib.pyplot as plt
    plt.imshow(perf)
    plt.colorbar()
    plt.xlabel("Metrique")
    plt.ylabel("Modèle")
    # The PageRank wrappers occupy rows 7-13 (seven base models, seven wrappers).
    plt.figtext(0,0,"Metriques : 0 : averagePrecision,1 : precisionAtK,2 : fMesureK,3 : rappelAtK,4 : NDCG,5 : reciprocalRank;Modèles : 0-4: Vectoriel, 5 : jelinekMercer,6 : okapiBM25, 7-13 : avec pagerank")
    plt.show()