Exemplos de CranFile em Python, exemplos de cran.CranFile em Python

Exemplo n.º 1

0

Exibir arquivo

def query(indexfilename, processingalgorithm, queryfilename, queryid, numresults=3):
    """Run one Cranfield query with the boolean or the vector model.

    Args:
        indexfilename: path of the saved inverted index to load.
        processingalgorithm: '0' selects booleanQuery, '1' selects vectorQuery.
        queryfilename: path of the query file passed to loadCranQry.
        queryid: id of the query to run; assigned to QueryProcessor.queryId.
        numresults: result count handed to QueryProcessor (default 3).

    Returns:
        The value returned by booleanQuery() or vectorQuery() for the query.

    Raises:
        ValueError: if processingalgorithm is neither '0' nor '1'.
    """
    # Reject an unknown selector up front. Previously this case fell through
    # both branches and crashed on `return results` with UnboundLocalError,
    # after the index and the collection had already been loaded.
    if processingalgorithm not in ('0', '1'):
        raise ValueError(
            "processingalgorithm must be '0' (boolean) or '1' (vector), got "
            + repr(processingalgorithm))

    qrys = loadCranQry(queryfilename)

    # NOTE(review): this relies on InvertedIndex.load() returning the loaded
    # index object -- confirm against the InvertedIndex implementation.
    loadiindex = InvertedIndex().load(indexfilename)

    # The document collection path is fixed to the working directory.
    cf = CranFile('cran.all')

    queryProcessor = QueryProcessor(qrys, loadiindex, cf.docs, numresults)
    if processingalgorithm == '0':
        # Only the boolean path runs the explicit preprocessing step.
        queryProcessor.preprocessing()
        queryProcessor.queryId = queryid
        return queryProcessor.booleanQuery()

    queryProcessor.queryId = queryid
    return queryProcessor.vectorQuery(queryProcessor.numofresults)

Exemplo n.º 2

0

Exibir arquivo

Arquivo: index.py Projeto: Joeyipp/simple-search-engine

def indexingCranfield():
    """Index the Cranfield collection and write the index to disk.

    Command line usage: "python index.py cran.all index_file".
    sys.argv[1] is the collection to read and sys.argv[2] is the file the
    finished index is saved to.
    """
    collection = CranFile(sys.argv[1])
    index = InvertedIndex()

    # Feed every document of the collection to the index, reporting progress.
    for document in collection.docs:
        print("Indexing document {}".format(document.docID))
        index.indexDoc(document)

    print("\nTotal documents indexed: {}".format(index.nDocs))

    # The index is sorted before it is persisted.
    index.sort()
    index.save(sys.argv[2])

    print('Done')

Exemplo n.º 3

0

Exibir arquivo

def query():
    """Run a single Cranfield query from the command line and print the results.

    Command line usage:
        python query.py <index-file-path> <processing-algorithm>
                        <query.txt path> <query-id>

    processing-algorithm is 0 for the boolean model (prints the matching
    document ids) or 1 for the vector model (prints the top 3 documents with
    their scores). A usage message is printed when the argument count is
    wrong, and "Invalid query id" when the id is missing from the query file.

    Raises:
        ValueError: if the query id or the algorithm is not an integer string.
    """
    # Ensure args are valid. Compare by value: `is` tests object identity and
    # only happens to work for small ints on CPython.
    if len(argv) != 5:
        print(
            "Syntax: python query.py <index-file-path> <processing-algorithm> <query.txt path> <query-id>"
        )
        return

    # Grab arguments
    index_file_loc = argv[1]
    processing_algo = argv[2]
    query_file_path = argv[3]
    query_id = argv[4]

    # Grab index file to restore II
    ii = InvertedIndex()
    ii.load(index_file_loc)

    # Get the document collection
    cf = CranFile("cran.all")

    # Get the query collection
    qc = loadCranQry(query_file_path)

    # Ids 1..99 are left-padded to three digits before the lookup (presumably
    # the keys of the query collection are zero-padded); any other value is
    # looked up exactly as it was typed.
    query_num = int(query_id)
    if 0 < query_num < 100:
        query_id = str(query_num).zfill(3)
    try:
        query = qc[query_id].text
    except KeyError:
        print("Invalid query id", query_id)
        return

    # Initialize a query processor
    qp = QueryProcessor(query, ii, cf)

    # Do query
    algorithm = int(processing_algo)
    if algorithm == 0:
        # Run the boolean query once and reuse the result for printing.
        result = qp.booleanQuery()
        if result:
            print("Results:", ", ".join(str(x) for x in result))
        else:
            print("Results: None")
    elif algorithm == 1:
        result = qp.vectorQuery(k=3)
        print("Results:")
        for r in result:
            print("Doc", r[0], "Score", r[1])
    else:
        print("Invalid processing algorithm",
              processing_algo + ". Use 0 (boolean) or 1 (vector).")

Exemplo n.º 4

0

Exibir arquivo

Arquivo: index.py Projeto: adamhs1997/cs7800project1

def indexingCranfield():
    """Index a Cranfield file and save the index, tf-idf vectors included.

    Command line usage: "python index.py cran.all index_file".
    Prints a syntax hint and returns when the argument count is not exactly
    two (plus the script name).
    """
    if len(argv) != 3:
        print("Syntax: python index.py <cran.all path> <index-save-location>")
        return

    # Exactly three entries are guaranteed here: script, input, output.
    _, source_path, target_path = argv

    print("Indexing documents from", source_path + "...")
    collection = CranFile(source_path)
    index = InvertedIndex()
    for document in collection.docs:
        index.indexDoc(document)

    # Order of the finishing steps: sort, then tf-idf, then save.
    index.sort()
    index.compute_tfidf()
    index.save(target_path)
    print("Index saved to", target_path + "!")

Exemplo n.º 5

0

Exibir arquivo

Arquivo: index.py Projeto: Joeyipp/simple-search-engine

def test():
    """Smoke-test indexing, saving and loading on the first two documents.

    Reads the collection from sys.argv[1], indexes its first two documents,
    prints statistics for the term 'lift', saves the index to sys.argv[2],
    loads it into a fresh InvertedIndex and prints the same statistics again
    so both outputs can be compared by eye.
    """
    # Import the cran.all collection
    cf = CranFile(sys.argv[1])

    # Instantiate an invertedIndex
    invertedIndex_1 = InvertedIndex()

    # Index the first 2 documents
    for i in range(2):
        print("Indexing document {}\n".format(cf.docs[i].docID))
        invertedIndex_1.indexDoc(cf.docs[i], "test")

    # Check number of document indexed
    print("# of documents indexed: {}".format(invertedIndex_1.nDocs))

    # Check number of terms indexed.
    # len() counts the keys directly; dict.iterkeys() exists only on Python 2
    # and raises AttributeError on Python 3.
    print("# of terms indexed: {}\n".format(len(invertedIndex_1.items)))

    # Sort the invertedIndex
    invertedIndex_1.sort()

    # Check the posting list, term frequency, and IDF
    print(
        "== Statistics for the term 'lift' BEFORE saving the index to disk (invertedIndex_1) =="
    )
    print("Posting list:\t{}".format(
        invertedIndex_1.find("lift").sorted_postings))
    print("Positions:\t{}".format(
        invertedIndex_1.find("lift").posting[1].positions))
    print("TF:\t\t{}".format(
        invertedIndex_1.find("lift").posting[1].term_freq()))
    print("IDF:\t\t{}\n".format(round(invertedIndex_1.idf("lift"), 5)))

    # Save the invertedIndex
    invertedIndex_1.save(sys.argv[2])

    # Instantiate a new invertedIndex
    invertedIndex_2 = InvertedIndex()

    # Load the invertedIndex
    invertedIndex_2.load(sys.argv[2])

    # Check the posting list, term frequency, and IDF.
    # The reloaded index is addressed with the string key "1" where the
    # in-memory index used the int key 1 (presumably the on-disk format turns
    # document ids into strings -- verify against save()/load()).
    print(
        "== Statistics for the term 'lift' AFTER loading the index from disk (invertedIndex_2) =="
    )
    print("Posting list:\t{}".format(
        invertedIndex_2.find("lift").sorted_postings))
    print("Positions:\t{}".format(
        invertedIndex_2.find("lift").posting["1"].positions))
    print("TF:\t\t{}".format(
        invertedIndex_2.find("lift").posting["1"].term_freq()))
    print("IDF:\t\t{}\n".format(round(invertedIndex_2.idf("lift"), 5)))

    print('Pass')

Exemplo n.º 6

0

Exibir arquivo

def indexingCranfield():
    """Index the Cranfield collection and save the result.

    Command line usage: "python index.py cran.all index_file".
    sys.argv[1] is the collection file and sys.argv[2] the output file.
    Relies on the names `collectionfile` and `dictionary`, which are not
    defined in this function.
    """
    source_path = sys.argv[1]
    target_path = sys.argv[2]

    # Create the collection reader and the inverted index.
    collection = CranFile(source_path)
    index = InvertedIndex()

    # Walk the whole collection, but only position 0 is actually processed.
    # NOTE(review): this limits indexing to the first document, which looks
    # like a debugging leftover -- confirm before relying on the saved index.
    for position, document in enumerate(collection.docs):
        if position >= 1:
            continue
        index.indexDoc(document)
        collectionfile.docs.update({document.docID: document})

    # Persist the index.
    index.save(dictionary, target_path)
    print("index created")

Exemplo n.º 7

0

Exibir arquivo

def indexingCranfield():
    """Index every Cranfield document and store the index data.

    Command line usage: "python index.py cran.all index_file".
    sys.argv[1] is the collection to read; sys.argv[2] is passed to
    InvertedIndex.storeData as the destination.
    """
    source_path, target_name = sys.argv[1], sys.argv[2]

    # The index is created before the collection is opened.
    indexer = InvertedIndex()
    collection = CranFile(source_path)

    for document in collection.docs:
        indexer.indexDoc(document)

    indexer.storeData(target_name)
    print("Done")

Exemplo n.º 8

0

Exibir arquivo

Arquivo: index.py Projeto: arjunsekar1991/searchenginepython

def indexingCranfield(collectionname, indexfilename):
    """Index a Cranfield collection, drop stop words and save the index.

    Command line usage: "python index.py cran.all index_file".

    Args:
        collectionname: path of the collection file given to CranFile.
        indexfilename: path the finished index is saved to.
    """
    collection = CranFile(collectionname)
    iindex = InvertedIndex()
    for document in collection.docs:
        iindex.indexDoc(document)

    # Stop-word removal happens after indexing: every term listed in the
    # local "stopwords" file (one per line) is deleted from the index.
    with open("stopwords") as stopword_file:
        for raw_line in stopword_file:
            word = raw_line.strip()
            if word in iindex.items:
                del iindex.items[word]

    # idf() is called for each remaining term and its return value is
    # discarded (presumably it stores the value on the index -- verify).
    for term in iindex.items:
        iindex.idf(term)

    iindex.save(indexfilename)
    print("Index builded successfully")

Exemplo n.º 9

0

Exibir arquivo

Arquivo: batch_eval.py Projeto: adamhs1997/cs7800project1

def eval():
    """Compare boolean and vector retrieval by NDCG over n random queries.

    Uses the names query_path, index_file, qrels_path and n, which are not
    defined in this function (expected at module level).

    Algorithm:
        1. Pick n random queries from the query file.
        2. Take the top 10 boolean and top 10 vector results for each.
        3. Compute NDCG of both result lists against the qrels file.
        4. Print the average NDCG per model and the p-values between them.

    Returns early, after printing a message, if a sampled query id is not in
    the query collection or if no query was evaluated at all.
    """
    # Get the query collection
    qc = loadCranQry(query_path)
    poss_queries = list(qc)

    # Load up the inverted index
    ii = InvertedIndex()
    ii.load(index_file)

    # Load up the document collection
    cf = CranFile("cran.all")

    # Index the ground truth as {first column: [second column, ...]}, both
    # parsed as ints, one qrels line at a time.
    qrel_dict = {}
    with open(qrels_path) as f:
        for qrel in f:
            qrel_split = qrel.split()
            qrel_dict.setdefault(int(qrel_split[0]), []).append(
                int(qrel_split[1]))

    # Run over N random queries, collecting NDCGs
    bool_ndcgs = []
    vector_ndcgs = []
    for _ in range(n):
        # Get random query ID
        query_id = choice(poss_queries)

        # Ids 1..99 are left-padded to three digits before the lookup.
        query_num = int(query_id)
        if 0 < query_num < 100:
            query_id = str(query_num).zfill(3)
        try:
            query = qc[query_id].text
        except KeyError:
            print("Invalid query id", query_id)
            return

        # Initialize the query processor
        qp = QueryProcessor(query, ii, cf)

        # Run bool query
        bool_result = qp.booleanQuery()[:10]

        # Run vector query
        vector_result = qp.vectorQuery(10)

        # Pull top 10 ground-truth results from qrels dict. The qrels key is
        # the 1-based position of the query in the query file, not its id.
        gt_results = qrel_dict[poss_queries.index(query_id) + 1][:10]

        # Compute NDCG for bool query
        # NOTE: There is no weighting on the bool query, so give all an even 1
        truth_vector = [doc in gt_results for doc in bool_result]
        bool_ndcg = ndcg_score(truth_vector, [1] * len(truth_vector),
                               k=len(truth_vector))

        # Compute NDCG for vector query; each result is a (doc, score) pair.
        vector_docs = [v[0] for v in vector_result]
        vector_scores = [v[1] for v in vector_result]
        truth_vector = [doc in gt_results for doc in vector_docs]
        vector_ndcg = ndcg_score(truth_vector,
                                 vector_scores,
                                 k=len(truth_vector))

        # Accumulate NDCGs
        bool_ndcgs.append(bool_ndcg)
        vector_ndcgs.append(vector_ndcg)

    # With n < 1 nothing was sampled; averaging would divide by zero.
    if not bool_ndcgs:
        print("No queries were evaluated; nothing to report")
        return

    # Average out score lists
    bool_avg = sum(bool_ndcgs) / len(bool_ndcgs)
    vector_avg = sum(vector_ndcgs) / len(vector_ndcgs)

    # Present averages and p-values
    print("Boolean NDCG average:", bool_avg)
    print("Vector NDCG average:", vector_avg)
    if n > 19:
        print("Wilcoxon p-value:", wilcoxon(bool_ndcgs, vector_ndcgs).pvalue)
    else:
        print("Wilcoxon p-value: Sample size too small to be significant")
    print("T-Test p-value:", ttest_ind(bool_ndcgs, vector_ndcgs).pvalue)

Exemplo n.º 10

0

Exibir arquivo

def query():
    """Command-line entry point: answer one query or time both models.

    Arguments (sys.argv): index file, model selection, query file, query id.
    Model "0" prints the boolean result, model "1" prints the top 3 vector
    results, and model "2" times both models on random query samples, using
    the (zero-padded) query id as the sample size.
    """
    collection = CranFile('CranfieldDataset/cran.all')
    index_path = sys.argv[1]
    model = sys.argv[2]
    query_path = sys.argv[3]
    # Query ids are looked up as three characters, e.g. 001 or 050.
    padded_id = str(sys.argv[4]).zfill(3)
    query_set = loadCranQry(query_path)

    # Timing mode needs no specific query; otherwise fetch its text.
    text = ""
    if model != '2':
        entry = query_set[padded_id]
        if padded_id == entry.qid:
            text = entry.text

    processor = QueryProcessor(text, index_path, collection.docs)

    if model == "0":
        matches = processor.booleanQuery()
        print("Boolean")
        print("Total number of documents is:",
              str(len(matches)) + "\nTheir DocIDs our:" + str(matches))
        return

    if model == "1":
        print("Vector")
        print(processor.vectorQuery(3))
        return

    if model != "2":
        return

    # --- Timing mode -------------------------------------------------------
    runs = 5
    sample_size = int(padded_id)
    top_k = 10
    bool_times = []
    vector_times = []

    for _ in range(runs):
        sample = getRandomQuery(query_set, sample_size)
        # A fresh processor per run; the original author notes this
        # construction is extremely expensive.
        processor = QueryProcessor("", index_path, collection.docs)

        # Boolean model: load each sampled query, then run it.
        began = timer()
        for _qid, sample_text in sample.items():
            processor.loadQuery(sample_text)
            processor.booleanQuery()
        finished = timer()
        bool_times.append(finished - began)

        # Vector model: one vectorQuery call per sampled query.
        # NOTE(review): no loadQuery call happens in this loop, so every call
        # appears to run against the last query loaded above -- confirm that
        # this is the intended measurement.
        began = timer()
        for _qid, _sample_text in sample.items():
            processor.vectorQuery(top_k)
        finished = timer()
        vector_times.append(finished - began)

    run_labels = map(str, range(1, runs + 1))
    print("Model\t\tRun:" + '\t\t\tRun:'.join(run_labels))
    print()
    print("Boolean Model: \t" + '\t'.join(map(str, bool_times)))
    print()
    print("Vector Model: \t" + '\t'.join(map(str, vresults_as_text(vector_times))) if False else "Vector Model: \t" + '\t'.join(map(str, vector_times)))
    print()

Exemplo n.º 11

0

Exibir arquivo

Arquivo: batch_eval.py Projeto: natcrossman/SimpleSearchEngine

def eval(testOn):
    """Batch-evaluate the boolean and vector models with NDCG and p-values.

    Command-line arguments read here: sys.argv[1] index file, sys.argv[2]
    query file, sys.argv[3] qrels file, sys.argv[4] number of random queries
    per iteration.

    For each of 5 iterations a random set of queries is drawn, every query is
    run through booleanQuery() and vectorQuery(k), and one NDCG score per
    model is appended to NDCGScoreBool / NDCGScoreVector. Those two lists
    accumulate over all iterations. At the end the averages and the t-test /
    Wilcoxon results are printed.

    Args:
        testOn: when truthy, extra assertions run and timing/debug output is
            printed.
    """
    k = 10  # k the number of top k pairs of (docID, similarity) to get from vectorQuery
    dictQ_ID = []
    indexFile = sys.argv[1]  #v "src/Data/tempFile"
    queryText = sys.argv[2]
    qrelsText = sys.argv[3]
    dictOfQuery = {}
    dictQrelsText = {}
    docCollection = CranFile('./CranfieldDataset/cran.all')
    NDCGScoreBool = []
    numberOfQueries = int(sys.argv[4])
    NDCGScoreVector = []
    #indexFile           = "src/Data/tempFile"
    #queryText           = 'src/CranfieldDataset/query.text'
    #qrelsText           = 'src/CranfieldDataset/qrels.text'
    #numberOfQueries     = 50
    numberOfTimeToLoop = 5

    #Loads Files
    listOfQueryRelsMaping = readFile(qrelsText)
    queryFile = loadCranQry(queryText)

    #Data Need
    for i in range(numberOfTimeToLoop):

        #Get random Queiry
        dictOfQuery = getRandomQuery(queryFile, numberOfQueries)
        if testOn:
            assert len(dictOfQuery
                       ) == numberOfQueries, "Error are getting random query"

        # Return all query
        # dictOfQuery = getAllDataItems(queryFile)
        # if testOn:
        #     assert len(dictOfQuery) == 225, "Error are getting random query"

        #get list of Query result from qrel.txt
        dictQrelsText = getResultsFrom_QrelsFile(listOfQueryRelsMaping,
                                                 dictOfQuery)
        if testOn:
            assert len(dictQrelsText
                       ) == numberOfQueries, "Error number Of Queries to large"

        start = timer()
        queryProcessor = QueryProcessor(
            "", indexFile,
            docCollection.docs)  # This is an extremely expensive process\
        end = timer()

        if testOn:
            print("Time for creating QueryProcessor:", end - start)
        countDoc = 0
        # NOTE(review): `start` is overwritten by every per-step timer inside
        # the query loop, so this value does not survive to the final
        # "Time for running" report below.
        start = timer()

        # The query-id list is reset every iteration, while the two NDCG score
        # lists keep growing across iterations.
        dictQ_ID = []
        # Note: the loop variable rebinds queryText (the query file path read
        # from sys.argv[2]) to the text of each query.
        for qid, queryText in dictOfQuery.items():
            countDoc += 1

            dictQ_ID.append(qid)

            if testOn:
                print("QID:", qid)
            start = timer()
            queryProcessor.loadQuery(queryText)
            end = timer()
            if testOn:
                print("Time for Load:", end - start)
                print("qrels: ", dictQrelsText[qid])

            start = timer()
            docIDs = queryProcessor.booleanQuery(
            )  # data would need to be like this [12, 14, 78, 141, 486, 746, 172, 573, 1003]
            #docIDs_1 = queryProcessor.booleanQuery_1()
            end = timer()
            if testOn:
                print("Time for booleanQuery:", end - start)

            start = timer()
            listOfDocIDAndSimilarity = queryProcessor.vectorQuery(
                k
            )  # data need to look like k=3 [[625,0.8737006126353902],[401,0.8697643788341478],[943,0.8424991316663082]]
            #vectorQueryDict[qid] = dictOfDocIDAndSimilarity
            end = timer()
            if testOn:
                print("Time for vectorQuery:", end - start)
                print("booleanQuery:", docIDs)

            #For Boolean part
            # Every returned document gets score 1 and relevance 1/0 depending
            # on whether it appears in the qrels for this query.
            start = timer()
            yTrue = []
            yScore = []
            for docID in docIDs:
                yScore.append(1)
                if docID in dictQrelsText[qid]:
                    yTrue.append(1)
                else:
                    yTrue.append(0)
            # NOTE(review): sorting the relevance labels detaches them from the
            # documents they were computed for, so the relevant hits are always
            # placed first before scoring -- confirm this is intended.
            yTrue.sort(reverse=True)
            score = metrics.ndcg_score(yTrue[:k], yScore[:k], k, "exponential")
            # A NaN score is recorded as 0.
            if math.isnan(score):
                NDCGScoreBool.append(0)
            else:
                NDCGScoreBool.append(score)
            end = timer()
            if testOn:
                print("Time for  Boolean ndcg:", end - start)

            #For Vector part
            # Same procedure, using the similarity of each (docID, similarity)
            # pair as the score.
            start = timer()
            yTrue = []
            yScore = []
            if testOn:
                print("vectorQuery:", listOfDocIDAndSimilarity)
            for docID_Score in listOfDocIDAndSimilarity:
                yScore.append(float(docID_Score[1]))
                if docID_Score[0] in dictQrelsText[qid]:
                    yTrue.append(1)
                else:
                    yTrue.append(0)
            # NOTE(review): same label sorting as in the boolean part above.
            yTrue.sort(reverse=True)
            score = metrics.ndcg_score(yTrue[:k], yScore[:k], k, "exponential")
            if math.isnan(score):
                NDCGScoreVector.append(0)
            else:
                NDCGScoreVector.append(score)
            end = timer()
            if testOn:
                print("Time for  Vector ndcg:", end - start)
        print("\nRunning Querys iteration:(", str(i + 1), ")\n", dictQ_ID)

        if testOn:
            # NOTE(review): dictQ_ID holds only this iteration's ids, but the
            # score lists start at the first iteration, so from the second
            # iteration on the ids are paired with scores of earlier queries.
            for QID, boolScore, vectorScore in zip(dictQ_ID, NDCGScoreBool,
                                                   NDCGScoreVector):
                print("QID", QID, "Boolean Model:", boolScore, "Vector Model",
                      vectorScore)

    print("\nThe Length Of Both NDCG Score is: ", len(NDCGScoreBool), "==",
          len(NDCGScoreVector))

    print('\nThe Avg NDCG Score')
    vectorAvg = avg(NDCGScoreVector)
    BoolAvg = avg(NDCGScoreBool)
    print("Avg NDCG Score for Bool:", BoolAvg, "\nAvg NDCG Score for Vector:",
          vectorAvg)
    end = timer()
    if testOn:
        # NOTE(review): `start` was last set by the final "Vector ndcg" timer
        # and countDoc is reset per iteration, so this line does not measure
        # the whole run -- confirm what it is meant to report.
        print("\n\nTime for running ", countDoc, " queries:", end - start)

    print('\nThe P-Value')
    p_va_ttest = stats.ttest_ind(NDCGScoreBool, NDCGScoreVector)
    p_va_wilcoxon = stats.wilcoxon(NDCGScoreBool, NDCGScoreVector)
    print("T-Test P-value: ", p_va_ttest)
    print("Wilcoxon P-value: ", p_va_wilcoxon)
    print('Done')

Exemplo n.º 12

0

Exibir arquivo

Arquivo: query.py Projeto: vaishnavivisweswaraiah/InformationRetrieval_searchEngine

        print("Total number of retrieved document for search is", len(Bresult))
        print(Bresult)
        BoolenQueryResultDic.append({qid: Bresult})
    else:
        print("Vector Query TF-IDF calculation in progress")
        Topk, k = qprocessorobj.vectorQuery(3)
        #print("vector",qid,qrys[qid].text)
        print("Top", k, "(DocID Similarity)", Topk[:k])


# Module-level setup executed at import time: reads the collection, the saved
# index and the query file, and leaves the resulting objects as module globals.
''' ************this below code is reused in batch_eval also*******************'''
input_filename = "cran.all"
# The index file path is the first command-line argument.
ouput_filename = sys.argv[1]  #"index_file" #sys.argv[2]
Queryfile = "query.text"  #sys.argv[3]#"query.text"
'''creating object for cranefile and collection file and inverted index class,postings class'''
cf = CranFile(input_filename)
collectionfile = Collection()
indexobj = InvertedIndex()
'iterating over cran file for document id'
# Register every document in the collection under its document id.
for i, doc in enumerate(cf.docs):
    collectionfile.docs.update({doc.docID: doc})
# NOTE(review): this statement is outside the loop, so it uses the last `doc`
# that was iterated (and raises NameError if cf.docs is empty) -- confirm.
postingobj = Posting(doc.docID)
'''reading index file which is stored while creating index'''
# NOTE(review): the loaded JSON is assigned to the InvertedIndex class
# attribute, not to the `indexobj` instance created above -- confirm intent.
with open(ouput_filename, "r") as invertedindex:
    InvertedIndex.items = json.load(invertedindex)
'formatting the query id in qrel.text and finding common query id in qrery.text'
# Map each query id to its 1-based position in the query file.
qidlist = {}
qrys = loadCranQry(Queryfile)
for position, q in enumerate(qrys):
    qidlist[q] = position + 1
'Below Variables are used for batch_eval.py file'

Exemplo n.º 13

0

Exibir arquivo

def test(index_loc, cran_loc, qrels_loc):
    """Run the project's sanity checks and print one True/False line per check.

    Covers term-frequency and IDF bookkeeping, boolean queries (exact title,
    AND, OR, NOT, operand order, parentheses, associativity, grouping) and
    vector queries (exact title plus five queries from "query.text").

    Args:
        index_loc: path of the saved inverted index to load.
        cran_loc: path of the Cranfield document collection.
        qrels_loc: path of the qrels (ground-truth) file.
    """

    def bool_query(text):
        # Run `text` as a boolean query against the loaded index.
        return QueryProcessor(text, ii, cf).booleanQuery()

    ##### SETUP ITEMS #####

    # Grab index file to restore II
    ii = InvertedIndex()
    ii.load(index_loc)

    # Get the document collection
    cf = CranFile(cran_loc)

    # Index the ground truth as {first column: [second column, ...]}, both
    # parsed as ints, one qrels line at a time.
    qrel_dict = {}
    with open(qrels_loc) as f:
        for qrel in f:
            qrel_split = qrel.split()
            qrel_dict.setdefault(int(qrel_split[0]), []).append(
                int(qrel_split[1]))

    ##### INITIAL TEST ITEMS #####
    print("TESTS BASED ON SUGGESTED TESTING POINTS")

    # Ensure tf is correct: for every posting of one word, the reported term
    # frequency must equal the number of recorded positions.
    posting_list = ii.find("experiment").posting
    tf_vector = [
        len(posting_list[posting].positions)
        == posting_list[posting].term_freq()
        for posting in posting_list
    ]
    print("TF is computed correctly:", all(tf_vector))

    # Ensure idf is correct
    print("IDF is computed correctly:",
          log10(ii.nDocs / len(posting_list)) == ii.idf("experiment"))

    # As both tf and idf are correct, and tf-idf is a product of the two,
    #   it is reasonable to assume tf-idf is computed correctly

    ##### BOOL QUERY TESTS #####

    # Here, I use very specific boolean queries to ensure that a
    #   limited number of documents are returned
    print("\nBOOL QUERY TESTS")

    # Ensure that the exact title of doc 8 matches for doc 8
    doc8 = "measurements of the effect of two-dimensional and three-dimensional roughness elements on boundary layer transition"
    qp1 = QueryProcessor(doc8, ii, cf)
    print("Bool query matches on exact title:", qp1.booleanQuery() == [8])

    # Ensure that bool query matches very specific AND query
    print(
        "Bool query matches on specific AND query ('hugoniot and infinitesimally'):",
        bool_query("hugoniot and infinitesimally") == [329])

    # Test that an OR query is handled properly
    #   Both gravel and stagnation have completely distinct postings lists.
    #   OR should merge them. Index terms are stemmed ("stagnat").
    gravel_postings = ii.find("gravel").sorted_postings[:]
    stag_postings = ii.find("stagnat").sorted_postings[:]
    print("Bool query successfully handles OR ('gravel or stagnation'):",
          bool_query("gravel or stagnation")
          == sorted(gravel_postings + stag_postings))

    # Test that NOT is handled properly
    #   The posting list for "diameter" is a subset of "slipstream" postings
    #   (oddly enough). To test this works, do "slipstream and not diameter"
    #   and we should get slipstream's postings minus those of diameter.
    slip_postings = ii.find("slipstream").sorted_postings[:]
    diam_postings = ii.find("diamet").sorted_postings[:]
    slip_not_diam = [t for t in slip_postings if t not in diam_postings]
    print("Bool query successfully handles NOT ('slipstream and not diameter'):",
          bool_query("slipstream and not diameter") == slip_not_diam)

    # Ensure AND/OR order doesn't matter
    print("Bool query can handle query regardless of AND order ('a and b' = 'b and a'):",
          bool_query("slipstream and diameter")
          == bool_query("diameter and slipstream"))
    print("Bool query can handle query regardless of OR order ('a or b' = 'b or a'):",
          bool_query("slipstream or diameter")
          == bool_query("diameter or slipstream"))

    # Ensure that the presence of parens does not change query results
    print("Bool query can handle query regardless of parens ('slipstream and diameter'):",
          bool_query("slipstream and diameter")
          == bool_query("(slipstream and diameter)"))

    # Ensure parentheses do not change order of processing for AND-AND and OR-OR queries
    print("Bool query AND is accociative ('(a and b) and c' = 'a and (b and c)'):",
          bool_query("(slipstream and diameter) and thrust")
          == bool_query("slipstream and (diameter and thrust)"))
    print("Bool query OR is accociative ('(a or b) or c' = 'a or (b or c)'):",
          bool_query("(slipstream or diameter) or thrust")
          == bool_query("slipstream or (diameter or thrust)"))

    # Ensure parentheses properly group items
    #   Tested by doing the query "manually": run each group on its own and
    #   take the sorted, de-duplicated union of the three results.
    part_one = bool_query("conduction and cylinder and gas")
    part_two = bool_query("radiation and gas")
    part_three = bool_query("hugoniot")
    expected_result = sorted(set(part_three + part_one + part_two))
    print("Bool query parens successfully group conflicting operators:",
          bool_query("(conduction and cylinder and gas) or (radiation and gas) or hugoniot")
          == expected_result)

    ##### VECTOR QUERY TESTS #####

    # For this, just ensure that most of the results are in the expected list
    print("\nVECTOR QUERY TESTS")

    # Ensure vector query can match on exact title
    print("Vector query matches on exact title:",
          qp1.vectorQuery(1)[0][0] == 8)

    # Try a few example queries from query.text
    #   As long as one-fifth of the top 10 are in gt_result, call it a pass.
    # Note that queries with larger answer sets were chosen to
    #   ensure there were enough to get to one-fifth of ten
    qc = loadCranQry("query.text")
    poss_queries = list(qc)

    for qid in ("001", "128", "226", "196", "291"):
        result = QueryProcessor(qc[qid].text, ii, cf).vectorQuery(10)
        # The qrels key is the 1-based position of the query in query.text.
        gt_result = qrel_dict[poss_queries.index(qid) + 1]
        hits = sum(doc_score[0] in gt_result for doc_score in result)
        # One-fifth of ten results is 2 hits; the previous `> 2` demanded 3
        # and contradicted the message printed here.
        print("Vector query is at least one-fifth correct for query {}:".format(qid),
              hits >= 2)

Exemplo n.º 14

0

Exibir arquivo

def test():
    """Check InvertedIndex against known values for the Cranfield collection.

    Builds an index from ``src/CranfieldDataset/cran.all`` and asserts:

    * the idf reported for "experiment" and "opportun",
    * the posting-list size for "experiment" and the total document count,
    * the per-document term frequency of every posting found for
      "experiment" and "bifurc",
    * that ``save`` and ``storeData`` create their output files and that
      ``loadData`` returns an index reporting the same idf.

    Raises:
        AssertionError: on the first check that fails.
    """
    # Expected term frequency of "experiment" per document ID.
    dictTest_experiment = {
        '1': 3,
        '11': 1,
        '12': 1,
        '16': 1,
        '17': 1,
        '19': 1,
        '25': 1,
        '29': 1,
        '30': 2,
        '35': 1,
        '37': 1,
        '41': 1,
        '42': 1,
        '43': 1,
        '47': 1,
        '52': 2,
        '53': 1,
        '58': 1,
        '69': 1,
        '70': 1,
        '74': 2,
        '78': 2,
        '84': 3,
        '99': 2,
        '101': 1,
        '103': 1,
        '112': 1,
        '115': 1,
        '121': 1,
        '123': 3,
        '131': 1,
        '137': 1,
        '140': 1,
        '142': 1,
        '154': 1,
        '156': 1,
        '167': 1,
        '168': 1,
        '170': 1,
        '171': 2,
        '173': 2,
        '176': 1,
        '179': 2,
        '183': 1,
        '184': 1,
        '186': 3,
        '187': 1,
        '188': 1,
        '189': 2,
        '191': 1,
        '195': 3,
        '197': 2,
        '202': 1,
        '203': 1,
        '206': 2,
        '207': 2,
        '212': 1,
        '216': 1,
        '220': 1,
        '222': 1,
        '225': 2,
        '227': 1,
        '230': 1,
        '234': 4,
        '245': 1,
        '251': 1,
        '256': 3,
        '257': 1,
        '262': 1,
        '271': 3,
        '273': 1,
        '277': 1,
        '282': 1,
        '283': 1,
        '286': 1,
        '287': 1,
        '289': 1,
        '294': 1,
        '295': 1,
        '304': 1,
        '307': 1,
        '329': 2,
        '330': 2,
        '334': 2,
        '338': 1,
        '339': 2,
        '344': 3,
        '345': 1,
        '346': 3,
        '347': 1,
        '354': 1,
        '360': 1,
        '369': 1,
        '370': 1,
        '372': 3,
        '377': 1,
        '397': 1,
        '409': 1,
        '411': 2,
        '413': 2,
        '418': 1,
        '420': 2,
        '421': 1,
        '423': 2,
        '427': 1,
        '433': 1,
        '435': 1,
        '439': 1,
        '441': 2,
        '442': 3,
        '443': 1,
        '453': 1,
        '455': 2,
        '462': 1,
        '464': 1,
        '467': 1,
        '484': 3,
        '494': 2,
        '496': 1,
        '497': 2,
        '498': 1,
        '501': 1,
        '503': 1,
        '504': 1,
        '505': 1,
        '511': 1,
        '517': 1,
        '518': 2,
        '519': 1,
        '520': 2,
        '522': 3,
        '536': 1,
        '540': 1,
        '544': 3,
        '549': 2,
        '552': 2,
        '553': 1,
        '558': 2,
        '563': 1,
        '567': 1,
        '569': 2,
        '572': 4,
        '588': 1,
        '595': 1,
        '600': 1,
        '606': 1,
        '610': 1,
        '632': 1,
        '634': 1,
        '635': 1,
        '636': 1,
        '644': 1,
        '645': 1,
        '649': 1,
        '658': 1,
        '662': 2,
        '663': 2,
        '666': 2,
        '670': 1,
        '675': 1,
        '678': 1,
        '679': 1,
        '685': 3,
        '688': 4,
        '689': 2,
        '694': 1,
        '704': 2,
        '712': 1,
        '713': 1,
        '717': 1,
        '720': 1,
        '725': 1,
        '728': 1,
        '729': 1,
        '739': 1,
        '740': 1,
        '743': 1,
        '753': 1,
        '760': 3,
        '764': 1,
        '766': 4,
        '767': 3,
        '772': 3,
        '781': 2,
        '790': 1,
        '801': 3,
        '802': 1,
        '806': 1,
        '816': 2,
        '820': 1,
        '823': 2,
        '825': 1,
        '827': 2,
        '829': 1,
        '830': 1,
        '836': 5,
        '844': 1,
        '845': 2,
        '846': 1,
        '847': 2,
        '856': 4,
        '857': 3,
        '858': 3,
        '863': 2,
        '866': 2,
        '867': 1,
        '869': 1,
        '878': 2,
        '881': 1,
        '887': 1,
        '891': 2,
        '907': 1,
        '911': 2,
        '912': 2,
        '923': 1,
        '924': 1,
        '927': 3,
        '928': 3,
        '932': 2,
        '935': 2,
        '946': 2,
        '950': 4,
        '951': 1,
        '954': 2,
        '955': 1,
        '959': 1,
        '961': 1,
        '964': 1,
        '965': 1,
        '973': 1,
        '974': 1,
        '984': 2,
        '986': 4,
        '996': 1,
        '997': 4,
        '999': 1,
        '1006': 1,
        '1008': 1,
        '1016': 1,
        '1019': 3,
        '1028': 1,
        '1039': 3,
        '1040': 1,
        '1045': 1,
        '1046': 1,
        '1049': 2,
        '1051': 1,
        '1062': 3,
        '1066': 4,
        '1069': 1,
        '1070': 1,
        '1074': 3,
        '1075': 3,
        '1076': 1,
        '1078': 1,
        '1080': 1,
        '1081': 1,
        '1082': 1,
        '1083': 1,
        '1092': 1,
        '1097': 3,
        '1098': 2,
        '1110': 1,
        '1112': 1,
        '1118': 2,
        '1122': 2,
        '1125': 1,
        '1127': 1,
        '1145': 1,
        '1146': 1,
        '1151': 1,
        '1153': 1,
        '1155': 2,
        '1156': 3,
        '1158': 1,
        '1159': 2,
        '1160': 2,
        '1161': 2,
        '1167': 2,
        '1171': 1,
        '1172': 1,
        '1177': 1,
        '1185': 1,
        '1186': 1,
        '1187': 1,
        '1192': 1,
        '1195': 1,
        '1196': 2,
        '1198': 1,
        '1199': 1,
        '1204': 3,
        '1205': 1,
        '1209': 3,
        '1212': 1,
        '1213': 2,
        '1214': 3,
        '1216': 1,
        '1218': 2,
        '1220': 1,
        '1222': 1,
        '1225': 3,
        '1227': 1,
        '1228': 1,
        '1230': 1,
        '1231': 1,
        '1234': 1,
        '1237': 1,
        '1261': 1,
        '1262': 1,
        '1263': 2,
        '1264': 2,
        '1268': 1,
        '1269': 2,
        '1277': 2,
        '1290': 1,
        '1298': 1,
        '1302': 2,
        '1310': 1,
        '1314': 2,
        '1317': 1,
        '1319': 1,
        '1324': 1,
        '1337': 3,
        '1338': 2,
        '1339': 1,
        '1341': 1,
        '1352': 2,
        '1363': 2,
        '1364': 2,
        '1369': 1,
        '1372': 1,
        '1374': 1,
        '1378': 1,
        '1384': 1,
        '1390': 1,
        '1392': 1,
        '1396': 1,
        '1397': 1
    }
    # Expected term frequency of "bifurc" per document ID.
    dictTest_bifurc = {'957': 1, '1232': 1}

    filePath = "src/CranfieldDataset/cran.all"
    fileName = "src/Data/Test.json"
    fileNameO = "src/Data/TestPickle"

    # Index the whole collection once; every check below uses this index.
    invertedIndexer = InvertedIndex()
    data = CranFile(filePath)
    for doc in data.docs:
        invertedIndexer.indexDoc(doc)

    # idf check. The values are compared through str(), so idf() is
    # presumably expected to return a value already rounded to four
    # decimals -- confirm against InvertedIndex.idf.
    idf_experiment = invertedIndexer.idf("experiment")
    idf_opportun = invertedIndexer.idf("opportun")
    assert str(idf_experiment) == "0.6172", " Wrong idf."
    assert str(idf_opportun) == "2.8451", " Wrong idf."

    # Posting-list size and corpus size.
    assert len(invertedIndexer.find("experiment").get_posting_list()
               ) == 338, "Worng Lenght for experiment term find does not work."
    assert invertedIndexer.get_total_number_Doc(
    ) == 1400, "Worng total nubmer of Doc in Corpus"

    # Every posting found for "experiment" must carry the expected frequency.
    for docID, post in invertedIndexer.find(
            "experiment").get_posting_list().items():
        assert docID in dictTest_experiment and post.term_freq(
        ) == dictTest_experiment[docID], "For Term experiment wrong value"

    # Same check for "bifurc"; the message now names the term being checked.
    for docID, post in invertedIndexer.find(
            "bifurc").get_posting_list().items():
        assert docID in dictTest_bifurc and post.term_freq(
        ) == dictTest_bifurc[docID], "For Term bifurc wrong value"

    # JSON persistence: only the existence of the output file is checked.
    invertedIndexer.save(fileName)
    assert path.exists(fileName), "error in saving json data."

    # Pickle persistence: the reloaded index must report the same idf.
    invertedIndexer.storeData(fileNameO)
    assert path.exists(fileNameO), "error in saving pickle data."
    reloaded = invertedIndexer.loadData(fileNameO)
    idfScore = reloaded.idf("experiment")
    assert str(idfScore) == "0.6172", " Error in Load the picle file."

    print("test Passed")

Exemplo n.º 15

0

Exibir arquivo

    def vectorQuery(self, k):
        '''Rank the collection against the current query by cosine similarity.

        The query is selected by ``self.queryId`` from ``self.raw_query``.
        Query and document terms are weighted as ``(1 + log10(tf)) * idf``,
        with idf read from ``self.index.items[term]``. Document vectors are
        length-normalised; the query vector is not. The score of a document
        is the sum, over the query terms, of the product of the two weights.

        Args:
            k: number of top-ranked documents to return.

        Returns:
            A list of at most ``k`` document IDs in descending score order,
            or an empty list when ``self.queryId`` is not in
            ``self.raw_query``.

        Side effects:
            Stores the docID -> score mapping in
            ``self.intermediateResultVectorQuery[queryId]`` and, when
            ``k == 3``, prints the top (docID, score) pairs.
        '''
        vectorResult = []
        for q in self.raw_query:
            if q != self.queryId:
                continue

            # The collection, stemmer and stopword list are only needed once
            # a query actually matches, so nothing is loaded otherwise.
            cf = CranFile('cran.all')
            ps = PorterStemmer()
            # Read the stopword list once into a set instead of re-opening
            # the file for every token of every document.
            with open("stopwords") as f:
                stopwords = {line.strip() for line in f}

            # Query terms: lowercase, spell-correct and stem each token, then
            # drop it if the stemmed form is a stopword. Building a new list
            # avoids removing from the list while indexing into it, which
            # could wrap to a negative index or raise IndexError.
            query_tokens = []
            for token in word_tokenize(self.raw_query[q].text):
                stemmed = ps.stem(correction(token.lower()))
                if stemmed not in stopwords:
                    query_tokens.append(stemmed)

            # Query vector; terms missing from the index get weight 0.
            queryVector = {}
            for token in query_tokens:
                if token in self.index.items:
                    wordfreq = query_tokens.count(token)
                    queryVector[token] = (self.index.items[token].get('idf')) * (1 + math.log(wordfreq, 10))
                else:
                    queryVector[token] = 0

            docidScorepair = {}
            self.intermediateResultVectorQuery[q] = docidScorepair
            for doc in cf.docs:
                # Document terms: lowercase, drop stopwords, then stem.
                raw_tokens = word_tokenize(doc.title) + word_tokenize(doc.body)
                tokens = []
                for token in raw_tokens:
                    lowered = token.lower()
                    if lowered not in stopwords:
                        tokens.append(ps.stem(lowered))

                # Document vector from the term frequencies stored in the
                # index postings for this document.
                documentVector = {}
                for token in tokens:
                    if token in self.index.items:
                        item = self.index.items[token]
                        termfreq = item.get('posting').get(doc.docID).get('termfreq')
                        documentVector[token] = (1 + math.log(termfreq, 10)) * (item.get('idf'))
                    else:
                        documentVector[token] = 0

                # Length-normalise the document vector. An all-zero vector
                # has no direction, so it is scaled by 0 rather than 1/0.
                sumofsquaresdocument = 0
                for weight in documentVector.values():
                    sumofsquaresdocument = sumofsquaresdocument + np.multiply(weight, weight)
                try:
                    scale = 1 / math.sqrt(sumofsquaresdocument)
                except ZeroDivisionError:
                    scale = 0
                for term in documentVector:
                    documentVector[term] = documentVector[term] * scale

                # Per-term products over the query terms; a query term that
                # is absent from the document contributes 0.
                cosineVector = {}
                for term, queryweight in queryVector.items():
                    if term in documentVector:
                        cosineVector[term] = np.multiply(documentVector[term], queryweight)
                    else:
                        cosineVector[term] = 0

                docidScorepair[doc.docID] = sum(cosineVector.values())

            counterObject = Counter(self.intermediateResultVectorQuery[q])
            high = counterObject.most_common(k)
            # The (docID, score) pairs are echoed only for a top-3 request.
            if k == 3:
                print(high)
            vectorResult = [docID for docID, _score in high]
        return vectorResult

Exemplo n.º 16

0

Exibir arquivo

Arquivo: index.py Projeto: vidhlakh/Information-retrieval

    # command line usage: "python index.py cran.all index_file"
    # the index is saved to index_file
    coll={}
    collect= Collection()
    #creating object     
    invertindex=InvertedIndex() 
    
    #adding all documents to collection class
    for docu in cf.docs:
        coll={docu.docID:[docu.title,docu.author,docu.body]}
        collect.docs.update(coll)
        #invertindex.docs.update(coll)
    
    for docu in cf.docs:                 
        invertindex.indexDoc(docu)   
    
    #save to json file    
    invertindex.save(indexfile)
    # load from json file
    invertindex.load(indexfile)     
    
    
if __name__ == '__main__':
    # Command line: python index.py <cran file> <index file>
    # argv[1] is the input collection, argv[2] the output index file.
    # These are deliberately module-level names: the indexing code above
    # reads `cf` and `indexfile` as globals rather than taking parameters.
    crfile, indexfile = sys.argv[1], sys.argv[2]
    cf = CranFile(crfile)

    # Build and persist the index, then run the self-tests.
    indexingCranfield()
    test()

Exemplo n.º 17

0

Exibir arquivo

Arquivo: index.py Projeto: adamhs1997/cs7800project1

def test():
    '''Smoke-test InvertedIndex, Posting and IndexItem and print the results.

    Nothing is asserted: each check prints a label followed by a value or a
    boolean, so the output has to be inspected by hand. The function reads
    "cran.all" and "stopwords" from the working directory and writes
    "index.pkl".
    '''

    ####### TEST CASES FOR INVERTED INDEX CLASS #######

    # Get all documents from cran.all--let Cranfile object handle this
    cf = CranFile("cran.all")

    # Build an inverted index object
    ii = InvertedIndex()

    # Index one document
    ii.indexDoc(cf.docs[0])

    # The first term should be "experiment" (verified by printing contents
    #   of II). We want to ensure that find() finds it
    index_item = ii.find("experiment")
    print("Result of find:", index_item.term, index_item.posting)

    # Next, sort to ensure that it works
    # TODO: figure out what this should do
    ii.sort()
    print("Sorted!")

    # Get the IDF of the term "experiment"
    #   Following the formula from our slides, this should be 0
    #   (the term occurs in the only indexed document)
    print("IDF:", ii.idf("experiment"))

    # Add back in the rest of Cranfield dataset
    for doc in cf.docs[1:]:
        ii.indexDoc(doc)

    # Re-do find now that we have more things in the index
    index_item = ii.find("experiment")
    print("Result of find:", index_item.term, index_item.posting)

    # Ensure sort works on the larger index
    # TODO: figure out what this should do
    ii.sort()
    print("Sorted!")

    # Recompute the IDF of "experiment" now that the whole collection is
    #   indexed; the single-document expectation above no longer applies.
    print("IDF:", ii.idf("experiment"))

    # Get the tfidf dict
    ii.compute_tfidf()

    # Save off our index
    ii.save("index.pkl")

    # Read back in the index, ensure they are the same
    ii_from_file = InvertedIndex()
    ii_from_file.load("index.pkl")

    # Cannot determine if the actual items are equal objects,
    #   so just ensure the stats are the same
    print("Load matches saved number of docs:", ii.nDocs == ii_from_file.nDocs)
    print("Load matches saved IDF for 'experiment':",
          ii.idf("experiment") == ii_from_file.idf("experiment"))
    print("Load matches saved find term for 'experiment':",
          ii.find("experiment").term == ii_from_file.find("experiment").term)
    print(
        "Load matches saved find posting for 'experiment':",
        str(ii.find("experiment").posting) == str(
            ii_from_file.find("experiment").posting))

    ####### TEST CASES FOR POSTING CLASS #######

    # Create test posting
    p = Posting(docID=1)

    # Test adding a position
    p.append(3)
    print("Position appended to posting:", p.positions == [3])

    # Add position out of order, ensure sort works
    p.append(1)
    print("Append is initially out-of-order:", p.positions == [3, 1])
    p.sort()
    print("Sort correctly sorts postings:", p.positions == [1, 3])

    # Ensure we can merge in new postings
    to_merge = [4, 5, 6]
    p.merge(to_merge)
    print("Merge correctly merges:", p.positions == [1, 3, 4, 5, 6])

    # Ensure term frequency is correct
    print("Term frequency correctly counts postings:", p.term_freq() == 5)

    ####### TEST CASES FOR INDEX ITEM CLASS #######

    # Create index item
    iitem = IndexItem("abc")

    # Add value to index item
    iitem.add(0, 40)
    print("Document added to item:", 0 in iitem.posting)
    print("Posting created for document in item:",
          isinstance(iitem.posting[0], Posting))

    ####### ADDITIONAL TEST CASES #######

    print("\nTHE FOLLOWING ARE BASED ON THE GIVEN TEST QUESTIONS")

    # Act on the assumption all words are stemmed
    #   This should be done in the tokenize part of util
    #   The idea was to re-stem all words and ensure they equal the words
    #     in the index, but some double-stemmings differ anyway.

    # Ensure stopwords were removed
    from nltk.stem.porter import PorterStemmer
    with open("stopwords") as f:
        stopwords = f.readlines()
    s = PorterStemmer()
    # Membership must be tested against the index terms (the dict keys).
    #   Testing against ii.items.items() compares a string with
    #   (term, item) tuples, which never matches, so the check could
    #   never report a leftover stopword.
    stopword_vector = [s.stem(x.strip()) in ii.items for x in stopwords]
    print("All stopwords removed from index:", not any(stopword_vector))

    # Print number of terms in dict--Dr. Chen can ensure this is right
    print("Number of terms in dictionary:", len(ii.items))

    # Print average size of postings--Dr. Chen can ensure this makes sense
    total_positions = 0
    posting_count = 0
    for item in ii.items.values():
        for posting in item.posting.values():
            total_positions += len(posting.positions)
            posting_count += 1
    print("Average posting length:", total_positions / posting_count)

Exemplo n.º 18

0

Exibir arquivo

def eval(indexfilename, queryfilename, queryrefilename, numberofrandomqueries):
    """Compare vector and boolean retrieval on randomly sampled queries.

    For each sampled query the top-10 results of both models are turned
    into 0/1 relevance lists using the judgements in ``queryrefilename``,
    scored with ``ndcg_score``, and the two score lists are finally compared
    with a Wilcoxon signed-rank test whose result is printed.

    Note: the function name shadows the ``eval`` builtin inside this module;
    it is kept because callers use it.

    Args:
        indexfilename: index file, passed through to ``query``.
        queryfilename: query file; used both to enumerate the valid query
            IDs and passed through to ``query``.
        queryrefilename: relevance file with space-separated fields, the
            first being a query number and the second a document ID.
        numberofrandomqueries: how many queries to sample (at most 225).

    Raises:
        Exception: if ``numberofrandomqueries`` exceeds 225.
    """
    if numberofrandomqueries > 225:
        raise Exception('please enter query count less than or equal to 225')

    top_k = 10

    # Valid query IDs come from the same query file that query() is given.
    qrys = loadCranQry(queryfilename)
    validqueries = [int(q) for q in qrys]

    # Group the judged document IDs by zero-padded query number. The file
    # is opened in a with-block so the handle is always closed.
    queryRelevence = {}
    with open(queryrefilename) as qrelfile:
        for line in qrelfile:
            fields = line.split(" ")
            querynumber = '%0*d' % (3, int(fields[0]))
            queryRelevence.setdefault(querynumber, []).append(fields[1])

    # The relevance file numbers queries by position, while the query file
    # uses its own IDs: the i-th judged query is re-keyed with the i-th
    # valid query ID.
    queryRelevenceUpdated = {}
    for position, querynumber in enumerate(queryRelevence):
        queryRelevenceUpdated['%0*d' % (3, validqueries[position])] = (
            queryRelevence[querynumber])

    list_of_random_items = random.sample(validqueries, numberofrandomqueries)
    booleanndcg = []
    vectorndcg = []

    for sampled in list_of_random_items:
        queryid = '%0*d' % (3, int(sampled))
        print('query for which ndcg is calculated ' + queryid)
        relevant = queryRelevenceUpdated[queryid]

        # Vector model: 1 for each returned document that is judged
        # relevant; the ideal ordering is the same gains sorted descending.
        vectorresult = query(indexfilename, '1', queryfilename, queryid,
                             top_k)
        vectorgains = [1 if docid in relevant else 0 for docid in vectorresult]
        idealvectorresult = sorted(vectorgains, reverse=True)
        if sum(idealvectorresult) == 0:
            ndcgscore = 0
        else:
            ndcgscore = ndcg_score(idealvectorresult, vectorgains)
        vectorndcg.append(ndcgscore)

        # Boolean model: same relevance mapping, padded with zeros up to
        # top_k entries because the boolean result may be shorter.
        booleanqueryresult = query(indexfilename, '0', queryfilename, queryid,
                                   top_k)
        booleanquery = [
            1 if docid in relevant else 0 for docid in booleanqueryresult
        ]
        booleanquery.extend([0] * (top_k - len(booleanquery)))
        # Ideal list: one 1 per judged document, capped at top_k entries.
        idealbooleanresult = [
            1 if rank < len(relevant) else 0 for rank in range(top_k)
        ]
        if sum(booleanquery) == 0:
            ndcgscoreboolean = 0
        else:
            # NOTE(review): the argument order here (actual, ideal) is the
            # reverse of the vector call above (ideal, actual) -- confirm
            # against ndcg_score's signature which one is intended.
            ndcgscoreboolean = ndcg_score(booleanquery, idealbooleanresult)
        booleanndcg.append(ndcgscoreboolean)

    print('P value for all the queries processed is:')
    print(
        scipy.stats.wilcoxon(vectorndcg,
                             booleanndcg,
                             zero_method='wilcox',
                             correction=False))
    print('Done')