예제 #1
0
def query():
    """Run a single query from the command line and print its results.

    Expected ``argv``:
    ``query.py <index-file-path> <processing-algorithm> <query.txt path> <query-id>``
    where ``processing-algorithm`` is ``0`` for a boolean query or ``1`` for
    a vector query (top 3 documents).

    Prints a usage line and returns when the argument count is wrong, and
    prints an error and returns when the query id is not present in the
    loaded query collection.
    """
    # Script name plus four arguments. Compare by value: `is` on ints tests
    # object identity and only appears to work because of int caching.
    if len(argv) != 5:
        print(
            "Syntax: python query.py <index-file-path> <processing-algorithm> <query.txt path> <query-id>"
        )
        return

    # Grab arguments
    index_file_loc = argv[1]
    processing_algo = argv[2]
    query_file_path = argv[3]
    query_id = argv[4]

    # Restore the inverted index from the index file
    ii = InvertedIndex()
    ii.load(index_file_loc)

    # Get the document collection
    cf = CranFile("cran.all")

    # Get the query collection
    qc = loadCranQry(query_file_path)

    # Ids 1..99 are zero-padded to three digits before the lookup; any other
    # value is looked up exactly as given on the command line.
    numeric_id = int(query_id)
    if 0 < numeric_id < 100:
        query_id = str(numeric_id).zfill(3)
    try:
        query = qc[query_id].text
    except KeyError:
        print("Invalid query id", query_id)
        return

    # Initialize a query processor
    qp = QueryProcessor(query, ii, cf)

    # Do query
    algo = int(processing_algo)
    if algo == 0:
        result = qp.booleanQuery()
        if result:
            # Reuse the result instead of running the boolean query twice.
            print("Results:", ", ".join(str(x) for x in result))
        else:
            print("Results: None")
    elif algo == 1:
        result = qp.vectorQuery(k=3)
        print("Results:")
        for r in result:
            print("Doc", r[0], "Score", r[1])
    else:
        print("Invalid processing algorithm",
              processing_algo + ". Use 0 (boolean) or 1 (vector).")
예제 #2
0
def query(indexfilename, processingalgorithm, queryfilename, queryid,
          numresults=3):
    """Run one query through a QueryProcessor and return its result.

    Args:
        indexfilename: path handed to ``InvertedIndex.load``.
        processingalgorithm: ``'0'`` for a boolean query, ``'1'`` for a
            vector query (compared as strings).
        queryfilename: path handed to ``loadCranQry``.
        queryid: id assigned to ``queryProcessor.queryId`` before querying.
        numresults: passed to the QueryProcessor constructor; the vector
            query is asked for ``queryProcessor.numofresults`` results.

    Returns:
        Whatever ``booleanQuery()`` / ``vectorQuery()`` returns, or ``None``
        when ``processingalgorithm`` is neither ``'0'`` nor ``'1'``
        (previously this case raised UnboundLocalError).
    """
    qrys = loadCranQry(queryfilename)

    loadiindex = InvertedIndex()
    loadiindex = loadiindex.load(indexfilename)

    cf = CranFile('cran.all')

    queryProcessor = QueryProcessor(qrys, loadiindex, cf.docs, numresults)

    # Default for an unrecognised algorithm so the return below is always bound.
    results = None
    if processingalgorithm == '0':
        queryProcessor.preprocessing()
        queryProcessor.queryId = queryid
        results = queryProcessor.booleanQuery()
    elif processingalgorithm == '1':
        queryProcessor.queryId = queryid
        results = queryProcessor.vectorQuery(queryProcessor.numofresults)
    return results
예제 #3
0
    def crawl(self):
        """
        Performs a crawl process on each pending page.

        Every entry of ``self.urls`` that is not already in
        ``self.crawled_urls`` is fetched (``self.url`` + entry), parsed,
        passed to ``self.links`` and ``self.words``, and indexed. The entry
        is then moved from ``self.urls`` to ``self.crawled_urls`` and the
        crawler sleeps for 5 seconds.

        Args:
            None

        Returns:
            None
        """

        index = InvertedIndex()

        # Walk the pending list by position instead of with a for-loop:
        # removing the current entry while iterating the list directly makes
        # the iterator skip the entry that follows it.
        position = 0
        while position < len(self.urls):
            url = self.urls[position]
            if url in self.crawled_urls:
                # Already visited: leave it in place and move on.
                position += 1
                continue

            # build page
            page = f"{self.url}{url}"
            response = requests.get(page)
            page_object = BeautifulSoup(response.content, "html.parser")

            # begin crawling
            print(f"Crawling: {url}")
            # NOTE(review): presumably self.links() appends newly found links
            # to self.urls; entries appended here are picked up by this loop.
            self.links(page_object)
            words = self.words(page_object)
            index.create_index(page, words)

            # mark page as visited; the next pending entry slides into
            # `position`, so the position is intentionally not advanced
            self.crawled_urls.append(url)
            self.urls.remove(url)

            time.sleep(5)
예제 #4
0
def test_apriori():
    """Check apriori itemsets and generated rules on six small transactions.

    Loads the transactions into an InvertedIndex, runs apriori with a
    minimum support of 2/6, and asserts both the frequent itemsets (with
    their supports) and the full set of generated rules.
    """
    transactions = "".join([
        "a,b,c,d,e,f\n",
        "g,h,i,j,k,l\n",
        "z,x\n",
        "z,x\n",
        "z,x,y\n",
        "z,x,y,i\n",
    ])

    # itemset -> expected support
    expected_support = {}
    expected_support[ItemSet("i")] = 2 / 6
    expected_support[ItemSet("z")] = 4 / 6
    expected_support[ItemSet("x")] = 4 / 6
    expected_support[ItemSet("y")] = 2 / 6
    expected_support[ItemSet("xz")] = 4 / 6
    expected_support[ItemSet("yz")] = 2 / 6
    expected_support[ItemSet("xy")] = 2 / 6
    expected_support[ItemSet("xyz")] = 2 / 6

    index = InvertedIndex()
    index.load(transactions)
    itemsets = apriori(index, 2 / 6)
    assert set(expected_support.keys()) == set(itemsets)
    for found in itemsets:
        assert expected_support[found] == index.support(found)

    multi_item = [i for i in itemsets if len(i) > 1]
    print("Itemsets={}".format(multi_item))

    def items(*names):
        # Shorthand for a frozenset of Item objects.
        return frozenset({Item(name) for name in names})

    # (antecedent, consequent, confidence, lift, support)
    expected_rules = {
        (items("x", "y"), items("z"), 1, 1.5, 1 / 3),
        (items("x"), items("y"), 0.5, 1.5, 1 / 3),
        (items("x"), items("z", "y"), 0.5, 1.5, 1 / 3),
        (items("x"), items("z"), 1, 1.5, 2 / 3),
        (items("y"), items("x"), 1, 1.5, 1 / 3),
        (items("y"), items("z", "x"), 1, 1.5, 1 / 3),
        (items("y"), items("z"), 1, 1.5, 1 / 3),
        (items("z", "x"), items("y"), 0.5, 1.5, 1 / 3),
        (items("z", "y"), items("x"), 1, 1.5, 1 / 3),
        (items("z"), items("x", "y"), 0.5, 1.5, 1 / 3),
        (items("z"), items("x"), 1, 1.5, 2 / 3),
        (items("z"), items("y"), 0.5, 1.5, 1 / 3),
    }

    rules = set(generate_rules(itemsets, 0, 0, index))

    for rule in rules:
        antecedent, consequent, confidence, lift, support = rule
        print("{}, {} conf={:.4f}, {:.4f}, {:.4f}".format(
            antecedent, consequent, confidence, lift, support))

    assert rules == expected_rules
예제 #5
0
 def __init__(self, query, index_file, collection):
     """Keep the raw query, load the index and set up the tokenizer.

     ``index_file`` is handed to ``InvertedIndex.loadData`` and the value it
     returns becomes ``self.index``; ``collection`` is stored as the document
     collection. The tokenizer is given the keys of
     ``self.index.get_items_inverted()`` as its known words. A truthy query
     is preprocessed straight away into ``self.processed_query``.
     """
     self.raw_query = query
     self.docs = collection

     loader = InvertedIndex()
     self.index = loader.loadData(index_file)

     vocabulary = self.index.get_items_inverted().keys()
     self.tokenizer = Tokenizer(known_words=set(vocabulary))

     if not self.raw_query:
         return
     self.processed_query = self.preprocessing(self.raw_query)
    def booleanQuery(self):
        """Boolean (AND) query over the terms in the module-level ``Queryterm``.

        A query like "A B C" is treated as "A AND B AND C". Each term's
        postings are fetched with ``InvertedIndex.getPostingsList``; only
        terms whose postings list has between 1 and 299 entries take part.

        * No usable term: returns ``[]`` (previously raised TypeError from
          ``set.intersection`` called with no arguments).
        * One usable term: prints and returns its postings.
        * Several usable terms: returns the documents containing all of
          them; when that intersection is empty, falls back to the union of
          ``InvertedIndex.mergeList`` results for each pair of adjacent terms.

        Returns:
            list of docIDs sorted by their integer value.
        """
        # query term -> postings list (dict keys keep each term once, in
        # first-seen order)
        postings_by_term = {}
        for qterm in Queryterm:
            postings_by_term[qterm] = InvertedIndex.getPostingsList(qterm)

        # Drop terms with no postings and terms with 300 or more postings.
        usable_terms = [
            term for term, plist in postings_by_term.items()
            if 0 < len(plist) < 300
        ]

        if not usable_terms:
            return []

        if len(usable_terms) == 1:
            # Single usable term: its postings are the answer. They are
            # non-empty by construction, so no "no match" case exists here.
            single_result = postings_by_term[usable_terms[0]]
            print("Result of the search query ", single_result)
            return sorted(single_result, key=int)

        # Documents that contain every usable term.
        booleanResult = set.intersection(
            *(set(postings_by_term[term]) for term in usable_terms))

        # Nothing contains all terms: fall back to pairwise merges of
        # adjacent terms and collect whatever they produce.
        if not booleanResult:
            for left, right in zip(usable_terms, usable_terms[1:]):
                merged = InvertedIndex.mergeList(postings_by_term[left],
                                                 postings_by_term[right])
                booleanResult.update(merged)

        return sorted(booleanResult, key=int)
예제 #7
0
 def booleanQuery(self):
     """Boolean (AND) query over ``self.tokens``.

     A query like "A B C" is treated as "A AND B AND C": the doc ids of the
     first token seed the result, which is then intersected with the index
     item of every token in turn.

     Returns:
         ``[]`` for an empty token list (previously an IndexError);
         otherwise the value produced by the last
         ``index_item.intersection(docs)`` call.
         NOTE(review): the concrete type of that value is defined by the
         index item class, which is not visible here; confirm that ``[]``
         is an acceptable empty result for callers.
     """
     # No tokens means there is no first posting list to start from.
     if not self.tokens:
         return []

     ivObj = InvertedIndex()
     ivObj.load(self.filename)

     # Get the doc ids from the sorted postings in the same order.
     docs = ivObj.items[self.tokens[0]].get_sorted_doc_ids()
     for token in self.tokens:
         # Find intersection between the current docs and the index_item
         # for the current token.
         docs = ivObj.items[token].intersection(docs)
     return docs
예제 #8
0
def test_stress():
    """Mine each dataset with Apriori and FP-Growth and assert they agree.

    For every (csv path, minimum support) pair both miners are timed, the
    resulting itemsets are compared as sets, and the faster one is reported.
    """
    cases = (
        ("datasets/UCI-zoo.csv", 0.3),
        ("datasets/mushroom.csv", 0.4),
        # Disabled datasets:
        # ("datasets/BMS-POS.csv", 0.05),
        # ("datasets/kosarak.csv", 0.05),
    )

    for csvFilePath, min_support in cases:
        # Apriori, timed including the index load.
        print("Running Apriori for {}".format(csvFilePath))
        apriori_started = time.time()
        index = InvertedIndex()
        index.load_csv(csvFilePath)
        apriori_itemsets = apriori(index, min_support)
        apriori_duration = time.time() - apriori_started
        apriori_report = "Apriori complete. Generated {} itemsets in {:.2f} seconds"
        print(apriori_report.format(len(apriori_itemsets), apriori_duration))

        # FP-Growth, timed including reading the CSV.
        print("Running FPTree for {}".format(csvFilePath))
        fptree_started = time.time()
        with open(csvFilePath, newline='') as csvfile:
            test_transactions = list(csv.reader(csvfile))
            fptree_itemsets = mine_fp_tree(test_transactions, min_support)
        fptree_duration = time.time() - fptree_started
        fptree_report = "fp_growth complete. Generated {} itemsets in {:.2f} seconds"
        print(fptree_report.format(len(fptree_itemsets), fptree_duration))

        # Both miners must produce the same itemsets.
        results_match = set(fptree_itemsets) == set(apriori_itemsets)
        if results_match:
            print("SUCCESS({}): Apriori and fptree results match".format(csvFilePath))
        else:
            print("FAIL({}): Apriori and fptree results differ!".format(csvFilePath))
        assert results_match

        if apriori_duration > fptree_duration:
            margin = apriori_duration - fptree_duration
            print("FPTree was faster by {:.2f} seconds".format(margin))
        else:
            margin = fptree_duration - apriori_duration
            print("Apriori was faster by {:.2f} seconds".format(margin))
        print("")
예제 #9
0
def build_index():
    """Make sure an inverted index exists in INDEX_FOLDER.

    Creates INDEX_FOLDER when missing, then tries to load a saved index
    from it. If the load yields a truthy index nothing else happens;
    otherwise a new index is built over all article ids with a
    single-process IndexBuilder and saved to INDEX_FOLDER.
    """
    folder_missing = not os.path.exists(INDEX_FOLDER)
    if folder_missing:
        os.mkdir(INDEX_FOLDER)

    if InvertedIndex.load(INDEX_FOLDER, InvertedIndex.NAME):
        logging.debug("Index is successfully loaded")
        return

    logging.debug("Building index...")
    article_ids = select(article.id for article in Article)[:]
    fresh_index = InvertedIndex()
    builder = IndexBuilder(processes=1)
    builder.build(fresh_index, article_ids)

    logging.debug("Saving index...")
    fresh_index.save(INDEX_FOLDER)
def query(index_file, algorithm, query_file, query_id):
    """Run one query against the Cranfield collection and print the result.

    ``algorithm`` is compared as a string: ``'0'`` runs the boolean model,
    ``'1'`` runs the vector model asking for 3 results. Any other value
    prints ``None``.
    """
    queries = cranqry.loadCranQry(query_file)  # query collection from file
    index = InvertedIndex().load(index_file)
    documents = cran.CranFile('cran.all').docs

    processor = QueryProcessor(queries, index, documents)
    processor.preprocessing()

    if algorithm == '0':
        # boolean model
        results = processor.booleanQuery(query_id)
    elif algorithm == '1':
        # vector model, top 3
        results = processor.vectorQuery(3, query_id)
    else:
        results = None
    print(results)
예제 #11
0
def setup_ranker():
    """Build the module-level ``ranker`` (a TfIdf instance).

    Loads the saved inverted index, pairs every Article with the contents
    of its processed abstract file, and stores a TfIdf ranker over them in
    the global ``ranker``.
    """
    global ranker

    text_processor = TextProcessor()
    index = InvertedIndex.load(INDEX_FOLDER, "inverted_index")

    article_ids = select(article.id for article in Article)
    # Lazily resolve each id so every article is fetched once, in order.
    fetched = (Article[article_id] for article_id in article_ids)
    docs = [
        AbstractAndArticle(entry, _read_file(entry.processed_abstract_path))
        for entry in fetched
    ]

    ranker = TfIdf(index, text_processor, docs, VECTORS_PER_FILE,
                   VECTORS_SAVE_FOLDER)
예제 #12
0
    def vectorQuery(self, k):
        """Vector query processing, using cosine similarity of unit vectors.

        The query vector holds, per token in ``self.tokens``, that token's
        idf divided by the number of tokens. Each candidate document (any
        document that has at least one query token in ``self.index``) gets
        a vector of tf-idf weights, with 0.0 for tokens missing from the
        index. Both vectors are passed through ``self.unitVector`` and
        scored with a dot product.

        Args:
            k: maximum number of results wanted.

        Returns:
            Up to ``k`` ``(docID, similarity)`` pairs in descending order of
            similarity. Fewer than ``k`` pairs are returned when fewer
            documents match (previously this raised IndexError).
        """
        ivObj = InvertedIndex()
        ivObj.load(self.filename)  # loading the InvertedIndex

        # Collect candidate documents and the query weights in one pass.
        doc_set = set()
        term_idf_list = []
        for term in self.tokens:
            if term in self.index:
                doc_set.update(self.index[term].posting.keys())
            term_idf_list.append(ivObj.idf(term) * 1.0 / len(self.tokens))
        doc_list = list(doc_set)

        # One weight per query token for every candidate document, aligned
        # with term_idf_list.
        result = {
            docID: [
                ivObj.tfidf(term, docID) if term in self.index else 0.0
                for term in self.tokens
            ]
            for docID in doc_list
        }

        # Dot product of the normalised query and document vectors.
        query_unit = np.array(self.unitVector(term_idf_list))
        score_dict = {
            docID: np.dot(query_unit,
                          np.array(self.unitVector(result[docID])))
            for docID in doc_list
        }

        final = sorted(score_dict.items(), key=itemgetter(1), reverse=True)
        # Slicing tolerates k larger than the number of scored documents.
        return final[:max(k, 0)]
예제 #13
0
def run_rank():
    """Interactive ranking loop.

    Builds a TfIdf ranker over every Article paired with the contents of
    its processed abstract file, then repeatedly reads a query from the
    user and prints the title and document URL of the top 5 ranked articles.
    The loop has no exit condition of its own.
    """
    text_processor = TextProcessor()
    index = InvertedIndex.load(INDEX_FOLDER, "inverted_index")

    article_ids = select(article.id for article in Article)
    # Lazily resolve each id so every article is fetched once, in order.
    fetched = (Article[article_id] for article_id in article_ids)
    docs = [
        AbstractAndArticle(entry, _read_file(entry.processed_abstract_path))
        for entry in fetched
    ]

    ranker = TfIdf(
        index,
        text_processor,
        docs,
        vectors_per_file=VECTORS_PER_FILE,
        vectors_save_folder=VECTORS_SAVE_FOLDER,
    )

    while True:
        user_query = input("Enter query: ")
        for top_id in ranker.rank(user_query, 5):
            hit = Article[top_id]
            print(hit.title, hit.document.url)
예제 #14
0
def query(index_file, model_type, query_file, query_id):
    """Run one query, or the batch evaluation, through a QueryProcessor.

    ``model_type`` is compared against the integers 0, 1 and 2:
    0 prints the boolean query result, 1 prints the top 3 vector query
    results, 2 runs ``BatchEvaluation``. Any other value does nothing after
    the processor has been constructed.
    """
    # document collection
    documents = cran.CranFile("cran.all")
    # inverted index saved earlier
    loaded_index = InvertedIndex().load(index_file)
    # query collection
    query_set = loadCranQry(query_file)

    qp = QueryProcessor(query_set, loaded_index, documents, query_id)

    if model_type == 0:
        print(qp.booleanQuery())
    elif model_type == 1:
        print(qp.vectorQuery(3))
    elif model_type == 2:
        qp.BatchEvaluation()
예제 #15
0
def _write_training_data(out_path, newsgroups_root_dir, ft_dict, weight_of,
                         flush_each_doc=False):
    """Write one training-data file with one line per document.

    Each line is ``<class label> <feature id>:<weight> ...`` with the
    feature entries sorted by integer feature id.

    Args:
        out_path: file to (over)write.
        newsgroups_root_dir: directory tree walked for documents.
        ft_dict: maps a term to its feature id (as a string).
        weight_of: callable ``(word, doc_id) -> weight`` for one term.
        flush_each_doc: flush the output file after every document line.
    """
    # current_file_id is used as the doc id.
    # NOTE(review): this assumes walk() visits files in the same order the
    # index assigned doc ids in; confirm against the indexing code.
    current_file_id = 1
    with open(out_path, 'w') as outf:
        # Go through each document in newsgroups dir
        for root, _, files in walk(newsgroups_root_dir):
            # The containing folder name selects the class label
            local_dir = root.split(sep)[-1]

            for file_name in files:
                outf.write(class_def_helper(local_dir) + " ")
                print(root, file_name)

                # Get the words from the doc
                stemmed_token_list = preprocess_doc(root + sep + file_name)

                # A set keeps one entry per distinct feature:weight pair
                data_set = set()
                for word in stemmed_token_list:
                    # Skip blank stopwords
                    if word == "":
                        continue
                    weight = weight_of(word, current_file_id)
                    data_set.add(ft_dict[word] + ":" + str(weight))

                # One line per document, features ordered by feature id
                outf.write(" ".join(
                    sorted(data_set, key=lambda x: int(x.split(':')[0]))) +
                           "\n")
                if flush_each_doc:
                    outf.flush()

                # Increment our current doc
                current_file_id += 1


def main():
    """Generate feature, class and training-data files for the newsgroups.

    Reads four positional arguments from ``argv``: the newsgroups root
    directory, the feature-definition output path, the class-definition
    output path and the training-data path prefix. Loads the inverted index
    from ``idx_save.pkl`` and writes:

    * the feature definition file (``<feature id> <term>`` per line),
    * the class definition file (``<class> <directory>`` per line),
    * ``<prefix>.TFIDF``, ``<prefix>.TF`` and ``<prefix>.IDF`` training files.
    """

    #########
    # SETUP #
    #########

    # Get input args
    newsgroups_root_dir = argv[1]
    feat_def_path = argv[2]
    class_def_path = argv[3]
    training_data_path = argv[4]

    # The index is expected to exist already; building it is disabled here:
    # index_newsgroups(newsgroups_root_dir, "idx_save.pkl")
    ii = InvertedIndex()
    ii.load("idx_save.pkl")

    # Write out feature/term pairs to feat_def_path
    with open(feat_def_path, 'w') as outf:
        for feature_id, item in enumerate(ii.items):
            outf.write(str(feature_id) + " " + str(item) + "\n")

    # Read back in the feature/term pairs for later
    with open(feat_def_path, 'r') as inf:
        ft_pairs = inf.readlines()

    # Put the ft_pairs into a dictionary (term -> feature id) for quick lookup
    ft_dict = {}
    for pair in ft_pairs:
        fields = pair.split()
        ft_dict[fields[1]] = fields[0]

    # Map the different newsgroups to a given class
    # This is fairly manual...
    with open(class_def_path, 'w') as outf:
        for group_dir in listdir(newsgroups_root_dir):
            outf.write(class_def_helper(group_dir) + " " + group_dir + "\n")

    ############################
    # TRAINING DATA GENERATION #
    ############################

    def term_freq(word, doc_id):
        # Raw term frequency of `word` in document `doc_id`.
        return ii.find(word).posting[doc_id].term_freq()

    def tfidf_weight(word, doc_id):
        # Log-scaled term frequency times inverse document frequency.
        tf = term_freq(word, doc_id)
        idf = ii.idf(word)
        return log10(1 + tf) * idf

    def idf_weight(word, _doc_id):
        # Inverse document frequency only; the document id is not needed.
        return ii.idf(word)

    # Same traversal three times, differing only in the per-term weight.
    # Only the TF-IDF file is flushed after each document, as before.
    _write_training_data(training_data_path + ".TFIDF", newsgroups_root_dir,
                         ft_dict, tfidf_weight, flush_each_doc=True)
    _write_training_data(training_data_path + ".TF", newsgroups_root_dir,
                         ft_dict, term_freq)
    _write_training_data(training_data_path + ".IDF", newsgroups_root_dir,
                         ft_dict, idf_weight)
예제 #16
0
class QueryProcessor:
    """Boolean and vector-space query processing over an inverted index."""

    ##
    #
    #    @param         self
    #    @param         query
    #    @param         index_file
    #    @param         collection
    #    @return        None
    #    @brief         The constructor.
    #                   This process is extremely expensive because it loads the entire pickle object into memory.
    #                   If we are only executing this for one query it is fine but if we are doing it
    #                   for the evaluation use loadQuery instead
    #    @exception     None documented yet
    ##
    def __init__(self, query, index_file, collection):
        ''' index_file is handed to InvertedIndex.loadData; collection is the document collection'''
        self.raw_query = query
        self.index = InvertedIndex()
        self.index = self.index.loadData(index_file)
        self.docs = collection
        self.tokenizer = Tokenizer(
            known_words=set(self.index.get_items_inverted().keys()))
        # Always define processed_query so the query methods can be called
        # even when no query was supplied up front.
        if self.raw_query:
            self.processed_query = self.preprocessing(self.raw_query)
        else:
            self.processed_query = []

    ##
    #   @brief         This method is used to load the next query for evaluation
    #   @param         self
    #   @param         query
    #   @return        None
    #   @exception     None
    ##
    def loadQuery(self, query):
        ''' replace the current query and preprocess it '''
        self.raw_query = query
        self.processed_query = self.preprocessing(self.raw_query)

    ##
    #   @brief         This method runs the tokenizer's spelling/stemming pipeline on a raw query
    #   @param         self
    #   @param         raw_query
    #   @return        the value returned by the tokenizer
    #   @exception     None
    ##
    def preprocessing(self, raw_query):
        ''' apply the same preprocessing steps used by indexing,
            also use the provided spelling corrector. Note that
            spelling corrector should be applied before stopword
            removal and stemming (why?)'''
        return self.tokenizer.transpose_document_tokenized_stemmed_spelling(
            raw_query)

    ##
    #   @brief         This method does the boolean query processing
    #   @param         self
    #   @return        results:list[docID]
    #   @exception     None
    ##
    def booleanQuery(self):
        ''' boolean query processing; note that a query like "A B C" is transformed to "A AND B AND C" for retrieving posting lists and merge them.

        Returns [] for an empty query or when any query word is missing
        from the index; otherwise the docIDs common to all query words,
        in ascending integer order.'''
        # A set-based intersection would likely be faster, but the
        # sorted-list merge below is kept deliberately.
        if len(self.processed_query) == 0:
            return []

        inverted = self.index.get_items_inverted()

        ## every query word must be in the index, if not return [] ##
        if any(w not in inverted for w in self.processed_query):
            return []

        #### document_ids is a list of lists containing only document ids ####
        # Each list is sorted by integer value because the merge below
        # relies on ascending order.
        document_ids = [
            sorted(inverted[w].get_posting_list().keys(), key=int)
            for w in self.processed_query
        ]

        ## a single term needs no merging ##
        if len(document_ids) == 1:
            return document_ids[0]

        # starting with the shortest list keeps the intermediate results small
        document_ids.sort(key=len)
        results = document_ids[0]

        ## intersect the running result with each remaining posting list ##
        for p in document_ids[1:]:
            intermediate = []
            i, j = 0, 0
            while i < len(results) and j < len(p):
                left, right = int(results[i]), int(p[j])
                if left < right:
                    i += 1
                elif left > right:
                    j += 1
                else:
                    intermediate.append(p[j])
                    j += 1
                    i += 1
            results = intermediate

            ## terms are totally disjoint, nothing more can match
            if len(results) == 0:
                return results

        return results

    ##
    #   @brief         This method compute cosine similarity for two vectors
    #   @param         self
    #   @param         vec1
    #   @param         vec2
    #   @return        cosine score rounded to 4 decimals: float
    #   @exception     None
    ##
    def cosine_similarity(self, vec1, vec2):
        ''' compute cosine similarity: (vec1*vec2)/(||vec1||*||vec2||).

        Returns 0.0 when either vector has zero length instead of dividing
        by zero.'''
        AA, AB, BB = 0, 0, 0
        for x, y in zip(vec1, vec2):
            AA += x * x
            BB += y * y
            AB += x * y
        denominator = math.sqrt(AA * BB)
        if denominator == 0:
            return 0.0
        return round(AB / denominator, 4)

    ##
    #   @brief         This method lists documents with a zero score
    #   @param         self
    #   @param         k        maximum number of pairs to return
    #   @param         exclude  docIDs that must not be listed again
    #   @return        list[(docID: str, 0)] in ascending docID order
    #   @exception     None
    ##
    def _zero_scored_docs(self, k, exclude=()):
        ''' up to k (docID, 0) pairs taken from every posting list in the index, lowest docIDs first '''
        # Compare ids as integers so "12" and 12 are treated as the same doc.
        excluded = {int(d) for d in exclude}
        all_docids = set()
        for item in self.index.get_items_inverted().values():
            all_docids.update(int(d) for d in item.get_posting_list().keys())
        return [(str(d), 0) for d in sorted(all_docids - excluded)[:k]]

    ##
    #   @brief         This method compute vector model
    #   @param         self
    #   @param         k
    #   @return        list[(docID: str, score)] ranked by score, or None when k exceeds the corpus size
    #   @exception     None
    ##
    def vectorQuery(self, k):
        ''' vector query processing, using the cosine similarity.

        Returns the top k pairs of (docID, similarity), ranked by cosine
        similarity in descending order with ties broken by ascending docID.
        When fewer than k documents share a term with the query, the rest
        is filled with (docID, 0) pairs in ascending docID order. An empty
        query, or one whose words all have idf 0, yields only such pairs.
        If k is greater than the number of documents a message is printed
        and None is returned.'''
        if len(self.processed_query) == 0:
            return self._zero_scored_docs(k)

        # de-duplicate while keeping first-seen order so runs are repeatable
        query_words = list(dict.fromkeys(self.processed_query))
        idfs = [self.index.idf(w) for w in query_words]

        # behavior for k larger than the corpus is undefined in the
        # assignment; report it and return nothing
        if k > self.index.get_total_number_Doc():
            print(('k is greater than number of documents',))
            return

        # none of the query words carries any weight: every document scores 0
        if all(idf == 0 for idf in idfs):
            return self._zero_scored_docs(k)

        # drop words with 0 idf, they cannot contribute to any score
        weighted = [(idf, w) for idf, w in zip(idfs, query_words)
                    if not idf == 0]
        idfs = [idf for idf, _ in weighted]
        query_words = [w for _, w in weighted]

        # log-normalised term frequencies of the query
        query_term_counter = Counter(self.processed_query)
        query_tf_vector = [
            round(math.log10(query_term_counter[w] + 1), 4)
            for w in query_words
        ]

        # log-normalised term frequencies per document; the position in each
        # vector is the position of the word in query_words, so a word that
        # is absent from the index simply leaves zeros behind
        inverted = self.index.get_items_inverted()
        document_tfs = {}
        for inx, w in enumerate(query_words):
            if w not in inverted:
                continue
            for document_id, posting in inverted[w].get_posting_list().items():
                tfs = document_tfs.setdefault(document_id,
                                              [0] * len(query_words))
                tfs[inx] = math.log10(posting.term_freq() + 1)

        query_tfidf = np.multiply(query_tf_vector, idfs)
        cosines = {
            d: self.cosine_similarity(query_tfidf, np.multiply(d_tf, idfs))
            for d, d_tf in document_tfs.items()
        }

        # consistent ordering: highest score first, then lowest docID
        ranked = sorted(cosines.items(),
                        key=lambda pair: (-pair[1], int(pair[0])))
        ret = [(str(int(d)), s) for d, s in ranked[:k]]

        # not enough scored documents: pad with zero-score documents that
        # are not already in the result
        missing = k - len(ret)
        if missing > 0:
            ret.extend(
                self._zero_scored_docs(missing, exclude=[d for d, _ in ret]))
        return ret
예제 #17
0
def _relevance_labels(doc_ids, relevant_docs, size):
    """Return 0/1 relevance flags for *doc_ids*, zero-padded to *size* items.

    A flag is 1 when the document id occurs in *relevant_docs*. A result list
    that already has *size* or more entries is returned without padding.
    """
    labels = [1 if doc_id in relevant_docs else 0 for doc_id in doc_ids]
    # Multiplying a list by a negative number yields [], so long lists are
    # left untouched.
    labels.extend([0] * (size - len(labels)))
    return labels


def eval(index_file, query_file, qrels_File, number_of_queries):
    """Compare the boolean and vector models by NDCG on random query samples.

    Five iterations are run. Each one samples ``number_of_queries`` query ids
    from the qrels dictionary, computes an NDCG per query for both models,
    averages them, and runs a Wilcoxon test on the two NDCG lists. One row per
    iteration is written to ``Evaluation_search.csv``.

    Args:
        index_file: path handed to ``InvertedIndex().load``.
        query_file: path handed to ``loadCranQry``.
        qrels_File: path handed to ``process_querls_file``.
        number_of_queries: sample size per iteration; also used as the
            divisor of the averages.
    """
    top_k = 5  # cut-off used for the vector query, the padding and NDCG

    queries = loadCranQry(query_file)
    queries_id_list = [str(int(x)) for x in queries.keys()]
    qrels_dict = process_querls_file(qrels_File, queries_id_list)
    inputdocument = cran.CranFile("cran.all")
    # load the index file saved at from part 1
    index = InvertedIndex().load(index_file)
    qp = QueryProcessor(queries, index, inputdocument, number_of_queries)
    queries_id_list_int = [int(x) for x in qrels_dict.keys()]

    with open('Evaluation_search.csv', 'w') as f:
        f.write("%s,%s,%s,%s\n" % ("Iteration", "AverageNDCG-booleanModel",
                                   "AverageNDCG-vectorModel", "P-value"))
        for i in range(0, 5):
            vectorNADC = []
            booleanNADC = []
            random_query_id_list = random.sample(queries_id_list_int,
                                                 number_of_queries)
            for q_id in random_query_id_list:
                print("Processing for Query ID ::", q_id)
                qp.querynumber = q_id
                # A query without judgments has no relevant documents; the
                # empty default avoids a TypeError from "x in None".
                relevant_docs = qrels_dict.get(str(q_id), ())

                # ---- vector model ----
                vector_top3 = qp.vectorQuery(top_k)
                print("Output for Vector Model Result::", vector_top3)
                if len(vector_top3) < 1:
                    vectorNADC.append(0)
                else:
                    vector_label = [x[0] for x in vector_top3]
                    score = [x[1] for x in vector_top3]
                    print("DocumentIDs of Vector Model Result:: ",
                          vector_label)
                    print("Scores of Vector Model Result::", score)
                    # Labels and scores are padded to the same length so the
                    # two vectors given to ndcg_score always line up.
                    true_label = _relevance_labels(vector_label,
                                                   relevant_docs, top_k)
                    score.extend([0] * (top_k - len(score)))
                    print("Actual Vector:: ", true_label)
                    print("Predicted Vector:: ", score)
                    if sum(true_label) == 0:
                        vectorNADC.append(0)
                    else:
                        ndcg = metrics.ndcg_score(true_label, score, top_k)
                        print("Calculated ndcg for Vector::", ndcg)
                        vectorNADC.append(ndcg)

                # ---- boolean model ----
                boolean_res = qp.booleanQuery()
                print("output of boolean_res:: ", boolean_res)
                if len(boolean_res) < 1:
                    booleanNADC.append(0)
                else:
                    # The boolean model has no ranking: every hit scores 1.
                    score = [1] * len(boolean_res)
                    score.extend([0] * (top_k - len(score)))
                    true_label = _relevance_labels(boolean_res,
                                                   relevant_docs, top_k)
                    print("Actual boolean:: ", true_label)
                    print("Predicted boolean:: ", score)
                    if sum(true_label) == 0:
                        booleanNADC.append(0)
                    else:
                        ndcg = metrics.ndcg_score(true_label, score, top_k)
                        print("Calculated ndcg for Boolean::", ndcg)
                        booleanNADC.append(ndcg)

            print("Calculated NADC sum for all queries", vectorNADC)
            average_vector = float(sum(vectorNADC) / number_of_queries)
            print("Calculated NADC sum for all queries", booleanNADC)
            average_boolean = float(sum(booleanNADC) / number_of_queries)
            print("Avergae NADC Vector::", average_vector)
            print("Avergae NADC boolean::", average_boolean)
            p_value = scipy.stats.wilcoxon(vectorNADC,
                                           booleanNADC,
                                           zero_method='wilcox',
                                           correction=False)
            print(i, str(average_boolean), str(average_vector),
                  str(p_value[1]))
            p = "%.20f" % float(p_value[1])
            print('P value for all the queries processed is:', p)
            f.write("%s,%s,%s,%s\n" % (i + 1, str(average_boolean),
                                       str(average_vector), str(p)))
    print('Done')
        BoolenQueryResultDic.append({qid: Bresult})
    else:
        print("Vector Query TF-IDF calculation in progress")
        Topk, k = qprocessorobj.vectorQuery(3)
        #print("vector",qid,qrys[qid].text)
        print("Top", k, "(DocID Similarity)", Topk[:k])


# Module-level setup executed at import time. The bare string literals below
# are no-op expression statements the author used as section notes.
''' ************this below code is reused in batch_eval also*******************'''
input_filename = "cran.all"
ouput_filename = sys.argv[1]  # index file path, taken from the first CLI argument
Queryfile = "query.text"  # query file path (hard-coded)
'''creating object for cranefile and collection file and inverted index class,postings class'''
cf = CranFile(input_filename)
collectionfile = Collection()
indexobj = InvertedIndex()
'iterating over cran file for document id'
# Register every document of the Cran file under its docID.
for i, doc in enumerate(cf.docs):
    collectionfile.docs.update({doc.docID: doc})
# NOTE(review): this runs after the loop, so it uses the last document seen
# and raises NameError when cf.docs is empty - confirm this is intended.
postingobj = Posting(doc.docID)
'''reading index file which is stored while creating index'''
# NOTE(review): the loaded JSON is assigned to the InvertedIndex *class*
# attribute, not to the indexobj instance created above - confirm.
with open(ouput_filename, "r") as invertedindex:
    InvertedIndex.items = json.load(invertedindex)
'formatting the query id in qrel.text and finding common query id in qrery.text'
# Map each key yielded by iterating qrys to its 1-based iteration position.
qidlist = {}
qrys = loadCranQry(Queryfile)
for position, q in enumerate(qrys):
    qidlist[q] = position + 1
'Below Variables are used for batch_eval.py file'
# Result accumulators; both start empty here.
BoolenQueryResultDic = []
VectorResult = []
예제 #19
0
import cran
import query
from cranqry import loadCranQry
from index import InvertedIndex, test
from query import QueryProcessor

# Script body: runs the index module's test routine, then the query module's
# test routine, then one vector query.
print("***************Test Cases Running for Index File****************")
invertedobj = InvertedIndex()
test(invertedobj)

print("***************Test Cases Running for Query File****************")
# Load the documents.
inputdocument = cran.CranFile("cran.all")
# Load the index file saved in part 1.
index = InvertedIndex().load("index_file")
# Load the queries.
queries = loadCranQry("query.text")

# NOTE(review): the meaning of the fourth argument (29) is defined by
# QueryProcessor, which is not visible here - presumably a query number.
qp = QueryProcessor(queries, index, inputdocument, 29)
query.test(qp)

# A fresh processor is built before issuing the vector query with argument 3.
qp = QueryProcessor(queries, index, inputdocument, 29)
qp.vectorQuery(3)
    def _buildLibsvmRows(self, invertedIndex, valueOf):
        """Group ``termKey, ':', value`` triples by document id.

        ``valueOf(entry, posting)`` supplies the value stored for one posting.
        Returns a dict mapping docID to a flat list of those triples.
        """
        rows = {}
        for term in invertedIndex.items.keys():
            entry = invertedIndex.items.get(term)
            # The lookup depends only on the term, so it is done once per
            # term instead of once per posting.
            termKey = self.getKeysByValue(self.termIdLookup, term)
            for postingKey in entry.posting.keys():
                posting = entry.posting.get(postingKey)
                rows.setdefault(posting.docID, []).extend(
                    (termKey, ':', valueOf(entry, posting)))
        return rows

    def _writeLibsvmFile(self, path, rows, newsgroup, echo=False):
        """Write one ``<classid> <features>`` line per document to *path*.

        The file is opened once in write mode, which replaces any previous
        content. Documents found in none of the six class lists are skipped
        with a message rather than being written with a stale class id.
        """
        classMembers = (
            (1, newsgroup.class1items1),
            (2, newsgroup.class1items2),
            (3, newsgroup.class1items3),
            (4, newsgroup.class1items4),
            (5, newsgroup.class1items5),
            (6, newsgroup.class1items6),
        )
        with open(path, "w") as out:
            for docId, cells in rows.items():
                classid = None
                for candidate, members in classMembers:
                    if docId in members:
                        # No break: the last matching list wins, exactly as
                        # in the original chain of independent "if"s.
                        classid = candidate
                if classid is None:
                    print("Skipping document " + str(docId) +
                          ": not found in any class list")
                    continue
                # Render the list via str(), drop the commas and the
                # surrounding brackets, then glue "id ':' value" into
                # "id:value".
                line = str(classid) + " " + str(''.join(
                    str(cells).split(",")))[1:-1] + "\n"
                line = line.replace(" ':' ", ":")
                if echo:
                    print(line)
                out.write(line)

    def extractfeature(self, directoryOfNewsgroup, featureDefinitionFile,
                       classDefinitionFile, trainingDataFile):
        """Index a newsgroup directory and write feature/class/training files.

        Builds an index with ``InvertedIndex.indexingCranfield``, then writes:

        * ``featureDefinitionFile``: one ``<id> <term>`` line per indexed
          term (ids start at 1 and are also stored in ``self.termIdLookup``);
        * ``classDefinitionFile``: the fixed class-to-newsgroup table;
        * ``training_data_file.TF`` / ``.IDF`` / ``.TFIDF``: per-document
          feature lines weighted by term frequency, idf, and their product.

        Args:
            directoryOfNewsgroup: directory passed to the indexer and to
                ``self.getNewsGroupFile``.
            featureDefinitionFile: output path of the feature definitions.
            classDefinitionFile: output path of the class definitions.
            trainingDataFile: accepted for interface compatibility only; the
                three training files use the fixed names listed above.
        """
        iindexObject = InvertedIndex()
        invertedIndex = iindexObject.indexingCranfield(directoryOfNewsgroup)

        with open(featureDefinitionFile, "w") as f:
            for counter, x in enumerate(invertedIndex.items.keys(), 1):
                f.write(str(counter) + " " + x + "\n")
                self.termIdLookup[x] = counter

        # As per the project requirement the class table is hard-coded here.
        classDefinitiontuple = ("1 comp.graphics", "1 comp.os.ms-windows.misc",
                                "1 comp.sys.ibm.pc.hardware",
                                "1 comp.sys.mac.hardware", "1 comp.windows.x",
                                "2 rec.autos", "2 rec.motorcycles",
                                "2 rec.sport.baseball", "2 rec.sport.hockey",
                                "3 sci.crypt", "3 sci.electronics",
                                "3 sci.med", "3 sci.space", "4 misc.forsale",
                                "5 talk.politics.misc", "5 talk.politics.guns",
                                "5 talk.politics.mideast",
                                "6 talk.religion.misc", "6 alt.atheism",
                                "6 soc.religion.christian")

        with open(classDefinitionFile, "w") as classfile:
            for x in classDefinitiontuple:
                classfile.write(x + "\n")

        print('tf start')
        newsgroup = self.getNewsGroupFile(directoryOfNewsgroup)
        libsvmtf = self._buildLibsvmRows(
            invertedIndex,
            lambda entry, posting: round(posting.termfreq, 5))
        # Only the TF pass echoes each line, as the original did.
        self._writeLibsvmFile("training_data_file.TF", libsvmtf, newsgroup,
                              echo=True)
        print('tf complete')

        print('idf start')
        newsgroup = self.getNewsGroupFile(directoryOfNewsgroup)
        libsvmidf = self._buildLibsvmRows(
            invertedIndex, lambda entry, posting: entry.idf)
        self._writeLibsvmFile("training_data_file.IDF", libsvmidf, newsgroup)
        print('idf complete')

        print('TF-idf start')
        newsgroup = self.getNewsGroupFile(directoryOfNewsgroup)
        libsvmtfidf = self._buildLibsvmRows(
            invertedIndex,
            lambda entry, posting: posting.termfreq * entry.idf)
        self._writeLibsvmFile("training_data_file.TFIDF", libsvmtfidf,
                              newsgroup)
        print('TF-idf complete')
                    # print(v)
                    # print(doc.class_name)
                    if class_name in v:
                        class_label = k
                train_dict.update({(docid + class_name): [class_label, {term_id: term_val}]})

    # write to file
    with open(training_file_tfidf, "w") as train_obj:
        for doc, val in train_dict.items():
            x = ''
            classid = str(train_dict[doc][0])
            for i in val[1:]:
                for k, v in i.items():
                    x = x + " " + str(k) + ":" + str(v)
            tfidfdata = classid + "\t" + x + "\n"
            train_obj.write(tfidfdata)
    print("training data file.tfidf generated successfully")

if __name__ == '__main__':
    # Sample-data variant, kept for reference:
    #   class_defn_file("class_sample_file")
    #   index_obj = InvertedIndex()
    #   iindex = index_obj.indexingCranfield("sample_newsgroup")
    #   feature_defn_file(iindex,"feature_sample_file")
    #   training_file_idf("feature_sample_file", "class_sample_file", "training_sample_file.tf", "training_sample_file.idf", "training_sample_file.tfidf", iindex)

    # Output file names shared by the steps below.
    class_file = "class_definition_file"
    feature_file = "feature_definition_file"

    # 1) class definitions, 2) index of the corpus, 3) feature definitions,
    # 4) the three training data files.
    class_defn_file(class_file)
    iindex = InvertedIndex().indexingCranfield("mini_newsgroups")
    feature_defn_file(iindex, feature_file)
    training_file(
        feature_file,
        class_file,
        "training_data_file.tf",
        "training_data_file.idf",
        "training_data_file.tfidf",
        iindex,
    )
예제 #22
0
def main(ref, k):
    """Create an ``InvertedIndex`` from ``ref`` and ``k`` and build it.

    ``prepare_disk()`` is called first, then ``build()``.
    """
    inverted = InvertedIndex(ref, k)
    for step in (inverted.prepare_disk, inverted.build):
        step()
예제 #23
0
def test_InvertedIndex():
    """Check item and itemset support on a six-transaction dataset.

    Support is the fraction of the six transactions containing every item of
    the queried set. All single items are covered, including "g", which the
    earlier hand-written assertion list skipped.
    """
    data = ("a,b,c,d,e,f\n"
            "g,h,i,j,k,l\n"
            "z,x\n"
            "z,x\n"
            "z,x,y\n"
            "z,x,y,i\n")
    index = InvertedIndex()
    index.load(data)

    # Number of transactions (out of six) in which each single item occurs.
    occurrences = {
        "a": 1, "b": 1, "c": 1, "d": 1, "e": 1, "f": 1,
        "g": 1, "h": 1, "i": 2, "j": 1, "k": 1, "l": 1,
        "z": 4, "x": 4, "y": 2,
    }
    for name, count in occurrences.items():
        assert index.support({Item(name)}) == count / 6, name

    # Itemsets: each added item can only keep or shrink the support.
    sup_zx = index.support({Item("z"), Item("x")})
    assert (sup_zx == 4 / 6)

    sup_zxy = index.support({Item("z"), Item("x"), Item("y")})
    assert (sup_zxy == 2 / 6)

    sup_zxyi = index.support({Item("z"), Item("x"), Item("y"), Item("i")})
    assert (sup_zxyi == 1 / 6)
예제 #24
0
# -*- coding: utf-8 -*-
import sys
import argparse
import os.path
from index import InvertedIndex
from ui import Fen
from PyQt5.QtWidgets import QApplication

# Command line: two positional arguments, the corpus path and the stopword
# file path.
parser = argparse.ArgumentParser()
for flag, description in (
        ("d", "Le chemin de la corpus"),
        ("s", "Le chemin du fichier contenant les stopwords"),
):
    parser.add_argument(flag, type=str, help=description)
args = parser.parse_args()

if not os.path.isfile(args.d) or not os.path.isfile(args.s):
    # At least one of the two paths is not an existing file.
    print("ERROR: le(s) document(s) que vous avez indiqué n'existe pas")
else:
    # Build the index, then hand it to the Qt window and run the event loop.
    print("Indexation de corpus: \n  Wait ...!!")
    index = InvertedIndex(args.d, args.s)
    monApp = QApplication(sys.argv)
    fenetre = Fen(index)
    print("Fin d'indexation: vous pouvez commencer la recherche")
    print("----------------------------------------------------------")
    sys.exit(monApp.exec_())
예제 #25
0
def eval():
    """Compare boolean and vector retrieval by NDCG over ``n`` random queries.

    Reads the module-level names ``query_path``, ``index_file``,
    ``qrels_path`` and ``n``, which are defined outside this function.

    Algorithm:
        1. Pick ``n`` random queries from the query collection.
        2. Take the top 10 boolean and top 10 vector results for each.
        3. Compute the NDCG of both result lists against the qrels.
        4. Print both averages and the Wilcoxon / t-test p-values.
    """
    # Get the query collection
    qc = loadCranQry(query_path)
    poss_queries = list(qc)

    # Load up the inverted index
    ii = InvertedIndex()
    ii.load(index_file)

    # Load up the document collection
    cf = CranFile("cran.all")

    # Index the ground truth: query number -> relevant document ids, in
    # file order.
    qrel_dict = {}
    with open(qrels_path) as f:
        for qrel in f:
            qrel_split = qrel.split()
            qrel_dict.setdefault(int(qrel_split[0]),
                                 []).append(int(qrel_split[1]))

    # Run over N random queries, collecting NDCGs
    bool_ndcgs = []
    vector_ndcgs = []
    for _ in range(n):
        # Get random query ID
        query_id = choice(poss_queries)

        # Ids 1..99 are zero-padded to three characters; other ids are used
        # as they come.
        if 0 < int(query_id) < 100:
            query_id = str(int(query_id)).zfill(3)
        try:
            query = qc[query_id].text
        except KeyError:
            print("Invalid query id", query_id)
            return

        # Initialize the query processor
        qp = QueryProcessor(query, ii, cf)

        bool_result = qp.booleanQuery()[:10]
        vector_result = qp.vectorQuery(10)

        # qrels are keyed by the 1-based position of the query in the
        # collection; only the first 10 judgments are used.
        gt_results = qrel_dict[poss_queries.index(query_id) + 1][:10]

        # Boolean results carry no weights, so every hit gets an even 1.
        truth_vector = [doc in gt_results for doc in bool_result]
        bool_ndcg = ndcg_score(truth_vector, [1] * len(truth_vector),
                               k=len(truth_vector))

        # Vector results are (doc, score) pairs.
        vector_docs = [v[0] for v in vector_result]
        vector_scores = [v[1] for v in vector_result]
        truth_vector = [doc in gt_results for doc in vector_docs]
        vector_ndcg = ndcg_score(truth_vector,
                                 vector_scores,
                                 k=len(truth_vector))

        # Accumulate NDCGs
        bool_ndcgs.append(bool_ndcg)
        vector_ndcgs.append(vector_ndcg)

    # With n <= 0 nothing was evaluated; report that instead of dividing
    # by zero.
    if not bool_ndcgs:
        print("No queries were evaluated")
        return

    bool_avg = sum(bool_ndcgs) / len(bool_ndcgs)
    vector_avg = sum(vector_ndcgs) / len(vector_ndcgs)

    # Present averages and p-values
    print("Boolean NDCG average:", bool_avg)
    print("Vector NDCG average:", vector_avg)
    if n > 19:
        print("Wilcoxon p-value:", wilcoxon(bool_ndcgs, vector_ndcgs).pvalue)
    else:
        print("Wilcoxon p-value: Sample size too small to be significant")
    print("T-Test p-value:", ttest_ind(bool_ndcgs, vector_ndcgs).pvalue)
예제 #26
0
def test(index_loc, cran_loc, qrels_loc):
    """Run printed sanity checks for the index, boolean and vector queries.

    Each check prints a description followed by True/False; nothing is
    asserted or returned.

    Args:
        index_loc: path handed to ``InvertedIndex.load``.
        cran_loc: path of the document collection for ``CranFile``.
        qrels_loc: path of the qrels file (whitespace-separated columns,
            query number first, document id second).
    """

    ##### SETUP ITEMS #####

    # Grab index file to restore II
    ii = InvertedIndex()
    ii.load(index_loc)

    # Get the document collection
    cf = CranFile(cran_loc)

    # Index the ground truth: query number -> relevant document ids.
    qrel_dict = {}
    with open(qrels_loc) as f:
        for qrel in f:
            qrel_split = qrel.split()
            qrel_dict.setdefault(int(qrel_split[0]),
                                 []).append(int(qrel_split[1]))

    ##### INITIAL TEST ITEMS #####
    print("TESTS BASED ON SUGGESTED TESTING POINTS")

    # Ensure tf is correct
    #   The term frequency of every posting must equal the number of
    #   recorded positions.
    posting_list = ii.find("experiment").posting
    tf_vector = [
        len(posting_list[posting].positions) ==
        posting_list[posting].term_freq() for posting in posting_list
    ]
    print("TF is computed correctly:", all(tf_vector))

    # Ensure idf is correct
    print("IDF is computed correctly:",
          log10(ii.nDocs / len(posting_list)) == ii.idf("experiment"))

    # As both tf and idf are correct, and tf-idf is a product of the two,
    #   it is reasonable to assume tf-idf is computed correctly

    ##### BOOL QUERY TESTS #####

    # Here, I use very specific boolean queries to ensure that a
    #   limited number of documents are returned
    print("\nBOOL QUERY TESTS")

    # Ensure that the exact title of doc 8 matches for doc 8
    doc8 = "measurements of the effect of two-dimensional and three-dimensional roughness elements on boundary layer transition"
    qp1 = QueryProcessor(doc8, ii, cf)
    print("Bool query matches on exact title:", qp1.booleanQuery() == [8])

    # Ensure that bool query matches very specific AND query
    qp2 = QueryProcessor("hugoniot and infinitesimally", ii, cf)
    print(
        "Bool query matches on specific AND query ('hugoniot and infinitesimally'):",
        qp2.booleanQuery() == [329])

    # Test that an OR query is handled properly
    #   Both gravel and stagnation have completely distinct postings lists.
    #   OR should merge them.
    gravel_postings = ii.find("gravel").sorted_postings[:]
    stag_postings = ii.find("stagnat").sorted_postings[:]
    gravel_postings.extend(stag_postings)
    qp3 = QueryProcessor("gravel or stagnation", ii, cf)
    print("Bool query successfully handles OR ('gravel or stagnation'):",
          qp3.booleanQuery() == sorted(gravel_postings))

    # Test that NOT is handled properly
    #   The posting list for "diameter" is a subset of "slipstream" postings
    #   (oddly enough). To test this works, do "slipstream and not diameter"
    #   and we should get slipstream's postings minus those of diameter.
    slip_postings = ii.find("slipstream").sorted_postings[:]
    diam_postings = ii.find("diamet").sorted_postings[:]
    slip_not_diam = [t for t in slip_postings if t not in diam_postings]
    print("Bool query successfully handles NOT ('slipstream and not diameter'):",
        QueryProcessor("slipstream and not diameter", ii, cf).booleanQuery() \
          == slip_not_diam)

    # Ensure AND/OR order doesn't matter
    print("Bool query can handle query regardless of AND order ('a and b' = 'b and a'):",
        QueryProcessor("slipstream and diameter", ii, cf).booleanQuery() \
          == QueryProcessor("diameter and slipstream", ii, cf).booleanQuery())
    print("Bool query can handle query regardless of OR order ('a or b' = 'b or a'):",
        QueryProcessor("slipstream or diameter", ii, cf).booleanQuery() \
          == QueryProcessor("diameter or slipstream", ii, cf).booleanQuery())

    # Ensure that the presence of parens does not change query results
    print("Bool query can handle query regardless of parens ('slipstream and diameter'):",
        QueryProcessor("slipstream and diameter", ii, cf).booleanQuery() \
          == QueryProcessor("(slipstream and diameter)", ii, cf).booleanQuery())

    # Ensure parentheses do not change order of processing for AND-AND and OR-OR queries
    print("Bool query AND is accociative ('(a and b) and c' = 'a and (b and c)'):",
        QueryProcessor("(slipstream and diameter) and thrust", ii, cf).booleanQuery() \
          == QueryProcessor("slipstream and (diameter and thrust)", ii, cf).booleanQuery())
    print("Bool query OR is accociative ('(a or b) or c' = 'a or (b or c)'):",
        QueryProcessor("(slipstream or diameter) or thrust", ii, cf).booleanQuery() \
          == QueryProcessor("slipstream or (diameter or thrust)", ii, cf).booleanQuery())

    # Ensure parentheses properly group items
    #   Tested by doing the query "manually" by adding/orring the correct terms
    part_one = QueryProcessor("conduction and cylinder and gas", ii,
                              cf).booleanQuery()
    part_two = QueryProcessor("radiation and gas", ii, cf).booleanQuery()
    part_one.extend(part_two)
    expected_result = QueryProcessor("hugoniot", ii, cf).booleanQuery()
    expected_result.extend(part_one)
    print("Bool query parens successfully group conflicting operators:",
        QueryProcessor("(conduction and cylinder and gas) or (radiation and gas) or hugoniot", ii, cf).booleanQuery() \
          == sorted(list(set(expected_result))))

    ##### VECTOR QUERY TESTS #####

    # For this, just ensure that most of the results are in the expected list
    print("\nVECTOR QUERY TESTS")

    # Ensure vector query can match on exact title
    print("Vector query matches on exact title:",
          qp1.vectorQuery(1)[0][0] == 8)

    # Try a few example queries from query.text
    #   As long as one-fifth of the top 10 are in gt_result, call it a pass.
    # Note that queries with larger answer sets were chosen to
    #   ensure there were enough to get to one-fifth of ten
    qc = loadCranQry("query.text")
    poss_queries = list(qc)

    top_n = 10
    # One-fifth of the ten retrieved documents, i.e. at least 2 hits. The
    # earlier "> 2" check silently demanded three.
    min_hits = top_n // 5
    for qid in ("001", "128", "226", "196", "291"):
        result = QueryProcessor(qc[qid].text, ii, cf).vectorQuery(top_n)
        # qrels are keyed by the 1-based position of the query in the file.
        gt_result = qrel_dict[poss_queries.index(qid) + 1]
        hits = sum(1 for r in result if r[0] in gt_result)
        print("Vector query is at least one-fifth correct for query " + qid +
              ":", hits >= min_hits)
예제 #27
0
def eval(indexfilename, queryfilename, queryrefilename, numberofrandomqueries):
    """Compare boolean and vector retrieval on a random sample of queries.

    For each sampled query the top-10 results of both models (obtained via
    ``query``) are turned into 0/1 relevance lists using the judgments read
    from ``queryrefilename``, scored with ``ndcg_score``, and the two score
    lists are finally compared with a Wilcoxon signed-rank test whose result
    is printed.

    Note: this function shadows the ``eval`` builtin; the name is kept because
    callers depend on it.

    Args:
        indexfilename: index path forwarded to ``query``.
        queryfilename: query file path forwarded to ``query``.
        queryrefilename: path of the relevance-judgment file; each line is
            split on single spaces, field 0 is a query number and field 1 a
            document id.
        numberofrandomqueries: how many queries to sample (at most 225).

    Raises:
        Exception: if ``numberofrandomqueries`` is greater than 225.
    """
    if numberofrandomqueries > 225:
        raise Exception('please enter query count less than or equal to 225')

    # NOTE(review): the query collection is read from a hard-coded path, not
    # from ``queryfilename`` -- confirm whether that is intended.
    qrys = loadCranQry("query.text")
    validqueries = [int(q) for q in qrys]

    # NOTE(review): the index and the collection are loaded but never used
    # below; the calls are kept in case loading has side effects -- confirm
    # and drop them if not.
    InvertedIndex().load("index_file.pickle")
    CranFile('cran.all')

    # Group judged document ids by zero-padded query number. The context
    # manager guarantees the judgment file is closed again.
    queryRelevence = {}
    with open(queryrefilename) as qrels_file:
        for line in qrels_file:
            fields = line.split(" ")
            query_key = '%03d' % int(fields[0])
            queryRelevence.setdefault(query_key, []).append(fields[1])

    # Re-key the judgments: the n-th judged query (in file order) is given the
    # id of the n-th query in the loaded query collection.
    queryRelevenceUpdated = {}
    for position, judged_docs in enumerate(queryRelevence.values()):
        queryRelevenceUpdated['%03d' % validqueries[position]] = judged_docs

    sampled_keys = [
        '%03d' % query_number
        for query_number in random.sample(validqueries, numberofrandomqueries)
    ]
    booleanndcg = []
    vectorndcg = []

    for query_key in sampled_keys:
        print('query for which ndcg is calculated ' + query_key)
        relevant_docs = queryRelevenceUpdated[query_key]

        # --- vector model ---------------------------------------------------
        vectorresult = query(indexfilename, '1', queryfilename, query_key, 10)
        # Build a fresh 0/1 list instead of overwriting the list that
        # ``query`` returned.
        vector_relevance = [
            1 if doc in relevant_docs else 0 for doc in vectorresult
        ]
        ideal_vector = sorted(vector_relevance, reverse=True)
        if sum(ideal_vector) == 0:
            ndcgscore = 0
        else:
            ndcgscore = ndcg_score(ideal_vector, vector_relevance)
        vectorndcg.append(ndcgscore)

        # --- boolean model --------------------------------------------------
        booleanqueryresult = query(indexfilename, '0', queryfilename,
                                   query_key, 10)
        boolean_relevance = [
            1 if doc in relevant_docs else 0 for doc in booleanqueryresult
        ]
        # Pad short result lists with non-relevant entries up to 10 slots.
        boolean_relevance.extend([0] * (10 - len(boolean_relevance)))

        # Ideal list: one 1 per judged-relevant document, capped at 10 slots.
        relevant_slots = min(10, len(relevant_docs))
        ideal_boolean = [1] * relevant_slots + [0] * (10 - relevant_slots)

        if sum(boolean_relevance) == 0:
            ndcgscoreboolean = 0
        else:
            # NOTE(review): the argument order here (actual, ideal) is the
            # reverse of the vector call above (ideal, actual); kept as in the
            # original -- confirm against ndcg_score's signature.
            ndcgscoreboolean = ndcg_score(boolean_relevance, ideal_boolean)
        booleanndcg.append(ndcgscoreboolean)

    print('P value for all the queries processed is:')
    print(
        scipy.stats.wilcoxon(vectorndcg,
                             booleanndcg,
                             zero_method='wilcox',
                             correction=False))
    print('Done')
    def vectorQuery(self, k):
        """Rank documents against the query by cosine similarity of TF-IDF vectors.

        The method works on names defined outside of it (``postingobj``,
        ``indexobj``, ``collectionfile``, ``Queryterm``, ``VectorResult`` and
        ``qid``); it does not read any attribute of ``self``.

        Side effects: writes ``TF.json`` and ``IDF.json`` through
        ``indexobj.save`` and appends ``{qid: full_ranking}`` to
        ``VectorResult``.

        Args:
            k: requested number of results. It is handed back to the caller
                unchanged; the returned list is capped at 10 entries.

        Returns:
            A tuple ``(top, k)`` where ``top`` holds up to ten
            ``(docID, similarity)`` pairs in descending similarity order.
        """
        # Compute TF/IDF for the query terms and persist them.
        termfrequency, idf_values = postingobj.term_freq(collectionfile,
                                                         Queryterm)
        indexobj.save(termfrequency, "TF.json")
        indexobj.save(idf_values, "IDF.json")
        # Read the tables back from disk; the context managers make sure both
        # handles are closed.
        # NOTE(review): the JSON round trip presumably normalises the keys to
        # strings -- confirm before replacing it with the in-memory objects.
        with open("TF.json") as tf_file:
            TF = json.load(tf_file)
        with open("IDF.json") as idf_file:
            IDF = json.load(idf_file)

        QueryDict = {}
        Querytf = {}
        Querytfidf = {}
        tempdic = {}
        DocSim = []

        # Collect the postings list and the raw in-query count of every term.
        for term in Queryterm:
            QueryDict[term] = InvertedIndex.getPostingsList(term)
            Querytf[term] = Querytf.get(term, 0) + 1

        for qterm, postings in QueryDict.items():
            if qterm not in IDF:
                # Terms without an IDF entry contribute to neither vector.
                continue
            term_idf = IDF[qterm]
            tf = Querytf[qterm]
            for pos in postings:
                # Query weight: in-query count times (1 + idf).
                Querytfidf.setdefault(qterm, {})[pos] = tf * (1 + term_idf)
                # Document weight: stored tf times idf, only for the TF entry
                # whose key equals this posting.
                for TFdoc, TFvalue in TF[qterm].items():
                    if TFdoc == pos:
                        tempdic.setdefault(qterm, {})[TFdoc] = (
                            TFvalue * term_idf)

        # One row per posting/document, one column per term.
        Querymatrix = pd.DataFrame(Querytfidf)
        DocTFIDFmatrix = pd.DataFrame(data=tempdic)

        # Rows are paired positionally; a pair is scored only when both frames
        # carry the same label at that position.
        for Qpos, Dpos in zip(list(Querymatrix.index),
                              list(DocTFIDFmatrix.index)):
            if Qpos != Dpos:
                continue
            Q = np.array(Querymatrix.loc[Qpos])
            Q[np.isnan(Q)] = 0  # a missing term counts as weight 0
            D = np.array(DocTFIDFmatrix.loc[Dpos])
            D[np.isnan(D)] = 0
            cosine = QueryProcessor.cosine_similaritys(Q, D)
            DocSim.append((int(Qpos), cosine))

        VectorID = sorted(DocSim, key=lambda x: x[1], reverse=True)
        # Take the best ten from the *ranked* list. Slicing the unsorted list
        # first (as before) returned the first ten documents scored, not the
        # ten most similar ones.
        TopID = VectorID[:10]
        VectorResult.append({qid: VectorID})
        return TopID, k
예제 #29
0
def test_apriori():
    """Check ``apriori`` and ``generate_rules`` on a six-transaction fixture.

    The frequent itemsets mined at a minimum support of 2/6 must match the
    expected itemsets and supports, and the rules generated from them (with
    both thresholds passed as 0) must match the expected rule table.
    """
    data = ("a,b,c,d,e,f\n"
            "g,h,i,j,k,l\n"
            "z,x\n"
            "z,x\n"
            "z,x,y\n"
            "z,x,y,i\n")

    # Expected support of every frequent itemset.
    in_two = 2 / 6
    in_four = 4 / 6
    expectedItemSets = {
        ItemSet("i"): in_two,
        ItemSet("z"): in_four,
        ItemSet("x"): in_four,
        ItemSet("y"): in_two,
        ItemSet("xz"): in_four,
        ItemSet("yz"): in_two,
        ItemSet("xy"): in_two,
        ItemSet("xyz"): in_two,
    }

    index = InvertedIndex()
    index.load(data)
    itemsets = apriori(index, 2 / 6)

    assert len(itemsets) == len(expectedItemSets)
    # First make sure every mined itemset is expected at all ...
    for mined in itemsets:
        assert frozenset(mined) in expectedItemSets
    # ... then compare the supports.
    for mined in itemsets:
        assert expectedItemSets[frozenset(mined)] == index.support(mined)

    multi_item_sets = [s for s in itemsets if len(s) > 1]
    print("Itemsets={}".format(multi_item_sets))

    # (antecedent, consequent, confidence, lift, support)
    rx = [
        (['y'], ['x'], 1.0, 1.5, 0.3333333333333333),
        (['x'], ['y'], 0.5, 1.5, 0.3333333333333333),
        (['y'], ['z'], 1.0, 1.5, 0.3333333333333333),
        (['z'], ['y'], 0.5, 1.5, 0.3333333333333333),
        (['x'], ['z'], 1.0, 1.5, 0.6666666666666666),
        (['z'], ['x'], 1.0, 1.5, 0.6666666666666666),
        (['x', 'y'], ['z'], 1.0, 1.5, 0.3333333333333333),
        (['z', 'y'], ['x'], 1.0, 1.5, 0.3333333333333333),
        (['z', 'x'], ['y'], 0.5, 1.5, 0.3333333333333333),
        (['y'], ['z', 'x'], 1.0, 1.5, 0.3333333333333333),
        (['x'], ['z', 'y'], 0.5, 1.5, 0.3333333333333333),
        (['z'], ['x', 'y'], 0.5, 1.5, 0.3333333333333333),
    ]

    # Translate the item strings of the table with item_id so the rows can be
    # compared against the generated rules.
    expectedRules = [
        ([item_id(s) for s in row[0]],
         [item_id(s) for s in row[1]],
         row[2], row[3], row[4])
        for row in rx
    ]

    itemset_counts = {tuple(s): index.count(s) for s in itemsets}
    rules = generate_rules(itemsets, itemset_counts, index.num_transactions,
                           0, 0)

    # Readable copy of the rules (item_str applied to both sides) for the log.
    p = [
        ([item_str(i) for i in rule[0]],
         [item_str(i) for i in rule[1]],
         rule[2], rule[3], rule[4])
        for rule in rules
    ]
    print("rules")
    print(p)

    for antecedent, consequent, confidence, lift, support in rules:
        line = "{}, {} conf={:.4f}, {:.4f}, {:.4f}".format(
            antecedent, consequent, confidence, lift, support)
        print(line)

    assert len(rules) == len(expectedRules)
    for position, _ in enumerate(rules):
        assert expectedRules[position] in rules
예제 #30
0
def _vector_ndcg(ranking, relevant_docs, top_k):
     """Return the NDCG of one ranked result list (0 when nothing is relevant).

     Args:
         ranking: sequence of ``(docID, score)`` pairs.
         relevant_docs: container of the document ids judged relevant.
         top_k: cut-off passed to ``metrics.ndcg_score``.
     """
     if len(ranking) < 1:
         return 0
     doc_ids = [pair[0] for pair in ranking]
     scores = [pair[1] for pair in ranking]
     print("DocumentIDs of Vector Model Result:: ", doc_ids)
     print("Scores of Vector Model Result::", scores)
     # Label every position on its own; looking positions up with
     # list.index() would mislabel repeated document ids.
     relevance = [1 if doc_id in relevant_docs else 0 for doc_id in doc_ids]
     if len(relevance) < top_k:
         # NOTE(review): short lists are padded to 10 entries (not top_k) and
         # the score list is not padded; kept as in the original -- confirm
         # this is what metrics.ndcg_score expects.
         relevance.extend([0] * (10 - len(relevance)))
     print("Actual Vector:: ", relevance)
     print("Predicted Vector:: ", scores)
     if sum(relevance) == 0:
         return 0
     ndcg = metrics.ndcg_score(relevance, scores, top_k)
     print("Calculated ndcg for Vector::", ndcg)
     return ndcg


def VectorCompare():
     """Compare two variants of the vector model on five fixed queries.

     For each query both ``qp.vectorQuery(5)`` and ``qp.vectorQuery(5, True)``
     are scored with NDCG against the judgments from ``qrels.text``; the
     per-query scores, their averages and the p-value of a Wilcoxon
     signed-rank test between the two score lists are printed.
     """
     top_k = 5
     query_ids = [4, 29, 53, 58, 100]

     queries = loadCranQry("query.text")
     queries_id_list = [str(int(x)) for x in queries.keys()]
     inputdocument = cran.CranFile("cran.all")
     # load the index file saved at from part 1
     index = InvertedIndex().load("index_file")
     qp = QueryProcessor(queries, index, inputdocument, 10)
     qrels_dict = process_querls_file("qrels.text", queries_id_list)

     vectorNADC1 = []
     vectorNADC2 = []
     for q_id in query_ids:
         qp.querynumber = q_id
         vector_top3 = qp.vectorQuery(top_k)
         vector2_top3 = qp.vectorQuery(top_k, True)
         print("Output for Vector Model Result::", vector_top3)
         # A query without judgments is treated as having no relevant
         # documents instead of failing on a None membership test.
         relevant_docs = qrels_dict.get(str(q_id)) or ()
         vectorNADC1.append(_vector_ndcg(vector_top3, relevant_docs, top_k))
         vectorNADC2.append(_vector_ndcg(vector2_top3, relevant_docs, top_k))

     print("Calculated NADC sum for all queries", vectorNADC1)
     avergae_vectorNADC = float(sum(vectorNADC1) / len(query_ids))
     print("Calculated NADC sum for all queries", vectorNADC2)
     avergae_vectorNADC2 = float(sum(vectorNADC2) / len(query_ids))
     print("Avergae NADC Vector::", avergae_vectorNADC)
     # NOTE(review): this label says "boolean" but the value belongs to the
     # second vector variant; the output text is left unchanged.
     print("Avergae NADC boolean::", avergae_vectorNADC2)
     print(vectorNADC1)
     print(vectorNADC2)
     p_value = scipy.stats.wilcoxon(vectorNADC1, vectorNADC2, zero_method='wilcox', correction=False)
     p = "%.20f" % p_value[1]
     print('P value for all the queries processed is:', p)