def vectorQuery(self, k):
    """Vector query processing, using cosine similarity.

    Returns the top k pairs of (docID, similarity), ranked by their cosine
    similarity with the query in descending order. Term frequency or TF-IDF
    can be used to construct the vectors; TF-IDF is used here.
    """
    result = {}
    ivObj = InvertedIndex()
    ivObj.load(self.filename)  # load the inverted index

    # For every term in the query, collect the document IDs where the term
    # appears, and build the query's TF-IDF weight vector. One weight is
    # appended per query token so the query and document vectors stay the
    # same length.
    doc_set = set()
    term_idf_list = []
    for term in self.tokens:
        if term in self.index:
            doc_set = doc_set.union(set(self.index[term].posting.keys()))
        term_idf_list.append(ivObj.idf(term) * 1.0 / len(self.tokens))

    # Calculate TF-IDF weights for the matching documents; terms missing
    # from the index contribute a 0.0 component.
    doc_list = list(doc_set)
    for docID in doc_list:
        for term in self.tokens:
            if term in self.index:
                if docID in result:
                    result[docID].append(ivObj.tfidf(term, docID))
                else:
                    result[docID] = [ivObj.tfidf(term, docID)]
            else:
                if docID in result:
                    result[docID].append(0.0)
                else:
                    result[docID] = [0.0]

    # Normalize the query vector once, then score each document by the dot
    # product of unit vectors, which is exactly the cosine similarity.
    score_dict = {}
    term_idf_list_np = np.array(self.unitVector(term_idf_list))
    for docID in doc_list:
        unit_np = np.array(self.unitVector(result[docID]))
        score_dict[docID] = np.dot(term_idf_list_np, unit_np)

    final = sorted(score_dict.items(), key=itemgetter(1), reverse=True)
    # Slice rather than loop so k larger than the result set cannot raise
    # an IndexError.
    return final[:min(k, len(final))]  # list of (docID, cosine similarity), ranked
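# The scoring above reduces to a dot product of L2-normalized vectors. Below is
# a minimal, self-contained sketch of that identity (illustrative values only;
# unit_vector here is a local stand-in for the class's unitVector helper):
def _demo_unit_vector_cosine():
    import numpy as np

    def unit_vector(v):
        n = np.linalg.norm(v)
        return np.zeros(len(v)) if n == 0 else np.array(v) / n

    q = [0.30, 0.00, 0.48]  # query tf-idf weights (made up for the demo)
    d = [0.60, 0.30, 0.00]  # document tf-idf weights (made up for the demo)
    # dot(q_hat, d_hat) == cos(q, d) == dot(q, d) / (||q|| * ||d||)
    cos = np.dot(unit_vector(q), unit_vector(d))
    assert abs(cos - np.dot(q, d) / (np.linalg.norm(q) * np.linalg.norm(d))) < 1e-12
    return cos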
def main():
    #########
    # SETUP #
    #########
    # Get input args
    newsgroups_root_dir = argv[1]
    feat_def_path = argv[2]
    class_def_path = argv[3]
    training_data_path = argv[4]

    # Generate index
    #index_newsgroups(newsgroups_root_dir, "idx_save.pkl")
    ii = InvertedIndex()
    ii.load("idx_save.pkl")

    # Write out feature/term pairs to feat_def_path
    feature_id = 0
    with open(feat_def_path, 'w') as outf:
        for item in ii.items:
            outf.write(str(feature_id) + " " + str(item) + "\n")
            feature_id += 1

    # Read back in the feature/term pairs for later
    with open(feat_def_path, 'r') as inf:
        ft_pairs = inf.readlines()

    # Put the ft_pairs into a dictionary for quick lookup
    ft_dict = {}
    for pair in ft_pairs:
        ft_dict[pair.split()[1].strip()] = pair.split()[0]

    # Map the different newsgroups to a given class.
    # This is fairly manual...
    with open(class_def_path, 'w') as outf:
        for dir in listdir(newsgroups_root_dir):
            outf.write(class_def_helper(dir) + " " + dir + "\n")

    ############################
    # TRAINING DATA GENERATION #
    ############################
    # Create the training data. For each document:
    #   find its containing folder and extract the class from the class def;
    #   for each term in the document, compute tf-idf, tf, or idf.
    current_file_id = 1
    with open(training_data_path + ".TFIDF", 'w') as outf:
        # Compute tf-idf: go through each document in the newsgroups dir
        for root, _, files in walk(newsgroups_root_dir):
            # Find the class label from the containing folder
            local_dir = root.split(sep)[-1]
            # For each file...
            for file in files:
                outf.write(class_def_helper(local_dir) + " ")
                print(root, file)

                # Get the words from the doc
                stemmed_token_list = preprocess_doc(root + sep + file)

                # Put all the feature:value strings into a set (for uniqueness)
                data_set = set()
                for word in stemmed_token_list:
                    # Skip blank stopwords
                    if word == "":
                        continue

                    # Calculate the TF-IDF; current_file_id is our doc_id
                    tf = ii.find(word).posting[current_file_id].term_freq()
                    idf = ii.idf(word)
                    data_set.add(ft_dict[word] + ":" + str(log10(1 + tf) * idf))

                # Write the features sorted by term ID, one document per line
                outf.write(" ".join(
                    sorted(data_set, key=lambda x: int(x.split(':')[0]))) + "\n")
                outf.flush()

                # Increment our current doc
                current_file_id += 1

    current_file_id = 1
    with open(training_data_path + ".TF", 'w') as outf:
        # Compute tf: go through each document in the newsgroups dir
        for root, _, files in walk(newsgroups_root_dir):
            # Find the class label from the containing folder
            local_dir = root.split(sep)[-1]
            # For each file...
            for file in files:
                outf.write(class_def_helper(local_dir) + " ")
                print(root, file)

                # Get the words from the doc
                stemmed_token_list = preprocess_doc(root + sep + file)

                # Put all the feature:value strings into a set (for uniqueness)
                data_set = set()
                for word in stemmed_token_list:
                    # Skip blank stopwords
                    if word == "":
                        continue

                    # Record the TF; current_file_id is our doc_id
                    data_set.add(ft_dict[word] + ":" + str(
                        ii.find(word).posting[current_file_id].term_freq()))

                # Write the features sorted by term ID, one document per line
                outf.write(" ".join(
                    sorted(data_set, key=lambda x: int(x.split(':')[0]))) + "\n")

                # Increment our current doc
                current_file_id += 1

    current_file_id = 1
    with open(training_data_path + ".IDF", 'w') as outf:
        # Compute idf: go through each document in the newsgroups dir
        for root, _, files in walk(newsgroups_root_dir):
            # Find the class label from the containing folder
            local_dir = root.split(sep)[-1]
            # For each file...
            for file in files:
                outf.write(class_def_helper(local_dir) + " ")
                print(root, file)

                # Get the words from the doc
                stemmed_token_list = preprocess_doc(root + sep + file)

                # Put all the feature:value strings into a set (for uniqueness)
                data_set = set()
                for word in stemmed_token_list:
                    # Skip blank stopwords
                    if word == "":
                        continue

                    # Record the IDF for this term
                    data_set.add(ft_dict[word] + ":" + str(ii.idf(word)))

                # Write the features sorted by term ID, one document per line
                outf.write(" ".join(
                    sorted(data_set, key=lambda x: int(x.split(':')[0]))) + "\n")
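# Each line written above follows a libsvm-style "label feat_id:value ..."
# layout, sorted by feature ID. A minimal sketch of that formatting step in
# isolation (label, IDs, and weights are hypothetical, not from a real index):
def _demo_training_line():
    label = "3"
    features = {"17": 0.301, "4": 0.125, "52": 0.977}  # term ID -> weight
    body = " ".join(
        f"{fid}:{w}" for fid, w in sorted(features.items(), key=lambda p: int(p[0])))
    return label + " " + body  # "3 4:0.125 17:0.301 52:0.977"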
class QueryProcessor:
    ##
    # @param self
    # @param query
    # @param index_file
    # @param collection
    # @return None
    # @brief The constructor.
    #        This is extremely expensive because it loads the entire pickled
    #        index into memory. That is fine when executing a single query,
    #        but for the evaluation use loadQuery instead.
    # @exception None documented yet
    ##
    def __init__(self, query, index_file, collection):
        ''' index is the inverted index; collection is the document collection'''
        self.raw_query = query
        self.index = InvertedIndex()
        self.index = self.index.loadData(index_file)
        self.docs = collection
        self.tokenizer = Tokenizer(
            known_words=set(self.index.get_items_inverted().keys()))
        if self.raw_query:
            self.processed_query = self.preprocessing(self.raw_query)

    ##
    # @brief This method loads the next query for evaluation.
    # @param self
    # @param query
    # @return None
    # @exception None
    ##
    def loadQuery(self, query):
        self.raw_query = query
        self.processed_query = self.preprocessing(self.raw_query)

    ##
    # @brief This method applies the indexing preprocessing steps to a raw query.
    # @param self
    # @param raw_query
    # @return list of processed query tokens
    # @exception None
    ##
    def preprocessing(self, raw_query):
        ''' apply the same preprocessing steps used by indexing, and also use
        the provided spelling corrector. Note that the spelling corrector
        should be applied before stopword removal and stemming (why?)'''
        return self.tokenizer.transpose_document_tokenized_stemmed_spelling(
            raw_query)

    ##
    # @brief This method does the boolean query processing.
    # @param self
    # @return results: list[docID]
    # @bug Fixed
    # @exception None
    ##
    def booleanQuery(self):
        ''' boolean query processing; note that a query like "A B C" is
        transformed to "A AND B AND C" for retrieving posting lists and
        merging them '''
        ''' This method would likely be faster due to the use of hashes,
        but I wanted to do what was shown in the slides:

        from functools import reduce
        docs = [set(self.index[w]) for w in self.processed_query]
        docs.sort(key=len)  # notice it is still smart to order by size
        return reduce(set.intersection, docs)
        '''
        if len(self.processed_query) == 0:
            return []

        # Check that all of our query words are in the index; if not, return []
        for w in self.processed_query:
            if not w in self.index.get_items_inverted():
                return []

        # If the query has only one term, return its posting list directly
        if len(self.processed_query) == 1:
            return list(self.index.get_items_inverted()[
                self.processed_query[0]].get_posting_list().keys())

        # document_ids is a list of lists containing only document IDs
        document_ids = [
            list(self.index.get_items_inverted()[w].get_posting_list().keys())
            for w in self.processed_query
        ]

        # Sorting so that we start with the shortest list of documents gives
        # a potential speed-up
        document_ids.sort(key=len)
        results = document_ids[0]

        # Iterate through each query word and intersect the doc IDs from its
        # posting list with all those before it (two-pointer merge). This
        # could be done faster if the index were a set or another hash-based
        # data structure.
        for p in document_ids[1:]:
            intermediate = []
            i, j = 0, 0
            while i < len(results) and j < len(p):
                if int(results[i]) < int(p[j]):
                    i += 1
                elif int(results[i]) > int(p[j]):
                    j += 1
                else:
                    intermediate.append(p[j])
                    j += 1
                    i += 1
            results = intermediate
            # Stop early if the terms so far are totally disjoint
            if len(results) == 0:
                return results

        return results
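    # A hypothetical, self-contained sketch of the two-pointer merge used by
    # booleanQuery above, run on toy posting lists (this helper is not part of
    # the original API; names and values are illustrative):
    @staticmethod
    def _intersect_example():
        a, b = ["1", "3", "5", "9"], ["3", "4", "9"]  # sorted doc-ID lists
        i, j, out = 0, 0, []
        while i < len(a) and j < len(b):
            if int(a[i]) < int(b[j]):
                i += 1  # advance the list with the smaller head
            elif int(a[i]) > int(b[j]):
                j += 1
            else:
                out.append(b[j])  # match: keep the doc ID, advance both
                i += 1
                j += 1
        return out  # ["3", "9"]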
    ##
    # @brief This method computes the cosine similarity of two vectors.
    # @param self
    # @param vec1
    # @param vec2
    # @return score cosine: float
    # @exception None
    ##
    def cosine_similarity(self, vec1, vec2):
        # compute cosine similarity: (vec1*vec2)/(||vec1||*||vec2||)
        AA, AB, BB = 0, 0, 0
        for i in range(len(vec1)):
            x = vec1[i]
            y = vec2[i]
            AA += x * x
            BB += y * y
            AB += x * y
        return round(AB / math.sqrt(AA * BB), 4)

    ##
    # @brief This method computes the vector model.
    # @param self
    # @param k
    # @return list of (docID, score) pairs
    # @bug Fixed
    # @exception ValueError
    ##
    def vectorQuery(self, k):
        ''' vector query processing, using the cosine similarity. Returns the
        top k pairs of (docID, similarity), ranked by their cosine similarity
        with the query in descending order. Term frequency or TF-IDF can be
        used to construct the vectors; TF-IDF is used here. '''
        if len(self.processed_query) == 0:
            all_docids = set()
            for _, v in self.index.get_items_inverted().items():
                all_docids.update(v.get_posting_list().keys())
            return [(str(id), 0)
                    for id in sorted(list(map(int, all_docids)))[:k]]

        query_words = list(set(self.processed_query))
        idfs = [self.index.idf(w) for w in query_words]

        # Behavior is undefined by the instructions when k is larger than the
        # corpus, so fail loudly
        try:
            if k > self.index.get_total_number_Doc():
                raise ValueError('k is greater than number of documents')
        except ValueError as err:
            print(err.args)
            return

        # Below we define the behavior when none of the query words appear in
        # any document. This case was not defined in the instructions, so
        # returning documents with zero similarity seems most appropriate: if
        # Google scored everything at 0 cosine, it would return 0 documents
        # even if you asked for the 50 most relevant.
        if set(idfs) == {0}:
            all_docids = set()
            for _, v in self.index.get_items_inverted().items():
                all_docids.update(v.get_posting_list().keys())
            return [(str(id), 0)
                    for id in sorted(list(map(int, all_docids)))[:k]]

        # Remove any words with 0 idf: they do not appear in the corpus, so
        # dropping them saves memory. Turning the result into lists is
        # probably unnecessary; leaving them as tuples may be more appropriate.
        idfs, query_words = map(
            list,
            zip(*[i for i in list(zip(idfs, query_words)) if not i[0] == 0]))

        # Calculate tfs of the remaining words (log normalization)
        query_term_counter = Counter(self.processed_query)
        query_tf_vector = [
            round(math.log10(query_term_counter[w] + 1), 4)
            for w in query_words
        ]
        # Other way of doing tf:
        # query_tf_vector = [round(1 + math.log10(query_term_counter[w]), 4)
        #                    if query_term_counter[w] > 0 else 0
        #                    for w in query_words]

        # NCC change: if a term in a query does not appear in our inverted
        # index, forget/discount the term. postings is a list of posting
        # lists, one per remaining query word.
        postings = [
            self.index.get_items_inverted()[w].get_posting_list()
            for w in query_words if w in self.index.get_items_inverted()
        ]
        document_ids = set().union(*postings)
        document_tfs = {d: [0] * len(query_words) for d in document_ids}
        for inx, term in enumerate(postings):
            for document_id, posting in term.items():
                # log normalization
                document_tfs[document_id][inx] = math.log10(
                    posting.term_freq() + 1)
                # Other:
                # tf = posting.term_freq()
                # if tf > 0:
                #     tf = 1 + math.log10(tf)
                # else:
                #     tf = 0
                # document_tfs[document_id][inx] = tf

        query_tfidf = np.multiply(query_tf_vector, idfs)
        cosines = Counter({
            d: self.cosine_similarity(query_tfidf, np.multiply(d_tf, idfs))
            for d, d_tf in document_tfs.items()
        })

        # This has to be a list, as dicts are not sorted...
        # We need a consistent ordering when multiple documents share the same
        # score, so we sort first on score and then on docID; this is very
        # slow. If we knew k or the number of documents, we could use numpy to
        # preallocate memory and copy instead of appending.
        temp_k = k
        scores = sorted(list(set(cosines.values())), reverse=True)
        ret = []
        for s in scores:
            docs_with_score_s = sorted(
                [int(d) for d, v in cosines.items() if v == s])
            if len(docs_with_score_s) >= temp_k:
                docs_with_score_s = docs_with_score_s[:temp_k]
                ret.extend([(str(d), s) for d in docs_with_score_s])
                temp_k = 0
                break
            else:
                temp_k = temp_k - len(docs_with_score_s)
                ret.extend([(str(d), s) for d in docs_with_score_s])

        # If fewer than k documents matched, pad the result with unmatched
        # documents at similarity 0
        if not temp_k == 0:
            all_docids = set()
            for _, v in self.index.get_items_inverted().items():
                all_docids.update(v.get_posting_list().keys())
            ret.extend([(str(j), 0) for j in sorted(
                list(map(int, all_docids.difference({i[0] for i in ret}))))
                [:temp_k]])

        return ret
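# A hypothetical usage sketch of the class above (the index path and the
# collection object are assumptions for illustration, not fixed names):
#
#   qp = QueryProcessor("boundary layer transition", "index.pkl", collection)
#   print(qp.booleanQuery())       # docIDs containing every query term
#   print(qp.vectorQuery(5))       # top-5 (docID, cosine) pairs, ties broken by docID
#   qp.loadQuery("heat transfer")  # reuse the already-loaded index for the next query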
def test(index_loc, cran_loc, qrels_loc):
    ''' test your code thoroughly. put the testing cases here'''

    ##### SETUP ITEMS #####
    # Grab index file to restore II
    ii = InvertedIndex()
    ii.load(index_loc)

    # Get the document collection
    cf = CranFile(cran_loc)

    # Get ground-truth results from qrels.txt
    with open(qrels_loc) as f:
        qrels = f.readlines()

    # Index qrels into a dict
    qrel_dict = {}
    for qrel in qrels:
        qrel_split = qrel.split()
        if int(qrel_split[0]) in qrel_dict:
            qrel_dict[int(qrel_split[0])].append(int(qrel_split[1]))
        else:
            qrel_dict[int(qrel_split[0])] = [int(qrel_split[1])]

    ##### INITIAL TEST ITEMS #####
    print("TESTS BASED ON SUGGESTED TESTING POINTS")

    # Ensure tf is correct:
    # pick a word and check the TF value against what is computed manually
    posting_list = ii.find("experiment").posting
    tf_vector = []
    for posting in posting_list:
        tf_vector.append(len(posting_list[posting].positions) \
            == posting_list[posting].term_freq())
    print("TF is computed correctly:", all(tf_vector))

    # Ensure idf is correct
    print("IDF is computed correctly:", log10(ii.nDocs / len(posting_list)) \
        == ii.idf("experiment"))

    # As both tf and idf are correct, and tf-idf is a product of the two,
    # it is reasonable to assume tf-idf is computed correctly

    ##### BOOL QUERY TESTS #####
    # Here, I use very specific boolean queries to ensure that a
    # limited number of documents are returned
    print("\nBOOL QUERY TESTS")

    # Ensure that the exact title of doc 8 matches for doc 8
    doc8 = ("measurements of the effect of two-dimensional and "
            "three-dimensional roughness elements on boundary layer transition")
    qp1 = QueryProcessor(doc8, ii, cf)
    print("Bool query matches on exact title:", qp1.booleanQuery() == [8])

    # Ensure that bool query matches a very specific AND query
    qp2 = QueryProcessor("hugoniot and infinitesimally", ii, cf)
    print(
        "Bool query matches on specific AND query ('hugoniot and infinitesimally'):",
        qp2.booleanQuery() == [329])

    # Test that an OR query is handled properly.
    # gravel and stagnation have completely distinct posting lists;
    # OR should merge them.
    gravel_postings = ii.find("gravel").sorted_postings[:]
    stag_postings = ii.find("stagnat").sorted_postings[:]
    gravel_postings.extend(stag_postings)
    qp3 = QueryProcessor("gravel or stagnation", ii, cf)
    print("Bool query successfully handles OR ('gravel or stagnation'):",
          qp3.booleanQuery() == sorted(gravel_postings))

    # Test that NOT is handled properly.
    # The posting list for "diameter" is a subset of "slipstream"'s postings
    # (oddly enough). To test this works, do "slipstream and not diameter",
    # and we should get slipstream's postings minus those of diameter.
    slip_postings = ii.find("slipstream").sorted_postings[:]
    diam_postings = ii.find("diamet").sorted_postings[:]
    slip_not_diam = [t for t in slip_postings if t not in diam_postings]
    print("Bool query successfully handles NOT ('slipstream and not diameter'):",
          QueryProcessor("slipstream and not diameter", ii, cf).booleanQuery() \
              == slip_not_diam)

    # Ensure AND/OR order doesn't matter
    print("Bool query can handle query regardless of AND order ('a and b' = 'b and a'):",
          QueryProcessor("slipstream and diameter", ii, cf).booleanQuery() \
              == QueryProcessor("diameter and slipstream", ii, cf).booleanQuery())
    print("Bool query can handle query regardless of OR order ('a or b' = 'b or a'):",
          QueryProcessor("slipstream or diameter", ii, cf).booleanQuery() \
              == QueryProcessor("diameter or slipstream", ii, cf).booleanQuery())

    # Ensure that the presence of parens does not change query results
    print("Bool query can handle query regardless of parens ('slipstream and diameter'):",
          QueryProcessor("slipstream and diameter", ii, cf).booleanQuery() \
              == QueryProcessor("(slipstream and diameter)", ii, cf).booleanQuery())

    # Ensure parentheses do not change the order of processing for
    # AND-AND and OR-OR queries
    print("Bool query AND is associative ('(a and b) and c' = 'a and (b and c)'):",
          QueryProcessor("(slipstream and diameter) and thrust", ii, cf).booleanQuery() \
              == QueryProcessor("slipstream and (diameter and thrust)", ii, cf).booleanQuery())
    print("Bool query OR is associative ('(a or b) or c' = 'a or (b or c)'):",
          QueryProcessor("(slipstream or diameter) or thrust", ii, cf).booleanQuery() \
              == QueryProcessor("slipstream or (diameter or thrust)", ii, cf).booleanQuery())

    # Ensure parentheses properly group items.
    # Tested by doing the query "manually": ANDing/ORing the correct terms
    part_one = QueryProcessor("conduction and cylinder and gas", ii, cf).booleanQuery()
    part_two = QueryProcessor("radiation and gas", ii, cf).booleanQuery()
    part_one.extend(part_two)
    expected_result = QueryProcessor("hugoniot", ii, cf).booleanQuery()
    expected_result.extend(part_one)
    print("Bool query parens successfully group conflicting operators:",
          QueryProcessor("(conduction and cylinder and gas) or (radiation and gas) or hugoniot", ii, cf).booleanQuery() \
              == sorted(list(set(expected_result))))

    ##### VECTOR QUERY TESTS #####
    # For these, just ensure that most of the results are in the expected list
    print("\nVECTOR QUERY TESTS")

    # Ensure vector query can match on exact title
    print("Vector query matches on exact title:",
          qp1.vectorQuery(1)[0][0] == 8)

    # Try a few example queries from query.text.
    # As long as one-fifth of the top 10 are in gt_result, call it a pass.
    # Note that queries with larger answer sets were chosen to ensure there
    # were enough to get to one-fifth of ten.
    qc = loadCranQry("query.text")
    poss_queries = list(qc)

    # Query 001
    result = QueryProcessor(qc["001"].text, ii, cf).vectorQuery(10)
    gt_result = qrel_dict[poss_queries.index("001") + 1]
    correct_vector = list(map(lambda x: x in gt_result, [x[0] for x in result]))
    print("Vector query is at least one-fifth correct for query 001:",
          sum(correct_vector) > 2)

    # Query 128
    result = QueryProcessor(qc["128"].text, ii, cf).vectorQuery(10)
    gt_result = qrel_dict[poss_queries.index("128") + 1]
    correct_vector = list(map(lambda x: x in gt_result, [x[0] for x in result]))
    print("Vector query is at least one-fifth correct for query 128:",
          sum(correct_vector) > 2)

    # Query 226
    result = QueryProcessor(qc["226"].text, ii, cf).vectorQuery(10)
    gt_result = qrel_dict[poss_queries.index("226") + 1]
    correct_vector = list(map(lambda x: x in gt_result, [x[0] for x in result]))
    print("Vector query is at least one-fifth correct for query 226:",
          sum(correct_vector) > 2)

    # Query 196
    result = QueryProcessor(qc["196"].text, ii, cf).vectorQuery(10)
    gt_result = qrel_dict[poss_queries.index("196") + 1]
    correct_vector = list(map(lambda x: x in gt_result, [x[0] for x in result]))
    print("Vector query is at least one-fifth correct for query 196:",
          sum(correct_vector) > 2)

    # Query 291
    result = QueryProcessor(qc["291"].text, ii, cf).vectorQuery(10)
    gt_result = qrel_dict[poss_queries.index("291") + 1]
    correct_vector = list(map(lambda x: x in gt_result, [x[0] for x in result]))
    print("Vector query is at least one-fifth correct for query 291:",
          sum(correct_vector) > 2)
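# The five checks above repeat the same pattern: run a vector query and count
# how many of the top-10 docIDs appear in the ground truth. A hypothetical
# helper capturing that pattern (the name is illustrative, not part of the
# original test suite):
def _hits_in_ground_truth(result, gt_result):
    """Count how many returned docIDs appear in the ground-truth list."""
    return sum(1 for doc_id, _ in result if doc_id in gt_result)

# Usage sketch mirroring the checks above:
#   result = QueryProcessor(qc["001"].text, ii, cf).vectorQuery(10)
#   print(_hits_in_ground_truth(result, qrel_dict[poss_queries.index("001") + 1]) > 2)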