Example #1
def __init__(self, query, index_file, collection):
    ''' index is the inverted index; collection is the document collection'''
    self.raw_query = query
    self.index = InvertedIndex()
    self.index = self.index.loadData(index_file)
    self.docs = collection
    self.tokenizer = Tokenizer(
        known_words=set(self.index.get_items_inverted().keys()))
    if self.raw_query:
        self.processed_query = self.preprocessing(self.raw_query)
Example #2
def prepare_tokenizer(config, model):
    args = config['args']
    if config['emb_class'] == 'glove':
        vocab = load_vocab(args.vocab_path)
        tokenizer = Tokenizer(vocab, config)
    else:
        tokenizer = model.bert_tokenizer
    return tokenizer
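
A minimal usage sketch for the helper above; the config layout and the vocab path are illustrative assumptions, and load_vocab / Tokenizer are taken to come from the surrounding project (which may require more config fields than shown):

import argparse

config = {
    'emb_class': 'glove',
    'args': argparse.Namespace(vocab_path='data/vocab.txt')  # placeholder path
}
tokenizer = prepare_tokenizer(config, model=None)  # model is unused on the glove branch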
Example #3
def preprocess_glove(config):
    args = config['args']

    # vocab, embedding
    init_vocab = build_init_vocab(config)
    vocab, embedding = build_vocab_from_embedding(args.embedding_path, init_vocab, config)

    # build data
    tokenizer = Tokenizer(vocab, config)
    if args.augmented:
        path = os.path.join(args.data_dir, args.augmented_filename)
    else:
        path = os.path.join(args.data_dir, _TRAIN_FILE)
    train_data = build_data(path, tokenizer)

    path = os.path.join(args.data_dir, _VALID_FILE)
    valid_data = build_data(path, tokenizer)

    path = os.path.join(args.data_dir, _TEST_FILE)
    test_data = build_data(path, tokenizer)

    # build labels
    path = os.path.join(args.data_dir, _TRAIN_FILE)
    labels = build_label(path)

    # write data, vocab, embedding, labels
    if args.augmented:
        path = os.path.join(args.data_dir, args.augmented_filename + _SUFFIX)
    else:
        path = os.path.join(args.data_dir, _TRAIN_FILE + _SUFFIX)
    write_data(train_data, path, tokenizer, labels)

    path = os.path.join(args.data_dir, _VALID_FILE + _SUFFIX)
    write_data(valid_data, path, tokenizer, labels)

    path = os.path.join(args.data_dir, _TEST_FILE + _SUFFIX)
    write_data(test_data, path, tokenizer, labels)

    path = os.path.join(args.data_dir, _VOCAB_FILE)
    write_vocab(vocab, path)

    path = os.path.join(args.data_dir, _EMBED_FILE)
    write_embedding(embedding, path)

    path = os.path.join(args.data_dir, _LABEL_FILE)
    write_label(labels, path)
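
A hedged sketch of the kind of config preprocess_glove expects; every value is a placeholder, and only the fields the code above reads directly are listed (the helper functions it calls may require more):

import argparse

args = argparse.Namespace(
    data_dir='data/',                     # directory holding _TRAIN_FILE, _VALID_FILE, _TEST_FILE
    embedding_path='glove.6B.300d.txt',   # placeholder GloVe file
    augmented=False,
    augmented_filename='augmented.txt')
config = {'emb_class': 'glove', 'args': args}
# preprocess_glove(config) then writes the tokenized splits (*_SUFFIX),
# the vocab, the embedding and the labels into args.data_dir.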
Example #4
def preprocess_glove_or_elmo(config):
    args = config['args']

    # vocab, embedding
    init_vocab = build_init_vocab(config)
    vocab, embedding = build_vocab_from_embedding(args.embedding_path,
                                                  init_vocab, config)

    # build poss, chars, labels
    path = os.path.join(args.data_dir, _TRAIN_FILE)
    poss, chars, labels, _ = build_dict(path, config)

    tokenizer = Tokenizer(vocab, config)

    # build data
    path = os.path.join(args.data_dir, _TRAIN_FILE)
    train_data = build_data(path, tokenizer)

    path = os.path.join(args.data_dir, _VALID_FILE)
    valid_data = build_data(path, tokenizer)

    path = os.path.join(args.data_dir, _TEST_FILE)
    test_data = build_data(path, tokenizer)

    # write data, vocab, embedding, poss, labels
    path = os.path.join(args.data_dir, _TRAIN_FILE + _SUFFIX)
    write_data(args, train_data, path, tokenizer, poss, labels)

    path = os.path.join(args.data_dir, _VALID_FILE + _SUFFIX)
    write_data(args, valid_data, path, tokenizer, poss, labels)

    path = os.path.join(args.data_dir, _TEST_FILE + _SUFFIX)
    write_data(args, test_data, path, tokenizer, poss, labels)

    path = os.path.join(args.data_dir, _VOCAB_FILE)
    write_vocab(vocab, path)

    path = os.path.join(args.data_dir, _EMBED_FILE)
    write_embedding(embedding, path)

    path = os.path.join(args.data_dir, _POS_FILE)
    write_dict(poss, path)

    path = os.path.join(args.data_dir, _LABEL_FILE)
    write_dict(labels, path)
Example #5
def preprocess_bert(config):
    args = config['args']

    w_tokenizer = None
    if args.bert_use_subword_pooling and args.bert_use_word_embedding:
        # vocab, embedding
        init_vocab = build_init_vocab(config)
        vocab, embedding = build_vocab_from_embedding(args.embedding_path,
                                                      init_vocab, config)
        w_tokenizer = Tokenizer(vocab, config)
        # write embedding
        path = os.path.join(args.data_dir, _EMBED_FILE)
        write_embedding(embedding, path)

    tokenizer = AutoTokenizer.from_pretrained(args.bert_model_name_or_path)
    # build poss, chars, labels, glabels
    path = os.path.join(args.data_dir, _TRAIN_FILE)
    poss, chars, labels, glabels = build_dict(path, config)

    # build features
    path = os.path.join(args.data_dir, _TRAIN_FILE)
    train_features = build_features(path,
                                    tokenizer,
                                    poss,
                                    labels,
                                    config,
                                    mode='train',
                                    w_tokenizer=w_tokenizer,
                                    glabels=glabels)

    path = os.path.join(args.data_dir, _VALID_FILE)
    valid_features = build_features(path,
                                    tokenizer,
                                    poss,
                                    labels,
                                    config,
                                    mode='valid',
                                    w_tokenizer=w_tokenizer,
                                    glabels=glabels)

    path = os.path.join(args.data_dir, _TEST_FILE)
    test_features = build_features(path,
                                   tokenizer,
                                   poss,
                                   labels,
                                   config,
                                   mode='test',
                                   w_tokenizer=w_tokenizer,
                                   glabels=glabels)

    # write features
    path = os.path.join(args.data_dir, _TRAIN_FILE + _FSUFFIX)
    write_features(train_features, path)

    path = os.path.join(args.data_dir, _VALID_FILE + _FSUFFIX)
    write_features(valid_features, path)

    path = os.path.join(args.data_dir, _TEST_FILE + _FSUFFIX)
    write_features(test_features, path)

    # write poss, labels, glabels
    path = os.path.join(args.data_dir, _POS_FILE)
    write_dict(poss, path)
    path = os.path.join(args.data_dir, _LABEL_FILE)
    write_dict(labels, path)
    path = os.path.join(args.data_dir, _GLABEL_FILE)
    write_dict(glabels, path)
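
As with the GloVe pipeline, a hedged sketch of the args the BERT path reads; all values are placeholders and the 'bert' emb_class value is an assumption:

import argparse

args = argparse.Namespace(
    data_dir='data/',
    bert_model_name_or_path='bert-base-cased',  # any Hugging Face model id or local path
    bert_use_subword_pooling=False,
    bert_use_word_embedding=False,              # embedding_path is only read when both flags are True
    embedding_path='glove.6B.300d.txt')
config = {'emb_class': 'bert', 'args': args}
# preprocess_bert(config) writes the *_FSUFFIX feature files plus the pos/label/glabel dicts.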
Example #6
class QueryProcessor:
    ##
    #
    #    @param         self
    #    @param         query
    #    @param         index_file
    #    @param         collection
    #    @return        None
    #    @brief         The constructor.
    #                   This process is extremely expensive because it loads the entire pickle object into memory.
    #                   That is fine when executing a single query, but when running the
    #                   evaluation over many queries, use loadQuery instead.
    #    @exception     None documented yet
    ##
    def __init__(self, query, index_file, collection):
        ''' index is the inverted index; collection is the document collection'''
        self.raw_query = query
        self.index = InvertedIndex()
        self.index = self.index.loadData(index_file)
        self.docs = collection
        self.tokenizer = Tokenizer(
            known_words=set(self.index.get_items_inverted().keys()))
        if self.raw_query:
            self.processed_query = self.preprocessing(self.raw_query)

    ##
    #   @brief         This method is used to load the next query for evaluation
    #   @param         self
    #   @param         query
    #   @return        None
    #   @exception     None
    ##
    def loadQuery(self, query):
        self.raw_query = query
        self.processed_query = self.preprocessing(self.raw_query)

    ##
    #   @brief         This method applies the same preprocessing used at indexing time to the raw query
    #   @param         self
    #   @param         raw_query
    #   @return        None
    #   @exception     None
    ##
    def preprocessing(self, raw_query):
        ''' apply the same preprocessing steps used by indexing,
            also use the provided spelling corrector. Note that
            spelling corrector should be applied before stopword
            removal and stemming (why?)'''
        return self.tokenizer.transpose_document_tokenized_stemmed_spelling(
            raw_query)

    ##
    #   @brief         This method does the boolean query processing
    #   @param         self
    #   @return        results:list[docID]
    #   @bug           Fixed
    #   @exception     None
    ##
    def booleanQuery(self):
        ''' boolean query processing; note that a query like "A B C" is transformed to "A AND B AND C" for retrieving posting lists and merge them'''
        ''' This method would likely be faster due to the use of  hashes, but I wanted to do what was shown in the slides
            from functools import reduce
            docs = [set(self.index[w]) for w in self.processed_query]
            docs.sort(key=len) # notice it is still smart to order by size 
            return reduce(set.intersection,docs) 
        '''
        if len(self.processed_query) == 0:
            return []

        ## checks that all of our query words are in the index, if not return [] ##
        for w in self.processed_query:
            if w not in self.index.get_items_inverted():
                return []

        ## checks if we only have 1 term in the query and returns its posting list if we do ##
        if len(self.processed_query) == 1:
            return list(self.index.get_items_inverted()[
                self.processed_query[0]].get_posting_list().keys())

        #### document_ids is a list of lists containing only document ids ####
        document_ids = [
            list(self.index.get_items_inverted()[w].get_posting_list().keys())
            for w in self.processed_query
        ]

        # by sorting so that we start with the shortest list of documents we get a potential speed up
        document_ids.sort(key=len)
        results = document_ids[0]

        ## iterates through each query word and does the intersection of docids from its posting list with all those before it ##
        ## could be done faster if index was implemented as set or some other hash data structure
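        # Illustrative trace (docIDs are strings in the index, hence the int() casts below):
        # intersecting results = ["1", "3", "7", "9"] with p = ["3", "4", "9"] skips "1" and
        # "4" and keeps ["3", "9"]; a later disjoint posting list would empty `results`
        # and trigger the early return.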
        for p in document_ids[1:]:
            intermediate = []
            i, j = 0, 0
            while i < len(results) and j < len(p):
                if int(results[i]) < int(p[j]):
                    i += 1
                elif int(results[i]) > int(p[j]):
                    j += 1
                else:
                    intermediate.append(p[j])
                    j += 1
                    i += 1
            results = intermediate

            ## checks if we have already found terms totally disjoint from one another
            if len(results) == 0:
                return results

        return results

    ##
    #   @brief         This method computes the cosine similarity of two vectors
    #   @param         self
    #   @param         vec1
    #   @param         vec2
    #   @return        cosine score: float
    #   @exception     None
    ##
    def cosine_similarity(self, vec1, vec2):
        # "compute cosine similarity: (vec1*vec2)/(||vec1||*||vec2||)"
        AA, AB, BB = 0, 0, 0
        for i in range(len(vec1)):
            x = vec1[i]
            y = vec2[i]
            AA += x * x
            BB += y * y
            AB += x * y
        return round(AB / math.sqrt(AA * BB), 4)

    ##
    #   @brief         This method computes the vector space model query
    #   @param         self
    #   @param         k
    #   @return        results: list[(docID, score)]
    #   @bug           Fixed
    #   @exception     ValueError
    ##
    def vectorQuery(self, k):
        ''' vector query processing, using the cosine similarity. '''
        #ToDo: return top k pairs of (docID, similarity), ranked by their cosine similarity with the query in the descending order
        # You can use term frequency or TFIDF to construct the vectors
        if len(self.processed_query) == 0:
            all_docids = set()
            for _, v in self.index.get_items_inverted().items():
                all_docids.update(v.get_posting_list().keys())
            return [(str(id), 0)
                    for id in sorted(list(map(int, all_docids)))[:k]]

        query_words = list(set(self.processed_query))
        idfs = [self.index.idf(w) for w in query_words]

        # the assignment does not define what to do if k is larger than the corpus
        try:
            if k > self.index.get_total_number_Doc():
                raise ValueError('k is greater than number of documents')
        except ValueError as err:
            print(err.args)
            return

        # below we define behavior if none of the words in the query are in any documents
        # this behavior was not defined in instructions so no documents seems most appropriate
        # if you used google and got 0 cosine it would return 0 documents even if you wanted the 50 most relevant
        if set(idfs) == {0}:
            all_docids = set()
            for _, v in self.index.get_items_inverted().items():
                all_docids.update(v.get_posting_list().keys())
            return [(str(id), 0)
                    for id in sorted(list(map(int, all_docids)))[:k]]

        # removes any words with 0 idf, as that means they did not appear in the corpus, which saves memory
        # probably not necessary to turn it into lists, and may actually be more appropriate to leave as tuples
        idfs, query_words = map(
            list,
            zip(*[i for i in list(zip(idfs, query_words)) if not i[0] == 0]))

        #Calculates tfs of relevant words
        query_term_counter = Counter(self.processed_query)
        query_tf_vector = [
            round(math.log10(query_term_counter[w] + 1), 4)
            for w in query_words
        ]

        #Other way of doing tf
        #query_tf_vector = [round(1 + math.log10(query_term_counter[w]),4) if query_term_counter[w] > 0 else 0 for w in query_words]

        ### NCC change: if a term in the query does not appear in our inverted index, forget/discount that term
        #### postings should be a list of lists which contains word postings

        postings = [
            self.index.get_items_inverted()[w].get_posting_list()
            for w in query_words if w in self.index.get_items_inverted()
        ]

        document_ids = set().union(*postings)
        document_tfs = {d: [0] * len(query_words) for d in document_ids}

        for inx, term in enumerate(postings):
            for document_id, posting in term.items():
                #log normalization
                document_tfs[document_id][inx] = math.log10(
                    posting.term_freq() + 1)

                #Other
                # tf = posting.term_freq()
                # if tf > 0 :
                #     tf = 1 + math.log10(tf)
                # else:
                #     tf = 0
                # document_tfs[document_id][inx] = tf

        query_tfidf = np.multiply(query_tf_vector, idfs)

        cosines = Counter({
            d: self.cosine_similarity(query_tfidf, np.multiply(d_tf, idfs))
            for d, d_tf in document_tfs.items()
        })
        # this has to be a list, as dicts are not ordered by value
        # we need a consistent ordering when multiple documents share a score: sort by score, then by docID (slow)
        # if we knew k or the number of documents, we could preallocate with numpy instead of appending
        temp_k = k
        scores = sorted(list(set(cosines.values())), reverse=True)
        ret = []
        for s in scores:
            docs_with_score_s = sorted(
                [int(d) for d, v in cosines.items() if v == s])
            if len(docs_with_score_s) >= temp_k:
                docs_with_score_s = docs_with_score_s[:temp_k]
                ret.extend([(str(d), s) for d in docs_with_score_s])
                temp_k = 0
                break
            else:
                temp_k = temp_k - len(docs_with_score_s)
                ret.extend([(str(d), s) for d in docs_with_score_s])
        if temp_k != 0:
            all_docids = set()
            for _, v in self.index.get_items_inverted().items():
                all_docids.update(v.get_posting_list().keys())

            ret.extend([(str(j), 0) for j in sorted(
                list(map(int, all_docids.difference({i[0]
                                                     for i in ret}))))[:temp_k]
                        ])
        return ret
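
A hedged usage sketch for QueryProcessor; the query text, index pickle path, and collection are placeholders rather than values from the source:

# Assumes an index pickle previously written by InvertedIndex.storeData().
qp = QueryProcessor(query="information retrieval systems",
                    index_file="index.pkl",   # placeholder path
                    collection=None)          # only stored by the constructor in the code shown
boolean_hits = qp.booleanQuery()              # docIDs matching every query term
top10 = qp.vectorQuery(k=10)                  # [(docID, cosine score)] in descending order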
Example #7
def __init__(self):
    self.__items = {}  # dict of term -> IndexItem
    self.__nDocs = 0  # the number of indexed documents
    self.__tokenizer = Tokenizer()
Example #8
class InvertedIndex:
    ##
    #    @param         self
    #    @return        None
    #    @brief         The constructor.
    #    @exception     None documented yet
    ##
    def __init__(self):
        self.__items = {}  # dict of term -> IndexItem
        self.__nDocs = 0  # the number of indexed documents
        self.__tokenizer = Tokenizer()

    ##
    #   @brief     This method returns the total number of documents in our data set
    #
    #   @param         self
    #   @return        int
    #   @exception     None
    ##
    def get_total_number_Doc(self):
        return self.__nDocs

    ##
    #   @brief     This method returns the inverted index dictionary of term -> IndexItem
    #
    #   @param         self
    #   @return        items: dict
    #   @exception     None
    ##
    def get_items_inverted(self):
        return self.__items

    ##
    #   @brief     This method is designed to index a document, using the simple SPIMI algorithm,
    #              but there is no need to store blocks due to the small collection we are handling.
    #              We save/load the whole index instead.
    #
    #       ToDo: indexing only title and body; use some functions defined in util.py
    #       (1) convert to lower cases,
    #       (2) remove stopwords,
    #       (3) stemming
    #
    #   @param         self
    #   @param         doc
    #   @return        None
    #   @exception     None
    ##
    def indexDoc(self, doc):  # indexing a Document object
        #Concatenate document title
        newDoc = doc.title + " " + doc.author + " " + doc.body
        docID = doc.docID
        full_stemmed_list = self.__tokenizer.transpose_document_tokenized_stemmed(
            newDoc)

        for position, term in enumerate(full_stemmed_list):
            if self.__items.get(term) is not None:
                self.__items[term].add(docID, position)
            else:
                #key does not exist in dict
                newPosting = Posting(docID)
                newPosting.append(position)
                self.__items[term] = IndexItem(term)
                self.__items[term].set_posting_list(docID, newPosting)
        self.__nDocs += 1

    ##
    #   @brief     This method sorts every posting list by document ID.
    #              NOTE: This seems redundant, as posting list document IDs are already in order
    #                    since documents are read in a particular order.
    #   @param         self
    #   @return        None
    #   @exception     None
    ##
    def sort(self):
        ''' sort all posting lists by docID'''
        for term, posting in self.__items.items():
            posting.sort()

    ##
    #   @brief     This method sorts all indexing terms in our index
    #
    #   @param         self
    #   @return        OrderedDict
    #   @exception     None
    ##
    def sort_terms(self):
        ''' sort all index terms alphabetically '''
        return collections.OrderedDict(
            sorted(self.__items.items(), key=operator.itemgetter(0)))

    ##
    #   @brief     This method finds a term in the indexing and returns its posting list
    #
    #   @param         self
    #   @param         term
    #   @return        postingList:dict
    #   @exception     None
    ##
    def find(self, term):
        return self.__items[term]

    ##
    #   @brief     This method is a dumper hook for json serialization
    #
    #   @param         self
    #   @param         obj
    #   @return        toJSON or dict
    #   @exception     None
    ##
    def dumper(self, obj):
        try:
            return obj.toJSON()
        except:
            return obj.__dict__

    ##
    #   @brief     This method serializes the inverted index to a json file,
    #              storing per-term posting data with idf plus the document count
    #
    #   @param         self
    #   @param         filename
    #   @return        None
    #   @exception     ValueError
    ##
    def save(self, filename):
        write_stream = open(filename, 'w')
        listTerm = self.sort_terms()
        dictMain = {}
        listInfo = {}

        for term, postingList in listTerm.items():
            dictTemp = postingList.posting_list_to_string()
            dictTemp["idf"] = self.idf(term)
            dictMain[term] = dictTemp

        listInfo["nDoc"] = self.get_total_number_Doc()
        listInfo["Data"] = dictMain
        try:
            write_stream.write(json.dumps(listInfo, indent=3))
        except ValueError as e:
            print("Is not valid json")
        write_stream.close()

    ##
    #   @brief     This method deserializes a json file and returns the resulting dictionary
    #
    #   @param         self
    #   @param         filename
    #   @return        json: dict
    #   @exception     ValueError
    ##
    def load(self, filename):
        try:
            with open(filename) as json_file:
                return json.load(json_file)
        except ValueError as e:
            print("Is not valid json")

    ##
    #   @brief     This method computes the inverse document frequency for a given term.
    #              We use idf = log10(N / df), where N is the total number of documents
    #              and df is the number of documents containing the term.
    #
    #   @param         self
    #   @param         term
    #   @return        idf: float
    #   @exception     None
    ##
    def idf(self, term):
        ''' '''
        if term not in self.__items:
            return 0
        termData = self.__items[term]
        N = self.get_total_number_Doc()
        df = len(termData.get_posting_list())
        #inverse document frequency
        idf = round(math.log10(N / (float(df))), 4)
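        # e.g. N = 1000 documents with df = 10 gives idf = log10(100) = 2.0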
        #probabilistic inverse document frequency from
        #idf = round(math.log10(N - df /(float(df))), 4)
        return idf

    ##
    #   @brief     This method builds the idf value for every term in the index
    #
    #   @param         self
    #   @return        idf: {term: idf}
    #   @exception     None
    ##
    def idfDict(self):
        idf = collections.OrderedDict()

        for term, postingList in self.sort_terms().items():
            idf[term] = self.idf(term)

        return idf

    ##
    #   @brief     This method computes the tf of every term in every document.
    #   There are different ways to represent tf; we use tf = log10(1 + raw count).
    #   Another way is tf = (frequency of the word in the document) / (total number of words in the document).
    #
    #   @param         self
    #   @return        word_tf_values: {term: {docID: tf, docID: tf }}
    #   @exception     None
    ##
    def tf_doc(self):
        word_tf_values = collections.OrderedDict()
        for term, postingList in self.sort_terms().items():
            doc_tf = collections.OrderedDict()
            for docID, post in postingList.get_posting_list().items():
                doc_tf[docID] = round(math.log10(1 + post.term_freq()),
                                      4)  #log normalize
            word_tf_values[term] = doc_tf
        return word_tf_values

    ##
    #   @brief     This method computes tf-idf for every document.
    #              The structure is of the form {docID: [{term: tf-idf}, ...]}
    #   @param         self
    #   @param         word_tf_valuesm
    #   @param         idfDict
    #   @return        TFIDF_dict: {docID: [{term: tf-idf}, ...]}
    #   @exception     None
    ##
    def tf_idf(self, word_tf_valuesm, idfDict):
        TFIDF_dict = collections.defaultdict(list)

        for term, postingList in self.sort_terms().items():
            tf_idf = 0.0
            for doc, doctf in word_tf_valuesm[term].items():
                term_tf_idf_doc = {}
                tf_idf = doctf * idfDict[term]
                term_tf_idf_doc[term] = tf_idf
                TFIDF_dict[doc].append(term_tf_idf_doc)
        return TFIDF_dict

    ##
    #   @brief     This method saves the current state of the InvertedIndex as a pickle
    #
    #   @param         self
    #   @param         filename
    #   @return        None
    #   @exception     AttributeError,  pickle.PickleError
    ##
    def storeData(self, filename):
        try:
            # use a context manager so the file is closed even if the dump fails
            with open(filename, "wb") as fileP:
                pickle.dump(self, fileP)  # serialize class object
        except (AttributeError, pickle.PickleError):
            print("Error pickle.dump InvertedIndex ")

    ##
    #   @brief     This method loads a saved InvertedIndex pickle
    #
    #   @param         self
    #   @param         filename
    #   @return        invertedIndexer
    #   @exception     (pickle.UnpicklingError, ImportError, EOFError, IndexError, TypeError)
    ##
    def loadData(self, filename):
        invertedIndexer = None
        try:
            # context manager closes the file; invertedIndexer stays None on failure
            with open(filename, "rb") as fileP:
                invertedIndexer = pickle.load(fileP)
        except (pickle.UnpicklingError, ImportError, EOFError, IndexError,
                TypeError) as err:
            print(err)
            print("Error pickle.load InvertedIndex ")
        return invertedIndexer
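
A hedged end-to-end sketch of building and persisting the index; `collection` and the Document fields are assumptions based on what indexDoc above reads:

# Illustrative only: each Document is assumed to expose docID, title, author and body,
# since indexDoc() concatenates title/author/body and keys postings by docID.
index = InvertedIndex()
for doc in collection:          # `collection` is a placeholder iterable of Document objects
    index.indexDoc(doc)
index.sort()                    # keep posting lists ordered by docID
index.save("index.json")        # readable dump: per-term posting data, idf, document count
index.storeData("index.pkl")    # pickle later consumed by loadData() / QueryProcessor
print(index.idf("exampl"))      # 0 if the (stemmed) term never occurred in the collection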