# NOTE: the preprocessing helpers below rely on project-local utilities
# (Tokenizer, load_vocab, build_*, write_*) and module-level constants
# (_TRAIN_FILE, _VALID_FILE, _TEST_FILE, _SUFFIX, ...) defined elsewhere in the project.
import os

from transformers import AutoTokenizer


def prepare_tokenizer(config, model):
    args = config['args']
    if config['emb_class'] == 'glove':
        vocab = load_vocab(args.vocab_path)
        tokenizer = Tokenizer(vocab, config)
    else:
        tokenizer = model.bert_tokenizer
    return tokenizer

def preprocess_glove(config):
    args = config['args']

    # vocab, embedding
    init_vocab = build_init_vocab(config)
    vocab, embedding = build_vocab_from_embedding(args.embedding_path,
                                                  init_vocab, config)

    # build data
    tokenizer = Tokenizer(vocab, config)
    if args.augmented:
        path = os.path.join(args.data_dir, args.augmented_filename)
    else:
        path = os.path.join(args.data_dir, _TRAIN_FILE)
    train_data = build_data(path, tokenizer)
    path = os.path.join(args.data_dir, _VALID_FILE)
    valid_data = build_data(path, tokenizer)
    path = os.path.join(args.data_dir, _TEST_FILE)
    test_data = build_data(path, tokenizer)

    # build labels
    path = os.path.join(args.data_dir, _TRAIN_FILE)
    labels = build_label(path)

    # write data, vocab, embedding, labels
    if args.augmented:
        path = os.path.join(args.data_dir, args.augmented_filename + _SUFFIX)
    else:
        path = os.path.join(args.data_dir, _TRAIN_FILE + _SUFFIX)
    write_data(train_data, path, tokenizer, labels)
    path = os.path.join(args.data_dir, _VALID_FILE + _SUFFIX)
    write_data(valid_data, path, tokenizer, labels)
    path = os.path.join(args.data_dir, _TEST_FILE + _SUFFIX)
    write_data(test_data, path, tokenizer, labels)
    path = os.path.join(args.data_dir, _VOCAB_FILE)
    write_vocab(vocab, path)
    path = os.path.join(args.data_dir, _EMBED_FILE)
    write_embedding(embedding, path)
    path = os.path.join(args.data_dir, _LABEL_FILE)
    write_label(labels, path)

def preprocess_glove_or_elmo(config):
    args = config['args']

    # vocab, embedding
    init_vocab = build_init_vocab(config)
    vocab, embedding = build_vocab_from_embedding(args.embedding_path,
                                                  init_vocab, config)

    # build poss, chars, labels
    path = os.path.join(args.data_dir, _TRAIN_FILE)
    poss, chars, labels, _ = build_dict(path, config)

    tokenizer = Tokenizer(vocab, config)

    # build data
    path = os.path.join(args.data_dir, _TRAIN_FILE)
    train_data = build_data(path, tokenizer)
    path = os.path.join(args.data_dir, _VALID_FILE)
    valid_data = build_data(path, tokenizer)
    path = os.path.join(args.data_dir, _TEST_FILE)
    test_data = build_data(path, tokenizer)

    # write data, vocab, embedding, poss, labels
    path = os.path.join(args.data_dir, _TRAIN_FILE + _SUFFIX)
    write_data(args, train_data, path, tokenizer, poss, labels)
    path = os.path.join(args.data_dir, _VALID_FILE + _SUFFIX)
    write_data(args, valid_data, path, tokenizer, poss, labels)
    path = os.path.join(args.data_dir, _TEST_FILE + _SUFFIX)
    write_data(args, test_data, path, tokenizer, poss, labels)
    path = os.path.join(args.data_dir, _VOCAB_FILE)
    write_vocab(vocab, path)
    path = os.path.join(args.data_dir, _EMBED_FILE)
    write_embedding(embedding, path)
    path = os.path.join(args.data_dir, _POS_FILE)
    write_dict(poss, path)
    path = os.path.join(args.data_dir, _LABEL_FILE)
    write_dict(labels, path)

def preprocess_bert(config):
    args = config['args']

    w_tokenizer = None
    if args.bert_use_subword_pooling and args.bert_use_word_embedding:
        args = config['args']
        # vocab, embedding
        init_vocab = build_init_vocab(config)
        vocab, embedding = build_vocab_from_embedding(args.embedding_path,
                                                      init_vocab, config)
        w_tokenizer = Tokenizer(vocab, config)
        # write embedding
        path = os.path.join(args.data_dir, _EMBED_FILE)
        write_embedding(embedding, path)

    tokenizer = AutoTokenizer.from_pretrained(args.bert_model_name_or_path)

    # build poss, chars, labels, glabels
    path = os.path.join(args.data_dir, _TRAIN_FILE)
    poss, chars, labels, glabels = build_dict(path, config)

    # build features
    path = os.path.join(args.data_dir, _TRAIN_FILE)
    train_features = build_features(path, tokenizer, poss, labels, config,
                                    mode='train', w_tokenizer=w_tokenizer,
                                    glabels=glabels)
    path = os.path.join(args.data_dir, _VALID_FILE)
    valid_features = build_features(path, tokenizer, poss, labels, config,
                                    mode='valid', w_tokenizer=w_tokenizer,
                                    glabels=glabels)
    path = os.path.join(args.data_dir, _TEST_FILE)
    test_features = build_features(path, tokenizer, poss, labels, config,
                                   mode='test', w_tokenizer=w_tokenizer,
                                   glabels=glabels)

    # write features
    path = os.path.join(args.data_dir, _TRAIN_FILE + _FSUFFIX)
    write_features(train_features, path)
    path = os.path.join(args.data_dir, _VALID_FILE + _FSUFFIX)
    write_features(valid_features, path)
    path = os.path.join(args.data_dir, _TEST_FILE + _FSUFFIX)
    write_features(test_features, path)

    # write poss, labels, glabels
    path = os.path.join(args.data_dir, _POS_FILE)
    write_dict(poss, path)
    path = os.path.join(args.data_dir, _LABEL_FILE)
    write_dict(labels, path)
    path = os.path.join(args.data_dir, _GLABEL_FILE)
    write_dict(glabels, path)

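
# Usage sketch (an assumption, not part of the original project): it shows the shape of
# the `config` dict the preprocess_* functions above expect -- an 'emb_class' string
# plus an 'args' namespace carrying the paths they read. Every concrete value below is
# a hypothetical placeholder.
from types import SimpleNamespace

if __name__ == '__main__':
    args = SimpleNamespace(
        data_dir='data/conll2003',                       # hypothetical directory
        embedding_path='embeddings/glove.6B.300d.txt',   # hypothetical embedding file
        augmented=False,
        augmented_filename='augmented.txt',              # hypothetical file name
    )
    config = {'emb_class': 'glove', 'args': args}
    preprocess_glove(config)
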
# Imports used by the retrieval classes below; the project-local InvertedIndex,
# Tokenizer, Posting and IndexItem classes are assumed to be defined or imported
# elsewhere in the project.
import collections
import json
import math
import operator
import pickle
from collections import Counter

import numpy as np


class QueryProcessor:
    ##
    # @param self
    # @param query
    # @param index_file
    # @param collection
    # @return None
    # @brief The constructor.
    #        This process is expensive because it loads the entire pickled index into
    #        memory. That is fine when executing a single query, but for evaluation
    #        runs use loadQuery instead.
    # @exception None documented yet
    ##
    def __init__(self, query, index_file, collection):
        ''' index is the inverted index; collection is the document collection'''
        self.raw_query = query
        self.index = InvertedIndex()
        self.index = self.index.loadData(index_file)
        self.docs = collection
        self.tokenizer = Tokenizer(
            known_words=set(self.index.get_items_inverted().keys()))
        if self.raw_query:
            self.processed_query = self.preprocessing(self.raw_query)

    ##
    # @brief This method is used to load the next query for evaluation
    # @param self
    # @param query
    # @return None
    # @exception None
    ##
    def loadQuery(self, query):
        self.raw_query = query
        self.processed_query = self.preprocessing(self.raw_query)

    ##
    # @brief This method applies the same preprocessing to the query that was used
    #        during indexing
    # @param self
    # @param raw_query
    # @return processed_query: list[str]
    # @exception None
    ##
    def preprocessing(self, raw_query):
        ''' apply the same preprocessing steps used by indexing, and also use the
            provided spelling corrector. Note that the spelling corrector should be
            applied before stopword removal and stemming (why?)'''
        return self.tokenizer.transpose_document_tokenized_stemmed_spelling(
            raw_query)

    ##
    # @brief This method does the boolean query processing
    # @param self
    # @return results: list[docID]
    # @bug Fixed
    # @exception None
    ##
    def booleanQuery(self):
        ''' boolean query processing; note that a query like "A B C" is transformed
            into "A AND B AND C" for retrieving posting lists and merging them'''
        ''' This alternative would likely be faster due to the use of hashes, but I
            wanted to do what was shown in the slides:

            from functools import reduce
            docs = [set(self.index[w]) for w in self.processed_query]
            docs.sort(key=len)  # notice it is still smart to order by size
            return reduce(set.intersection, docs)
        '''
        if len(self.processed_query) == 0:
            return []

        # check that all of the query words are in the index; if not, return []
        for w in self.processed_query:
            if not w in self.index.get_items_inverted():
                return []

        # if the query contains only one term, return its posting list directly
        if len(self.processed_query) == 1:
            return list(self.index.get_items_inverted()[
                self.processed_query[0]].get_posting_list().keys())

        # document_ids is a list of lists containing only document IDs
        document_ids = [
            list(self.index.get_items_inverted()[w].get_posting_list().keys())
            for w in self.processed_query
        ]

        # starting with the shortest list of documents gives a potential speed-up
        document_ids.sort(key=len)
        results = document_ids[0]

        # intersect the docIDs of each remaining posting list with those found so far;
        # this could be faster if the index were backed by a set or another hash-based
        # data structure
        for p in document_ids[1:]:
            intermediate = []
            i, j = 0, 0
            while i < len(results) and j < len(p):
                if int(results[i]) < int(p[j]):
                    i += 1
                elif int(results[i]) > int(p[j]):
                    j += 1
                else:
                    intermediate.append(p[j])
                    j += 1
                    i += 1
            results = intermediate

            # stop early if the terms are completely disjoint
            if len(results) == 0:
                return results

        return results

    ##
    # @brief This method computes the cosine similarity of two vectors
    # @param self
    # @param vec1
    # @param vec2
    # @return score: float
    # @exception None
    ##
    def cosine_similarity(self, vec1, vec2):
        # compute cosine similarity: (vec1*vec2)/(||vec1||*||vec2||)
        AA, AB, BB = 0, 0, 0
        for i in range(len(vec1)):
            x = vec1[i]
            y = vec2[i]
            AA += x * x
            BB += y * y
            AB += x * y
        return round(AB / math.sqrt(AA * BB), 4)

    ##
    # @brief This method computes the vector space model ranking
    # @param self
    # @param k
    # @return results: list[(docID, score)]
    # @bug Fixed
    # @exception ValueError
    ##
    def vectorQuery(self, k):
        ''' vector query processing, using the cosine similarity. '''
        # ToDo: return the top k pairs of (docID, similarity), ranked by their cosine
        # similarity with the query in descending order.
        # Either term frequency or TF-IDF can be used to construct the vectors.
        if len(self.processed_query) == 0:
            all_docids = set()
            for _, v in self.index.get_items_inverted().items():
                all_docids.update(v.get_posting_list().keys())
            return [(str(id), 0)
                    for id in sorted(list(map(int, all_docids)))[:k]]

        query_words = list(set(self.processed_query))
        idfs = [self.index.idf(w) for w in query_words]

        # behavior is undefined if k is larger than the corpus
        try:
            if k > self.index.get_total_number_Doc():
                raise ValueError('k is greater than number of documents')
        except ValueError as err:
            print(err.args)
            return

        # Behavior when none of the query words appear in any document: the
        # instructions leave this undefined, so returning documents with score 0
        # seems most appropriate -- a search engine that found only zero-cosine
        # matches has nothing relevant to return, even if you asked for the top 50.
        if set(idfs) == {0}:
            all_docids = set()
            for _, v in self.index.get_items_inverted().items():
                all_docids.update(v.get_posting_list().keys())
            return [(str(id), 0)
                    for id in sorted(list(map(int, all_docids)))[:k]]

        # drop any word with idf == 0, since it does not appear in the corpus; this
        # saves memory. Converting to lists is probably unnecessary -- leaving them
        # as tuples may be more appropriate.
        idfs, query_words = map(
            list,
            zip(*[i for i in list(zip(idfs, query_words)) if not i[0] == 0]))

        # calculate the tfs of the remaining query words
        query_term_counter = Counter(self.processed_query)
        query_tf_vector = [
            round(math.log10(query_term_counter[w] + 1), 4)
            for w in query_words
        ]
        # alternative way of computing tf:
        # query_tf_vector = [round(1 + math.log10(query_term_counter[w]), 4)
        #                    if query_term_counter[w] > 0 else 0
        #                    for w in query_words]

        # NCC change: if a query term does not appear in our inverted index,
        # forget/discount the term.
        # postings is a list of posting lists, one per remaining query word
        postings = [
            self.index.get_items_inverted()[w].get_posting_list()
            for w in query_words if w in self.index.get_items_inverted()
        ]

        document_ids = set().union(*postings)
        document_tfs = {d: [0] * len(query_words) for d in document_ids}

        for inx, term in enumerate(postings):
            for document_id, posting in term.items():
                # log normalization
                document_tfs[document_id][inx] = math.log10(
                    posting.term_freq() + 1)
                # alternative:
                # tf = posting.term_freq()
                # if tf > 0:
                #     tf = 1 + math.log10(tf)
                # else:
                #     tf = 0
                # document_tfs[document_id][inx] = tf

        query_tfidf = np.multiply(query_tf_vector, idfs)
        cosines = Counter({
            d: self.cosine_similarity(query_tfidf, np.multiply(d_tf, idfs))
            for d, d_tf in document_tfs.items()
        })

        # The scores have to be kept in a list because dicts are not sorted. We also
        # need a consistent ordering when multiple documents share the same score, so
        # we sort first on score and then on docID, which is slow. If we knew k or the
        # number of documents up front, we could preallocate with numpy instead of
        # appending.
        temp_k = k
        scores = sorted(list(set(cosines.values())), reverse=True)
        ret = []
        for s in scores:
            docs_with_score_s = sorted(
                [int(d) for d, v in cosines.items() if v == s])
            if len(docs_with_score_s) >= temp_k:
                docs_with_score_s = docs_with_score_s[:temp_k]
                ret.extend([(str(d), s) for d in docs_with_score_s])
                temp_k = 0
                break
            else:
                temp_k = temp_k - len(docs_with_score_s)
                ret.extend([(str(d), s) for d in docs_with_score_s])

        # pad with zero-score documents if fewer than k documents had a nonzero match
        if not temp_k == 0:
            all_docids = set()
            for _, v in self.index.get_items_inverted().items():
                all_docids.update(v.get_posting_list().keys())
            ret.extend([
                (str(j), 0) for j in sorted(
                    list(map(int,
                             all_docids.difference({i[0] for i in ret}))))[:temp_k]
            ])
        return ret

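
# Standalone sketch (not from the original code base) of the two-pointer posting-list
# intersection that booleanQuery() performs above: start with the shortest list and
# repeatedly intersect it with the next one. Plain lists of integer docIDs stand in
# for the project's InvertedIndex/Posting classes; all names here are hypothetical.
def intersect_posting_lists(posting_lists):
    """Intersect sorted lists of integer docIDs, returning docIDs present in all."""
    if not posting_lists:
        return []
    posting_lists = sorted(posting_lists, key=len)  # shortest list first
    results = posting_lists[0]
    for p in posting_lists[1:]:
        merged, i, j = [], 0, 0
        while i < len(results) and j < len(p):
            if results[i] < p[j]:
                i += 1
            elif results[i] > p[j]:
                j += 1
            else:
                merged.append(p[j])
                i += 1
                j += 1
        results = merged
        if not results:  # the terms are disjoint, stop early
            break
    return results

# tiny worked example: docID lists for three query terms
print(intersect_posting_lists([[1, 2, 5, 9], [2, 5, 7], [2, 3, 5, 8]]))  # -> [2, 5]
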
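
# Standalone sketch (not from the original code base) of the weighting scheme that
# vectorQuery() describes: log-scaled term frequencies multiplied by
# idf = log10(N / df), compared with cosine similarity, and ranked by descending
# score with docID as the tie-breaker. The toy corpus and all helper names are
# hypothetical.
import math
from collections import Counter

def log_tf(count):
    return math.log10(1 + count)

def idf_weight(term, docs):
    df = sum(1 for d in docs if term in d)
    return math.log10(len(docs) / df) if df else 0.0

def cosine(u, v):
    dot = sum(x * y for x, y in zip(u, v))
    norm_u = math.sqrt(sum(x * x for x in u))
    norm_v = math.sqrt(sum(y * y for y in v))
    return dot / (norm_u * norm_v) if norm_u and norm_v else 0.0

def rank(query_tokens, docs, k):
    terms = sorted(set(query_tokens))
    idfs = [idf_weight(t, docs) for t in terms]
    q_counts = Counter(query_tokens)
    q_vec = [log_tf(q_counts[t]) * w for t, w in zip(terms, idfs)]
    scored = []
    for doc_id, tokens in enumerate(docs):
        d_counts = Counter(tokens)
        d_vec = [log_tf(d_counts[t]) * w for t, w in zip(terms, idfs)]
        scored.append((doc_id, round(cosine(q_vec, d_vec), 4)))
    # sort by score (descending), then docID (ascending) for a stable ordering
    return sorted(scored, key=lambda x: (-x[1], x[0]))[:k]

toy_docs = [["search", "engine", "index"],
            ["boolean", "query", "index"],
            ["cosine", "similarity", "query", "index"]]
print(rank(["cosine", "query"], toy_docs, k=2))  # doc 2 ranks first
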
class InvertedIndex:
    ##
    # @param self
    # @return None
    # @brief The constructor.
    # @exception None documented yet
    ##
    def __init__(self):
        self.__items = {}  # dict mapping term -> IndexItem
        self.__nDocs = 0   # the number of indexed documents
        self.__tokenizer = Tokenizer()

    ##
    # @brief This method returns the total number of documents in our data set
    # @param self
    # @return nDocs: int
    # @exception None
    ##
    def get_total_number_Doc(self):
        return self.__nDocs

    ##
    # @brief This method returns the inverted index dictionary (term -> IndexItem)
    # @param self
    # @return items: dict
    # @exception None
    ##
    def get_items_inverted(self):
        return self.__items

    ##
    # @brief This method indexes a document, using the simple SPIMI algorithm,
    #        but there is no need to store blocks due to the small collection we are
    #        handling; we save/load the whole index instead.
    #
    #        ToDo: index only title and body; use some functions defined in util.py
    #        (1) convert to lower case,
    #        (2) remove stopwords,
    #        (3) stemming
    #
    # @param self
    # @param doc
    # @return None
    # @exception None
    ##
    def indexDoc(self, doc):  # indexing a Document object
        # concatenate document title, author and body
        newDoc = doc.title + " " + doc.author + " " + doc.body
        docID = doc.docID
        full_stemmed_list = self.__tokenizer.transpose_document_tokenized_stemmed(
            newDoc)

        for position, term in enumerate(full_stemmed_list):
            if self.__items.get(term) != None:
                self.__items[term].add(docID, position)
            else:
                # key does not exist in dict yet
                newPosting = Posting(docID)
                newPosting.append(position)
                self.__items[term] = IndexItem(term)
                self.__items[term].set_posting_list(docID, newPosting)

        self.__nDocs += 1

    ##
    # @brief This method sorts all posting lists by document ID.
    #        NOTE: This method seems redundant, since documents are read in a fixed
    #        order and the document IDs in each posting list are therefore already
    #        in order by default.
    # @param self
    # @return None
    # @exception None
    ##
    def sort(self):
        ''' sort all posting lists by docID'''
        for term, posting in self.__items.items():
            posting.sort()

    ##
    # @brief This method sorts all indexing terms in our index
    # @param self
    # @return OrderedDict
    # @exception None
    ##
    def sort_terms(self):
        ''' sort all index terms alphabetically'''
        return collections.OrderedDict(
            sorted(self.__items.items(), key=operator.itemgetter(0)))

    ##
    # @brief This method finds a term in the index and returns its IndexItem,
    #        which holds the term's posting list
    # @param self
    # @param term
    # @return item: IndexItem
    # @exception None
    ##
    def find(self, term):
        return self.__items[term]

    ##
    # @brief This method is a dumper for json serialization
    # @param self
    # @param obj
    # @return obj.toJSON() if available, otherwise obj.__dict__
    # @exception None
    ##
    def dumper(self, obj):
        try:
            return obj.toJSON()
        except:
            return obj.__dict__

    ##
    # @brief This method serializes the inverted index to a json file
    # @param self
    # @param filename
    # @return None
    # @exception ValueError
    ##
    def save(self, filename):
        write_stream = open(filename, 'w')
        listTerm = self.sort_terms()
        dictMain = {}
        listInfo = {}
        for term, postingList in listTerm.items():
            dictTemp = postingList.posting_list_to_string()
            dictTemp["idf"] = self.idf(term)
            dictMain[term] = dictTemp
        listInfo["nDoc"] = self.get_total_number_Doc()
        listInfo["Data"] = dictMain
        try:
            write_stream.write(json.dumps(listInfo, indent=3))
        except ValueError:
            print("Is not valid json")
        write_stream.close()

    ##
    # @brief This method deserializes a json file into a dict by reallocating
    #        self.__items
    # @param self
    # @param filename
    # @return json: dict
    # @exception ValueError
    ##
    def load(self, filename):
        try:
            with open(filename) as json_file:
                return json.load(json_file)
        except ValueError:
            print("Is not valid json")

    ##
    # @brief This method computes the inverted document frequency for a given term.
    #        We use IDF = log10((total number of documents) / (number of documents
    #        containing the term)).
    # @param self
    # @param term
    # @return idf: float
    # @exception None
    ##
    def idf(self, term):
        if not term in self.__items:
            return 0
        termData = self.__items[term]
        N = self.get_total_number_Doc()
        df = len(termData.get_posting_list())
        # inverse document frequency
        idf = round(math.log10(N / (float(df))), 4)
        # probabilistic inverse document frequency:
        # idf = round(math.log10((N - df) / (float(df))), 4)
        return idf

    ##
    # @brief This method creates an IDF dictionary for the whole index
    # @param self
    # @return idf: {term: idf}
    # @exception None
    ##
    def idfDict(self):
        idf = collections.OrderedDict()
        for term, postingList in self.sort_terms().items():
            idf[term] = self.idf(term)
        return idf

    ##
    # @brief This method creates the TF values for every document.
    #        There are different ways to represent TF; we use tf = log10(1 + tf).
    #        Another way is TF = (frequency of the word in the document) /
    #        (total number of words in the document).
    # @param self
    # @return word_tf_values: {term: {docID: tf, docID: tf}}
    # @exception None
    ##
    def tf_doc(self):
        word_tf_values = collections.OrderedDict()
        for term, postingList in self.sort_terms().items():
            doc_tf = collections.OrderedDict()
            for docID, post in postingList.get_posting_list().items():
                # log normalize
                doc_tf[docID] = round(math.log10(1 + post.term_freq()), 4)
            word_tf_values[term] = doc_tf
        return word_tf_values

    ##
    # @brief This method creates tf-idf values for all documents.
    #        Its structure is of the form {docID: [{term: tf-idf}, {term: tf-idf}]}
    # @param self
    # @param word_tf_valuesm
    # @param idfDict
    # @return TFIDF_dict: {docID: [{term: tf-idf}, {term: tf-idf}]}
    # @exception None
    ##
    def tf_idf(self, word_tf_valuesm, idfDict):
        TFIDF_dict = collections.defaultdict(list)
        for term, postingList in self.sort_terms().items():
            tf_idf = 0.0
            for doc, doctf in word_tf_valuesm[term].items():
                term_tf_idf_doc = {}
                tf_idf = doctf * idfDict[term]
                term_tf_idf_doc[term] = tf_idf
                TFIDF_dict[doc].append(term_tf_idf_doc)
        return TFIDF_dict

    ##
    # @brief This method saves the current state of the InvertedIndex with pickle
    # @param self
    # @param filename
    # @return None
    # @exception AttributeError, pickle.PickleError
    ##
    def storeData(self, filename):
        try:
            fileP = open(filename, "wb")
            pickle.dump(self, fileP)  # serialize the class object
        except (AttributeError, pickle.PickleError):
            print("Error pickle.dump InvertedIndex ")
        fileP.close()

    ##
    # @brief This method loads a saved InvertedIndex from a pickle file
    # @param self
    # @param filename
    # @return invertedIndexer
    # @exception (pickle.UnpicklingError, ImportError, EOFError, IndexError, TypeError)
    ##
    def loadData(self, filename):
        invertedIndexer = None  # avoids a NameError if unpickling fails below
        try:
            fileP = open(filename, "rb")
            invertedIndexer = pickle.load(fileP)
        except (pickle.UnpicklingError, ImportError, EOFError, IndexError,
                TypeError) as err:
            print(err)
            print("Error pickle.load InvertedIndex ")
        fileP.close()
        return invertedIndexer

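
# Usage sketch (an assumption about how the two classes above fit together, based only
# on the methods shown): index a couple of toy documents, pickle the index with
# storeData(), then answer a query through QueryProcessor. The SimpleNamespace
# documents and the file name are hypothetical stand-ins for the project's own
# Document/collection types.
from types import SimpleNamespace

if __name__ == '__main__':
    toy_collection = [
        SimpleNamespace(docID='1', title='boolean retrieval', author='smith',
                        body='intersect sorted posting lists'),
        SimpleNamespace(docID='2', title='vector space model', author='jones',
                        body='rank documents by cosine similarity of tf-idf vectors'),
    ]

    index = InvertedIndex()
    for d in toy_collection:
        index.indexDoc(d)
    index.storeData('index.pkl')  # hypothetical file name

    qp = QueryProcessor('cosine similarity', 'index.pkl', toy_collection)
    print(qp.booleanQuery())      # docIDs containing every query term
    print(qp.vectorQuery(k=2))    # top-2 (docID, cosine score) pairs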