Example #1
0
    def main():
        if 0:  # Disabled debug block (never runs): dumps info about Map objects returned by get_objects()
            print_unknown_modules()
            print_unknown_functions()

            for v in get_objects():
                if type(v) is Map:
                    if '__module__' in v:
                        line('%d: Map.__module__: %s', reference_count(v),
                             v['__module__'])
                        continue

                    if '__doc__' in v:
                        line('%d: Map.__doc__: %r', reference_count(v),
                             v['__doc__'])
                        continue

                    line('%d: Map.keys: %s', reference_count(v), v.keys())

            def find_object_by_address(address):
                for v in get_objects():
                    if address_of(v) == address:
                        return v

        from Tokenizer import tokenizer

        tokenizer()
Example #2
0
def preprocess():
    blog_en, blog_cn = [], []
    t = tokenizer()
    with open('UM_Corpus/Bi-Microblog.txt', 'rU') as f:
        lines = f.readlines()
        for i in xrange(0, len(lines), 2):  # lines alternate: English sentence, then its Chinese counterpart
            en = t.tokenize(lines[i].strip())
            cn = t.tokenize(lines[i + 1].strip())
            if en is None or cn is None:
                continue
            blog_en.append(en)
            blog_cn.append(cn)

    news_en, news_cn = [], []
    with open('UM_Corpus/Bi-News.txt', 'rU') as f:
        lines = f.readlines()
        for i in xrange(0, len(lines), 2):
            en = t.tokenize(lines[i].strip())
            cn = t.tokenize(lines[i + 1].strip())
            if en is None or cn is None:
                continue
            news_en.append(en)
            news_cn.append(cn)

    with open('UM_Corpus/blog_en', 'w') as f1, \
         open('UM_Corpus/blog_cn', 'w') as f2, \
         open('UM_Corpus/news_en', 'w') as f3, \
         open('UM_Corpus/news_cn', 'w') as f4:
        json.dump(blog_en, f1)
        json.dump(blog_cn, f2)
        json.dump(news_en, f3)
        json.dump(news_cn, f4)
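Note: preprocess() relies on a tokenizer class from the project's Tokenizer module (imported in Examples #1 and #10) whose tokenize() method returns a list of tokens, or None when a line cannot be tokenized, which is what the `en is None or cn is None` guard exploits. The real implementation is not part of this listing; the stand-in below is only a sketch of that assumed interface.

# Hypothetical stand-in for Tokenizer.tokenizer, illustrating the interface that
# preprocess() and embed() depend on; the project's real tokenizer is not shown.
class tokenizer(object):
    def tokenize(self, text, cn=True):
        # cn presumably toggles Chinese-specific handling in the real class
        # (Example #3 passes cn=False); it is ignored in this sketch.
        text = text.strip()
        if not text:
            return None          # mirrors the `en is None or cn is None` guard
        return text.split()      # naive whitespace split, purely illustrative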
Example #3
0
    def embed(self, text1, text2):
        p = tokenizer()
        text1 = [p.tokenize(t, cn=False) for t in text1]
        text2 = [p.tokenize(t, cn=False) for t in text2]
        feats1, feats2 = encode_sentences(self.model, (text1, text2),
                                          test=True)
        return feats1, feats2
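A call site for embed() passes two parallel lists of raw sentences and gets back two feature matrices from encode_sentences(). The snippet below is illustrative only; `ranker` stands in for whatever object defines embed() and already carries the trained self.model.

# Hypothetical usage of Example #3; `ranker` is an instance of the class
# that defines embed() and holds the trained self.model.
queries = ['machine translation quality estimation', 'microblog sentiment']
candidates = ['neural machine translation', 'tomorrow weather forecast']
query_feats, candidate_feats = ranker.embed(queries, candidates)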
Example #4
0
def calculator():
    while True:
        expr = readInput()
        tokens = tokenizer(expr)
        parens = []
        for t in tokens:
            if t == '(' or t == ')':
                parens.append(t)
        if not parenChecker(''.join(parens)):
            print("Parentheses mismatch. Please try again..")
            continue

        # Both conditions must hold before evaluating the expression.
        if (tokens is not None) and (len(tokens) >= 1):
            postfix = infixToPostfix(tokens)
            print("postfix: ", postfix)
            result = postfixEvaluator(postfix)
            print("ANS: ", result)
        print("press any key to continue, 'n' to exit.")
        ch = input()
        if ch == 'n':
            break
Example #5
0
def calculator():
    while True:
        expr = readInput()
        tokens = tokenizer(expr)
        parens = []
        for t in tokens:
            if t == '(' or t == ')':
                parens.append(t)
        if not parenChecker(''.join(parens)):
            print("Parentheses mismatch. Please try again..")
            continue

        # Both conditions must hold before evaluating the expression.
        if (tokens is not None) and (len(tokens) >= 1):
            postfix = infixToPostfix(tokens)
            print("postfix: ", postfix)
            result = postfixEvaluator(postfix)
            print("ANS: ", result)
        print("press any key to continue, 'n' to exit.")
        ch = input()
        if ch == 'n':
            break
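Examples #4 and #5 both depend on a parenChecker() helper that receives the extracted parenthesis characters as one string and reports whether they balance. Its implementation is not part of this listing; a standard stack-depth check that satisfies that contract could look like the sketch below.

def parenChecker(parens):
    # Hypothetical reconstruction: return True when a string made up of
    # '(' and ')' characters is balanced, as the calculator examples expect.
    depth = 0
    for ch in parens:
        if ch == '(':
            depth += 1
        elif ch == ')':
            depth -= 1
            if depth < 0:        # a closing paren appeared before its opener
                return False
    return depth == 0            # every opener must eventually be closed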
Example #6
0
def main():
    # Attributes
    filename = 'Dataset.csv'

    # Reading the dataset
    intents, uniqueIntents, sentences = readDataset(filename)

    # Cleaning the sentences and tokenizing
    cleanedWords = cleaningSentences(sentences)

    # Indexing
    wordTokenizer = tokenizer(cleanedWords)

    # Maximum sentence length (longest cleaned sentence)
    length = maxLength(cleanedWords)

    # Prediction
    text = 'What to do if my business category is not in the options?'
    pred = predictions(text, wordTokenizer, length)

    # Getting final output
    getSetOfIntents(pred, uniqueIntents)
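Examples #6 and #9 treat tokenizer() as a helper that fits a word index on the cleaned sentences; Example #9 later reads wordTokenizer.word_index and forwards a filters= argument, which matches the interface of keras.preprocessing.text.Tokenizer. The wrapper below is only an assumption about what the project's helper does.

from tensorflow.keras.preprocessing.text import Tokenizer

# Hypothetical reconstruction of the tokenizer() helper used in Examples #6/#9:
# fit a Keras Tokenizer on the cleaned sentences and return it, so callers can
# read .word_index and encode sentences with texts_to_sequences().
def tokenizer(texts, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'):
    t = Tokenizer(filters=filters)
    t.fit_on_texts(texts)
    return t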
Example #7
0
    def create_index(self):
        with open(self.corpus_html, encoding='utf-8') as f:
            corpus_data = json.load(f)  # load the doc_id -> url mapping
        print('Start to parse...')
        num = 1
        # doc_id looks like "0/19": the url is stored in folder 0, file 19 inside WEBPAGES_RAW.
        for (doc_id, url) in corpus_data.items():
            id_info = doc_id.split('/')
            folder_id = id_info[0]
            file_id = id_info[1]

            file_name = "{}/{}/{}".format("WEBPAGES_RAW", folder_id, file_id)
            with open(file_name, 'r', encoding='utf-8') as html:
                soup = BeautifulSoup(html, 'lxml')
            text_info = soup.findAll(text=True)
            for text in text_info:
                parent = text.parent.name
                if (parent not in ['style', 'script', '[document]', 'meta']
                        and not isinstance(text, Comment)):
                    self.token_tf_dict = tokenizer(text.strip())
                    token_dict_items = self.token_tf_dict.items()
                    # self.inverted_index_tf is a dict of dicts: term -> {doc_id: tf}.
                    # Terms inside more "important" tags get their term frequency doubled.
                    if parent in ["head", "title", "bold"]:
                        for (token, frequency) in token_dict_items:
                            self.inverted_index_tf[token][doc_id] = frequency * 2
                    else:
                        for (token, frequency) in token_dict_items:
                            self.inverted_index_tf[token][doc_id] = frequency

            print("Parsed document " + str(num))
            num += 1

        # Convert raw term frequencies into weights: for every (term, doc) pair
        # store [1 + log10(tf), (1 + log10(tf)) * idf], and accumulate the squared
        # weighted tf for document-length normalization.
        for term in self.inverted_index_tf.keys():
            df = len(self.inverted_index_tf[term])
            idf = math.log10(self.total_num_of_doc / df)

            for docid in self.inverted_index_tf[term].keys():
                weighted_tf = 1 + math.log10(self.inverted_index_tf[term][docid])
                tf_idf_score = weighted_tf * idf
                self.inverted_index[term][docid] = [weighted_tf, tf_idf_score]
                self.document_length[docid] += math.pow(weighted_tf, 2)

        for doc in self.document_length.keys():
            self.document_length[doc] = math.sqrt(self.document_length[doc])

        # write inverted_index dict into pandas pickle file
        index_storage = pd.Series(self.inverted_index)
        index_storage.to_pickle("inverted_index__final_file.pkl")
        # write document_length dict into pandas pickle file
        document_storage = pd.Series(self.document_length)
        document_storage.to_pickle("document_length__final_file.pkl")
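For every (term, document) pair, create_index() stores the list [1 + log10(tf), (1 + log10(tf)) * log10(N / df)], and the per-document length used later for cosine normalization is the Euclidean norm of the weighted tf values. The numbers below are arbitrary, chosen only to make that arithmetic concrete.

import math

# Illustrative values, not taken from the corpus: a term occurring 3 times in a
# document, with 10 of 1000 documents containing it.
tf, N, df = 3, 1000, 10
weighted_tf = 1 + math.log10(tf)          # ~1.4771
idf = math.log10(N / df)                  # log10(100) = 2.0
tf_idf = weighted_tf * idf                # ~2.9542

# The document length accumulates weighted_tf**2 over the document's terms and
# takes the square root, exactly as in the loops above (single-term document here).
doc_length = math.sqrt(weighted_tf ** 2)
print(weighted_tf, idf, tf_idf, doc_length)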
Example #8
0
def query_process(inverted_index, document_length, query):
    tokenized_query = tokenizer(query)
    if len(tokenized_query) == 1:
        query_dict = dict()
        try:
            token = list(tokenized_query.keys())[0]
            query_dict = inverted_index[token]
        except KeyError:
            # Token not found in the index: fall through with an empty dict.
            pass

        query_result = list(query_dict.items())
        # Only sort if more than 1 url returned
        if len(query_result) > 1:
            query_result = sorted(query_result, key=lambda x: x[1], reverse=True)
        return query_result

    else:
        multi_query_dict = defaultdict(float)
        try:
            # Build the query's tf-idf vector: (1 + log10(tf)) * log10(N / df),
            # then normalize it by the query vector length.
            query_normalized_tfidf_dict = defaultdict(float)
            query_length_square = 0
            for token in tokenized_query.keys():
                tf_weight = 1 + math.log10(tokenized_query[token])
                idf = math.log10(
                    len(document_length) / len(inverted_index[token]))
                tf_idf = tf_weight * idf
                query_normalized_tfidf_dict[token] = tf_idf
                query_length_square += math.pow(tf_idf, 2)

            query_length = math.sqrt(query_length_square)
            for token in query_normalized_tfidf_dict.keys():
                query_normalized_tfidf_dict[token] /= query_length

            # Cosine similarity: accumulate normalized query weight times
            # normalized document tf for every document containing the token.
            for token in query_normalized_tfidf_dict.keys():
                doc_dict = inverted_index[token]
                for doc in doc_dict.keys():
                    normalized_tf_in_doc = (inverted_index[token][doc][0]
                                            / document_length[doc])
                    multi_query_dict[doc] += (query_normalized_tfidf_dict[token]
                                              * normalized_tf_in_doc)
        except (KeyError, ZeroDivisionError):
            # A query token missing from the index (or a zero-length query
            # vector) aborts the scoring loop.
            pass

        multi_query_result = list(multi_query_dict.items())
        # Only sort if more than 1 url returned
        if len(multi_query_result) > 1:
            multi_query_result = sorted(multi_query_result,
                                        key=lambda x: x[1],
                                        reverse=True)
        return multi_query_result
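query_process() expects the inverted index and document lengths produced by create_index() in Example #7. Assuming those were written with Series.to_pickle as shown there, a call site might look like the sketch below; the query string is arbitrary.

import pandas as pd

# Load the structures pickled by create_index(); read_pickle returns the same
# pandas Series objects that were written with Series.to_pickle.
inverted_index = pd.read_pickle('inverted_index__final_file.pkl')
document_length = pd.read_pickle('document_length__final_file.pkl')

# Each result is a (doc_id, score) pair, highest score first.
results = query_process(inverted_index, document_length, 'machine learning')
for doc_id, score in results[:20]:
    print(doc_id, score)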
Example #9
0
def main():
    # Attributes
    filename = 'Dataset.csv'

    # Reading the dataset
    intents, uniqueIntents, sentences = readDataset(filename)

    # Cleaning the sentences and tokenizing
    cleanedWords = cleaningSentences(sentences)

    # Indexing
    wordTokenizer = tokenizer(cleanedWords)
    vocabSize = len(wordTokenizer.word_index) + 1
    length = maxLength(cleanedWords)

    # Encoding the sentences
    encodedDoc = encodingDoc(wordTokenizer, cleanedWords)

    # Making equal length
    paddedDoc = paddindDoc(encodedDoc, length)

    # For intents
    # Tokenizer with filter changed
    outputTokenizer = tokenizer(uniqueIntents,
                                filters='!"#$%&()*+,-/:;<=>?@[\\]^`{|}~')

    # Encoding the intents with the unique intents
    encodedOutput = encodingDoc(outputTokenizer, intents)

    # Creating an array for each intent
    encodedOutput = np.array(encodedOutput).reshape(len(encodedOutput), 1)

    # One hot encoding (this creates a 2D array with columns = unique intents, rows = intents)
    outputOneHot = oneHotEncoder(encodedOutput)

    # Now dataset cleaning is finished!!!!
    # Splitting the dataset
    trainX, valX, trainY, valY = train_test_split(paddedDoc,
                                                  outputOneHot,
                                                  shuffle=True,
                                                  test_size=0.2)

    # Model creation
    model = createModel(vocabSize, length)

    # Checking model (Layers)
    model.compile(loss="categorical_crossentropy",
                  optimizer="adam",
                  metrics=["accuracy"])
    model.summary()

    # Start model training
    filename = 'model.h5'
    checkpoint = ModelCheckpoint(filename,
                                 monitor='val_loss',
                                 verbose=1,
                                 save_best_only=True,
                                 mode='min')

    hist = model.fit(trainX,
                     trainY,
                     epochs=200,
                     batch_size=16,
                     validation_data=(valX, valY),
                     callbacks=[checkpoint])
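After training, the ModelCheckpoint callback leaves the best weights in model.h5. Assuming a standard Keras installation (the imports below are an assumption; the example's own import block is not shown), the checkpointed model can be restored for inference like this; the dummy input only demonstrates the expected call shape.

import numpy as np
from tensorflow.keras.models import load_model

# Restore the full model checkpointed as 'model.h5' by the training run above.
best_model = load_model('model.h5')
best_model.summary()

# Inference takes an encoded, padded sentence of shape (1, length), where
# `length` is the padding length computed by maxLength() in main().
dummy = np.zeros((1, length), dtype='int32')
probs = best_model.predict(dummy)
predicted_intent_index = int(probs.argmax(axis=-1)[0])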
Example #10
0
import json
from Tokenizer import tokenizer

p = tokenizer()

#
# with open('google_results.txt','w') as f:
#     json.dump(google, f)

# with open('baidu_results.txt') as f:
#     baidu = json.load(f)
#
# for id in baidu:
#     for s in baidu[id]:
#         s['title'] = p.tokenize(s['title'])
#
# with open('baidu_results.txt', 'w') as f:
#     json.dump(baidu, f)


def num():
    with open('baidu_results.txt') as f:
        baidu = json.load(f)

    with open('google_results.txt') as f:
        google = json.load(f)

    with open('dataset.txt') as f:
        twitter = json.load(f)

    events = [