def createToken():
    if os.path.exists(TOKEN):
        with open(TOKEN) as f:
            return json.loads(f.read())
    else:
        data_set = readDataSet(TRAIN_DATA_SET)
        preProcessing(data_set)
        tokenize(data_set)
        with open(TOKEN, 'w') as f:
            f.write(json.dumps(data_set))
        return data_set
def create_inverted_index(path):
    '''Create the inverted index of the collection located at path'''
    tracemalloc.start()
    start_time = time.perf_counter()
    with open(path, 'r') as file_obj:
        lines = file_obj.readlines()
    inverted_index = {}
    docID = None
    for (i, line) in enumerate(lines):
        if line[0] == '.':
            if line[1] == 'I':
                docID = int(line[3:])
            elif (line[1] in ["T", "W", "B", "A", "N", "X", "K"]) and (docID is not None):
                # Add the current document to the postings list of every term in this section
                for word in vocabulary(tokenize(select_text_from_doc_part(i, lines))):
                    inverted_index.setdefault(word, []).append(docID)
    # De-duplicate and sort every postings list
    for word in inverted_index.keys():
        inverted_index[word] = sorted(set(inverted_index[word]))
    snapshot = tracemalloc.take_snapshot()
    print('inverted index took {} s to be created'.format(time.perf_counter() - start_time))
    print("memory stats for inverted index")
    display_top(snapshot)
    return inverted_index
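# Hedged usage sketch (not part of the original module): build the index for a
# CACM-style collection and read back one postings list. The file path is
# illustrative; the file must use the ".I"/".T"/".W"/... markup parsed above.
# index = create_inverted_index('data/cacm.all')
# print(index.get('algorithm', []))  # sorted, de-duplicated doc IDs containing the term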
def get_capital_words(text):
    tokens = tokenize(text).split()
    words = []
    for token in tokens:
        if is_capital_token(token):
            words.append(token)
    return words
def input_query_vectorial_model(docs_coordinates, vocabulary, inverted_index, path,
                                weight_function, n_results=10):
    '''
    Give the n_results document IDs closest to the user's query based on the vector-space search model
    '''
    doc_dict = split_documents(path)
    n_documents = get_number_of_documents(path)
    request_id = 0
    user_input = input('Enter a vector-space query: ')
    start_time = time.perf_counter()
    # Treat the query as if it were document number 0
    doc_dict[request_id] = tokenize(user_input.lower())
    docs_coordinates[request_id] = np.asarray([
        weight_function(request_id, term, doc_dict, inverted_index, n_documents)
        for term in vocabulary
    ])
    similarity_dict = {
        docID: compute_cos_similarity(request_id, docID, docs_coordinates)
        for docID in doc_dict.keys()
    }
    similarity_to_query = sorted(similarity_dict.items(),
                                 key=operator.itemgetter(1),
                                 reverse=True)
    # Skip the query itself (rank 0) and keep only documents with positive similarity
    results = [
        result[0] for result in similarity_to_query[1:n_results + 1] if result[1] > 0
    ]
    elapsed_time = time.perf_counter() - start_time
    print('vectorial search took {:.2f} s to run the query'.format(elapsed_time))
    return results
def test():
    # Evaluate the Bayes classifier: per-topic accuracy and overall accuracy on the test set
    params = createBayesParams()
    test_data_set = readDataSet(TEST_DATA_SET)
    preProcessing(test_data_set)
    tokenize(test_data_set)
    total = 0.0
    s = 0.0
    for topic in test_data_set.keys():
        cnt = 0.0
        for sample in test_data_set[topic].values():
            if topic == classSpecificBayes(sample, params):
                cnt += 1
        print(topic, cnt / len(test_data_set[topic]))
        total += cnt
        s += len(test_data_set[topic])
    print('total', total / s)
def processText(text):
    words = text_processing.tokenize(text)
    words = text_processing.normalize(words)
    new_words = []
    for word in words:
        if word not in stopwords_:
            new_words.append(word)
    return new_words
def _extract_question_word(self, text):
    text = tokenize(text)
    for qw in self.question_words:
        q_phrase = re.match(qw['re'], text, re.IGNORECASE)
        if q_phrase:
            q_word = qw['word']
            break
    else:
        # No known question word matched the text
        return None
    q_phrase = q_phrase.group(0)
    return re.search(q_word, q_phrase, re.IGNORECASE).group(0)
def normalized_freq_weight(term, docID, doc_dict, inverted_index, n_documents):
    '''
    Compute the normalized frequency weight
    '''
    max_term_freq = max([
        log_term_freq_in_doc(t, docID, doc_dict) for t in tokenize(doc_dict[docID])
    ])
    return log_term_freq_in_doc(term, docID, doc_dict) / max_term_freq
def compute_linear_reg(lowercase_string_of_interesting_data):
    '''
    Compute the coefficients (beta, K) of the Heaps' law regression M = K * T**beta
    from two points: the full text and its first half
    '''
    half_doc_of_interest = lowercase_string_of_interesting_data[:len(lowercase_string_of_interesting_data) // 2]
    linear_reg = []
    # (log T, log M) for the full text: T tokens, M distinct words
    T1 = len(tokenize(lowercase_string_of_interesting_data))
    M1 = len(vocabulary(tokenize(lowercase_string_of_interesting_data)))
    linear_reg.append([log(T1), log(M1)])
    # (log T, log M) for the first half of the text
    T2 = len(tokenize(half_doc_of_interest))
    M2 = len(vocabulary(tokenize(half_doc_of_interest)))
    linear_reg.append([log(T2), log(M2)])
    # Slope of the line through the two points in log-log space
    beta = (linear_reg[1][1] - linear_reg[0][1]) / (linear_reg[1][0] - linear_reg[0][0])
    # Intercept, evaluated at the midpoint of the two points
    K = exp(1 / 2 * (linear_reg[1][1] + linear_reg[0][1] - beta * (linear_reg[1][0] + linear_reg[0][0])))
    return beta, K
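# Hedged extrapolation sketch (not in the original source): with Heaps' law
# M = K * T**beta, the returned coefficients can estimate the vocabulary size of a
# larger collection. `lowercase_text` is a placeholder name for the collection string.
# beta, K = compute_linear_reg(lowercase_text)
# print('expected vocabulary for 1,000,000 tokens: {:.0f}'.format(K * 1_000_000 ** beta))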
def process_texts(texts):
    sentences = create_sentences(texts)
    preprocessed_sents = [
        text_processing.preprocess(sent) for sent in sentences
    ]
    tokenized_sents = [
        text_processing.tokenize(sent, rm_stopwords=True) for sent in preprocessed_sents
    ]
    tokenized_sents = text_processing.make_bigrams(tokenized_sents)
    return tokenized_sents
def split_documents(path):
    '''
    Create a dictionary mapping each doc ID to the list of tokens contained in that document
    '''
    with open(path, 'r') as file_obj:
        lines = file_obj.readlines()
    documents_splitted = {}
    docID = None
    for (i, line) in enumerate(lines):
        if line[0] == '.':
            if line[1] == 'I':
                docID = int(line[3:])
            elif (line[1] in ["T", "W", "B", "A", "N", "X", "K"]) and (docID is not None):
                if docID in documents_splitted:
                    documents_splitted[docID] += tokenize(select_text_from_doc_part(i, lines))
                else:
                    documents_splitted[docID] = tokenize(select_text_from_doc_part(i, lines))
    return documents_splitted
def processDescription(desc):
    """ Process products' description """
    global idx
    idx += 1
    if idx % 10000 == 0:
        print(idx)
    if type(desc) == str:
        text = desc
        text = text_processing.tokenize(text)
        text = text_processing.normalize(text)
        return text
def embed_documents(model, posts):
    embedded_posts = list()
    for i, post in enumerate(posts):
        print(100 * i / len(posts))  # progress in percent
        post = remove_links(post)
        post = remove_ats(post)
        post = remove_retweets(post)
        tokens = tokenize(post)
        tokens = remove_consecutive_phrases(tokens)
        embedded_post = embed_document(model, tokens)
        embedded_posts.append(embedded_post)
    embedded_posts.reverse()
    return embedded_posts
def vqa_processing(image_dir, annotation_file, question_file, valid_answer_set, image_set):
    print('building vqa %s dataset' % image_set)
    if image_set in ['train2014', 'val2014']:
        load_answer = True
        with open(annotation_file % image_set) as f:
            annotations = json.load(f)['annotations']
        qid2ann_dict = {ann['question_id']: ann for ann in annotations}
    else:
        load_answer = False
    with open(question_file % image_set) as f:
        questions = json.load(f)['questions']
    coco_set_name = image_set.replace('-dev', '')
    abs_image_dir = os.path.abspath(image_dir % coco_set_name)
    image_name_template = 'COCO_' + coco_set_name + '_%012d'
    dataset = [None] * len(questions)
    unk_ans_count = 0
    for n_q, q in enumerate(questions):
        if (n_q + 1) % 10000 == 0:
            print('processing %d / %d' % (n_q + 1, len(questions)))
        image_id = q['image_id']
        question_id = q['question_id']
        image_name = image_name_template % image_id
        image_path = os.path.join(abs_image_dir, image_name + '.jpg')
        question_str = q['question']
        question_tokens = text_processing.tokenize(question_str)
        iminfo = dict(image_name=image_name,
                      image_path=image_path,
                      question_id=question_id,
                      question_str=question_str,
                      question_tokens=question_tokens)
        if load_answer:
            ann = qid2ann_dict[question_id]
            all_answers, valid_answers = extract_answers(ann['answers'], valid_answer_set)
            if len(valid_answers) == 0:
                valid_answers = ['<unk>']
                unk_ans_count += 1
            iminfo['all_answers'] = all_answers
            iminfo['valid_answers'] = valid_answers
        dataset[n_q] = iminfo
    print('total %d out of %d answers are <unk>' % (unk_ans_count, len(questions)))
    return dataset
def load_nmn3_model(self):
    """Initialize model session"""
    self.sess = tf.Session(
        config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True),
                              allow_soft_placement=False,
                              log_device_placement=False))
    self.raw_question = self.question
    self.question_tokens = text_processing.tokenize(self.raw_question)
    self.assembler = Assembler(self.vocab_layout_file)
    self.vocab_dict = text_processing.VocabDict(self.vocab_question_file)
    self.answer_dict = text_processing.VocabDict(self.vocab_answer_file)
    self.num_vocab_txt = self.vocab_dict.num_vocab
    self.num_vocab_nmn = len(self.assembler.module_names)
    self.num_choices = self.answer_dict.num_vocab

    # Network inputs - placeholders
    self.input_seq_batch = tf.placeholder(tf.int32, [None, None])
    self.seq_length_batch = tf.placeholder(tf.int32, [None])
    self.image_feat_batch = tf.placeholder(tf.float32, [None, H_feat, W_feat, D_feat])
    self.expr_validity_batch = tf.placeholder(tf.bool, [None])

    # The model for testing
    self.nmn3_model_tst = NMN3Model(self.image_feat_batch,
                                    self.input_seq_batch,
                                    self.seq_length_batch,
                                    T_decoder=T_decoder,
                                    num_vocab_txt=self.num_vocab_txt,
                                    embed_dim_txt=embed_dim_txt,
                                    num_vocab_nmn=self.num_vocab_nmn,
                                    embed_dim_nmn=embed_dim_nmn,
                                    lstm_dim=lstm_dim,
                                    num_layers=num_layers,
                                    assembler=self.assembler,
                                    encoder_dropout=False,
                                    decoder_dropout=False,
                                    decoder_sampling=False,
                                    num_choices=self.num_choices)

    self.snapshot_saver = tf.train.Saver(max_to_keep=None)
    self.snapshot_saver.restore(self.sess, self.snapshot_file)
    self.input_seq = np.zeros((T_encoder, 1), np.int32)
    self.seq_length = np.zeros(1, np.int32)
    self.image_feat = np.zeros((1, H_feat, W_feat, D_feat), np.float32)
    self.image_feat = self.pool5_val
def get_capital_seqs(text):
    tokens = tokenize(text).split()
    seqs = []
    pos = 0
    while pos < len(tokens):
        if is_abbr_token(tokens[pos]):
            seqs.append(tokens[pos])
            pos += 1
        elif is_capital_token(tokens[pos]):
            end = pos + 1
            while end < len(tokens):
                if not is_capital_token(tokens[end]):
                    break
                end += 1
            seqs.append(' '.join(tokens[pos:end]))
            pos = end + 1
        else:
            pos += 1
    return seqs
__license__ = "None"
__version__ = "1.0"
__maintainer__ = "Ahirton Lopes"
__email__ = "*****@*****.**"

import text_processing as tp

'''
-----------------------------------------------------------------------------------------------------------------------
(1) TEXT PROCESSOR TEST
-----------------------------------------------------------------------------------------------------------------------
'''

docsTokenized = tp.tokenize(['Governo de SP vai pagar 25 mil a quem denunciou suspeito de matar ambulante no metro. Secretaria de Seguranca Publica fixou valor nesta quarta (18) no Diario Oficial.'])
#print(docsTokenized)

#docsTokenizedSentence = tp.tokenize_sentence(['Governo de SP vai pagar 25 mil a quem denunciou suspeito de matar ambulante no metro. Secretaria de Seguranca Publica fixou valor nesta quarta (18) no Diario Oficial.'])
#print(docsTokenizedSentence)

docsTagged = tp.tagging(docsTokenized)
print(docsTagged)

#docsRemovedTerms = tp.remove_words_with((docsTokenized), ['Governo'])
#print(docsRemovedTerms)

#docsStopwordRemoval = tp.remove_stopwords(docsTokenized)
#print(docsStopwordRemoval)

#docsStemming = tp.stemming(docsTokenized)
def write_tokenized_text(collection_path, file_path):
    print('Exporting tokenized text...')
    tokenized_text = tokenize(create_lowercase_text(collection_path))
    with open(file_path, 'w+b') as file:
        pickle.dump(tokenized_text, file, protocol=pickle.HIGHEST_PROTOCOL)
def write_vocabulary(collection_path, file_path):
    print('Exporting vocabulary...')
    vocab = vocabulary(tokenize(create_lowercase_text(collection_path)))
    with open(file_path, 'w+b') as file:
        pickle.dump(vocab, file, protocol=pickle.HIGHEST_PROTOCOL)
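# Hedged counterpart for reading the exported files back (not in the original source;
# the path is illustrative):
# with open('vocabulary.pkl', 'rb') as f:
#     vocab = pickle.load(f)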
def build_imdb(image_set, valid_answer_set, annotation_set_name=None):
    annotation_file = os.path.join(data_dir, 'vqacp_v2_%s_annotations.json')
    question_file = os.path.join(data_dir, 'vqacp_v2_%s_questions.json')
    print('building imdb %s' % image_set)
    has_answer = False
    has_gt_layout = False
    load_gt_layout = False
    load_answer = False
    annotation_set_name = (annotation_set_name
                           if annotation_set_name is not None else image_set)
    if os.path.exists(annotation_file % annotation_set_name):
        with open(annotation_file % annotation_set_name) as f:
            annotations = json.load(f)
        qid2ann_dict = {ann['question_id']: ann for ann in annotations}
        load_answer = True
    '''
    if image_set in ['train2014', 'val2014']:
        load_answer = True
        load_gt_layout = False
        with open(annotation_file % image_set) as f:
            annotations = json.load(f)["annotations"]
        qid2ann_dict = {ann['question_id']: ann for ann in annotations}
        #qid2layout_dict = np.load(gt_layout_file % image_set)[()]
    else:
        load_answer = False
        load_gt_layout = False
    '''
    with open(question_file % image_set) as f:
        questions = json.load(f)
    image_name_template = 'COCO_' + '%s' + '_%012d'
    imdb = [None] * (len(questions) + 1)
    unk_ans_count = 0
    for n_q, q in tqdm(enumerate(questions)):
        if (n_q + 1) % 10000 == 0:
            print('processing %d / %d' % (n_q + 1, len(questions)))
        image_id = q['image_id']
        question_id = q['question_id']
        coco_split = q['coco_split']
        image_name = image_name_template % (coco_split, image_id)
        feature_path = image_name + '.npy'
        question_str = q['question']
        question_tokens = text_processing.tokenize(question_str)
        iminfo = dict(image_name=image_name,
                      image_id=image_id,
                      question_id=question_id,
                      feature_path=feature_path,
                      question_str=question_str,
                      question_tokens=question_tokens)
        # load answers
        if load_answer:
            ann = qid2ann_dict[question_id]
            all_answers, valid_answers = extract_answers(ann['answers'], valid_answer_set)
            if len(valid_answers) == 0:
                valid_answers = ['<unk>']
                unk_ans_count += 1
            iminfo['all_answers'] = all_answers
            iminfo['valid_answers'] = valid_answers
            has_answer = True
        if load_gt_layout:
            has_gt_layout = True
        imdb[n_q + 1] = iminfo
    print('total %d out of %d answers are <unk>' % (unk_ans_count, len(questions)))
    header = create_header("vqa", has_answer=has_answer, has_gt_layout=has_gt_layout)
    imdb[0] = header
    return imdb
def boolean_search(client_request, inverted_index, collection_path, n_results=10):
    '''
    Give n_results document IDs corresponding to the user's query based on the boolean search model
    '''
    time_start = time.perf_counter()
    all_doc_id = get_all_doc_id(collection_path)
    request = tokenize(client_request)
    final_docs = []
    documents = []
    if len(request) == 1:
        final_docs = inverted_index[request[0]]
    # handling 'and' elements first
    while next((operator for operator in request if operator == 'and'), False):
        index_treated = []
        forbidden_doc = []
        i = request.index('and')
        if request[i - 2] == 'not':
            index_treated = [i - 2, i - 1, i]
            forbidden_doc.extend(element for element in inverted_index[request[i - 1]]
                                 if element not in forbidden_doc)
        elif request[i - 2] != 'not':
            if type(request[i - 1]) == str:  # word and ..
                doc_treated = inverted_index[request[i - 1]]
            else:  # [doc id list] and ..
                doc_treated = request[i - 1]
            documents.extend(element for element in doc_treated if element not in documents)
            index_treated = [i - 1, i]
        if request[i + 1] == 'not':
            index_treated.extend((i + 1, i + 2))
            forbidden_doc.extend(element for element in inverted_index[request[i + 2]]
                                 if element not in forbidden_doc)
        elif request[i + 1] != 'not':
            index_treated.append(i + 1)
            if type(request[i + 1]) == str:
                doc_treated = inverted_index[request[i + 1]]
            else:
                doc_treated = request[i + 1]
            if documents != []:
                documents = intersection(documents, doc_treated)
            else:
                documents.extend(element for element in doc_treated if element not in documents)
        if len(forbidden_doc) > 0:
            # Removing forbidden docs from results
            for docId in documents.copy():
                if docId in forbidden_doc:
                    documents.remove(docId)
        # Replace the processed sub-expression by its result list
        request.insert(index_treated[0], documents)
        for j in range(len(index_treated)):
            request.pop(index_treated[1])
    # Handling 'or' elements
    while next((operator for operator in request if operator == 'or'), False):
        index_treated = []
        i = request.index('or')
        if request[i - 2] == 'not':
            index_treated = [i - 2, i - 1, i]
            all_docs = all_doc_id.copy()
            for docId in inverted_index[request[i - 1]]:
                all_docs.remove(docId)
            documents.extend(element for element in all_docs if element not in documents)
        else:
            if type(request[i - 1]) == str:
                doc_treated = inverted_index[request[i - 1]]
            else:
                doc_treated = request[i - 1]
            documents.extend(element for element in doc_treated if element not in documents)
            index_treated = [i - 1, i]
        if request[i + 1] == 'not':
            index_treated.extend((i + 1, i + 2))
            all_doc = all_doc_id.copy()
            for docId in inverted_index[request[i + 2]]:
                all_doc.remove(docId)
            documents.extend(element for element in all_doc if element not in documents)
        else:
            index_treated.append(i + 1)
            if type(request[i + 1]) == str:
                doc_treated = inverted_index[request[i + 1]]
            else:
                doc_treated = request[i + 1]
            documents.extend(element for element in doc_treated if element not in documents)
        # Replace the processed sub-expression by its result list
        request.insert(index_treated[0], documents)
        for j in range(len(index_treated)):
            request.pop(index_treated[1])
    if documents:
        final_docs = documents
    time_elapsed = time.perf_counter() - time_start
    print('boolean search took {:.2f} s to run the query'.format(time_elapsed))
    return final_docs[:n_results]
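# Hedged usage sketch (names and file path are illustrative, not from the original
# source). The query string is tokenized into terms plus the literal operators
# 'and', 'or' and 'not', so tokenize() is assumed to keep those words intact.
# index = create_inverted_index('data/cacm.all')
# print(boolean_search('information and retrieval', index, 'data/cacm.all'))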
def get_ngrams(text, n=3):
    tokens = tokenize(text).split()
    res = []
    for i in range(len(tokens) - (n - 1)):
        res.append(' '.join(tokens[i:(i + n)]))
    return res
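# Illustrative call (assuming tokenize() leaves these whitespace-separated words unchanged):
# get_ngrams('the quick brown fox', n=3)
# -> ['the quick brown', 'quick brown fox']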
def processDescription(desc):
    if type(desc) == str:
        text = desc
        text = text_processing.tokenize(text)
        text = text_processing.normalize(text)
        return text