def createToken():
    if os.path.exists(TOKEN):
        with open(TOKEN) as f:
            return json.loads(f.read())
    else:
        data_set = readDataSet(TRAIN_DATA_SET)
        preProcessing(data_set)
        tokenize(data_set)
        with open(TOKEN, 'w') as f:
            f.write(json.dumps(data_set))
        return data_set
def create_inverted_index(path):
    '''Create the inverted index of the collection located at path'''
    tracemalloc.start()
    start_time = time.perf_counter()
    with open(path, 'r') as file_obj:
        lines = file_obj.readlines()
    inverted_index = {}
    docID = None
    for (i, line) in enumerate(lines):
        if line[0] == '.':
            if line[1] == 'I':
                docID = int(line[3:])
            elif (line[1] in ["T", "W", "B", "A", "N", "X", "K"]) and (docID is not None):
                # Add the current document to the postings list of every term in this section
                for word in vocabulary(tokenize(select_text_from_doc_part(i, lines))):
                    inverted_index.setdefault(word, []).append(docID)
    # De-duplicate and sort every postings list
    for word in inverted_index.keys():
        inverted_index[word] = sorted(set(inverted_index[word]))
    snapshot = tracemalloc.take_snapshot()
    print('inverted index took {} s to be created'.format(time.perf_counter() - start_time))
    print("memory stats for inverted index")
    display_top(snapshot)
    return inverted_index
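# Hedged usage sketch (not part of the original module): build the index for a
# CACM-style collection and read back one postings list. The file path is
# illustrative; the file must use the ".I"/".T"/".W"/... markup parsed above.
# index = create_inverted_index('data/cacm.all')
# print(index.get('algorithm', []))  # sorted, de-duplicated doc IDs containing the term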
def get_capital_words(text):
    tokens = tokenize(text).split()
    words = []
    for token in tokens:
        if is_capital_token(token):
            words.append(token)
    return words
def input_query_vectorial_model(docs_coordinates, vocabulary, inverted_index, path,
                                weight_function, n_results=10):
    '''
    Give the n_results document IDs closest to the user's query based on the vector-space search model
    '''
    doc_dict = split_documents(path)
    n_documents = get_number_of_documents(path)
    request_id = 0
    user_input = input('Enter a vector-space query: ')
    start_time = time.perf_counter()
    # Treat the query as if it were document number 0
    doc_dict[request_id] = tokenize(user_input.lower())
    docs_coordinates[request_id] = np.asarray([
        weight_function(request_id, term, doc_dict, inverted_index, n_documents)
        for term in vocabulary
    ])
    similarity_dict = {
        docID: compute_cos_similarity(request_id, docID, docs_coordinates)
        for docID in doc_dict.keys()
    }
    similarity_to_query = sorted(similarity_dict.items(),
                                 key=operator.itemgetter(1),
                                 reverse=True)
    # Skip the query itself (rank 0) and keep only documents with positive similarity
    results = [
        result[0] for result in similarity_to_query[1:n_results + 1] if result[1] > 0
    ]
    elapsed_time = time.perf_counter() - start_time
    print('vectorial search took {:.2f} s to run the query'.format(elapsed_time))
    return results
def test():
    # Evaluate the Bayes classifier: per-topic accuracy and overall accuracy on the test set
    params = createBayesParams()
    test_data_set = readDataSet(TEST_DATA_SET)
    preProcessing(test_data_set)
    tokenize(test_data_set)
    total = 0.0
    s = 0.0
    for topic in test_data_set.keys():
        cnt = 0.0
        for sample in test_data_set[topic].values():
            if topic == classSpecificBayes(sample, params):
                cnt += 1
        print(topic, cnt / len(test_data_set[topic]))
        total += cnt
        s += len(test_data_set[topic])
    print('total', total / s)
def processText(text):
    words = text_processing.tokenize(text)
    words = text_processing.normalize(words)
    new_words = []
    for word in words:
        if word not in stopwords_:
            new_words.append(word)
    return new_words
def _extract_question_word(self, text):
    text = tokenize(text)
    for qw in self.question_words:
        q_phrase = re.match(qw['re'], text, re.IGNORECASE)
        if q_phrase:
            q_word = qw['word']
            break
    else:
        # No known question word matched the text
        return None
    q_phrase = q_phrase.group(0)
    return re.search(q_word, q_phrase, re.IGNORECASE).group(0)
def normalized_freq_weight(term, docID, doc_dict, inverted_index, n_documents):
    '''
    Compute the normalized frequency weight
    '''
    max_term_freq = max([
        log_term_freq_in_doc(t, docID, doc_dict) for t in tokenize(doc_dict[docID])
    ])
    return log_term_freq_in_doc(term, docID, doc_dict) / max_term_freq
def compute_linear_reg(lowercase_string_of_interesting_data):
    '''
    Compute the coefficients (beta, K) of the Heaps' law regression M = K * T**beta
    from two points: the full text and its first half
    '''
    half_doc_of_interest = lowercase_string_of_interesting_data[:len(lowercase_string_of_interesting_data) // 2]
    linear_reg = []
    # (log T, log M) for the full text: T tokens, M distinct words
    T1 = len(tokenize(lowercase_string_of_interesting_data))
    M1 = len(vocabulary(tokenize(lowercase_string_of_interesting_data)))
    linear_reg.append([log(T1), log(M1)])
    # (log T, log M) for the first half of the text
    T2 = len(tokenize(half_doc_of_interest))
    M2 = len(vocabulary(tokenize(half_doc_of_interest)))
    linear_reg.append([log(T2), log(M2)])
    # Slope of the line through the two points in log-log space
    beta = (linear_reg[1][1] - linear_reg[0][1]) / (linear_reg[1][0] - linear_reg[0][0])
    # Intercept, evaluated at the midpoint of the two points
    K = exp(1 / 2 * (linear_reg[1][1] + linear_reg[0][1] - beta * (linear_reg[1][0] + linear_reg[0][0])))
    return beta, K
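# Hedged extrapolation sketch (not in the original source): with Heaps' law
# M = K * T**beta, the returned coefficients can estimate the vocabulary size of a
# larger collection. `lowercase_text` is a placeholder name for the collection string.
# beta, K = compute_linear_reg(lowercase_text)
# print('expected vocabulary for 1,000,000 tokens: {:.0f}'.format(K * 1_000_000 ** beta))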
def process_texts(texts):
    sentences = create_sentences(texts)
    preprocessed_sents = [
        text_processing.preprocess(sent) for sent in sentences
    ]
    tokenized_sents = [
        text_processing.tokenize(sent, rm_stopwords=True) for sent in preprocessed_sents
    ]
    tokenized_sents = text_processing.make_bigrams(tokenized_sents)
    return tokenized_sents
def split_documents(path):
    '''
    Create a dictionary mapping each doc ID to the list of tokens contained in that document
    '''
    with open(path, 'r') as file_obj:
        lines = file_obj.readlines()
    documents_splitted = {}
    docID = None
    for (i, line) in enumerate(lines):
        if line[0] == '.':
            if line[1] == 'I':
                docID = int(line[3:])
            elif (line[1] in ["T", "W", "B", "A", "N", "X", "K"]) and (docID is not None):
                if docID in documents_splitted:
                    documents_splitted[docID] += tokenize(select_text_from_doc_part(i, lines))
                else:
                    documents_splitted[docID] = tokenize(select_text_from_doc_part(i, lines))
    return documents_splitted
def processDescription(desc):
    """ Process products' description """
    global idx
    idx += 1
    if idx % 10000 == 0:
        print(idx)
    if type(desc) == str:
        text = desc
        text = text_processing.tokenize(text)
        text = text_processing.normalize(text)
        return text
def embed_documents(model, posts):
    embedded_posts = list()
    for i, post in enumerate(posts):
        print(100 * i / len(posts))  # progress in percent
        post = remove_links(post)
        post = remove_ats(post)
        post = remove_retweets(post)
        tokens = tokenize(post)
        tokens = remove_consecutive_phrases(tokens)
        embedded_post = embed_document(model, tokens)
        embedded_posts.append(embedded_post)
    embedded_posts.reverse()
    return embedded_posts
def vqa_processing(image_dir, annotation_file, question_file, valid_answer_set, image_set):
    print('building vqa %s dataset' % image_set)
    if image_set in ['train2014', 'val2014']:
        load_answer = True
        with open(annotation_file % image_set) as f:
            annotations = json.load(f)['annotations']
        qid2ann_dict = {ann['question_id']: ann for ann in annotations}
    else:
        load_answer = False
    with open(question_file % image_set) as f:
        questions = json.load(f)['questions']
    coco_set_name = image_set.replace('-dev', '')
    abs_image_dir = os.path.abspath(image_dir % coco_set_name)
    image_name_template = 'COCO_' + coco_set_name + '_%012d'
    dataset = [None] * len(questions)
    unk_ans_count = 0
    for n_q, q in enumerate(questions):
        if (n_q + 1) % 10000 == 0:
            print('processing %d / %d' % (n_q + 1, len(questions)))
        image_id = q['image_id']
        question_id = q['question_id']
        image_name = image_name_template % image_id
        image_path = os.path.join(abs_image_dir, image_name + '.jpg')
        question_str = q['question']
        question_tokens = text_processing.tokenize(question_str)
        iminfo = dict(image_name=image_name,
                      image_path=image_path,
                      question_id=question_id,
                      question_str=question_str,
                      question_tokens=question_tokens)
        if load_answer:
            ann = qid2ann_dict[question_id]
            all_answers, valid_answers = extract_answers(ann['answers'], valid_answer_set)
            if len(valid_answers) == 0:
                valid_answers = ['<unk>']
                unk_ans_count += 1
            iminfo['all_answers'] = all_answers
            iminfo['valid_answers'] = valid_answers
        dataset[n_q] = iminfo
    print('total %d out of %d answers are <unk>' % (unk_ans_count, len(questions)))
    return dataset
def load_nmn3_model(self):
    """Initialize model session"""
    self.sess = tf.Session(
        config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True),
                              allow_soft_placement=False,
                              log_device_placement=False))
    self.raw_question = self.question
    self.question_tokens = text_processing.tokenize(self.raw_question)
    self.assembler = Assembler(self.vocab_layout_file)
    self.vocab_dict = text_processing.VocabDict(self.vocab_question_file)
    self.answer_dict = text_processing.VocabDict(self.vocab_answer_file)
    self.num_vocab_txt = self.vocab_dict.num_vocab
    self.num_vocab_nmn = len(self.assembler.module_names)
    self.num_choices = self.answer_dict.num_vocab

    # Network inputs - placeholders
    self.input_seq_batch = tf.placeholder(tf.int32, [None, None])
    self.seq_length_batch = tf.placeholder(tf.int32, [None])
    self.image_feat_batch = tf.placeholder(tf.float32, [None, H_feat, W_feat, D_feat])
    self.expr_validity_batch = tf.placeholder(tf.bool, [None])

    # The model for testing
    self.nmn3_model_tst = NMN3Model(self.image_feat_batch,
                                    self.input_seq_batch,
                                    self.seq_length_batch,
                                    T_decoder=T_decoder,
                                    num_vocab_txt=self.num_vocab_txt,
                                    embed_dim_txt=embed_dim_txt,
                                    num_vocab_nmn=self.num_vocab_nmn,
                                    embed_dim_nmn=embed_dim_nmn,
                                    lstm_dim=lstm_dim,
                                    num_layers=num_layers,
                                    assembler=self.assembler,
                                    encoder_dropout=False,
                                    decoder_dropout=False,
                                    decoder_sampling=False,
                                    num_choices=self.num_choices)

    self.snapshot_saver = tf.train.Saver(max_to_keep=None)
    self.snapshot_saver.restore(self.sess, self.snapshot_file)
    self.input_seq = np.zeros((T_encoder, 1), np.int32)
    self.seq_length = np.zeros(1, np.int32)
    self.image_feat = np.zeros((1, H_feat, W_feat, D_feat), np.float32)
    self.image_feat = self.pool5_val
def get_capital_seqs(text):
    tokens = tokenize(text).split()
    seqs = []
    pos = 0
    while pos < len(tokens):
        if is_abbr_token(tokens[pos]):
            seqs.append(tokens[pos])
            pos += 1
        elif is_capital_token(tokens[pos]):
            end = pos + 1
            while end < len(tokens):
                if not is_capital_token(tokens[end]):
                    break
                end += 1
            seqs.append(' '.join(tokens[pos:end]))
            pos = end + 1
        else:
            pos += 1
    return seqs
__license__ = "None"
__version__ = "1.0"
__maintainer__ = "Ahirton Lopes"
__email__ = "*****@*****.**"

import text_processing as tp

'''
-----------------------------------------------------------------------------------------------------------------------
(1) TEXT PROCESSOR TEST
-----------------------------------------------------------------------------------------------------------------------
'''

docsTokenized = tp.tokenize(['Governo de SP vai pagar 25 mil a quem denunciou suspeito de matar ambulante no metro. Secretaria de Seguranca Publica fixou valor nesta quarta (18) no Diario Oficial.'])
#print(docsTokenized)

#docsTokenizedSentence = tp.tokenize_sentence(['Governo de SP vai pagar 25 mil a quem denunciou suspeito de matar ambulante no metro. Secretaria de Seguranca Publica fixou valor nesta quarta (18) no Diario Oficial.'])
#print(docsTokenizedSentence)

docsTagged = tp.tagging(docsTokenized)
print(docsTagged)

#docsRemovedTerms = tp.remove_words_with((docsTokenized), ['Governo'])
#print(docsRemovedTerms)

#docsStopwordRemoval = tp.remove_stopwords(docsTokenized)
#print(docsStopwordRemoval)

#docsStemming = tp.stemming(docsTokenized)
def write_tokenized_text(collection_path, file_path):
    print('Exporting tokenized text...')
    tokenized_text = tokenize(create_lowercase_text(collection_path))
    with open(file_path, 'w+b') as file:
        pickle.dump(tokenized_text, file, protocol=pickle.HIGHEST_PROTOCOL)
def write_vocabulary(collection_path, file_path):
    print('Exporting vocabulary...')
    vocab = vocabulary(tokenize(create_lowercase_text(collection_path)))
    with open(file_path, 'w+b') as file:
        pickle.dump(vocab, file, protocol=pickle.HIGHEST_PROTOCOL)
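# Hedged counterpart for reading the exported files back (not in the original source;
# the path is illustrative):
# with open('vocabulary.pkl', 'rb') as f:
#     vocab = pickle.load(f)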
def build_imdb(image_set, valid_answer_set, annotation_set_name=None):
    annotation_file = os.path.join(data_dir, 'vqacp_v2_%s_annotations.json')
    question_file = os.path.join(data_dir, 'vqacp_v2_%s_questions.json')
    print('building imdb %s' % image_set)
    has_answer = False
    has_gt_layout = False
    load_gt_layout = False
    load_answer = False
    annotation_set_name = (annotation_set_name
                           if annotation_set_name is not None else image_set)
    if os.path.exists(annotation_file % annotation_set_name):
        with open(annotation_file % annotation_set_name) as f:
            annotations = json.load(f)
        qid2ann_dict = {ann['question_id']: ann for ann in annotations}
        load_answer = True
    '''
    if image_set in ['train2014', 'val2014']:
        load_answer = True
        load_gt_layout = False
        with open(annotation_file % image_set) as f:
            annotations = json.load(f)["annotations"]
        qid2ann_dict = {ann['question_id']: ann for ann in annotations}
        #qid2layout_dict = np.load(gt_layout_file % image_set)[()]
    else:
        load_answer = False
        load_gt_layout = False
    '''
    with open(question_file % image_set) as f:
        questions = json.load(f)
    image_name_template = 'COCO_' + '%s' + '_%012d'
    imdb = [None] * (len(questions) + 1)
    unk_ans_count = 0
    for n_q, q in tqdm(enumerate(questions)):
        if (n_q + 1) % 10000 == 0:
            print('processing %d / %d' % (n_q + 1, len(questions)))
        image_id = q['image_id']
        question_id = q['question_id']
        coco_split = q['coco_split']
        image_name = image_name_template % (coco_split, image_id)
        feature_path = image_name + '.npy'
        question_str = q['question']
        question_tokens = text_processing.tokenize(question_str)
        iminfo = dict(image_name=image_name,
                      image_id=image_id,
                      question_id=question_id,
                      feature_path=feature_path,
                      question_str=question_str,
                      question_tokens=question_tokens)
        # load answers
        if load_answer:
            ann = qid2ann_dict[question_id]
            all_answers, valid_answers = extract_answers(ann['answers'], valid_answer_set)
            if len(valid_answers) == 0:
                valid_answers = ['<unk>']
                unk_ans_count += 1
            iminfo['all_answers'] = all_answers
            iminfo['valid_answers'] = valid_answers
            has_answer = True
        if load_gt_layout:
            has_gt_layout = True
        imdb[n_q + 1] = iminfo
    print('total %d out of %d answers are <unk>' % (unk_ans_count, len(questions)))
    header = create_header("vqa", has_answer=has_answer, has_gt_layout=has_gt_layout)
    imdb[0] = header
    return imdb
def boolean_search(client_request, inverted_index, collection_path, n_results=10):
    '''
    Give n_results document IDs corresponding to the user's query based on the boolean search model
    '''
    time_start = time.perf_counter()
    all_doc_id = get_all_doc_id(collection_path)
    request = tokenize(client_request)
    final_docs = []
    documents = []
    if len(request) == 1:
        final_docs = inverted_index[request[0]]
    # handling 'and' elements first
    while next((operator for operator in request if operator == 'and'), False):
        index_treated = []
        forbidden_doc = []
        i = request.index('and')
        if request[i - 2] == 'not':
            index_treated = [i - 2, i - 1, i]
            forbidden_doc.extend(element for element in inverted_index[request[i - 1]]
                                 if element not in forbidden_doc)
        elif request[i - 2] != 'not':
            if type(request[i - 1]) == str:  # word and ..
                doc_treated = inverted_index[request[i - 1]]
            else:  # [doc id list] and ..
                doc_treated = request[i - 1]
            documents.extend(element for element in doc_treated if element not in documents)
            index_treated = [i - 1, i]
        if request[i + 1] == 'not':
            index_treated.extend((i + 1, i + 2))
            forbidden_doc.extend(element for element in inverted_index[request[i + 2]]
                                 if element not in forbidden_doc)
        elif request[i + 1] != 'not':
            index_treated.append(i + 1)
            if type(request[i + 1]) == str:
                doc_treated = inverted_index[request[i + 1]]
            else:
                doc_treated = request[i + 1]
            if documents != []:
                documents = intersection(documents, doc_treated)
            else:
                documents.extend(element for element in doc_treated if element not in documents)
        if len(forbidden_doc) > 0:
            # Removing forbidden docs from results
            for docId in documents.copy():
                if docId in forbidden_doc:
                    documents.remove(docId)
        # Replace the processed sub-expression by its result list
        request.insert(index_treated[0], documents)
        for j in range(len(index_treated)):
            request.pop(index_treated[1])
    # Handling 'or' elements
    while next((operator for operator in request if operator == 'or'), False):
        index_treated = []
        i = request.index('or')
        if request[i - 2] == 'not':
            index_treated = [i - 2, i - 1, i]
            all_docs = all_doc_id.copy()
            for docId in inverted_index[request[i - 1]]:
                all_docs.remove(docId)
            documents.extend(element for element in all_docs if element not in documents)
        else:
            if type(request[i - 1]) == str:
                doc_treated = inverted_index[request[i - 1]]
            else:
                doc_treated = request[i - 1]
            documents.extend(element for element in doc_treated if element not in documents)
            index_treated = [i - 1, i]
        if request[i + 1] == 'not':
            index_treated.extend((i + 1, i + 2))
            all_doc = all_doc_id.copy()
            for docId in inverted_index[request[i + 2]]:
                all_doc.remove(docId)
            documents.extend(element for element in all_doc if element not in documents)
        else:
            index_treated.append(i + 1)
            if type(request[i + 1]) == str:
                doc_treated = inverted_index[request[i + 1]]
            else:
                doc_treated = request[i + 1]
            documents.extend(element for element in doc_treated if element not in documents)
        # Replace the processed sub-expression by its result list
        request.insert(index_treated[0], documents)
        for j in range(len(index_treated)):
            request.pop(index_treated[1])
    if documents:
        final_docs = documents
    time_elapsed = time.perf_counter() - time_start
    print('boolean search took {:.2f} s to run the query'.format(time_elapsed))
    return final_docs[:n_results]
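# Hedged usage sketch (names and file path are illustrative, not from the original
# source). The query string is tokenized into terms plus the literal operators
# 'and', 'or' and 'not', so tokenize() is assumed to keep those words intact.
# index = create_inverted_index('data/cacm.all')
# print(boolean_search('information and retrieval', index, 'data/cacm.all'))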
def get_ngrams(text, n=3):
    tokens = tokenize(text).split()
    res = []
    for i in range(len(tokens) - (n - 1)):
        res.append(' '.join(tokens[i:(i + n)]))
    return res
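# Illustrative call (assuming tokenize() leaves these whitespace-separated words unchanged):
# get_ngrams('the quick brown fox', n=3)
# -> ['the quick brown', 'quick brown fox']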
def processDescription(desc):
    if type(desc) == str:
        text = desc
        text = text_processing.tokenize(text)
        text = text_processing.normalize(text)
        return text