def create_ner_tensor(tokenized_context, entities, ner_textdict, return_in_tensor=True):
    # Align API entities (which carry character begin_offsets) to the tokenized
    # context, producing one NER label per token. Tokens outside any entity keep
    # the "none" label. When return_in_tensor is True the labels are vocabulary
    # indices from ner_textdict, otherwise raw tag strings.
    ner_tensor = [
        NONE_NER_POS_TOKEN if return_in_tensor else NONE_NER_POS
        for _ in range(len(tokenized_context))
    ]
    if len(entities) == 0:
        return ner_tensor

    pointer_loc = 0  # running character offset just past token i (plus one space)
    i = 0  # index into tokenized_context
    j = 0  # index into entities
    k = 0  # index into the tokens of the current entity name
    entities_name = tokenize(entities[j]['name'])
    while i < len(tokenized_context) and entities_name is not None:
        pointer_loc += len(tokenized_context[i]) + 1
        if entities[j]['begin_offset'] - pointer_loc <= 0:
            # Fuzzy-match the context token against the current entity token to
            # tolerate small tokenization/normalization differences.
            similarity = fuzz.partial_ratio(tokenized_context[i], entities_name[k])
            # print(f'{tokenized_context[i]} vs {entities_name[k]} = {similarity}')
            if similarity >= WORD_SIMILARITY_THRESHOLD:
                ner_tensor[i] = ner_textdict.word2index[entities[j]['type']] if return_in_tensor else \
                    entities[j]['type']
                k += 1
                if k == len(entities_name):
                    # Current entity fully matched; move on to the next one.
                    j += 1
                    k = 0
                    entities_name = None if j == len(entities) else tokenize(
                        entities[j]['name'])
        i += 1
    return ner_tensor
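# Illustrative usage sketch (not part of the original module). The sample values,
# the helper name _demo_create_ner_tensor, and the expected output assume that
# tokenize() splits on whitespace, NONE_NER_POS is the 'O' tag, and
# WORD_SIMILARITY_THRESHOLD is around 80.
def _demo_create_ner_tensor():
    tokenized_context = ['Barack', 'Obama', 'visited', 'Jakarta', '.']
    entities = [
        {'name': 'Barack Obama', 'type': 'PERSON', 'begin_offset': 0},
        {'name': 'Jakarta', 'type': 'LOCATION', 'begin_offset': 21},
    ]
    # With the assumptions above: ['PERSON', 'PERSON', 'O', 'LOCATION', 'O']
    return create_ner_tensor(tokenized_context, entities,
                             ner_textdict=None, return_in_tensor=False)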
def sent_tokenize(input_text):
    # Split normalized, tokenized text into sentences, cutting after every token
    # that is_end_punctuations() recognizes as sentence-final. Note that trailing
    # tokens after the last end punctuation are dropped.
    tokenized_sents = []
    tokenized_input = tokenize(normalize_string(input_text))
    sentence = []
    for token in tokenized_input:
        sentence.append(token)
        if is_end_punctuations(token):
            tokenized_sents.append(sentence.copy())
            sentence = []
    return tokenized_sents
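# Illustrative usage sketch (the helper name _demo_sent_tokenize and the sample text
# are assumptions). The exact casing and punctuation of the output depend on what
# normalize_string() and tokenize() do; roughly, two sentences come back as two
# token lists.
def _demo_sent_tokenize():
    # e.g. 'Halo dunia. Apa kabar?' -> [['halo', 'dunia', '.'], ['apa', 'kabar', '?']]
    return sent_tokenize('Halo dunia. Apa kabar?')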
def load_context_and_question(df_squad):
    # Walk the SQuAD-style frame (one row per topic, each row holding a list of
    # paragraphs) and collect the tokenized contexts and questions.
    j = 0  # number of questions processed so far, for progress logging
    contexts = []
    questions = []
    start_time = time.time()
    for taken_topic_idx in range(df_squad.shape[0]):
        for taken_context_idx in range(
                len(df_squad.iloc[taken_topic_idx]['paragraphs'])):
            i = 0
            context = df_squad.iloc[taken_topic_idx]['paragraphs'][
                taken_context_idx]['context']
            contexts.append(tokenize(normalize_string(context)))
            qas = df_squad.iloc[taken_topic_idx]['paragraphs'][
                taken_context_idx]['qas']
            while i < len(qas):
                question = qas[i]['question']
                questions.append(tokenize(normalize_string(question)))
                i += 1
                j += 1
                if j % 10000 == 0:
                    print(f'{j:04d}: {time.time() - start_time}s')
    return contexts, questions
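# Illustrative usage sketch (the helper name _demo_load_context_and_question and the
# one-row frame are assumptions). The frame mirrors the SQuAD JSON layout this
# function expects: a 'paragraphs' column whose cells are lists of
# {'context': ..., 'qas': [{'question': ...}, ...]} dicts.
def _demo_load_context_and_question():
    import pandas as pd
    df_squad = pd.DataFrame({
        'paragraphs': [[{
            'context': 'Jakarta is the capital of Indonesia.',
            'qas': [{'question': 'What is the capital of Indonesia?'}],
        }]],
    })
    contexts, questions = load_context_and_question(df_squad)
    return contexts, questions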
def preprocess_data():
    # Assumes module-level globals: `data`, a DataFrame with 'question' and
    # 'intent' columns, and `intent_mapping`, which maps intent names to ids.
    data['intent'] = data['intent'].map(intent_mapping)
    count = 0
    for i in data['question']:
        # Replace each raw question in the frame with its tokenized form.
        data.replace(i, tokenize(i), regex=True, inplace=True)
        if count % 50 == 0:
            print("CURRENT COLLECT : ", count)
        count += 1
    encode = []
    decode = []
    for q, i in data.values:
        encode.append(q)
        decode.append(i)
    return {'encode': encode, 'decode': decode}
def prepare_featured_input(input_text,
                           output_file_name='free_input.txt',
                           manual_ne_postag=False,
                           lower=False,
                           seed=42):
    is_answer_sents = []
    is_cased_sents = []
    if manual_ne_postag:
        # Let the user paste the NE/POS annotations instead of calling the APIs.
        entities = json.loads(
            input('Enter the named entities (list of dicts):').replace(
                '\'', '"'))
        postags = json.loads(
            input('Enter the postags (list of lists of lists):').replace(
                '\'', '"'))
    else:
        try:
            entities = get_ner(input_text)['entities']
            postags = get_pos_tag(input_text)['postags']
        except TimeoutError as e:
            print(
                'Unable to invoke the NE and/or POS tag API. Please check your VPN or your internet connection:',
                e)
            exit(1)

    tokenized_input = tokenize(normalize_string(input_text))
    # Convert the raw annotations into per-token label lists, then split
    # everything into sentences.
    entities = create_ner_tensor(tokenized_input, entities,
                                 ner_textdict=None, return_in_tensor=False)
    postags = create_postags_tensor(tokenized_input, postags,
                                    postags_textdict=None,
                                    return_in_tensor=False)
    tokenized_sents, entity_sents, postag_sents = sentenize(
        tokenized_input, entities, postags)

    for i in range(len(tokenized_sents)):
        # Per sentence: pick a random answer location and mark which tokens
        # contain an uppercase character.
        is_answer_sents.append(
            get_random_answer_loc(tokenized_sents[i], entity_sents[i],
                                  seed=seed))
        is_cased = []
        for j in range(len(tokenized_sents[i])):
            is_cased.append(
                '1' if any(c.isupper() for c in tokenized_sents[i][j]) else '0')
        is_cased_sents.append(is_cased)

    tokenized_sents = np.array(tokenized_sents)
    # YES, DIRTY CODE. But we have no choice: appending (and then dropping) an
    # empty list forces numpy to keep the input as an array of lists instead of
    # collapsing it into a pure 2-D array.
    is_answer_sents = np.array(is_answer_sents + [[]])[:-1]
    is_cased_sents = np.array(is_cased_sents + [[]])[:-1]
    entity_sents = np.array(entity_sents + [[]])[:-1]
    postag_sents = np.array(postag_sents + [[]])[:-1]

    is_answer_sents = np.expand_dims(is_answer_sents, axis=-1)
    is_cased_sents = np.expand_dims(is_cased_sents, axis=-1)
    entity_sents = np.expand_dims(entity_sents, axis=-1)
    postag_sents = np.expand_dims(postag_sents, axis=-1)

    # The casing feature is only needed when the text will be lowercased.
    if lower:
        features = np.concatenate(
            (is_answer_sents, is_cased_sents, entity_sents, postag_sents),
            axis=-1)
    else:
        features = np.concatenate(
            (is_answer_sents, entity_sents, postag_sents), axis=-1)

    with open(output_file_name, 'w', encoding='utf-8') as f_out:
        for i in range(len(tokenized_sents)):
            line = print_input_along_feature(tokenized_sents[i],
                                             features[i]) + '\n'
            f_out.write(line.lower() if lower else line)
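# Illustrative usage sketch (the helper name _demo_prepare_featured_input and the
# sample sentence are assumptions). With manual_ne_postag=False this requires the
# NE / POS-tag services behind get_ner() / get_pos_tag() to be reachable. Each line
# written to the output file interleaves the sentence tokens with their is-answer,
# (optional) is-cased, NER, and POS features via print_input_along_feature(),
# lowercased when lower=True.
def _demo_prepare_featured_input():
    prepare_featured_input('Soekarno adalah presiden pertama Indonesia.',
                           output_file_name='free_input.txt',
                           lower=True)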
def preprocess(speech) -> str:
    # Clean the raw speech text, tokenize it, then clean the tokenized result again.
    speech = fix(speech)
    speech = tokenize(speech)
    speech = fix(speech)
    return speech