import pandas as pd
from whoosh.qparser import QueryParser


def translate_term(
    search_term,
    language,
    schema_index=None,
    params=settings.search_params,
):
    """
    Parameters
    ----------
    - search_term (str): term to look up
    - language (str): "english" | "garifuna" | "spanish"
    - schema_index: Whoosh index to search; loaded on demand if not given
    - params (dict): search settings (limit, maxdist, prefix, languages)

    Output
    ------
    - translated results | suggestions | None
    """
    # load lazily instead of evaluating load_schema_index() once at import time
    if schema_index is None:
        schema_index = load_schema_index()

    # pre-process
    language = language.lower()
    search_term = process_search_term(search_term)
    len_term = len(search_term)

    # instantiate parser
    parser = QueryParser(fieldname=language, schema=schema_index.schema)
    query = parser.parse(search_term)

    # search
    with schema_index.searcher() as searcher:
        suggested = searcher.suggest(
            fieldname=language,
            text=search_term,
            limit=params["limit"],
            maxdist=params["maxdist"],
            prefix=min(len_term, params["prefix"]),
        )
        if suggested and (search_term not in suggested):
            return {"search_term": search_term, "suggested": suggested}

        results = searcher.search(query, limit=5)
        if results:
            # format results into one column per language
            results = pd.DataFrame(
                data=[
                    split_text(results[r]["output"], sep="|")
                    for r in range(len(results))
                ],
                columns=params["languages"],
            )
            return {"search_term": search_term, "results": results}
        return {"search_term": search_term}
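# A minimal usage sketch of translate_term and its three return shapes.
# Assumptions: the index has one Whoosh field per language, and
# settings.search_params carries the keys used above; the literal values
# below are illustrative, not taken from the source.
example_params = {
    "limit": 5, "maxdist": 2, "prefix": 2,
    "languages": ["english", "garifuna", "spanish"],
}

out = translate_term("house", "english", params=example_params)
if "results" in out:        # exact hits: a DataFrame, one column per language
    print(out["results"])
elif "suggested" in out:    # no exact hit: fuzzy spelling suggestions
    print("Did you mean:", out["suggested"])
else:                       # nothing found at all
    print("No match for", out["search_term"])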
import re


def split_text(text, max_length, recursive_until=None, step=10):
    """Split text at punctuation/whitespace so each segment fits max_length.

    A buffer accumulates text between break characters and is split whenever
    it would exceed max_length. Returns a list of strings.
    """
    if len(text) <= max_length:
        return [text]

    # break on whitespace and on ASCII/fullwidth punctuation
    breaks = list(re.finditer(r'[ \n:：,，﹐。ㄧ?？!！;；、.]', text))

    segments = []
    start_offset = 0
    for k, p in enumerate(breaks):
        if p.end() - start_offset > max_length:
            start = start_offset
            end = breaks[k - 1].end()
            segment = text[start:end]
            start_offset = breaks[k - 1].end()
            segments.append(segment)
    # keep the trailing remainder after the last break-triggered split
    if segments and start_offset < len(text):
        segments.append(text[start_offset:])

    if segments == []:
        if len(breaks) == 0:
            # text is longer than max_length but has no break characters;
            # with recursive_until=None this returns the full text unchanged
            return [text[:recursive_until]]
        # no window exceeded max_length; fall back to splitting at the middle break
        mid = len(breaks) // 2
        segments = [
            text[:breaks[mid - 1].end()],
            text[breaks[mid - 1].end():],
        ]

    if segments == []:
        raise Exception(f'something is wrong\n{max_length}\n{text}')

    for segment in segments:
        if len(segment) > max_length:
            if recursive_until:
                if max_length + step < recursive_until:
                    # retry with a looser limit until recursive_until is reached
                    return split_text(text, max_length + step,
                                      recursive_until=recursive_until)
                return [text[:recursive_until]]
                # raise Exception(f'split segment is larger than recursive limit {recursive_until}\n{segment}\n{text}')
            raise Exception(
                f'split segment is larger than {max_length}\n{segment}\n{text}'
            )
    return segments
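# A small usage sketch (the sample sentence and the 25-character limit are
# illustrative): every returned segment fits within max_length, and segments
# end at whitespace or punctuation rather than mid-word.
parts = split_text("First clause, second clause. Third sentence here.", 25)
assert all(len(p) <= 25 for p in parts)
print(parts)
# ['First clause, second ', 'clause. Third sentence ', 'here.']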
async def send_author_annotation_edit(cls, msg: Message, author_id: int, page: int):
    await cls.bot.send_chat_action(msg.chat.id, 'typing')

    annotation = await AuthorAnnotationAPI.get_by_author_id(author_id)
    if annotation is None:
        await cls.try_reply_or_send_message(
            msg.chat.id, "No information for this author!",
            reply_to_message_id=msg.message_id,
        )
        return

    # paginate the annotation body and show the requested page
    msg_parts = split_text(annotation.body)
    text = msg_parts[page - 1] + f'\n\n<code>Page {page}/{len(msg_parts)}</code>'

    keyboard = await get_keyboard(page, len(msg_parts), f"a_ann_{author_id}", only_one=True)

    await cls.bot.edit_message_text(
        text, chat_id=msg.chat.id, message_id=msg.message_id,
        parse_mode="HTML", reply_markup=keyboard,
    )
async def send_book_annotation(cls, msg: Message, book_id: int, page: int):
    await cls.bot.send_chat_action(msg.chat.id, 'typing')

    annotation = await BookAnnotationAPI.get_by_book_id(book_id)
    if annotation is None:
        await cls.try_reply_or_send_message(
            msg.chat.id, "No annotation for this book!",
            reply_to_message_id=msg.message_id,
        )
        return

    # paginate the annotation body and show the requested page
    msg_parts = split_text(annotation.body)
    text = msg_parts[page - 1] + f'\n<code>Page {page}/{len(msg_parts)}</code>'

    keyboard = await get_keyboard(page, len(msg_parts), f"b_ann_{book_id}", only_one=True)
    if keyboard is None:
        # single-page annotation: offer only a way back to the book card
        keyboard = types.InlineKeyboardMarkup()
        keyboard.row(
            types.InlineKeyboardButton("Back", callback_data=f"book_detail_{book_id}")
        )

    await cls.bot.edit_message_text(
        text, chat_id=msg.chat.id, message_id=msg.message_id,
        parse_mode="HTML", reply_markup=keyboard,
    )
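# get_keyboard is not shown in these excerpts. A plausible sketch, assuming
# aiogram v2 inline keyboards; the "<prefix>_<page>" callback_data format and
# the only_one behavior are guesses, not taken from the source.
from aiogram import types


async def get_keyboard(page: int, total: int, prefix: str, only_one: bool = False):
    if total <= 1 and only_one:
        return None  # caller builds its own fallback keyboard
    keyboard = types.InlineKeyboardMarkup()
    buttons = []
    if page > 1:
        buttons.append(types.InlineKeyboardButton(
            "<", callback_data=f"{prefix}_{page - 1}"))
    if page < total:
        buttons.append(types.InlineKeyboardButton(
            ">", callback_data=f"{prefix}_{page + 1}"))
    if buttons:
        keyboard.row(*buttons)
    return keyboard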
# %%
df_clean = []
original = []
removed = []
for i, content in enumerate(df):
    # (URL stripping and vocabulary filtering of `content` into `filtered`
    #  are elided in this excerpt; the post-cleaning cell below shows them)
    filtered = handle_space_and_newline(filtered)
    if len(filtered) > 5:
        df_clean.append(filtered)
        original.append(content)
    else:
        removed.append(content)
    if i % 1000 == 0:
        print(i)
print(len(df_clean))

# %%
df_clean_2 = []
original_2 = []
for content, origin in zip(df_clean, original):
    if len(content) > 512:
        splitted = split_text(content, 512)
        df_clean_2 += splitted
        # repeat the origin once per segment to keep the lists aligned
        for i in range(len(splitted)):
            original_2.append(origin)
    else:
        original_2.append(origin)
        df_clean_2.append(content)

print(len(max(df_clean_2, key=len)))
print(max(df_clean_2, key=len))

# %%
filename = 'cleaned_1202_v5'
with open(f"data/{filename}.json", 'w', encoding='utf-8') as f:
    f.write(json.dumps(df_clean_2, ensure_ascii=False))
with open(f"data/{filename}_original.json", 'w', encoding='utf-8') as f:
    # write the originals aligned with the split segments
    f.write(json.dumps(original_2, ensure_ascii=False))
def main():
    (infile, resdir, regexp) = Helper.parse_args()
    utils.split_text(infile, resdir, regexp)
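# This utils.split_text works at the file level, unlike the in-memory
# splitters above, and its body is not shown here. A minimal sketch of what a
# function with this signature might do -- every detail below (the part-file
# naming, splitting before each matching line) is an assumption:
import os
import re


def split_text(infile, resdir, regexp):
    """Split infile into numbered files in resdir at lines matching regexp."""
    pattern = re.compile(regexp)
    os.makedirs(resdir, exist_ok=True)
    part, chunk = 0, []

    def flush():
        nonlocal part, chunk
        if chunk:
            with open(os.path.join(resdir, f"part_{part:04d}.txt"), "w") as out:
                out.writelines(chunk)
            part, chunk = part + 1, []

    with open(infile) as f:
        for line in f:
            if pattern.search(line):
                flush()  # a matching line starts a new part
            chunk.append(line)
    flush()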
splitted_comments = [split_text(i, 40, recursive_until=128) for i in comments]

# %%
with open("pretraining_dataset.txt", 'w') as f:
    tmp = '\n\n'.join(['\n'.join(i) for i in splitted_comments])
    f.write(tmp + '\n')

# %% CUSTOM TOKEN
# count English word frequencies across the comments
tmp_freq = {}
for comment in tqdm(comments, desc="Parsing"):
    eng_vocabs = re.findall(r"[A-Za-z']+", comment)
    eng_vocabs = [i.lower().strip() for i in eng_vocabs]
    for vocab in eng_vocabs:
        if vocab in tmp_freq:
            tmp_freq[vocab] += 1
        else:
            tmp_freq[vocab] = 1
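# The manual dict counting above is equivalent to collections.Counter, which
# also makes picking the most frequent candidates for custom tokens a
# one-liner:
from collections import Counter
import re

tmp_freq = Counter(
    w.lower() for c in comments for w in re.findall(r"[A-Za-z']+", c)
)
print(tmp_freq.most_common(20))  # top candidates for custom tokens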
# %%
df_clean = []
# matches http(s)/www-style URLs so they can be stripped from each post
url_regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
for i, content in enumerate(df):
    filtered = re.sub(url_regex, '', content)
    filtered = ''.join(c for c in filtered if c in vocab)
    filtered = handle_space_and_newline(filtered)
    df_clean.append(filtered)
print(len(df_clean))

# %%
df_clean_2 = []
for content in df_clean:
    if len(content) > 511:
        # 'π' separates the author name from the post body
        name, post_content = content.split('π')
        for i in split_text(post_content, 511 - len(name)):
            df_clean_2.append(name + ' ' + i)
    else:
        df_clean_2.append(re.sub('π', ' ', content))

print(len(max(df_clean_2, key=len)))
print(max(df_clean_2, key=len))

# %%
filename = 'post_1202_v1'
with open(f"data/{filename}.json", 'w', encoding='utf-8') as f:
    f.write(json.dumps(df_clean_2, ensure_ascii=False))
# with open(f"data/{filename}_original.json", 'w', encoding='utf-8') as f:
#     f.write(json.dumps(df_clean, ensure_ascii=False))
# with open(f"data/{filename}_removed.json", 'w', encoding='utf-8') as f:
#     f.write(json.dumps(df_clean, ensure_ascii=False))
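# A tiny worked example of the 'π'-separator logic above (the sample post and
# the 511-character budget are illustrative): the author name is re-prefixed
# onto every segment so no piece loses its attribution.
sample = 'AuthorName' + 'π' + 'a very long post body ...'  # hypothetical row
name, body = sample.split('π')
for segment in split_text(body, 511 - len(name)):
    piece = name + ' ' + segment
    assert len(piece) <= 512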