import os
import sys

from ufal.udpipe import Model, Pipeline

# jload/jdump and pdump are the project's thin json/pickle (de)serialization
# helpers, imported from its utils module elsewhere in the repo.


def main():
    args = parse_args()

    # Check that everything needed for raw (not yet preprocessed) articles is specified
    preprocessed_required = {0: ['udpipe_path']}
    check_args(args, 'preprocessed', preprocessed_required)

    all_files = os.listdir(args.texts_path)
    lemmatized_dict = load_lemmatized(args.lemmatized_path, args.forced)
    new_files = [file for file in all_files if file not in lemmatized_dict]
    print('New texts: {}'.format(len(new_files)), file=sys.stderr)

    if new_files:
        if args.preprocessed:  # the files are already preprocessed
            full_lemmatized_dict = collect_texts(lemmatized_dict, args.texts_path, new_files)
        else:
            udpipe_model = Model.load(args.udpipe_path)
            process_pipeline = Pipeline(udpipe_model, 'tokenize',
                                        Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
            full_lemmatized_dict, not_lemmatized_texts = process_corpus(
                process_pipeline, lemmatized_dict, args.texts_path, new_files,
                keep_pos=args.keep_pos, keep_punct=args.keep_punct,
                keep_stops=args.keep_stops)
            if not_lemmatized_texts:
                print('Failed to parse the following files:\n{}'.format(
                    '\n'.join(not_lemmatized_texts)), file=sys.stderr)

        with open(args.lemmatized_path, 'w', encoding='utf-8') as f:
            jdump(full_lemmatized_dict, f)
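# The actual lemmatization lives in process_corpus(), which is defined in
# another module of the repo and not shown in this excerpt. Below is a minimal
# sketch of what its per-text step plausibly looks like, given the pipeline
# constructed above: run UDPipe over the raw text and collect lemma_POS tokens
# from the CoNLL-U output. The function name and the keep_* handling are
# assumptions, not the project's actual code.
def lemmatize_text(pipeline, text, keep_pos=True, keep_punct=False):
    processed = pipeline.process(text)  # tokenize, tag and parse into CoNLL-U
    lemmas = []
    for line in processed.split('\n'):
        if not line or line.startswith('#'):
            continue  # skip sentence comments and blank lines
        fields = line.split('\t')
        if len(fields) != 10 or not fields[0].isdigit():
            continue  # skip non-token, multiword-token and empty-node lines
        lemma, pos = fields[2], fields[3]
        if pos == 'PUNCT' and not keep_punct:
            continue
        lemmas.append('{}_{}'.format(lemma, pos) if keep_pos else lemma)
    return lemmas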
def main(args):
    # Is everything specified for attaching URLs?
    # (url_required, included_required and notincl_method_required are the
    # requirement dicts defined in the __main__ block further below)
    check_args(args, 'with_url', url_required)
    # Check that everything is specified for an external (out-of-corpus) article
    check_args(args, 'included', included_required)
    if not args.included:
        check_args(args, 'method', notincl_method_required)

    rating, verbosed_rating, missed_urls = search(
        args.target_article_path, args.lang, args.mapping_path,
        args.corpus_vectors_path, args.top, args.verbose, args.included,
        args.udpipe_path, args.keep_pos, args.keep_stops, args.keep_punct,
        args.method, args.embeddings_path, args.bidict_path,
        args.projection_path, args.no_duplicates, args.with_url,
        args.url_mapping_path)

    # hand the rating back to the __main__ block, which unpacks it
    return rating, verbosed_rating, missed_urls
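# check_args() is called throughout these scripts but defined elsewhere in the
# repo. A plausible sketch, inferred from the call sites: `required` maps a
# value of args.<param> to the argument names that must also be supplied when
# args.<param> takes that value (e.g. {0: ['udpipe_path']} means udpipe_path
# is mandatory when args.preprocessed is falsy). The error handling here is an
# assumption.
def check_args(args, param, required):
    value = getattr(args, param)
    missing = [name for name in required.get(value, [])
               if not getattr(args, name, None)]
    if missing:
        print('--{}={} also requires: {}'.format(param, value, ', '.join(missing)),
              file=sys.stderr)
        sys.exit(1)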
import numpy as np


def main():
    args = parse_args()

    with open(args.mapping_path, encoding='utf-8') as f:
        texts_mapping = jload(f)

    # Cross-lingual vectorization requires a direction and a path to the common vector matrix
    lang_required = {'cross': ['direction', 'common_output_vectors_path']}
    check_args(args, 'lang', lang_required)

    # Cross-lingual vectorization
    if args.lang == 'cross':
        model_required = {'model': ['src_embeddings_path', 'tar_embeddings_path'],
                          'translation': ['tar_embeddings_path', 'bidict_path'],
                          'projection': ['src_embeddings_path', 'tar_embeddings_path',
                                         'projection_path']}
        check_args(args, 'method', model_required)

        if args.method == 'translation':
            args.src_embeddings_path = args.tar_embeddings_path

        directions = {d: lang for d, lang in zip(['src', 'tar'], args.direction.split('-'))}
        print(directions)

        print('Vectorizing src')
        src_vectorized = main_onelang('src', directions['src'], texts_mapping,
                                      args.src_lemmatized_path, args.src_embeddings_path,
                                      args.src_output_vectors_path, args.method,
                                      args.no_duplicates, args.projection_path,
                                      args.bidict_path, args.forced)

        print('Vectorizing tar')
        tar_vectorized = main_onelang('tar', directions['tar'], texts_mapping,
                                      args.tar_lemmatized_path, args.tar_embeddings_path,
                                      args.tar_output_vectors_path, args.method,
                                      args.no_duplicates, args.projection_path,
                                      args.bidict_path, args.forced)

        # Assemble the common matrix and the common mapping
        common_len = len(src_vectorized) + len(tar_vectorized)
        emb_dim = src_vectorized.shape[1]
        common_vectorized = np.zeros((common_len, emb_dim))
        print(common_vectorized.shape)

        common2i = {}
        i2common = {}
        common_vectorized, common2i, i2common = to_common(
            texts_mapping, common2i, i2common, common_vectorized,
            tar_vectorized, directions['tar'], start_from=0)
        common_vectorized, common2i, i2common = to_common(
            texts_mapping, common2i, i2common, common_vectorized,
            src_vectorized, directions['src'], start_from=len(tar_vectorized))

        with open(args.common_output_vectors_path, 'wb') as f:
            pdump(common_vectorized, f)

        texts_mapping['cross2i'] = common2i
        texts_mapping['i2cross'] = i2common
        with open(args.mapping_path, 'w', encoding='utf-8') as f:
            jdump(texts_mapping, f)

    # Monolingual vectorization: no common vector matrix or common mapping is assembled
    else:
        model_required = {'model': ['src_embeddings_path'],
                          'translation': ['tar_embeddings_path', 'bidict_path'],
                          'projection': ['src_embeddings_path', 'tar_embeddings_path',
                                         'projection_path']}
        check_args(args, 'method', model_required)

        if args.method == 'translation':
            args.src_embeddings_path = args.tar_embeddings_path

        print('Vectorizing the corpus')
        src_vectorized = main_onelang('src', args.lang, texts_mapping,
                                      args.src_lemmatized_path, args.src_embeddings_path,
                                      args.src_output_vectors_path, args.method,
                                      args.no_duplicates, args.projection_path,
                                      args.bidict_path, args.forced)
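# to_common() is defined elsewhere in the repo. A sketch of what its call
# signature suggests it does: copy one language's vectors into the shared
# matrix, starting at row `start_from`, and extend the common mappings in both
# directions. The 'i2' + lang key format and the string indices are
# assumptions about the structure of texts_mapping.
def to_common(texts_mapping, common2i, i2common, common_vectorized,
              vectorized, lang, start_from=0):
    i2lang = texts_mapping['i2' + lang]
    for i, vec in enumerate(vectorized):
        title = i2lang[str(i)]  # json object keys are strings
        common_vectorized[start_from + i, :] = vec
        common2i[title] = start_from + i
        i2common[start_from + i] = title
    return common_vectorized, common2i, i2common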
    # Tail of search(): rank the corpus against the target article and build the rating
    similars = search_similar(target_article_vec, corpus_vecs)
    rating, verbosed_rating, missed_urls = make_rating(
        target_article, similars, verbose, top, included, texts_mapping,
        i2lang, with_url, article_data)

    return rating, verbosed_rating, missed_urls


if __name__ == "__main__":
    args = parse_args()

    # Is everything specified for attaching URLs?
    url_required = {1: ['url_mapping_path']}
    check_args(args, 'with_url', url_required)

    # Check that everything is specified for an external (out-of-corpus) article
    included_required = {0: ['udpipe_path', 'embeddings_path', 'method']}
    check_args(args, 'included', included_required)

    if not args.included:
        notincl_method_required = {
            'model': ['embeddings_path'],
            'translation': ['embeddings_path', 'bidict_path'],
            'projection': ['embeddings_path', 'projection_path']
        }
        check_args(args, 'method', notincl_method_required)

    rating, verbosed_rating, missed_urls = main(args)
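# search_similar() is the ranking step used above; its definition is not part
# of this excerpt. A minimal sketch, assuming the corpus matrix and the target
# vector are already unit-normalized (so a dot product equals cosine
# similarity) and that make_rating() consumes (index, score) pairs, most
# similar first:
def search_similar(target_article_vec, corpus_vecs):
    sims = corpus_vecs @ target_article_vec  # cosine similarity per corpus row
    order = np.argsort(-sims)                # indices sorted by descending score
    return [(int(i), float(sims[i])) for i in order]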