Example #1
def main(args):

    # Check that everything required for non-preprocessed articles was specified
    check_args(args, 'preprocessed', preprocessed_required)

    lemmatized_dict = load_lemmatized(args.lemmatized_path, args.forced)

    all_files = os.listdir(args.texts_path)
    new_files = [file for file in all_files if file not in lemmatized_dict]
    print('New texts: {}'.format(len(new_files)))

    if new_files:

        if args.preprocessed:  # if the files are already preprocessed
            full_lemmatized_dict = collect_texts(lemmatized_dict,
                                                 args.texts_path, new_files)

        else:

            full_lemmatized_dict, not_lemmatized_texts = process_corpus(
                args.udpipe_path,
                lemmatized_dict,
                args.texts_path,
                new_files,
                keep_pos=args.keep_pos,
                keep_punct=args.keep_punct,
                keep_stops=args.keep_stops)

            if not_lemmatized_texts:
                print('Failed to parse the following files:\n{}'.format(
                    '\n'.join(not_lemmatized_texts)))

        with open(args.lemmatized_path, 'w', encoding='utf-8') as jfile:
            jdump(full_lemmatized_dict, jfile)
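
None of these examples show check_args itself. Judging by the call sites, it takes the parsed args, the name of a switch attribute, and a dict mapping each value of that switch to the argument names that become mandatory for that value. A minimal sketch under that assumption (the error message and exit behavior are guesses, not the project's actual code):

import sys


def check_args(args, switch, required):
    """Exit with an error if any argument required for the current value
    of args.<switch> is missing (None) on the parsed namespace."""
    value = getattr(args, switch)
    for arg_name in required.get(value, []):
        if getattr(args, arg_name, None) is None:
            print('--{} is required when --{} is {!r}'.format(
                arg_name, switch, value), file=sys.stderr)
            sys.exit(1)

With preprocessed_required = {0: ['udpipe_path']} (see Example #2), this enforces that --udpipe_path is supplied whenever --preprocessed is 0, i.e. whenever the texts still need lemmatization.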
Example #2
def main():
    args = parse_args()

    preprocessed_required = {0: ['udpipe_path']}
    # Check that everything required for non-preprocessed articles was specified
    check_args(args, 'preprocessed', preprocessed_required)

    all_files = os.listdir(args.texts_path)

    lemmatized_dict = load_lemmatized(args.lemmatized_path, args.forced)

    new_files = [file for file in all_files if file not in lemmatized_dict]
    print('New texts: {}'.format(len(new_files)), file=sys.stderr)

    if new_files:

        if args.preprocessed:  # if the files are already preprocessed
            full_lemmatized_dict = collect_texts(lemmatized_dict,
                                                 args.texts_path, new_files)

        else:
            udpipe_model = Model.load(args.udpipe_path)
            process_pipeline = Pipeline(udpipe_model, 'tokenize',
                                        Pipeline.DEFAULT, Pipeline.DEFAULT,
                                        'conllu')

            full_lemmatized_dict, not_lemmatized_texts = process_corpus(
                process_pipeline,
                lemmatized_dict,
                args.texts_path,
                new_files,
                keep_pos=args.keep_pos,
                keep_punct=args.keep_punct,
                keep_stops=args.keep_stops)

            if not_lemmatized_texts:
                print('Failed to parse the following files:\n{}'.format(
                    '\n'.join(not_lemmatized_texts)), file=sys.stderr)

        with open(args.lemmatized_path, 'w', encoding='utf-8') as jfile:
            jdump(full_lemmatized_dict, jfile)
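
process_corpus is not shown here, but the Pipeline built in Example #2 suggests what its per-text step looks like: run the raw text through UDPipe, then pull lemmas (optionally with POS tags) out of the resulting CoNLL-U. A minimal sketch; the helper name lemmatize_text, the model path, and the lemma_POS output convention are assumptions, not the project's actual code:

from ufal.udpipe import Model, Pipeline


def lemmatize_text(pipeline, text, keep_pos=True):
    """Run raw text through UDPipe and return lemma or lemma_POS tokens."""
    conllu = pipeline.process(text)  # CoNLL-U output as one string
    tokens = []
    for line in conllu.split('\n'):
        if not line or line.startswith('#'):
            continue  # skip blank lines and sentence-level comments
        fields = line.split('\t')
        # token lines have 10 tab-separated fields and a plain integer ID
        if len(fields) != 10 or not fields[0].isdigit():
            continue
        lemma, pos = fields[2], fields[3]
        tokens.append('{}_{}'.format(lemma, pos) if keep_pos else lemma)
    return tokens


udpipe_model = Model.load('model.udpipe')  # path is a placeholder
process_pipeline = Pipeline(udpipe_model, 'tokenize',
                            Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
print(lemmatize_text(process_pipeline, 'Мама мыла раму.'))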
Example #3
def main(args):
    # Check that everything required for attaching URLs was specified
    check_args(args, 'with_url', url_required)

    # Check that everything required for an external article was specified
    check_args(args, 'included', included_required)

    if not args.included:
        check_args(args, 'method', notincl_method_required)

    rating, verbosed_rating, missed_urls = search(
        args.target_article_path, args.lang, args.mapping_path,
        args.corpus_vectors_path, args.top, args.verbose, args.included,
        args.udpipe_path, args.keep_pos, args.keep_stops, args.keep_punct,
        args.method, args.embeddings_path, args.bidict_path,
        args.projection_path, args.no_duplicates, args.with_url,
        args.url_mapping_path)
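
This example relies on three module-level dicts that the excerpt does not define. Example #5 below shows two of them; notincl_method_required is presumably analogous to the model_required dict defined there. A plausible reconstruction (the last dict is an assumption, mirrored from Example #5):

url_required = {1: ['url_mapping_path']}
included_required = {0: ['udpipe_path', 'embeddings_path', 'method']}
# Assumed, by analogy with model_required in Example #5:
notincl_method_required = {
    'model': ['embeddings_path'],
    'translation': ['embeddings_path', 'bidict_path'],
    'projection': ['embeddings_path', 'projection_path'],
}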
Example #4
def main():
    args = parse_args()

    with open(args.mapping_path, encoding='utf-8') as jfile:
        texts_mapping = jload(jfile)

    # cross-lingual vectorization requires a direction and a path for the shared vector matrix
    lang_required = {'cross': ['direction', 'common_output_vectors_path']}
    check_args(args, 'lang', lang_required)

    # for cross-lingual vectorization
    if args.lang == 'cross':
        model_required = {'model': ['src_embeddings_path', 'tar_embeddings_path'],
                          'translation': ['tar_embeddings_path', 'bidict_path'],
                          'projection': ['src_embeddings_path', 'tar_embeddings_path', 'projection_path']
                          }
        check_args(args, 'method', model_required)

        if args.method == 'translation':
            args.src_embeddings_path = args.tar_embeddings_path

        directions = {d: lang for d, lang in zip(['src', 'tar'], args.direction.split('-'))}
        print(directions)

        print('Vectorizing src')
        src_vectorized = main_onelang('src', directions['src'], texts_mapping,
                                      args.src_lemmatized_path, args.src_embeddings_path,
                                      args.src_output_vectors_path, args.method, args.no_duplicates,
                                      args.projection_path, args.bidict_path, args.forced)
        print('Vectorizing tar')
        tar_vectorized = main_onelang('tar', directions['tar'], texts_mapping,
                                      args.tar_lemmatized_path, args.tar_embeddings_path,
                                      args.tar_output_vectors_path, args.method, args.no_duplicates,
                                      args.projection_path, args.bidict_path, args.forced)

        # assemble the shared matrix and the shared mapping
        common_len = len(src_vectorized) + len(tar_vectorized)
        emb_dim = src_vectorized.shape[1]
        common_vectorized = np.zeros((common_len, emb_dim))
        print(common_vectorized.shape)

        common2i = {}
        i2common = {}

        common_vectorized, common2i, i2common = to_common(texts_mapping, common2i, i2common,
                                                          common_vectorized, tar_vectorized,
                                                          directions['tar'], start_from=0)
        common_vectorized, common2i, i2common = to_common(texts_mapping, common2i, i2common,
                                                          common_vectorized, src_vectorized,
                                                          directions['src'],
                                                          start_from=len(tar_vectorized))

        with open(args.common_output_vectors_path, 'wb') as pfile:
            pdump(common_vectorized, pfile)

        texts_mapping['cross2i'] = common2i
        texts_mapping['i2cross'] = i2common
        with open(args.mapping_path, 'w', encoding='utf-8') as jfile:
            jdump(texts_mapping, jfile)

    # vectorize a monolingual corpus (no shared vector matrix or shared mapping is built)
    else:
        model_required = {'model': ['src_embeddings_path'],
                          'translation': ['tar_embeddings_path', 'bidict_path'],
                          'projection': ['src_embeddings_path', 'tar_embeddings_path',
                                         'projection_path']
                          }
        check_args(args, 'method', model_required)

        if args.method == 'translation':
            args.src_embeddings_path = args.tar_embeddings_path

        print('Vectorizing the corpus')
        src_vectorized = main_onelang('src', args.lang, texts_mapping,
                                      args.src_lemmatized_path,
                                      args.src_embeddings_path, args.src_output_vectors_path,
                                      args.method, args.no_duplicates, args.projection_path,
                                      args.bidict_path, args.forced)
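
to_common is not part of this excerpt. From its call sites it copies one language's vectors into the shared matrix, starting at row start_from, and extends the shared name-to-index mappings. A minimal sketch; the 'i2' + lang key and the string-keyed indices (a JSON round-trip artifact) are assumptions, not the project's actual code:

def to_common(texts_mapping, common2i, i2common, common_vectorized,
              lang_vectorized, lang, start_from=0):
    """Copy lang_vectorized into common_vectorized at offset start_from
    and record each text's shared row index under its file name."""
    i2lang = texts_mapping['i2' + lang]  # per-language index -> file name
    for i in range(len(lang_vectorized)):
        common_i = start_from + i
        common_vectorized[common_i] = lang_vectorized[i]
        name = i2lang[str(i)]  # keys are strings after a JSON round-trip
        common2i[name] = common_i
        i2common[common_i] = name
    return common_vectorized, common2i, i2common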
Example #5
    similars = search_similar(target_article_vec, corpus_vecs)
    rating, verbosed_rating, missed_urls = make_rating(target_article,
                                                       similars, verbose, top,
                                                       included, texts_mapping,
                                                       i2lang, with_url,
                                                       article_data)

    return rating, verbosed_rating, missed_urls
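

# search_similar is defined elsewhere in this module. A minimal sketch of
# what it presumably does (an assumption, not the project's actual code):
# rank the corpus rows by cosine similarity to the target vector, best match
# first. Named *_sketch here to avoid shadowing the real function.
import numpy as np


def search_similar_sketch(target_article_vec, corpus_vecs):
    norms = (np.linalg.norm(corpus_vecs, axis=1)
             * np.linalg.norm(target_article_vec))
    norms[norms == 0] = 1.0  # guard against all-zero vectors
    sims = corpus_vecs @ target_article_vec / norms
    return sorted(enumerate(sims), key=lambda pair: -pair[1])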


if __name__ == "__main__":
    args = parse_args()

    # Check that everything required for attaching URLs was specified
    url_required = {1: ['url_mapping_path']}
    check_args(args, 'with_url', url_required)

    # Check that everything required for an external article was specified
    included_required = {0: ['udpipe_path', 'embeddings_path', 'method']}

    check_args(args, 'included', included_required)

    if not args.included:
        model_required = {
            'model': ['embeddings_path'],
            'translation': ['embeddings_path', 'bidict_path'],
            'projection': ['embeddings_path', 'projection_path']
        }
        check_args(args, 'method', model_required)

    rating, verbosed_rating, missed_urls = main(