示例#1
0
def threads_handle(file_names_list):
    """Group the given news articles into threads and print them as JSON.

    Builds a bilingual (ru/en) Vectorizer from the model files bundled
    under ``vectorizing/__data__`` next to this script, hands it to a
    ``news_thred`` instance, and dumps the formed threads to stdout.

    Args:
        file_names_list: iterable of news-article file paths to thread.
    """
    # `base_dir`, not `dir`: avoid shadowing the builtin dir().
    base_dir = os.path.dirname(__file__)
    word2vec_ru = os.path.join(base_dir, 'vectorizing', '__data__', 'model_ru.bin')
    word2vec_en = os.path.join(base_dir, 'vectorizing', '__data__', 'model_en.bin')
    pipe_ru = os.path.join(base_dir, 'vectorizing', '__data__', 'syntagru.model')
    pipe_en = os.path.join(base_dir, 'vectorizing', '__data__', 'syntagen.udpipe')

    vectorizer = vectorization.Vectorizer(pipe_en=pipe_en,
                                          model_file_en=word2vec_en,
                                          pipe_ru=pipe_ru,
                                          model_file_ru=word2vec_ru,
                                          restrict_vocab=200000,
                                          word_limit=100)

    n_t = news_thred(vectorizer)

    print(dumps(n_t.form_thread(file_names_list)))
示例#2
0
def main():
    """Index a sample of news articles and report the elapsed time.

    Wires together the vectorizer, the category clusterer and the article
    index (all models/DB live under the project root), then runs a
    multi-threaded indexing test over the first 1000 files of the 2703
    dump and prints how long it took.
    """
    # Single source of truth for the project root instead of repeating
    # the same absolute prefix in every path literal below.
    base = '/home/vova/PycharmProjects/TG'

    vectorizer = vectorization.Vectorizer(
        pipe_en=base + '/vectorizing/__data__/syntagen.udpipe',
        model_file_en=base + '/vectorizing/__data__/model_en.bin',
        pipe_ru=base + '/vectorizing/__data__/syntagru.model',
        model_file_ru=base + '/vectorizing/__data__/model_ru.bin',
        restrict_vocab=200000,
        word_limit=100)

    clusterer = news_type.news_categories(
        vectorizer,
        model_file_ru=base + '/clustering/__data__/model_ru',
        model_file_en=base + '/clustering/__data__/model_en')

    n_t = article_index(
        vectorizer=vectorizer,
        clusterer=clusterer,
        db_path=base + '/TG.db',
        index_clustering_path=base + '/clustering/__data__/index')

    files = preprocess.list_files(
        '/home/vova/PycharmProjects/TGmain/2703')[:1000]

    # n_t.clear_db()

    # with open(base + '/__data__/temp_corp', "rb") as f:
    #     corpus = pickle.loads(f.read())

    # n_t.fit_models(corpus)

    start = time.time()
    n_t.multi_thread_test(files)
    # n_t.get_test_multi()
    # print(n_t.db_get_threads(12352342325, 'ru', 'sports'))
    # for file in files:
    #     n_t.db_delete(file)
    # Report the real number of files processed: the old message
    # hard-coded "200 articles" while up to 1000 files are indexed.
    print('time for indexing %d articles %.2f' % (len(files), time.time() - start))
示例#3
0
def main():
    """Build a bilingual vectorizer, thread a 1000-article sample from
    the 2703 dump, and print the resulting threads together with the
    elapsed wall-clock time."""
    vec = vectorization.Vectorizer(
        pipe_en=
        '/home/vova/PycharmProjects/TG/vectorizing/__data__/syntagen.udpipe',
        model_file_en=
        '/home/vova/PycharmProjects/TG/vectorizing/__data__/model_en.bin',
        pipe_ru=
        '/home/vova/PycharmProjects/TG/vectorizing/__data__/syntagru.model',
        model_file_ru=
        '/home/vova/PycharmProjects/TG/vectorizing/__data__/model_ru.bin',
        restrict_vocab=200000,
        word_limit=100)
    threader = news_thred(vectorizer=vec)

    # Timing deliberately starts before list_files(), matching the
    # original measurement window.
    started_at = time.time()
    sample = preprocess.list_files(
        '/home/vova/PycharmProjects/TGmain/2703')[:1000]
    print(threader.form_thread(sample))
    print('time for threading %.2f' % (time.time() - started_at))
示例#4
0
def categories_handle(file_names_list):
    """Predict a category for each given news article and print the
    result as JSON.

    Loads the bilingual vectorizer models and the per-language category
    models from the ``__data__`` directories next to this script, then
    runs ``news_categories.predict_categories`` over the file list.

    Args:
        file_names_list: iterable of news-article file paths to classify.
    """
    # `base_dir`, not `dir`: avoid shadowing the builtin dir().
    base_dir = os.path.dirname(__file__)
    word2vec_ru = os.path.join(base_dir, 'vectorizing', '__data__', 'model_ru.bin')
    word2vec_en = os.path.join(base_dir, 'vectorizing', '__data__', 'model_en.bin')
    pipe_ru = os.path.join(base_dir, 'vectorizing', '__data__', 'syntagru.model')
    pipe_en = os.path.join(base_dir, 'vectorizing', '__data__', 'syntagen.udpipe')

    model_ru = os.path.join(base_dir, 'clustering', '__data__', 'model_ru')
    model_en = os.path.join(base_dir, 'clustering', '__data__', 'model_en')

    vectorizer = vectorization.Vectorizer(pipe_en=pipe_en,
                                          model_file_en=word2vec_en,
                                          pipe_ru=pipe_ru,
                                          model_file_ru=word2vec_ru,
                                          restrict_vocab=200000,
                                          word_limit=100)

    n_c = news_categories(vectorizer,
                          model_file_ru=model_ru,
                          model_file_en=model_en)

    print(dumps(n_c.predict_categories(file_names_list)))
示例#5
0
def main():
    """Classify a 1000-article sample from the 2703 dump and print the
    predicted categories."""
    # Assemble the vectorizer configuration once, then unpack it.
    vectorizer_kwargs = dict(
        pipe_en=
        '/home/vova/PycharmProjects/TG/vectorizing/__data__/syntagen.udpipe',
        model_file_en=
        '/home/vova/PycharmProjects/TG/vectorizing/__data__/model_en.bin',
        pipe_ru=
        '/home/vova/PycharmProjects/TG/vectorizing/__data__/syntagru.model',
        model_file_ru=
        '/home/vova/PycharmProjects/TG/vectorizing/__data__/model_ru.bin',
        restrict_vocab=200000,
        word_limit=100,
    )
    vec = vectorization.Vectorizer(**vectorizer_kwargs)

    categorizer = news_categories(
        vec,
        model_file_ru=
        '/home/vova/PycharmProjects/TG/clustering/__data__/model_ru',
        model_file_en=
        '/home/vova/PycharmProjects/TG/clustering/__data__/model_en')

    sample = preprocess.list_files(
        '/home/vova/PycharmProjects/TGmain/2703')[:1000]

    print(categorizer.predict_categories(sample))