import os
import time
from json import dumps

# vectorization, preprocess, news_type, news_thred, news_categories and
# article_index are project-local modules/classes, assumed to be imported
# from elsewhere in the repo.


def threads_handle(file_names_list):
    # Resolve model paths relative to this file so the call works from any CWD.
    base_dir = os.path.dirname(__file__)
    word2vec_ru = os.path.join(base_dir, 'vectorizing', '__data__', 'model_ru.bin')
    word2vec_en = os.path.join(base_dir, 'vectorizing', '__data__', 'model_en.bin')
    pipe_ru = os.path.join(base_dir, 'vectorizing', '__data__', 'syntagru.model')
    pipe_en = os.path.join(base_dir, 'vectorizing', '__data__', 'syntagen.udpipe')
    vectorizer = vectorization.Vectorizer(pipe_en=pipe_en,
                                          model_file_en=word2vec_en,
                                          pipe_ru=pipe_ru,
                                          model_file_ru=word2vec_ru,
                                          restrict_vocab=200000,
                                          word_limit=100)
    n_t = news_thred(vectorizer)
    # Emit the computed threads as a JSON document on stdout.
    print(dumps(n_t.form_thread(file_names_list)))
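# Hypothetical usage sketch for threads_handle: collect article files from a
# directory and thread them. The source_dir default and the '.html' filter are
# illustrative assumptions, not part of the project.
def run_threads_example(source_dir='/path/to/articles'):
    file_names = [os.path.join(source_dir, name)
                  for name in os.listdir(source_dir)
                  if name.endswith('.html')]
    threads_handle(file_names)  # prints the threads as JSON on stdout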
def main():
    vectorizer = vectorization.Vectorizer(
        pipe_en='/home/vova/PycharmProjects/TG/vectorizing/__data__/syntagen.udpipe',
        model_file_en='/home/vova/PycharmProjects/TG/vectorizing/__data__/model_en.bin',
        pipe_ru='/home/vova/PycharmProjects/TG/vectorizing/__data__/syntagru.model',
        model_file_ru='/home/vova/PycharmProjects/TG/vectorizing/__data__/model_ru.bin',
        restrict_vocab=200000,
        word_limit=100)
    clusterer = news_type.news_categories(
        vectorizer,
        model_file_ru='/home/vova/PycharmProjects/TG/clustering/__data__/model_ru',
        model_file_en='/home/vova/PycharmProjects/TG/clustering/__data__/model_en')
    n_t = article_index(
        vectorizer=vectorizer,
        clusterer=clusterer,
        db_path='/home/vova/PycharmProjects/TG/TG.db',
        index_clustering_path='/home/vova/PycharmProjects/TG/clustering/__data__/index')
    files = preprocess.list_files('/home/vova/PycharmProjects/TGmain/2703')[:1000]
    # n_t.clear_db()
    # with open('/home/vova/PycharmProjects/TG/__data__/temp_corp', "rb") as f:
    #     corpus = pickle.loads(f.read())
    # n_t.fit_models(corpus)
    start = time.time()
    n_t.multi_thread_test(files)
    # n_t.get_test_multi()
    # print(n_t.db_get_threads(12352342325, 'ru', 'sports'))
    # for file in files:
    #     n_t.db_delete(file)
    print('time for indexing %d articles: %.2f s' % (len(files), time.time() - start))
def main():
    vectorizer = vectorization.Vectorizer(
        pipe_en='/home/vova/PycharmProjects/TG/vectorizing/__data__/syntagen.udpipe',
        model_file_en='/home/vova/PycharmProjects/TG/vectorizing/__data__/model_en.bin',
        pipe_ru='/home/vova/PycharmProjects/TG/vectorizing/__data__/syntagru.model',
        model_file_ru='/home/vova/PycharmProjects/TG/vectorizing/__data__/model_ru.bin',
        restrict_vocab=200000,
        word_limit=100)
    n_t = news_thred(vectorizer=vectorizer)
    start = time.time()
    files = preprocess.list_files('/home/vova/PycharmProjects/TGmain/2703')[:1000]
    print(n_t.form_thread(files))
    print('time for threading: %.2f s' % (time.time() - start))
def categories_handle(file_names_list):
    # Resolve model paths relative to this file so the call works from any CWD.
    base_dir = os.path.dirname(__file__)
    word2vec_ru = os.path.join(base_dir, 'vectorizing', '__data__', 'model_ru.bin')
    word2vec_en = os.path.join(base_dir, 'vectorizing', '__data__', 'model_en.bin')
    pipe_ru = os.path.join(base_dir, 'vectorizing', '__data__', 'syntagru.model')
    pipe_en = os.path.join(base_dir, 'vectorizing', '__data__', 'syntagen.udpipe')
    model_ru = os.path.join(base_dir, 'clustering', '__data__', 'model_ru')
    model_en = os.path.join(base_dir, 'clustering', '__data__', 'model_en')
    vectorizer = vectorization.Vectorizer(pipe_en=pipe_en,
                                          model_file_en=word2vec_en,
                                          pipe_ru=pipe_ru,
                                          model_file_ru=word2vec_ru,
                                          restrict_vocab=200000,
                                          word_limit=100)
    n_c = news_categories(vectorizer, model_file_ru=model_ru, model_file_en=model_en)
    # Emit the predicted categories as a JSON document on stdout.
    print(dumps(n_c.predict_categories(file_names_list)))
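# Companion sketch for the categories handler, mirroring run_threads_example
# above; the directory path and file filter are again placeholders.
def run_categories_example(source_dir='/path/to/articles'):
    file_names = [os.path.join(source_dir, name)
                  for name in os.listdir(source_dir)
                  if name.endswith('.html')]
    categories_handle(file_names)  # prints predicted categories as JSON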
def main():
    vectorizer = vectorization.Vectorizer(
        pipe_en='/home/vova/PycharmProjects/TG/vectorizing/__data__/syntagen.udpipe',
        model_file_en='/home/vova/PycharmProjects/TG/vectorizing/__data__/model_en.bin',
        pipe_ru='/home/vova/PycharmProjects/TG/vectorizing/__data__/syntagru.model',
        model_file_ru='/home/vova/PycharmProjects/TG/vectorizing/__data__/model_ru.bin',
        restrict_vocab=200000,
        word_limit=100)
    n_c = news_categories(
        vectorizer,
        model_file_ru='/home/vova/PycharmProjects/TG/clustering/__data__/model_ru',
        model_file_en='/home/vova/PycharmProjects/TG/clustering/__data__/model_en')
    files = preprocess.list_files('/home/vova/PycharmProjects/TGmain/2703')[:1000]
    print(n_c.predict_categories(files))
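# Each main() above appears to live in its own driver script; the usual
# entry-point guard (assumed here, not shown in the original) would be:
if __name__ == '__main__':
    main()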