import os
import pickle
import random
import time

# Project-local helpers. The module layout is assumed from the scripts below,
# which reference vectorization.Vectorizer and preprocess.list_files directly;
# lang_detect, TextProcess, news_thred, news_categories, article_index and
# parse_article are likewise taken to come from the repo's own modules.
from preprocess import list_files, save_object
from vectorization import Vectorizer


def main():
    # Resolve model and data paths relative to this script.
    dir = os.path.dirname(__file__)
    word2vec_ru = os.path.join(dir, 'vectorizing', '__data__', 'model_ru.bin')
    word2vec_en = os.path.join(dir, 'vectorizing', '__data__', 'model_en.bin')
    pipe_ru = os.path.join(dir, 'vectorizing', '__data__', 'syntagru.model')
    pipe_en = os.path.join(dir, 'vectorizing', '__data__', 'syntagen.udpipe')
    temp_corp = os.path.join(dir, '__data__', 'temp_corp')
    test_set_file = os.path.join(dir, '__data__', 'test_set.json')

    # Sample 200,000 articles and vectorize them in parallel.
    files = list_files('/home/vova/PycharmProjects/TGmain/2703')
    files = random.sample(files, 200000)
    vectorizer = Vectorizer(model_file_en=word2vec_en, model_file_ru=word2vec_ru,
                            pipe_en=pipe_en, pipe_ru=pipe_ru,
                            restrict_vocab=200000, word_limit=100)
    vecs, _ = vectorizer.vectorize_multiple_files_multi(files)

    # Cache the vectors on disk for later training runs.
    with open(temp_corp, "wb") as f:
        f.write(pickle.dumps(vecs, pickle.HIGHEST_PROTOCOL))
def main():
    detector = lang_detect()
    files = list_files('/home/vova/PycharmProjects/TGmain/2703')[:1000]

    # Time language detection over the first 1,000 articles.
    start = time.time()
    print(detector.detect_multiple_threads(files))
    print('detection time %.2f' % (time.time() - start))
def main():
    vectorizer = Vectorizer(model_file_en='__data__/model_en.bin',
                            model_file_ru='__data__/model_ru.bin',
                            pipe_en='__data__/syntagen.udpipe',
                            pipe_ru='__data__/syntagru.model',
                            restrict_vocab=200000, word_limit=100)

    # Sample 800,000 articles, vectorize them in parallel and cache the result.
    files = random.sample(list_files('/home/vova/PycharmProjects/TGmain/2703'), 800000)
    vecs, articles = vectorizer.vectorize_multiple_files_multi(files)
    with open('/home/vova/PycharmProjects/TG/__data__/temp_corp', "wb") as f:
        f.write(pickle.dumps(vecs, pickle.HIGHEST_PROTOCOL))
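# A minimal sketch (not in the original scripts) of how the corpus pickled
# above can be read back; it mirrors the commented-out loading code in the
# indexing script further below. The helper name is hypothetical.
def load_temp_corp(path='/home/vova/PycharmProjects/TG/__data__/temp_corp'):
    with open(path, "rb") as f:
        return pickle.loads(f.read())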
def main():
    # Build a tagged text corpus from the first 10 articles and save it.
    textprocess = TextProcess(
        keep_props=True,
        modelfile_en='/home/vova/PycharmProjects/TG/vectorizing/__data__/syntagen.udpipe',
        modelfile_ru='/home/vova/PycharmProjects/TG/vectorizing/__data__/syntagru.model')
    corp = textprocess.gen_tagged_corpus(
        preprocess.list_files('/home/vova/PycharmProjects/TGmain/2703')[:10])
    preprocess.save_object(corp, '/home/vova/PycharmProjects/TG/__data__/text_corp')
def main():
    vectorizer = vectorization.Vectorizer(
        pipe_en='/home/vova/PycharmProjects/TG/vectorizing/__data__/syntagen.udpipe',
        model_file_en='/home/vova/PycharmProjects/TG/vectorizing/__data__/model_en.bin',
        pipe_ru='/home/vova/PycharmProjects/TG/vectorizing/__data__/syntagru.model',
        model_file_ru='/home/vova/PycharmProjects/TG/vectorizing/__data__/model_ru.bin',
        restrict_vocab=200000, word_limit=100)
    clusterer = news_type.news_categories(
        vectorizer,
        model_file_ru='/home/vova/PycharmProjects/TG/clustering/__data__/model_ru',
        model_file_en='/home/vova/PycharmProjects/TG/clustering/__data__/model_en')
    n_t = article_index(
        vectorizer=vectorizer,
        clusterer=clusterer,
        db_path='/home/vova/PycharmProjects/TG/TG.db',
        index_clustering_path='/home/vova/PycharmProjects/TG/clustering/__data__/index')
    files = preprocess.list_files('/home/vova/PycharmProjects/TGmain/2703')[:1000]

    # One-off maintenance steps, kept for reference:
    # n_t.clear_db()
    # with open('/home/vova/PycharmProjects/TG/__data__/temp_corp', "rb") as f:
    #     corpus = pickle.loads(f.read())
    # n_t.fit_models(corpus)

    start = time.time()
    n_t.multi_thread_test(files)
    # n_t.get_test_multi()
    # print(n_t.db_get_threads(12352342325, 'ru', 'sports'))
    # for file in files:
    #     n_t.db_delete(file)
    print('time for indexing %d articles %.2f' % (len(files), time.time() - start))
def main():
    vectorizer = vectorization.Vectorizer(
        pipe_en='/home/vova/PycharmProjects/TG/vectorizing/__data__/syntagen.udpipe',
        model_file_en='/home/vova/PycharmProjects/TG/vectorizing/__data__/model_en.bin',
        pipe_ru='/home/vova/PycharmProjects/TG/vectorizing/__data__/syntagru.model',
        model_file_ru='/home/vova/PycharmProjects/TG/vectorizing/__data__/model_ru.bin',
        restrict_vocab=200000, word_limit=100)
    n_t = news_thred(vectorizer=vectorizer)

    # Time thread formation over the first 1,000 articles.
    start = time.time()
    files = preprocess.list_files('/home/vova/PycharmProjects/TGmain/2703')[:1000]
    print(n_t.form_thread(files))
    print('time for threading %.2f' % (time.time() - start))
def main():
    vectorizer = vectorization.Vectorizer(
        pipe_en='/home/vova/PycharmProjects/TG/vectorizing/__data__/syntagen.udpipe',
        model_file_en='/home/vova/PycharmProjects/TG/vectorizing/__data__/model_en.bin',
        pipe_ru='/home/vova/PycharmProjects/TG/vectorizing/__data__/syntagru.model',
        model_file_ru='/home/vova/PycharmProjects/TG/vectorizing/__data__/model_ru.bin',
        restrict_vocab=200000, word_limit=100)
    n_c = news_categories(
        vectorizer,
        model_file_ru='/home/vova/PycharmProjects/TG/clustering/__data__/model_ru',
        model_file_en='/home/vova/PycharmProjects/TG/clustering/__data__/model_en')
    files = preprocess.list_files('/home/vova/PycharmProjects/TGmain/2703')[:1000]
    print(n_c.predict_categories(files))
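# Each of these scripts presumably ends with the standard entry-point guard
# (an assumption; the guard is not shown in the snippets themselves):
if __name__ == '__main__':
    main()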
# Resolve model and data paths relative to this script.
dir = os.path.dirname(__file__)
word2vec_ru = os.path.join(dir, 'vectorizing', '__data__', 'model_ru.bin')
word2vec_en = os.path.join(dir, 'vectorizing', '__data__', 'model_en.bin')
pipe_ru = os.path.join(dir, 'vectorizing', '__data__', 'syntagru.model')
pipe_en = os.path.join(dir, 'vectorizing', '__data__', 'syntagen.udpipe')
model_path = os.path.join(dir, 'clustering', '__data__', 'model')
data_dir = os.path.join(dir, '__data__')
data_src = '/home/vova/PycharmProjects/TGmain/2703'
target_lang = 'en'

vectorizer = Vectorizer(model_file_en=word2vec_en, model_file_ru=word2vec_ru,
                        pipe_en=pipe_en, pipe_ru=pipe_ru)
files = random.sample(list_files(data_src), 500)
test_data = []
for i, file in enumerate(files):
    print('file id : %d' % i)
    # Checkpoint the labels collected so far every 100 files.
    if i % 100 == 0:
        save_object(test_data, os.path.join(data_dir, 'labels_en'))
    vector, lang, _ = vectorizer.vectorize_article_mean(file)
    with open(file, "r") as f:  # use a context manager so the handle is closed
        article = parse_article(f)
    if lang == target_lang:
        # Print the title in bold via ANSI escape codes.
        print('\033[1m' + article['title'] + '\033[0m')