Example #1
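Vectorizes a 200,000-file random sample of articles and pickles the resulting vectors to a temporary corpus file. Paths to the word2vec models and UDPipe pipelines are resolved relative to the script; the module paths in the added imports are assumptions based on the identifiers used across these examples.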
import os
import pickle
import random

# Project-local imports; the exact module paths are assumptions based on the
# identifiers used in the other examples (preprocess, vectorization).
from preprocess import list_files
from vectorization import Vectorizer


def main():
    # Resolve model and pipeline files relative to this script.
    base_dir = os.path.dirname(__file__)
    word2vec_ru = os.path.join(base_dir, 'vectorizing', '__data__', 'model_ru.bin')
    word2vec_en = os.path.join(base_dir, 'vectorizing', '__data__', 'model_en.bin')
    pipe_ru = os.path.join(base_dir, 'vectorizing', '__data__', 'syntagru.model')
    pipe_en = os.path.join(base_dir, 'vectorizing', '__data__', 'syntagen.udpipe')
    temp_corp = os.path.join(base_dir, '__data__', 'temp_corp')

    test_set_file = os.path.join(base_dir, '__data__', 'test_set.json')  # unused in this snippet

    files = list_files('/home/vova/PycharmProjects/TGmain/2703')
    files = random.sample(files, 200000)
    vectorizer = Vectorizer(model_file_en=word2vec_en,
                            model_file_ru=word2vec_ru,
                            pipe_en=pipe_en,
                            pipe_ru=pipe_ru,
                            restrict_vocab=200000,
                            word_limit=100)

    vecs, _ = vectorizer.vectorize_multiple_files_multi(files)

    # Persist the vectors so later runs can skip re-vectorizing.
    with open(temp_corp, "wb") as f:
        pickle.dump(vecs, f, pickle.HIGHEST_PROTOCOL)
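To reload the pickled corpus in a later run (the commented-out lines in Example #5 use the same pattern), a minimal sketch:

with open(temp_corp, "rb") as f:
    vecs = pickle.load(f)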
Example #2
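Times multi-threaded language detection over the first 1,000 article files.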
import time

# Module paths are assumptions; lang_detect and list_files follow the
# identifiers in the source.
from lang_detect import lang_detect
from preprocess import list_files


def main():
    detector = lang_detect()

    files = list_files('/home/vova/PycharmProjects/TGmain/2703')[:1000]

    start = time.time()
    print(detector.detect_multiple_threads(files))
    print('detection time %.2fs' % (time.time() - start))
Example #3
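The same vectorize-and-pickle flow as Example #1, with relative model paths and an 800,000-file sample.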
import pickle
import random

from preprocess import list_files
from vectorization import Vectorizer


def main():
    # The relative '__data__/...' paths assume the script runs from the project root.
    vectorizer = Vectorizer(model_file_en='__data__/model_en.bin', model_file_ru='__data__/model_ru.bin',
                            pipe_en='__data__/syntagen.udpipe', pipe_ru='__data__/syntagru.model',
                            restrict_vocab=200000, word_limit=100)

    files = random.sample(list_files('/home/vova/PycharmProjects/TGmain/2703'), 800000)
    vecs, articles = vectorizer.vectorize_multiple_files_multi(files)

    with open('/home/vova/PycharmProjects/TG/__data__/temp_corp', "wb") as f:
        pickle.dump(vecs, f, pickle.HIGHEST_PROTOCOL)
Example #4
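Builds a tagged corpus from ten sample articles with TextProcess and saves it via preprocess.save_object.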
import preprocess
# Module path for TextProcess is an assumption based on the identifier below.
from textprocess import TextProcess


def main():
    textprocess = TextProcess(
        keep_props=True,
        modelfile_en='/home/vova/PycharmProjects/TG/vectorizing/__data__/syntagen.udpipe',
        modelfile_ru='/home/vova/PycharmProjects/TG/vectorizing/__data__/syntagru.model')

    corp = textprocess.gen_tagged_corpus(
        preprocess.list_files('/home/vova/PycharmProjects/TGmain/2703')[:10])
    preprocess.save_object(corp,
                           '/home/vova/PycharmProjects/TG/__data__/text_corp')
Example #5
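Wires the Vectorizer, the news_type category clusterer, and the article_index store together, then times multi-threaded indexing of 1,000 files. The commented-out lines cover fitting the models from the pickled corpus and assorted database utilities.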
import time

import news_type
import preprocess
import vectorization
# Module path for article_index is an assumption based on the identifier below.
from article_index import article_index


def main():
    vectorizer = vectorization.Vectorizer(
        pipe_en='/home/vova/PycharmProjects/TG/vectorizing/__data__/syntagen.udpipe',
        model_file_en='/home/vova/PycharmProjects/TG/vectorizing/__data__/model_en.bin',
        pipe_ru='/home/vova/PycharmProjects/TG/vectorizing/__data__/syntagru.model',
        model_file_ru='/home/vova/PycharmProjects/TG/vectorizing/__data__/model_ru.bin',
        restrict_vocab=200000,
        word_limit=100)

    clusterer = news_type.news_categories(
        vectorizer,
        model_file_ru='/home/vova/PycharmProjects/TG/clustering/__data__/model_ru',
        model_file_en='/home/vova/PycharmProjects/TG/clustering/__data__/model_en')

    n_t = article_index(
        vectorizer=vectorizer,
        clusterer=clusterer,
        db_path='/home/vova/PycharmProjects/TG/TG.db',
        index_clustering_path='/home/vova/PycharmProjects/TG/clustering/__data__/index')

    files = preprocess.list_files(
        '/home/vova/PycharmProjects/TGmain/2703')[:1000]

    # n_t.clear_db()

    # with open('/home/vova/PycharmProjects/TG/__data__/temp_corp', "rb") as f:
    #     corpus = pickle.loads(f.read())

    # n_t.fit_models(corpus)

    start = time.time()
    n_t.multi_thread_test(files)
    # n_t.get_test_multi()
    # print(n_t.db_get_threads(12352342325, 'ru', 'sports'))
    # for file in files:
    #     n_t.db_delete(file)
    print('time for indexing %d articles %.2fs' % (len(files), time.time() - start))
Example #6
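Times news-thread formation over 1,000 files with news_thred.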
import time

import preprocess
import vectorization
# Module path for news_thred is an assumption; the class name follows the source.
from news_thred import news_thred


def main():
    vectorizer = vectorization.Vectorizer(
        pipe_en='/home/vova/PycharmProjects/TG/vectorizing/__data__/syntagen.udpipe',
        model_file_en='/home/vova/PycharmProjects/TG/vectorizing/__data__/model_en.bin',
        pipe_ru='/home/vova/PycharmProjects/TG/vectorizing/__data__/syntagru.model',
        model_file_ru='/home/vova/PycharmProjects/TG/vectorizing/__data__/model_ru.bin',
        restrict_vocab=200000,
        word_limit=100)
    n_t = news_thred(vectorizer=vectorizer)

    start = time.time()
    files = preprocess.list_files(
        '/home/vova/PycharmProjects/TGmain/2703')[:1000]
    print(n_t.form_thread(files))
    print('time for threading %.2fs' % (time.time() - start))
Example #7
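Predicts news categories for 1,000 files with the news_categories clusterer.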
import preprocess
import vectorization
from news_type import news_categories


def main():
    vectorizer = vectorization.Vectorizer(
        pipe_en='/home/vova/PycharmProjects/TG/vectorizing/__data__/syntagen.udpipe',
        model_file_en='/home/vova/PycharmProjects/TG/vectorizing/__data__/model_en.bin',
        pipe_ru='/home/vova/PycharmProjects/TG/vectorizing/__data__/syntagru.model',
        model_file_ru='/home/vova/PycharmProjects/TG/vectorizing/__data__/model_ru.bin',
        restrict_vocab=200000,
        word_limit=100)

    n_c = news_categories(
        vectorizer,
        model_file_ru='/home/vova/PycharmProjects/TG/clustering/__data__/model_ru',
        model_file_en='/home/vova/PycharmProjects/TG/clustering/__data__/model_en')

    files = preprocess.list_files(
        '/home/vova/PycharmProjects/TGmain/2703')[:1000]

    print(n_c.predict_categories(files))
Example #8
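A module-level labeling script: it vectorizes 500 random articles and prints the title of each article in the target language in bold for manual labeling, checkpointing test_data to labels_en every 100 files. The snippet ends mid-loop; the code that actually appends labels to test_data is not shown.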
import os
import random

# parse_article's module is an assumption; the other imports follow the
# identifiers used in the earlier examples.
from preprocess import list_files, parse_article, save_object
from vectorization import Vectorizer

base_dir = os.path.dirname(__file__)
word2vec_ru = os.path.join(base_dir, 'vectorizing', '__data__', 'model_ru.bin')
word2vec_en = os.path.join(base_dir, 'vectorizing', '__data__', 'model_en.bin')
pipe_ru = os.path.join(base_dir, 'vectorizing', '__data__', 'syntagru.model')
pipe_en = os.path.join(base_dir, 'vectorizing', '__data__', 'syntagen.udpipe')
model_path = os.path.join(base_dir, 'clustering', '__data__', 'model')
data_dir = os.path.join(base_dir, '__data__')

data_src = '/home/vova/PycharmProjects/TGmain/2703'

target_lang = 'en'

vectorizer = Vectorizer(model_file_en=word2vec_en, model_file_ru=word2vec_ru,
                        pipe_en=pipe_en, pipe_ru=pipe_ru)

files = random.sample(list_files(data_src), 500)

test_data = []

i = 0

for file in files:
    print('file id : %d' % i)
    # Checkpoint the collected labels every 100 files.
    if i % 100 == 0:
        save_object(test_data, os.path.join(data_dir, 'labels_en'))
    i += 1

    vector, lang, _ = vectorizer.vectorize_article_mean(file)
    with open(file, "r") as f:
        article = parse_article(f)
    if lang == target_lang:
        print('\033[1m' + article['title'] + '\033[0m')
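A closing step, as a sketch only: assuming the elided code appends entries to test_data inside the loop, a final call to the same save_object helper would persist anything gathered after the last checkpoint.

# Persist labels collected since the last 100-file checkpoint (assumes the
# loop appends to test_data in code not shown above).
save_object(test_data, os.path.join(data_dir, 'labels_en'))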