Example #1
def cmd_threads(src_dir):
    # prepare samples
    joblist = list(simpletools.collect_files(src_dir))

    # process with multiprocessing
    X, vecToTitlesFiles = threads_merge(
        simpletools.parallel_processor(threads_worker, joblist, split=True))
    clusters = hcluster.fclusterdata(X, config.CLUSTERIZATION_THRESHOLD, criterion="distance", metric="cosine")

    # prepare result
    result = []
    # sort (index, label) pairs by cluster label so each cluster's members
    # are contiguous and can be emitted in a single pass
    groupedClusters = sorted(enumerate(clusters), key=lambda tup: tup[1])
    currentCluster = 0
    currentGroup = {}
    for i, cluster in groupedClusters:
        vec = tuple(X[i])
        title = vecToTitlesFiles[vec]["title"]
        filename = vecToTitlesFiles[vec]["filename"]
        if cluster != currentCluster:
            if currentGroup:
                result.append(currentGroup)
            currentGroup = {
                "title": title,
                "articles": [filename]
            }
        else:
            currentGroup["articles"].append(filename)
        currentCluster = cluster
    if currentGroup:
        result.append(currentGroup)

    return result
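
For context, hcluster is presumably scipy.cluster.hierarchy, whose fclusterdata matches the call above. A minimal standalone sketch of that flat-clustering step, with a made-up threshold standing in for config.CLUSTERIZATION_THRESHOLD:

import numpy as np
import scipy.cluster.hierarchy as hcluster

# Three toy document vectors; the first two point in similar directions.
X = np.array([[1.0, 0.0, 0.1],
              [0.9, 0.1, 0.0],
              [0.0, 1.0, 0.0]])

# fclusterdata builds a hierarchical clustering and cuts it at the given
# cosine-distance threshold, returning one flat cluster label per row.
labels = hcluster.fclusterdata(X, 0.5, criterion="distance", metric="cosine")
print(labels)  # e.g. [1 1 2]: the first two rows land in the same cluster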
Example #2
def cmd_top(src_dir):
    # prepare samples
    joblist = list(simpletools.collect_files(src_dir))

    # process with multiprocessing
    categoriesToVecs, vecToTitlesFiles = top_merge(
        simpletools.parallel_processor(top_worker, joblist, split=True))

    # prepare result
    result = []
    clusterInfo = []
    for category, X in categoriesToVecs.items():
        # clustering needs at least two samples; singleton categories are skipped
        if X.shape[0] > 1:
            clusters = hcluster.fclusterdata(X,
                                             config.CLUSTERIZATION_THRESHOLD,
                                             criterion="distance",
                                             metric="cosine")
            counts = Counter(clusters)
            for cluster in counts:
                indexes = [i for i, e in enumerate(clusters) if e == cluster]
                vectors = np.take(X, indexes, axis=0)
                clusterInfo.append({
                    'category': config.invertedClasses[category],
                    'title': vecToTitlesFiles[tuple(vectors[0])]["title"],
                    'articles': [
                        vecToTitlesFiles[tuple(vector)]["filename"]
                        for vector in vectors
                    ]
                })

    result.append({
        "category": "any",
        "threads": sorted(clusterInfo, key=lambda i: len(i['articles']), reverse=True)
    })
    for category in config.classes:
        threads = [x for x in clusterInfo if x["category"] == category]
        threads = [{'title': t['title'], 'articles': t['articles']} for t in threads]
        result.append({
            "category": category,
            "threads": sorted(threads, key=lambda i: len(i['articles']), reverse=True)
        })
    return result
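
The per-cluster row selection above pairs Counter over the label array with np.take; here is that idiom in isolation:

import numpy as np
from collections import Counter

X = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0]])
clusters = [1, 1, 2]  # flat labels, as returned by fclusterdata

for cluster in Counter(clusters):
    indexes = [i for i, e in enumerate(clusters) if e == cluster]
    vectors = np.take(X, indexes, axis=0)  # rows belonging to this cluster
    print(cluster, vectors.shape)  # 1 (2, 2), then 2 (1, 2)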
Example #3
def cmd_news(src_dir):
    # prepare samples
    joblist = simpletools.collect_files(src_dir)

    # process with multiprocessing
    with mp.Pool(PROCESSES_COUNT, init_factory) as pool:
        # map blocks until every job has finished; the with-block then
        # shuts the pool down, so explicit close()/join() are unneeded
        data = pool.map(is_news_worker, joblist)

    # prepare result
    news_articles = [fname for fname in data if fname]
    return {"articles": news_articles}
Example #4
def cmd_categories(src_dir):
    # prepare samples
    joblist = simpletools.collect_files(src_dir)

    # process with multiprocessing
    data = simpletools.parallel_processor(categories_worker, joblist)

    # prepare result
    article_by_categories = {cat: [] for cat in config.classes}
    for cat_id, filename in data:
        category = config.invertedClasses.get(cat_id)
        if filename and category:
            article_by_categories[category].append(filename)

    return [{"category": cat, "articles": articles} for cat, articles in article_by_categories.items()]
Example #5
def cmd_languages(src_dir):
    # prepare samples
    joblist = simpletools.collect_files(src_dir)

    # multiprocessing
    data = simpletools.parallel_processor(detect_lang_worker, joblist)

    # prepare result
    pages_by_lang = {}
    for lang, fname in data:
        if lang not in pages_by_lang:
            pages_by_lang[lang] = []
        pages_by_lang[lang].append(fname)

    return pages_by_lang
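
The bucketing loop above can also be written with collections.defaultdict, which removes the membership check:

from collections import defaultdict

def group_by_lang(data):
    # data is an iterable of (lang, filename) pairs, as produced by the
    # detect_lang_worker calls above.
    pages_by_lang = defaultdict(list)
    for lang, fname in data:
        pages_by_lang[lang].append(fname)
    return dict(pages_by_lang)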