import multiprocessing as mp
from collections import Counter

import numpy as np
import scipy.cluster.hierarchy as hcluster

# simpletools, config, PROCESSES_COUNT and the *_worker / *_merge helpers
# are assumed to be defined elsewhere in this package.


def cmd_threads(src_dir):
    # prepare samples
    joblist = list(simpletools.collect_files(src_dir))

    # process with multiprocessing
    X, vecToTitlesFiles = threads_merge(
        simpletools.parallel_processor(threads_worker, joblist, split=True))

    clusters = hcluster.fclusterdata(
        X, config.CLUSTERIZATION_THRESHOLD,
        criterion="distance", metric="cosine")

    # prepare result: walk the items sorted by cluster id and emit
    # one {"title", "articles"} group per cluster
    result = []
    groupedClusters = sorted(enumerate(clusters), key=lambda tup: tup[1])
    currentCluster = 0
    currentGroup = {}
    for i, cluster in groupedClusters:
        vec = tuple(X[i])
        title = vecToTitlesFiles[vec]["title"]
        filename = vecToTitlesFiles[vec]["filename"]
        if cluster != currentCluster:
            if currentGroup:
                result.append(currentGroup)
            currentGroup = {
                "title": title,
                "articles": [filename]
            }
        else:
            currentGroup["articles"].append(filename)
        currentCluster = cluster
    if currentGroup:
        result.append(currentGroup)
    return result
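
# Illustrative sketch (not part of the pipeline): the flat-clustering call
# cmd_threads makes, on toy vectors. The 0.1 threshold is an assumption
# standing in for config.CLUSTERIZATION_THRESHOLD.
def _demo_fclusterdata():
    X = np.array([
        [1.0, 0.0, 0.0],
        [0.99, 0.01, 0.0],  # nearly parallel to row 0: tiny cosine distance
        [0.0, 1.0, 0.0],
    ])
    labels = hcluster.fclusterdata(
        X, 0.1, criterion="distance", metric="cosine")
    # one cluster id per row, e.g. array([1, 1, 2]): rows 0 and 1 land in
    # the same thread, row 2 in its own
    return labels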

def cmd_top(src_dir):
    # prepare samples
    joblist = list(simpletools.collect_files(src_dir))

    # process with multiprocessing
    categoriesToVecs, vecToTitlesFiles = top_merge(
        simpletools.parallel_processor(top_worker, joblist, split=True))

    # prepare result
    result = []
    clusterInfo = []
    for category in categoriesToVecs:
        X = categoriesToVecs[category]
        if X.shape[0] > 1:
            clusters = hcluster.fclusterdata(
                X, config.CLUSTERIZATION_THRESHOLD,
                criterion="distance", metric="cosine")
            counts = Counter(clusters)
            for cluster in counts:
                info = {
                    'category': config.invertedClasses[category],
                    'title': "",
                    'articles': []
                }
                indexes = [i for i, e in enumerate(clusters) if e == cluster]
                vectors = np.take(X, indexes, axis=0)
                info["title"] = vecToTitlesFiles[tuple(vectors[0])]["title"]
                info["articles"] = [
                    vecToTitlesFiles[tuple(vector)]["filename"]
                    for vector in vectors
                ]
                clusterInfo.append(info)

    result.append({
        "category": "any",
        "threads": sorted(clusterInfo,
                          key=lambda i: len(i['articles']), reverse=True)
    })
    for category in config.classes:
        threads = list(filter(lambda x: x["category"] == category, clusterInfo))
        threads = [{
            'title': t['title'],
            'articles': t['articles']
        } for t in threads]
        result.append({
            "category": category,
            "threads": sorted(threads,
                              key=lambda i: len(i['articles']), reverse=True)
        })
    return result
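
# Illustrative sketch (toy labels standing in for fclusterdata output):
# how cmd_top recovers the member rows of each flat cluster with
# Counter + np.take before looking up their titles and filenames.
def _demo_cluster_members():
    X = np.array([[0.0, 1.0], [0.1, 0.9], [1.0, 0.0], [0.9, 0.1]])
    clusters = [1, 1, 2, 2]
    members_by_cluster = {}
    for cluster in Counter(clusters):
        indexes = [i for i, e in enumerate(clusters) if e == cluster]
        members_by_cluster[cluster] = np.take(X, indexes, axis=0)
    return members_by_cluster  # {1: rows 0-1 of X, 2: rows 2-3}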

def cmd_news(src_dir):
    # prepare samples
    joblist = simpletools.collect_files(src_dir)

    # multiprocessing
    with mp.Pool(PROCESSES_COUNT, init_factory) as pool:
        data = pool.map(is_news_worker, joblist)
        pool.close()
        pool.join()

    # prepare result: workers return the filename for news pages, None otherwise
    news_articles = [fname for fname in data if fname]
    return {"articles": news_articles}
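
# Illustrative sketch of the pool-with-initializer pattern used by cmd_news.
# _demo_init and _demo_keep_even are hypothetical stand-ins for init_factory
# and is_news_worker: the initializer runs once per worker process (e.g. to
# load a model), and pool.map preserves the order of the joblist.
def _demo_init():
    pass

def _demo_keep_even(item):
    fname, value = item
    return fname if value % 2 == 0 else None

def _demo_news_pool():
    with mp.Pool(2, _demo_init) as pool:
        data = pool.map(_demo_keep_even, [("a", 2), ("b", 3), ("c", 4)])
    return [fname for fname in data if fname]  # -> ["a", "c"]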

def cmd_categories(src_dir):
    # prepare samples
    joblist = simpletools.collect_files(src_dir)

    # process with multiprocessing
    data = simpletools.parallel_processor(categories_worker, joblist)

    # prepare result
    article_by_categories = {cat: [] for cat in config.classes}
    for cat_id, filename in data:
        category = config.invertedClasses.get(cat_id)
        if filename and category:
            article_by_categories[category].append(filename)
    return [{"category": cat, "articles": articles}
            for cat, articles in article_by_categories.items()]
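
# Illustrative sketch (hypothetical names and ids): the shape cmd_categories
# assumes for the config mappings: config.classes maps a category name to a
# numeric id, and config.invertedClasses maps the id back to the name.
_demo_classes = {"society": 0, "sports": 1, "science": 2}
_demo_inverted_classes = {v: k for k, v in _demo_classes.items()}
# categories_worker is assumed to return (cat_id, filename) pairs, so
# invertedClasses.get(cat_id) recovers the category name, and unknown ids
# drop out through the `if filename and category` guard above.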

def cmd_languages(src_dir):
    # prepare samples
    joblist = simpletools.collect_files(src_dir)

    # multiprocessing
    data = simpletools.parallel_processor(detect_lang_worker, joblist)

    # prepare result
    pages_by_lang = {}
    for lang, fname in data:
        if lang not in pages_by_lang:
            pages_by_lang[lang] = []
        pages_by_lang[lang].append(fname)
    return pages_by_lang
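
# Illustrative sketch (toy pairs like those detect_lang_worker is assumed
# to emit): the same language group-by written with dict.setdefault, an
# equivalent alternative to the membership check above.
def _demo_group_by_lang():
    data = [("en", "a.html"), ("ru", "b.html"), ("en", "c.html")]
    pages_by_lang = {}
    for lang, fname in data:
        pages_by_lang.setdefault(lang, []).append(fname)
    return pages_by_lang  # -> {"en": ["a.html", "c.html"], "ru": ["b.html"]}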