manager = Manager()
finish = Value("i", 0)
res = manager.list()

# Parse the date range and optional keyword file from the command line.
args = parser.parse_args()
keyword_path = args.keyword
start_str = args.start.split("-")
end_str = args.end.split("-")
start = date(int(start_str[0]), int(start_str[1]), int(start_str[2]))
end = date(int(end_str[0]), int(end_str[1]), int(end_str[2]))

# Load search keywords, one per line, if a keyword file was given.
keywords = list()
if keyword_path is not None:
    with open(keyword_path, "r", encoding="utf-8-sig") as keywds:
        for word in keywds:
            keywords.append(word.replace("\n", ""))

dbutil = DBUtility()

# Enqueue one task per day in the [start, end] range.
now = start
while now <= end:
    dt_queue.put(str(now))
    now += timedelta(days=1)

# Snapshot the task count before the workers start draining the queue;
# reading qsize() after they start is racy.
total = dt_queue.qsize()

for _ in range(WORKERS):
    t = Process(target=Download,
                args=(dt_queue, writelock, dbutil, finish, res, keywords))
    t.daemon = True
    t.start()
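# A minimal sketch of the worker shape implied by the Process(...) call
# above (the real Download is defined elsewhere in this script): the fetch
# step is a placeholder, and the argument handling is an assumption.
from queue import Empty  # multiprocessing queues raise queue.Empty when drained

def Download(dt_queue, writelock, dbutil, finish, res, keywords):
    while True:
        try:
            day = dt_queue.get_nowait()  # one "YYYY-MM-DD" string per task
        except Empty:
            break  # queue drained: worker exits
        # Placeholder: the real worker would fetch that day's articles for
        # the given keywords and persist them through dbutil.
        articles = []
        with writelock:
            res.extend(articles)
            finish.value += 1  # lets the parent process report progress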
                transform=plt.gcf().transFigure)
    plt.subplots_adjust(left=0.3)
    plt.savefig(os.path.join(base_path, export_dir, filename + ".png"))


if __name__ == '__main__':
    lock = Lock()
    writelock = Lock()
    manager = Manager()
    num = Value("i", 0)
    finish = Value("i", 0)
    dbutils = DBUtility()

    # Make sure the export directory exists before any worker writes to it.
    if not os.path.exists(os.path.join(base_path, export_dir)):
        os.mkdir(os.path.join(base_path, export_dir))

    days = (end - start).days
    contents = list()

    sys.stdout.write('\n')
    sys.stdout.write('\r')
    sys.stdout.write("Calculating Similarity... 0%")
    sys.stdout.flush()

    for _ in range(WORKERS):
        t = Process(target=Calculate_Sim,
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from multiprocessing import cpu_count
import pickle
import csv

from utility.DBUtility import DBUtility

dbutils = DBUtility()
stpwrdpath = "stop_word_all.txt"
STOPWORDS = list()

# WeChat accounts run by Australia-based Chinese media outlets.
aus_accounts = [
    "华人瞰世界", "今日悉尼", "微悉尼", "澳洲微报",
    "悉尼印象", "Australia News", "澳洲中文台"
]
aus_articles = list()

# Load the stopword list, one word per line.
with open(stpwrdpath, 'r', encoding="utf-8-sig") as stopwords:
    for word in stopwords:
        STOPWORDS.append(word.replace("\n", ""))

# Pre-computed word segmentations of the article corpus.
with open("./segmentation.pickle", "rb") as f:
    segmentations = pickle.load(f)

# Collect the IDs of articles published by the Australian accounts.
articles = dbutils.GetArticles({})
for article in articles:
    if article["account"] in aus_accounts:
        aus_articles.append(article["_id"])

sn_list = list()
contents = list()
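# A minimal sketch of the step these imports set up, assuming `contents`
# ends up holding one whitespace-joined segmented document per article;
# the parameter values below are illustrative, not the script's actual
# settings.
def fit_lda_sketch(contents):
    vectorizer = CountVectorizer(stop_words=STOPWORDS)  # drop the loaded stopwords
    term_matrix = vectorizer.fit_transform(contents)    # document-term counts
    lda = LatentDirichletAllocation(n_components=10,    # topic count: assumed
                                    n_jobs=cpu_count(),
                                    random_state=0)
    return lda.fit_transform(term_matrix)               # per-document topic weights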