def run_script():
    """Aggregate word frequencies across all crawled locations and persist them.

    Step 1 reads the location -> url-list mapping from the resource file.
    Step 2 merges each location's per-file word-frequency dict by *summing*
    counts per word (a plain dict.update() would silently overwrite counts
    for words shared between locations), then stores the result sorted by
    descending frequency as (word, count) pairs in word_frequency.dat.
    """
    # Step 1: read the url list from the resource text file.
    crawl_dict = StoreHelper.parse_file("./resource/url_list")

    # Step 2: merge per-location frequency dicts, summing counts per word.
    total_dict = {}
    for location in crawl_dict:  # only the keys are needed here, not the url lists
        file_name = "./data/post/%s.dat" % location
        print (file_name)
        if StoreHelper.is_file_exist(file_name):
            for word, count in Main.get_frequency_from_file(file_name).items():
                total_dict[word] = total_dict.get(word, 0) + count

    # Sort into a list of (word, count) pairs, most frequent first.
    sorted_pairs = sorted(total_dict.items(), key=operator.itemgetter(1), reverse=True)
    StoreHelper.store_data(sorted_pairs, "word_frequency.dat")
def generate_all_text():
    """Extract plain text from crawled postings that mention 'data scientist'.

    For every location in the crawled url list, loads the stored
    (url, web_source) pairs and, when the page source contains
    'data scientist' (case-insensitive), saves the extracted text under
    ./data/datascientist/ with a sequential 4-digit file name. Pages
    without the phrase are reported and skipped.
    """
    crawl_dict = StoreHelper.parse_file("./resource/url_list")
    count_numbers = 0
    for location in crawl_dict:
        file_name = "./data/post/%s.dat" % location
        # Each stored entry is a (url, web_source) pair; default to [] when
        # the data file is missing or empty.
        positions = StoreHelper.load_data(file_name, [])
        for url, web_source in positions:
            # Guard clause: skip pages that never mention the target phrase.
            if 'data scientist' not in web_source.lower():
                print ("Data Scientist not found in %s!" % url)
                continue
            text_content = HTMLHelper.get_text(web_source)
            StoreHelper.save_file(text_content, "./data/datascientist/%04d.txt" % count_numbers)
            count_numbers += 1