# Download one picture through a proxy, retrying until a request succeeds
def download_pic(pic_data):
    split = pic_data.split("~")
    pic_dir = pic_save_path + split[0] + "/"
    pic_url = split[1]
    t.is_dir_existed(pic_dir)
    while True:
        proxy_ip = t.get_proxy_ip()
        print(proxy_ip)
        try:
            resp = requests.get(pic_url, proxies=proxy_ip, timeout=5)
            if resp is not None:
                print("Downloading picture: " + resp.request.url)
                pic_name = pic_url.split("/")[-1]
                with open(pic_dir + pic_name, "wb+") as f:
                    f.write(resp.content)
                return None
        except Exception as e:
            print(e)
# Variant of download_pic that saves under the c.ZZS_FLS_MZT_SAVE_PATH constant
def download_pic(pic_data):
    split = pic_data.split("~")
    pic_dir = c.ZZS_FLS_MZT_SAVE_PATH + split[0] + "/"
    pic_url = split[1]
    t.is_dir_existed(pic_dir)
    while True:
        proxy_ip = t.get_proxy_ip()
        print(proxy_ip)
        try:
            resp = requests.get(pic_url, proxies=proxy_ip, timeout=5)
            if resp is not None:
                print("Downloading picture: " + resp.request.url)
                pic_name = pic_url.split("/")[-1]
                with open(pic_dir + pic_name, "wb+") as f:
                    f.write(resp.content)
                return None
        except Exception as e:
            print(e)
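# download_pic() relies on a "t" (tools) helper module that is not shown in
# this section. A minimal sketch of the two helpers it calls, under assumed
# signatures (the real module may differ):
import os
import random


def is_dir_existed(path, mkdir=True):
    # With mkdir=True, create the directory if needed; otherwise only
    # report whether the path exists (also used for plain files above).
    if mkdir:
        os.makedirs(path, exist_ok=True)
        return True
    return os.path.exists(path)


def get_proxy_ip():
    # Return a requests-style proxies dict picked from a proxy pool;
    # the pool below is a placeholder assumption.
    proxy_pool = ["127.0.0.1:8080"]
    proxy = random.choice(proxy_pool)
    return {"http": "http://" + proxy, "https": "https://" + proxy}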
def __init__(self):
    if not t.is_dir_existed(save_file, mkdir=False):
        # 1. Create the workbook
        self.workbook = xlwt.Workbook()
        # 2. Create the sheet; the second argument decides whether a cell's value can be overwritten
        self.sheet = self.workbook.add_sheet(u"豆瓣音乐Top 250", cell_overwrite_ok=True)
        # 3. Initialize the header row
        self.headTitles = [u'图片链接', u'歌名', u'歌手', u'发行时间', u'分类', u'评分', u'评分人数', u'歌曲详情页']
        for i, item in enumerate(self.headTitles):
            self.sheet.write(0, i, item, self.style('Monaco', 220, bold=True))
        self.workbook.save(save_file)
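# The self.style() helper referenced above is not shown in this section.
# A minimal sketch of what it plausibly does, written as a standalone
# function on top of xlwt (xlwt measures font height in 1/20 pt, so
# 220 == 11pt):
import xlwt


def style(font_name, height, bold=False):
    # Build an xlwt cell style with the given font settings
    font = xlwt.Font()
    font.name = font_name
    font.height = height
    font.bold = bold
    xf_style = xlwt.XFStyle()
    xf_style.font = font
    return xf_style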
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("-seg_file", default='test.txt', help='seg_file')
    parser.add_argument("-title_file", default='test.txt', help='title_file')
    parser.add_argument('-output_file', default='output.txt', help='output_file')
    args = parser.parse_args()
    seg_file = args.seg_file
    title_file = args.title_file
    seg_text = []
    title_text = []
    with open(seg_file) as f:
        for line in f:
            seg_text.append(line)
    with open(title_file) as f:
        for line in f:
            title_text.append(line)
                return None
        except Exception as e:
            pass


# Parse one page of the picture API and return its result list
def fetch_meizi_pic(url):
    print("Parsing API: " + url)
    try:
        resp = requests.get(url).json()
        return resp['results']
    except Exception as e:
        print(e)


if __name__ == '__main__':
    t.is_dir_existed(pic_save_dir)
    t.is_dir_existed(c.outputs_logs_path)
    print("Checking whether the picture url file exists:")
    if t.is_dir_existed(pic_urls_file, mkdir=False):
        print("url file already exists!")
    else:
        print("url file does not exist, start parsing the picture API...")
        cur_page = 1
        while True:
            results = fetch_meizi_pic(api_url + str(cur_page))
            if results is not None and len(results) > 0:
                for result in results:
                    t.write_str_data(result['url'], pic_urls_file)
                cur_page += 1
            else:
                break
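# fetch_meizi_pic() assumes the API responds with JSON shaped roughly like
# the following (inferred from how 'results' and 'url' are read; the exact
# fields may differ):
#
#   {"results": [{"url": "http://.../pic1.jpg"}, {"url": "..."}]}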
    # Skill tags
    skill_list = []
    for skills in data['技能标签']:
        for skill in skills.strip().replace("[", "").replace("]", "").replace("'", "").split(','):
            skill_list.append(skill)
    counter = dict(Counter(skill_list))
    counter.pop('')
    counter.pop('Android')
    make_wc(counter, pic_save_path + "wc_4.jpg")


# Process the data
if __name__ == '__main__':
    t.is_dir_existed(pic_save_path)
    if not t.is_dir_existed(result_save_file, mkdir=False):
        fetch_data(1)
        for cur_page in range(2, max_page + 1):
            # Rest 5-15s at random between pages
            time.sleep(random.randint(5, 15))
            fetch_data(cur_page)
    else:
        raw_data = pd.read_csv(result_save_file)
        # data_analysis(raw_data)
        # Filter out e-commerce companies
        dzsw_result = raw_data.loc[raw_data["行业领域"].str.find("电子商务") != -1,
                                   ["行业领域", "公司全名"]]
        dzsw_result.to_csv(c.outputs_logs_path + "dzsw.csv", header=False, index=False,
        paragraph.font.name = '微软雅黑'  # Microsoft YaHei
        paragraph.font.color.rgb = RGBColor(255, 255, 255)


# Read the config file and call the matching template method
def read_rules(prs, filename):
    if os.path.exists(filename):
        with open(filename, 'r+', encoding='utf-8') as f:
            for rule in f:
                word_list = rule.replace('\n', '').split(',')
                if 'png' in rule or 'jpg' in rule:
                    if len(word_list) == 1:
                        model_1(prs, os.path.join(c.res_pictures, word_list[0]))
                    else:
                        model_3(prs, word_list[0], os.path.join(c.res_pictures, word_list[1]))
                else:
                    if len(word_list) == 1:
                        model_2(prs, word_list[0])
                    elif len(word_list) == 2:
                        model_4(prs, word_list[0], word_list[1])
                    elif len(word_list) == 4:
                        model_5(prs, word_list[0], word_list[1], word_list[2], word_list[3])


if __name__ == '__main__':
    t.is_dir_existed(c.outputs_documents_path)
    ppt_existed(ppt_file_name)
    presentation = Presentation(ppt_file_name)
    read_rules(presentation, rules_path)
    presentation.save(ppt_file_name)
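# read_rules() above consumes one comma-separated rule per line. Judging from
# the branching logic, a rules file would look roughly like this (hypothetical
# contents; model_1..model_5 are the template methods referenced above):
#
#   cover.jpg                       -> model_1: full-page picture
#   Chapter title                   -> model_2: title only
#   Chapter title,figure.png        -> model_3: title + picture
#   Chapter title,Body text         -> model_4: title + body
#   Title,Text 1,Text 2,Text 3      -> model_5: title + three text fields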
# Grab every picture inside one photo-set url
def catch_pic_diagrams(url):
    resp = requests.get(url).content
    if resp is not None:
        soup = t.get_bs(resp)
        # Use the article title as the save folder name
        title = soup.select("h1.article-title a")[0].text
        imgs = soup.select('article.article-content img')
        for img in imgs[:-1]:
            t.write_str_data(title + "~" + str(img['src']),
                             c.ZZS_FLS_MZT_URL_FILE_PATH + c.ZZS_FLS_MZT_URL_FILE_NAME)


if __name__ == '__main__':
    t.is_dir_existed(c.ZZS_FLS_MZT_URL_FILE_PATH)
    cur_page = 1
    while True:
        results = catch_pic_diagrams_url(c.ZZS_FLS_MZT_URL + str(cur_page))
        if results is not None and len(results) > 0:
            for result in results:
                catch_pic_diagrams(result)
            cur_page += 1
        else:
            break
    # Load the download list
    data_list = t.load_list_from_file(c.ZZS_FLS_MZT_URL_FILE_PATH + c.ZZS_FLS_MZT_URL_FILE_NAME)
    pool = multiprocessing.Pool()
    pool.map(download_pic, data_list)
# Thread that drives article page views
class Reader(t.Thread):
    def __init__(self, t_name, func):
        self.func = func
        t.Thread.__init__(self, name=t_name)

    def run(self):
        self.func()


# The reading action
def reading():
    while True:
        read_article_url(url_list[random.randint(0, len(url_list) - 1)])


if __name__ == '__main__':
    print("Checking whether the article-link file exists:")
    if not tools.is_dir_existed(articles_file, mkdir=False):
        print("Link file does not exist, fetching links...")
        count = int(get_page_count())
        for i in range(1, count + 1):
            get_article_url(base_article_list + str(i) + '?viewmode=contents')
    else:
        print("Link file exists")
    print("Loading the article-link file...")
    url_list = tools.load_list_from_file(articles_file)
    for i in range(100):
        reader = Reader("Thread-" + str(i), reading)
        reader.start()
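# Note: subclassing is optional here; assuming "t" is the standard threading
# module, the same fan-out can be written with the target argument instead:
#
#   import threading
#   for i in range(100):
#       threading.Thread(name="Thread-" + str(i), target=reading).start()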
            height=600,
            background_color='#404a59')
    attr, value = geo.cast(data)
    geo.add("", attr, value,
            visual_range=[10, 2500],
            visual_text_color="#fff",
            symbol_size=15,
            is_visualmap=True)
    return geo


# Word cloud of the dating declarations
def draw_word_wc(name, count):
    wc = WordCloud(width=1300, height=620)
    wc.add("", name, count, word_size_range=[20, 100], shape='diamond')
    wc.render()


if __name__ == '__main__':
    if not t.is_dir_existed(result_save_file, mkdir=False):
        for i in range(1, 777):
            time.sleep(random.randint(2, 10))
            fetch_data(i)
    else:
        raw_data = pd.read_csv(result_save_file)
        word_result = word_pattern.sub("", ''.join(analysis_word(raw_data)))
        words = [word for word in jb.cut(word_result, cut_all=False) if len(word) >= 3]
        exclude_words = [
            '一辈子', '不相离', '另一半', '业余时间', '性格特点', '茫茫人海', '男朋友', '找对象',
            '谈恋爱', '有时候', '女孩子', '哈哈哈', '加微信', '兴趣爱好', '是因为', '不良嗜好',
            '男孩子', '为什么', '没关系', '不介意', '没什么', '交朋友', '大大咧咧', '大富大贵',
            '联系方式', '打招呼', '有意者', '晚一点', '哈哈哈', '以上学历', '是不是', '给我发',
            '不怎么', '第一次', '越来越', '遇一人', '择一人', '无数次', '符合条件', '什么样',
            '全世界', '比较简单', '浪费时间', '不知不觉',
    salary_index = list(salary.index)
    salary_index.sort(key=lambda x: int(x))
    final_salary = salary.reindex(salary_index)
    plt.title("Salary bar chart")
    final_salary.plot(kind='bar', rot=0)
    plt.xlabel("Salary / K")
    plt.ylabel("Number of companies")
    plt.savefig(pic_save_path + 'result_7.jpg')
    plt.close(7)
    # Skill tags
    skill_list = []
    for skills in data['技能标签']:
        for skill in skills.strip().replace("[", "").replace("]", "").replace("'", "").split(','):
            skill_list.append(skill)
    counter = dict(Counter(skill_list))
    counter.pop('')
    make_wc(counter, pic_save_path + "wc_4.jpg")


# Process the data
if __name__ == '__main__':
    t.is_dir_existed(pic_save_path)
    if not t.is_dir_existed(result_save_file, mkdir=False):
        fetch_data(1)
        for cur_page in range(2, max_page + 1):
            fetch_data(cur_page)
    else:
        raw_data = pd.read_csv(result_save_file)
        data_analysis(raw_data)
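# make_wc() is another helper that is not shown here. A minimal sketch using
# the wordcloud package (assumed implementation; the font path is a
# placeholder, and a CJK-capable font is needed for Chinese tags):
from wordcloud import WordCloud


def make_wc(word_freq, save_path):
    # Render a word cloud from a {word: count} dict and save it as an image
    wc = WordCloud(font_path='font.ttf', width=1300, height=620,
                   background_color='white')
    wc.generate_from_frequencies(word_freq)
    wc.to_file(save_path)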
    return url_list


# Grab every picture inside one photo-set url
def catch_pic_diagrams(url):
    resp = requests.get(url).content
    if resp is not None:
        soup = t.get_bs(resp)
        # Use the article title as the save folder name
        title = soup.select("h1.article-title a")[0].text
        imgs = soup.select('article.article-content img')
        for img in imgs[:-1]:
            t.write_str_data(title + "~" + str(img['src']), file_save_path)


if __name__ == '__main__':
    t.is_dir_existed(c.outputs_logs_path)
    cur_page = 1
    while True:
        results = catch_pic_diagrams_url(index_url + str(cur_page))
        if results is not None and len(results) > 0:
            for result in results:
                catch_pic_diagrams(result)
            cur_page += 1
        else:
            break
    # Load the download list
    data_list = t.load_list_from_file(file_save_path)
    pool = multiprocessing.Pool()
    pool.map(download_pic, data_list)
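# The remaining "t" file helpers used by this crawler, sketched under assumed
# signatures (the real tools module may differ): write_str_data() appends one
# record per line, and load_list_from_file() reads those lines back for
# pool.map().
def write_str_data(content, file_path):
    # Append a single line to the url file
    with open(file_path, 'a+', encoding='utf-8') as f:
        f.write(content + "\n")


def load_list_from_file(file_path):
    # Read the url file back as a list of stripped lines
    with open(file_path, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]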