def get_url(self):
    """Collect article URLs from every supported news site.

    :return: one list containing, in order, the URLs reported by the
             huxiu, tmtpost, zaodu, chanpin, pmtoo and woshipm spiders.
    """
    # Instantiate each site spider once; order matters for the result.
    spiders = (huxiu(), tmtpost(), zaodu(), chanpin(), pmtoo(), woshipm())
    url_list = []
    for spider in spiders:
        url_list += spider.get_url()
    return url_list
def get_news(self, url):
    """Dispatch *url* to the spider that can fetch and parse it.

    The source site is identified by a single character at a fixed
    position in the URL string — index 11 or 12, i.e. the first letter
    of the site name after the scheme (and optional ``www.`` prefix):
    c=chanpin, h=huxiu, p=pmtoo, m=tmtpost, z=zaodu, o=woshipm.

    :param url: URL of the article to fetch.
    :return: the news object produced by the matching spider, or None
             when the URL is missing, too short, or matches no site.
    """
    if url:
        # BUGFIX: the positional checks below index up to url[12]; the
        # original raised IndexError on URLs shorter than 13 characters
        # instead of reporting them as bad. (No real article URL of any
        # supported site is that short.)
        if len(url) < 13:
            print(self.time_now(), '\tAppear error url=', url, '\n')
            return None
        if url[11] == 'c':
            chan = chanpin()
            return chan.get_news(url)
        elif url[12] == 'h':
            hu = huxiu()
            return hu.get_news(url)
        elif url[11] == 'p':
            pmt = pmtoo()
            return pmt.get_news(url)
        elif url[12] == 'm':
            tmt = tmtpost()
            return tmt.get_news(url)
        elif url[12] == 'z':
            zao = zaodu()
            return zao.get_news(url)
        elif url[12] == 'o':
            woshi = woshipm()
            return woshi.get_news(url)
        else:
            # URL long enough but from no known site.
            print(self.time_now(), '\tAppear error url=', url, '\n')
            return None
    else:
        print(self.time_now(), '\tAppear error url= None\n')
        return None
cursor.executemany(INSERT_TERM_REATIONSHIPS, [(last_id, last_taxonomy_id, 0), (last_id, term_id, 0)]) # 尝试提交数据库,失败则回滚操作 try: conn.commit() conn.close() print(time, '\tThe news write MySql Success @SKYNE\n') return True except Exception as e: conn.rollback() conn.close() print(time, '\tThe news write MySql Failed @SKYNE\n') return False if __name__ == '__main__': from spider.huxiu import huxiu news = { 'url': 'https://www.huxiu.com/article/227432.html', 'link': 'https://m.huxiu.com/article/227432.html', 'title': 'vfkhvbjkhbjkhgbhjmgkjh', 'text': 'asdasdqwdqwdasdqwdqwdqwdqwd', 'author': '虎嗅网', 'labels': '金融地产', 'service': 'Article.AddArticle' } hu = huxiu() news = hu.get_news("https://www.huxiu.com/article/236527.html") write(news)
def main(self):
    """Scheduler entry point.

    Reads database settings from conf.ini, then loops forever: each
    pass loads every crawl source from the 91_web_src table, dispatches
    the source URL to the matching spider (huxiu or 36kr), persists
    each not-yet-processed article via writeIntoMysql() — and via
    write() when the `writefile` preference is 1 — then sleeps two
    hours before the next pass. Never returns.
    """
    print(self.time_now(), '\t程序启动中,请等待......\n')
    hour_counter = 1

    # Connection settings and preferences come from conf.ini.
    cfg = configparser.ConfigParser()
    cfg.read("conf.ini")
    db_host = cfg.get("database", "host")
    db_port = cfg.getint("database", "port")
    db_name = cfg.get("database", "dbname")
    db_user = cfg.get("database", "user")
    db_pass = cfg.get("database", "pass")
    pref_write_file = cfg.getint("preference", "writefile")

    while True:
        print(self.time_now(), '\t-------- 开始处理第{}次调度! --------\n'.format(hour_counter))

        # Fresh connection every pass: the two-hour sleep below would
        # otherwise let the server drop an idle connection.
        db = pymysql.connect(host=db_host, user=db_user, password=db_pass,
                             db=db_name, port=db_port,
                             use_unicode=True, charset="utf8")
        cur = db.cursor()
        sql_select_from_web_src = "select id,name,platform_id,url,img from 91_web_src"
        cur.execute(sql_select_from_web_src)
        result_data = cur.fetchall()

        # `src_id` etc. renamed from `id`/`name` to stop shadowing builtins.
        for src_id, src_name, src_platform_id, url, img in result_data:
            print(" 从91_web_src表中查询到记录:", src_id, src_name, src_platform_id, url)
            print("\n 处理中,详情请查看log文件......\n")

            if url.startswith("https://www.huxiu.com"):
                hu = huxiu()
                for inner_url in hu.get_inner_url_list_new(url):
                    if is_url_processed(inner_url['link']):
                        continue  # already stored during an earlier pass
                    news = hu.get_news(url=inner_url['link'])
                    if pref_write_file == 1:
                        write(news)
                    writeIntoMysql(news, src_id, src_name, src_platform_id,
                                   inner_url['img'], inner_url['desc'])
            # BUGFIX/dedup: the original had two byte-identical branches for
            # the https:// and http:// forms of 36kr; str.startswith accepts
            # a tuple of prefixes, so one branch now covers both.
            elif url.startswith(("https://36kr.com", "http://36kr.com")):
                kr36 = kr()
                for inner_url in kr36.get_inner_url_list_new(url):
                    if is_url_processed(inner_url['link']):
                        continue
                    news = kr36.get_news(url=inner_url['link'],
                                         title=inner_url['title'],
                                         summary=inner_url['desc'])
                    if pref_write_file == 1:
                        write(news)
                    writeIntoMysql(news, src_id, src_name, src_platform_id,
                                   inner_url['img'], inner_url['desc'])

        cur.close()
        db.close()
        print(self.time_now(), '\t======== 第{}次调度处理结束! ========\n'.format(hour_counter))
        time.sleep(7200)  # rescan the source table every two hours
        hour_counter += 1