def page_source_get(url):
    """Fetch *url* and return its body, decoded from JSON when the URL
    ends in '.json'.

    Logs an error and terminates the process on any network failure.
    """
    try:
        body = requests.get(url).text
    except requests.exceptions.RequestException:
        helper.logger_getter().error('Network connection error')
        exit(1)
    # URLs whose last dot-separated segment is 'json' are parsed before returning.
    return json.loads(body) if url.split('.')[-1] == 'json' else body
def source_get_by_phantomjs():
    """Render ``url`` (a name from the enclosing scope) with a headless
    PhantomJS browser and return the page source.

    Logs the error and terminates the process if the driver fails.
    """
    capabilities = dict(DesiredCapabilities.PHANTOMJS)
    capabilities["phantomjs.page.settings.loadImages"] = False
    capabilities["phantomjs.page.settings.userAgent"] = headers['User-Agent']
    browser = webdriver.PhantomJS(
        executable_path=helper.CURR_PATH + '/core/phantomjs-2.1.1',
        desired_capabilities=capabilities)
    helper.logger_getter().debug('The phantomjs is running...')
    try:
        browser.implicitly_wait(5)
        browser.set_page_load_timeout(10)
        # Cap asynchronous script execution at 10 seconds as well.
        browser.set_script_timeout(10)
        browser.get(url)
        page = browser.page_source
        browser.quit()
        return page
    except Exception as e:
        helper.logger_getter().error(str(e))
        browser.quit()
        exit(1)
def check_new():
    """Compare current v2ex hot-post ids against the persisted ones and
    mail a digest of any posts that are new.

    On first run (no data file yet) the ids are persisted and the process
    exits immediately.
    """
    # If no txt file exists yet, this is the first init: store and stop.
    if not os.path.isfile(helper.TEMP_DIR + '/v2ex_id_data.txt'):
        id_persistence()
        helper.logger_getter().info('First init to store id data,exit!')
        exit(0)
    # Read the previous v2ex_id_data.txt and compare.
    with open(helper.TEMP_DIR + '/v2ex_id_data.txt') as f:
        old_id_list = [_.rstrip() for _ in f.readlines()]
    # BUG FIX: the original called html.page_source_get(hot_url) once in the
    # comprehension and then AGAIN inside a loop for every new id -- one
    # network round-trip per post, and the re-fetched list could differ from
    # the first one.  Fetch exactly once and reuse it.
    hot_posts = html.page_source_get(hot_url)
    new_id_list = [str(p['id']) for p in hot_posts if str(p['id']) not in old_id_list]
    # A non-empty new_id_list means new hot posts have appeared.
    if len(new_id_list) != 0:
        id_persistence()
        posts_by_id = {str(p['id']): p for p in hot_posts}
        mail_body = [posts_by_id[new_id]['title'] + ': ' + posts_by_id[new_id]['url']
                     for new_id in new_id_list]
        helper.mail_send(helper.date_getter() + ' V2exHot Update!',
                         '\n\n'.join(mail_body))
        helper.logger_getter().info('V2ex has new hot posts.')
    else:
        helper.logger_getter().info('V2ex has no new hot post.')
def check_new(option):
    """Mail a notification for every yinwang blog post not yet recorded in
    the temp file, then refresh the persisted post list.

    ``option`` is accepted for interface compatibility; it is not used here.
    """
    if not os.path.isfile(temp_file):
        helper.logger_getter().info(
            "First init to store the url of all posts!")
        data_persistence()
        exit(0)
    with open(temp_file) as f:
        previous_posts = [line.split('\n')[0] for line in f.readlines()]
    # fresh_posts still holds the aTag objects; the membership key matches
    # the persisted "href|title" line format.
    fresh_posts = []
    for tag in aTags_list:
        if tag.get('href') + '|' + tag.get_text() not in previous_posts:
            fresh_posts.append(tag)
    if not fresh_posts:
        helper.logger_getter().info('Yin did not publish any blog yet!')
        return
    for tag in fresh_posts:
        msg_content = 'Yin published a new blog'
        blog_url = yinwang_blog + tag.get('href')
        blog_title = tag.get_text()
        helper.logger_getter().info(msg_content)
        helper.mail_send(
            helper.date_getter() + ' ' + msg_content + ':' + blog_title,
            blog_url)
    data_persistence()
def check_new(option):
    """Detect a new yinwang blog post by comparing the first post's href
    against the persisted one; on change, mail a link, back up a screenshot
    and optionally push the backup to a git repository.

    Args:
        option: truthy to commit and push the screenshot backup via git.
    """
    if not os.path.isfile(helper.TEMP_DIR + '/yinBlog_1stURL.txt'):
        firstURL_persistence()
        helper.logger_getter().info(
            "First init to store the url of the first post!")
        exit(0)
    with open(helper.TEMP_DIR + '/yinBlog_1stURL.txt') as f:
        # If the new first url doesn't equal the recorded one, upgrade it first!
        # NOTE(review): f.readline() may include a trailing newline depending
        # on how firstURL_persistence() writes the file -- confirm they agree.
        if first_aTag.get('href') != f.readline():
            helper.logger_getter().info('Yinwang published a new blog!')
            helper.logger_getter().info('Renew the first url in the file')
            firstURL_persistence()
            blog_url = 'http://www.yinwang.org' + first_aTag.get('href')
            blog_title = first_aTag.get_text().strip()
            helper.mail_send('垠神发表了新Blog: ' + blog_title, blog_url)
            # Begin making a screenshot backup of the new post.
            helper.dir_check('yinBlogBak')
            html.make_screenshot(
                blog_url,
                helper.CURR_PATH + '/yinBlogBak/' + blog_title + '.png')
            # Decide whether to push the screenshot to the github repo or not.
            if option:
                os.system('git add .')
                os.system("git commit -m 'backup yinwang blog'")
                os.system('git push origin master')
        else:
            # BUG FIX: the original read ``helper.logger_getter.info(...)``
            # (missing call parentheses), which raises AttributeError on the
            # function object instead of logging.
            helper.logger_getter().info('Yin did not publish any blog yet!')
def main(self):
    """Check the newest blog comment against the persisted comment id and
    send a notification e-mail when a new comment has appeared.
    """
    misc = Misc()
    logger = helper.logger_getter()
    # NOTE(review): assumes the query returns comments newest-first -- confirm.
    first_recent_comment = self.get_query_obj.find()[0]
    comment_content = first_recent_comment.get('comment')
    comment_id = first_recent_comment.get('objectId')
    if not helper.file_check('tmp/obj_id'):
        misc.data_persistence(comment_id)
        logger.debug('First run and persistent the object id...')
    else:
        # BUG FIX: the original also did
        # ``last_comment_id = open('tmp/obj_id')`` here, leaking an unclosed
        # file handle into an unused variable; the ``with`` block below is
        # the only read that is needed.
        with open('tmp/obj_id') as f:
            last_comment_text = f.read().strip()
        if last_comment_text != comment_id:
            helper.mail_send(subject='你的博客有了一个新评论!',
                             mail_body=comment_content)
            # A new comment arrived, so persist its id right away.
            misc.data_persistence(comment_id)
            logger.debug('Successfully sent the e-mail.')
        else:
            logger.debug('There has no new comment for now.')
def page_source_get(url, pagetype=None):
    """Return the content of *url*.

    When ``pagetype`` is given, the page is rendered by PhantomJS and its
    HTML source is returned.  Otherwise a plain HTTP GET is issued, retried
    up to three times with a random back-off; '.json' URLs are decoded
    before being returned.

    Exits the process when PhantomJS fails or all retries are exhausted.
    """
    headers = {'User-Agent': generate_user_agent(os='win')}

    def source_get_by_phantomjs():
        # Headless-browser path for pages that need JavaScript rendering.
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = headers['User-Agent']
        dcap["phantomjs.page.settings.loadImages"] = False
        driver = webdriver.PhantomJS(
            executable_path=helper.CURR_PATH + '/core/phantomjs-2.1.1',
            desired_capabilities=dcap)
        helper.logger_getter().debug('The phantomjs is running...')
        try:
            driver.implicitly_wait(5)
            driver.set_page_load_timeout(10)
            # Cap asynchronous script execution at 10 seconds as well.
            driver.set_script_timeout(10)
            driver.get(url)
            source = driver.page_source
            driver.quit()
            return source
        except Exception as e:
            helper.logger_getter().error(str(e))
            driver.quit()
            exit(1)

    if pagetype is not None:
        return source_get_by_phantomjs()
    for _ in range(3):
        try:
            seconds = random.choice([i / 10 for i in range(35, 82)])
            rep_data = requests.get(url, headers=headers).text
            if url.split('.')[-1] == 'json':
                return json.loads(rep_data)
            return rep_data
        except requests.exceptions.RequestException as e:
            helper.logger_getter().debug('Exception: ' + str(e))
            helper.logger_getter().debug(url)
            helper.logger_getter().debug(
                'Sleep for ' + str(seconds) + 's after timeout...')
            time.sleep(seconds)
    # BUG FIX: the original fell off the retry loop and silently returned
    # None after three failures, leaving callers to crash later.  Fail
    # loudly instead, matching the non-retry variant of this function.
    helper.logger_getter().error('Network connection error')
    exit(1)
def check_new():
    """Diff the current v2ex notification timestamps against the persisted
    ones and send an e-mail when a genuinely new notification exists.
    """
    with open(temp_data) as f:
        previous_data_format = [line.split('\n')[0] for line in f.readlines()]
    candidates = [
        ts.text for ts in timestamps_list
        if ts.text not in previous_data_format
    ]
    # BUG FIX: the original removed items from the very list it was
    # iterating (``new_notification.remove(i)`` inside
    # ``for i in new_notification``), which skips the element following each
    # removal.  Build a filtered list instead: drop every timestamp whose
    # matching entry carries an empty title.
    new_notification = [
        ts for ts in candidates
        if not any(ts == entry.find('published').text
                   and entry.find('title').text == ''
                   for entry in entries_list)
    ]
    if len(new_notification) == 0:
        helper.logger_getter().info('V2ex has no notification 4 you!')
    else:
        current_time = time.strftime("%m-%d|%H:%M", time.localtime())
        msg_content = 'V2ex has a new notification for you.'
        helper.mail_send(current_time + ' ' + msg_content, msg_content)
        helper.logger_getter().info(msg_content)
        data_persistence()
        helper.logger_getter().info('Renew the data file')
# NOTE(review): the statements below duplicate the body of check_new() and
# reference a file handle ``f`` that is not defined at this level -- this
# reads like a fragment torn out of its enclosing ``with open(...)`` block.
# Confirm against the original file before relying on it.
previous_data_format = [i.split('\n')[0] for i in f.readlines()]
new_notification = [
    i.text for i in timestamps_list if i.text not in previous_data_format
]
# Drop timestamps whose matching feed entry has an empty title.
# NOTE(review): this removes items from the list being iterated, which
# skips the element after each removal -- same pattern as check_new() above.
for i in new_notification:
    for j in entries_list:
        if i == j.find('published').text and j.find('title').text == '':
            new_notification.remove(i)
if len(new_notification) == 0:
    helper.logger_getter().info('V2ex has no notification 4 you!')
else:
    current_time = time.strftime("%m-%d|%H:%M", time.localtime())
    msg_content = 'V2ex has a new notification for you.'
    helper.mail_send(current_time + ' ' + msg_content, msg_content)
    helper.logger_getter().info(msg_content)
    data_persistence()
    helper.logger_getter().info('Renew the data file')

# Script entry point: persist the initial data on first run and exit,
# otherwise run the new-notification check.
if __name__ == '__main__':
    if not os.path.isfile(temp_data):
        data_persistence()
        helper.logger_getter().info(
            "First init to store some temp data from v2ex!")
        exit(0)
    check_new()