# -*- coding: utf-8 -*-
# NOTE: these snippets rely on project-internal helpers (tools, log, es, db,
# base_parser, self_base_parser, Constance, DAY_TIME_REGEXS, ...) that are
# assumed to be imported in each module; only stdlib imports are added here.
import random
import re


def parser_first_page_article(html, video_id, url):
    regex = '(<div class="m-feedSection clearfix.*?)<!-- 评论列表 end-->'
    content_blocks = tools.get_info(html, regex)
    for content_block in content_blocks:
        regex = 'data-paopao-feedId="(.*?)"'
        article_id = tools.get_info(content_block, regex, fetch_one=True)

        regex = '<img width="50".*?"(http.*?)"'
        head_url = tools.get_info(content_block, regex, fetch_one=True)

        regex = '<a.*?data-paopao-ele="userUrl".*?title="(.*?)"'
        name = tools.get_info(content_block, regex, fetch_one=True)

        regex = '<p class="feed_por_time">(.*?)</p>'
        release_time = tools.get_info(content_block, regex, fetch_one=True)
        release_time = tools.format_time(release_time)
        release_time = tools.format_date(release_time)

        regex = '<h3 class="title_icon_right" title="(.*?)">'
        title = tools.get_info(content_block, regex, fetch_one=True)

        # "dispalyContent" is the site's own typo; do not "fix" the regex.
        regex = '<span data-paopao-ele="dispalyContent.*?">(.*?)</span>'
        content = tools.get_info(content_block, regex, fetch_one=True)

        regex = '<img width="100%" height="100%" data-lazy="(.*?)"'
        image_urls = tools.get_info(content_block, regex, split=',')

        regex = '<em data-paopao-uvCnt=.*?>(.*?)</em>'
        watch_count = tools.get_info(content_block, regex, fetch_one=True)
        watch_count = tools.get_int(watch_count)

        regex = '<em data-paopao-agreeCnt="(.*?)">'
        up_count = tools.get_info(content_block, regex, fetch_one=True)

        regex = '<em data-paopao-commentCnt="(.*?)">'
        comment_count = tools.get_info(content_block, regex, fetch_one=True)

        log.debug('''
            id:            %s
            program id:    %s
            avatar url:    %s
            name:          %s
            release time:  %s
            title:         %s
            content:       %s
            image urls:    %s
            watch count:   %s
            up count:      %s
            comment count: %s
            ''' % (article_id, video_id, head_url, name, release_time, title,
                   content, image_urls, watch_count, up_count, comment_count))

        if self_base_parser.add_article(article_id, head_url, name, release_time,
                                        title, content, image_urls, watch_count,
                                        up_count, comment_count,
                                        program_id=video_id,
                                        gender=random.randint(0, 1),
                                        url=url, info_type=3,
                                        emotion=random.randint(0, 2),
                                        collect=0, source='爱奇艺'):
            # parse the comments for this article
            regex = r"\['wallId'\] = \"(.*?)\""
            wall_id = tools.get_info(html, regex, fetch_one=True)
            parser_comment(article_id, wall_id)
        else:
            # add_article reported nothing new was stored; stop paging
            break
def save_video_info(release_time='', content='', url='', author='', title='',
                    image_url='', site_name='', play_count=None,
                    comment_count=None, praise_count=None, summary='',
                    time_length=None):
    domain = tools.get_domain(url)
    content_info = {
        'domain': domain,
        'uuid': tools.get_uuid(title, domain),
        'site_name': site_name,
        'image_url': image_url,
        'title': title,
        'author': author,
        'url': url,
        'content': content,
        'release_time': tools.format_date(release_time),
        'play_count': play_count,
        'comment_count': comment_count,
        'praise_count': praise_count,
        'time_length': time_length,
        'record_time': tools.get_current_date(),
        'summary': summary
    }
    log.debug(tools.dumps_json(content_info))
    es.add('video_news', content_info, content_info['uuid'])
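# Usage sketch for save_video_info (all values are hypothetical; the `tools`,
# `log` and `es` helpers are project-internal and assumed to be importable):
save_video_info(release_time='2017-08-01 12:30:00',
                content='video description text',
                url='http://example.com/video/1',
                author='some_author',
                title='sample title',
                site_name='example',
                play_count=100,
                comment_count=3,
                praise_count=10,
                time_length=120)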
def get_release_time_in_paragraph(paragraph_pos):
    # Closure-style helper: `self` comes from the enclosing method's scope.
    # Walks backwards from paragraph_pos until some paragraph yields a date.
    if self._paragraphs:
        while paragraph_pos >= 0:
            content = self.__replace_str(self._paragraphs[paragraph_pos],
                                         '<(.|\n)*?>', '<>')
            release_time = tools.get_info(content, DAY_TIME_REGEXS, fetch_one=True)
            if release_time:
                return tools.format_date(release_time)
            paragraph_pos -= 1
    return None
def get_release_time_old(self):
    if self._content_start_pos and self._content_end_pos:
        content = self.__replace_str(
            '\n'.join(self._paragraphs[self._content_start_pos - RELEASE_TIME_OFFSET:
                                       self._content_end_pos + RELEASE_TIME_OFFSET]),
            '<(.|\n)*?>', '<>')
    else:
        content = self.__replace_str(self._text, '<(.|\n)*?>', '<>')

    release_time = tools.get_info(content, DAY_TIME_REGEXS, fetch_one=True)
    if not release_time:
        release_time = tools.get_info(
            self.__replace_str(self._text, '<(.|\n)*?>', '<>'),
            DAY_TIME_REGEXS, fetch_one=True)

    release_time = tools.format_date(release_time)
    return release_time
def parser_comment(article_id):
    page = 1
    is_continue = True
    while is_continue:
        url = 'https://m.weibo.cn/api/comments/show?id=%s&page=%s' % (article_id, page)
        comment_json = tools.get_json_by_requests(url)
        msg = comment_json.get('msg')
        if msg == '暂无数据':  # "no data": past the last page of comments
            break

        comment_datas = comment_json.get('data', {}).get('data', [])
        for comment_data in comment_datas:
            comment_id = comment_data.get('id')
            release_time = comment_data.get('created_at')
            release_time = tools.format_date(release_time)
            come_from = comment_data.get('source')
            content = comment_data.get('text')
            praise_count = comment_data.get('like_counts')
            user_name = comment_data.get('user', {}).get('screen_name')
            head_url = comment_data.get('user', {}).get('profile_image_url')
            emotion = random.randint(0, 2)
            hot_id = comment_id

            log.debug('''
                id:           %s
                release time: %s
                source:       %s
                content:      %s
                praise count: %s
                user name:    %s
                avatar url:   %s
                ''' % (comment_id, release_time, come_from, content,
                       praise_count, user_name, head_url))

            if not self_base_parser.add_comment(comment_id, None, article_id,
                                                user_name, head_url, None,
                                                content, praise_count,
                                                release_time, emotion, hot_id):
                is_continue = False
                break
        page += 1
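# Shape of the public m.weibo.cn comments endpoint used above, fetched with
# plain requests. The article id is hypothetical, and the response layout is
# as parser_comment expects it; Weibo may have changed it since this spider
# was written:
import requests

resp = requests.get('https://m.weibo.cn/api/comments/show',
                    params={'id': '4160000000000000', 'page': 1})
payload = resp.json()
comments = payload.get('data', {}).get('data', [])  # same path as parser_comment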
def save_weibo_info(table, site_id='', release_time='', video_url='',
                    user_name='', content='', _id='', url='', reposts_count='',
                    comments_count='', attitudes_count='', is_debug=False):
    if es.get('weibo_article', _id):
        log.debug('%s already exists' % content)
        return False

    content_info = {
        'transmit_count': reposts_count,  # repost count
        'comment_count': comments_count,
        'up_count': attitudes_count,
        'url': url,
        'id': _id,  # int
        'video_url': video_url,
        'content': content,
        'release_time': tools.format_date(release_time),
        'record_time': tools.get_current_date(),
        'user_name': user_name
    }
    log.debug(tools.dumps_json(content_info))
    es.add('weibo_article', content_info, data_id=_id)
    return True
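# The check-then-write pattern above, isolated: probe Elasticsearch by document
# id before inserting, so re-crawled weibos are not duplicated. A minimal
# sketch, assuming the project's `es` wrapper exposes get(index, id) and
# add(index, doc, data_id=...) as used in save_weibo_info:
def save_once(index, doc, doc_id):
    if es.get(index, doc_id):  # already stored by an earlier crawl
        return False
    es.add(index, doc, data_id=doc_id)
    return True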
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match('http', url):
            new_url = url
        # NOTE: the original re.match pattern for this branch was lost when the
        # source was flattened; matching GovPublicInfo paths is an assumption
        # inferred from the branch body below.
        elif re.match('.*?/GovPublicInfo', url):
            regex = '.*?(/GovPublicInfo.+?000)'
            new_url = tools.get_info(url, regex)
            new_url = new_url[0]
            new_url = 'http://www.luzhou.gov.cn' + new_url
        else:
            new_url = 'http://www.luzhou.gov.cn' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # Article info on the current page
    # title
    regexs = '<h2 class="title">(.*?)</h2>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # release time
    regexs = '<span>发布时间:(.*?)</span>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.format_date(release_time)

    # article source
    regexs = '<span>文章来源:(.*?)</span>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # view count: rendered by a separate script, so fetch and parse that script
    regexs = '<span>点击数.*?src="(.*?)"></script>'
    times_script_url = tools.get_info(html, regexs)
    times_script_url = ''.join(times_script_url)
    times_script_url = 'http://www.luzhou.gov.cn' + times_script_url
    watched_count_html, request = tools.get_html_by_requests(times_script_url)
    regexs = r"'(\d*?)'"
    watched_count = tools.get_info(watched_count_html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # content
    regexs = ['<div class="conTxt">(.*?)</div>']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    # NOTE: the original passed (depth + 1, title, source_url, ...) here, which
    # swapped the url and title fields in the log output; fixed below.
    log.debug('''
        depth         = %s
        url           = %s
        title         = %s
        release_time  = %s
        origin        = %s
        watched_count = %s
        content       = %s
        ''' % (depth + 1, source_url, title, release_time, origin,
               watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url,
                                title=title, release_time=release_time,
                                origin=origin, watched_count=watched_count,
                                content=content)

    # mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    # Fetch once with a default encoding, sniff the real charset, then re-fetch.
    html, request = tools.get_html_by_requests(source_url, code='gb2312')
    regexs = 'charset=(.*?)"'
    code = tools.get_info(html, regexs)
    code = code and code[0] or 'gb2312'
    html, request = tools.get_html_by_requests(source_url, code=code)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match('http', url):
            new_url = url
        elif re.match('/', url):
            new_url = 'http://www.scpolicec.edu.cn' + url
        else:
            new_url = 'http://www.scpolicec.edu.cn/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # Article info on the current page
    # title
    regexs = ['<div class="main_title">(.*?)<div class="top_about">',
              '<h1>(.*?)</h1>',
              '<title>(.*?)</title>',
              '<div class="contentPageTitle">(.*?)</div>']
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # release time
    regexs = ['<div class="top_about"><a editurl=\'.*?\'>(.*?)</a>',
              '<small>时间:</small>(.*?)<small>',
              '<h2><span>更新时间:(.*?)</span>']
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    if not release_time:
        regexs = '</a> 发布时间:(.*?) 点击数'
        release_time = tools.get_info(html, regexs)
        release_time = release_time and release_time[0] or ''
    release_time = tools.format_date(release_time)

    # author
    regexs = ['作者:(.*?) 【']
    author = tools.get_info(html, regexs)
    author = author and author[0] or ''
    # author = tools.del_html_tag(author)

    # article source
    regexs = '来源:(.*?)</a>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # view count
    # NOTE: the second pattern originally contained a literal line break between
    # '点击数:(\d*?)' and '发表时间' that was lost when the source was flattened;
    # '\s*' is a reconstruction.
    regexs = [r'浏览:<font id="hits">(\d*?)</font>次', r'点击数:(\d*?)\s*发表时间']
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # content
    # NOTE: the original list was missing the comma between the second and
    # third patterns, which silently concatenated them into one unusable regex.
    regexs = ['<p style="text-align: center;">(.*?)</table>',
              '<div class="contentPageContent">(.*?)</table>',
              '<div id="endtext" style="width:900px;">(.*?)<div id="pages"></div>',
              '<div id="articleContnet">(.*?)<div class="page_css">']  # 'articleContnet' is the site's own typo
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
        depth         = %s
        url           = %s
        title         = %s
        release_time  = %s
        author        = %s
        origin        = %s
        watched_count = %s
        content       = %s
        ''' % (depth, source_url, title, release_time, author, origin,
               watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url,
                                title=title, release_time=release_time,
                                author=author, origin=origin,
                                watched_count=watched_count, content=content)

    # mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
title = title and title[0] or ''
title = tools.del_html_tag(title)

# release time
regexs = ['<div class="top_about"><a editurl=\'.*?\'>(.*?)</a>']
release_time = tools.get_info(html, regexs)
release_time = release_time and release_time[0] or ''
if not release_time:
    regexs = '<small>时间:</small>(.*?)<small>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
if not release_time:
    regexs = '</a> 发布时间:(.*?) 点击数'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
release_time = tools.format_date(release_time)

# author
regexs = ['作者:(.*?) 【', '作者:(.*?) 来源']
author = tools.get_info(html, regexs)
author = author and author[0] or ''
# author = tools.del_html_tag(author)

# article source (disabled in the original)
# regexs = '来源:(.*?)</a>'
# origin = tools.get_info(html, regexs)
# origin = origin and origin[0] or ''
# origin = tools.del_html_tag(origin)

# view count
# NOTE: as above, the second pattern's literal line break was lost when the
# source was flattened; '\s*' is a reconstruction.
regexs = [r'浏览:<font id="hits">(\d*?)</font>次', r'点击数:(\d*?)\s*发表时间']
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    root_url = url_info['url']
    user_id = url_info['remark']['user_id']
    head_url = url_info['remark']['head_url']
    user_name = url_info['remark']['user_name']
    gender = url_info['remark']['gender']
    program_id = url_info['remark']['program_id']

    page_count = 50
    is_continue = True

    for i in range(0, page_count + 1):
        if not is_continue:
            break

        weibo_content_url = root_url + '&page=%d' % i
        headers = {
            "Cache-Control": "max-age=0",
            "Cookie": "_T_WM=e0a91a3ed6286a67e649ce567fbbd17a; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D2304131560851875_-_WEIBO_SECOND_PROFILE_WEIBO%26fid%3D100103type%253D401%2526q%253D%26uicode%3D10000011",
            "Accept-Language": "zh-CN,zh;q=0.8",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
            "Host": "m.weibo.cn",
            "Accept-Encoding": "gzip, deflate, br",
            "Upgrade-Insecure-Requests": "1",
            "Connection": "keep-alive",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"
        }
        html = tools.get_json_by_requests(weibo_content_url, headers=headers)
        cards = tools.get_json_value(html, 'data.cards')
        if len(cards) < 2:
            base_parser.update_url('mms_urls', root_url, Constance.DONE)
            return

        for card in cards:
            mblog = tools.get_json_value(card, 'mblog')
            if not mblog:
                continue

            url = tools.get_json_value(card, 'scheme')
            article_id = tools.get_json_value(mblog, 'id')
            article_url = 'https://m.weibo.cn/status/' + article_id
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                "Accept-Encoding": "gzip, deflate, br",
                "Cookie": "_T_WM=e0a91a3ed6286a67e649ce567fbbd17a; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D100103type%253D401%2526q%253D%26fid%3D2304131560851875_-_WEIBO_SECOND_PROFILE_WEIBO%26uicode%3D10000011",
                "Host": "m.weibo.cn",
                "Accept-Language": "zh-CN,zh;q=0.8",
                "Upgrade-Insecure-Requests": "1",
                "Connection": "keep-alive"
            }
            origin_html, r = tools.get_html_by_requests(url, headers=headers)
            if not origin_html:
                continue

            # For hour/minute/second precision the article_url page itself
            # must be parsed; created_at from the list view is coarser.
            release_time = mblog['created_at']
            release_time = tools.format_time(release_time)
            # release_time = get_release_time(mblog)
            release_time = tools.format_date(release_time)

            come_from = tools.get_json_value(mblog, 'source')

            regexs = ['"text": "(.+?)",']
            content = ''.join(tools.get_info(origin_html, regexs))
            # content = tools.del_html_tag(content)
            content = content.replace('\\', '')

            regexs = [r'"pic_ids": \[(.*?)\],']
            image_url = ''.join(tools.get_info(origin_html, regexs))
            image_url = tools.del_html_tag(image_url).replace('"', '').replace('\\n', '')
            if image_url:
                image_url = image_url.split(',')
                # NOTE: loop variable renamed from `i`, which shadowed the
                # outer page counter.
                for j in range(len(image_url)):
                    image_url[j] = 'http://wx2.sinaimg.cn/large/' + image_url[j] + '.jpg'
                image_url = ','.join(image_url)

            regexs = ['"stream_url": "(.*?)"']
            video_url = ''.join(tools.get_info(origin_html, regexs))

            transpond_count = tools.get_json_value(mblog, 'reposts_count')
            praise_count = tools.get_json_value(mblog, 'attitudes_count')
            comments_count = tools.get_json_value(mblog, 'comments_count')

            log.debug('''
                article url:   %s
                user id:       %s
                article id:    %s
                release time:  %s
                source:        %s
                content:       %s
                image urls:    %s
                video url:     %s
                comment count: %s
                repost count:  %s
                praise count:  %s
                ''' % (article_url, user_id, article_id, release_time, come_from,
                       content, image_url, video_url, comments_count,
                       transpond_count, praise_count))

            if self_base_parser.add_article(article_id, head_url, user_name,
                                            release_time, None, content,
                                            image_url, None, praise_count,
                                            comments_count,
                                            program_id=program_id,
                                            gender=gender, url=article_url,
                                            info_type=1,
                                            emotion=random.randint(0, 2),
                                            collect=0, source='新浪微博'):
                if comments_count > 0:
                    parser_comment(article_id)
            else:
                is_continue = False
                break

    base_parser.update_url('mms_urls', root_url, Constance.DONE)
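# The pic_ids -> image URL expansion above as a standalone helper. The
# 'http://wx2.sinaimg.cn/large/<pic_id>.jpg' template is taken straight from
# the parser; whether every image is served from the wx2 shard is an
# assumption carried over from the original code:
def pic_ids_to_image_urls(pic_ids):
    return ','.join('http://wx2.sinaimg.cn/large/%s.jpg' % pic_id
                    for pic_id in pic_ids)

print(pic_ids_to_image_urls(['abc123', 'def456']))  # hypothetical pic ids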
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match('http', url):
            new_url = url
        else:
            new_url = 'http://www.xuyong.gov.cn' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # Article info on the current page
    # title
    regexs = '<td class="titlestyle1037" align="center">(.*?)</td></tr>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # release time
    regexs = '<span class="timestyle1037" >(.*?)</span>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.format_date(release_time)

    # author (disabled in the original)
    # regexs = '<span>作者:(.*?)</span>'
    # author = tools.get_info(html, regexs)
    # author = author and author[0] or ''
    # author = tools.del_html_tag(author)

    # article source (disabled in the original)
    # regexs = '采编: (.*?)阅读'
    # origin = tools.get_info(html, regexs)
    # origin = origin and origin[0] or ''
    # origin = tools.del_html_tag(origin)

    # view count (disabled in the original)
    # regexs = '阅读:(\d*?)次'
    # watched_count = tools.get_info(html, regexs)
    # watched_count = watched_count and watched_count[0] or ''
    # watched_count = tools.del_html_tag(watched_count)

    # content
    regexs = ['<tr><td class="contentstyle1037" >(.*?) <tr><td class="pagestyle1037" align="left">']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
        depth        = %s
        url          = %s
        title        = %s
        release_time = %s
        content      = %s
        ''' % (depth + 1, source_url, title, release_time, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url,
                                title=title, release_time=release_time,
                                content=content)

    # mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def get_release_time(self):
    content = self.__del_html_tag(self._text)
    release_time = tools.get_info(content, DAY_TIME_REGEXS, fetch_one=True)
    release_time = tools.format_date(release_time)
    return release_time
def get_release_time(self):
    content = self.__replace_str(self._text, '<(.|\n)*?>', '<>')
    release_time = tools.get_info(content, DAY_TIME_REGEXS, fetch_one=True)
    release_time = tools.format_date(release_time)
    return release_time
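# Why this variant replaces tags with '<>' instead of deleting them (compare
# the __del_html_tag variant above): plain deletion can fuse text from adjacent
# nodes and create false date matches. Standalone illustration using the same
# regex as the code above:
import re

text = '<td>2017</td><td>08</td>'
print(re.sub('<(.|\n)*?>', '', text))    # '201708'         - digits fused
print(re.sub('<(.|\n)*?>', '<>', text))  # '<>2017<><>08<>' - boundaries kept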
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match('http', url):
            new_url = url
        else:
            new_url = 'http://www.jiangyang.gov.cn/template/default/' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # Article info on the current page
    # title
    regexs = '<div class="tit">(.*?)</div>'
    title = tools.get_info(html, regexs)
    if not title:
        regexs = '<h1>(.*?)</h1>'
        title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # release time
    regexs = '<label>(.*?)</label>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    if release_time:
        release_time = tools.format_date(release_time)
    if not release_time:
        regexs = '<span class="time">发布时间:(.*?)</span><span class="source"></span></p>'
        release_time = tools.get_info(html, regexs)
        release_time = release_time and release_time[0] or ''
        # release_time = tools.format_date(release_time)

    # article source
    regexs = '<label>来源:(.*?)</label>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # content
    regexs = ['<div class="content" id="nr" style="">(.*?)</div>']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)
    if not content:
        regexs = '<p style="text-align: center;"(.*?)</div>.*?<div class="content">'
        content = tools.get_info(html, regexs)
        content = content and content[0] or ''
        content = tools.del_html_tag(content)

    log.debug('''
        depth        = %s
        url          = %s
        title        = %s
        release_time = %s
        origin       = %s
        content      = %s
        ''' % (depth + 1, source_url, title, release_time, origin, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url,
                                title=title, release_time=release_time,
                                origin=origin, content=content)

    # mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def parser(url_info):
    url_info['_id'] = str(url_info['_id'])
    log.debug('processing \n' + tools.dumps_json(url_info))

    source_url = url_info['url']
    depth = url_info['depth']
    website_id = url_info['site_id']
    description = url_info['remark']

    html, request = tools.get_html_by_requests(source_url)
    if html is None:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    # Skip pages that contain no Chinese text at all
    regex = '[\u4e00-\u9fa5]+'
    chinese_word = tools.get_info(html, regex)
    if not chinese_word:
        base_parser.update_url('op_urls', source_url, Constance.EXCEPTION)
        return

    urls = tools.get_urls(html)
    for url in urls:
        if re.match('http', url):
            new_url = url
        # NOTE: the original re.match pattern for this branch was lost when the
        # source was flattened; matching Survey paths is an assumption inferred
        # from the branch body below.
        elif re.match('.*?/Survey', url):
            regex = '.*?(/Survey.+?html)'
            new_url = tools.get_info(url, regex)
            if new_url:
                new_url = new_url[0]
                new_url = 'http://www.longmatan.gov.cn' + new_url
        else:
            new_url = 'http://www.longmatan.gov.cn' + url
        base_parser.add_url('op_urls', website_id, new_url, depth + 1)

    # Article info on the current page
    # title
    regexs = '<h2 class="title">(.*?)</h2>'
    title = tools.get_info(html, regexs)
    title = title and title[0] or ''
    title = tools.del_html_tag(title)

    # release time
    regexs = '<span>发布时间:(.*?)</span>'
    release_time = tools.get_info(html, regexs)
    release_time = release_time and release_time[0] or ''
    release_time = tools.format_date(release_time)

    # author
    regexs = '<span>作者:(.*?)</span>'
    author = tools.get_info(html, regexs)
    author = author and author[0] or ''
    author = tools.del_html_tag(author)

    # article source
    regexs = '<span>文章来源:(.*?)</span>'
    origin = tools.get_info(html, regexs)
    origin = origin and origin[0] or ''
    origin = tools.del_html_tag(origin)

    # view count
    regexs = r'<span>点击数:(\d*?)<span'
    watched_count = tools.get_info(html, regexs)
    watched_count = watched_count and watched_count[0] or ''
    watched_count = tools.del_html_tag(watched_count)

    # content
    regexs = ['<div class="conTxt">(.*?)</div>']
    content = tools.get_info(html, regexs)
    content = content and content[0] or ''
    content = tools.del_html_tag(content)

    log.debug('''
        depth         = %s
        url           = %s
        title         = %s
        release_time  = %s
        author        = %s
        origin        = %s
        watched_count = %s
        content       = %s
        ''' % (depth + 1, source_url, title, release_time, author, origin,
               watched_count, content))

    if content and title:
        base_parser.add_op_info('op_content_info', website_id, url=source_url,
                                title=title, release_time=release_time,
                                author=author, origin=origin,
                                watched_count=watched_count, content=content)

    # mark source_url as done
    base_parser.update_url('op_urls', source_url, Constance.DONE)
def save_info(table, site_id, site_name='', url='', title='', content='',
              release_time='', image_url='', video_url='', is_out_link=1,
              download_image=False, is_debug=False, es_read_status='',
              info_type=''):
    # Image-porn recognition is disabled; the original counter-based code is
    # kept below for reference. The fixed status of 5 matches its fallback
    # branch (the original left image_recogs undefined, which would raise a
    # NameError when building content_info).
    # global num
    # if num < 2000:
    #     num += 1
    #     image_recogs = image_recog(image_url)
    # else:
    #     image_recogs = 5
    image_recogs = 5

    if not download_image:
        sexy_image_url = image_url
        local_image_path = ''
    else:
        file_local_path = tools.get_conf_value('config.conf', 'files',
                                               'zhejiang_app_save_path')
        if image_url:
            img_name = 'images/' + tools.get_current_date(date_format='%Y-%m-%d') \
                       + '/' + tools.get_current_date(date_format='%Y%m%d%H%M%S.%f') + '.jpg'
            tools.download_file(image_url, file_local_path, img_name)
            local_image_path = file_local_path + img_name
            sexy_image_url = local_image_path
        else:
            local_image_path = ''
            sexy_image_url = ''

    # Keep at most 400 characters as the preview snippet.
    if len(content) > 400:
        temporary_content = content[0:400]
    else:
        temporary_content = content

    # record_time = tools.get_current_date()
    # release_time = tools.format_date(release_time)
    try:
        release_time = tools.format_date(release_time)
    except Exception as e:
        log.debug(e, release_time, url)

    record_time = tools.get_current_date()
    if release_time > record_time:  # drop articles dated in the future
        return

    content_info = {
        'site_name': site_name,
        'video_url': video_url,
        'image_url': image_url,
        'temporary_content': temporary_content,
        'title': title,
        # 'video_local_path': local_video_path,
        'img_stor_path': local_image_path,
        'release_time': release_time,
        'is_out_link': is_out_link,
        'url': url,
        'es_read_status': 0,
        'site_id': site_id,
        'read_status': 0,
        'record_time': record_time,
        # 'sexy_image_url': sexy_image_url,
        'sexy_image_status': '',
        'image_pron_status': image_recogs
    }
    # Store the full content rather than the 400-char preview.
    content_info.pop('temporary_content')
    content_info['content'] = content

    if db.add(table, content_info):
        log.debug(content_info)
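# The timestamped image path built in save_info, standalone (assuming
# tools.get_current_date wraps datetime.strftime, as its date_format argument
# suggests):
from datetime import datetime

now = datetime.now()
img_name = 'images/%s/%s.jpg' % (now.strftime('%Y-%m-%d'),
                                 now.strftime('%Y%m%d%H%M%S.%f'))
print(img_name)  # e.g. images/2017-08-01/20170801123000.123456.jpg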