def insert(data):
    """Insert an article into ``wp_posts`` and seed its view counter.

    Reads ``title``, ``content``, ``type``, ``url`` and ``send_time`` from
    ``data``. The title is stored both as ``post_title`` and ``post_name``.
    After a successful insert a ``post_views_count`` meta row with a random
    value between 1 and 80 is added for the new post.

    Returns the result of ``SqlService.api`` (used here as the new post id),
    or ``False`` when that result is ``None``.
    """
    def _esc(value):
        # Every value below lands inside a single-quoted SQL literal, so text
        # must be escaped; non-text values are formatted exactly as before.
        if isinstance(value, (bytes, type(u''))):
            return MySQLdb.escape_string(value)
        return value

    post_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    title = _esc(data['title'])

    # Build the insert statement.
    insert_sql = (
        "insert into wp_posts(post_author, post_date, post_date_gmt, "
        " post_excerpt, to_ping, pinged, post_content_filtered, "
        " post_title, post_content, post_status,comment_status, ping_status, "
        " post_name, post_type, from_type, from_url, from_ctime)"
        " values('1', '{post_date}', '{post_date_gmt}',"
        " '', '', '', '', "
        " '{title}', '{content}', 'publish', 'open', 'open', "
        " '{post_name}', 'post', '{from_type}', '{from_url}', '{from_ctime}')"
    ).format(post_date=post_date,
             post_date_gmt=post_date,
             title=title,
             content=_esc(data['content']),
             post_name=title,
             from_type=_esc(data['type']),
             from_url=_esc(data['url']),
             from_ctime=_esc(data['send_time']))

    res = SqlService.api(insert_sql, 'execute')
    if res is None:
        return False

    # Seed the view counter for the new post.
    views_count = random.randint(1, 80)
    insert_meta_sql = "insert into wp_postmeta(post_id, meta_key, meta_value) value (%s, 'post_views_count', '%s')" % (
        res, views_count)
    service_logger.log(insert_meta_sql)
    SqlService.api(insert_meta_sql, 'execute')
    return res
def insert_meta(post_id, attachment_id):
    """Store ``attachment_id`` as the ``_thumbnail_id`` meta of ``post_id``.

    The statement is logged before it is executed. Returns the result of
    ``SqlService.api``, or ``False`` when that result is ``None``.
    """
    template = "insert into wp_postmeta(post_id, meta_key, meta_value) value (%s, '_thumbnail_id', '%s')"
    statement = template % (post_id, attachment_id)
    service_logger.log(statement)

    outcome = SqlService.api(statement, 'execute')
    return False if outcome is None else outcome
def handle(self):
    """Log ``self.url`` and run ``self._handle()``, reporting any failure.

    Exceptions raised by ``_handle`` are not propagated: the traceback and
    the URL are sent to ``service_logger.error`` under "task-exception".
    """
    service_logger.log(self.url)
    try:
        self._handle()
    except Exception:
        # Task boundary: record the full traceback instead of re-raising.
        service_logger.error("task-exception", {
            "msg": traceback.format_exc(),
            "url": self.url
        })
def _others(self):
    """Extract the "◎"-labelled movie attributes from ``self.html``.

    Each attribute has its own regular expression; the first match wins.
    Attributes that are not found keep the default empty string, except
    ``category``, which has no default and is only present when matched.
    A score written as "x/y" is reduced to the part before the slash.
    Spaces are removed from every value before it is stripped.

    Returns a dict keyed by attribute name.
    """
    details = {
        "name": "",
        "name_cn": "",
        "year": "",
        "country": "",
        "language": "",
        "font": "",
        "release_date": "",
        "score": "",
        "file_size": "",
        "movie_duration": "",
        "director": "",
        "actors": "",
    }
    patterns = (
        ('name_cn', '◎译 名(.*?)<br />'),
        ('name', '◎片 名(.*?)<br />'),
        ('year', '◎年 代(.*?)<br />'),
        ('country', '◎(产 地|国 家)(.*?)<br />'),
        ('category', '◎类 别(.*?)<br />'),
        ('language', '◎语 言(.*?)<br />'),
        ('font', '◎字 幕(.*?)<br />'),
        ('release_date', '◎上映日期(.*?)<br />'),
        ('score', '◎(IMDB评分|豆瓣评分)(.*?)<br />'),
        ('file_size', '◎文件大小(.*?)<br />'),
        ('movie_duration', '◎片 长(.*?)<br />'),
        ('director', '◎导 演(.*?)<br />'),
        ('actors', '◎主 演(.*?)<br />'),
    )

    for key, regex in patterns:
        try:
            matches = re.findall(regex, self.html, re.S)
            if not matches:
                continue

            first = matches[0]
            # Patterns with an alternation group yield (label, value) tuples.
            text = first[1] if isinstance(first, tuple) else first

            # Score: keep only the part before the slash.
            if key == 'score' and '/' in text:
                text = text.split('/')[0]

            # The middle entry is a literal backslash followed by "s",
            # not a whitespace class.
            for fragment in (' ', '\\s', ' '):
                text = text.replace(fragment, '')

            details[key] = text.strip()
        except Exception as e:
            service_logger.log(key + ':' + regex)
            service_logger.log('except:' + repr(e))

    return details
def insert(object_id, term_taxonomy_id):
    """Link ``object_id`` to ``term_taxonomy_id`` in ``wp_term_relationships``.

    The statement is logged before it is executed. Returns ``True`` unless
    ``SqlService.api`` returns ``None``, in which case ``False``.
    """
    statement = (
        "insert into wp_term_relationships(object_id, term_taxonomy_id) "
        "values('{object_id}', '{term_taxonomy_id}')"
    ).format(object_id=object_id, term_taxonomy_id=term_taxonomy_id)
    service_logger.log(statement)

    outcome = SqlService.api(statement, 'execute')
    return outcome is not None
def insert(term_id, taxonomy='post_tag'):
    """Create a ``wp_term_taxonomy`` row for ``term_id`` with an empty description.

    ``taxonomy`` is either ``post_tag`` or ``category``. The statement is
    logged before it is executed. Returns the result of ``SqlService.api``,
    or ``False`` when that result is ``None``.
    """
    statement = (
        "insert into wp_term_taxonomy(term_id, taxonomy, description) "
        "values('{term_id}', '{taxonomy}', '')"
    ).format(term_id=term_id, taxonomy=taxonomy)
    service_logger.log(statement)

    outcome = SqlService.api(statement, 'execute')
    return False if outcome is None else outcome
def insert(name):
    """Insert a term called ``name`` into ``wp_terms``.

    The slug is the URL-quoted UTF-8 encoding of the name. The name is
    SQL-escaped before interpolation so that quotes or backslashes in it
    cannot break the statement.

    Returns the result of ``SqlService.api``, or ``False`` when that result
    is ``None``.
    """
    # Function-scope import: this block did not depend on MySQLdb before.
    import MySQLdb

    encoded = name.encode('utf8')
    slug = urllib.quote(encoded)

    # Build the insert statement.
    insert_sql = "insert into wp_terms(name, slug) values('{name}', '{slug}')" \
        .format(name=MySQLdb.escape_string(encoded), slug=slug)
    service_logger.log(insert_sql)

    res = SqlService.api(insert_sql, 'execute')
    if res is not None:
        return res
    return False
def insert_video(data):
    """Insert a movie post into ``wp_posts`` and seed its view counter.

    Reads ``title``, ``content``, ``type``, ``url`` and ``send_time`` from
    ``data``, plus the movie attributes in ``data['others']``. An empty
    score or year is stored as 0. After a successful insert a
    ``post_views_count`` meta row with a random value between 1 and 80 is
    added for the new post.

    Returns the result of ``SqlService.api`` (used here as the new post id),
    or ``False`` when that result is ``None``.
    """
    def _esc(value):
        # Every value below lands inside a single-quoted SQL literal, so text
        # must be escaped; non-text values are formatted exactly as before.
        if isinstance(value, (bytes, type(u''))):
            return MySQLdb.escape_string(value)
        return value

    others = data['others']
    post_date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    title = _esc(data['title'])

    score = others['score']
    score = 0 if score == '' else float(score)

    # The year is empty when it was not scraped; int('') would raise.
    year = others['year']
    year = 0 if year == '' else int(year)

    # Build the insert statement.
    insert_sql = (
        "insert into wp_posts(post_author, post_date, post_date_gmt, "
        " post_excerpt, to_ping, pinged, post_content_filtered, "
        " post_title, post_content, post_status,comment_status, ping_status, "
        " post_name, post_type, from_type, from_url, from_ctime,"
        " year, director, movie_duration, file_size, show_font,"
        " score, movies_name, alias_name, language, country, actors)"
        " values('1', '{post_date}', '{post_date_gmt}',"
        " '', '', '', '', "
        " '{title}', '{content}', 'publish', 'open', 'open', "
        " '{post_name}', 'post', '{from_type}', '{from_url}', '{from_ctime}', "
        " '{year}', '{director}', '{movie_duration}', '{file_size}', '{show_font}', "
        " '{score}', '{movies_name}', '{alias_name}', '{language}', '{country}', '{actors}')"
    ).format(post_date=post_date,
             post_date_gmt=post_date,
             title=title,
             content=_esc(data['content']),
             post_name=title,
             from_type=_esc(data['type']),
             from_url=_esc(data['url']),
             from_ctime=_esc(data['send_time']),
             year=year,
             director=_esc(others['director']),
             movie_duration=_esc(others['movie_duration']),
             file_size=_esc(others['file_size']),
             show_font=_esc(others['font']),
             score=score,
             movies_name=_esc(others['name']),
             alias_name=_esc(others['name_cn']),
             language=_esc(others['language']),
             country=_esc(others['country']),
             actors=_esc(others['actors']))

    res = SqlService.api(insert_sql, 'execute')
    if res is None:
        return False

    # Seed the view counter for the new post.
    views_count = random.randint(1, 80)
    insert_meta_sql = "insert into wp_postmeta(post_id, meta_key, meta_value) value (%s, 'post_views_count', '%s')" % (
        res, views_count)
    service_logger.log(insert_meta_sql)
    SqlService.api(insert_meta_sql, 'execute')
    return res
def insert(data):
    """Insert a scraped article into ``wp_article``.

    Reads ``type``, ``parent``, ``category``, ``title``, ``content``,
    ``tags``, ``image``, ``send_time`` and ``url`` from ``data`` and stamps
    the row with the current Unix time as ``create_time``. The statement is
    logged before it is executed.

    Returns ``True`` unless ``SqlService.api`` returns ``None``, in which
    case ``False``.
    """
    # Function-scope import: this block did not depend on MySQLdb before.
    import MySQLdb

    def _esc(value):
        # Text is escaped for use inside a single-quoted SQL literal;
        # non-text values are formatted exactly as before.
        # NOTE(review): assumes callers pass raw, not pre-escaped, values -
        # confirm against the call sites.
        if isinstance(value, (bytes, type(u''))):
            return MySQLdb.escape_string(value)
        return value

    create_ts = int(time.time())

    # Build the insert statement.
    insert_sql = (
        "insert into wp_article(type, parent, category, "
        " title, content, tags, "
        " image, send_time, url, create_time)"
        " values('{type}', '{parent}', '{category}', '{title}', '{content}', '{tags}', "
        " '{image}', '{send_time}', '{url}', {create_time})"
    ).format(type=_esc(data['type']),
             parent=_esc(data['parent']),
             category=_esc(data['category']),
             title=_esc(data['title']),
             content=_esc(data['content']),
             tags=_esc(data['tags']),
             image=_esc(data['image']),
             send_time=_esc(data['send_time']),
             url=_esc(data['url']),
             create_time=create_ts)

    # Log the statement.
    service_logger.log(insert_sql)

    res = SqlService.api(insert_sql, 'execute')
    return res is not None
def get_douban_image(name, w=480, h=320):
    """Find an image for ``name`` via a web search and upload it.

    Despite the function name, the query goes to Baidu web search. At most
    the first nine ``.c-container`` results are inspected; the first one
    whose text contains ``name`` and which has an ``<img>`` with a ``src``
    supplies the image URL. That URL is passed to
    ``ImportService.upload_image`` (uncropped, ``w`` x ``h``).

    Returns whatever ``upload_image`` returns, or ``''`` when no result
    provided an image.
    """
    url = 'https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=' + name
    html = get_url_html(url)
    doc = pq(html)

    image = ''
    for position, tb in enumerate(doc('.c-container').items(), 1):
        txt = pq(tb)
        if name in txt.text():
            src = txt('img').attr('src')
            # A matching result may carry no image; attr() then yields None,
            # which must not reach the string concatenation below.
            if src:
                image = src
                break
        if position > 8:
            break

    if image != '':
        service_logger.log('百度搜索图片:' + image)
        image = ImportService.upload_image(image, iscut=False, w=w, h=h)
    return image
def insert_image(image, post_id, from_type, from_url):
    """Register an uploaded image as an attachment of ``post_id``.

    Inserts an ``attachment`` row into ``wp_posts`` whose guid is
    ``Config.BASE_URL + 'wp-content/uploads/' + image``. It then adds three
    ``wp_postmeta`` rows: ``_wp_attached_file`` and
    ``_wp_attachment_metadata`` (PHP-serialized width, height and file) for
    the attachment, and ``_thumbnail_id`` on ``post_id`` pointing at the
    attachment. The image is opened from ``Config.IMAGE_PATH`` to read its
    dimensions.

    Returns the result of the attachment insert (used here as the
    attachment id), or ``False`` when that result is ``None``.
    """
    # Function-scope import: this block did not depend on MySQLdb before.
    import MySQLdb

    def _esc(value):
        # Text is escaped for use inside a single-quoted SQL literal;
        # non-text values are formatted exactly as before.
        if isinstance(value, (bytes, type(u''))):
            return MySQLdb.escape_string(value)
        return value

    now = time.localtime()
    post_date = time.strftime("%Y-%m-%d %H:%M:%S", now)
    content = ''
    post_name = 'origin_' + time.strftime("%Y%m%d%H%M%S", now) + str(
        random.randint(10000, 99999))
    guid = Config.BASE_URL + 'wp-content/uploads/' + image
    # NOTE(review): this yields 'image/jpg' for .jpg files, whereas the
    # registered MIME type is 'image/jpeg' - confirm whether consumers care.
    mime_type = 'image/' + guid.split('.')[-1]

    # Build the insert statement.
    insert_sql = (
        "insert into wp_posts(post_author, post_date, post_date_gmt, "
        " post_excerpt, to_ping, pinged, post_content_filtered, "
        " post_title, post_content, post_status,comment_status, ping_status, "
        " post_name, post_type, from_type, from_url, "
        " post_parent, guid, post_mime_type)"
        " values('1', '{post_date}', '{post_date_gmt}',"
        " '', '', '', '', "
        " '{title}', '{content}', 'inherit', 'open', 'closed', "
        " '{post_name}', 'attachment', '{from_type}', '{from_url}', "
        " '{post_parent}', '{guid}', '{mime_type}')"
    ).format(post_date=post_date,
             post_date_gmt=post_date,
             title=post_name,
             content=content,
             post_name=post_name,
             from_type=_esc(from_type),
             from_url=_esc(from_url),
             post_parent=post_id,
             guid=_esc(guid),
             mime_type=_esc(mime_type))

    res = SqlService.api(insert_sql, 'execute')
    if res is None:
        return False

    # Relative path of the attached file.
    insert_meta1_sql = "insert into wp_postmeta(post_id, meta_key, meta_value) value (%s, '_wp_attached_file', '%s')" % (
        res, _esc(image))
    service_logger.log(insert_meta1_sql)
    SqlService.api(insert_meta1_sql, 'execute')

    # Attachment metadata, serialized in PHP format.
    img = Image.open(Config.IMAGE_PATH + '/' + image)
    imo = {
        "width": img.size[0],
        "height": img.size[1],
        "file": image,
        "sizes": []
    }
    insert_meta2_sql = "insert into wp_postmeta(post_id, meta_key, meta_value) value (%s, '_wp_attachment_metadata', '%s')" % (
        res, _esc(phpserialize.dumps(imo)))
    service_logger.log(insert_meta2_sql)
    SqlService.api(insert_meta2_sql, 'execute')

    # Point the parent post's thumbnail at the new attachment.
    insert_meta3_sql = "insert into wp_postmeta(post_id, meta_key, meta_value) value (%s, '_thumbnail_id', '%s')" % (
        post_id, res)
    service_logger.log(insert_meta3_sql)
    SqlService.api(insert_meta3_sql, 'execute')
    return res
def __init__(self, url=None):
    """Remember ``url`` and start processing immediately.

    Constructing the object logs a start banner and then calls
    ``self.handle()``, so the work runs as part of construction.
    """
    self.url = url
    service_logger.log('#########开始抓取网页########')
    self.handle()
def insert_handle(data, type='article'):
    """Import one scraped item as a post with its image, category and tags.

    Steps:
      1. Normalise ``data['parent']`` in place (technical categories
         collapse to '技术'; '其他' becomes '其它').
      2. Upload ``data['image']`` (480x320 for videos, 300x200 otherwise),
         falling back to ``ImportService.get_douban_image``. A video
         without any image is rejected.
      3. Insert the post via ``PostsModel.insert_video`` or
         ``PostsModel.insert`` and attach the image.
      4. Link the post to its category and to every non-empty tag in the
         comma-separated ``data['tags']``, creating terms that are missing.

    Returns ``False`` when a video has no image or the post insert fails,
    otherwise ``True``. A term that cannot be created is skipped rather
    than linked.
    """
    def _attach_term(post_id, name, taxonomy):
        # Look up the term (creating it when missing) and link it to the post.
        term = TermsModel.get(name, taxonomy)
        if term is False:
            term = {}
            term_id = TermsModel.insert(name)
            if term_id:
                term['term_id'] = term_id
                # Create the matching taxonomy record.
                term['term_taxonomy_id'] = TermTaxonomyModel.insert(
                    term_id, taxonomy)
        # A failed insert yields False; never link the post to that.
        if 'term_taxonomy_id' in term and term['term_taxonomy_id']:
            TermRelationshipsModel.insert(post_id, term['term_taxonomy_id'])
            # Refresh the usage counter.
            TermTaxonomyModel.update_count(term['term_taxonomy_id'])

    cates = [
        '技术', 'it', 'IT', 'php', 'python', 'nginx', 'java', 'jquery', 'js',
        '前端'
    ]
    if data['parent'] in cates:
        data['parent'] = '技术'
    if data['parent'] == '其他':
        data['parent'] = '其它'

    # Thumbnail size depends on the post type.
    if type == 'video':
        width, height = 480, 320
    else:
        width, height = 300, 200

    # Download the image.
    image = ''
    if data['image'] != '':
        image = ImportService.upload_image(data['image'], iscut=True,
                                           w=width, h=height)
        if type == 'video' and image == '':
            service_logger.log('图片下载失败:' + data['image'])
            # Fall back to an image found by searching for the movie name.
            image = ImportService.get_douban_image(data['others']['name'],
                                                   w=width, h=height)
    else:
        # NOTE(review): this fallback also runs for non-video items and
        # requires data['others'] - confirm articles always carry it.
        image = ImportService.get_douban_image(data['others']['name'],
                                               w=width, h=height)

    if type == 'video' and image == '':
        service_logger.log('video图片无法下载:' + data['image'])
        return False

    # Insert the post itself.
    if type == 'video':
        ID = PostsModel.insert_video(data)
    else:
        ID = PostsModel.insert(data)

    if ID is False:
        service_logger.log('插入失败:' + data['url'])
        return False

    # Attach the image.
    if image != '':
        PostsModel.insert_image(image, ID, data['type'], data['url'])

    # Category.
    _attach_term(ID, data['parent'], 'category')

    # Tags.
    if data['tags'] != '':
        for tag in data['tags'].split(','):
            tag = tag.strip()
            # Skip empty fragments such as those left by a trailing comma.
            if tag == '':
                continue
            _attach_term(ID, tag, 'post_tag')

    return True