def handle():
    """Test every proxy whose ``status`` is not 3 and store the outcome.

    Each proxy is used to request its own record from ip-api.com.  An HTTP
    200 answer stores the reported country, region and city and sets
    ``test_result`` to 1; any other status code, or any error while probing,
    sets ``test_result`` to 2.

    Returns:
        bool: True when the whole proxy list was processed, False when the
        run was aborted by an unexpected error (which is logged).
    """
    os.environ.setdefault("DJANGO_SETTINGS_MODULE",
                          "core.settings")  # the project's Django settings module
    django.setup()
    from system.model import ProxyIpModel

    result = False
    beginTime = datetime.datetime.now()
    message_type = "ProxyIpCheck"
    try:
        logger.info('Begin for {0}: {1}'.format(
            message_type, beginTime.strftime('%H:%M:%S')))
        ip_list = ProxyIpModel.objects.exclude(status=3).all()
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) '
                          'Gecko/20100101 Firefox/40.1',
        }
        for ip in ip_list:
            try:
                # format() keeps working when the port is stored as a number.
                address = '{0}:{1}'.format(ip.proxy_ip, ip.proxy_port)
                proxy = {
                    'http': 'http://' + address,
                    'https': 'https://' + address,
                }
                res = requests.get("http://ip-api.com/json/" + ip.proxy_ip,
                                   proxies=proxy,
                                   timeout=10,
                                   headers=header)
                if res.status_code == 200:
                    # Decode the body once instead of once per field.
                    info = res.json()
                    ip.country = info['country']
                    ip.region = info['regionName']
                    ip.city = info['city']
                    ip.test_result = 1
                else:
                    ip.test_result = 2
            except Exception as e:
                # A dead proxy is an expected outcome: record it and go on.
                logger.warning('proxy check failed: {0} | {1}'.format(
                    ip.proxy_ip, e))
                ip.test_result = 2
            ip.save()
        result = True
    except Exception as e:
        logger.error('error: {0} | {1}'.format(
            (datetime.datetime.now() - beginTime).seconds, e))
    logger.info('End for {0}: {1} s'.format(
        message_type, (datetime.datetime.now() - beginTime).seconds))
    return result
def SpiderSync(serverId, spiderName=None):
    """Ask a spider server to schedule the named spider.

    Posts to ``http://<ip>:<port>/schedule.json`` of the
    ``ServerConfigModel`` row identified by ``serverId``.

    Args:
        serverId: Primary key of the ``ServerConfigModel`` to contact.
        spiderName: Name of the spider to schedule; required.

    Returns:
        bool: True when the server accepted the request with a successful
        HTTP status, False otherwise (the reason is logged).
    """
    from system.model import ServerConfigModel

    _result = False
    beginTime = datetime.datetime.now()
    if spiderName is None:
        logger.error('error: {0} |{1}'.format(
            (datetime.datetime.now() - beginTime).seconds,
            '爬虫同步任务启动,需要指定爬虫名'))
        return False
    try:
        logger.info('Send Command of CaptureSpider: {0}'.format(
            beginTime.strftime('%H:%M:%S')))
        _spiderServer = ServerConfigModel.objects.get(id=serverId)
        # Assemble the scheduling request.
        _url = 'http://{0}:{1}/schedule.json'.format(_spiderServer.ip,
                                                     _spiderServer.port)
        _code = '%s_%s' % (spiderName, _spiderServer.ip)
        _data = {
            "project": 'default',
            "spider": spiderName,
            "task_code": _code,
            "rule_code": _code,
            "rule": '',
        }
        # Without a timeout an unreachable server would hang the job forever.
        response = requests.post(url=_url, data=_data, timeout=30)
        # Do not report success when the server answered with an HTTP error.
        response.raise_for_status()
        _result = True
    except Exception as e:
        logger.error('error: {0} | {1}'.format(
            (datetime.datetime.now() - beginTime).seconds, e))
    logger.info('End for send command of CaptureSpider: {0} s'.format(
        (datetime.datetime.now() - beginTime).seconds))
    return _result
def CaptureSpider(ruleId):
    """Schedule the spider described by a capture rule on its scrapy server.

    The rule is serialized with ``CaptureRuleVo``; its ``xml_data`` is
    zipped and base64-encoded with ``StringZipper`` and posted, together
    with the spider name and rule code, to ``schedule.json`` on the server
    referenced by the rule's ``scrapy_server_id``.

    Args:
        ruleId: Primary key of the ``CaptureRuleModel`` to run.

    Returns:
        bool: True when the server accepted the request with a successful
        HTTP status, False otherwise (the reason is logged).
    """
    from system.model import ServerConfigModel
    from rule.model import CaptureRuleModel
    from core.libs.string_zipper import StringZipper
    from rule.vo import CaptureRuleVo

    _result = False
    beginTime = datetime.datetime.now()
    try:
        logger.info('Send Command of CaptureSpider: {0}'.format(
            beginTime.strftime('%H:%M:%S')))
        _rule = CaptureRuleModel.objects.get(id=ruleId)
        _spiderServer = ServerConfigModel.objects.get(
            id=_rule.scrapy_server_id)
        _rule_dict = CaptureRuleVo(instance=_rule).data
        # Assemble the scheduling request.
        _url = 'http://{0}:{1}/schedule.json'.format(_spiderServer.ip,
                                                     _spiderServer.port)
        _data = {
            "project": 'default',
            "spider": _rule_dict.get('spider_name', "CaptureSpider"),
            "task_code": '%s_%s' % (_spiderServer.ip, _rule_dict.get('id')),
            "rule_code": _rule.rule_code,
            "rule": StringZipper.b64encode(
                StringZipper.zip(_rule_dict.get('xml_data').encode())),
        }
        # Without a timeout an unreachable server would hang the job forever.
        response = requests.post(url=_url, data=_data, timeout=30)
        # Do not report success when the server answered with an HTTP error.
        response.raise_for_status()
        _result = True
    except Exception as e:
        logger.error('error: {0} | {1}'.format(
            (datetime.datetime.now() - beginTime).seconds, e))
    logger.info('End for send command of CaptureSpider: {0} s'.format(
        (datetime.datetime.now() - beginTime).seconds))
    return _result
def refershViewAmazonSku():
    """Refresh the Amazon SKU and product-line materialized views.

    The views are refreshed in three batches, each sent as one multi-statement
    ``execute`` call.  Every view is refreshed exactly once per run.

    Returns:
        bool: True when every batch was executed, False when an error
        occurred (the error is logged).
    """
    batches = (
        (
            'view_i_amazon_sku_buy_box_day',
            'view_i_amazon_sku_pv_day',
            'view_i_amazon_sku_pv_month',
            'view_i_amazon_sku_pv_week',
            'view_i_amazon_sku_total_items_day',
            'view_i_amazon_sku_total_items_month',
            'view_i_amazon_sku_total_items_week',
        ),
        (
            'view_i_amazon_sku_uv_day',
            'view_i_amazon_sku_uv_month',
            'view_i_amazon_sku_uv_week',
        ),
        (
            'view_i_amazon_line_buy_box_day',
            'view_i_amazon_line_pv_day',
            'view_i_amazon_line_pv_month',
            'view_i_amazon_line_pv_week',
            'view_i_amazon_line_total_items_day',
            'view_i_amazon_line_total_items_month',
            'view_i_amazon_line_total_items_week',
            'view_i_amazon_line_uv_day',
            'view_i_amazon_line_uv_items_conversion_rate_day',
            'view_i_amazon_line_uv_month',
            'view_i_amazon_line_uv_week',
        ),
    )

    result = False
    beginTime = datetime.datetime.now()
    try:
        logger.info('Begin for RefershViewAmazonSku: {0}'.format(
            beginTime.strftime('%H:%M:%S')))
        with connections['default'].cursor() as cursor:
            for views in batches:
                cursor.execute(''.join(
                    'REFRESH MATERIALIZED VIEW "public".{0};'.format(view)
                    for view in views))
        result = True
    except Exception as e:
        logger.error('error: {0} | {1}'.format(
            (datetime.datetime.now() - beginTime).seconds, e))
    logger.info('End for RefershViewAmazonSku: {0} s'.format(
        (datetime.datetime.now() - beginTime).seconds))
    return result
def AsyncCapture():
    """Run the capture-data sync script for each server in running status 1.

    Servers whose ``server_status.running_status`` is not 1 are skipped.  The
    script output, or the error raised while running it, is logged per
    server.
    """
    os.environ.setdefault("DJANGO_SETTINGS_MODULE",
                          "core.settings")  # the project's Django settings module
    django.setup()
    from system.model import ServerConfigModel

    script = ('/var/www/html/amazon_analysis_v1/amazon/cronjobs/'
              'job_amazon_sync_capturc_data.sh ')
    for server in ServerConfigModel.objects.all():
        if server.server_status.running_status == 1:
            try:
                output = execcmdCommand(script + server.ip)
                logger.info('result:{0} | {1}'.format(server.ip, output))
            except Exception as err:
                logger.error('faild:{0} | {1}'.format(server.ip, err))
    logger.info('done')
def send():
    """Deliver pending messages to their DingTalk robots.

    Up to 100 ``MessageContentModel`` rows with ``status == 2`` are posted to
    the robot identified by each row's ``token``.  A row is marked
    ``status = 1`` once DingTalk answers HTTP 200 with ``errcode == 0``.  A
    failure on one message is logged and does not stop the remaining ones.

    Returns:
        int: Number of messages delivered in this run.
    """
    os.environ.setdefault("DJANGO_SETTINGS_MODULE",
                          "core.settings")  # the project's Django settings module
    django.setup()
    from system.model.MessageContentModel import MessageContentModel

    result = 0
    beginTime = datetime.datetime.now()
    try:
        logger.info('Begin for TaskMessage: {0}'.format(
            beginTime.strftime('%H:%M:%S')))
        msg_list = MessageContentModel.objects.filter(status=2).all()[:100]
        url = 'https://oapi.dingtalk.com/robot/send?access_token='
        for msg in msg_list:
            try:
                msgtype = (msg.sending_type
                           if msg.sending_type is not None else "markdown")
                data = {"msgtype": msgtype, "at": {"isAtAll": True}}
                if msgtype == 'text':
                    # DingTalk expects the body under the key named by
                    # msgtype; a text message carries only "content".
                    data['text'] = {"content": msg.content}
                else:
                    data['markdown'] = {
                        "title": msg.title,
                        "text": msg.content,
                    }
                response = req.api.post(url + msg.token,
                                        json=data,
                                        timeout=100)
                if (response.status_code == 200
                        and response.json()['errcode'] == 0):
                    result = result + 1
                    msg.status = 1
                    msg.save()
                else:
                    logger.warning('send rejected: {0} | {1}'.format(
                        response.status_code, response.text))
            except Exception as e:
                # Keep going: one bad message must not block the others.
                logger.error('send failed: {0} | {1}'.format(msg.title, e))
    except Exception as e:
        logger.error('error: {0} | {1}'.format(
            (datetime.datetime.now() - beginTime).seconds, e))
    logger.info('End for TaskMessage: {0} s'.format(
        (datetime.datetime.now() - beginTime).seconds))
    return result
def refershView():
    """Refresh the SKU keyword/bestseller/review rank and price-log views.

    All twelve materialized views are refreshed with one multi-statement
    ``execute`` call.

    Returns:
        bool: True when the statement ran, False when an error occurred (the
        traceback is printed and the error is logged).
    """
    os.environ.setdefault("DJANGO_SETTINGS_MODULE",
                          "core.settings")  # the project's Django settings module
    django.setup()

    def _report(exc):
        # Called from inside an ``except`` block, so print_exc() still sees
        # the exception being handled.
        import traceback
        traceback.print_exc()
        logger.error('error: {0} | {1}'.format(
            (datetime.datetime.now() - beginTime).seconds, exc))

    metrics = ('keyword_rank', 'bestseller_rank', 'review_rank', 'price_log')
    periods = ('day', 'month', 'week')
    statement = ''.join(
        'REFRESH MATERIALIZED VIEW "public".view_i_amazon_sku_{0}_{1};'.format(
            metric, period) for metric in metrics for period in periods)

    result = False
    beginTime = datetime.datetime.now()
    try:
        logger.info('Begin for RefershView: {0}'.format(
            beginTime.strftime('%H:%M:%S')))
        with connections['default'].cursor() as cursor:
            try:
                cursor.execute(statement)
                result = True
            except Exception as exec_error:
                _report(exec_error)
    except Exception as outer_error:
        _report(outer_error)
    logger.info('End for RefershView: {0} s'.format(
        (datetime.datetime.now() - beginTime).seconds))
    return result
def addSubscription(request):
    """Subscribe the current user to the RSS feed given in ``POST['URL']``.

    A ``Subscription`` row is created from the parsed feed when the address
    is not known yet; the user is then linked to it through
    ``UsersSubscriptions``.

    Args:
        request: The incoming HTTP request.

    Returns:
        The response of ``subscriptions(request)`` on success, otherwise an
        ``HttpResponse`` with status 403 (non-POST request, missing URL,
        already subscribed, or any error while adding the feed).
    """
    logger.info("Called addSubscription()")
    # Check the method first: reading POST['URL'] on a GET request used to
    # raise before the 403 below could be returned.
    if request.method != 'POST':
        return HttpResponse(status=403)
    try:
        url = request.POST['URL']
        logger.info("With URL=" + url)
        subscription = Subscription.objects.filter(Address=url).first()
        if subscription is None:
            feed = feedparser.parse(url)
            subscription = Subscription()
            subscription.Address = url
            subscription.Name = feed['channel']['title']
            subscription.Type = 'Rss'
            subscription.LastUpdateDate = timezone.now()
            subscription.save()
        already_subscribed = UsersSubscriptions.objects.filter(
            User_id=request.user.id,
            Subscription_id=subscription.id).exists()
        if already_subscribed:
            return HttpResponse(status=403)
        user_subscription = UsersSubscriptions()
        user_subscription.AddedDate = timezone.now()
        user_subscription.Subscription = subscription
        user_subscription.User = request.user
        user_subscription.save()
        logger.info("RSS Added.")
        return subscriptions(request)
    except Exception as ex:
        logger.error(ex)
        return HttpResponse(status=403)
def handle():
    """Queue shopping-cart notices for every active message template.

    For each ``MessageTemplateModel`` of type ``ShoppingCartNotice`` with
    ``status == 1``, the rows of ``CaptureSkuBuyBoxStateDv`` matching the
    template's condition and not yet recorded in ``TemplateRunLogModel`` are
    rendered through the template.  One ``MessageContentModel`` row with
    ``status = 2`` is stored for each DingTalk group of the template, and the
    processed rows are written to the run log.

    Returns:
        bool: True when all templates were processed, False when the run was
        aborted by an error (which is logged).
    """
    os.environ.setdefault("DJANGO_SETTINGS_MODULE",
                          "core.settings")  # the project's Django settings module
    django.setup()
    from system.model import (MessageTemplateModel, TemplateRunLogModel,
                              MessageContentModel)
    from amazon.dv import CaptureSkuBuyBoxStateDv

    result = False
    beginTime = datetime.datetime.now()
    message_type = "ShoppingCartNotice"
    try:
        logger.info('Begin for {0}: {1}'.format(
            message_type, beginTime.strftime('%H:%M:%S')))
        # Templates registered for this message type.
        template_list = MessageTemplateModel.objects.filter(
            message_type=message_type, status=1)
        # Variables made available to every template.
        data = get_var_data(message_type)
        for template in template_list:
            # Turn the stored condition into a queryset filter.
            conditions = condition_parse(json.loads(template.condition))
            queryset = CaptureSkuBuyBoxStateDv.objects.all().filter(conditions)
            data_list = []
            log_list = []
            for sku in queryset:
                already_logged = TemplateRunLogModel.objects.filter(
                    template_id=template.id,
                    sku_buy_box_state_id=sku.id).exists()
                if already_logged:
                    continue
                log_list.append(
                    TemplateRunLogModel(template_id=template.id,
                                        sku_buy_box_state_id=sku.id))
                data_list.append(sku)
            if not data_list:
                continue
            # Render the message body from the template and the new rows.
            data['data'] = data_list
            text = Template(template.content).render(Context(data))
            # Send to every group configured on the template.  A fresh model
            # instance per group is required: saving one instance repeatedly
            # would update the same row, leaving only the last group.
            for dt in template.dingtalk.all():
                message = MessageContentModel()
                message.title = template.describe
                message.sending_type = template.type
                message.message_type = message_type
                message.condition = str(conditions)
                message.content = text
                message.group_name = dt.name
                message.token = dt.token
                message.status = 2
                message.save()
            # Record the rows so they are not reported again.
            TemplateRunLogModel.objects.bulk_create(log_list)
        result = True
    except Exception as e:
        logger.error('error: {0} | {1}'.format(
            (datetime.datetime.now() - beginTime).seconds, e))
    logger.info('End for {0}: {1} s'.format(
        message_type, (datetime.datetime.now() - beginTime).seconds))
    return result
def testcron(request=''):
    """Log a marker line and answer with a fixed body.

    Args:
        request: Unused; accepted so the function can be called with or
            without a request.

    Returns:
        HttpResponse: A response whose body is ``'test cronjob'``.
    """
    logger.info('test inner pro. cronjob')
    body = 'test cronjob'
    return HttpResponse(body)
def transferBestseller(ruleId):
    """Copy newly analysed bestseller rows into the bestseller rank table.

    Reads up to 10000 rows of ``analysis_product_bestseller`` for the rule's
    capture code with an id above the rule's ``sync_last_id``, maps each ASIN
    to a SKU, bulk-inserts ``CaptureSkuBestsellerRankModel`` rows, stores the
    new ``sync_last_id`` and refreshes the bestseller rank views.

    Args:
        ruleId: Primary key of the ``AnalysisRuleModel`` to synchronise.

    Returns:
        bool: True when the run completed, False when an error occurred (the
        traceback is printed and the error is logged).
    """
    os.environ.setdefault("DJANGO_SETTINGS_MODULE",
                          "core.settings")  # the project's Django settings module
    django.setup()
    from django.db import transaction
    from amazon.model import (CaptureSkuBestsellerRankModel,
                              AmazonProductCategoryModel)
    from appfront.model import ProductAsinModel
    from rule.model import AnalysisRuleModel

    _result = False
    beginTime = datetime.datetime.now()
    logger.info("ruleId:{}".format(ruleId))
    try:
        logger.info('Send Command of TransferData: {0}'.format(
            beginTime.strftime('%H:%M:%S')))
        cfg = AnalysisRuleModel.objects.filter(pk=ruleId).first()
        sync_last_id = 0 if cfg.sync_last_id is None else int(cfg.sync_last_id)
        with connections['default'].cursor() as cursor:
            cursor.execute(
                "SELECT * FROM analysis_product_bestseller WHERE capture_code = %s AND id > %s ORDER BY id ASC limit 10000",
                [str(cfg.capture_rule.rule_code), sync_last_id])
            columns = [col[0] for col in cursor.description]
            rows = [dict(zip(columns, row)) for row in cursor.fetchall()]
            asins = {row['asin'] for row in rows}
            categorys = {row['category_id'] for row in rows}
            sync_at = timezone.now()
            product_list = ProductAsinModel.objects.filter(
                asin__in=asins).values_list('asin', 'combine_type', 'sku')
            category_list = AmazonProductCategoryModel.objects.filter(
                code__in=categorys).values_list('code', 'id')
            # Per ASIN keep the product with the smallest combine_type.
            sku_map = {}
            for product in product_list:
                known = sku_map.get(product[0])
                if known is None or int(known[1]) > int(product[1]):
                    sku_map[product[0]] = product
            category_map = dict(category_list)
            data_list = []
            logger.info('rows:{}'.format(len(rows)))
            for row in rows:
                product = sku_map.get(row['asin'])
                if product is None:
                    continue
                category_title = ('All Category' if row['category_name'] == ''
                                  else row['category_name'])
                # A missing page (empty, the text 'None' or NULL) defaults
                # to page 5.
                page = row['page']
                rank_page = 5 if page in ('', 'None', None) else int(page)
                data_list.append(
                    CaptureSkuBestsellerRankModel(
                        platform='amazon',
                        sku=product[2],
                        asin=row['asin'],
                        capture_at=row['capture_at'],
                        rank_on=int(row['rank_on']),
                        rank_page=rank_page,
                        category_id=category_map.get(row['category_id'], 1),
                        category_title=category_title))
                # NOTE(review): the watermark only advances on rows that map
                # to a SKU; a batch without any match is read again on the
                # next run — confirm this is intended.
                sync_last_id = max(sync_last_id, int(row['id']))
            logger.info('records:{0}, times:{1} s'.format(
                len(data_list), (datetime.datetime.now() - beginTime).seconds))
            if data_list:
                with transaction.atomic():
                    AnalysisRuleModel.objects.filter(pk=cfg.id).update(
                        sync_at=sync_at, sync_last_id=sync_last_id)
                    # Insert in batches.
                    n = 5000
                    for start in range(0, len(data_list), n):
                        CaptureSkuBestsellerRankModel.objects.bulk_create(
                            data_list[start:start + n])
            logger.info('records:{0}, times:{1} s'.format(
                len(data_list), (datetime.datetime.now() - beginTime).seconds))
            cursor.execute(
                'REFRESH MATERIALIZED VIEW "public".view_i_amazon_sku_bestseller_rank_day;'
                'REFRESH MATERIALIZED VIEW "public".view_i_amazon_sku_bestseller_rank_month;'
                'REFRESH MATERIALIZED VIEW "public".view_i_amazon_sku_bestseller_rank_week;'
            )
        _result = True
    except Exception as e:
        import traceback
        traceback.print_exc()
        logger.error('error: {0} | {1}'.format(
            (datetime.datetime.now() - beginTime).seconds, e))
    logger.info('End for send command of TransferData: {0} s'.format(
        (datetime.datetime.now() - beginTime).seconds))
    return _result
def transferReview(ruleId):
    """Copy newly analysed review rows into the SKU review table.

    Reads up to 10000 rows of ``analysis_product_review`` for the rule's
    capture code with an id above the rule's ``sync_last_id``.  Rows whose
    ASIN has no SKU, or whose ``review_id`` is already stored, are skipped;
    the rest are de-duplicated by ``review_id`` and bulk-inserted.  The new
    ``sync_last_id`` is stored and the review rank views are refreshed.

    Args:
        ruleId: Primary key of the ``AnalysisRuleModel`` to synchronise.

    Returns:
        bool: True when the run completed, False when an error occurred (the
        traceback is printed and the error is logged).
    """
    os.environ.setdefault("DJANGO_SETTINGS_MODULE",
                          "core.settings")  # the project's Django settings module
    django.setup()
    from django.db import transaction
    from amazon.model import CaptureSkuReviewModel
    from appfront.model import ProductAsinModel
    from rule.model import AnalysisRuleModel

    _result = False
    beginTime = datetime.datetime.now()
    try:
        logger.info('Send Command of TransferData: {0}'.format(
            beginTime.strftime('%H:%M:%S')))
        cfg = AnalysisRuleModel.objects.filter(pk=ruleId).first()
        sync_last_id = 0 if cfg.sync_last_id is None else int(cfg.sync_last_id)
        # Review dates are interpreted as UTC+10 and converted to UTC.
        source_tz = datetime.timezone(datetime.timedelta(hours=10))
        with connections['default'].cursor() as cursor:
            cursor.execute(
                "SELECT * FROM analysis_product_review WHERE capture_code = %s AND id > %s ORDER BY id ASC limit 10000",
                [str(cfg.capture_rule.rule_code), sync_last_id])
            columns = [col[0] for col in cursor.description]
            rows = [dict(zip(columns, row)) for row in cursor.fetchall()]
            asins = {row['asin'] for row in rows}
            reviews = {row['review_id'] for row in rows}
            product_list = ProductAsinModel.objects.filter(
                asin__in=asins).values_list('asin', 'combine_type', 'sku')
            review_list = CaptureSkuReviewModel.objects.filter(
                review_id__in=reviews).values_list('review_id', 'id')
            # Per ASIN keep the product with the smallest combine_type.
            sku_map = {}
            for product in product_list:
                known = sku_map.get(product[0])
                if known is None or int(known[1]) > int(product[1]):
                    sku_map[product[0]] = product
            review_map = dict(review_list)
            data_list = []
            logger.info('rows:{}'.format(len(rows)))
            sync_at = timezone.now()
            for row in rows:
                product = sku_map.get(row['asin'])
                if (product is None
                        or review_map.get(row['review_id']) is not None):
                    continue
                # An empty or NULL date stays unset instead of failing the
                # whole batch.
                review_at = None
                if row['review_at']:
                    review_at = datetime.datetime.strptime(
                        row['review_at'], '%d %B %Y').replace(
                            tzinfo=source_tz).astimezone(
                                datetime.timezone.utc)
                data_list.append(
                    CaptureSkuReviewModel(platform='amazon',
                                          sku=product[2],
                                          asin=row['asin'],
                                          link=row['target_url'],
                                          review_at=review_at,
                                          review_id=row['review_id'],
                                          review_rank=row['rank_on'],
                                          author=row['author'],
                                          title='',
                                          content='',
                                          selection='',
                                          capture_at=row['capture_at']))
                # NOTE(review): the watermark only advances on rows that are
                # inserted; a batch without any new review is read again on
                # the next run — confirm this is intended.
                sync_last_id = max(sync_last_id, int(row['id']))
            logger.info('records:{0}, times:{1} s'.format(
                len(data_list), (datetime.datetime.now() - beginTime).seconds))
            if data_list:
                # Keep one row per review_id (the last one seen wins).
                data_list = list(
                    {item.review_id: item for item in data_list}.values())
                with transaction.atomic():
                    AnalysisRuleModel.objects.filter(pk=cfg.id).update(
                        sync_at=sync_at, sync_last_id=sync_last_id)
                    # Insert in batches.
                    n = 1000
                    for start in range(0, len(data_list), n):
                        CaptureSkuReviewModel.objects.bulk_create(
                            data_list[start:start + n])
            logger.info('records:{0}, times:{1} s'.format(
                len(data_list), (datetime.datetime.now() - beginTime).seconds))
            cursor.execute(
                'REFRESH MATERIALIZED VIEW "public".view_i_amazon_sku_review_rank_day;'
                'REFRESH MATERIALIZED VIEW "public".view_i_amazon_sku_review_rank_month;'
                'REFRESH MATERIALIZED VIEW "public".view_i_amazon_sku_review_rank_week;'
            )
        _result = True
    except Exception as e:
        import traceback
        traceback.print_exc()
        logger.error('error: {0} | {1}'.format(
            (datetime.datetime.now() - beginTime).seconds, e))
    logger.info('End for send command of TransferData: {0} s'.format(
        (datetime.datetime.now() - beginTime).seconds))
    return _result
def transferBuybox(ruleId):
    """Copy newly analysed buy-box rows into the buy-box state table.

    Reads up to 10000 rows of ``analysis_product_buybox`` for the rule's
    capture code with an id above the rule's ``sync_last_id``.  Rows whose
    ASIN has no SKU or whose price is not positive are skipped.  The buy-box
    seller and price take precedence over the plain ones when present.
    ``buy_box_state`` is 2 when the seller is 'Artiss Furnishings', else 1.

    Args:
        ruleId: Primary key of the ``AnalysisRuleModel`` to synchronise.

    Returns:
        bool: True when the run completed, False when an error occurred (the
        traceback is printed and the error is logged).
    """
    os.environ.setdefault("DJANGO_SETTINGS_MODULE",
                          "core.settings")  # the project's Django settings module
    django.setup()
    from django.db import transaction
    from amazon.model import CaptureSkuBuyBoxStateModel
    from appfront.model import ProductAsinModel
    from rule.model import AnalysisRuleModel

    def _first_filled(*candidates):
        # Return the first candidate that is neither '' nor None, else ''.
        for candidate in candidates:
            if candidate not in ('', None):
                return candidate
        return ''

    _result = False
    beginTime = datetime.datetime.now()
    try:
        logger.info('Send Command of TransferData: {0}'.format(
            beginTime.strftime('%H:%M:%S')))
        cfg = AnalysisRuleModel.objects.filter(pk=ruleId).first()
        sync_last_id = 0 if cfg.sync_last_id is None else int(cfg.sync_last_id)
        with connections['default'].cursor() as cursor:
            cursor.execute(
                "SELECT * FROM analysis_product_buybox WHERE capture_code = %s AND id > %s ORDER BY id ASC limit 10000",
                [str(cfg.capture_rule.rule_code), sync_last_id])
            columns = [col[0] for col in cursor.description]
            rows = [dict(zip(columns, row)) for row in cursor.fetchall()]
        asins = {row['asin'] for row in rows}
        product_list = ProductAsinModel.objects.filter(
            asin__in=asins).values_list('asin', 'combine_type', 'sku')
        # Per ASIN keep the product with the smallest combine_type.
        sku_map = {}
        for product in product_list:
            known = sku_map.get(product[0])
            if known is None or int(known[1]) > int(product[1]):
                sku_map[product[0]] = product
        data_list = []
        logger.info('rows:{}'.format(len(rows)))
        sync_at = timezone.now()
        for row in rows:
            # Check the SKU first so an unmapped row with a malformed price
            # cannot abort the whole batch.
            product = sku_map.get(row['asin'])
            if product is None:
                continue
            price_text = _first_filled(row['sold_by_price_buybox'],
                                       row['sold_by_price'])
            sold_by_price = 0 if price_text == '' else float(price_text)
            if sold_by_price <= 0:
                continue
            sold_by = _first_filled(row['sold_by_buybox'], row['sold_by'])
            buy_box_state = 2 if sold_by == 'Artiss Furnishings' else 1
            data_list.append(
                CaptureSkuBuyBoxStateModel(platform='amazon',
                                           sku=product[2],
                                           asin=row['asin'],
                                           link=row['target_url'],
                                           buy_box_state=buy_box_state,
                                           capture_at=row['capture_at'],
                                           sold_by=sold_by,
                                           sold_by_price=sold_by_price))
            # NOTE(review): the watermark only advances on rows that are
            # inserted; a batch without any usable row is read again on the
            # next run — confirm this is intended.
            sync_last_id = max(sync_last_id, int(row['id']))
        logger.info('records:{0}, times:{1} s'.format(
            len(data_list), (datetime.datetime.now() - beginTime).seconds))
        if data_list:
            with transaction.atomic():
                AnalysisRuleModel.objects.filter(pk=cfg.id).update(
                    sync_at=sync_at, sync_last_id=sync_last_id)
                # Insert in batches.
                n = 1000
                for start in range(0, len(data_list), n):
                    CaptureSkuBuyBoxStateModel.objects.bulk_create(
                        data_list[start:start + n])
        _result = True
    except Exception as e:
        import traceback
        traceback.print_exc()
        logger.error('error: {0} | {1}'.format(
            (datetime.datetime.now() - beginTime).seconds, e))
    logger.info('End for send command of TransferData: {0} s'.format(
        (datetime.datetime.now() - beginTime).seconds))
    return _result
def AnalysisData(ruleId):
    """Extract records from captured HTML pages according to an analysis rule.

    The rule's ``xml_data`` supplies the chunk xpath, the target table, the
    field definitions and the page xpath.  Up to 1000 rows of
    ``res_capture_html`` for the rule's capture code with ``last_id`` above
    the rule's ``analysis_last_id`` are parsed; every chunk of every page
    yields one record built from the field definitions.  Records that pass
    the ``required`` checks are inserted through ``ResultData`` and the
    rule's ``analysis_at`` / ``analysis_last_id`` are updated.

    Args:
        ruleId: Primary key of the ``AnalysisRuleModel`` to run.

    Returns:
        The value returned by ``ResultData.executeInsert()`` when records
        were written; otherwise False (nothing to write, or an error that
        was printed and logged).
    """
    os.environ.setdefault("DJANGO_SETTINGS_MODULE",
                          "core.settings")  # the project's Django settings module
    django.setup()
    from core.libs.analysis_utils import ResultData
    from core.libs.xml_utils import XmlObject
    from rule.model import AnalysisRuleModel
    _result = False
    beginTime = datetime.datetime.now()
    try:
        logger.info('Send Command of AnalysisData: {0}'.format(
            beginTime.strftime('%H:%M:%S')))
        cfg = AnalysisRuleModel.objects.filter(pk=ruleId).first()
        config = XmlObject(xmlstr=cfg.xml_data)
        chunks = config.getElement("chunk").text
        table = config.getElement("table").text
        fields = config.getElement(xpath="//fields/field")
        pages_xpath = config.getElement("page").text
        model_list = []
        # Multi-page data.
        # NOTE(review): unlike the SQL parameter below, this value is not
        # converted with int() before the `<` comparison further down —
        # confirm the model field is numeric.
        analysis_last_id = 0 if cfg.analysis_last_id is None else cfg.analysis_last_id
        analysis_at = timezone.now()
        # One code shared by every record produced in this run.
        analysis_code = uuid.uuid4()
        with connections['default'].cursor() as cursor:
            cursor.execute(
                'SELECT task_code,job_code,request_code,capture_code,target_url,capture_at,last_id,html FROM res_capture_html WHERE capture_code = %s AND last_id > %s ORDER BY last_id ASC limit 1000',
                [str(cfg.capture_rule.rule_code),
                 int(analysis_last_id)])
            # Turn each result row into a dict keyed by column name.
            page_list = [
                dict(zip([col[0] for col in cursor.description], row))
                for row in cursor.fetchall()
            ]
            logger.info('pages:{0} | {1}'.format(cfg.capture_rule.rule_code, len(page_list)))
            for page in range(len(page_list)):
                data = XmlObject(xmlstr=page_list[page]['html'], type="html")
                pages = [] if pages_xpath is None else data.getElement(
                    xpath=pages_xpath)
                # Page number read from the document; 1 when none is found.
                curr_page = 1 if len(pages) < 1 else int(pages[0])
                chunk_list = data.getElement(xpath=chunks)
                chunk_len = len(chunk_list)
                # Multiple chunks of data per page.
                for idx, chunk in enumerate(chunk_list):
                    if page_list[page]['last_id'] == '' or page_list[page][
                            'last_id'] is None:
                        continue
                    # Common fields, not business data.
                    model = {
                        'capture_at': page_list[page]['capture_at'],
                        'target_url': page_list[page]['target_url'],
                        'task_code': page_list[page]['task_code'],
                        'capture_code': page_list[page]['capture_code'],
                        'job_code': page_list[page]['job_code'],
                        'request_code': page_list[page]['request_code'],
                        'analysis_code': analysis_code,
                        'analysis_at': str(analysis_at),
                        'last_id': int(page_list[page]['last_id'])
                    }
                    # Track the highest last_id seen so far.
                    analysis_last_id = analysis_last_id if int(
                        page_list[page]['last_id']
                    ) < analysis_last_id else int(page_list[page]['last_id'])
                    passed = True
                    # Field list for each record.
                    for field in fields:
                        name = field.findtext('./name')
                        mode = field.findtext('./mode')
                        position = field.findtext('./position')
                        position_group = field.findtext('./position-group')
                        pattern_group = field.findtext('./pattern-group')
                        value_set = field.xpath('./value-set/mapper/item')
                        pattern = field.findtext('./pattern')
                        required = field.findtext('./required')
                        default = field.findtext('./default')
                        value_list = [] if position == '' else chunk.xpath(
                            position)
                        position_group = int(position_group)
                        pattern_group = int(pattern_group)
                        values = [] if len(value_set) == 0 else [
                            v.findtext("./value") for v in value_set
                        ]
                        model[name] = ''
                        if mode == '2' and len(value_list) == 0:
                            # Node does not exist.
                            model[name] = '2' if len(
                                values) <= 1 else values[1]
                        elif mode == '2' and len(value_list) > 0:
                            # Node exists.
                            model[name] = '1' if len(
                                values) <= 1 else values[0]
                        elif mode == '3':
                            # Ordinal of this chunk across pages, optionally
                            # mapped through the value set.
                            index = (curr_page - 1) * chunk_len + idx + 1
                            model[name] = index if len(
                                values) <= index else values[int(index)]
                        else:
                            # Match the node value against the pattern.
                            if len(value_list) > 0:
                                target = "" if len(
                                    value_list) <= position_group else str(
                                        value_list[position_group]).strip()
                                rs = re.compile(r'{}'.format(pattern), re.DOTALL).search(target)
                                model[name] = "" if rs is None else rs[
                                    pattern_group]
                        # A required field that stayed empty rejects the
                        # whole record; the remaining fields are still read.
                        if required == '1' and model[name] == '':
                            passed = False
                            continue
                        if model[name] == '':
                            model[name] = default
                    if passed:
                        model_list.append(model)
                        # model_list.append(CaptureSkuBuyBoxStateModel(**model))
        # CaptureSkuBuyBoxStateModel.objects.bulk_create(model_list)
        logger.info('records:{0}, times:{1} s'.format(
            len(model_list), (datetime.datetime.now() - beginTime).seconds))
        if len(model_list) > 0:
            # Batch write.
            n = 10000
            m_list = [
                model_list[i:i + n] for i in range(0, len(model_list), n)
            ]
            # Column order is taken from the keys of the first record.
            res = ResultData(table, fields=tuple(model_list[0].keys()))
            for ml in m_list:
                res.createInsert(values=[v.values() for v in ml])
            from django.db import transaction
            with transaction.atomic():
                _result = res.executeInsert()
                AnalysisRuleModel.objects.filter(pk=cfg.id).update(
                    analysis_at=analysis_at, analysis_last_id=analysis_last_id)
    except Exception as e:
        import traceback
        traceback.print_exc()
        logger.error('error: {0} | {1}'.format(
            (datetime.datetime.now() - beginTime).seconds, e))
    logger.info('End for send command of AnalysisData: {0} s'.format(
        (datetime.datetime.now() - beginTime).seconds))
    return _result


'''执行系统命令'''