예제 #1
0
def handle():
    """Probe every stored proxy and record its geo data and test outcome.

    Loads all ProxyIpModel rows except those with ``status=3``, requests
    ``http://ip-api.com/json/<proxy_ip>`` through each proxy and stores
    ``country``/``region``/``city`` together with ``test_result = 1`` on an
    HTTP 200 answer. Any other status code, or any exception raised while
    probing, stores ``test_result = 2``.

    Returns:
        True when the whole list was processed without an unexpected error,
        False otherwise.
    """
    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "core.settings")  # your Django settings module
    django.setup()
    from system.model import ProxyIpModel
    result = False
    beginTime = datetime.datetime.now()
    message_type = "ProxyIpCheck"
    try:
        logger.info('Begin for {0}: {1}'.format(message_type, beginTime.strftime('%H:%M:%S')))
        ip_list = ProxyIpModel.objects.exclude(status=3).all()

        header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1',}
        for ip in ip_list:
            try:
                # format() also accepts a numeric port, which "+" concatenation did not.
                address = '{0}:{1}'.format(ip.proxy_ip, ip.proxy_port)
                proxy = {'http': 'http://' + address,
                         'https': 'https://' + address}
                res = requests.get("http://ip-api.com/json/" + ip.proxy_ip, proxies=proxy, timeout=10, headers=header)
                if res.status_code == 200:
                    # Decode the body once and read every key before touching the
                    # model, so a missing key cannot leave it half-updated.
                    geo = res.json()
                    country, region, city = geo['country'], geo['regionName'], geo['city']
                    ip.country = country
                    ip.region = region
                    ip.city = city
                    ip.test_result = 1
                else:
                    ip.test_result = 2
            except Exception as e:
                # A broken proxy is the expected failure mode: record it and move on.
                logger.error('proxy check failed: {0} | {1}'.format(ip.proxy_ip, e))
                ip.test_result = 2
            ip.save()
        result = True
    except Exception as e:
        logger.error('error: {0} | {1}'.format((datetime.datetime.now() - beginTime).seconds, e))
    logger.info('End for {0}: {1} s'.format(message_type, (datetime.datetime.now() - beginTime).seconds))
    return result
예제 #2
0
def SpiderSync(serverId, spiderName=None):
    """Post a schedule request for ``spiderName`` to a configured spider server.

    Looks up the ServerConfigModel row with ``id=serverId`` and POSTs to
    ``http://<ip>:<port>/schedule.json`` with project ``default``, the spider
    name, task/rule codes built as ``<spiderName>_<ip>`` and an empty rule.

    Args:
        serverId: Primary key of the ServerConfigModel row to contact.
        spiderName: Name of the spider to schedule; required despite the
            ``None`` default.

    Returns:
        True when the request was sent and answered with a non-error HTTP
        status, False when ``spiderName`` is missing or anything failed.
    """
    from system.model import ServerConfigModel

    # Without a timeout requests waits forever on an unresponsive server.
    request_timeout = 30

    _result = False
    beginTime = datetime.datetime.now()
    if spiderName is None:
        logger.error('error: {0} |{1}'.format((datetime.datetime.now() - beginTime).seconds, '爬虫同步任务启动,需要指定爬虫名'))
        return False
    try:
        logger.info('Send Command of CaptureSpider: {0}'.format(beginTime.strftime('%H:%M:%S')))

        _spiderServer = ServerConfigModel.objects.get(id=serverId)

        # Build the schedule request.
        _url = 'http://{0}:{1}/schedule.json'.format(_spiderServer.ip, _spiderServer.port)
        _code = '%s_%s' % (spiderName, _spiderServer.ip)
        _data = {"project": 'default',
                 "spider": spiderName,
                 "task_code": _code,
                 "rule_code": _code,
                 "rule": ''
                 }

        response = requests.post(url=_url, data=_data, timeout=request_timeout)
        # A 4xx/5xx answer means the job was not accepted; do not report success.
        response.raise_for_status()

        # Equivalent manual call:
        # 'curl http://localhost:5500/schedule.json -d project=default -d spider=TestSpider -d code=xxx'
        _result = True
    except Exception as e:
        logger.error('error: {0} | {1}'.format((datetime.datetime.now() - beginTime).seconds, e))

    logger.info('End for send command of CaptureSpider: {0} s'.format((datetime.datetime.now() - beginTime).seconds))

    return _result
예제 #3
0
def CaptureSpider(ruleId):
    """Post a schedule request for the capture rule ``ruleId`` to its spider server.

    Loads the CaptureRuleModel row, resolves its server through
    ``scrapy_server_id``, serialises the rule with CaptureRuleVo and POSTs to
    ``http://<ip>:<port>/schedule.json``. The ``rule`` field carries the
    rule's ``xml_data`` passed through ``StringZipper.zip`` and
    ``StringZipper.b64encode``.

    Args:
        ruleId: Primary key of the CaptureRuleModel row to schedule.

    Returns:
        True when the request was sent and answered with a non-error HTTP
        status, False when anything failed.
    """
    from task.model import TaskCaptureListModel
    from system.model import ServerConfigModel
    from rule.model import CaptureRuleModel
    from core.libs.string_zipper import StringZipper
    from rule.vo import CaptureRuleVo
    from rule.libs.XMLETConstructor import XMLETConstructor

    # Without a timeout requests waits forever on an unresponsive server.
    request_timeout = 30

    _result = False
    beginTime = datetime.datetime.now()

    try:
        logger.info('Send Command of CaptureSpider: {0}'.format(
            beginTime.strftime('%H:%M:%S')))

        _rule = CaptureRuleModel.objects.get(id=ruleId)
        _spiderServer = ServerConfigModel.objects.get(
            id=_rule.scrapy_server_id)

        _rule_dict = CaptureRuleVo(instance=_rule).data
        print(_rule_dict)
        # NOTE(review): XML_obj is not read below; the construction is kept in
        # case XMLETConstructor validates the rule - confirm before removing.
        XML_obj = XMLETConstructor(dict(_rule_dict))
        print(_rule_dict.get('spider_name'))

        # Build the schedule request.
        _url = 'http://{0}:{1}/schedule.json'.format(_spiderServer.ip,
                                                     _spiderServer.port)
        _data = {
            "project": 'default',
            # Falls back to the generic spider when the rule names none.
            "spider": _rule_dict.get('spider_name', "CaptureSpider"),
            "task_code": '%s_%s' % (_spiderServer.ip, _rule_dict.get('id')),
            "rule_code": _rule.rule_code,
            "rule": StringZipper.b64encode(
                StringZipper.zip(_rule_dict.get('xml_data').encode())),
        }

        response = requests.post(url=_url, data=_data, timeout=request_timeout)
        # A 4xx/5xx answer means the job was not accepted; do not report success.
        response.raise_for_status()

        # Equivalent manual call:
        # 'curl http://localhost:5500/schedule.json -d project=default -d spider=TestSpider -d code=xxx'
        _result = True
    except Exception as e:
        logger.error('error: {0} | {1}'.format(
            (datetime.datetime.now() - beginTime).seconds, e))

    logger.info('End for send command of CaptureSpider: {0} s'.format(
        (datetime.datetime.now() - beginTime).seconds))

    return _result
def refershViewAmazonSku():
    """Refresh the Amazon SKU and product-line materialized views.

    The views are refreshed in three ``cursor.execute`` calls on the
    ``default`` connection, each sending one multi-statement string. Every
    view is refreshed exactly once per run.

    Returns:
        True when all refresh statements ran without raising, False otherwise.
    """
    statement = 'REFRESH MATERIALIZED VIEW "public".{0};'
    batches = (
        (
            'view_i_amazon_sku_buy_box_day',
            'view_i_amazon_sku_pv_day',
            'view_i_amazon_sku_pv_month',
            'view_i_amazon_sku_pv_week',
            'view_i_amazon_sku_total_items_day',
            'view_i_amazon_sku_total_items_month',
            'view_i_amazon_sku_total_items_week',
        ),
        (
            # total_items_month/week used to be listed here a second time,
            # which refreshed them twice in a row.
            'view_i_amazon_sku_uv_day',
            'view_i_amazon_sku_uv_month',
            'view_i_amazon_sku_uv_week',
        ),
        (
            'view_i_amazon_line_buy_box_day',
            'view_i_amazon_line_pv_day',
            'view_i_amazon_line_pv_month',
            'view_i_amazon_line_pv_week',
            'view_i_amazon_line_total_items_day',
            'view_i_amazon_line_total_items_month',
            'view_i_amazon_line_total_items_week',
            'view_i_amazon_line_uv_day',
            'view_i_amazon_line_uv_items_conversion_rate_day',
            'view_i_amazon_line_uv_month',
            'view_i_amazon_line_uv_week',
        ),
    )

    result = False
    beginTime = datetime.datetime.now()
    try:
        logger.info('Begin for RefershViewAmazonSku: {0}'.format(beginTime.strftime('%H:%M:%S')))
        with connections['default'].cursor() as cursor:
            for batch in batches:
                cursor.execute(''.join(statement.format(view) for view in batch))
        result = True
    except Exception as e:
        # Covers both connection errors and failing refresh statements.
        logger.error('error: {0} | {1}'.format((datetime.datetime.now() - beginTime).seconds, e))

    logger.info('End for RefershViewAmazonSku: {0} s'.format((datetime.datetime.now() - beginTime).seconds))

    return result
예제 #5
0
def AsyncCapture():
    """Run the capture-data sync script once for every running server.

    Iterates all ServerConfigModel rows, skips those whose
    ``server_status.running_status`` is not 1, and passes the script path
    followed by the server IP to ``execcmdCommand``. The outcome of each call
    is logged; a failure for one server does not stop the loop.
    """
    os.environ.setdefault("DJANGO_SETTINGS_MODULE",
                          "core.settings")  # your Django settings module
    django.setup()
    from system.model import ServerConfigModel

    script = '/var/www/html/amazon_analysis_v1/amazon/cronjobs/job_amazon_sync_capturc_data.sh '
    for server in ServerConfigModel.objects.all():
        if server.server_status.running_status != 1:
            continue
        try:
            output = execcmdCommand(script + server.ip)
            logger.info('result:{0} | {1}'.format(server.ip, output))
        except Exception as err:
            logger.error('faild:{0} | {1}'.format(server.ip, err))
    logger.info('done')
예제 #6
0
def send():
    """Send up to 100 pending messages to the DingTalk robot endpoint.

    Selects MessageContentModel rows with ``status=2``, POSTs each one to
    ``https://oapi.dingtalk.com/robot/send`` using the row's token and, when
    the answer is HTTP 200 with ``errcode == 0``, sets the row's ``status``
    to 1. A message that fails is logged and left untouched so that it does
    not block the messages queued behind it.

    Returns:
        The number of messages that were sent successfully.
    """
    os.environ.setdefault("DJANGO_SETTINGS_MODULE",
                          "core.settings")  # your Django settings module
    django.setup()
    from system.model.MessageContentModel import MessageContentModel
    result = 0
    beginTime = datetime.datetime.now()
    try:
        logger.info('Begin for TaskMessage: {0}'.format(
            beginTime.strftime('%H:%M:%S')))
        msg_list = MessageContentModel.objects.filter(status=2).all()[:100]
        url = 'https://oapi.dingtalk.com/robot/send?access_token='
        for msg in msg_list:
            try:
                msgtype = msg.sending_type if msg.sending_type is not None else "markdown"
                # NOTE(review): the body is always sent under the 'markdown'
                # key, whatever msgtype is - confirm other types are never
                # stored in sending_type.
                data = {
                    "msgtype": msgtype,
                    'markdown': {
                        "title": msg.title,
                        "text": msg.content
                    },
                    "at": {
                        "isAtAll": True
                    }
                }
                response = req.api.post(url + msg.token, json=data, timeout=100)
                if response.status_code == 200 and response.json().get('errcode') == 0:
                    result = result + 1
                    msg.status = 1
                    msg.save()
            except Exception as e:
                # Isolate the failure: one bad token or network error must not
                # abort the rest of the batch.
                logger.error('error: message {0} | {1}'.format(msg.pk, e))
    except Exception as e:
        logger.error('error: {0} | {1}'.format(
            (datetime.datetime.now() - beginTime).seconds, e))
    logger.info('End for TaskMessage: {0} s'.format(
        (datetime.datetime.now() - beginTime).seconds))
    return result
def refershView():
    """Refresh the day/month/week SKU rank and price-log materialized views.

    All twelve REFRESH statements are sent in a single ``cursor.execute``
    call on the ``default`` connection. Errors are printed with their
    traceback and logged.

    Returns:
        True when the refresh statement ran without raising, False otherwise.
    """
    os.environ.setdefault("DJANGO_SETTINGS_MODULE",
                          "core.settings")  # your Django settings module
    django.setup()
    result = False
    beginTime = datetime.datetime.now()

    # Same statements, same order: each metric is refreshed for day, month, week.
    metrics = ('keyword_rank', 'bestseller_rank', 'review_rank', 'price_log')
    periods = ('day', 'month', 'week')
    sql = ''.join(
        'REFRESH MATERIALIZED VIEW "public".view_i_amazon_sku_{0}_{1};'.format(
            metric, period)
        for metric in metrics
        for period in periods)

    try:
        logger.info('Begin for RefershView: {0}'.format(
            beginTime.strftime('%H:%M:%S')))
        with connections['default'].cursor() as cursor:
            try:
                cursor.execute(sql)
                result = True
            except Exception as err:
                import traceback
                traceback.print_exc()
                elapsed = (datetime.datetime.now() - beginTime).seconds
                logger.error('error: {0} | {1}'.format(elapsed, err))
    except Exception as err:
        import traceback
        traceback.print_exc()
        elapsed = (datetime.datetime.now() - beginTime).seconds
        logger.error('error: {0} | {1}'.format(elapsed, err))
    logger.info('End for RefershView: {0} s'.format(
        (datetime.datetime.now() - beginTime).seconds))
    return result
예제 #8
0
def addSubscription(request):
    """Subscribe the current user to the RSS feed given in ``POST['URL']``.

    Reuses the Subscription row whose ``Address`` equals the URL, or creates
    one named after the parsed feed's channel title. A UsersSubscriptions
    link is then created for ``request.user``.

    Args:
        request: The incoming HTTP request; must be a POST carrying ``URL``.

    Returns:
        The response of ``subscriptions(request)`` on success, or an
        ``HttpResponse`` with status 403 when the request is not a POST, the
        user is already subscribed, or anything fails.
    """
    logger.info("Called addSubscription()")
    # NOTE(review): args is not read anywhere in this function; kept because
    # csrf() is defined outside this block - confirm it can be dropped.
    args = {}
    args.update(csrf(request))

    if request.method != 'POST':
        return HttpResponse(status=403)

    try:
        # Read the field inside the try: a missing 'URL' key must answer 403
        # instead of raising out of the view.
        url = request.POST['URL']
        logger.info("With URL=" + url)

        subscription = Subscription.objects.filter(Address=url).first()
        if subscription is None:
            feed = feedparser.parse(url)
            subscription = Subscription()
            subscription.Address = url
            subscription.Name = feed['channel']['title']
            subscription.Type = 'Rss'
            subscription.LastUpdateDate = timezone.now()
            subscription.save()

        current_user = request.user

        query = (Q(User_id=current_user.id) & Q(Subscription_id=subscription.id))
        if UsersSubscriptions.objects.filter(query).exists():
            # The user already has this subscription.
            return HttpResponse(status=403)

        user_subscription = UsersSubscriptions()
        user_subscription.AddedDate = timezone.now()
        user_subscription.Subscription = subscription
        user_subscription.User = current_user
        user_subscription.save()
        logger.info("RSS Added.")
        return subscriptions(request)
    except Exception as ex:
        logger.error(ex)
        return HttpResponse(status=403)
예제 #9
0
def handle():
    """Queue ShoppingCartNotice messages for every active message template.

    For each MessageTemplateModel row with ``message_type`` equal to
    "ShoppingCartNotice" and ``status=1``: parse the template's JSON
    condition, select the matching CaptureSkuBuyBoxStateDv rows that have no
    TemplateRunLogModel entry for this template yet, render the template
    content with them, store one MessageContentModel row (``status=2``) per
    DingTalk group attached to the template, and record the processed rows
    in TemplateRunLogModel.

    Returns:
        True when all templates were processed without raising, False
        otherwise.
    """
    os.environ.setdefault("DJANGO_SETTINGS_MODULE",
                          "core.settings")  # your Django settings module
    django.setup()
    from system.model import MessageTemplateModel, TemplateRunLogModel, MessageContentModel
    from amazon.dv import CaptureSkuBuyBoxStateDv
    result = False
    beginTime = datetime.datetime.now()
    message_type = "ShoppingCartNotice"
    try:
        logger.info('Begin for {0}: {1}'.format(
            message_type, beginTime.strftime('%H:%M:%S')))
        # Templates registered for this message type.
        template_list = MessageTemplateModel.objects.filter(
            message_type=message_type, status=1)
        # Template variables shared by every template of this type.
        data = get_var_data(message_type)
        for template in template_list:
            # Turn the stored JSON condition into a queryset filter.
            condition = json.loads(template.condition)
            conditions = condition_parse(condition)
            queryset = CaptureSkuBuyBoxStateDv.objects.all()
            queryset = queryset.filter(conditions)
            data_list = []
            log_list = []
            for sku in queryset:
                # Skip rows this template has already reported.
                if TemplateRunLogModel.objects.filter(
                        template_id=template.id,
                        sku_buy_box_state_id=sku.id).exists():
                    continue
                log_list.append(
                    TemplateRunLogModel(template_id=template.id,
                                        sku_buy_box_state_id=sku.id))
                data_list.append(sku)
            if not data_list:
                continue
            # Render the message body from the business rows and the variables.
            tpl = Template(template.content)
            data['data'] = data_list
            ctx = Context(data)
            text = tpl.render(ctx)
            # One message row per target group. Re-saving a single instance
            # would update the same row each time, leaving only the last
            # group's token stored.
            for dt in template.dingtalk.all():
                message = MessageContentModel()
                message.title = template.describe
                message.sending_type = template.type
                message.message_type = message_type
                message.condition = str(conditions)
                message.content = text
                message.group_name = dt.name
                message.token = dt.token
                message.status = 2
                message.save()
            # Record which rows this template has now reported.
            TemplateRunLogModel.objects.bulk_create(log_list)
        result = True
    except Exception as e:
        logger.error('error: {0} | {1}'.format(
            (datetime.datetime.now() - beginTime).seconds, e))
    logger.info('End for {0}: {1} s'.format(
        message_type, (datetime.datetime.now() - beginTime).seconds))
    return result
예제 #10
0
def testcron(request=''):
    """Log a fixed test line and return a fixed HttpResponse.

    Args:
        request: Accepted but not read.

    Returns:
        An HttpResponse whose body is ``'test cronjob'``.
    """
    logger.info('test inner pro. cronjob')
    response = HttpResponse('test cronjob')
    return response
def transferBestseller(ruleId):
    """Copy newly analysed bestseller rows into CaptureSkuBestsellerRankModel.

    Reads up to 10000 rows from ``analysis_product_bestseller`` whose
    ``capture_code`` equals the rule's capture rule code and whose ``id`` is
    greater than the rule's ``sync_last_id``. Each row's ASIN is mapped to a
    SKU through ProductAsinModel; rows without a mapping are skipped. The
    mapped rows are bulk-inserted, the rule's ``sync_at``/``sync_last_id``
    are updated and the bestseller-rank materialized views are refreshed,
    all inside one transaction.

    Args:
        ruleId: Primary key of the AnalysisRuleModel row driving the sync.

    Returns:
        True when the run finished without raising, False otherwise.
    """
    os.environ.setdefault("DJANGO_SETTINGS_MODULE",
                          "core.settings")  # your Django settings module
    django.setup()
    from amazon.model import CaptureSkuBestsellerRankModel, AmazonProductCategoryModel
    from appfront.model import ProductAsinModel
    from django.db import transaction
    from rule.model import AnalysisRuleModel
    _result = False
    beginTime = datetime.datetime.now()
    logger.info("ruleId:{}".format(ruleId))
    try:
        logger.info('Send Command of TransferData: {0}'.format(
            beginTime.strftime('%H:%M:%S')))
        cfg = AnalysisRuleModel.objects.filter(pk=ruleId).first()
        if cfg is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(
                'AnalysisRuleModel {0} does not exist'.format(ruleId))
        sync_last_id = 0 if cfg.sync_last_id is None else int(cfg.sync_last_id)
        with connections['default'].cursor() as cursor:
            cursor.execute(
                "SELECT * FROM analysis_product_bestseller WHERE capture_code = %s AND id > %s ORDER BY id ASC limit 10000",
                [str(cfg.capture_rule.rule_code), sync_last_id])
            columns = [col[0] for col in cursor.description]
            rows = [dict(zip(columns, row)) for row in cursor.fetchall()]
            asins = {row['asin'] for row in rows}
            categorys = {row['category_id'] for row in rows}
            sync_at = timezone.now()
            product_list = ProductAsinModel.objects.filter(
                asin__in=asins).values_list('asin', 'combine_type', 'sku')
            category_list = AmazonProductCategoryModel.objects.filter(
                code__in=categorys).values_list('code', 'id')
            # Per ASIN keep the product tuple with the smallest combine_type.
            sku_map = {}
            for product in product_list:
                current = sku_map.get(product[0])
                if current is None or int(current[1]) > int(product[1]):
                    sku_map[product[0]] = product
            category_map = {code: pk for code, pk in category_list}
            data_list = []
            logger.info('rows:{}'.format(len(rows)))
            for row in rows:
                product = sku_map.get(row['asin'])
                if product is None:
                    # NOTE(review): skipped rows do not advance sync_last_id,
                    # so a batch made only of unmapped ASINs is read again on
                    # every run - confirm this retry behaviour is intended.
                    continue
                category_id = category_map.get(row['category_id'], 1)
                category_title = ('All Category' if row['category_name'] == ''
                                  else row['category_name'])
                # A missing page (empty, the text 'None' or NULL) defaults to 5.
                rank_page = (5 if row['page'] in ('', 'None', None)
                             else int(row['page']))
                data = {
                    'platform': 'amazon',
                    'sku': product[2],
                    'asin': row['asin'],
                    'capture_at': row['capture_at'],
                    'rank_on': int(row['rank_on']),
                    'rank_page': rank_page,
                    'category_id': category_id,
                    'category_title': category_title
                }
                data_list.append(CaptureSkuBestsellerRankModel(**data))
                sync_last_id = max(sync_last_id, int(row['id']))
            logger.info('records:{0}, times:{1} s'.format(
                len(data_list), (datetime.datetime.now() - beginTime).seconds))
            if data_list:
                with transaction.atomic():
                    AnalysisRuleModel.objects.filter(pk=cfg.id).update(
                        sync_at=sync_at, sync_last_id=sync_last_id)
                    # Insert in chunks of n rows.
                    n = 5000
                    for start in range(0, len(data_list), n):
                        CaptureSkuBestsellerRankModel.objects.bulk_create(
                            data_list[start:start + n])
                    logger.info('records:{0}, times:{1} s'.format(
                        len(data_list),
                        (datetime.datetime.now() - beginTime).seconds))
                    cursor.execute(
                        'REFRESH MATERIALIZED VIEW "public".view_i_amazon_sku_bestseller_rank_day;'
                        'REFRESH MATERIALIZED VIEW "public".view_i_amazon_sku_bestseller_rank_month;'
                        'REFRESH MATERIALIZED VIEW "public".view_i_amazon_sku_bestseller_rank_week;'
                    )
        _result = True
    except Exception as e:
        import traceback
        traceback.print_exc()
        logger.error('error: {0} | {1}'.format(
            (datetime.datetime.now() - beginTime).seconds, e))
    logger.info('End for send command of TransferData: {0} s'.format(
        (datetime.datetime.now() - beginTime).seconds))
    return _result
def transferReview(ruleId):
    """Copy newly analysed review rows into CaptureSkuReviewModel.

    Reads up to 10000 rows from ``analysis_product_review`` whose
    ``capture_code`` equals the rule's capture rule code and whose ``id`` is
    greater than the rule's ``sync_last_id``. Rows whose ASIN has no SKU
    mapping, or whose ``review_id`` is already stored, are skipped. The
    remaining rows are de-duplicated by ``review_id`` and bulk-inserted, the
    rule's ``sync_at``/``sync_last_id`` are updated and the review-rank
    materialized views are refreshed, all inside one transaction.

    Args:
        ruleId: Primary key of the AnalysisRuleModel row driving the sync.

    Returns:
        True when the run finished without raising, False otherwise.
    """
    os.environ.setdefault("DJANGO_SETTINGS_MODULE",
                          "core.settings")  # your Django settings module
    django.setup()
    from amazon.model import CaptureSkuReviewModel
    from appfront.model import ProductAsinModel
    from django.db import transaction
    from rule.model import AnalysisRuleModel

    _result = False
    beginTime = datetime.datetime.now()
    try:
        logger.info('Send Command of TransferData: {0}'.format(
            beginTime.strftime('%H:%M:%S')))
        cfg = AnalysisRuleModel.objects.filter(pk=ruleId).first()
        if cfg is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(
                'AnalysisRuleModel {0} does not exist'.format(ruleId))
        sync_last_id = 0 if cfg.sync_last_id is None else int(cfg.sync_last_id)
        with connections['default'].cursor() as cursor:
            cursor.execute(
                "SELECT * FROM analysis_product_review WHERE capture_code = %s AND id > %s ORDER BY id ASC limit 10000",
                [str(cfg.capture_rule.rule_code), sync_last_id])
            columns = [col[0] for col in cursor.description]
            rows = [dict(zip(columns, row)) for row in cursor.fetchall()]
            asins = {row['asin'] for row in rows}
            reviews = {row['review_id'] for row in rows}
            product_list = ProductAsinModel.objects.filter(
                asin__in=asins).values_list('asin', 'combine_type', 'sku')
            review_list = CaptureSkuReviewModel.objects.filter(
                review_id__in=reviews).values_list('review_id', 'id')
            # Per ASIN keep the product tuple with the smallest combine_type.
            sku_map = {}
            for product in product_list:
                current = sku_map.get(product[0])
                if current is None or int(current[1]) > int(product[1]):
                    sku_map[product[0]] = product
            # review_id -> primary key of reviews that are already stored.
            review_map = {review_id: pk for review_id, pk in review_list}
            data_list = []
            logger.info('rows:{}'.format(len(rows)))
            sync_at = timezone.now()
            # The scraped date carries no zone; it is read as UTC+10.
            source_tz = datetime.timezone(datetime.timedelta(hours=10))
            for row in rows:
                product = sku_map.get(row['asin'])
                if product is None or review_map.get(
                        row['review_id']) is not None:
                    # NOTE(review): skipped rows do not advance sync_last_id,
                    # so a batch made only of skipped rows is read again on
                    # every run - confirm this is intended.
                    continue
                if not row['review_at']:
                    # Empty or NULL date: store no review date.
                    review_at = None
                else:
                    review_at = datetime.datetime.strptime(
                        row['review_at'], '%d %B %Y').replace(
                            tzinfo=source_tz).astimezone(datetime.timezone.utc)
                data = {
                    'platform': 'amazon',
                    'sku': product[2],
                    'asin': row['asin'],
                    'link': row['target_url'],
                    'review_at': review_at,
                    'review_id': row['review_id'],
                    'review_rank': row['rank_on'],
                    'author': row['author'],
                    'title': '',
                    'content': '',
                    'selection': '',
                    'capture_at': row['capture_at']
                }
                data_list.append(CaptureSkuReviewModel(**data))
                sync_last_id = max(sync_last_id, int(row['id']))
            logger.info('records:{0}, times:{1} s'.format(
                len(data_list), (datetime.datetime.now() - beginTime).seconds))
            if data_list:
                # Keep one instance per review_id; the last one seen wins.
                data_list = list(
                    {item.review_id: item for item in data_list}.values())
                with transaction.atomic():
                    AnalysisRuleModel.objects.filter(pk=cfg.id).update(
                        sync_at=sync_at, sync_last_id=sync_last_id)
                    # Insert in chunks of n rows.
                    n = 1000
                    for start in range(0, len(data_list), n):
                        CaptureSkuReviewModel.objects.bulk_create(
                            data_list[start:start + n])
                    logger.info('records:{0}, times:{1} s'.format(
                        len(data_list),
                        (datetime.datetime.now() - beginTime).seconds))
                    cursor.execute(
                        'REFRESH MATERIALIZED VIEW "public".view_i_amazon_sku_review_rank_day;'
                        'REFRESH MATERIALIZED VIEW "public".view_i_amazon_sku_review_rank_month;'
                        'REFRESH MATERIALIZED VIEW "public".view_i_amazon_sku_review_rank_week;'
                    )
        _result = True
    except Exception as e:
        import traceback
        traceback.print_exc()
        logger.error('error: {0} | {1}'.format(
            (datetime.datetime.now() - beginTime).seconds, e))
    logger.info('End for send command of TransferData: {0} s'.format(
        (datetime.datetime.now() - beginTime).seconds))
    return _result
def transferBuybox(ruleId):
    """Copy newly analysed buy-box rows into CaptureSkuBuyBoxStateModel.

    Reads up to 10000 rows from ``analysis_product_buybox`` whose
    ``capture_code`` equals the rule's capture rule code and whose ``id`` is
    greater than the rule's ``sync_last_id``. Rows whose ASIN has no SKU
    mapping, or whose resolved price is not positive, are skipped. The
    buy-box seller and price take precedence over the plain ones when
    present. ``buy_box_state`` is 2 when the seller is
    'Artiss Furnishings' and 1 otherwise. The mapped rows are bulk-inserted
    and the rule's ``sync_at``/``sync_last_id`` updated in one transaction.

    Args:
        ruleId: Primary key of the AnalysisRuleModel row driving the sync.

    Returns:
        True when the run finished without raising, False otherwise.
    """
    os.environ.setdefault("DJANGO_SETTINGS_MODULE",
                          "core.settings")  # your Django settings module
    django.setup()
    from amazon.model import CaptureSkuBuyBoxStateModel
    from appfront.model import ProductAsinModel
    from django.db import transaction
    from rule.model import AnalysisRuleModel
    _result = False
    beginTime = datetime.datetime.now()
    try:
        logger.info('Send Command of TransferData: {0}'.format(
            beginTime.strftime('%H:%M:%S')))
        cfg = AnalysisRuleModel.objects.filter(pk=ruleId).first()
        if cfg is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(
                'AnalysisRuleModel {0} does not exist'.format(ruleId))
        sync_last_id = 0 if cfg.sync_last_id is None else int(cfg.sync_last_id)
        with connections['default'].cursor() as cursor:
            cursor.execute(
                "SELECT * FROM analysis_product_buybox WHERE capture_code = %s AND id > %s ORDER BY id ASC limit 10000",
                [str(cfg.capture_rule.rule_code), sync_last_id])
            columns = [col[0] for col in cursor.description]
            rows = [dict(zip(columns, row)) for row in cursor.fetchall()]
            asins = {row['asin'] for row in rows}
            product_list = ProductAsinModel.objects.filter(
                asin__in=asins).values_list('asin', 'combine_type', 'sku')
            # Per ASIN keep the product tuple with the smallest combine_type.
            sku_map = {}
            for product in product_list:
                current = sku_map.get(product[0])
                if current is None or int(current[1]) > int(product[1]):
                    sku_map[product[0]] = product
            data_list = []
            logger.info('rows:{}'.format(len(rows)))
            sync_at = timezone.now()
            empty = ('', None)
            for row in rows:
                product = sku_map.get(row['asin'])
                if product is None:
                    # Skip unmapped rows before parsing them, so a malformed
                    # price on a row that is dropped anyway cannot abort the
                    # whole batch.
                    continue
                # Prefer the buy-box price; parse only the value that is used.
                raw_price = row['sold_by_price_buybox']
                if raw_price in empty:
                    raw_price = row['sold_by_price']
                sold_by_price = 0 if raw_price in empty else float(raw_price)
                if sold_by_price <= 0:
                    continue
                # Prefer the buy-box seller when one is given.
                sold_by = (row['sold_by'] if row['sold_by_buybox'] == ''
                           else row['sold_by_buybox'])
                buy_box_state = 2 if sold_by == 'Artiss Furnishings' else 1
                data = {
                    'platform': 'amazon',
                    'sku': product[2],
                    'asin': row['asin'],
                    'link': row['target_url'],
                    'buy_box_state': buy_box_state,
                    'capture_at': row['capture_at'],
                    'sold_by': sold_by,
                    'sold_by_price': sold_by_price
                }
                data_list.append(CaptureSkuBuyBoxStateModel(**data))
                sync_last_id = max(sync_last_id, int(row['id']))
            logger.info('records:{0}, times:{1} s'.format(
                len(data_list), (datetime.datetime.now() - beginTime).seconds))
            if data_list:
                with transaction.atomic():
                    AnalysisRuleModel.objects.filter(pk=cfg.id).update(
                        sync_at=sync_at, sync_last_id=sync_last_id)
                    # Insert in chunks of n rows.
                    n = 1000
                    for start in range(0, len(data_list), n):
                        CaptureSkuBuyBoxStateModel.objects.bulk_create(
                            data_list[start:start + n])
        _result = True
    except Exception as e:
        import traceback
        traceback.print_exc()
        logger.error('error: {0} | {1}'.format(
            (datetime.datetime.now() - beginTime).seconds, e))
    logger.info('End for send command of TransferData: {0} s'.format(
        (datetime.datetime.now() - beginTime).seconds))
    return _result
예제 #14
0
def _analysis_field_rules(fields):
    """Parse the extraction settings of every ``<field>`` element once.

    The settings do not depend on the page or chunk being analysed, so they
    are read a single time instead of once per chunk.

    Args:
        fields: iterable of field elements exposing ``findtext`` and
            ``xpath`` (the result of the rule's ``//fields/field`` lookup).

    Returns:
        A list of dicts, one per field, in document order.

    Raises:
        TypeError, ValueError: if ``position-group`` or ``pattern-group`` is
            missing or is not an integer.
    """
    rules = []
    for field in fields:
        rules.append({
            'name': field.findtext('./name'),
            'mode': field.findtext('./mode'),
            'position': field.findtext('./position'),
            'position_group': int(field.findtext('./position-group')),
            'pattern_group': int(field.findtext('./pattern-group')),
            'pattern': field.findtext('./pattern'),
            'required': field.findtext('./required'),
            'default': field.findtext('./default'),
            'values': [
                item.findtext("./value")
                for item in field.xpath('./value-set/mapper/item')
            ],
        })
    return rules


def _analysis_field_value(rule, chunk, ordinal):
    """Compute the raw value of one field for one chunk.

    Args:
        rule: one entry produced by ``_analysis_field_rules``.
        chunk: the chunk node the field's ``position`` XPath is evaluated on.
        ordinal: 1-based position of the chunk across pages, used by mode 3.

    Returns:
        The extracted value, or ``''`` when nothing was extracted (the caller
        then applies the required/default handling).
    """
    values = rule['values']
    position = rule['position']
    # A missing or empty <position> means there is nothing to look up.
    nodes = [] if not position else chunk.xpath(position)
    mode = rule['mode']
    if mode == '2':
        # Presence flag: first mapped value when the node exists, second one
        # when it does not; falls back to '1' / '2' without a full mapping.
        if len(nodes) > 0:
            return '1' if len(values) <= 1 else values[0]
        return '2' if len(values) <= 1 else values[1]
    if mode == '3':
        # Ordinal of the chunk, optionally translated through the value set.
        return ordinal if len(values) <= ordinal else values[ordinal]
    # Default mode: run the regex against the selected node's text.
    if len(nodes) == 0:
        return ''
    group = rule['position_group']
    target = "" if len(nodes) <= group else str(nodes[group]).strip()
    found = re.compile(r'{}'.format(rule['pattern']),
                       re.DOTALL).search(target)
    return "" if found is None else found[rule['pattern_group']]


def AnalysisData(ruleId):
    """Analyse captured HTML pages for one analysis rule and store the rows.

    Loads the rule identified by ``ruleId``, reads up to 1000 rows of
    ``res_capture_html`` whose ``last_id`` is greater than the rule's stored
    ``analysis_last_id``, extracts one record per chunk according to the
    rule's XML field definitions, inserts the accepted records through
    ``ResultData`` and finally stores the new ``analysis_at`` /
    ``analysis_last_id`` on the rule.

    The stored ``analysis_last_id`` advances past every fetched page, even
    when a page produced no chunk or no accepted record; otherwise a batch
    without records would be fetched again on every run.

    Args:
        ruleId: primary key of the ``AnalysisRuleModel`` row to process.

    Returns:
        Whatever ``ResultData.executeInsert()`` returns when records were
        written; ``False`` when nothing was written or an error occurred
        (errors are logged, not raised).
    """
    os.environ.setdefault("DJANGO_SETTINGS_MODULE",
                          "core.settings")  # your Django settings module
    django.setup()
    from core.libs.analysis_utils import ResultData
    from core.libs.xml_utils import XmlObject
    from rule.model import AnalysisRuleModel
    _result = False
    beginTime = datetime.datetime.now()
    try:
        logger.info('Send Command of AnalysisData: {0}'.format(
            beginTime.strftime('%H:%M:%S')))
        cfg = AnalysisRuleModel.objects.filter(pk=ruleId).first()
        if cfg is None:
            # Fail with a clear message instead of an AttributeError below.
            raise ValueError(
                'analysis rule {0} does not exist'.format(ruleId))
        config = XmlObject(xmlstr=cfg.xml_data)
        chunks = config.getElement("chunk").text
        table = config.getElement("table").text
        fields = config.getElement(xpath="//fields/field")
        pages_xpath = config.getElement("page").text
        rules = _analysis_field_rules(fields)
        model_list = []
        analysis_last_id = 0 if cfg.analysis_last_id is None else cfg.analysis_last_id
        analysis_at = timezone.now()
        analysis_code = uuid.uuid4()
        # Fetch the batch first so the cursor is released before parsing.
        with connections['default'].cursor() as cursor:
            cursor.execute(
                'SELECT task_code,job_code,request_code,capture_code,target_url,capture_at,last_id,html FROM res_capture_html WHERE capture_code = %s AND last_id > %s ORDER BY last_id ASC limit 1000',
                [str(cfg.capture_rule.rule_code),
                 int(analysis_last_id)])
            columns = [col[0] for col in cursor.description]
            page_list = [dict(zip(columns, row)) for row in cursor.fetchall()]
        logger.info('pages:{0} | {1}'.format(cfg.capture_rule.rule_code,
                                             len(page_list)))
        for page in page_list:
            if page['last_id'] == '' or page['last_id'] is None:
                continue
            last_id = int(page['last_id'])
            # Advance per page, not per chunk, so that pages without any
            # chunk are not fetched again by the next run.
            if not last_id < analysis_last_id:
                analysis_last_id = last_id
            data = XmlObject(xmlstr=page['html'], type="html")
            pages = [] if pages_xpath is None else data.getElement(
                xpath=pages_xpath)
            curr_page = 1 if len(pages) < 1 else int(pages[0])
            chunk_list = data.getElement(xpath=chunks)
            chunk_len = len(chunk_list)
            # Bookkeeping columns shared by every record of this page.
            common = {
                'capture_at': page['capture_at'],
                'target_url': page['target_url'],
                'task_code': page['task_code'],
                'capture_code': page['capture_code'],
                'job_code': page['job_code'],
                'request_code': page['request_code'],
                'analysis_code': analysis_code,
                'analysis_at': str(analysis_at),
                'last_id': last_id
            }
            # One record per chunk of the page.
            for idx, chunk in enumerate(chunk_list):
                model = dict(common)
                ordinal = (curr_page - 1) * chunk_len + idx + 1
                passed = True
                for rule in rules:
                    value = _analysis_field_value(rule, chunk, ordinal)
                    if value == '':
                        if rule['required'] == '1':
                            # The record is dropped anyway, so the remaining
                            # fields need not be evaluated.
                            passed = False
                            break
                        value = rule['default']
                    model[rule['name']] = value
                if passed:
                    model_list.append(model)
        logger.info('records:{0}, times:{1} s'.format(
            len(model_list), (datetime.datetime.now() - beginTime).seconds))
        if len(page_list) > 0:
            res = None
            if len(model_list) > 0:
                # Queue the inserts in batches of n records.
                n = 10000
                res = ResultData(table, fields=tuple(model_list[0].keys()))
                for start in range(0, len(model_list), n):
                    res.createInsert(values=[
                        v.values() for v in model_list[start:start + n]
                    ])
            from django.db import transaction
            with transaction.atomic():
                if res is not None:
                    _result = res.executeInsert()
                # Store the progress even when no record was produced.
                AnalysisRuleModel.objects.filter(pk=cfg.id).update(
                    analysis_at=analysis_at, analysis_last_id=analysis_last_id)
    except Exception as e:
        import traceback
        traceback.print_exc()
        logger.error('error: {0} | {1}'.format(
            (datetime.datetime.now() - beginTime).seconds, e))
    logger.info('End for send command of AnalysisData: {0} s'.format(
        (datetime.datetime.now() - beginTime).seconds))
    return _result
    '''执行系统命令'''