Example #1
    def run(cls, logger=None, **kwargs):
        """
        更新货币的汇率信息
        @param param_dict:
        """
        logger = logger if 'logger' in kwargs else get_logger()
        logger.info('Update currency STARTED!!!!')

        with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db:
            for currency in db.query_match('currency', 'currency_info').fetch_row(maxrows=0):
                currency = currency[0]
                try:
                    logger.debug(str.format('Fetching currency data for {0}...', currency))
                    data = cm.get_data(url=str.format('http://download.finance.yahoo.com/d/quotes.csv?s={0}CNY=X'
                                                      '&f=sl1d1t1ba&e=.json', currency))
                    rate, d, t = [val for val in csv.reader(StringIO(data['body']))][0][1:4]
                    rate = float(rate)
                    timestamp = datetime.strptime(' '.join((d, t)), '%m/%d/%Y %I:%M%p').strftime('%Y-%m-%d %H:%M:%S')
                    db.update({'rate': rate, 'update_time': timestamp}, 'currency_info',
                              str.format('currency="{0}"', currency))
                except (ValueError, IOError):
                    continue
                except:
                    raise
        logger.info('Update currency ENDED!!!!')
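
A minimal standalone sketch of the quotes.csv parsing step above; the sample line is a hypothetical response body rather than captured output:

    import csv
    from datetime import datetime

    sample = 'USDCNY=X,6.23,"10/24/2014","5:05pm",6.22,6.24'  # assumed response body
    rate, d, t = list(csv.reader([sample]))[0][1:4]
    rate = float(rate)
    timestamp = datetime.strptime(' '.join((d, t)),
                                  '%m/%d/%Y %I:%M%p').strftime('%Y-%m-%d %H:%M:%S')
    # rate -> 6.23, timestamp -> '2014-10-24 17:05:00'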
Example #2
    def run(cls, **kwargs):
        logger = kwargs['logger'] if 'logger' in kwargs else get_logger()
        logger.info('BACKUP STARTED')

        try:
            db_spec = getattr(glob, 'DATABASE')[kwargs['DATABASE']]
            # {"host": "127.0.0.1", "port": 1228, "schema": "editor_stores", "username": "******", "password": "******"}
            host = db_spec['host'] if 'host' in db_spec else '127.0.0.1'
            port = db_spec['port'] if 'port' in db_spec else 3306
            schema = db_spec['schema']
            db_user = db_spec['username']
            db_pwd = db_spec['password']

            ssh_user, ssh_host, ssh_port, dst = (None, None, 22, '')
            if 'SSH_USER' in kwargs:
                ssh_user = kwargs['SSH_USER']
            if 'SSH_HOST' in kwargs:
                ssh_host = kwargs['SSH_HOST']
            if 'SSH_PORT' in kwargs:
                ssh_port = int(kwargs['SSH_PORT'])
            if 'SSH_DST' in kwargs:
                dst = kwargs['SSH_DST']
        except (AttributeError, KeyError):
            logger.exception('Invalid database specification.')
            return

        host_str = str.format('-h{0}', host) if host else ''
        port_str = str.format('-P{0}', port) if port else ''

        tmp_file = '/tmp/single_backup.sql'
        # single-transaction backup
        logger.info('EXPORTING...')
        os.system(
            str.format(
                'mysqldump {3} {4} -u {0} -p{1} --single-transaction {2} > {5}',
                db_user, db_pwd, schema, host_str, port_str, tmp_file))

        # Pack into a 7z archive
        logger.info('ZIPPING...')
        backup_name = os.path.join(
            getattr(glob, 'STORAGE')['STORAGE_PATH'], 'backups',
            str.format('{0}_auto_backup.7z',
                       datetime.datetime.now().strftime('%Y%m%d_%H%M%S')))
        os.system(
            str.format('7z a -mx7 {0} {1} > /dev/null', backup_name, tmp_file))

        # Remove the temporary SQL file
        logger.info('REMOVING TEMPORARY SQL FILES...')
        os.remove(tmp_file)

        # SCP
        if ssh_user and ssh_host and ssh_port:
            # SSH information was provided: upload to the remote server as an off-site backup
            logger.info('UPLOADING...')
            ssh_port_str = str.format('-P {0}', ssh_port) if ssh_port else ''
            os.system(
                str.format('scp {0} {4} {1}@{2}:{3} > /dev/null', ssh_port_str,
                           ssh_user, ssh_host, dst, backup_name))

        logger.info(str.format('AUTO BACKUP COMPLETED: {0}', backup_name))
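
Because os.system interpolates the password and paths straight into a shell string, a safer variant (a sketch only, with hypothetical credentials mirroring db_spec above) would hand mysqldump its arguments as a list through the standard subprocess module:

    import subprocess

    # Hypothetical connection values; the password never passes through a shell.
    cmd = ['mysqldump', '-h127.0.0.1', '-P3306', '-u', 'editor', '-peditor_pwd',
           '--single-transaction', 'editor_stores']
    with open('/tmp/single_backup.sql', 'wb') as dump_file:
        subprocess.check_call(cmd, stdout=dump_file)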
Example #3
def translate_text_to(gs, text, to, source='', backup_gs=None):
    logger = get_logger()
    try:
        text = text.encode('utf-8')
    except:
        pass

    if not source and is_cht(text):
        source = 'zh-cn'

    result = None
    try:
        result = gs.translate(text, to, source)
    except:
        if not backup_gs:
            logger.info(
                str.format(
                    "Error: gs translate error with text : {0}       source : {1}        target : {2}",
                    text, source, to))
        pass
    if not result and backup_gs:
        try:
            result = backup_gs.translate(text, to, source)
        except:
            logger.info(
                str.format(
                    "Error: backupgs translate error with text : {0}       source : {1}        target : {2}",
                    text, source, to))
            pass

    return result
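
A usage sketch for translate_text_to, assuming the goslate-style clients seen in the translate_main example further below; the text and language codes here are hypothetical:

    import goslate

    gs = goslate.Goslate()
    backup_gs = goslate.Goslate()
    # Translate a (hypothetical) French description into simplified Chinese,
    # retrying with backup_gs if the primary client raises.
    result = translate_text_to(gs, 'Sac en cuir de veau', 'zh-cn', source='fr',
                               backup_gs=backup_gs)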
Example #4
    def run(cls, **kwargs):
        logger = kwargs['logger'] if 'logger' in kwargs else get_logger()
        logger.info('PRICE-CHANGE DETECTION STARTED')

        logger.info('CLEARING OUTDATED RECORDS')
        cls.tag_outdated(**kwargs)
        logger.info('GENERATING PRICE TRENDS')
        result = cls.tag_changed(**kwargs)
        if not result:
            logger.info('NO PRICE TRENDS DETECTED')
            return

        dst = kwargs['dst'] if 'dst' in kwargs and kwargs['dst'] \
            else '~/push_works/push.log'
        ssh_user, ssh_host, ssh_port = [None] * 3
        if 'ssh' in kwargs and kwargs['ssh']:
            ssh_str = kwargs['ssh']
            ssh_user, ssh = ssh_str.split('@')
            if ':' in ssh:
                ssh_host, ssh_port = ssh.split(':')
            else:
                ssh_host = ssh
                # Default ssh port
                ssh_port = 22

        if ssh_host:
            # Without SSH information, the results do not need to be pushed to the remote server over SFTP
            logger.info('UPLOADING PRICE TRENDS')
            # Write the change results into the temporary directory
            file_name = str.format(
                '/tmp/price_change_{0}.log',
                datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
            with open(file_name, 'wb') as f:
                f.write(json.dumps(result, ensure_ascii=False).encode('utf-8'))

            # SSH information was provided: upload to the remote server as a backup

            ssh_port_str = str.format('-P {0}', ssh_port) if ssh_port else ''
            ssh_cmd = str.format('scp {0} {4} {1}@{2}:{3} > /dev/null',
                                 ssh_port_str, ssh_user, ssh_host, dst,
                                 file_name)
            logger.info(str.format('UPLOADING: {0}', ssh_cmd))
            os.system(ssh_cmd)
            os.remove(file_name)

        # Publish the updated products
        updated_brands = set()
        for k in ('discount_down', 'discount_up', 'price_down', 'price_up'):
            updated_brands = updated_brands.union(result[k].keys())

        for brand in updated_brands:
            PublishRelease(brand).run()

        logger.info('DONE')
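
For reference, the ssh argument is expected in user@host[:port] form; a tiny sketch of that parsing with a hypothetical value:

    ssh_str = 'backup@203.0.113.10:2222'   # hypothetical kwargs['ssh'] value
    ssh_user, ssh = ssh_str.split('@')
    if ':' in ssh:
        ssh_host, ssh_port = ssh.split(':')
    else:
        ssh_host, ssh_port = ssh, 22       # default ssh port
    # -> ('backup', '203.0.113.10', '2222')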
Example #5
    def run(cls, **kwargs):
        logger = kwargs['logger'] if 'logger' in kwargs else get_logger()
        logger.info('IMAGE CHECK ALERT STARTED')

        with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db:
            rs = db.query(
                'SELECT fingerprint, brand_id, image_list, cover_image FROM products_release',
                use_result=True)
            while True:
                bulk = rs.fetch_row(maxrows=100)
                if not bulk:
                    break

                is_err = False
                for fingerprint, brand_id, jlist, jcover in bulk:
                    try:
                        image_list = json.loads(jlist)
                        for path in [tmp['path'] for tmp in image_list]:
                            if not re.search(str.format(r'^{0}_', brand_id),
                                             path):
                                content = str.format(
                                    'fingerprint={0}, image_list={1}',
                                    fingerprint, jlist)
                                logger.error(content)
                                cls.alert(
                                    str.format('INVALID IMAGES: {0}!!!',
                                               fingerprint), content)
                                is_err = True
                                break

                        cover = json.loads(jcover)
                        if not re.search(str.format(r'^{0}_', brand_id),
                                         cover['path']):
                            content = str.format('fingerprint={0}, jcover={1}',
                                                 fingerprint, jcover)
                            logger.error(content)
                            cls.alert(
                                str.format('INVALID IMAGES: {0}!!!',
                                           fingerprint), content)
                            is_err = True
                            break
                    except:
                        cls.alert(
                            str.format('INVALID IMAGES: {0}!!!', fingerprint),
                            str.format(
                                'fingerprint={0}, jlist={1}, jcover={2}',
                                fingerprint, jlist, jcover))
                        raise

                if is_err:
                    break

        logger.info('DONE!')
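
The check above assumes every stored image path begins with the owning brand id followed by an underscore; a minimal sketch of that validation with hypothetical values:

    import re

    brand_id, path = '10226', '10226_fr_bag_001.jpg'   # hypothetical record
    if not re.search(str.format(r'^{0}_', brand_id), path):
        raise ValueError('image %s does not belong to brand %s' % (path, brand_id))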
Example #6
    def run(self):
        logger = get_logger()
        # Only process data from key regions
        tmp = info.region_info()
        key_regions = filter(lambda val: tmp[val]['status'] == 1, tmp)
        with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db:
            # Delete the existing records
            logger.info(
                str.format('DELETING OLD RECORDS: brand_id={0}',
                           self.brand_id))
            db.execute(
                str.format('DELETE FROM products_release WHERE brand_id={0}',
                           self.brand_id))
            fp_list = [
                tmp[0] for tmp in db.query(
                    str.format(
                        'SELECT fingerprint FROM products WHERE brand_id={0} GROUP BY fingerprint',
                        self.brand_id)).fetch_row(maxrows=0)
            ]
            self.tot = len(fp_list)
            self.progress = 0

            # Commit the transaction at least once every 100 fingerprints
            transaction_max = 100

            logger.info(
                str.format('TOT: {0} fingerprints, brand_id={1}', self.tot,
                           self.brand_id))
            db.start_transaction()
            for self.progress in xrange(self.tot):
                if self.progress % transaction_max == 0:
                    db.commit()
                    db.start_transaction()
                    logger.info(
                        str.format(
                            'PROCESSED {0}/{1} fingerprints, brand_id={2}',
                            self.progress, self.tot, self.brand_id))

                fp = fp_list[self.progress]
                model_list = list(
                    filter(
                        lambda val: val['region'] in key_regions,
                        db.query_match(['*'], 'products', {
                            'fingerprint': fp
                        }).fetch_row(maxrows=0, how=1)))
                if model_list:
                    self.merge_prods(model_list, db)
            db.commit()

            logger.info(str.format('DONE, brand_id={0}', self.brand_id))
Example #7
    def run(cls, **kwargs):
        cls.running = True

        logger = kwargs['logger'] if 'logger' in kwargs else get_logger()
        logger.info(str.format("Translate tasker start"))

        db_spec = kwargs['db_spec'] if 'db_spec' in kwargs else 'DB_SPEC'
        start = kwargs['start'] if 'start' in kwargs else 0
        count = kwargs['count'] if 'count' in kwargs else 100

        translate_main(start, count, logger, db_spec)

        logger.info(str.format("Translate tasker end"))

        cls.running = False
Example #8
    def run(cls, logger=None, **kwargs):
        logger = logger if logger else get_logger(logger_name='monitor')
        logger.info('Monitor STARTED!!!')

        # Limits: number of monitor processes, number of recrawl processes, and the recrawl interval
        try:
            monitor_no = getattr(gs, 'MONITOR')['MAX_MONITOR']
        except (AttributeError, KeyError):
            monitor_no = 6
        try:
            recrawl_no = getattr(gs, 'MONITOR')['MAX_RECRAWLER']
        except (AttributeError, KeyError):
            recrawl_no = 12

        interval = kwargs['interval'] if 'interval' in kwargs else 7
        limit_time = datetime.datetime.now() - datetime.timedelta(interval)
        with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db:
            pid_list = psutil.pids()

            rs = db.query_match(['idmonitor', 'parameter', 'monitor_status', 'monitor_pid', 'recrawl_pid'],
                                'monitor_status', {'enabled': 1}).fetch_row(maxrows=0)
            #update monitor_status

            for idmonitor, parameter, monitor_status, monitor_pid, recrawl_pid in rs:
                # Clear monitor_pid / recrawl_pid for processes that are no longer alive
                if monitor_pid and int(monitor_pid) not in pid_list:
                    db.update({'monitor_pid': None}, 'monitor_status', str.format('idmonitor="{0}"', idmonitor))
                if recrawl_pid and int(recrawl_pid) not in pid_list:
                    db.update({'recrawl_pid': None}, 'monitor_status', str.format('idmonitor="{0}"', idmonitor))

            # Refresh the status list
            rs_new = db.query_match(
                ['idmonitor', 'parameter', 'monitor_status', 'monitor_pid', 'recrawl_pid', 'timestamp'],
                'monitor_status', {'enabled': 1}, tail_str='ORDER BY priority desc, timestamp').fetch_row(maxrows=0)

            # Idle product lists: after sorting, the least recently updated entries wait to be monitored
            idle_monitor_list = []
            idle_recrawl_list = []
            monitor_list = []
            recrawl_list = []

            for idmonitor, parameter, monitor_status, monitor_pid, recrawl_pid, timestamp in rs_new:
                # Build the monitor_pid / recrawl_pid lists used to cap the number of monitor and recrawl processes
                if monitor_pid is not None:
                    monitor_list.append(int(monitor_pid))
                if recrawl_pid is not None:
                    recrawl_list.append(int(recrawl_pid))

                if monitor_status == '0' and monitor_pid is None and recrawl_pid is None:
                    idle_monitor_list.append((idmonitor, parameter, timestamp))
                if monitor_status == '1' and monitor_pid is None and recrawl_pid is None:
                    idle_recrawl_list.append((idmonitor, parameter, timestamp))

                    # # If the spider's last update time is earlier than the limit time, recrawl.
                    # update_time = datetime.datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')
                    # if update_time <= limit_time:
                    #     db.update({'monitor_status': 1}, 'monitor_status',
                    #               str.format('idmonitor={0}', idmonitor))

            # idle_monitor_list = sorted(idle_monitor_list, key=lambda m: m[2])
            # idle_recrawl_list = sorted(idle_recrawl_list, key=lambda m: m[2])

            # Start monitors and set monitor_status if updates are found
            if len(monitor_list) < monitor_no:
                if len(idle_monitor_list) > monitor_no - len(monitor_list):
                    ready_monitor = idle_monitor_list[:(monitor_no - len(monitor_list))]
                else:
                    ready_monitor = idle_monitor_list

                for idmonitor, parameter, timestamp in ready_monitor:
                    args = json.loads(parameter)
                    #monitor --brand 10009 --region fr --idmonitor 1931 -v

                    logger.info('Monitor started--> idmonitor:%s, brand_id:%s, region:%s' % (
                        idmonitor, args['brand_id'], args['region']))

                    spawn_process(
                        os.path.join(scripts.__path__[0], 'run_crawler.py'),
                        'monitor --brand %s --region %s --idmonitor %s' % (args['brand_id'], args['region'], idmonitor))

            # Start recrawls and reset monitor_status after each recrawl ends
            if len(recrawl_list) < recrawl_no:
                if len(idle_recrawl_list) > recrawl_no - len(recrawl_list):
                    ready_recrawl = idle_recrawl_list[:(recrawl_no - len(recrawl_list))]
                else:
                    ready_recrawl = idle_recrawl_list

                for idmonitor, parameter, timestamp in ready_recrawl:
                    args = json.loads(parameter)
                    args['idmonitor'] = idmonitor
                    para = '|'.join([str(idmonitor), str(args['brand_id']), args['region']])

                    logger.info('Recrawl started--> idmonitor:%s, brand_id:%s, region:%s' % (
                        idmonitor, args['brand_id'], args['region']))

                    spawn_process(os.path.join(scripts.__path__[0], 'recrawler.py'), para)
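
The quota handling above reduces to slicing the idle list down to the number of free slots; a compact sketch with hypothetical numbers:

    monitor_no = 6                        # assumed MAX_MONITOR value
    monitor_list = [111, 222]             # pids of monitors already running
    idle_monitor_list = ['a', 'b', 'c', 'd', 'e', 'f']   # placeholder idle entries
    slots = monitor_no - len(monitor_list)
    ready_monitor = idle_monitor_list[:slots] if slots > 0 else []
    # -> 4 idle entries get a monitor process spawned for them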
Example #9
brand = int(brand)
idm = int(idm)
parameter = {'idmonitor': idm, 'brand_id': brand, 'region': region}

with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db:
    db.update({'monitor_status': 0, 'monitor_pid': None, 'recrawl_pid': os.getpid()}, 'monitor_status',
              str.format('idmonitor={0}', parameter['idmonitor']))
# TODO: hardcoded for DKNY; 'is_offline' support needs to be added to DknySpider
if brand == 10108:
    os.system(
        'python %s %s -r %s' % (
            os.path.join(scripts.__path__[0], 'run_crawler.py'), parameter['brand_id'], parameter['region']))
else:
    os.system('python %s update --brand %s -r %s' % (
        os.path.join(scripts.__path__[0], 'run_crawler.py'), parameter['brand_id'], parameter['region']))
    os.system(
        'python %s %s -r %s' % (
            os.path.join(scripts.__path__[0], 'run_crawler.py'), parameter['brand_id'], parameter['region']))
# os.system('python %s process-tags --cond brand_id=%s' % parameter['brand_id'])
# os.system('python %s release --brand %s' % parameter['brand_id'])


with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db:
    db.update({'recrawl_pid': None, 'priority': None}, 'monitor_status',
              str.format('idmonitor={0}', parameter['idmonitor']))

logger = get_logger(logger_name='monitor')

logger.info('Recrawl ended--> idmonitor:%s, brand_id:%s, region:%s' % (
    parameter['idmonitor'], parameter['brand_id'], parameter['region']))
Example #10
    def merge_prods(self, prods, db):
        """
        按照国家顺序,挑选主记录
        :param prods:
        """
        logger = get_logger()
        # Convert all fields of prods to unicode
        for idx in xrange(len(prods)):
            prods[idx] = {k: unicodify(prods[idx][k]) for k in prods[idx]}

        # Pick the primary record
        sorted_prods = sorted(prods,
                              key=lambda k: self.region_order[k['region']])
        main_entry = sorted_prods[0]
        entry = {
            k: unicodify(main_entry[k])
            for k in ('brand_id', 'model', 'name', 'description', 'details',
                      'gender', 'category', 'color', 'url', 'fingerprint')
        }
        if not entry['name']:
            entry['name'] = u'单品'

        mfashion_tags = [
            unicodify(val[0]) for val in db.query(
                str.format(
                    'SELECT DISTINCT p1.tag FROM mfashion_tags AS p1 '
                    'JOIN products_mfashion_tags AS p2 ON p1.idmfashion_tags=p2.id_mfashion_tags '
                    'WHERE p2.idproducts IN ({0})', ','.join(
                        val['idproducts']
                        for val in prods))).fetch_row(maxrows=0)
        ]
        #
        # original_tags = [int(val[0]) for val in
        #                  db.query(str.format('SELECT DISTINCT id_original_tags FROM products_original_tags '
        #                                      'WHERE idproducts IN ({0})',
        #                                      ','.join(val['idproducts'] for val in prods))).fetch_row(
        #                      maxrows=0)]

        entry['mfashion_tags'] = json.dumps(mfashion_tags, ensure_ascii=False)
        entry['original_tags'] = ''  # json.dumps(original_tags, ensure_ascii=False)

        entry['region_list'] = json.dumps([val['region'] for val in prods],
                                          ensure_ascii=False)
        entry['brandname_e'] = info.brand_info()[int(
            entry['brand_id'])]['brandname_e']
        entry['brandname_c'] = info.brand_info()[int(
            entry['brand_id'])]['brandname_c']
        # # The time when this item was first fetched, across its records in all regions, becomes the release fetch_time
        # entry['fetch_time'] = \
        #     sorted(datetime.datetime.strptime(tmp['fetch_time'], "%Y-%m-%d %H:%M:%S") for tmp in prods)[
        #         0].strftime("%Y-%m-%d %H:%M:%S")

        url_dict = {int(val['idproducts']): val['url'] for val in prods}
        offline_dict = {
            int(val['idproducts']): int(val['offline'])
            for val in prods
        }
        price_change_dict = {
            int(val['idproducts']): val['price_change']
            for val in prods
        }
        update_time_dict = {
            int(val['idproducts']):
            datetime.datetime.strptime(val['update_time'], "%Y-%m-%d %H:%M:%S")
            for val in prods
        }
        # Mapping from pid to region
        region_dict = {int(val['idproducts']): val['region'] for val in prods}
        price_list = {}
        # Merge the full price history, keyed by pid
        for item in db.query_match(
            ['price', 'price_discount', 'currency', 'date', 'idproducts'],
                self.price_hist, {},
                str.format('idproducts IN ({0})',
                           ','.join(val['idproducts'] for val in prods)),
                tail_str='ORDER BY idprice_history DESC').fetch_row(maxrows=0,
                                                                    how=1):
            pid = int(item['idproducts'])
            region = region_dict[pid]
            offline = offline_dict[pid]
            if pid not in price_list:
                price_list[pid] = []
            price = float(item['price']) if item['price'] else None
            if offline == 0:
                price_discount = float(
                    item['price_discount']) if item['price_discount'] else None
            else:
                price_discount = None
            price_list[pid].append({
                'price': price,
                'price_discount': price_discount,
                'currency': item['currency'],
                'date': datetime.datetime.strptime(item['date'],
                                                   "%Y-%m-%d %H:%M:%S"),
                'price_change': price_change_dict[pid],
                'url': url_dict[pid],
                'offline': offline,
                'code': region,
                'country': info.region_info()[region]['name_c']
            })

        currency_conv = lambda val, currency: \
            info.currency_info()[currency]['rate'] * val

        # Reduce price_list.
        # Strategy: if the latest record has a valid price, use it as-is.
        # If the latest price is None, fall back to the most recent non-None
        # record and clear price_discount.
        # If no non-None price can be found, drop the pid.
        for pid, pid_data in price_list.items():
            # Reverse-sort by date, keeping only records whose price is not None
            # pid_data = sorted(pid_data, key=lambda val: val['date'], reverse=True)

            # Subset of pid_data that actually has a price
            valid_pid_data = filter(lambda val: val['price'], pid_data)

            if pid_data[0]['price']:
                # Normal case
                price_list[pid] = pid_data[0]
                # If there is no current discount price, check whether the
                # original price quietly dropped within the last week
                currency = valid_pid_data[0]['currency']
                if (price_change_dict[pid] == 'D' and len(valid_pid_data) > 1
                        and currency == valid_pid_data[1]['currency']):
                    if (not pid_data[0]['price_discount']
                            and currency_conv(valid_pid_data[1]['price'], currency)
                            > currency_conv(valid_pid_data[0]['price'], currency)
                            and (datetime.datetime.now() - valid_pid_data[0]['date'])
                            < datetime.timedelta(7)):
                        price_list[pid]['price_discount'] = price_list[pid]['price']
                        price_list[pid]['price'] = valid_pid_data[1]['price']
            else:
                # Look back for the first record whose price is not None.
                # tmp = filter(lambda val: val['price'], pid_data)
                if not valid_pid_data:
                    # No price information: drop this pid record
                    price_list.pop(pid)
                else:
                    # Use the most recent price, clear the discount price, and keep the latest offline status
                    tmp = valid_pid_data[0]
                    tmp['price_discount'] = None
                    price_list[pid] = tmp

            # The time of the first valid price becomes fetch_time
            # pid_data = filter(lambda val: val['price'], sorted(pid_data, key=lambda val: val['date']))
            # pid_data = filter(lambda val: val['price'], pid_data)
            if valid_pid_data and pid in price_list:
                price_list[pid]['fetch_time'] = valid_pid_data[-1]['date']
                price_list[pid]['idproducts'] = pid

        # Do not release if there is no price information
        if not price_list:
            return

        entry['price_list'] = sorted(
            price_list.values(),
            key=lambda val: self.region_order[val['code']])
        entry = release_filter(entry, logger)

        if not entry['price_list']:
            return

        entry['offline'] = entry['price_list'][0]['offline']

        # The model's fetch_time is the earliest fetch_time among all of its pids.
        entry['fetch_time'] = min(
            tmp['fetch_time']
            for tmp in entry['price_list']).strftime("%Y-%m-%d %H:%M:%S")

        # List of prices used for sorting
        alt_prices = []
        for price_item in entry['price_list']:
            # Serialize the datetimes so they can be stored in the release table.
            price_item['date'] = price_item['date'].strftime(
                "%Y-%m-%d %H:%M:%S")
            price_item['fetch_time'] = price_item['fetch_time'].strftime(
                "%Y-%m-%d %H:%M:%S")
            if price_item['offline'] == 0:
                if price_item['price_discount']:
                    tmp = map(
                        lambda key_name: currency_conv(price_item[key_name],
                                                       price_item['currency']),
                        ('price', 'price_discount'))
                    tmp.extend([
                        price_item[key]
                        for key in ('price_change', 'price', 'price_discount',
                                    'currency', 'date', 'idproducts')
                    ])
                    alt_prices.append(tmp)
                else:
                    alt_prices.append([
                        currency_conv(price_item['price'],
                                      price_item['currency']), None,
                        price_item['price_change'], price_item['price'],
                        price_item['price_discount'], price_item['currency'],
                        price_item['date'], price_item['idproducts']
                    ])
            else:
                alt_prices.append([
                    currency_conv(price_item['price'], price_item['currency']),
                    None, price_item['price_change'], price_item['price'],
                    price_item['price_discount'], price_item['currency'],
                    price_item['date'], price_item['idproducts']
                ])

        # Returned price: the discount price if present, otherwise the original price
        alt_prices = sorted(alt_prices,
                            key=lambda val: val[1] if val[1] else val[0])

        entry['price'], entry['price_discount'] = \
            alt_prices[0][:2] if alt_prices else (None,) * 2
        entry['price_change'] = alt_prices[0][2] if alt_prices else '0'
        entry['o_price'], entry['o_discount'], entry['o_currency'] = \
            alt_prices[0][3:6]

        # Remove idproducts from entry['price_list']
        for i in xrange(len(entry['price_list'])):
            entry['price_list'][i].pop('idproducts')
        entry['price_list'] = json.dumps(entry['price_list'],
                                         ensure_ascii=False)

        entry['last_price_ts'] = alt_prices[0][6]
        entry['product_update_ts'] = update_time_dict[
            alt_prices[0][7]].strftime("%Y-%m-%d %H:%M:%S")

        # Search fields
        search_text = u' '.join(entry[tmp] if entry[tmp] else ''
                                for tmp in ('name', 'description', 'details',
                                            'model', 'brandname_e',
                                            'brandname_c'))
        search_color = u' '.join(entry['color']) if entry['color'] else u''
        rs = db.query_match(
            ['description_cn', 'description_en', 'details_cn', 'details_en'],
            'products_translate', {
                'fingerprint': entry['fingerprint']
            }).fetch_row()
        part_translate = u' ' + u' '.join(
            unicodify(tmp)
            for tmp in filter(lambda v: v, rs[0])) if rs else ' '
        search_tags = u' '.join(list(set(mfashion_tags)))
        entry['searchtext'] = unicode.format(u'{0} {1} {2} {3}', search_text,
                                             part_translate, search_tags,
                                             search_color)

        p = prods[0]
        checksums = []
        # Ensure the checksums entries are unique and keep the idproducts_image order
        for tmp in db.query(
                str.format(
                    '''
          SELECT p1.checksum, p3.width, p3.height, p3.path FROM products_image AS p1
          JOIN products AS p2 ON p1.fingerprint=p2.fingerprint
          JOIN images_store AS p3 ON p1.checksum=p3.checksum
          WHERE p2.fingerprint="{0}" ORDER BY p1.idproducts_image
          ''', p['fingerprint'])).fetch_row(maxrows=0, how=1):
            if tmp not in checksums:
                checksums.append(tmp)

        # If there are no images, skip adding to the release table for now
        if not checksums:
            return

        image_list = []
        for val in checksums:
            tmp = {
                'path': val['path'],
                'width': int(val['width']),
                'height': int(val['height'])
            }
            if not image_list:
                entry['cover_image'] = json.dumps(tmp, ensure_ascii=False)
            image_list.append(tmp)

        entry['image_list'] = json.dumps(image_list[:self.max_images],
                                         ensure_ascii=False)

        db.insert(entry, 'products_release')
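
A toy illustration of the final price-selection rule above: every candidate is converted with a per-currency rate (a hypothetical rate table stands in for info.currency_info() here) and the cheapest entry wins, preferring the discount price when one exists:

    rates = {'EUR': 7.6, 'USD': 6.2}   # assumed conversion rates, CNY per unit
    candidates = [
        {'price': 950.0, 'price_discount': None, 'currency': 'EUR'},
        {'price': 1200.0, 'price_discount': 999.0, 'currency': 'USD'},
    ]
    to_cny = lambda val, currency: rates[currency] * val
    best = min(candidates,
               key=lambda c: to_cny(c['price_discount'] or c['price'], c['currency']))
    # best is the USD record: 999 * 6.2 < 950 * 7.6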
Example #11
    def run(cls, logger=None, **param):
        '''
        Determine the creation time, model number, and new-or-update status from
        the file name. File attributes are deliberately not used, so an accidental
        delete-and-restore cannot distort the creation or modification times.
        '''

        logger = logger if logger else get_logger()
        logger.info('PARSE LOG STARTED!!!')

        max_err_cnt = 10
        if 'log-path' in param:
            log_path = param['log-path']
        else:
            log_path = os.sep.join(
                (getattr(gs, 'STORAGE')['STORAGE_PATH'], 'products', 'log'))

        if 'interval' in param:
            interval = param['interval']
        else:
            interval = 1

        if 'start' in param and 'stop' in param:
            start_time = param['start']
            stop_time = param['stop']
        else:
            cur = datetime.datetime.now()
            to_time = cur.strftime('%Y%m%d%H%M%S')
            from_time = (cur -
                         datetime.timedelta(interval)).strftime('%Y%m%d%H%M%S')
            start_time = from_time
            stop_time = to_time

        # Determine the recipients
        try:
            group = getattr(gs, 'REPORTS')['CRAWLER_STATUS']
            if not isinstance(group, list):
                group = [group]
            recipients = {}
            for g in group:
                for key, value in getattr(gs, 'EMAIL_GROUPS')[g].items():
                    recipients[key] = value
                    # recipent_addrs = gs.EMAIL_ADDR.values()  # ['*****@*****.**', '*****@*****.**']
        except (TypeError, AttributeError, KeyError):
            return

        files = os.listdir(log_path)
        process_error = {}
        for file in files:
            # Extract information from the file name
            # file = 'update_10192_20140304172558.log'
            file_name = os.path.basename(os.path.splitext(file)[0])
            if os.path.splitext(file)[1] != '.log':
                continue
            # file_name = file.split('.')[0]
            if re.search(r'^update', file_name) and re.findall(
                    r'_\d{14}', file_name):
                tmp = re.split(r'_', file_name)
                if len(tmp) == 2:
                    (update, dt) = tmp
                    model = 'MIX'
                elif len(tmp) == 3:
                    (update, model, dt) = tmp
            elif re.search(r'^\d{5}_', file_name) and re.findall(
                    r'_\d{8}', file_name):
                update = ''
                dt = re.findall(r'_(\d{8})', file_name)[0]
                dt += '000000'
                model = '_'.join(file_name.split('_')[1:-1])
            else:
                # File name matches no known pattern; skip it
                continue

            # Skip files created later than stop_time
            if int(stop_time) < int(dt):
                continue
            else:
                f = open('%s%s%s' % (log_path, os.sep, file), 'rb')
                error_count = 0
                error_time = ''
                error_info = ''
                error_line = 0

                # Default the file's last-written time to now
                last_time = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
                lines = f.readlines()
                i = 0
                error_list = []

                # Find the file's last-written time
                for l in xrange(len(lines) - 1, 0, -1):
                    last = re.findall(
                        r'(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2})\+\d{4} \[[^\[\]]*\]',
                        lines[l])
                    if last:
                        last_time = ''.join(last[0])
                        break

                # Skip files whose last-written time is earlier than start_time
                if int(start_time) > int(last_time):
                    continue

                process_error[file] = []
                # Build the list of error line numbers
                while i < len(lines):

                    if re.findall(
                            r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\+\d{4} \[\S+\] (ERROR|Error)',
                            lines[i]):
                        get_time = ''.join(
                            re.findall(
                                r'(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2})\+\d{4} \[\S+\] ERROR:',
                                lines[i])[0])
                        if int(get_time) > int(stop_time) or int(
                                get_time) < int(start_time):
                            pass
                        else:
                            error_list.append(i)
                            t = []
                            while i < len(lines) - 1 and lines[
                                    i + 1].startswith('\t') and (
                                        lines[i + 1].strip() != ''):
                                t.append(i + 1)
                                i += 1
                            if t:
                                error_list.append(t)
                    i += 1
                # print(error_list)

                process_error[file] = []
                for index in xrange(0, len(error_list)):
                    if len(process_error[file]) >= max_err_cnt:
                        break

                    if type(error_list[index]) is not list:
                        error_lineno = error_list[index] + 1
                        # Use the following pattern instead:
                        # re.findall(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\+\d{4} \[\S+\] (ERROR: .*)', lines[error_list[index]])
                        error_time = ''.join(
                            re.findall(
                                r'(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2})\+\d{4} \[\S+\] ERROR:',
                                lines[error_list[index]])[0])

                        tmp2 = re.findall(
                            r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\+\d{4} \[\S+\] (ERROR: .*) <GET.*?>(.*)',
                            lines[error_list[index]])
                        if tmp2:
                            error_info = ''.join(tmp2[0])
                        else:
                            tmp2 = re.findall(
                                r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\+\d{4} \[\S+\] (ERROR: .*) (.*)',
                                lines[error_list[index]])
                            if tmp2:
                                error_info = ''.join(tmp2[0])
                            else:
                                error_info = 'UNKNOWN ERROR'

                        if process_error[file] and process_error[file][-1][
                                'error_info'] != error_info:
                            process_error[file].append({
                                'line_no': error_lineno,
                                'error_time': error_time,
                                'error_info': error_info,
                                'error_count': 1,
                                'Traceback': []
                            })
                        elif process_error[file] and process_error[file][-1][
                                'error_info'] == error_info:
                            process_error[file][-1]['error_count'] += 1
                        else:
                            process_error[file].append({
                                'line_no': error_lineno,
                                'error_time': error_time,
                                'error_info': error_info,
                                'error_count': 1,
                                'Traceback': []
                            })
                    else:
                        for i in range(error_list[index][-1],
                                       error_list[index][0] + 1, -1):
                            detail = re.findall(
                                r'File "(/home/rose/MStore\S+)", line (\S+), in (\S+)',
                                lines[i])
                            if detail:
                                (error_file, error_file_line,
                                 error_function) = detail[0]
                                error_detail = lines[i + 1].strip()
                                if re.findall(r'exceptions.(\S+): ',
                                              lines[error_list[index][-1]]):
                                    exception = re.findall(
                                        r'exceptions.(\S+): ',
                                        lines[error_list[index][-1]])[0]
                                else:
                                    exception = ''
                                if re.findall(r'exceptions.\S+: (.*)',
                                              lines[error_list[index][-1]]):
                                    exception_detail = \
                                        re.findall(r'exceptions.\S+: (.*)', lines[error_list[index][-1]])[0]
                                else:
                                    exception_detail = ''
                                process_error[file][-1]['Traceback'].append([
                                    error_file, error_file_line,
                                    error_function, error_detail, exception,
                                    exception_detail
                                ])
                                break

        cls.sendemail(process_error, recipients)
        logger.info('PARSE LOG EMAIL SENT!!!')
        logger.info('PARSE LOG ENDED!!!')
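
The file-name conventions handled above are 'update[_<model>]_<YYYYMMDDHHMMSS>.log' and '<NNNNN>_..._<YYYYMMDD>.log'. A small sketch of the first case with a hypothetical file name:

    import os
    import re

    file = 'update_10192_20140304172558.log'   # hypothetical log file name
    file_name = os.path.basename(os.path.splitext(file)[0])
    if re.search(r'^update', file_name) and re.findall(r'_\d{14}', file_name):
        parts = re.split(r'_', file_name)
        if len(parts) == 2:
            update, dt, model = parts[0], parts[1], 'MIX'
        else:
            update, model, dt = parts
    # -> model == '10192', dt == '20140304172558'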
Example #12
def my_import(name):
    tmp = name.split('.')
    if len(tmp) == 1:
        kclass = __import__(name)
    else:
        mod_name, mod_class = '.'.join(tmp[:-1]), tmp[-1]
        mod = __import__(mod_name, fromlist=[mod_class])
        kclass = getattr(mod, mod_class)

    return kclass


if __name__ == "__main__":
    ret = parse_args(sys.argv)
    section = ret['cmd']
    if not section:
        section = 'CRON_TASK_DEFAULT'

    logger = get_logger()
    logger.info(str.format('TASK {0} STARTED.', section))

    for task_name, task_param in getattr(glob, section, {}).items():
        try:
            class_name = task_param['classname']
            func = getattr(my_import(class_name), 'run')
            if 'param' in task_param:
                func(**task_param['param'])
            else:
                func()

        except KeyError:
            logger.exception(unicode.format(u'Invalid task name: {0}',
                                            unicodify(task_name)).encode('utf-8'))
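
A self-contained illustration of the fromlist-based dynamic import that my_import performs, using a standard-library class so it can run anywhere:

    path = 'collections.OrderedDict'
    mod_name, mod_class = path.rsplit('.', 1)
    mod = __import__(mod_name, fromlist=[mod_class])
    kclass = getattr(mod, mod_class)
    assert kclass.__name__ == 'OrderedDict'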
Example #13
    def run(cls, logger=None, **kwargs):

        logger = logger if logger else get_logger()
        logger.info('Parse Duration Check STARTED')

        log_path = os.sep.join((getattr(gs, 'STORAGE')['STORAGE_PATH'], 'products', 'log'))
        tmp = []
        for x in os.listdir(log_path):
            if re.findall(r'^\d{5}', x):
                tmp.append(re.findall(r'^(\d{5})', x)[0])
        brands = list(set(tmp))

        if not brands:
            pass
        else:
            with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db:

                for brand in brands:
                    file_list = []
                    files = os.listdir(log_path)
                    # Filter file names belonging to this brand
                    for file in files:
                        if re.search(r'^%s' % brand, file):
                            file_list.append(file)
                    file_list.sort(reverse=True)

                    if file_list:
                        for log in file_list:
                            # (product, name, log_time) = re.split(r'_', os.path.basename(os.path.splitext(file)[0]))
                            f = open(log_path + os.sep + log, 'rb')
                            lines = f.readlines()
                            if not lines:
                                continue
                            st = re.findall(
                                r'(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2})\+\d{4} \[[^\[\]]*\] INFO: Spider started, processing the following regions:',
                                lines[0])
                            ed = re.findall(
                                r'(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2})\+\d{4} \[[^\[\]]*\] INFO: Spider closed \(finished\)',
                                lines[-1])
                            regions = re.findall(
                                r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\+\d{4} \[[^\[\]]*\] INFO: Spider started, processing the following regions: (.*)',
                                lines[0])
                            if st and ed and regions:
                                start_time = '%s-%s-%s %s:%s:%s' % (st[0])
                                end_time = '%s-%s-%s %s:%s:%s' % (ed[0])
                                regions_no = len([x.strip() for x in regions[0].split(',')])
                                duration = datetime.datetime.strptime(''.join(ed[0]),
                                                                      '%Y%m%d%H%M%S') - datetime.datetime.strptime(
                                    ''.join(st[0]), '%Y%m%d%H%M%S')

                                db.insert({'start_time': start_time, 'end_time': end_time, 'duration': duration.seconds,
                                           'country_cnt': regions_no, 'brand_id': brand},
                                          'brand_duration', replace=True)
                                logger.info('%s brand-id:%s duration time update' % (
                                    datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), brand))
                                break
                            else:
                                pass
                    else:
                        pass
        logger.info('Parse Duration Check ENDED!!!')
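
For reference, a standalone sketch of the duration computation above, fed with two hypothetical log lines in the scrapy log format the regexes expect:

    import datetime
    import re

    first = ('2014-03-04 17:25:58+0800 [scrapy] INFO: Spider started, '
             'processing the following regions: cn, us, fr')
    last = '2014-03-04 19:01:12+0800 [scrapy] INFO: Spider closed (finished)'
    pattern = r'(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2})\+\d{4} \[[^\[\]]*\] INFO:'
    st = re.findall(pattern + ' Spider started', first)
    ed = re.findall(pattern + r' Spider closed \(finished\)', last)
    duration = (datetime.datetime.strptime(''.join(ed[0]), '%Y%m%d%H%M%S') -
                datetime.datetime.strptime(''.join(st[0]), '%Y%m%d%H%M%S'))
    # duration.seconds -> 5714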
Example #14
def translate_main(start=0, count=100, logger=None, db_spec='DB_SPEC'):
    if not logger:
        logger = get_logger()

    with RoseVisionDb(getattr(global_settings, 'DATABASE')[db_spec]) as db:
        gs = goslate.Goslate()
        proxy_name = get_proxy()
        proxy = urllib2.ProxyHandler({'http': proxy_name}) if proxy_name else None
        opener = urllib2.build_opener(proxy) if proxy else urllib2.build_opener()
        backup_gs = goslate.Goslate(opener=opener, debug=True)

        sorted_region = get_sorted_region(db)

        fingerprint_start = start
        fingerprint_count = count

        logger.info(str.format("Translate process start"))
        while 1:
            fingerprints = get_fingerprints(db, fingerprint_start,
                                            fingerprint_count)
            if not fingerprints:
                logger.info(str.format("Translate process end"))
                break
            else:
                logger.info(
                    str.format("Translate process offset : {0} count : {1}",
                               fingerprint_start, len(fingerprints)))
                fingerprint_start += fingerprint_count

            for fingerprint in fingerprints:

                is_exist = db.query_match({'fingerprint'},
                                          'products_translate', {
                                              'fingerprint': fingerprint
                                          }).num_rows()
                if is_exist:
                    continue

                product_infos = get_product(db, fingerprint)

                # Sort regions by weight
                product_infos = sorted(product_infos.items(),
                                       key=lambda e: sorted_region.index(e[0]))
                product_infos = {e[0]: e[1] for e in product_infos}

                final_description_cn = None
                final_details_cn = None
                final_description_en = None
                final_details_en = None

                description_cn = check_cns_region(product_infos, 'description')
                details_cn = check_cns_region(product_infos, 'details')

                if is_chs(description_cn):
                    final_description_cn = description_cn
                elif is_cht(description_cn):
                    final_description_cn = translate_text_to(
                        gs,
                        description_cn,
                        'zh-cn',
                        source='zh-cn',
                        backup_gs=backup_gs)

                if is_chs(details_cn):
                    final_details_cn = details_cn
                elif is_cht(details_cn):
                    final_details_cn = translate_text_to(gs,
                                                         details_cn,
                                                         'zh-cn',
                                                         source='zh-cn',
                                                         backup_gs=backup_gs)

                description_en = check_ens_region(product_infos, 'description')
                details_en = check_ens_region(product_infos, 'details')

                if is_eng(description_en):
                    final_description_en = description_en
                if is_eng(details_en):
                    final_details_en = details_en

                try:
                    if not final_description_cn:
                        for region, info in product_infos.items():
                            if product_infos[region]['description']:
                                final_description_cn = translate_text_to(
                                    gs,
                                    product_infos[region]['description'],
                                    'zh-cn',
                                    backup_gs=backup_gs)
                                break
                    if not final_details_cn:
                        for region, info in product_infos.items():
                            if product_infos[region]['details']:
                                final_details_cn = translate_text_to(
                                    gs,
                                    product_infos[region]['details'],
                                    'zh-cn',
                                    backup_gs=backup_gs)
                                break
                    if not final_description_en:
                        for region, info in product_infos.items():
                            if region != 'cn':  # avoid translating from Chinese into other languages when possible
                                if product_infos[region]['description']:
                                    final_description_en = translate_text_to(
                                        gs,
                                        product_infos[region]['description'],
                                        'en',
                                        backup_gs=backup_gs)
                                    break
                    if not final_details_en:
                        for region, info in product_infos.items():
                            if region != 'cn':  # avoid translating from Chinese into other languages when possible
                                if product_infos[region]['details']:
                                    final_details_en = translate_text_to(
                                        gs,
                                        product_infos[region]['details'],
                                        'en',
                                        backup_gs=backup_gs)
                                    break

                    if not final_description_en and final_description_cn:
                        final_description_en = translate_text_to(
                            gs,
                            final_description_cn,
                            'en',
                            'zh-cn',
                            backup_gs=backup_gs)
                    if not final_details_en and final_details_cn:
                        final_details_en = translate_text_to(
                            gs,
                            final_details_cn,
                            'en',
                            'zh-cn',
                            backup_gs=backup_gs)
                except:
                    pass

                insert_dict = {}
                if final_description_cn:
                    insert_dict['description_cn'] = final_description_cn
                if final_details_cn:
                    insert_dict['details_cn'] = final_details_cn
                if final_description_en:
                    insert_dict['description_en'] = final_description_en
                if final_details_en:
                    insert_dict['details_en'] = final_details_en

                if insert_dict:
                    insert_dict['fingerprint'] = fingerprint
                    result = db.query_match({'fingerprint'},
                                            'products_translate',
                                            {'fingerprint': fingerprint})
                    try:
                        if result.num_rows() == 0:
                            db.insert(insert_dict, 'products_translate')
                        else:
                            db.update(
                                insert_dict, 'products_translate',
                                str.format('fingerprint="{0}"', fingerprint))
                    except:
                        logger.info(
                            str.format(
                                "Error: Insert or update sql error with {0}",
                                insert_dict))
                        pass
                else:
                    logger.info(
                        str.format(
                            "Error: No insert_dict for fingerprint : {0}",
                            fingerprint))