示例#1
0
    def start_calculate(self, type_id, time_from, time_to_end):
        """
        Compute and store the per-record stat averages for one partition.

        For every ``video_tag`` document with ``tid == type_id`` and
        ``time_from <= pubdate < time_to_end``, each video listed in its
        ``aids`` is looked up in ``videos``; every stat named in
        ``STAT_NAME`` is averaged, rounded up, and written back to the
        document as ``avg_<stat_name>``. Whenever the cursor moves past a
        day boundary, the finished day is passed to ``save_record`` with
        ``RecordStatus.Calculated``.

        Args:
            type_id: value matched against the ``tid`` field.
            time_from: inclusive lower ``pubdate`` bound; day windows are
                counted in 86400-second steps starting from this value.
            time_to_end: exclusive upper ``pubdate`` bound.

        Returns:
            ``HandlerErrcode.NotHandled`` when ``judge_record`` rejects the
            range for the ``Handled`` status, otherwise
            ``HandlerErrcode.Success``.
        """
        if not self.judge_record(type_id, time_from, time_to_end,
                                 RecordStatus.Handled.value):
            print('对应时间范围内的标签关系未被全部处理,不进行计算')
            return HandlerErrcode.NotHandled

        time_start = time.time()
        video_tags = self._db['video_tag'].find({
            'tid': type_id,
            'pubdate': {
                '$gte': time_from,
                '$lt': time_to_end
            }
        }).sort([('tid', 1), ('pubdate', 1)]).batch_size(500)
        self.calculate_cur = 0
        # NOTE(review): Cursor.count() no longer exists in PyMongo 4;
        # confirm the pinned driver version before upgrading.
        self.calculate_total = video_tags.count()
        print("将要计算{count}个标签关系的均值数据".format(count=self.calculate_total))

        # Only the stat names are needed; the codes were never used here.
        stat_names = list(STAT_NAME.values())

        lower_time = time_from
        higher_time = time_from + 86400
        for video_tag in video_tags:
            # Collect the raw stat values of every video in this record.
            stats = {stat_name: [] for stat_name in stat_names}
            found_videos = 0
            for aid in video_tag['aids']:
                video = self._db['videos'].find_one({'aid': aid})
                if video is None:
                    # find_one returns None when nothing matches; indexing
                    # it would raise TypeError and abort the whole run.
                    continue
                found_videos += 1
                for stat_name in stat_names:
                    stats[stat_name].append(video[stat_name])

            if found_videos:
                # Store the rounded-up mean of each stat on the record.
                update_json = {
                    'avg_' + stat_name: math.ceil(numpy.mean(values))
                    for stat_name, values in stats.items()
                }
                self._db['video_tag'].update_one({'_id': video_tag['_id']},
                                                 {'$set': update_json})
            else:
                # numpy.mean([]) is NaN and math.ceil(NaN) raises, so there
                # is nothing meaningful to store for this record.
                print("标签关系{id}没有可用的视频数据,跳过均值计算".format(
                    id=video_tag['_id']))

            # `while`, not `if`: when one or more days have no records the
            # window must advance several days at once, otherwise
            # lower_time lags behind and the wrong days get marked.
            while video_tag['pubdate'] >= higher_time:
                self.save_record(type_id, lower_time,
                                 RecordStatus.Calculated.value)
                lower_time += 86400
                higher_time += 86400

            self.calculate_cur += 1
            if self.calculate_cur % 500 == 0:
                print_time(
                    "计算{count}个标签关系所用时间:".format(count=self.calculate_cur),
                    time.time() - time_start)

        # Mark the last day that was reached as calculated.
        self.save_record(type_id, lower_time, RecordStatus.Calculated.value)
        print_time("计算{count}个标签关系所用时间:".format(count=self.calculate_cur),
                   time.time() - time_start)

        return HandlerErrcode.Success
示例#2
0
    def start_handle(self, type_id, time_from, time_to_end):
        """
        Build the tag relations for the videos of one partition.

        For every ``videos`` document with ``tid == type_id`` and
        ``time_from <= pubdate < time_to_end``, the comma-separated ``tags``
        field is split; each tag is passed through ``str_to_clear`` and then
        to ``save_tags`` and ``save_videotag``. Whenever the cursor moves
        past a day boundary, the finished day is passed to ``save_record``
        with ``RecordStatus.Handled``.

        Args:
            type_id: value matched against the ``tid`` field.
            time_from: inclusive lower ``pubdate`` bound; day windows are
                counted in 86400-second steps starting from this value.
            time_to_end: exclusive upper ``pubdate`` bound.

        Returns:
            ``HandlerErrcode.NotCrawled`` when ``judge_record`` rejects the
            range for the ``Crawled`` status, otherwise
            ``HandlerErrcode.Success``.
        """
        if not self.judge_record(type_id, time_from, time_to_end,
                                 RecordStatus.Crawled.value):
            print('对应时间范围内的视频未全部抓取,不进行处理')
            return HandlerErrcode.NotCrawled

        time_start = time.time()
        videos = self._db['videos'].find({
            'tid': type_id,
            'pubdate': {
                '$gte': time_from,
                '$lt': time_to_end
            }
        }).sort([('tid', 1), ('pubdate', 1)]).batch_size(500)
        self.handle_cur = 0
        # NOTE(review): Cursor.count() no longer exists in PyMongo 4;
        # confirm the pinned driver version before upgrading.
        self.handle_total = videos.count()
        print("将要处理{count}个视频的标签关系".format(count=self.handle_total))

        lower_time = time_from
        higher_time = time_from + 86400
        for video in videos:
            # The rounded publish day is the same for every tag of a video,
            # so compute it once per video instead of once per tag.
            pubdate_day = timestamp_round_to_day(video['pubdate'])
            for tag in video['tags'].split(','):
                tag = str_to_clear(tag)
                self.save_tags(tag)
                self.save_videotag(video['aid'], type_id, pubdate_day, tag)

            # `while`, not `if`: when one or more days have no videos the
            # window must advance several days at once, otherwise
            # lower_time lags behind and the wrong days get marked.
            while video['pubdate'] >= higher_time:
                self.save_record(type_id, lower_time,
                                 RecordStatus.Handled.value)
                lower_time += 86400
                higher_time += 86400

            self.handle_cur += 1
            if self.handle_cur % 500 == 0:
                print_time(
                    "处理{count}个视频的标签关系所用时间:".format(count=self.handle_cur),
                    time.time() - time_start)

        # Mark the last day that was reached as handled.
        self.save_record(type_id, lower_time, RecordStatus.Handled.value)
        print_time("处理{count}个视频的标签关系所用时间:".format(count=self.handle_cur),
                   time.time() - time_start)

        return HandlerErrcode.Success
示例#3
0
    def start(self, type_id, time_from, time_to):
        """
        Run the full pipeline for one partition and date range.

        ``time_from`` and ``time_to`` are converted with
        ``date_to_timestamp``; 24 hours are added to the end so the range
        passed on is half-open and includes the whole of ``time_to``.
        ``start_handle`` runs first, then ``start_calculate``; the first
        stage that does not return ``HandlerErrcode.Success`` stops the run.

        Returns:
            The ``.value`` of the first non-success error code, or of
            ``HandlerErrcode.Success`` when both stages succeed.
        """
        range_begin = date_to_timestamp(time_from)
        range_end = date_to_timestamp(time_to) + 24 * 3600

        began_at = time.time()
        self.reset_data()

        # Stage order matters: tag relations must exist before averages
        # can be calculated from them.
        for stage in (self.start_handle, self.start_calculate):
            outcome = stage(type_id, range_begin, range_end)
            if outcome != HandlerErrcode.Success:
                return outcome.value

        print_time('总时间:', time.time() - began_at)
        return HandlerErrcode.Success.value
                             time_to=time_to)
    conn.rpush(REDIS_START_URL_KEY, url)

# Monitor crawl progress: poll the length of the start-URL list every two
# seconds until it reaches zero.
# NOTE(review): `conn` is presumably a Redis client defined earlier in the
# file — confirm.
url_len = conn.llen(REDIS_START_URL_KEY)
item_len = conn.llen(REDIS_ITEMS_KEY)
while url_len != 0:
    time.sleep(2)
    url_len = conn.llen(REDIS_START_URL_KEY)
    # item_len is refreshed on every poll but is only consumed by the
    # disabled progress output below.
    item_len = conn.llen(REDIS_ITEMS_KEY)
    # Disabled overall-progress output, kept for reference; `num_results`
    # is not defined anywhere in the visible part of the file.
    # print("总爬取进度:({}/{}),{:.1f}%".format(
    #     item_len,
    #     num_results,
    #     item_len / num_results * 100,
    # ))
    print("剩余urls:{}".format(url_len))

# Summary line: partition id, date range, total videos reported by
# response['numResults'], page count and page size. All of these names are
# defined before the visible part of the file.
print(
    "分区号{type_id}, [{time_from}-{time_to}]共有{num_results}个视频,{pages}页,每页{per_page}个视频"
    .format(
        type_id=type_id,
        time_from=time_from,
        time_to=time_to,
        num_results=response['numResults'],
        pages=pages,
        per_page=per_page,
    ))
# Elapsed wall-clock time since `time_start` (set earlier in the file).
print_time("爬取所用时间:", time.time() - time_start)

# TODO: insert the record entries that mark each day's videos as crawled
示例#5
0
文件: simple.py 项目: GrAndSE/lighty
import timeit
from helpers import print_time

# Benchmark rendering of a one-variable template in jinja2, lighty and
# Django. Each engine is timed with timeit.repeat (5 repeats of 10000
# executions) and the resulting list of timings is handed to print_time.
template = '"Hello {{ name }}!"'
# A single parenthesised argument is valid on Python 2 and Python 3 and
# emits the same bytes as the old `print '\n', template, '\n'` statement
# (Python 2 inserted one space before the trailing newline item).
print('\n%s \n' % template)

# Label fixed: this run measures jinja2 (it was misspelled 'linja2').
print_time('jinja2', timeit.repeat(
    "template.render(name='John Doe')",
    "from jinja2 import Template; template = Template(%s)" % template,
    repeat=5, number=10000))

# `%` binds tighter than `+`, so only the parse(...) part is formatted.
print_time('lighty', timeit.repeat(
    "template.execute({'name': 'John Doe'})",
    "from lighty.templates import Template; template = Template();"
    + ("template.parse(%s)" % template),
    repeat=5, number=10000))

print_time('django', timeit.repeat(
    "template.render(context)",
    "import djangohelper; from django.template import Context, "
    + ("Template; template = Template(%s); " % template)
    + "context = Context({'name': 'John Doe'})",
    repeat=5, number=10000))
示例#6
0
 def spider_closed(self, spider):
     """Log the closed spider's name and report the elapsed crawl time.

     The elapsed time is measured from ``self.start_time`` and handed to
     ``print_time``.
     """
     closing_message = "爬虫关闭:{}".format(spider.name)
     logger.info(closing_message)
     elapsed_seconds = time.time() - self.start_time
     print_time("爬取所用时间:", elapsed_seconds)
示例#7
0
<head>
    <title>If test page</title>
</head>
<body>
    {% if user %}
        <h1>Hello {{ user.name }}!</h1>
        {% if user.is_authenticated %}
            <h2>Wellcome back</h2>
        {% endif %}
    {% endif %}
</body>
</html>"""'''
# Benchmark rendering of the nested-`if` template in jinja2, lighty and
# Django. Each engine is timed with timeit.repeat (5 repeats of 10000
# executions) and the resulting list of timings is handed to print_time.
# A single parenthesised argument is valid on Python 2 and Python 3 and
# emits the same bytes as the old `print '\n', if_template, '\n'` statement
# (Python 2 inserted one space before the trailing newline item).
print('\n%s \n' % if_template)

# Label fixed: this run measures jinja2 (it was misspelled 'linja2').
print_time('jinja2', timeit.repeat(
    "template.render(user={'name':'John Doe', 'is_authenticated':False})",
    "from jinja2 import Template; template = Template(%s)" % if_template,
    repeat=5, number=10000))

# `%` binds tighter than `+`, so only the parse(...) part is formatted.
print_time('lighty', timeit.repeat(
    "template.execute({'user': {'name': 'John Doe', "
    + "'is_authenticated': False}})",
    "from lighty.templates import Template; template = Template();"
    + ("template.parse(%s)" % if_template),
    repeat=5, number=10000))

print_time('django', timeit.repeat(
    "template.render(context)",
    "import djangohelper; from django.template import Context, Template; "
    + ("template = Template(%s); " % if_template)
    + "context = Context({'user':{'name':'John Doe','is_authenticated':False}})",
    repeat=5, number=10000))
示例#8
0
# Benchmark rendering of a `for`-loop template in jinja2, lighty and
# Django. The outer ''' quotes wrap an inner triple-quoted literal, so that
# `template` can be substituted into the timeit setup code as source text.
template = '''"""<!DOCTYPE html>
<html>
<head>
    <title>For test page</title>
</head>
<body>
    <ul>
    {% for i in items %}
        <li>{{ i }}</li>
    {% endfor %}
    </ul>
</body>
</html>"""'''
# A single parenthesised argument is valid on Python 2 and Python 3 and
# emits the same bytes as the old `print '\n', template, '\n'` statement
# (Python 2 inserted one space before the trailing newline item).
print('\n%s \n' % template)

# Each engine is timed with timeit.repeat (5 repeats of 10000 executions);
# the resulting list of timings is handed to print_time.
# Label fixed: this run measures jinja2 (it was misspelled 'linja2').
print_time('jinja2', timeit.repeat(
    "template.render(items=[1, 2, 3, 4, 5, 6, 7, 8, 9, 0])",
    "from jinja2 import Template; template = Template(%s)" % template,
    repeat=5, number=10000))

# `%` binds tighter than `+`, so only the parse(...) part is formatted.
print_time('lighty', timeit.repeat(
    "template.execute({'items': [1, 2, 3, 4, 5, 6, 7, 8, 9, 0]})",
    "from lighty.templates import Template; template = Template();"
    + ("template.parse(%s)" % template),
    repeat=5, number=10000))

print_time('django', timeit.repeat(
    "template.render(context)",
    "import djangohelper; from django.template import Context, Template; "
    + ("template = Template(%s); " % template)
    + "context = Context({'items': [1, 2, 3, 4, 5, 6, 7, 8, 9, 0]})",
    repeat=5, number=10000))