Example #1
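These snippets appear to come from the test suite of the logparser project: each test receives a psr fixture that configures and (by default) runs the parser against demo Scrapy log files, with shared paths, sizes, and helpers exposed through a cst module. The snippets omit their module-level setup; a rough sketch of what they assume is below (the exact import paths, and the FRONT/END/TELNET_* sample-log constants, are assumptions based on the names used):

import os
import re
import time
from shutil import copy  # copy(src, dst), used to duplicate demo log files

# Hypothetical import paths; the real test module defines or imports these.
from tests import constants as cst                # paths, sizes, expected keys, helpers
from tests.demo_log import FRONT, END             # two halves of a demo Scrapy log
from tests.demo_log import TELNET_151_PORT_16023  # sample log with a telnet banner
on_fedora = False  # platform flag referenced by some tests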
def test_log_encoding(psr):
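    """The log_encoding option controls how GBK-encoded log content is decoded."""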
    # TCP connection timed out: 10060: 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。.
    # (The Chinese part is the Windows GBK error text: "the connection attempt failed
    # because the connected party did not properly respond after a period of time,
    # or the connected host has failed to respond.")
    psr()
    data = cst.read_data(cst.LOG_JSON_PATH)
    for detail in data['log_categories']['retry_logs']['details']:
        assert u'连接尝试失败' in detail

    psr(log_encoding='gbk')
    data = cst.read_data(cst.LOG_JSON_PATH)
    for detail in data['log_categories']['retry_logs']['details']:
        assert u'连接尝试失败' not in detail and 'TCP connection timed out: 10060:' in detail

    # 2018-10-23 18:28:33 [test] 3: test utf8: 测试中文
    parser = psr(execute_main=False, log_encoding=cst.LOG_ENCODING)
    copy(cst.GBK_LOG_PATH, cst.LOG_PATH)
    parser.main()
    data = cst.read_data(cst.LOG_JSON_PATH)
    assert '2018-10-23 18:28:33 [test] 3: test utf8:' in data['head']
    assert u'测试中文' not in data['head']

    parser = psr(execute_main=False, log_encoding='gbk')
    copy(cst.GBK_LOG_PATH, cst.LOG_PATH)
    parser.main()
    data = cst.read_data(cst.LOG_JSON_PATH)
    assert '2018-10-23 18:28:33 [test] 3: test utf8:' in data['head']
    assert u'测试中文' in data['head']
Example #2
def test_demo_log_files(psr):
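    """The demo .log and .txt files should yield equivalent parse results and well-formed paths and URLs."""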
    psr()
    log_data = cst.read_data(cst.LOG_JSON_PATH)
    txt_data = cst.read_data(cst.TXT_JSON_PATH)
    for k in cst.PARSE_KEYS:
        if k not in ['last_update_time', 'last_update_timestamp']:
            assert log_data[k] == txt_data[k]

    # 2019-01-01T00_00_01.log
    # 2019-01-01T00_00_02.txt
    for case, data in zip(['log', 'txt'], [log_data, txt_data]):
        cst.check_demo_data(data)

        if case == 'log':
            job = cst.JOB
            ext = 'log'
        else:
            job = cst.JOB_TXT
            ext = 'txt'
        assert data['log_path'].endswith('%s.%s' % (job, ext))
        assert data['json_path'].endswith('%s.json' % job)
        assert data['json_url'].endswith('%s.json' % job)
        assert data['json_url'].startswith('http://%s' % cst.SCRAPYD_SERVER)

        assert data['size'] == cst.SIZE
        assert data['position'] == cst.SIZE
        assert data['status'] == cst.STATUS
        assert data['_head'] == cst.LOG_HEAD_LINES
        assert data['logparser_version'] == cst.LOGPARSER_VERSION
Example #3
def test_disable_telnet(psr):
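    """Issue #4: with enable_telnet on, stats collected via telnet must keep updating across parse rounds."""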
    last_update_timestamp = 0
    runtime = 0
    cwd = os.getcwd()
    os.chdir(cst.DEMO_PROJECT_PATH)
    try:
        version = '1.5.1' if (cst.ON_WINDOWS or on_fedora) else '1.6.0'
        cmd = 'pip install scrapy==%s' % version
        cst.sub_process(cmd, block=True)
        for name in ['enable_telnet', 'disable_telnet']:
            enable_telnet = name == 'enable_telnet'
            parser = psr(execute_main=False, enable_telnet=enable_telnet)
            # Test MyTelnet.verify_log_file_path()
            if enable_telnet:
                for _name in ['6023', '6024']:
                    _log_file = os.path.join(cst.DEMO_PROJECT_LOG_FOLDER_PATH,
                                             '%s.log' % _name)
                    cst.write_text(
                        _log_file,
                        TELNET_151_PORT_16023.replace(':16023', ':%s' % _name))

            log_file = os.path.join(cst.DEMO_PROJECT_LOG_FOLDER_PATH,
                                    '%s.log' % name)
            cmd = 'scrapy crawl example -s CLOSESPIDER_TIMEOUT=40 -s LOG_FILE=%s' % log_file
            cst.sub_process(cmd)
            time.sleep(10)
            parser.main()
            if enable_telnet:
                log_data = cst.read_data(re.sub(r'\.log$', '.json', log_file))
                last_update_timestamp = log_data['crawler_stats'][
                    'last_update_timestamp']
                assert last_update_timestamp
                runtime = log_data['crawler_engine'][
                    'time()-engine.start_time']
                assert runtime
            time.sleep(10)
            parser.main()
            # Issue #4: Stats collected via telnet are not being updated periodically
            if enable_telnet:
                log_data = cst.read_data(re.sub(r'\.log$', '.json', log_file))
                assert log_data['crawler_stats'][
                    'last_update_timestamp'] > last_update_timestamp
                assert log_data['crawler_engine'][
                    'time()-engine.start_time'] > runtime
            time.sleep(30)
            parser.main()
            log_data = cst.read_data(re.sub(r'\.log$', '.json', log_file))
            assert log_data['latest_matches']['scrapy_version'] == version
            assert log_data['latest_matches']['telnet_console']
            assert log_data['crawler_stats']['source'] == 'log'
            if enable_telnet:
                assert log_data['crawler_engine']
            else:
                assert not log_data['crawler_engine']
    finally:
        os.chdir(cwd)
Example #4
def test_log_no_change(psr):
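    """An unchanged log keeps its last_update_time while the global stats timestamp advances."""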
    start_time = time.time()
    psr(parse_round_interval=1, exit_timeout=0.001)  # first parse round, then exit
    parse_time = time.time() - start_time
    exit_timeout = parse_time * 3  # ensure the parser is still sleeping when exit_timeout fires
    interval = exit_timeout + 5
    psr(parse_round_interval=interval, exit_timeout=exit_timeout)
    stats = cst.read_data(cst.STATS_JSON_PATH)
    data = cst.read_data(cst.LOG_JSON_PATH)
    assert stats['datas'][cst.PROJECT][cst.SPIDER][cst.JOB]['last_update_time'] == data['last_update_time']
    # last_update_timestamp drops the fractional part of the timestamp, hence the '- 2' margin on the right
    assert stats['last_update_timestamp'] - data['last_update_timestamp'] > interval - 2
Example #5
def test_scrapyd_server(psr):
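    """The scrapyd_server option is reflected in the recorded json_url."""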
    default = '127.0.0.1:6800'
    json_url = 'http://%s/logs/%s/%s/%s.json' % (default, cst.PROJECT, cst.SPIDER, cst.JOB)
    psr()
    stats = cst.read_data(cst.STATS_JSON_PATH)
    assert stats['datas'][cst.PROJECT][cst.SPIDER][cst.JOB]['json_url'] == json_url

    localhost = 'localhost:6800'
    json_url = 'http://%s/logs/%s/%s/%s.json' % (localhost, cst.PROJECT, cst.SPIDER, cst.JOB)
    psr(scrapyd_server=localhost)
    stats = cst.read_data(cst.STATS_JSON_PATH)
    assert stats['datas'][cst.PROJECT][cst.SPIDER][cst.JOB]['json_url'] == json_url
Example #6
def test_new_size_read_data(psr):
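    """A short append is deferred, while a version mismatch or a broken json file forces a full re-read."""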
    appended_log = u'test'
    appended_log_length = len(appended_log)
    parser = psr()
    log_data = cst.read_data(cst.LOG_JSON_PATH)
    assert log_data['logparser_version'] == cst.LOGPARSER_VERSION
    cst.check_demo_data(log_data)
    last_update_timestamp = log_data['last_update_timestamp']

    # Valid but short appended log
    cst.write_text(cst.LOG_PATH, appended_log, append=True)
    time.sleep(2)
    parser.main()
    assert os.path.getsize(cst.APPENDED_LOG_PATH) == 0
    log_data = cst.read_data(cst.LOG_JSON_PATH)
    assert log_data['last_update_timestamp'] > last_update_timestamp
    assert log_data['size'] == cst.SIZE + appended_log_length
    assert log_data['position'] == cst.SIZE
    # The previously parsed result is not affected by the short appended log
    cst.check_demo_data(log_data)

    # Mismatching version
    log_data['logparser_version'] = '0.0.0'
    cst.write_text(cst.LOG_JSON_PATH, cst.json_dumps(log_data))
    log_data = cst.read_data(cst.LOG_JSON_PATH)
    assert log_data['logparser_version'] == '0.0.0'

    cst.write_text(cst.LOG_PATH, appended_log, append=True)
    now_size = cst.SIZE + appended_log_length * 2
    parser.main()
    assert os.path.getsize(cst.APPENDED_LOG_PATH) == now_size
    log_data = cst.read_data(cst.LOG_JSON_PATH)
    assert log_data['logparser_version'] == cst.LOGPARSER_VERSION
    assert log_data['size'] == now_size
    assert log_data['position'] == now_size
    cst.check_demo_data(log_data)

    # Broken json file
    cst.write_text(cst.LOG_JSON_PATH, appended_log, append=True)
    cst.write_text(cst.LOG_PATH, appended_log, append=True)
    now_size = cst.SIZE + appended_log_length * 3
    parser.main()
    assert os.path.getsize(cst.APPENDED_LOG_PATH) == now_size
    log_data = cst.read_data(cst.LOG_JSON_PATH)
    assert log_data['size'] == now_size
    assert log_data['position'] == now_size
    cst.check_demo_data(log_data)
Example #7
def test_log_extensions(psr):
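    """log_extensions controls which log files are collected into the stats."""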
    if os.path.exists(cst.STATS_JSON_PATH):
        os.remove(cst.STATS_JSON_PATH)
    psr(log_extensions=[])
    stats = cst.read_data(cst.STATS_JSON_PATH)
    assert stats['datas'] == {}

    psr(log_extensions=['.log'])
    stats = cst.read_data(cst.STATS_JSON_PATH)
    assert len(stats['datas']) == 1 and cst.JOB in stats['datas'][cst.PROJECT][cst.SPIDER]

    psr(log_extensions=['.txt'])
    stats = cst.read_data(cst.STATS_JSON_PATH)
    assert len(stats['datas']) == 1 and cst.JOB_TXT in stats['datas'][cst.PROJECT_TXT][cst.SPIDER_TXT]

    psr(log_extensions=cst.LOG_EXTENSIONS)
    stats = cst.read_data(cst.STATS_JSON_PATH)
    assert (len(stats['datas']) == 2
            and cst.JOB in stats['datas'][cst.PROJECT][cst.SPIDER]
            and cst.JOB_TXT in stats['datas'][cst.PROJECT_TXT][cst.SPIDER_TXT])
Example #8
def test_chunk_size(psr):
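    """Parsing with a small chunk_size yields the same result as a single pass."""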
    parser = psr(execute_main=False)
    os.remove(cst.TXT_PATH)
    assert not os.path.isdir(cst.TXT_PATH)
    parser.main()
    data = cst.read_data(cst.LOG_JSON_PATH)
    assert data['first_log_time'] == '2018-10-23 18:28:34'
    assert data['latest_log_time'] == '2018-10-23 18:29:42'
    cst.check_demo_data(data)
    assert os.path.getsize(cst.APPENDED_LOG_PATH) == cst.SIZE

    parser = psr(execute_main=False, chunk_size=10000)  # file size: 15862 = 9924 + 5938 with CRLF; 15683 = 9938 + 5745 with LF
    os.remove(cst.TXT_PATH)
    assert not os.path.isdir(cst.TXT_PATH)
    parser.main()
    data = cst.read_data(cst.LOG_JSON_PATH)
    cst.json_dumps(data)
    assert data['first_log_time'] == '2018-10-23 18:28:34'
    assert data['latest_log_time'] == '2018-10-23 18:29:42'
    cst.check_demo_data(data)
    assert os.path.getsize(cst.APPENDED_LOG_PATH) == (5938 if len(os.linesep) == 2 else 5745)
Example #9
def test_keep_data_in_memory(psr):
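    """keep_data_in_memory determines whether full parse results survive simplification between rounds."""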
    datas_full_keys_set = set(cst.META_KEYS + cst.PARSE_KEYS + cst.FULL_EXTENDED_KEYS)
    datas_simplified_keys_set = set(cst.META_KEYS + cst.SIMPLIFIED_KEYS)

    parser = psr(keep_data_in_memory=True)
    datas_full = cst.read_data(cst.DATAS_COMPLETE_JSON_PATH)
    for k in [cst.LOG_PATH, cst.TXT_PATH]:
        assert set(datas_full[k].keys()) == datas_full_keys_set
    datas_simplified = cst.read_data(cst.DATAS_SIMPLIFIED_JSON_PATH)
    for k in [cst.LOG_PATH, cst.TXT_PATH]:
        assert set(datas_simplified[k].keys()) == datas_full_keys_set
    # keys_redundant: expect a debug line like
    # DEBUG: Simplify demo_txt/test_txt/2019-01-01T00_00_02 in memory
    os.remove(cst.TXT_PATH)
    parser.main()
    datas_full = cst.read_data(cst.DATAS_COMPLETE_JSON_PATH)
    for k in [cst.LOG_PATH, cst.TXT_PATH]:
        assert set(datas_full[k].keys()) == datas_full_keys_set
    datas_simplified = cst.read_data(cst.DATAS_SIMPLIFIED_JSON_PATH)
    assert set(datas_simplified[cst.LOG_PATH].keys()) == datas_full_keys_set
    assert set(datas_simplified[cst.TXT_PATH].keys()) == datas_simplified_keys_set

    parser = psr(keep_data_in_memory=False)
    datas_full = cst.read_data(cst.DATAS_COMPLETE_JSON_PATH)
    for k in [cst.LOG_PATH, cst.TXT_PATH]:
        assert set(datas_full[k].keys()) == datas_full_keys_set
    datas_simplified = cst.read_data(cst.DATAS_SIMPLIFIED_JSON_PATH)
    for k in [cst.LOG_PATH, cst.TXT_PATH]:
        assert set(datas_simplified[k].keys()) == datas_simplified_keys_set
    # New round of parsing: old file with new size; exercise read_data(), which finds cst.LOG_JSON_PATH invalid
    cst.write_text(cst.LOG_PATH, u'appended_log\n', append=True)
    parser.main()
    cst.write_text(cst.LOG_JSON_PATH, u'')
    cst.write_text(cst.LOG_PATH, END, append=True)
    parser.main()
Example #10
def test_actual_lines(psr):
    """
    2019-01-01 00:00:01 DEBUG 1
    a

    b

    2019-01-01 00:00:01 DEBUG 2
    """
    prefix = u'2019-01-01 00:00:01 DEBUG '
    parser = psr(execute_main=False, log_head_lines=5, log_tail_lines=10)
    # On Windows, writing '\r\n' in text mode stores it as '\r\r\n'
    cst.write_text(cst.LOG_PATH, prefix + '1\na\n\nb\n\n')
    cst.write_text(cst.LOG_PATH, prefix + '2\n', append=True)
    parser.main()
    log_data = cst.read_data(cst.LOG_JSON_PATH)
    assert '1\na\n\nb\n\n' in log_data['head']
    assert log_data['_head'] == log_data['head']

    for i in range(3, 8):
        cst.write_text(cst.LOG_PATH, prefix + '%s\n' % i, append=True)
    parser.main()
    log_data = cst.read_data(cst.LOG_JSON_PATH)
    assert log_data['_head'] == 5
    for i in range(1, 8):
        if i <= 3:
            assert 'DEBUG %s' % i in log_data['head']
        else:
            assert 'DEBUG %s' % i not in log_data['head']
    head = log_data['head']

    for i in range(8, 12):
        cst.write_text(cst.LOG_PATH, prefix + '%s\n' % i, append=True)
    parser.main()
    log_data = cst.read_data(cst.LOG_JSON_PATH)
    assert log_data['_head'] == 5
    assert log_data['head'] == head
    assert log_data['tail'].startswith('b\n\n')
    for i in range(2, 11):
        assert 'DEBUG %s' % i in log_data['tail']
Example #11
def test_empty_logs_dir(psr):
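    """An empty logs directory produces only a default stats file."""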
    parser = psr(execute_main=False)
    # cst.STATS_JSON_PATH is created in Parser.__init__()
    for path in [cst.LOG_PATH, cst.TXT_PATH, cst.STATS_JSON_PATH]:
        os.remove(path)
    parser.main()
    for path in [cst.LOG_PATH, cst.TXT_PATH, cst.LOG_JSON_PATH, cst.TXT_JSON_PATH]:
        assert not os.path.exists(path)
    assert os.path.exists(cst.STATS_JSON_PATH)
    stats = cst.read_data(cst.STATS_JSON_PATH)
    default_stats = dict(status='ok', datas={}, logparser_version=cst.LOGPARSER_VERSION)
    assert set(stats.keys()) == {'status', 'datas', 'settings_py', 'settings',
                                 'last_update_timestamp', 'last_update_time', 'logparser_version'}
    for k, v in default_stats.items():
        assert stats[k] == v
    # last_update_time is derived from last_update_timestamp
    assert cst.string_to_timestamp(stats['last_update_time']) == stats['last_update_timestamp']
Example #12
def test_telnet_fail(psr):
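    """When telnet fails, the matched telnet_console is still recorded but crawler_engine stays empty."""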
    parser = psr(execute_main=False)
    for name in [
            'telnet_151_port_16023', 'telnet_160_port_16024',
            'telnet_151_no_port'
    ]:
        log_file = os.path.join(cst.DEMO_PROJECT_LOG_FOLDER_PATH,
                                '%s.log' % name)
        cst.write_text(log_file, globals()[name.upper()])
        parser.main()
        log_data = cst.read_data(re.sub(r'\.log$', '.json', log_file))
        if name == 'telnet_151_port_16023':
            assert log_data['latest_matches']['scrapy_version'] == '1.5.1'
            assert log_data['latest_matches'][
                'telnet_console'] == '127.0.0.1:16023'
        elif name == 'telnet_160_port_16024':
            assert log_data['latest_matches']['scrapy_version'] == '1.6.0'
            assert log_data['latest_matches'][
                'telnet_console'] == '127.0.0.1:16024'
        else:
            assert log_data['latest_matches']['scrapy_version'] == '1.5.1'
            assert log_data['latest_matches']['telnet_console'] == 'localhost'
        assert not log_data['crawler_engine']
Example #13
def test_new_file_read_data(psr):
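    """Unchanged files are skipped; a smaller or unreadable file is re-parsed in the next round."""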
    psr()
    log_data = cst.read_data(cst.LOG_JSON_PATH)
    last_update_timestamp = log_data['last_update_timestamp']

    # Skip parsing since data with same size found
    # Old file with old size
    parser = psr(execute_main=False, reset_logs=False)
    for i in range(2):
        time.sleep(2)
        parser.main()
        log_data = cst.read_data(cst.LOG_JSON_PATH)
        assert log_data['last_update_timestamp'] == last_update_timestamp
        cst.check_demo_data(log_data)

    # Old logfile with smaller size
    cst.write_text(cst.LOG_PATH, FRONT + END.replace('memory', ''))
    parser.main()
    log_data = cst.read_data(cst.LOG_JSON_PATH)
    assert log_data['last_update_timestamp'] == last_update_timestamp
    cst.check_demo_data(log_data)
    stats = cst.read_data(cst.STATS_JSON_PATH)
    assert cst.PROJECT not in stats['datas']
    # -> parse in next round
    parser.main()
    log_data = cst.read_data(cst.LOG_JSON_PATH)
    assert log_data['last_update_timestamp'] > last_update_timestamp
    cst.check_demo_data(log_data)
    stats = cst.read_data(cst.STATS_JSON_PATH)
    assert cst.PROJECT in stats['datas']

    # Reading the data file fails
    time.sleep(2)
    cst.write_text(cst.LOG_JSON_PATH, u'')
    psr(reset_logs=False)
    log_data = cst.read_data(cst.LOG_JSON_PATH)
    assert log_data['last_update_timestamp'] > last_update_timestamp
    cst.check_demo_data(log_data)
Example #14
def test_log_headlines_taillines(psr):
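    """log_head_lines and log_tail_lines bound the stored head and tail."""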
    psr(log_head_lines=5, log_tail_lines=10)
    data = cst.read_data(cst.LOG_JSON_PATH)
    assert len(data['head'].split('\n')) == 5
    assert len(data['tail'].split('\n')) == 10
Example #15
def test_log_categories_limit(psr):
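    """log_categories_limit caps the number of detail lines kept per log category."""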
    log_categories_limit = 3
    psr(log_categories_limit=log_categories_limit)
    data = cst.read_data(cst.LOG_JSON_PATH)
    cst.check_demo_data(data, log_categories_limit=log_categories_limit)
Example #16
def test_appended_log(psr):
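    """Feed the demo log to the parser piece by piece and check each incremental result."""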
    first_log_time = '2018-10-23 18:28:34'

    parser = psr(execute_main=False, log_head_lines=10, log_tail_lines=50)
    # 2018-10-23 18:28:35 [test] WARNING: warn
    front_head, front_tail = re.split(r'WARNING: warn[^i]', FRONT)
    # {'item': 2}
    # 2018-10-23 18:28:40 [..logstats] INFO: Crawled 3 pages (at 60 pages/min), scraped 2 items (at 60 items/min)
    front_mid, front_tail = front_tail.split("{'item': 2}")

    cst.write_text(cst.LOG_PATH, u'')
    # Test short appended log
    for idx, appended_log in enumerate([u'', u'2018-10-23 18:28:34 DEBUG\n',
                                        u'2018-10-23 18:28:34 INFO\n', u'test\n']):
        cst.write_text(cst.LOG_PATH, appended_log, append=True)
        parser.main()
        data = cst.read_data(cst.LOG_JSON_PATH)
        # Text to be ignored for next round: '2018-10-23 18:28:34 INFO\r\n'
        # appended log: 2018-10-23 18:28:34 DEBUG
        # "_head": "2018-10-23 18:28:34 DEBUG\n",
        # "head": "2018-10-23 18:28:34 DEBUG\n",
        # "tail": "2018-10-23 18:28:34 DEBUG\n",
        if idx >= 2:
            assert data['first_log_time'] == first_log_time
            assert data['_head']
        else:
            assert data['first_log_time'] == cst.NA
            assert not data['_head']
        assert data['finish_reason'] == cst.NA
        assert data['pages'] is None
        assert data['items'] is None

    cst.write_text(cst.LOG_PATH, front_head, append=True)
    parser.main()
    data = cst.read_data(cst.LOG_JSON_PATH)
    assert data['first_log_time'] == first_log_time
    assert data['latest_log_time'] == '2018-10-23 18:28:35'
    assert data['datas'] == [['2018-10-23 18:28:35', 0, 0, 0, 0]]
    assert data['pages'] == 0
    assert data['items'] == 0
    for k in cst.LATEST_MATCHES_RESULT_DICT.keys():
        if k in ['telnet_console', 'resuming_crawl', 'latest_stat']:
            assert data['latest_matches'][k]
        else:
            assert not data['latest_matches'][k]
    for k in cst.LOG_CATEGORIES_RESULT_DICT.keys():
        assert data['log_categories'][k]['count'] == 0
        assert data['log_categories'][k]['details'] == []
    assert data['shutdown_reason'] == cst.NA
    assert data['finish_reason'] == cst.NA
    assert '[scrapy.utils.log] INFO: Scrapy 1.5.0 started' in data['head']
    assert '[scrapy.utils.log] INFO: Scrapy 1.5.0 started' in data['tail']

    cst.write_text(cst.LOG_PATH, u'WARNING: warn\n' + front_mid, append=True)
    parser.main()
    data = cst.read_data(cst.LOG_JSON_PATH)
    assert data['first_log_time'] == first_log_time
    assert data['latest_log_time'] == '2018-10-23 18:28:39'
    assert (data['datas'][0] == ['2018-10-23 18:28:35', 0, 0, 0, 0]
            and ['2018-10-23 18:28:37', 1, 60, 0, 0] in data['datas']
            and ['2018-10-23 18:28:38', 2, 60, 1, 60] in data['datas']
            and data['datas'][-1] == ['2018-10-23 18:28:39', 2, 0, 1, 0]
            and len(data['datas']) == 5)
    assert data['pages'] == 2
    assert data['items'] == 1
    for k in cst.LATEST_MATCHES_RESULT_DICT.keys():
        assert data['latest_matches'][k]
    assert data['latest_matches']['latest_item'] == "{'item': 1}"
    for k, (count, __) in cst.LOG_CATEGORIES_RESULT_DICT.items():
        if k == 'error_logs':
            assert data['log_categories'][k]['count'] == 4
        elif k == 'retry_logs':
            assert data['log_categories'][k]['count'] == 0
        else:
            assert data['log_categories'][k]['count'] == count
        if k == 'retry_logs':
            assert data['log_categories'][k]['details'] == []
        else:
            assert data['log_categories'][k]['details']
    assert data['shutdown_reason'] == cst.NA
    assert data['finish_reason'] == cst.NA

    cst.write_text(cst.LOG_PATH, u"{'item': 2}" + front_tail, append=True)
    parser.main()
    data = cst.read_data(cst.LOG_JSON_PATH)
    assert data['first_log_time'] == first_log_time
    assert data['latest_log_time'] == '2018-10-23 18:29:41'
    assert (data['datas'][0] == ['2018-10-23 18:28:35', 0, 0, 0, 0]
            and data['datas'][-1] == ['2018-10-23 18:29:41', 3, 0, 2, 0]
            and len(data['datas']) == 67)
    assert data['pages'] == 3
    assert data['items'] == 2
    for k in cst.LATEST_MATCHES_RESULT_DICT.keys():
        assert data['latest_matches'][k]
    for k, (count, __) in cst.LOG_CATEGORIES_RESULT_DICT.items():
        assert data['log_categories'][k]['count'] == count
        assert data['log_categories'][k]['details']
    assert data['shutdown_reason'] == cst.NA
    assert data['finish_reason'] == cst.NA

    # 'finish_reason': 'finished',
    # 'finish_time': datetime.datetime(2018, 10, 23, 10, 29, 41, 174719),
    end_head, end_tail = END.split("'finish_time'")
    cst.write_text(cst.LOG_PATH, end_head, append=True)
    parser.main()
    data = cst.read_data(cst.LOG_JSON_PATH)
    assert data['first_log_time'] == first_log_time
    assert data['latest_log_time'] == '2018-10-23 18:29:41'
    assert (data['datas'][0] == ['2018-10-23 18:28:35', 0, 0, 0, 0]
            and data['datas'][-1] == ['2018-10-23 18:29:41', 3, 0, 2, 0]
            and len(data['datas']) == 67)
    assert data['pages'] == 3
    assert data['items'] == 2
    for k in cst.LATEST_MATCHES_RESULT_DICT.keys():
        assert data['latest_matches'][k]
    for k, (count, __) in cst.LOG_CATEGORIES_RESULT_DICT.items():
        assert data['log_categories'][k]['count'] == count
        assert data['log_categories'][k]['details']
    assert data['shutdown_reason'] == cst.NA
    assert data['finish_reason'] == cst.NA
    # 2018-10-23 18:29:41 [scrapy.extensions.feedexport] INFO: Stored jsonlines feed (2 items) in: file:///
    # 2018-10-23 18:29:41 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
    assert 'INFO: Stored jsonlines feed' in data['tail']
    assert 'INFO: Dumping Scrapy stats:' not in data['tail']

    cst.write_text(cst.LOG_PATH, u"'finish_time'" + end_tail, append=True)
    parser.main()
    data = cst.read_data(cst.LOG_JSON_PATH)
    assert data['first_log_time'] == first_log_time
    assert data['latest_log_time'] == '2018-10-23 18:29:42'
    assert (data['datas'][0] == ['2018-10-23 18:28:35', 0, 0, 0, 0]
            and data['datas'][-1] == ['2018-10-23 18:29:41', 3, 0, 2, 0]
            and len(data['datas']) == 67)
    assert data['pages'] == 3
    assert data['items'] == 2
    for k in cst.LATEST_MATCHES_RESULT_DICT.keys():
        assert data['latest_matches'][k]
    for k, (count, __) in cst.LOG_CATEGORIES_RESULT_DICT.items():
        assert data['log_categories'][k]['count'] == count
        assert data['log_categories'][k]['details']
    assert data['shutdown_reason'] == cst.NA
    assert data['finish_reason'] == 'finished'
    # On Windows the appended text is stored with '\r\n', so the final size can be
    # 15883 rather than cst.SIZE (15862); only check that size equals position:
    # assert data['size'] == 15883
    # assert data['position'] == 15883
    assert data['size'] == data['position']
    assert '[scrapy.utils.log] INFO: Scrapy 1.5.0 started' in data['head']
    assert '[scrapy.core.engine] INFO: Spider closed' not in data['head']
    assert '[scrapy.utils.log] INFO: Scrapy 1.5.0 started' not in data['tail']
    assert '[scrapy.core.engine] INFO: Spider closed' in data['tail']
Example #17
def test_telnet(psr):
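    """Run real crawls across Scrapy versions and check version detection, telnet credentials, and stat sources."""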
    parser = psr(execute_main=False)

    cwd = os.getcwd()
    os.chdir(cst.DEMO_PROJECT_PATH)
    try:
        for version in ['1.4.0', '1.5.0', '1.5.1', '1.5.2', '1.6.0', 'latest']:
            if version == 'latest':
                cmd = 'pip install --upgrade scrapy'
            else:
                cmd = 'pip install scrapy==%s' % version
            cst.sub_process(cmd, block=True)
            log_file = os.path.join(cst.DEMO_PROJECT_LOG_FOLDER_PATH,
                                    'scrapy_%s.log' % version)
            cmd = 'scrapy crawl example -s CLOSESPIDER_TIMEOUT=20 -s LOG_FILE=%s' % log_file
            if version == '1.5.0':
                cmd += ' -s TELNETCONSOLE_ENABLED=False'
            elif version == '1.5.2':
                cmd += ' -s TELNETCONSOLE_USERNAME=usr123 -s TELNETCONSOLE_PASSWORD=psw456'
            proc = cst.sub_process(cmd)

            time.sleep(10)
            if version == '1.4.0':
                proc.kill()
            parser.main()

            if version != '1.4.0':
                time.sleep(20)
            parser.main()

            log_data = cst.read_data(re.sub(r'\.log$', '.json', log_file))
            if version == 'latest':
                assert log_data['latest_matches']['scrapy_version'] >= '1.6.0'
            else:
                assert log_data['latest_matches']['scrapy_version'] == version
            assert log_data['log_categories']['critical_logs']['count'] == 0
            assert log_data['log_categories']['error_logs']['count'] == 0
            if version == '1.5.0':
                assert not log_data['latest_matches']['telnet_console']
            else:
                assert log_data['latest_matches']['telnet_console']
            if version <= '1.5.1':
                assert not log_data['latest_matches']['telnet_username']
                assert not log_data['latest_matches']['telnet_password']
            elif version == '1.5.2':
                assert log_data['latest_matches'][
                    'telnet_username'] == 'usr123'
                assert log_data['latest_matches'][
                    'telnet_password'] == 'psw456'
            else:
                assert not log_data['latest_matches']['telnet_username']
                assert log_data['latest_matches']['telnet_password']
            if version == '1.4.0':
                assert log_data['finish_reason'] == 'N/A'
                assert not log_data['crawler_stats']
                assert not log_data['crawler_engine']
            else:
                assert log_data['finish_reason'] == 'closespider_timeout'
                assert log_data['crawler_stats']
                assert log_data['crawler_stats']['source'] == 'log'
                if version == '1.5.0' or ((cst.ON_WINDOWS or on_fedora)
                                          and version > '1.5.1'):
                    assert not log_data['crawler_engine']
                else:
                    assert log_data['crawler_engine']
                    assert log_data['crawler_engine']['source'] == 'telnet'
    finally:
        os.chdir(cwd)
Example #18
def test_jobs_to_keep(psr):
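    """jobs_to_keep bounds how many jobs are kept in the simplified data."""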
    parser = psr()
    datas_full = cst.read_data(cst.DATAS_COMPLETE_JSON_PATH)
    datas_simplified = cst.read_data(cst.DATAS_SIMPLIFIED_JSON_PATH)
    for datas in [datas_full, datas_simplified]:
        assert set(datas.keys()) == {cst.LOG_PATH, cst.TXT_PATH}
    # delete a logfile
    os.remove(cst.TXT_PATH)
    parser.main()
    datas_full = cst.read_data(cst.DATAS_COMPLETE_JSON_PATH)
    datas_simplified = cst.read_data(cst.DATAS_SIMPLIFIED_JSON_PATH)
    for datas in [datas_full, datas_simplified]:
        assert set(datas.keys()) == {cst.LOG_PATH, cst.TXT_PATH}
    # add a logfile
    copy(cst.LOG_PATH, cst.LOG_TEMP_PATH)
    parser.main()
    datas_full = cst.read_data(cst.DATAS_COMPLETE_JSON_PATH)
    datas_simplified = cst.read_data(cst.DATAS_SIMPLIFIED_JSON_PATH)
    for datas in [datas_full, datas_simplified]:
        assert set(datas.keys()) == {cst.LOG_PATH, cst.TXT_PATH, cst.LOG_TEMP_PATH}

    parser = psr(jobs_to_keep=1)
    datas_full = cst.read_data(cst.DATAS_COMPLETE_JSON_PATH)
    datas_simplified = cst.read_data(cst.DATAS_SIMPLIFIED_JSON_PATH)
    for datas in [datas_full, datas_simplified]:
        assert set(datas.keys()) == {cst.LOG_PATH, cst.TXT_PATH}
    # delete a logfile
    os.remove(cst.TXT_PATH)
    parser.main()
    datas_full = cst.read_data(cst.DATAS_COMPLETE_JSON_PATH)
    assert set(datas_full.keys()) == {cst.LOG_PATH, cst.TXT_PATH}
    datas_simplified = cst.read_data(cst.DATAS_SIMPLIFIED_JSON_PATH)
    assert set(datas_simplified.keys()) == {cst.LOG_PATH}
    # add a logfile
    copy(cst.LOG_PATH, cst.LOG_TEMP_PATH)
    parser.main()
    datas_full = cst.read_data(cst.DATAS_COMPLETE_JSON_PATH)
    datas_simplified = cst.read_data(cst.DATAS_SIMPLIFIED_JSON_PATH)
    for datas in [datas_full, datas_simplified]:
        assert set(datas.keys()) == {cst.LOG_PATH, cst.LOG_TEMP_PATH}