Exemplo n.º 1
0
    def test_parse_line_with_apache_line(self):
        """_parse_line must split an Apache log line into its format fields.

        Two samples are checked: one whose request carries an absolute URL
        and one whose request carries only a path.
        """
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        patched = self.mocker.patch(AccessChecker)
        patched._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        patched._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        checker = AccessChecker(collection='scl')

        absolute_url_line = '187.19.211.179 - - [30/May/2013:00:01:01 -0300] "GET http://www.scielo.br/scielo.php?pid=S0100-736X2000000300007&script=sci_arttext HTTP/1.1" 200 25084 "-" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"'
        absolute_url_fields = {
            '%h': '187.19.211.179',
            '%l': '-',
            '%u': '-',
            '%t': '[30/May/2013:00:01:01 -0300]',
            '%r': 'GET http://www.scielo.br/scielo.php?pid=S0100-736X2000000300007&script=sci_arttext HTTP/1.1',
            '%>s': '200',
            '%b': '25084',
            '%{Referer}i': '-',
            '%{User-Agent}i': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
        }

        path_only_line = '123.125.71.39 - - [30/Dec/2012:23:59:57 -0200] "GET /scielo.php?script=sci_nlinks&ref=000144&pid=S0103-4014200000020001300010&lng=pt HTTP/1.1" 200 1878 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"'
        path_only_fields = {
            '%h': '123.125.71.39',
            '%l': '-',
            '%u': '-',
            '%t': '[30/Dec/2012:23:59:57 -0200]',
            '%r': 'GET /scielo.php?script=sci_nlinks&ref=000144&pid=S0103-4014200000020001300010&lng=pt HTTP/1.1',
            '%>s': '200',
            '%b': '1878',
            '%{Referer}i': '-',
            '%{User-Agent}i': 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
        }

        samples = (
            (absolute_url_line, absolute_url_fields),
            (path_only_line, path_only_fields),
        )
        for log_line, fields in samples:
            self.assertEqual(checker._parse_line(log_line), fields)
Exemplo n.º 2
0
    def test_parsed_access_valid_html_access(self):
        """parsed_access must describe a valid sci_arttext HTML access."""
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        patched = self.mocker.patch(AccessChecker)
        patched._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        patched._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        checker = AccessChecker(collection='scl')

        log_line = '187.19.211.179 - - [30/May/2013:00:01:01 -0300] "GET http://www.scielo.br/scielo.php?pid=S1414-431X2000000300007&script=sci_arttext HTTP/1.1" 200 25084 "-" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"'

        expected = {
            # Who accessed what.
            'ip': '187.19.211.179',
            'code': 'S1414-431X2000000300007',
            'access_type': 'HTML',
            'script': 'sci_arttext',
            'query_string': {
                'pid': 'S1414-431X2000000300007',
                'script': 'sci_arttext'
            },
            # When.
            'iso_date': '2013-05-30',
            'iso_datetime': '2013-05-30T00:01:01',
            'year': '2013',
            'month': '05',
            'day': '30',
            # Raw values carried over from the log line.
            'original_agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
            'original_date': '[30/May/2013:00:01:01 -0300]',
        }

        parsed = checker.parsed_access(log_line)
        self.assertEqual(parsed, expected)
Exemplo n.º 3
0
    def test_parsed_access_valid_html_access_on_new_site(self):
        """parsed_access must accept an /article/... URL as an HTML access.

        The expected ``code`` is the article path without the trailing
        language segment, and no query string or script is reported.
        """
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        patched = self.mocker.patch(AccessChecker)
        patched._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        patched._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        checker = AccessChecker(collection='scl')

        # NOTE: the sample deliberately keeps two spaces after "GET".
        log_line = '187.19.211.179 - - [30/May/2013:00:01:01 -0300] "GET  https://www.scielo.br/article/bjmbr/2018.v51n11/e7704/en/ HTTP/1.1" 200 25084 "-" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"'

        expected = {
            'ip': '187.19.211.179',
            'code': '/article/bjmbr/2018.v51n11/e7704',
            'access_type': 'HTML',
            'script': '',
            'query_string': None,
            'iso_date': '2013-05-30',
            'iso_datetime': '2013-05-30T00:01:01',
            'year': '2013',
            'month': '05',
            'day': '30',
            'original_agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
            'original_date': '[30/May/2013:00:01:01 -0300]',
        }

        parsed = checker.parsed_access(log_line)
        self.assertEqual(parsed, expected)
Exemplo n.º 4
0
    def test_parsed_access_valid_pdf_access(self):
        """parsed_access must describe a valid PDF access (absolute URL)."""
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        patched = self.mocker.patch(AccessChecker)
        patched._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        patched._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        checker = AccessChecker(collection='scl')

        log_line = '201.14.120.2 - - [30/May/2013:00:01:01 -0300] "GET http://www.scielo.br/pdf/bjmbr/v14n4/03.pdf HTTP/1.1" 206 4608 "-" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"'

        expected = {
            'ip': '201.14.120.2',
            'code': '/pdf/bjmbr/v14n4/03.pdf',
            'access_type': 'PDF',
            'script': '',
            'query_string': None,
            # PDF-specific fields: the ISSN comes from the mocked
            # acronym-to-ISSN mapping ('bjmbr').
            'pdf_issn': u'1414-431X',
            'pdf_path': '/pdf/bjmbr/v14n4/03.pdf',
            'iso_date': '2013-05-30',
            'iso_datetime': '2013-05-30T00:01:01',
            'year': '2013',
            'month': '05',
            'day': '30',
            'original_agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
            'original_date': '[30/May/2013:00:01:01 -0300]',
        }

        parsed = checker.parsed_access(log_line)
        self.assertEqual(parsed, expected)
Exemplo n.º 5
0
    def test_parsed_access_valid_pdf_access_GET_string_without_domain(self):
        """parsed_access must accept a PDF request that carries only a path."""
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        patched = self.mocker.patch(AccessChecker)
        patched._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        patched._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        checker = AccessChecker(collection='scl')

        log_line = '66.249.73.80 - - [30/May/2013:00:01:01 -0300] "GET /pdf/bjmbr/v29n4/18781.pdf HTTP/1.1" 200 32061 "-" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"'

        expected = {
            'ip': '66.249.73.80',
            'code': '/pdf/bjmbr/v29n4/18781.pdf',
            'access_type': 'PDF',
            'script': '',
            'query_string': None,
            'pdf_issn': u'1414-431X',
            'pdf_path': '/pdf/bjmbr/v29n4/18781.pdf',
            'iso_date': '2013-05-30',
            'iso_datetime': '2013-05-30T00:01:01',
            'year': '2013',
            'month': '05',
            'day': '30',
            'original_agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
            'original_date': '[30/May/2013:00:01:01 -0300]',
        }

        parsed = checker.parsed_access(log_line)
        self.assertEqual(parsed, expected)
Exemplo n.º 6
0
    def test_parsed_access_valid_pdf_access_on_new_site_path_only(self):
        """parsed_access must accept a /pdf/<acronym>/<issue>/<doc>/<lang> path.

        The expected ``code`` drops the trailing language segment while
        ``pdf_path`` keeps the full requested path.
        """
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        patched = self.mocker.patch(AccessChecker)
        patched._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        patched._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        checker = AccessChecker(collection='scl')

        log_line = '201.14.120.2 - - [30/May/2013:00:01:01 -0300] "GET /pdf/bjmbr/2018.v51n11/e7704/en HTTP/1.1" 206 4608 "-" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"'

        expected = {
            'ip': '201.14.120.2',
            'code': '/pdf/bjmbr/2018.v51n11/e7704',
            'access_type': 'PDF',
            'script': '',
            'query_string': None,
            'pdf_issn': u'1414-431X',
            'pdf_path': '/pdf/bjmbr/2018.v51n11/e7704/en',
            'iso_date': '2013-05-30',
            'iso_datetime': '2013-05-30T00:01:01',
            'year': '2013',
            'month': '05',
            'day': '30',
            'original_agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
            'original_date': '[30/May/2013:00:01:01 -0300]',
        }

        parsed = checker.parsed_access(log_line)
        self.assertEqual(parsed, expected)
Exemplo n.º 7
0
    def test_parsed_access_valid_pdf_access_GET_string_without_domain(self):
        """parsed_access must accept a PDF request that carries only a path.

        This variant expects no ``original_agent`` / ``original_date`` keys
        in the result.
        """
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        patched = self.mocker.patch(AccessChecker)
        patched._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        patched._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        checker = AccessChecker(collection='scl')

        log_line = '66.249.73.80 - - [30/May/2013:00:01:01 -0300] "GET /pdf/bjmbr/v29n4/18781.pdf HTTP/1.1" 200 32061 "-" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"'

        expected = {
            'ip': '66.249.73.80',
            'code': '/pdf/bjmbr/v29n4/18781.pdf',
            'access_type': 'PDF',
            'script': '',
            'query_string': None,
            'pdf_issn': u'1414-431X',
            'pdf_path': '/pdf/bjmbr/v29n4/18781.pdf',
            'iso_date': '2013-05-30',
            'iso_datetime': '2013-05-30T00:01:01',
            'year': '2013',
            'month': '05',
            'day': '30',
        }

        parsed = checker.parsed_access(log_line)
        self.assertEqual(parsed, expected)
Exemplo n.º 8
0
 def setUp(self):
     """Create the AccessChecker fixture with stubbed lookup callables."""

     def allowed_collections():
         # Fresh list on every call, like the lambda it replaces.
         return [u"scl", u"arg"]

     def acronym_to_issn_dict(col):
         # The collection argument is ignored; the mapping is fixed.
         return {u'zool': u'1984-4670', u'bjmbr': u'1414-431X'}

     self.ac = AccessChecker(
         collection="scl",
         allowed_collections=allowed_collections,
         acronym_to_issn_dict=acronym_to_issn_dict,
     )
Exemplo n.º 9
0
def bulk(collection=None):
    """Process every not-yet-processed log file listed under LOG_DIR.

    Each file is first recorded in the processed-files collection, then read
    line by line. Lines that ``AccessChecker.parsed_access`` accepts are
    passed to ``register_access``. When COUNTER_COMPLIANT is set, an access
    is only registered if ``TimedSet.add`` accepts its lock id.

    :param collection: collection identifier handed to ``AccessChecker`` and
        ``Local``.
    """
    _logger.info('Running as bulk')

    if COUNTER_COMPLIANT:
        ts = TimedSet(expired=checkdatelock)

    ac = AccessChecker(collection)

    proc_coll = get_proc_collection()
    # The returned handle is not used here; the call is kept in case it has
    # side effects. NOTE(review): confirm and drop it if it has none.
    get_proc_robots_collection()

    # Read the whole listing up front so the pipe is closed before the
    # (potentially long) processing starts. The `ls` command itself is kept
    # so the path strings stored as 'file_name' stay exactly as before.
    with os.popen('ls %s/*' % LOG_DIR) as listing:
        logfiles = [entry.strip() for entry in listing]

    for logfile in logfiles:

        # `ls` emits blank lines and "dir:" headers when LOG_DIR contains
        # sub-directories; those entries cannot be opened as log files.
        if not os.path.isfile(logfile):
            _logger.debug('Skipping non-file listing entry %r', logfile)
            continue

        # Check whether the file has already been processed.
        if proc_coll.find({'file_name': logfile}).count() > 0:
            _logger.debug('File already processed %s', logfile)
            continue

        # Record the new file in the processed-files database.
        _logger.info("Processing: %s", logfile)
        proc_coll.insert({'file_name': logfile})

        rq = Local(MONGO_URI, collection)

        with open(logfile, 'rb') as f:

            for log_file_line, raw_line in enumerate(f, 1):
                # Lazy %-args: the message is only built when DEBUG is on.
                _logger.debug("Reading line %s from file %s",
                              log_file_line, logfile)
                parsed_line = ac.parsed_access(raw_line)

                if not parsed_line:
                    continue

                if COUNTER_COMPLIANT:
                    # Counter Mode Accesses
                    locktime = 30 if parsed_line['access_type'] == "PDF" else 10
                    try:
                        lockid = '_'.join([parsed_line['ip'],
                                           parsed_line['code'],
                                           parsed_line['script']])
                        ts.add(lockid, parsed_line['iso_datetime'], locktime)
                        register_access(rq, parsed_line)
                    except ValueError:
                        # Presumably raised by ts.add while the lock is
                        # still active -- TODO confirm. A ValueError from
                        # register_access is skipped the same way, as
                        # before.
                        continue
                else:
                    # SciELO Mode Accesses
                    register_access(rq, parsed_line)

        rq.send(slp=SLEEP)
        del rq
Exemplo n.º 10
0
    def test_pid_is_valid_script_sci_issues_valid_pid(self):
        """_is_valid_html_request must accept sci_issues with a known ISSN."""
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        patched = self.mocker.patch(AccessChecker)
        patched._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        patched._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        checker = AccessChecker(collection='scl')

        is_valid = checker._is_valid_html_request('sci_issues', '1414-431X')
        self.assertEqual(is_valid, True)
Exemplo n.º 11
0
    def test_pid_is_valid_script_sci_issues_valid_pid(self):
        """A sci_issues request for a mapped ISSN must be reported valid."""
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        checker_mock = self.mocker.patch(AccessChecker)
        checker_mock._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        checker_mock._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        subject = AccessChecker(collection='scl')

        outcome = subject._is_valid_html_request('sci_issues', '1414-431X')
        self.assertEqual(outcome, True)
Exemplo n.º 12
0
    def test_pid_is_valid_not_allowed_issn(self):
        """_is_valid_html_request must reject a PID with an unknown ISSN."""
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        patched = self.mocker.patch(AccessChecker)
        patched._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        patched._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        checker = AccessChecker(collection='scl')

        is_valid = checker._is_valid_html_request(
            'sci_arttext', 'XXXX-XXXX2012000100001')
        self.assertEqual(is_valid, False)
Exemplo n.º 13
0
    def test_is_bot_method_with_common_user_agent(self):
        """is_robot must return False for an ordinary Firefox user agent."""
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        patched = self.mocker.patch(AccessChecker)
        patched._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        patched._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        checker = AccessChecker(collection='scl')

        # The sample keeps the surrounding double quotes found in the log.
        user_agent = '"Mozilla/5.0 (Windows NT 5.1; rv:26.0) Gecko/20100101 Firefox/26.0"'

        flagged = checker.is_robot(user_agent)
        self.assertEqual(flagged, False)
Exemplo n.º 14
0
    def test_parsed_access_valid_pdf_with_not_allowed_acronym(self):
        """parsed_access must return None for a PDF of an unmapped acronym."""
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        patched = self.mocker.patch(AccessChecker)
        patched._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        patched._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        checker = AccessChecker(collection='scl')

        log_line = '201.14.120.2 - - [30/May/2013:00:01:01 -0300] "GET http://www.scielo.br/pdf/not_allowed_acronym/v14n4/03.pdf HTTP/1.1" 206 4608 "-" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"'

        parsed = checker.parsed_access(log_line)
        self.assertEqual(parsed, None)
Exemplo n.º 15
0
    def test_parsed_access_valid_pdf_with_any_different_access(self):
        """parsed_access must return None for a non-article resource (a GIF)."""
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        patched = self.mocker.patch(AccessChecker)
        patched._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        patched._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        checker = AccessChecker(collection='scl')

        log_line = '177.191.212.233 - - [30/May/2013:00:01:01 -0300] "GET http://www.scielo.br/img/pt/author.gif HTTP/1.1" 304 0 "http://www.scielo.br/scielo.php?script=sci_serial&pid=1415-4757&nrm=iso&rep=&lng=pt" "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36"'

        parsed = checker.parsed_access(log_line)
        self.assertEqual(parsed, None)
Exemplo n.º 16
0
    def test_parsed_access_invalid_article_access_without_query_string(self):
        """parsed_access must return None for scielo.php without a query."""
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        patched = self.mocker.patch(AccessChecker)
        patched._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        patched._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        checker = AccessChecker(collection='scl')

        log_line = '187.19.211.179 - - [30/May/2013:00:01:01 -0300] "GET http://www.scielo.br/scielo.php HTTP/1.1" 200 25084 "-" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"'

        parsed = checker.parsed_access(log_line)
        self.assertEqual(parsed, None)
Exemplo n.º 17
0
    def test_is_bot_GoogleBot_sample(self):
        """parsed_access must return None for a Googlebot request."""
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        patched = self.mocker.patch(AccessChecker)
        patched._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        patched._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        checker = AccessChecker(collection='scl')

        log_line = '66.249.75.131 - - [24/Dec/2013:04:49:09 -0200] "GET http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0102-79722002000200013 HTTP/1.1" 200 102967 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"'

        parsed = checker.parsed_access(log_line)
        self.assertEqual(parsed, None)
Exemplo n.º 18
0
    def test_access_datetime(self):
        """_access_date must parse a bracketed Apache timestamp.

        The ISO form of the result is expected to carry no UTC offset.
        """
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        patched = self.mocker.patch(AccessChecker)
        patched._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        patched._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        checker = AccessChecker(collection='scl')

        raw_timestamp = u'[30/Dec/2012:23:59:57 -0200]'

        iso_text = checker._access_date(raw_timestamp).isoformat()
        self.assertEqual(iso_text, u'2012-12-30T23:59:57')
Exemplo n.º 19
0
    def test_is_bot_method_with_bot_agent(self):
        """is_robot must return True for the Baiduspider user agent."""
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        patched = self.mocker.patch(AccessChecker)
        patched._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        patched._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        checker = AccessChecker(collection='scl')

        # The sample keeps the surrounding double quotes found in the log.
        user_agent = '"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"'

        flagged = checker.is_robot(user_agent)
        self.assertEqual(flagged, True)
Exemplo n.º 20
0
    def test_pid_is_valid_pdf_request_empty_file_path(self):
        """_is_valid_pdf_request must return None for an empty request."""
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        patched = self.mocker.patch(AccessChecker)
        patched._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        patched._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        checker = AccessChecker(collection='scl')

        empty_request = u''

        verdict = checker._is_valid_pdf_request(empty_request)
        self.assertEqual(verdict, None)
Exemplo n.º 21
0
    def test_pid_is_valid_pdf_request_invalid_request_not_allowed_acronym(self):
        """_is_valid_pdf_request must return None for an unmapped acronym."""
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        patched = self.mocker.patch(AccessChecker)
        patched._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        patched._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        checker = AccessChecker(collection='scl')

        http_request = u'GET http://www.scielo.br/pdf/not_allowed_acronym/v96n2/a18v96n2.xxx HTTP/1.1'

        verdict = checker._is_valid_pdf_request(http_request)
        self.assertEqual(verdict, None)
Exemplo n.º 22
0
    def test_pid_is_valid_pdf_request_GET_without_domain(self):
        """_is_valid_pdf_request must resolve the ISSN for a path-only GET."""
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        patched = self.mocker.patch(AccessChecker)
        patched._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        patched._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        checker = AccessChecker(collection='scl')

        http_request = u'GET /pdf/zool/v29n4/18781.pdf HTTP/1.1'
        expected = {
            'pdf_issn': u'1984-4670',
            'pdf_path': u'/pdf/zool/v29n4/18781.pdf',
        }

        verdict = checker._is_valid_pdf_request(http_request)
        self.assertEqual(verdict, expected)
Exemplo n.º 23
0
    def test_access_date_with_invalid_date(self):
        """_access_date must return None for an empty timestamp string."""
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        patched = self.mocker.patch(AccessChecker)
        patched._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        patched._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        checker = AccessChecker(collection='scl')

        raw_timestamp = u''

        parsed_date = checker._access_date(raw_timestamp)
        self.assertEqual(parsed_date, None)
Exemplo n.º 24
0
    def test_is_bot_Spider_sample(self):
        """parsed_access must return None for a Baiduspider PDF request."""
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        patched = self.mocker.patch(AccessChecker)
        patched._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        patched._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        checker = AccessChecker(collection='scl')

        log_line = '180.76.5.118 - - [24/Dec/2013:04:49:09 -0200] "GET http://www.scielo.br/pdf/csc/v11n2/30434.pdf HTTP/1.1" 200 79618 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"'

        parsed = checker.parsed_access(log_line)
        self.assertEqual(parsed, None)
Exemplo n.º 25
0
    def test_is_bot_Bing_sample(self):
        """parsed_access must return None for a bingbot request."""
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        patched = self.mocker.patch(AccessChecker)
        patched._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        patched._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        checker = AccessChecker(collection='scl')

        # NOTE: the sample line starts with a '13245  ' prefix before the IP.
        log_line = '13245  157.56.92.164 - - [30/Nov/2013:03:53:26 -0200] "GET http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0104-87752010000200013 HTTP/1.1" 200 108777 "-" "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)"'

        parsed = checker.parsed_access(log_line)
        self.assertEqual(parsed, None)
Exemplo n.º 26
0
    def test_query_string_without_parameters(self):
        """_query_string must return None when the URL has no parameters."""
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        patched = self.mocker.patch(AccessChecker)
        patched._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        patched._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        checker = AccessChecker(collection='scl')

        http_request = u'GET http://www.scielo.br/scielo.php HTTP/1.1'

        query = checker._query_string(http_request)
        self.assertEqual(query, None)
Exemplo n.º 27
0
    def test_parsed_access_valid_pdf_with_any_different_access(self):
        """A request for an image file must not be reported as an access."""
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        checker_mock = self.mocker.patch(AccessChecker)
        checker_mock._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        checker_mock._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        subject = AccessChecker(collection='scl')

        raw_entry = '177.191.212.233 - - [30/May/2013:00:01:01 -0300] "GET http://www.scielo.br/img/pt/author.gif HTTP/1.1" 304 0 "http://www.scielo.br/scielo.php?script=sci_serial&pid=1415-4757&nrm=iso&rep=&lng=pt" "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36"'

        outcome = subject.parsed_access(raw_entry)
        self.assertEqual(outcome, None)
Exemplo n.º 28
0
    def test_query_string_without_parameters(self):
        """A request URL with no '?' part must yield no query string."""
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        checker_mock = self.mocker.patch(AccessChecker)
        checker_mock._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        checker_mock._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        subject = AccessChecker(collection='scl')

        bare_request = u'GET http://www.scielo.br/scielo.php HTTP/1.1'

        outcome = subject._query_string(bare_request)
        self.assertEqual(outcome, None)
Exemplo n.º 29
0
    def test_pid_is_valid_not_allowed_issn(self):
        """A sci_arttext PID whose ISSN is not mapped must be invalid."""
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        checker_mock = self.mocker.patch(AccessChecker)
        checker_mock._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        checker_mock._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        subject = AccessChecker(collection='scl')

        unknown_pid = 'XXXX-XXXX2012000100001'
        outcome = subject._is_valid_html_request('sci_arttext', unknown_pid)
        self.assertEqual(outcome, False)
Exemplo n.º 30
0
    def test_is_bot_Spider_sample(self):
        """A PDF request made by Baiduspider must not be reported."""
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        checker_mock = self.mocker.patch(AccessChecker)
        checker_mock._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        checker_mock._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        subject = AccessChecker(collection='scl')

        raw_entry = '180.76.5.118 - - [24/Dec/2013:04:49:09 -0200] "GET http://www.scielo.br/pdf/csc/v11n2/30434.pdf HTTP/1.1" 200 79618 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"'

        outcome = subject.parsed_access(raw_entry)
        self.assertEqual(outcome, None)
Exemplo n.º 31
0
    def test_is_bot_Bing_sample(self):
        """An article request made by bingbot must not be reported."""
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        checker_mock = self.mocker.patch(AccessChecker)
        checker_mock._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        checker_mock._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        subject = AccessChecker(collection='scl')

        # NOTE: the sample line starts with a '13245  ' prefix before the IP.
        raw_entry = '13245  157.56.92.164 - - [30/Nov/2013:03:53:26 -0200] "GET http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0104-87752010000200013 HTTP/1.1" 200 108777 "-" "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)"'

        outcome = subject.parsed_access(raw_entry)
        self.assertEqual(outcome, None)
Exemplo n.º 32
0
    def test_pid_is_valid_pdf_request_empty_file_path(self):
        """An empty request string must not validate as a PDF request."""
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        checker_mock = self.mocker.patch(AccessChecker)
        checker_mock._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        checker_mock._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        subject = AccessChecker(collection='scl')

        blank = u''

        outcome = subject._is_valid_pdf_request(blank)
        self.assertEqual(outcome, None)
Exemplo n.º 33
0
    def test_is_bot_method_with_bot_agent(self):
        """The Baiduspider user agent must be classified as a robot."""
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        checker_mock = self.mocker.patch(AccessChecker)
        checker_mock._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        checker_mock._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        subject = AccessChecker(collection='scl')

        # The sample keeps the surrounding double quotes found in the log.
        spider_agent = '"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"'

        outcome = subject.is_robot(spider_agent)
        self.assertEqual(outcome, True)
Exemplo n.º 34
0
    def test_is_bot_GoogleBot_sample(self):
        """An article request made by Googlebot must not be reported."""
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        checker_mock = self.mocker.patch(AccessChecker)
        checker_mock._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        checker_mock._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        subject = AccessChecker(collection='scl')

        raw_entry = '66.249.75.131 - - [24/Dec/2013:04:49:09 -0200] "GET http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0102-79722002000200013 HTTP/1.1" 200 102967 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"'

        outcome = subject.parsed_access(raw_entry)
        self.assertEqual(outcome, None)
Exemplo n.º 35
0
    def test_is_bot_method_with_common_user_agent(self):
        """A regular browser user agent must not be classified as a robot."""
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        checker_mock = self.mocker.patch(AccessChecker)
        checker_mock._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        checker_mock._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        subject = AccessChecker(collection='scl')

        # The sample keeps the surrounding double quotes found in the log.
        browser_agent = '"Mozilla/5.0 (Windows NT 5.1; rv:26.0) Gecko/20100101 Firefox/26.0"'

        outcome = subject.is_robot(browser_agent)
        self.assertEqual(outcome, False)
Exemplo n.º 36
0
    def test_parsed_access_invalid_article_access_without_query_string(self):
        """A bare scielo.php request (no query) must not be reported."""
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        checker_mock = self.mocker.patch(AccessChecker)
        checker_mock._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        checker_mock._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        subject = AccessChecker(collection='scl')

        raw_entry = '187.19.211.179 - - [30/May/2013:00:01:01 -0300] "GET http://www.scielo.br/scielo.php HTTP/1.1" 200 25084 "-" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"'

        outcome = subject.parsed_access(raw_entry)
        self.assertEqual(outcome, None)
Exemplo n.º 37
0
    def test_access_date_with_invalid_date(self):
        """An empty timestamp must make _access_date return None."""
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        checker_mock = self.mocker.patch(AccessChecker)
        checker_mock._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        checker_mock._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        subject = AccessChecker(collection='scl')

        blank = u''

        outcome = subject._access_date(blank)
        self.assertEqual(outcome, None)
Exemplo n.º 38
0
    def test_parsed_access_valid_pdf_with_not_allowed_acronym(self):
        """A PDF request for an acronym outside the mapping must be dropped."""
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        checker_mock = self.mocker.patch(AccessChecker)
        checker_mock._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        checker_mock._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        subject = AccessChecker(collection='scl')

        raw_entry = '201.14.120.2 - - [30/May/2013:00:01:01 -0300] "GET http://www.scielo.br/pdf/not_allowed_acronym/v14n4/03.pdf HTTP/1.1" 206 4608 "-" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"'

        outcome = subject.parsed_access(raw_entry)
        self.assertEqual(outcome, None)
Exemplo n.º 39
0
    def test_access_datetime(self):
        """A bracketed Apache timestamp must parse to the expected ISO text."""
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        checker_mock = self.mocker.patch(AccessChecker)
        checker_mock._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        checker_mock._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        subject = AccessChecker(collection='scl')

        apache_stamp = u'[30/Dec/2012:23:59:57 -0200]'

        parsed_stamp = subject._access_date(apache_stamp)
        self.assertEqual(parsed_stamp.isoformat(), u'2012-12-30T23:59:57')
Exemplo n.º 40
0
 def __init__(self,
              collection,
              mongo_uri=MONGO_URI,
              logs_source=LOGS_SOURCE,
              counter_compliant=None,
              skipped_log_dir=None):
     """Set up the state needed to process access logs for one collection.

     :param collection: collection identifier; appended to ``mongo_uri``
         and passed to ``AccessChecker``.
     :param mongo_uri: base Mongo URI; the effective URI is
         ``"<mongo_uri>_<collection>"``.
     :param logs_source: stored as ``_logs_source`` (presumably where the
         log files are read from -- confirm against the callers).
     :param counter_compliant: stored as ``_counter_compliant``.
     :param skipped_log_dir: stored as ``_skipped_log_dir``.
     """
     self._mongo_uri = "%s_%s" % (mongo_uri, collection)
     # NOTE(review): called right after _mongo_uri is set and before the
     # remaining attributes exist -- presumably it only needs the URI;
     # keep this ordering unless that is confirmed.
     self._proc_coll = self.get_proc_collection()
     self._collection = collection
     self._logs_source = logs_source
     self._counter_compliant = counter_compliant
     self._ts = utils.TimedSet(expired=utils.checkdatelock)
     self._skipped_log_dir = skipped_log_dir
     # No skipped-lines log is set at construction time.
     self._skipped_log = None
     self._ac = AccessChecker(self._collection)
Exemplo n.º 41
0
    def test_pid_is_valid_pdf_request_invalid_request_not_allowed_acronym(
            self):
        """A PDF request for an acronym outside the mapping must not validate."""
        # Record canned return values for the two patched lookups, then
        # switch the mocker to replay mode.
        checker_mock = self.mocker.patch(AccessChecker)
        checker_mock._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        checker_mock._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        subject = AccessChecker(collection='scl')

        raw_request = u'GET http://www.scielo.br/pdf/not_allowed_acronym/v96n2/a18v96n2.xxx HTTP/1.1'

        outcome = subject._is_valid_pdf_request(raw_request)
        self.assertEqual(outcome, None)
Exemplo n.º 42
0
    def test_pdf_or_html_access_for_html(self):
        """Absolute and relative scielo.php article URLs are classified as HTML."""
        # Stub the collection list and the acronym -> ISSN map, then replay.
        patched = self.mocker.patch(AccessChecker)
        patched._allowed_collections()
        self.mocker.result(['scl', 'arg'])
        patched._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        checker = AccessChecker(collection='scl')

        requests = (
            u'GET http://www.scielo.br/scielo.php?pid=S0100-736X2000000300007&script=sci_arttext HTTP/1.1',
            u'GET /scielo.php?pid=S0100-736X2000000300007&script=sci_arttext HTTP/1.1',
        )
        for request in requests:
            self.assertEqual(checker._pdf_or_html_access(request), u'HTML')
Exemplo n.º 43
0
    def test_pdf_or_html_access_for_html(self):
        """Article requests through scielo.php count as HTML accesses.

        Checked for the URL with and without the scheme and host.
        """
        # Stub the collection list and the acronym -> ISSN map, then replay.
        checker_mock = self.mocker.patch(AccessChecker)
        checker_mock._allowed_collections()
        self.mocker.result(['scl', 'arg'])
        checker_mock._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        checker = AccessChecker(collection='scl')

        absolute = u'GET http://www.scielo.br/scielo.php?pid=S0100-736X2000000300007&script=sci_arttext HTTP/1.1'
        self.assertEqual(checker._pdf_or_html_access(absolute), u'HTML')

        relative = u'GET /scielo.php?pid=S0100-736X2000000300007&script=sci_arttext HTTP/1.1'
        self.assertEqual(checker._pdf_or_html_access(relative), u'HTML')
Exemplo n.º 44
0
    def test_pid_is_valid_pdf_request_GET_without_domain(self):
        """A relative /pdf/ request for a known acronym yields its ISSN and path."""
        # Stub the collection list and the acronym -> ISSN map, then replay.
        patched = self.mocker.patch(AccessChecker)
        patched._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        patched._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        checker = AccessChecker(collection='scl')

        expected = {
            'pdf_path': u'/pdf/zool/v29n4/18781.pdf',
            'pdf_issn': u'1984-4670',
        }
        outcome = checker._is_valid_pdf_request(
            u'GET /pdf/zool/v29n4/18781.pdf HTTP/1.1')

        self.assertEqual(outcome, expected)
Exemplo n.º 45
0
    def test_pdf_or_html_access_for_html_on_new_site(self):
        """/article/... URLs, absolute or relative, are classified as HTML."""
        # Stub the collection list and the acronym -> ISSN map, then replay.
        patched = self.mocker.patch(AccessChecker)
        patched._allowed_collections()
        self.mocker.result(['scl', 'arg'])
        patched._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        checker = AccessChecker(collection='scl')

        requests = (
            u'GET http://www.scielo.br/article/abcd/2018.v31n3/e1382/pt/ HTTP/1.1',
            u'GET /article/abcd/2018.v31n3/e1382/pt/ HTTP/1.1',
        )
        for request in requests:
            self.assertEqual(checker._pdf_or_html_access(request), u'HTML')
Exemplo n.º 46
0
    def test_query_string_with_parameters(self):
        """The query parameters of a request line come back as a dict.

        Checked for the URL with and without the scheme and host.
        """
        # Stub the collection list and the acronym -> ISSN map, then replay.
        patched = self.mocker.patch(AccessChecker)
        patched._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        patched._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        checker = AccessChecker(collection='scl')

        urls = (
            u'GET http://www.scielo.br/scielo.php?pid=S0100-736X2000000300007&script=sci_arttext HTTP/1.1',
            u'GET /scielo.php?pid=S0100-736X2000000300007&script=sci_arttext HTTP/1.1',
        )
        for url in urls:
            self.assertEqual(
                checker._query_string(url),
                {u'pid': u'S0100-736X2000000300007', u'script': u'sci_arttext'})
Exemplo n.º 47
0
    def test_pdf_or_html_access_for_pdf(self):
        """Absolute and relative /pdf/...pdf URLs are classified as PDF."""
        # Stub the collection list and the acronym -> ISSN map, then replay.
        patched = self.mocker.patch(AccessChecker)
        patched._allowed_collections()
        self.mocker.result(['scl', 'arg'])
        patched._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        checker = AccessChecker(collection='scl')

        requests = (
            u'GET http://www.scielo.br/pdf/isz/v96n2/a18v96n2.pdf HTTP/1.1',
            u'GET /pdf/isz/v96n2/a18v96n2.pdf HTTP/1.1',
        )
        for request in requests:
            self.assertEqual(checker._pdf_or_html_access(request), u'PDF')
Exemplo n.º 48
0
    def test_pdf_or_html_access_for_pdf(self):
        """Requests for a .pdf file under /pdf/ count as PDF accesses.

        Checked for the URL with and without the scheme and host.
        """
        # Stub the collection list and the acronym -> ISSN map, then replay.
        checker_mock = self.mocker.patch(AccessChecker)
        checker_mock._allowed_collections()
        self.mocker.result(['scl', 'arg'])
        checker_mock._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        checker = AccessChecker(collection='scl')

        absolute = u'GET http://www.scielo.br/pdf/isz/v96n2/a18v96n2.pdf HTTP/1.1'
        self.assertEqual(checker._pdf_or_html_access(absolute), u'PDF')

        relative = u'GET /pdf/isz/v96n2/a18v96n2.pdf HTTP/1.1'
        self.assertEqual(checker._pdf_or_html_access(relative), u'PDF')
Exemplo n.º 49
0
    def test_pdf_or_html_access_for_files(self):
        """A stylesheet URL is neither a PDF nor an HTML access: result is None."""
        # Stub the collection list and the acronym -> ISSN map, then replay.
        patched = self.mocker.patch(AccessChecker)
        patched._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        patched._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        checker = AccessChecker(collection='scl')

        requests = (
            u'GET http://www.scielo.br/css/screen/styles.css HTTP/1.1',
            u'GET /css/screen/styles.css HTTP/1.1',
        )
        for request in requests:
            self.assertEqual(checker._pdf_or_html_access(request), None)
Exemplo n.º 50
0
    def test_pdf_or_html_access_for_files(self):
        """Requests for a static .css file are not classified (None).

        Checked for the URL with and without the scheme and host.
        """
        # Stub the collection list and the acronym -> ISSN map, then replay.
        checker_mock = self.mocker.patch(AccessChecker)
        checker_mock._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        checker_mock._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        checker = AccessChecker(collection='scl')

        absolute = u'GET http://www.scielo.br/css/screen/styles.css HTTP/1.1'
        self.assertEqual(checker._pdf_or_html_access(absolute), None)

        relative = u'GET /css/screen/styles.css HTTP/1.1'
        self.assertEqual(checker._pdf_or_html_access(relative), None)
Exemplo n.º 51
0
 def __init__(self,
              collection,
              mongo_uri=MONGO_URI,
              logs_source=LOGS_SOURCE,
              counter_compliant=None,
              skipped_log_dir=None):
     """Keep the given settings on the instance and build its helpers.

     The stored URI is ``mongo_uri`` and ``collection`` joined by an
     underscore.
     """
     # Set the URI before calling get_proc_collection(); presumably that
     # method depends on it -- verify against its definition.
     self._mongo_uri = "%s_%s" % (mongo_uri, collection)
     self._proc_coll = self.get_proc_collection()

     # Settings kept as given.
     self._collection = collection
     self._logs_source = logs_source
     self._counter_compliant = counter_compliant
     self._skipped_log_dir = skipped_log_dir
     self._skipped_log = None  # nothing opened at construction time

     # Helper objects.
     self._ts = utils.TimedSet(expired=utils.checkdatelock)
     self._ac = AccessChecker(collection)
Exemplo n.º 52
0
    def test_parse_line_with_apache_line(self):
        """Two sample log lines are split into the expected Apache format fields."""
        # Stub the collection list and the acronym -> ISSN map, then replay.
        patched = self.mocker.patch(AccessChecker)
        patched._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        patched._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        checker = AccessChecker(collection='scl')

        # (raw log line, expected field dict) pairs, checked in order.
        samples = [
            (
                '187.19.211.179 - - [30/May/2013:00:01:01 -0300] "GET http://www.scielo.br/scielo.php?pid=S0100-736X2000000300007&script=sci_arttext HTTP/1.1" 200 25084 "-" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"',
                {
                    '%h': '187.19.211.179',
                    '%l': '-',
                    '%u': '-',
                    '%t': '[30/May/2013:00:01:01 -0300]',
                    '%r': 'GET http://www.scielo.br/scielo.php?pid=S0100-736X2000000300007&script=sci_arttext HTTP/1.1',
                    '%>s': '200',
                    '%b': '25084',
                    '%{Referer}i': '-',
                    '%{User-Agent}i': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
                },
            ),
            (
                '123.125.71.39 - - [30/Dec/2012:23:59:57 -0200] "GET /scielo.php?script=sci_nlinks&ref=000144&pid=S0103-4014200000020001300010&lng=pt HTTP/1.1" 200 1878 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"',
                {
                    '%h': '123.125.71.39',
                    '%l': '-',
                    '%u': '-',
                    '%t': '[30/Dec/2012:23:59:57 -0200]',
                    '%r': 'GET /scielo.php?script=sci_nlinks&ref=000144&pid=S0103-4014200000020001300010&lng=pt HTTP/1.1',
                    '%>s': '200',
                    '%b': '1878',
                    '%{Referer}i': '-',
                    '%{User-Agent}i': 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
                },
            ),
        ]

        for line, expected in samples:
            self.assertEqual(checker._parse_line(line), expected)
Exemplo n.º 53
0
    def test_query_string_with_parameters(self):
        """``pid`` and ``script`` are extracted from the request's query string.

        Checked for the URL with and without the scheme and host.
        """
        # Stub the collection list and the acronym -> ISSN map, then replay.
        checker_mock = self.mocker.patch(AccessChecker)
        checker_mock._allowed_collections()
        self.mocker.result([u'scl', u'arg'])
        checker_mock._acronym_to_issn_dict()
        self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'})
        self.mocker.replay()

        checker = AccessChecker(collection='scl')

        absolute = u'GET http://www.scielo.br/scielo.php?pid=S0100-736X2000000300007&script=sci_arttext HTTP/1.1'
        self.assertEqual(
            checker._query_string(absolute),
            {u'script': u'sci_arttext', u'pid': u'S0100-736X2000000300007'})

        relative = u'GET /scielo.php?pid=S0100-736X2000000300007&script=sci_arttext HTTP/1.1'
        self.assertEqual(
            checker._query_string(relative),
            {u'script': u'sci_arttext', u'pid': u'S0100-736X2000000300007'})
Exemplo n.º 54
0
class Bulk(object):
    """Read access log files and register every accepted access.

    Intended to be used as a context manager: entering opens the optional
    skipped-accesses log, leaving closes it and drops the helper objects.
    """

    def __init__(self, collection, mongo_uri=MONGO_URI, logs_source=LOGS_SOURCE, counter_compliant=None, skipped_log_dir=None):
        """Store the settings and build the helper objects.

        ``mongo_uri`` and ``collection`` are joined with an underscore to
        form the URI used by ``_mongodb_connect``.

        NOTE(review): ``counter_compliant`` is stored but never read by this
        class; ``read_log`` consults the module-level ``COUNTER_COMPLIANT``
        instead -- confirm which one is meant to win.
        """
        # Must be set first: get_proc_collection() connects using this URI.
        self._mongo_uri = "%s_%s" % (mongo_uri, collection)
        self._proc_coll = self.get_proc_collection()
        self._collection = collection
        self._logs_source = logs_source
        self._counter_compliant = counter_compliant
        self._ts = utils.TimedSet(expired=utils.checkdatelock)
        self._skipped_log_dir = skipped_log_dir
        self._skipped_log = None
        self._ac = AccessChecker(self._collection)

    def __enter__(self):
        """Open the skipped-accesses log, if a directory was configured.

        The file is named after the current ISO timestamp.

        Raises:
            ValueError: if ``open`` rejects the computed file name.
        """
        if self._skipped_log_dir:
            now = datetime.datetime.now().isoformat()
            skipped_log = '/'.join([self._skipped_log_dir, now]).replace('//', '/')
            try:
                self._skipped_log = open(skipped_log, 'w')
            except ValueError:
                # The original raised a bare string, which is not a valid
                # exception and surfaced as a TypeError instead.
                raise ValueError(
                    "Invalid directory or file name: %s" % skipped_log)

        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Drop the helper objects and close the skipped-accesses log."""
        self._ts = None
        self._ac = None
        if self._skipped_log:
            self._skipped_log.close()

    def _mongodb_connect(self, mdb_database):
        """Connect to the server in ``self._mongo_uri`` and return one item.

        The database is named by the URI path; credentials, when present in
        the URI, are used to authenticate. The returned object is
        ``db[mdb_database]``.
        """
        db_url = urlparse.urlparse(self._mongo_uri)
        conn = pymongo.MongoClient(host=db_url.hostname, port=db_url.port)
        db = conn[db_url.path[1:]]  # strip the leading '/' of the path
        if db_url.username and db_url.password:
            db.authenticate(db_url.username, db_url.password)

        return db[mdb_database]

    def get_proc_collection(self):
        """
        The proc collection is a mongodb database that keeps the name of each
        processed file, to avoid processing these files again.
        """
        coll = self._mongodb_connect('proc_files')
        coll.ensure_index('file_name')

        return coll

    def write_skipped_log(self, line):
        """Append ``line`` to the skipped-accesses log, when one is open."""
        if self._skipped_log:
            self._skipped_log.write("%s \r\n" % line)

    def read_log(self, logfile):
        """Process one log file and register its accepted accesses.

        Files already listed in the proc collection are skipped. The file is
        recorded as processed before its lines are read. Returns ``None``.
        """
        logfile = logfile.strip()

        # Check whether the file has already been processed.
        if self._proc_coll.find({'file_name': logfile}).count() > 0:
            logger.info('File already processed %s', logfile)
            return None

        reader = codecs
        if utils.check_file_format(logfile) == 'gzip':
            reader = gzip

        # Register the new file in the processed-files collection.
        logger.info("Processing: %s", logfile)
        self._proc_coll.insert({'file_name': logfile})

        with reader.open(logfile, 'rb') as f:
            with Local(self._mongo_uri, self._collection) as rq:
                for log_file_line, raw_line in enumerate(f, 1):
                    # Lazy logging arguments: nothing is formatted per line
                    # unless debug output is actually enabled.
                    logger.debug("Reading line %s from file %s",
                                 log_file_line, logfile)
                    logger.debug(raw_line)

                    try:
                        parsed_line = self._ac.parsed_access(raw_line)
                    except ValueError as e:
                        # ``message`` only exists on Python 2 exceptions;
                        # fall back to the exception itself elsewhere.
                        logger.error(
                            "%s: %s" % (getattr(e, 'message', e), raw_line))
                        continue

                    if not parsed_line:
                        continue

                    if COUNTER_COMPLIANT:
                        # Counter Mode Accesses
                        locktime = 10
                        if parsed_line['access_type'] == "PDF":
                            locktime = 30
                        lockid = '_'.join([parsed_line['ip'],
                                           parsed_line['code'],
                                           parsed_line['script']])
                        try:
                            self._ts.add(lockid, parsed_line['iso_datetime'], locktime)
                            rq.register_access(parsed_line)
                        except ValueError:
                            # Presumably the lock for this id is still
                            # active; the access is logged as skipped.
                            self.write_skipped_log('; '.join([lockid, parsed_line['original_date'], parsed_line['original_agent']]))
                            continue
                    else:
                        # SciELO Mode Accesses
                        rq.register_access(parsed_line)
                rq.send()

    def run(self):
        """Call ``read_log`` for every entry ``ls`` lists under the logs source."""
        for logfile in os.popen('ls %s/*' % self._logs_source):
            self.read_log(logfile)
Exemplo n.º 55
0
class AccessCheckerTests(unittest.TestCase):
    """Unit tests for AccessChecker built with injected lookup callables."""

    def setUp(self):
        """Build a checker for ``scl`` with fixed collections and ISSN map."""
        def allowed_collections():
            return [u"scl", u"arg"]

        def acronym_to_issn_dict(col):
            return {u'zool': u'1984-4670', u'bjmbr': u'1414-431X'}

        self.ac = AccessChecker(
            collection="scl",
            allowed_collections=allowed_collections,
            acronym_to_issn_dict=acronym_to_issn_dict,
        )

    def _assert_access_type(self, expected, *requests):
        """Assert ``_pdf_or_html_access`` gives ``expected`` for each request."""
        for request in requests:
            self.assertEqual(self.ac._pdf_or_html_access(request), expected)

    # -- robots ------------------------------------------------------------

    def test_GoogleBot_bot_is_not_parsed(self):
        """A Googlebot access line is discarded."""
        outcome = self.ac.parsed_access(
            '66.249.75.131 - - [24/Dec/2013:04:49:09 -0200] "GET http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0102-79722002000200013 HTTP/1.1" 200 102967 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"')
        self.assertEqual(outcome, None)

    def test_Bing_bot_is_not_parsed(self):
        """A bingbot access line is discarded."""
        # NOTE(review): unlike the other samples in this class, this line
        # starts with an extra '13245  ' token -- confirm the test exercises
        # robot detection rather than a line-parsing failure.
        outcome = self.ac.parsed_access(
            '13245  157.56.92.164 - - [30/Nov/2013:03:53:26 -0200] "GET http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0104-87752010000200013 HTTP/1.1" 200 108777 "-" "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)"')
        self.assertEqual(outcome, None)

    def test_Spider_bot_sample(self):
        """A Baiduspider access line is discarded."""
        outcome = self.ac.parsed_access(
            '180.76.5.118 - - [24/Dec/2013:04:49:09 -0200] "GET http://www.scielo.br/pdf/csc/v11n2/30434.pdf HTTP/1.1" 200 79618 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"')
        self.assertEqual(outcome, None)

    def test_is_robot_detects_bot_useragent(self):
        """``is_robot`` is True for the Baiduspider user agent."""
        verdict = self.ac.is_robot(
            '"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"')
        self.assertEqual(verdict, True)

    def test_is_robot_detects_mozilla_useragent(self):
        """``is_robot`` is False for a Firefox user agent."""
        verdict = self.ac.is_robot(
            '"Mozilla/5.0 (Windows NT 5.1; rv:26.0) Gecko/20100101 Firefox/26.0"')
        self.assertEqual(verdict, False)

    # -- access type -------------------------------------------------------

    def test_pdf_or_html_access_identifies_urls_of_documents_in_html_v1(self):
        """/article/... URLs are HTML accesses."""
        self._assert_access_type(
            u'HTML',
            u'GET http://www.scielo.br/article/abcd/2018.v31n3/e1382/pt/ HTTP/1.1',
            u'GET /article/abcd/2018.v31n3/e1382/pt/ HTTP/1.1',
        )

    def test_pdf_or_html_access_identifies_urls_of_documents_in_html_v2(self):
        """/j/<acronym>/a/<id> URLs are HTML accesses."""
        self._assert_access_type(
            u'HTML',
            u'GET http://www.scielo.br/j/abdc/a/MYJY5Rgw5gc7mBpqYzBCVJR HTTP/1.1',
            u'GET /j/abdc/a/MYJY5Rgw5gc7mBpqYzBCVJR HTTP/1.1',
        )

    def test_pdf_or_html_access_identifies_urls_of_documents_in_html_v3(self):
        """/j/<acronym>/a/<id>?format=html URLs are HTML accesses."""
        self._assert_access_type(
            u'HTML',
            u'GET http://www.scielo.br/j/abdc/a/MYJY5Rgw5gc7mBpqYzBCVJR?format=html HTTP/1.1',
            u'GET /j/abdc/a/MYJY5Rgw5gc7mBpqYzBCVJR?format=html HTTP/1.1',
        )

    def test_pdf_or_html_access_identifies_urls_of_documents_in_html_v4(self):
        """scielo.php article URLs are HTML accesses."""
        self._assert_access_type(
            u'HTML',
            u'GET http://www.scielo.br/scielo.php?pid=S0100-736X2000000300007&script=sci_arttext HTTP/1.1',
            u'GET /scielo.php?pid=S0100-736X2000000300007&script=sci_arttext HTTP/1.1',
        )

    def test_pdf_or_html_access_identifies_urls_of_documents_in_pdf_v1(self):
        """/pdf/<acronym>/<issue>/<doc>/<lang> URLs are PDF accesses."""
        self._assert_access_type(
            u'PDF',
            u'GET https://www.scielo.br/pdf/abcd/2018.v31n3/e1382/en HTTP/1.1',
            u'GET /pdf/abcd/2018.v31n3/e1382/en HTTP/1.1',
        )

    def test_pdf_or_html_access_identifies_urls_of_documents_in_pdf_v2(self):
        """/pdf/.../<file>.pdf URLs are PDF accesses."""
        self._assert_access_type(
            u'PDF',
            u'GET http://www.scielo.br/pdf/isz/v96n2/a18v96n2.pdf HTTP/1.1',
            u'GET /pdf/isz/v96n2/a18v96n2.pdf HTTP/1.1',
        )

    def test_pdf_or_html_access_identifies_urls_of_documents_in_pdf_v3(self):
        """/j/<acronym>/a/<id>?format=pdf URLs are PDF accesses."""
        self._assert_access_type(
            u'PDF',
            u'GET http://www.scielo.br/j/abdc/a/MYJY5Rgw5gc7mBpqYzBCVJR?format=pdf HTTP/1.1',
            u'GET /j/abdc/a/MYJY5Rgw5gc7mBpqYzBCVJR?format=pdf HTTP/1.1',
        )

    def test_pdf_or_html_access_for_files_on_new_site(self):
        """A /static/ asset is not classified (None)."""
        self._assert_access_type(
            None,
            u'GET http://www.scielo.br/static/img/favicon.ico HTTP/1.1',
            u'GET /static/img/favicon.ico HTTP/1.1',
        )

    def test_pdf_or_html_access_for_files(self):
        """A stylesheet is not classified (None)."""
        self._assert_access_type(
            None,
            u'GET http://www.scielo.br/css/screen/styles.css HTTP/1.1',
            u'GET /css/screen/styles.css HTTP/1.1',
        )

    # -- line parsing ------------------------------------------------------

    def test_parse_line_with_apache_line(self):
        """Two sample log lines are split into the expected format fields."""
        samples = [
            (
                '187.19.211.179 - - [30/May/2013:00:01:01 -0300] "GET http://www.scielo.br/scielo.php?pid=S0100-736X2000000300007&script=sci_arttext HTTP/1.1" 200 25084 "-" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"',
                {
                    '%h': '187.19.211.179',
                    '%l': '-',
                    '%u': '-',
                    '%t': '[30/May/2013:00:01:01 -0300]',
                    '%r': 'GET http://www.scielo.br/scielo.php?pid=S0100-736X2000000300007&script=sci_arttext HTTP/1.1',
                    '%>s': '200',
                    '%b': '25084',
                    '%{Referer}i': '-',
                    '%{User-Agent}i': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
                },
            ),
            (
                '123.125.71.39 - - [30/Dec/2012:23:59:57 -0200] "GET /scielo.php?script=sci_nlinks&ref=000144&pid=S0103-4014200000020001300010&lng=pt HTTP/1.1" 200 1878 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"',
                {
                    '%h': '123.125.71.39',
                    '%l': '-',
                    '%u': '-',
                    '%t': '[30/Dec/2012:23:59:57 -0200]',
                    '%r': 'GET /scielo.php?script=sci_nlinks&ref=000144&pid=S0103-4014200000020001300010&lng=pt HTTP/1.1',
                    '%>s': '200',
                    '%b': '1878',
                    '%{Referer}i': '-',
                    '%{User-Agent}i': 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)',
                },
            ),
        ]

        for line, expected in samples:
            self.assertEqual(self.ac._parse_line(line), expected)

    def test_parse_line_invalid_line(self):
        """An empty line parses to None."""
        outcome = self.ac._parse_line('')
        self.assertEqual(outcome, None)

    # -- access date -------------------------------------------------------

    def test_access_date(self):
        """The date part of the parsed timestamp is 2012-12-30."""
        parsed = self.ac._access_date(u'[30/Dec/2012:23:59:57 -0200]')
        self.assertEqual(parsed.date().isoformat(), u'2012-12-30')

    def test_access_datetime(self):
        """The parsed timestamp renders as an ISO datetime without offset."""
        parsed = self.ac._access_date(u'[30/Dec/2012:23:59:57 -0200]')
        self.assertEqual(parsed.isoformat(), u'2012-12-30T23:59:57')

    def test_access_date_with_invalid_month(self):
        """An unknown month name yields None."""
        outcome = self.ac._access_date(u'[30/xxx/2012:23:59:57 -0200]')
        self.assertEqual(outcome, None)

    def test_access_date_with_invalid_day(self):
        """A non-numeric day yields None."""
        outcome = self.ac._access_date(u'[xx/Dec/2012:23:59:57 -0200]')
        self.assertEqual(outcome, None)

    def test_access_date_with_invalid_year(self):
        """A non-numeric year yields None."""
        outcome = self.ac._access_date(u'[12/Dec/x012:23:59:57 -0200]')
        self.assertEqual(outcome, None)

    def test_access_date_with_invalid_date(self):
        """An empty string yields None."""
        outcome = self.ac._access_date(u'')
        self.assertEqual(outcome, None)

    # -- query string ------------------------------------------------------

    def test_query_string_with_parameters(self):
        """``pid`` and ``script`` are extracted from the query string."""
        urls = (
            u'GET http://www.scielo.br/scielo.php?pid=S0100-736X2000000300007&script=sci_arttext HTTP/1.1',
            u'GET /scielo.php?pid=S0100-736X2000000300007&script=sci_arttext HTTP/1.1',
        )
        for url in urls:
            self.assertEqual(
                self.ac._query_string(url),
                {u'pid': u'S0100-736X2000000300007', u'script': u'sci_arttext'})

    def test_query_string_without_parameters(self):
        """A request without a query string yields None."""
        outcome = self.ac._query_string(
            u'GET http://www.scielo.br/scielo.php HTTP/1.1')
        self.assertEqual(outcome, None)

    # -- HTML request validation -------------------------------------------

    def test_pid_is_valid_not_allowed_issn(self):
        """sci_arttext with an ISSN outside the stubbed map is rejected."""
        outcome = self.ac._is_valid_html_request(
            'sci_arttext', 'XXXX-XXXX2012000100001')
        self.assertEqual(outcome, False)

    def test_pid_is_valid_script_sci_arttext_invalid_pid(self):
        """sci_arttext with a malformed pid is rejected."""
        outcome = self.ac._is_valid_html_request(
            'sci_arttext', '123443212012000100001')
        self.assertEqual(outcome, False)

    def test_pid_is_valid_script_sci_arttext_valid_pid(self):
        """sci_arttext with a well-formed pid is accepted."""
        outcome = self.ac._is_valid_html_request(
            'sci_arttext', '1414-431X2012000100001')
        self.assertEqual(outcome, True)

    def test_pid_is_valid_script_sci_arttext_valid_pid_fbpe(self):
        """sci_arttext with a parenthesised-year pid is accepted."""
        outcome = self.ac._is_valid_html_request(
            'sci_arttext', '1414-431X(12)00100001')
        self.assertEqual(outcome, True)

    def test_pid_is_valid_script_sci_abstract_invalid_pid(self):
        """sci_abstract with a malformed pid is rejected."""
        outcome = self.ac._is_valid_html_request(
            'sci_abstract', '1414-431X201200100001')
        self.assertEqual(outcome, False)

    def test_pid_is_valid_script_sci_abstract_valid_pid(self):
        """sci_abstract with a well-formed pid is accepted."""
        outcome = self.ac._is_valid_html_request(
            'sci_abstract', '1414-431X2012000100001')
        self.assertEqual(outcome, True)

    def test_pid_is_valid_script_sci_abstract_valid_pid_fbpe(self):
        """sci_abstract with a parenthesised-year pid is accepted."""
        outcome = self.ac._is_valid_html_request(
            'sci_abstract', '1414-431X(12)00100001')
        self.assertEqual(outcome, True)

    def test_pid_is_valid_script_sci_pdf_invalid_pid(self):
        """sci_pdf with a malformed pid is rejected."""
        outcome = self.ac._is_valid_html_request(
            'sci_pdf', '1414-431X9012000100001')
        self.assertEqual(outcome, False)

    def test_pid_is_valid_script_sci_pdf_valid_pid(self):
        """sci_pdf with a well-formed pid is accepted."""
        outcome = self.ac._is_valid_html_request(
            'sci_pdf', '1414-431X2012000100001')
        self.assertEqual(outcome, True)

    def test_pid_is_valid_script_sci_pdf_valid_pid_fbpe(self):
        """sci_pdf with a parenthesised-year pid is accepted."""
        outcome = self.ac._is_valid_html_request(
            'sci_pdf', '1414-431X(12)00100001')
        self.assertEqual(outcome, True)

    def test_pid_is_valid_script_sci_serial_valid_pid(self):
        """sci_serial with a known ISSN is accepted."""
        outcome = self.ac._is_valid_html_request('sci_serial', '1414-431X')
        self.assertEqual(outcome, True)

    def test_pid_is_valid_script_sci_issuetoc_valid_pid(self):
        """sci_issuetoc with a well-formed issue pid is accepted."""
        outcome = self.ac._is_valid_html_request(
            'sci_issuetoc', '1414-431X20120001')
        self.assertEqual(outcome, True)

    def test_pid_is_valid_script_sci_issuetoc_invalid_pid(self):
        """sci_issuetoc with a malformed issue pid is rejected."""
        outcome = self.ac._is_valid_html_request(
            'sci_issuetoc', '1234432120120001')
        self.assertEqual(outcome, False)

    def test_pid_is_valid_script_sci_issues_valid_pid(self):
        """sci_issues with a known ISSN is accepted."""
        outcome = self.ac._is_valid_html_request('sci_issues', '1414-431X')
        self.assertEqual(outcome, True)

    # -- PDF request validation --------------------------------------------

    def test_pid_is_valid_pdf_request(self):
        """An absolute /pdf/ URL for a known acronym yields ISSN and path."""
        outcome = self.ac._is_valid_pdf_request(
            u'GET http://www.scielo.br/pdf/zool/v96n2/a18v96n2.pdf HTTP/1.1')
        self.assertEqual(outcome, {
            'pdf_path': u'/pdf/zool/v96n2/a18v96n2.pdf',
            'pdf_issn': u'1984-4670',
        })

    def test_pid_is_valid_pdf_request_GET_without_domain(self):
        """A relative /pdf/ URL for a known acronym yields ISSN and path."""
        outcome = self.ac._is_valid_pdf_request(
            u'GET /pdf/zool/v29n4/18781.pdf HTTP/1.1')
        self.assertEqual(outcome, {
            'pdf_path': u'/pdf/zool/v29n4/18781.pdf',
            'pdf_issn': u'1984-4670',
        })

    def test_pid_is_valid_pdf_request_new_site(self):
        """For the new-site form the path is returned without the language."""
        outcome = self.ac._is_valid_pdf_request(
            u'GET https://www.scielo.br/pdf/bjmbr/2018.v51n11/e7704/en HTTP/1.1')
        self.assertEqual(outcome, {
            'pdf_path': u'/pdf/bjmbr/2018.v51n11/e7704/',
            'pdf_issn': u'1414-431X',
        })

    def test_pid_is_valid_pdf_request_GET_without_domain_new_site(self):
        """Same as the new-site case, for a relative URL."""
        outcome = self.ac._is_valid_pdf_request(
            u'GET /pdf/bjmbr/2018.v51n11/e7704/en HTTP/1.1')
        self.assertEqual(outcome, {
            'pdf_path': u'/pdf/bjmbr/2018.v51n11/e7704/',
            'pdf_issn': u'1414-431X',
        })

    def test_pid_is_valid_pdf_request_empty_file_path(self):
        """An empty request yields None."""
        outcome = self.ac._is_valid_pdf_request(u'')
        self.assertEqual(outcome, None)

    def test_pid_is_valid_pdf_request_invalid_request_not_allowed_acronym(self):
        """A /pdf/ URL with an acronym outside the stubbed map yields None."""
        outcome = self.ac._is_valid_pdf_request(
            u'GET http://www.scielo.br/pdf/not_allowed_acronym/v96n2/a18v96n2.xxx HTTP/1.1')
        self.assertEqual(outcome, None)
Exemplo n.º 56
0
class ClassicSiteURLParsingTests(unittest.TestCase):
    """``parsed_access`` checks for URLs in the classic site format."""

    def setUp(self):
        """Build a checker for ``scl`` with fixed collections and ISSN map."""
        def allowed_collections():
            return [u"scl", u"arg"]

        def acronym_to_issn_dict(col):
            return {u'zool': u'1984-4670', u'bjmbr': u'1414-431X'}

        self.ac = AccessChecker(
            collection="scl",
            allowed_collections=allowed_collections,
            acronym_to_issn_dict=acronym_to_issn_dict,
        )

    def _assert_not_parsed(self, line):
        """Assert that ``parsed_access`` returns None for ``line``."""
        self.assertEqual(self.ac.parsed_access(line), None)

    def test_404_responses_must_return_None(self):
        """A line with HTTP status 404 is discarded."""
        self._assert_not_parsed(
            '187.19.211.179 - - [30/May/2013:00:01:01 -0300] "GET http://www.scielo.br/scielo.php?pid=S1414-431X2000000300007&script=sci_arttext HTTP/1.1" 404 25084 "-" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"')

    def test_site_assets_must_return_None(self):
        """A request for an image asset is discarded."""
        self._assert_not_parsed(
            '177.191.212.233 - - [30/May/2013:00:01:01 -0300] "GET http://www.scielo.br/img/pt/author.gif HTTP/1.1" 304 0 "http://www.scielo.br/scielo.php?script=sci_serial&pid=1415-4757&nrm=iso&rep=&lng=pt" "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36"')

    def test_document_url(self):
        """HTML article URL in the classic site format.
        """
        line = '187.19.211.179 - - [30/May/2013:00:01:01 -0300] "GET http://www.scielo.br/scielo.php?pid=S1414-431X2000000300007&script=sci_arttext HTTP/1.1" 200 25084 "-" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"'
        expected = {
            # who and what
            'ip': '187.19.211.179',
            'code': 'S1414-431X2000000300007',
            'script': 'sci_arttext',
            'access_type': 'HTML',
            'http_code': '200',
            'query_string': {
                'script': 'sci_arttext',
                'pid': 'S1414-431X2000000300007',
            },
            # when
            'iso_date': '2013-05-30',
            'iso_datetime': '2013-05-30T00:01:01',
            'year': '2013',
            'month': '05',
            'day': '30',
            # raw values carried over from the log line
            'original_date': '[30/May/2013:00:01:01 -0300]',
            'original_agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
        }

        outcome = self.ac.parsed_access(line)
        self.assertEqual(outcome, expected)

    def test_document_url_missing_script(self):
        """A scielo.php URL without ``script`` is discarded."""
        self._assert_not_parsed(
            '187.19.211.179 - - [30/May/2013:00:01:01 -0300] "GET http://www.scielo.br/scielo.php?pid=S1414-431X2000000300007 HTTP/1.1" 200 25084 "-" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"')

    def test_document_url_missing_pid(self):
        """A scielo.php URL without ``pid`` is discarded."""
        self._assert_not_parsed(
            '187.19.211.179 - - [30/May/2013:00:01:01 -0300] "GET http://www.scielo.br/scielo.php?script=sci_arttext HTTP/1.1" 200 25084 "-" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"')

    def test_main_module_without_querystring(self):
        """A scielo.php URL without any query string is discarded."""
        self._assert_not_parsed(
            '187.19.211.179 - - [30/May/2013:00:01:01 -0300] "GET http://www.scielo.br/scielo.php HTTP/1.1" 200 25084 "-" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"')

    def test_document_pdf_relative_url(self):
        """A relative /pdf/ URL for a known acronym is parsed as a PDF access."""
        line = '66.249.73.80 - - [30/May/2013:00:01:01 -0300] "GET /pdf/bjmbr/v29n4/18781.pdf HTTP/1.1" 200 32061 "-" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"'
        expected = {
            # who and what
            'ip': '66.249.73.80',
            'code': '/pdf/bjmbr/v29n4/18781.pdf',
            'script': '',
            'access_type': 'PDF',
            'http_code': '200',
            'query_string': None,
            'pdf_issn': u'1414-431X',
            'pdf_path': '/pdf/bjmbr/v29n4/18781.pdf',
            # when
            'iso_date': '2013-05-30',
            'iso_datetime': '2013-05-30T00:01:01',
            'year': '2013',
            'month': '05',
            'day': '30',
            # raw values carried over from the log line
            'original_date': '[30/May/2013:00:01:01 -0300]',
            'original_agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
        }

        outcome = self.ac.parsed_access(line)
        self.assertEqual(outcome, expected)

    def test_document_pdf_url(self):
        """An absolute /pdf/ URL for a known acronym is parsed as a PDF access."""
        line = '201.14.120.2 - - [30/May/2013:00:01:01 -0300] "GET http://www.scielo.br/pdf/bjmbr/v14n4/03.pdf HTTP/1.1" 200 4608 "-" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"'
        expected = {
            # who and what
            'ip': '201.14.120.2',
            'code': '/pdf/bjmbr/v14n4/03.pdf',
            'script': '',
            'access_type': 'PDF',
            'http_code': '200',
            'query_string': None,
            'pdf_issn': u'1414-431X',
            'pdf_path': '/pdf/bjmbr/v14n4/03.pdf',
            # when
            'iso_date': '2013-05-30',
            'iso_datetime': '2013-05-30T00:01:01',
            'year': '2013',
            'month': '05',
            'day': '30',
            # raw values carried over from the log line
            'original_date': '[30/May/2013:00:01:01 -0300]',
            'original_agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
        }

        outcome = self.ac.parsed_access(line)
        self.assertEqual(outcome, expected)

    def test_document_pdf_url_v2(self):
        """This sci_pdf request through scielo.php is discarded."""
        self._assert_not_parsed(
            '201.14.120.2 - - [30/May/2013:00:01:01 -0300] "GET http://www.scielo.br/scielo.php?pid=S0044-59672020000100012&script=sci_pdf&tlng=en HTTP/1.1" 200 4608 "-" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"')

    def test_journal_homepage(self):
        """This sci_serial (journal home page) request is discarded."""
        self._assert_not_parsed(
            '201.14.120.2 - - [30/May/2013:00:01:01 -0300] "GET http://www.scielo.br/scielo.php?script=sci_serial&pid=0100-879X&lng=en&nrm=iso HTTP/1.1" 200 4608 "-" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"')

    def test_document_pdf_with_unknown_journal_acronym(self):
        """A /pdf/ URL with an acronym outside the stubbed map is discarded."""
        self._assert_not_parsed(
            '201.14.120.2 - - [30/May/2013:00:01:01 -0300] "GET http://www.scielo.br/pdf/not_allowed_acronym/v14n4/03.pdf HTTP/1.1" 200 4608 "-" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"')
Exemplo n.º 57
0
class OPACURLParsingTests(unittest.TestCase):
    def setUp(self):
        """Build the ``AccessChecker`` under test for the ``scl`` collection.

        The allowed collections and the acronym-to-ISSN mapping are passed in
        as callables that return fixed, in-memory values.
        """
        def allowed_collections():
            # A new list on every call, exactly like the inline callable.
            return [u"scl", u"arg"]

        def acronym_to_issn_dict(col):
            # ``col`` is accepted but ignored: the mapping is fixed.
            return {
                u'zool': u'1984-4670',
                u'bjmbr': u'1414-431X'
            }

        self.ac = AccessChecker(
            collection="scl",
            allowed_collections=allowed_collections,
            acronym_to_issn_dict=acronym_to_issn_dict,
        )

    def test_document_url_v1(self):
        """HTML article URL in the new site's pattern.

        This pattern has already been superseded, but there may be instances
        that still use it. The expected ``code`` is the article path without
        the trailing language segment.
        """
        client = '187.19.211.179'
        stamp = '[30/May/2013:00:01:01 -0300]'
        agent = 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)'
        # The request field keeps the two spaces after the HTTP method.
        request = 'GET  https://www.scielo.br/article/bjmbr/2018.v51n11/e7704/en/ HTTP/1.1'

        log_entry = ' '.join([
            client, '-', '-', stamp,
            '"%s"' % request, '200', '25084', '"-"', '"%s"' % agent,
        ])

        wanted = dict(
            ip=client,
            http_code='200',
            original_agent=agent,
            original_date=stamp,
            iso_date='2013-05-30',
            iso_datetime='2013-05-30T00:01:01',
            year='2013',
            month='05',
            day='30',
            access_type='HTML',
            code='/article/bjmbr/2018.v51n11/e7704/',
            script='',
            query_string=None,
        )

        parsed = self.ac.parsed_access(log_entry)
        self.assertEqual(parsed, wanted)

    def test_document_url_v2(self):
        """HTML article URL in the new site's pattern (``/j/<acronym>/a/<id>``).

        The expected ``code`` is the document identifier alone.
        """
        client = '187.19.211.179'
        stamp = '[30/May/2013:00:01:01 -0300]'
        agent = 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)'
        # The request field keeps the two spaces after the HTTP method.
        request = 'GET  https://www.scielo.br/j/bjmbr/a/F5Zr9TrzfmMgz9kvGZL3rZB HTTP/1.1'

        log_entry = ' '.join([
            client, '-', '-', stamp,
            '"%s"' % request, '200', '25084', '"-"', '"%s"' % agent,
        ])

        wanted = dict(
            ip=client,
            http_code='200',
            original_agent=agent,
            original_date=stamp,
            iso_date='2013-05-30',
            iso_datetime='2013-05-30T00:01:01',
            year='2013',
            month='05',
            day='30',
            access_type='HTML',
            code='F5Zr9TrzfmMgz9kvGZL3rZB',
            script='',
            query_string=None,
        )

        parsed = self.ac.parsed_access(log_entry)
        self.assertEqual(parsed, wanted)

    def test_document_url_v3(self):
        """HTML article URL in the new site's pattern, with ``?format=html``.

        The expected record still has the bare document identifier as
        ``code`` and ``None`` as ``query_string``.
        """
        client = '187.19.211.179'
        stamp = '[30/May/2013:00:01:01 -0300]'
        agent = 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)'
        # The request field keeps the two spaces after the HTTP method.
        request = 'GET  https://www.scielo.br/j/bjmbr/a/F5Zr9TrzfmMgz9kvGZL3rZB?format=html HTTP/1.1'

        log_entry = ' '.join([
            client, '-', '-', stamp,
            '"%s"' % request, '200', '25084', '"-"', '"%s"' % agent,
        ])

        wanted = dict(
            ip=client,
            http_code='200',
            original_agent=agent,
            original_date=stamp,
            iso_date='2013-05-30',
            iso_datetime='2013-05-30T00:01:01',
            year='2013',
            month='05',
            day='30',
            access_type='HTML',
            code='F5Zr9TrzfmMgz9kvGZL3rZB',
            script='',
            query_string=None,
        )

        parsed = self.ac.parsed_access(log_entry)
        self.assertEqual(parsed, wanted)

    def test_document_relative_url(self):
        """HTML article URL in the new site's pattern, given as a relative URL.

        This is the same URL as in `test_document_url_v1`, but relative
        instead of absolute; the expected record is the same.
        """
        client = '187.19.211.179'
        stamp = '[30/May/2013:00:01:01 -0300]'
        agent = 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)'
        # The request field keeps the two spaces after the HTTP method.
        request = 'GET  /article/bjmbr/2018.v51n11/e7704/en/ HTTP/1.1'

        log_entry = ' '.join([
            client, '-', '-', stamp,
            '"%s"' % request, '200', '25084', '"-"', '"%s"' % agent,
        ])

        wanted = dict(
            ip=client,
            http_code='200',
            original_agent=agent,
            original_date=stamp,
            iso_date='2013-05-30',
            iso_datetime='2013-05-30T00:01:01',
            year='2013',
            month='05',
            day='30',
            access_type='HTML',
            code='/article/bjmbr/2018.v51n11/e7704/',
            script='',
            query_string=None,
        )

        parsed = self.ac.parsed_access(log_entry)
        self.assertEqual(parsed, wanted)

    def test_pdf_url_v1(self):
        """PDF article URL in the new site's pattern.

        This pattern has already been superseded, but there may be instances
        that still use it. The expected ``code`` and ``pdf_path`` are the
        request path without the trailing language segment.
        """
        client = '201.14.120.2'
        stamp = '[30/May/2013:00:01:01 -0300]'
        agent = 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)'
        request = 'GET https://www.scielo.br/pdf/bjmbr/2018.v51n11/e7704/en HTTP/1.1'

        log_entry = ' '.join([
            client, '-', '-', stamp,
            '"%s"' % request, '200', '4608', '"-"', '"%s"' % agent,
        ])

        wanted = dict(
            ip=client,
            http_code='200',
            original_agent=agent,
            original_date=stamp,
            iso_date='2013-05-30',
            iso_datetime='2013-05-30T00:01:01',
            year='2013',
            month='05',
            day='30',
            access_type='PDF',
            code='/pdf/bjmbr/2018.v51n11/e7704/',
            pdf_path='/pdf/bjmbr/2018.v51n11/e7704/',
            pdf_issn=u'1414-431X',
            script='',
            query_string=None,
        )

        parsed = self.ac.parsed_access(log_entry)
        self.assertEqual(parsed, wanted)

    def test_pdf_url_v2(self):
        """PDF article URL in the new site's pattern (``?format=pdf``).

        The expected ``code`` is the document identifier with a ``_pdf``
        suffix, and the record has no ``pdf_issn`` or ``pdf_path`` keys.
        """
        client = '187.19.211.179'
        stamp = '[30/May/2013:00:01:01 -0300]'
        agent = 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)'
        # The request field keeps the two spaces after the HTTP method.
        request = 'GET  https://www.scielo.br/j/bjmbr/a/F5Zr9TrzfmMgz9kvGZL3rZB?format=pdf HTTP/1.1'

        log_entry = ' '.join([
            client, '-', '-', stamp,
            '"%s"' % request, '200', '25084', '"-"', '"%s"' % agent,
        ])

        wanted = dict(
            ip=client,
            http_code='200',
            original_agent=agent,
            original_date=stamp,
            iso_date='2013-05-30',
            iso_datetime='2013-05-30T00:01:01',
            year='2013',
            month='05',
            day='30',
            access_type='PDF',
            code='F5Zr9TrzfmMgz9kvGZL3rZB_pdf',
            script='',
            query_string=None,
        )

        parsed = self.ac.parsed_access(log_entry)
        self.assertEqual(parsed, wanted)

    def test_pdf_relative_url_v1(self):
        """PDF article URL in the new site's pattern, given as a relative URL.

        This is the same URL as in `test_pdf_url_v1`, but relative instead of
        absolute; the expected record is the same.
        """
        client = '201.14.120.2'
        stamp = '[30/May/2013:00:01:01 -0300]'
        agent = 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)'
        request = 'GET /pdf/bjmbr/2018.v51n11/e7704/en HTTP/1.1'

        log_entry = ' '.join([
            client, '-', '-', stamp,
            '"%s"' % request, '200', '4608', '"-"', '"%s"' % agent,
        ])

        wanted = dict(
            ip=client,
            http_code='200',
            original_agent=agent,
            original_date=stamp,
            iso_date='2013-05-30',
            iso_datetime='2013-05-30T00:01:01',
            year='2013',
            month='05',
            day='30',
            access_type='PDF',
            code='/pdf/bjmbr/2018.v51n11/e7704/',
            pdf_path='/pdf/bjmbr/2018.v51n11/e7704/',
            pdf_issn=u'1414-431X',
            script='',
            query_string=None,
        )

        parsed = self.ac.parsed_access(log_entry)
        self.assertEqual(parsed, wanted)