def test_parsed_access_valid_html_access_on_new_site(self): accesschecker = self.mocker.patch(AccessChecker) accesschecker._allowed_collections() self.mocker.result([u'scl', u'arg']) accesschecker._acronym_to_issn_dict() self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'}) self.mocker.replay() ac = AccessChecker(collection='scl') line = '187.19.211.179 - - [30/May/2013:00:01:01 -0300] "GET https://www.scielo.br/article/bjmbr/2018.v51n11/e7704/en/ HTTP/1.1" 200 25084 "-" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"' expected = { 'ip': '187.19.211.179', 'code': '/article/bjmbr/2018.v51n11/e7704', 'access_type': 'HTML', 'iso_date': '2013-05-30', 'iso_datetime': '2013-05-30T00:01:01', 'year': '2013', 'query_string': None, 'day': '30', 'original_agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)', 'original_date': '[30/May/2013:00:01:01 -0300]', 'script': '', 'month': '05' } self.assertEqual(ac.parsed_access(line), expected)
def test_parse_line_with_apache_line(self): accesschecker = self.mocker.patch(AccessChecker) accesschecker._allowed_collections() self.mocker.result([u'scl', u'arg']) accesschecker._acronym_to_issn_dict() self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'}) self.mocker.replay() ac = AccessChecker(collection='scl') line = '187.19.211.179 - - [30/May/2013:00:01:01 -0300] "GET http://www.scielo.br/scielo.php?pid=S0100-736X2000000300007&script=sci_arttext HTTP/1.1" 200 25084 "-" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"' expected = { '%l': '-', '%>s': '200', '%h': '187.19.211.179', '%{User-Agent}i': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)', '%b': '25084', '%{Referer}i': '-', '%u': '-', '%t': '[30/May/2013:00:01:01 -0300]', '%r': 'GET http://www.scielo.br/scielo.php?pid=S0100-736X2000000300007&script=sci_arttext HTTP/1.1' } self.assertEqual(ac._parse_line(line), expected) line = '123.125.71.39 - - [30/Dec/2012:23:59:57 -0200] "GET /scielo.php?script=sci_nlinks&ref=000144&pid=S0103-4014200000020001300010&lng=pt HTTP/1.1" 200 1878 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"' expected = { '%l': '-', '%>s': '200', '%h': '123.125.71.39', '%{User-Agent}i': 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)', '%b': '1878', '%{Referer}i': '-', '%u': '-', '%t': '[30/Dec/2012:23:59:57 -0200]', '%r': 'GET /scielo.php?script=sci_nlinks&ref=000144&pid=S0103-4014200000020001300010&lng=pt HTTP/1.1' } self.assertEqual(ac._parse_line(line), expected)
def test_parsed_access_valid_pdf_access_GET_string_without_domain(self): accesschecker = self.mocker.patch(AccessChecker) accesschecker._allowed_collections() self.mocker.result([u'scl', u'arg']) accesschecker._acronym_to_issn_dict() self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'}) self.mocker.replay() ac = AccessChecker(collection='scl') line = '66.249.73.80 - - [30/May/2013:00:01:01 -0300] "GET /pdf/bjmbr/v29n4/18781.pdf HTTP/1.1" 200 32061 "-" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"' expected = { 'ip': '66.249.73.80', 'code': '/pdf/bjmbr/v29n4/18781.pdf', 'access_type': 'PDF', 'iso_date': '2013-05-30', 'iso_datetime': '2013-05-30T00:01:01', 'year': '2013', 'day': '30', 'month': '05', 'original_agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)', 'original_date': '[30/May/2013:00:01:01 -0300]', 'query_string': None, 'pdf_issn': u'1414-431X', 'script': '', 'pdf_path': '/pdf/bjmbr/v29n4/18781.pdf' } self.assertEqual(ac.parsed_access(line), expected)
def test_parsed_access_valid_pdf_access_on_new_site_path_only(self): accesschecker = self.mocker.patch(AccessChecker) accesschecker._allowed_collections() self.mocker.result([u'scl', u'arg']) accesschecker._acronym_to_issn_dict() self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'}) self.mocker.replay() ac = AccessChecker(collection='scl') line = '201.14.120.2 - - [30/May/2013:00:01:01 -0300] "GET /pdf/bjmbr/2018.v51n11/e7704/en HTTP/1.1" 206 4608 "-" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"' expected = { 'ip': '201.14.120.2', 'code': '/pdf/bjmbr/2018.v51n11/e7704', 'access_type': 'PDF', 'iso_date': '2013-05-30', 'iso_datetime': '2013-05-30T00:01:01', 'year': '2013', 'day': '30', 'month': '05', 'original_agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)', 'original_date': '[30/May/2013:00:01:01 -0300]', 'query_string': None, 'pdf_issn': u'1414-431X', 'script': '', 'pdf_path': '/pdf/bjmbr/2018.v51n11/e7704/en' } self.assertEqual(ac.parsed_access(line), expected)
def setUp(self): self.ac = AccessChecker( collection="scl", allowed_collections=lambda: [u"scl", u"arg"], acronym_to_issn_dict=lambda col: { u'zool': u'1984-4670', u'bjmbr': u'1414-431X' }, )
def test_pid_is_valid_script_sci_issues_valid_pid(self): accesschecker = self.mocker.patch(AccessChecker) accesschecker._allowed_collections() self.mocker.result([u'scl', u'arg']) accesschecker._acronym_to_issn_dict() self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'}) self.mocker.replay() ac = AccessChecker(collection='scl') self.assertEqual(ac._is_valid_html_request('sci_issues', '1414-431X'), True)
def test_query_string_without_parameters(self): accesschecker = self.mocker.patch(AccessChecker) accesschecker._allowed_collections() self.mocker.result([u'scl', u'arg']) accesschecker._acronym_to_issn_dict() self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'}) self.mocker.replay() ac = AccessChecker(collection='scl') url = u'GET http://www.scielo.br/scielo.php HTTP/1.1' self.assertEqual(ac._query_string(url), None)
def test_access_date_with_invalid_date(self): accesschecker = self.mocker.patch(AccessChecker) accesschecker._allowed_collections() self.mocker.result([u'scl', u'arg']) accesschecker._acronym_to_issn_dict() self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'}) self.mocker.replay() ac = AccessChecker(collection='scl') access_date = u'' self.assertEqual(ac._access_date(access_date), None)
def test_is_bot_Bing_sample(self): accesschecker = self.mocker.patch(AccessChecker) accesschecker._allowed_collections() self.mocker.result([u'scl', u'arg']) accesschecker._acronym_to_issn_dict() self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'}) self.mocker.replay() ac = AccessChecker(collection='scl') line = '13245 157.56.92.164 - - [30/Nov/2013:03:53:26 -0200] "GET http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0104-87752010000200013 HTTP/1.1" 200 108777 "-" "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)"' self.assertEqual(ac.parsed_access(line), None)
def test_is_bot_GoogleBot_sample(self): accesschecker = self.mocker.patch(AccessChecker) accesschecker._allowed_collections() self.mocker.result([u'scl', u'arg']) accesschecker._acronym_to_issn_dict() self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'}) self.mocker.replay() ac = AccessChecker(collection='scl') line = '66.249.75.131 - - [24/Dec/2013:04:49:09 -0200] "GET http://www.scielo.br/scielo.php?script=sci_arttext&pid=S0102-79722002000200013 HTTP/1.1" 200 102967 "-" "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"' self.assertEqual(ac.parsed_access(line), None)
def test_pid_is_valid_not_allowed_issn(self): accesschecker = self.mocker.patch(AccessChecker) accesschecker._allowed_collections() self.mocker.result([u'scl', u'arg']) accesschecker._acronym_to_issn_dict() self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'}) self.mocker.replay() ac = AccessChecker(collection='scl') self.assertEqual( ac._is_valid_html_request('sci_arttext', 'XXXX-XXXX2012000100001'), False)
def test_is_bot_Spider_sample(self): accesschecker = self.mocker.patch(AccessChecker) accesschecker._allowed_collections() self.mocker.result([u'scl', u'arg']) accesschecker._acronym_to_issn_dict() self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'}) self.mocker.replay() ac = AccessChecker(collection='scl') line = '180.76.5.118 - - [24/Dec/2013:04:49:09 -0200] "GET http://www.scielo.br/pdf/csc/v11n2/30434.pdf HTTP/1.1" 200 79618 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"' self.assertEqual(ac.parsed_access(line), None)
def test_parsed_access_valid_pdf_with_any_different_access(self): accesschecker = self.mocker.patch(AccessChecker) accesschecker._allowed_collections() self.mocker.result([u'scl', u'arg']) accesschecker._acronym_to_issn_dict() self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'}) self.mocker.replay() ac = AccessChecker(collection='scl') line = '177.191.212.233 - - [30/May/2013:00:01:01 -0300] "GET http://www.scielo.br/img/pt/author.gif HTTP/1.1" 304 0 "http://www.scielo.br/scielo.php?script=sci_serial&pid=1415-4757&nrm=iso&rep=&lng=pt" "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36"' self.assertEqual(ac.parsed_access(line), None)
def test_is_bot_method_with_common_user_agent(self): accesschecker = self.mocker.patch(AccessChecker) accesschecker._allowed_collections() self.mocker.result([u'scl', u'arg']) accesschecker._acronym_to_issn_dict() self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'}) self.mocker.replay() ac = AccessChecker(collection='scl') agent = '"Mozilla/5.0 (Windows NT 5.1; rv:26.0) Gecko/20100101 Firefox/26.0"' self.assertEqual(ac.is_robot(agent), False)
def test_parsed_access_valid_pdf_with_not_allowed_acronym(self): accesschecker = self.mocker.patch(AccessChecker) accesschecker._allowed_collections() self.mocker.result([u'scl', u'arg']) accesschecker._acronym_to_issn_dict() self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'}) self.mocker.replay() ac = AccessChecker(collection='scl') line = '201.14.120.2 - - [30/May/2013:00:01:01 -0300] "GET http://www.scielo.br/pdf/not_allowed_acronym/v14n4/03.pdf HTTP/1.1" 206 4608 "-" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"' self.assertEqual(ac.parsed_access(line), None)
def test_parsed_access_invalid_article_access_without_query_string(self): accesschecker = self.mocker.patch(AccessChecker) accesschecker._allowed_collections() self.mocker.result([u'scl', u'arg']) accesschecker._acronym_to_issn_dict() self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'}) self.mocker.replay() ac = AccessChecker(collection='scl') line = '187.19.211.179 - - [30/May/2013:00:01:01 -0300] "GET http://www.scielo.br/scielo.php HTTP/1.1" 200 25084 "-" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)"' self.assertEqual(ac.parsed_access(line), None)
def test_pid_is_valid_pdf_request_empty_file_path(self): accesschecker = self.mocker.patch(AccessChecker) accesschecker._allowed_collections() self.mocker.result([u'scl', u'arg']) accesschecker._acronym_to_issn_dict() self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'}) self.mocker.replay() ac = AccessChecker(collection='scl') request = u'' self.assertEqual(ac._is_valid_pdf_request(request), None)
def test_is_bot_method_with_bot_agent(self): accesschecker = self.mocker.patch(AccessChecker) accesschecker._allowed_collections() self.mocker.result([u'scl', u'arg']) accesschecker._acronym_to_issn_dict() self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'}) self.mocker.replay() ac = AccessChecker(collection='scl') agent = '"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"' self.assertEqual(ac.is_robot(agent), True)
def __init__(self, collection, mongo_uri=MONGO_URI, logs_source=LOGS_SOURCE, counter_compliant=None, skipped_log_dir=None): self._mongo_uri = "%s_%s" % (mongo_uri, collection) self._proc_coll = self.get_proc_collection() self._collection = collection self._logs_source = logs_source self._counter_compliant = counter_compliant self._ts = utils.TimedSet(expired=utils.checkdatelock) self._skipped_log_dir = skipped_log_dir self._skipped_log = None self._ac = AccessChecker(self._collection)
def test_pid_is_valid_pdf_request_invalid_request_not_allowed_acronym( self): accesschecker = self.mocker.patch(AccessChecker) accesschecker._allowed_collections() self.mocker.result([u'scl', u'arg']) accesschecker._acronym_to_issn_dict() self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'}) self.mocker.replay() ac = AccessChecker(collection='scl') request = u'GET http://www.scielo.br/pdf/not_allowed_acronym/v96n2/a18v96n2.xxx HTTP/1.1' self.assertEqual(ac._is_valid_pdf_request(request), None)
def test_access_datetime(self): accesschecker = self.mocker.patch(AccessChecker) accesschecker._allowed_collections() self.mocker.result([u'scl', u'arg']) accesschecker._acronym_to_issn_dict() self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'}) self.mocker.replay() ac = AccessChecker(collection='scl') access_date = u'[30/Dec/2012:23:59:57 -0200]' self.assertEqual( ac._access_date(access_date).isoformat(), u'2012-12-30T23:59:57')
def test_pdf_or_html_access_for_html(self): accesschecker = self.mocker.patch(AccessChecker) accesschecker._allowed_collections() self.mocker.result(['scl', 'arg']) accesschecker._acronym_to_issn_dict() self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'}) self.mocker.replay() ac = AccessChecker(collection='scl') request = u'GET http://www.scielo.br/scielo.php?pid=S0100-736X2000000300007&script=sci_arttext HTTP/1.1' self.assertEqual(ac._pdf_or_html_access(request), u'HTML') request = u'GET /scielo.php?pid=S0100-736X2000000300007&script=sci_arttext HTTP/1.1' self.assertEqual(ac._pdf_or_html_access(request), u'HTML')
def test_pdf_or_html_access_for_html_on_new_site(self): accesschecker = self.mocker.patch(AccessChecker) accesschecker._allowed_collections() self.mocker.result(['scl', 'arg']) accesschecker._acronym_to_issn_dict() self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'}) self.mocker.replay() ac = AccessChecker(collection='scl') request = u'GET http://www.scielo.br/article/abcd/2018.v31n3/e1382/pt/ HTTP/1.1' self.assertEqual(ac._pdf_or_html_access(request), u'HTML') request = u'GET /article/abcd/2018.v31n3/e1382/pt/ HTTP/1.1' self.assertEqual(ac._pdf_or_html_access(request), u'HTML')
def test_pid_is_valid_pdf_request_GET_without_domain(self): accesschecker = self.mocker.patch(AccessChecker) accesschecker._allowed_collections() self.mocker.result([u'scl', u'arg']) accesschecker._acronym_to_issn_dict() self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'}) self.mocker.replay() ac = AccessChecker(collection='scl') request = u'GET /pdf/zool/v29n4/18781.pdf HTTP/1.1' self.assertEqual(ac._is_valid_pdf_request(request), { 'pdf_issn': u'1984-4670', 'pdf_path': u'/pdf/zool/v29n4/18781.pdf' })
def test_pdf_or_html_access_for_pdf(self): accesschecker = self.mocker.patch(AccessChecker) accesschecker._allowed_collections() self.mocker.result(['scl', 'arg']) accesschecker._acronym_to_issn_dict() self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'}) self.mocker.replay() ac = AccessChecker(collection='scl') request = u'GET http://www.scielo.br/pdf/isz/v96n2/a18v96n2.pdf HTTP/1.1' self.assertEqual(ac._pdf_or_html_access(request), u'PDF') request = u'GET /pdf/isz/v96n2/a18v96n2.pdf HTTP/1.1' self.assertEqual(ac._pdf_or_html_access(request), u'PDF')
def test_pdf_or_html_access_for_files(self): accesschecker = self.mocker.patch(AccessChecker) accesschecker._allowed_collections() self.mocker.result([u'scl', u'arg']) accesschecker._acronym_to_issn_dict() self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'}) self.mocker.replay() ac = AccessChecker(collection='scl') request = u'GET http://www.scielo.br/css/screen/styles.css HTTP/1.1' self.assertEqual(ac._pdf_or_html_access(request), None) request = u'GET /css/screen/styles.css HTTP/1.1' self.assertEqual(ac._pdf_or_html_access(request), None)
def test_query_string_with_parameters(self): accesschecker = self.mocker.patch(AccessChecker) accesschecker._allowed_collections() self.mocker.result([u'scl', u'arg']) accesschecker._acronym_to_issn_dict() self.mocker.result({u'zool': u'1984-4670', u'bjmbr': u'1414-431X'}) self.mocker.replay() ac = AccessChecker(collection='scl') url = u'GET http://www.scielo.br/scielo.php?pid=S0100-736X2000000300007&script=sci_arttext HTTP/1.1' self.assertEqual(ac._query_string(url), { u'pid': u'S0100-736X2000000300007', u'script': u'sci_arttext' }) url = u'GET /scielo.php?pid=S0100-736X2000000300007&script=sci_arttext HTTP/1.1' self.assertEqual(ac._query_string(url), { u'pid': u'S0100-736X2000000300007', u'script': u'sci_arttext' })