def test_hash_missing_id_number(self):
    """make_hash must yield the known SHA1 for a visit dict lacking 'id_number'."""
    # Same record as the complete-data case, minus the 'id_number' key.
    visit = {
        'date': '2015-08-11',
        'entity': u'SCOTIANK BANCK',
        'full_name': u'DOMINGUEZ OCAÑA, SANDRA BEATRIZ',
        'host_name': u'AGUADO ALFARO, JOSE ALBERTO',
        'id_document': u'DNI/LE',
        'institution': u'congreso',
        'location': '',
        'meeting_place': '',
        'office': u'ADMINISTRACION LUIS ALBERTO SANCHEZ - FERNANDO BELAUNDE TERRY',
        'reason': u'MANTENIMIENTO PROGRAMADO',
        'time_end': u'15:06',
        'time_start': u'09:04',
        'title': u'ADMINISTRADOR DE EDIFICIOS',
    }

    hashed = make_hash(visit)

    self.assertEqual(
        'daf54933e2164e0c2da44ea0fc2b66dce011ecc1',
        hashed['sha1'],
    )
def test_correct_hash_sha1_for_legacy_data(self):
    """make_hash must reproduce the stored SHA1 for a fixed 'minem' record."""
    # Note: 'time_end' is an empty unicode string in this fixture.
    visit = {
        'date': '2012-01-04',
        'entity': u'PARTICULAR',
        'full_name': u'BUENO NINAHUANCA, JHON BILL',
        'host_name': u'RIVAS CIFUENTES, BENJAMIN DIONISIO',
        'id_document': u'DNI',
        'id_number': u'40748332',
        'institution': u'minem',
        'location': '',
        'meeting_place': '',
        'office': u'DGER - SALA DEP',
        'reason': u'CONSULTA CIUDADANA',
        'time_start': u'08:39',
        'time_end': u'',
        'title': u'Especialista I',
    }

    hashed = make_hash(visit)

    self.assertEqual(
        '09dc4688afd00bb9ba60e69a4d1369b09dc261cf',
        hashed['sha1'],
    )
def test_correct_hash_sha1_for_legacy_data(self):
    """make_hash must reproduce the stored SHA1 for a fixed 'produce' record."""
    # This fixture carries times with seconds and an 'objective' key,
    # and has no 'title' key.
    visit = {
        'full_name': u'LAVERIAN HERRERA, EFRAIN',
        'entity': '',
        'meeting_place': '',
        'office': u'OFICINA DE LOGISTICA',
        'host_name': u'URDANEGUI CABREJOS, FABRIZIO MARIO RAUL',
        'reason': u'DEJAR DOCUMENTO',
        'institution': u'produce',
        'location': '',
        'id_number': u'32613418',
        'id_document': u'DNI',
        'date': u'2008-01-02',
        'time_start': u'16:16:51',
        'time_end': u'17:15:31',
        'objective': '',
    }

    hashed = make_hash(visit)

    self.assertEqual(
        'af716f0ed4aa8e3d3f4e1b05908c30f02f3e74fa',
        hashed['sha1'],
    )
def test_hash_using_complete_data(self):
    """make_hash must yield the known SHA1 for a visit dict that includes 'id_number'."""
    visit = {
        'date': '2015-08-11',
        'entity': u'SCOTIANK BANCK',
        'full_name': u'DOMINGUEZ OCAÑA, SANDRA BEATRIZ',
        'host_name': u'AGUADO ALFARO, JOSE ALBERTO',
        'id_document': u'DNI/LE',
        'id_number': u'10153798',
        'institution': u'congreso',
        'location': '',
        'meeting_place': '',
        'office': u'ADMINISTRACION LUIS ALBERTO SANCHEZ - FERNANDO BELAUNDE TERRY',
        'reason': u'MANTENIMIENTO PROGRAMADO',
        'time_end': u'15:06',
        'time_start': u'09:04',
        'title': u'ADMINISTRADOR DE EDIFICIOS',
    }

    hashed = make_hash(visit)

    self.assertEqual(
        '4784d22af48c79154d69b4dd4c1562b8f3a7d182',
        hashed['sha1'],
    )
def test_correct_hash_sha1_for_legacy_data(self):
    """make_hash must reproduce the stored SHA1 for a fixed 'defensa' record."""
    # 'entity', 'office' and 'title' are empty unicode strings in this fixture.
    visit = {
        'date': '2013-10-24',
        'entity': u'',
        'full_name': u'JUAN PONCE VILLARROEL',
        'host_name': u'FERNANDO NOBLECILLA ZUÑIGA',
        'id_document': u'DNI',
        'id_number': u'08882615',
        'institution': u'defensa',
        'location': '',
        'meeting_place': '',
        'office': u'',
        'reason': u'VISITA PERSONAL',
        'time_start': u'17:28',
        'time_end': u'18:00',
        'title': u'',
    }

    hashed = make_hash(visit)

    self.assertEqual(
        'dd3e23e4a1b146e250f759666bd0cfdcf0c3db8d',
        hashed['sha1'],
    )
def test_correct_hash_sha1_for_legacy_data(self):
    """make_hash must reproduce the stored SHA1 for a fixed 'min. mujer' record."""
    # This fixture also carries 'objective', 'num_visit' and 'title' keys,
    # all set to empty strings.
    visit = {
        'full_name': u'FIGUEROA BERMUDEZ FRANKLIN',
        'entity': u'SCOTIABANK',
        'meeting_place': '',
        'office': u'DESPACHO VICE - MINISTERIAL DE LA MUJER',
        'host_name': u'CENTRO DOCUMENTARIO',
        'reason': '',
        'institution': u'min. mujer',
        'location': '',
        'id_number': u'42982496',
        'id_document': u'DNI',
        'date': u'2012-02-29',
        'time_start': u'12:33',
        'time_end': u'13:18',
        'objective': '',
        'num_visit': '',
        'title': '',
    }

    hashed = make_hash(visit)

    self.assertEqual(
        '50aa11295b04317f97e6e27dcb965c50d3e78a3b',
        hashed['sha1'],
    )
def test_correct_hash_sha1_for_legacy_data(self):
    """make_hash must reproduce the stored SHA1 for a fixed 'inpe' record."""
    # This fixture has a non-empty 'objective' alongside empty
    # 'num_visit' and 'title' values.
    visit = {
        'full_name': u'Victor Alberto Menacho Aguinaga',
        'entity': u'E.P.Ancon 1',
        'meeting_place': '',
        'office': u'Unidad De Recursos Humanos',
        'host_name': u'WENDY DIANA HINOSTROZA HUARANGA',
        'reason': u'DOCUMENTACION',
        'institution': u'inpe',
        'location': '',
        'id_number': u'42000454',
        'id_document': u'DNI/LE',
        'date': u'2011-08-01',
        'time_start': u'11:35',
        'time_end': u'11:49',
        'objective': u'Documentacion',
        'num_visit': '',
        'title': '',
    }

    hashed = make_hash(visit)

    self.assertEqual(
        'cad139c1cc501911d881edc6587f0ce887c2d6ce',
        hashed['sha1'],
    )
def parse(self, response):
    """Dump the fetched page to disk and yield one hashed item per visitor row.

    The response body is written to ``page_<YYYY-MM-DD>_.html`` in the
    current working directory, using the date found in
    ``response.meta['date']``. Every ``<tr>`` that has more than six
    ``td`` text nodes is then turned into a ``ManoloItem``. The column
    positions depend on that same date:

    * before 2008-05-29: name/document/number/reason/host in columns
      2-6, start time in column 1, end time in column 8;
    * from 2008-05-29 up to (not including) 2014-08-01: same, but the
      end time is in column 7;
    * from 2014-08-01 on: name in column 1, column 2 is passed to
      ``utils.get_dni`` and unpacked into ``id_document`` and
      ``id_number``, followed by entity, reason, host, office, start
      and end time in columns 3-8.

    A cell that is missing or has no text yields ``''`` for its field.

    :param response: response whose ``meta['date']`` holds the date of
        the page being parsed (compared against ``datetime.date``).
    :returns: generator of the values returned by ``utils.make_hash``.
    """
    this_date = response.meta['date']

    # Scrapy exposes ``response.body`` as raw bytes, so the dump has to be
    # opened in binary mode; text mode raises TypeError on Python 3.
    dump_name = "page_" + this_date.strftime("%Y-%m-%d") + "_.html"
    with open(dump_name, "wb") as handle:
        handle.write(response.body)

    def cell_text(cells, index):
        """Return the first text node of cells[index], or '' when absent."""
        try:
            return cells[index].xpath('text()').extract()[0]
        except IndexError:
            # Either the cell itself or its text node does not exist.
            return ''

    # The page layout only depends on the date, so pick it once.
    # ``legacy_time_end`` is the end-time column of the two older layouts
    # and None for the current layout.
    if this_date < datetime.date(2008, 5, 29):
        legacy_time_end = 8
    elif this_date < datetime.date(2014, 8, 1):
        legacy_time_end = 7
    else:
        legacy_time_end = None

    for sel in response.xpath('//tr'):
        # Skip header/short rows: only rows with more than six text cells
        # are treated as visitor records.
        if len(sel.xpath('td/text()').extract()) <= 6:
            continue

        cells = sel.xpath('td')
        item = ManoloItem()

        if legacy_time_end is not None:
            item['full_name'] = cell_text(cells, 2)
            item['id_document'] = cell_text(cells, 3)
            item['id_number'] = cell_text(cells, 4)
            item['reason'] = cell_text(cells, 5)
            item['host_name'] = cell_text(cells, 6)
            item['time_start'] = cell_text(cells, 1)
            item['time_end'] = cell_text(cells, legacy_time_end)
        else:
            item['full_name'] = cell_text(cells, 1)
            # Document type and number share one cell in this layout. The
            # lookup stays inside the try so that an IndexError raised
            # while reading the cell (or by get_dni) blanks both fields.
            try:
                item['id_document'], item['id_number'] = utils.get_dni(
                    cells[2].xpath('text()').extract()[0])
            except IndexError:
                item['id_document'] = ''
                item['id_number'] = ''
            item['entity'] = cell_text(cells, 3)
            item['reason'] = cell_text(cells, 4)
            item['host_name'] = cell_text(cells, 5)
            item['office'] = cell_text(cells, 6)
            item['time_start'] = cell_text(cells, 7)
            item['time_end'] = cell_text(cells, 8)

        item['institution'] = 'Trib.Const.'
        item['date'] = this_date
        yield utils.make_hash(item)
def parse(self, response):
    """Dump the fetched page to disk and yield one hashed item per visitor row.

    The response body is written to ``page_<YYYY-MM-DD>_.html`` in the
    current working directory, using the date found in
    ``response.meta['date']``. Every ``<tr>`` that has more than six
    ``td`` text nodes is then turned into a ``ManoloItem``. The column
    positions depend on that same date:

    * before 2008-05-29: name/document/number/reason/host in columns
      2-6, start time in column 1, end time in column 8;
    * from 2008-05-29 up to (not including) 2014-08-01: same, but the
      end time is in column 7;
    * from 2014-08-01 on: name in column 1, column 2 is passed to
      ``utils.get_dni`` and unpacked into ``id_document`` and
      ``id_number``, followed by entity, reason, host, office, start
      and end time in columns 3-8.

    A cell that is missing or has no text yields ``''`` for its field.

    :param response: response whose ``meta['date']`` holds the date of
        the page being parsed (compared against ``datetime.date``).
    :returns: generator of the values returned by ``utils.make_hash``.
    """
    this_date = response.meta['date']

    # Scrapy exposes ``response.body`` as raw bytes, so the dump has to be
    # opened in binary mode; text mode raises TypeError on Python 3.
    dump_name = "page_" + this_date.strftime("%Y-%m-%d") + "_.html"
    with open(dump_name, "wb") as handle:
        handle.write(response.body)

    def cell_text(cells, index):
        """Return the first text node of cells[index], or '' when absent."""
        try:
            return cells[index].xpath('text()').extract()[0]
        except IndexError:
            # Either the cell itself or its text node does not exist.
            return ''

    # The page layout only depends on the date, so pick it once.
    # ``legacy_time_end`` is the end-time column of the two older layouts
    # and None for the current layout.
    if this_date < datetime.date(2008, 5, 29):
        legacy_time_end = 8
    elif this_date < datetime.date(2014, 8, 1):
        legacy_time_end = 7
    else:
        legacy_time_end = None

    for sel in response.xpath('//tr'):
        # Skip header/short rows: only rows with more than six text cells
        # are treated as visitor records.
        if len(sel.xpath('td/text()').extract()) <= 6:
            continue

        cells = sel.xpath('td')
        item = ManoloItem()

        if legacy_time_end is not None:
            item['full_name'] = cell_text(cells, 2)
            item['id_document'] = cell_text(cells, 3)
            item['id_number'] = cell_text(cells, 4)
            item['reason'] = cell_text(cells, 5)
            item['host_name'] = cell_text(cells, 6)
            item['time_start'] = cell_text(cells, 1)
            item['time_end'] = cell_text(cells, legacy_time_end)
        else:
            item['full_name'] = cell_text(cells, 1)
            # Document type and number share one cell in this layout. The
            # lookup stays inside the try so that an IndexError raised
            # while reading the cell (or by get_dni) blanks both fields.
            try:
                item['id_document'], item['id_number'] = utils.get_dni(
                    cells[2].xpath('text()').extract()[0])
            except IndexError:
                item['id_document'] = ''
                item['id_number'] = ''
            item['entity'] = cell_text(cells, 3)
            item['reason'] = cell_text(cells, 4)
            item['host_name'] = cell_text(cells, 5)
            item['office'] = cell_text(cells, 6)
            item['time_start'] = cell_text(cells, 7)
            item['time_end'] = cell_text(cells, 8)

        item['institution'] = 'Trib.Const.'
        item['date'] = this_date
        yield utils.make_hash(item)