def get_links(content, session):
    """Extract document links from an HTML fragment, mirroring each linked
    file into object storage and returning the storage-backed URLs.

    Anchors are skipped when they have no href, are not absolute http(s)
    URLs (after prefixing site-relative paths with SITE_URL), have no
    filename component, or point at .html/.aspx pages.

    :param content: HTML string to scan for ``<a>`` tags.
    :param session: requests-like session used to download linked files.
    :return: list of dicts with ``href`` (storage URL) and ``title`` keys.
    """
    links = []
    if '<a' not in content:
        return links
    seen = set()  # source hrefs already handled
    for anchor in pq(content)('a'):
        if 'href' not in anchor.attrib:
            continue
        href = anchor.attrib['href']
        if href.startswith('/'):
            href = SITE_URL + href
        if not href.startswith('http'):
            continue
        # FIX: the original tested `href in links`, but `links` holds dicts,
        # so duplicates were never skipped; dedup against plain hrefs instead.
        if href in seen:
            continue
        seen.add(href)
        filename = href.rpartition('/')[2]
        if filename == '' or filename.endswith('.html') or filename.endswith('.aspx'):
            continue
        s3_object_name = 'government_decisions/' + filename
        if not object_storage.exists(s3_object_name):
            try:
                conn = session.get(href)
                if conn.status_code != requests.codes.ok:
                    continue
                href = object_storage.write(s3_object_name, data=conn.content,
                                            public_bucket=True, create_bucket=True)
            except Exception:
                # Best-effort mirroring: skip links that fail to download/store.
                continue
        else:
            href = object_storage.urlfor(s3_object_name)
        links.append(dict(href=href, title=pq(anchor).text()))
    return links
def write_to_object_storage(self, object_name, data):
    """Store *data* under *object_name* in object storage, unless an object
    with that name already exists, and return its public URL.

    :param object_name: target key in the storage bucket.
    :param data: raw payload to upload (only used when the object is new).
    :return: public URL of the (existing or newly written) object.
    """
    # FIX: this is a routine trace message, not an error condition —
    # logging.error was the wrong severity.
    logging.info('write_to_object_storage %s', object_name)
    if object_storage.exists(object_name):
        return object_storage.urlfor(object_name)
    return object_storage.write(object_name, data=data,
                                public_bucket=True, create_bucket=True)
def unsign_document_link(url):
    """Convert a signed mr.gov.il tender-document URL into a URL of a decoded
    copy stored in object storage.

    A previously decoded result is reused via a ``.decoded`` indicator object.
    Otherwise the signed XML wrapper is fetched, its base64 payload decoded,
    the file extension determined (original filename, mime type, or libmagic
    sniffing, in that order) and the decoded payload uploaded.

    :param url: the signed document URL (http is upgraded to https).
    :return: URL of the decoded stored copy; the original *url* when the
        download fails; ``None`` when the wrapper has no DataEncodingType.
    :raises Exception: when *url* is not under Files_Michrazim or the
        encoding type is not base64.
    """
    url = url.replace("http://", "https://")
    if not url.startswith("https://www.mr.gov.il/Files_Michrazim/"):
        raise Exception("invalid url: {}".format(url))
    filename = url.replace("https://www.mr.gov.il/Files_Michrazim/", "").replace(".signed", "")
    decoded_indicator = base_object_name + filename + '.decoded'
    # Fast path: a prior run recorded the decoded object's URL.
    if object_storage.exists(decoded_indicator):
        decoded_indicator_url = object_storage.urlfor(decoded_indicator)
        ret = requests.get(decoded_indicator_url)
        if ret.status_code == 200:
            return ret.text
    try:
        content = requests_get_content(url)
        page = pq(content)
        data_elt = page(page(page.children()[1]).children()[0]).children()[0]
        assert b'The requested operation is not supported, and therefore can not be displayed' not in content
    except Exception as e:
        logging.error('Failed to download from %s (%s), returning original url', url, e)
        return url
    try:
        if data_elt.attrib["DataEncodingType"] != "base64":
            raise Exception("unknown DataEncodingType: {}".format(
                data_elt.attrib["DataEncodingType"]))
    except KeyError:
        return None
    buffer = data_elt.text
    if buffer:
        buffer = base64.decodebytes(buffer.encode("ascii"))
    else:
        # FIX: was '' (str) — keep the payload bytes so buffer[:128] below and
        # the storage write receive a consistent type.
        buffer = b''
    mime = data_elt.attrib["MimeType"]
    guessed_mime = None
    orig_filename = None
    try:
        page.remove_namespaces()
        orig_filename = next(page[0].iterdescendants('FileName')).text
        _, ext = os.path.splitext(orig_filename)
    except Exception:
        # FIX: was a bare except; no FileName element -> guess from mime type.
        ext = mimetypes.guess_extension(mime, strict=False)
    if not ext:
        # Last resort: sniff the decoded payload itself with libmagic.
        with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
            guessed_mime = m.id_buffer(buffer)
        logging.info('Attempted to detect buffer type: %s', guessed_mime)
        if guessed_mime == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
            ext = '.docx'
        else:
            ext = mimetypes.guess_extension(guessed_mime)
    assert ext, "Unknown file type mime:%s filename:%s guessed_mime:%s ext:%r buffer:%r" % (
        mime, orig_filename, guessed_mime, ext, buffer[:128])
    object_name = base_object_name + filename + (ext if ext else "")
    # NOTE(review): write_to_object_storage is called without a `self`
    # argument here — confirm this targets a module-level helper.
    ret = write_to_object_storage(object_name, buffer)
    write_to_object_storage(decoded_indicator, ret)
    return ret
def process_row(row, *_):
    """Ensure the page at ``row['url']`` is mirrored into object storage.

    Downloads the page (throttled with a 3s sleep) when the target object is
    missing, stores it with an explicit text/html content type carrying the
    detected charset, and passes the row through.

    :param row: dict with 's3_object_name' and 'url' keys.
    :return: the row unchanged, or ``None`` when the download failed.
    """
    s3_object_name = row['s3_object_name']
    url = row['url']
    if not object_storage.exists(s3_object_name):
        conn = session.get(url)
        time.sleep(3)  # throttle scraping of the source site
        if conn.status_code != requests.codes.ok:
            return None
        charset = get_charset(conn.content)
        # FIX: was `conn.encode = charset`, which only set a meaningless
        # attribute on the Response; requests uses `.encoding`.
        conn.encoding = charset
        object_storage.write(
            s3_object_name, data=conn.content, public_bucket=True,
            create_bucket=True,
            content_type="text/html; charset={}".format(charset))
    return row
def process_row(row, *_):
    """Enrich *row* with header statistics and position/alias data parsed
    from its HTML copy in object storage.

    :param row: dict with an 's3_object_name' key.
    :return: the enriched row, or ``None`` when no stored copy exists.
    :raises RuntimeError: when parsing fails for any reason.
    """
    s3_object_name = row['s3_object_name']
    url = object_storage.urlfor(s3_object_name)
    try:
        if not object_storage.exists(s3_object_name):
            return None
        response = session.get(url)
        body = response.text
        if needs_decoding(body):
            body = response.content.decode('utf-8')
        page = pq(body, parser='html')
        proof_text = page.find('#HeaderProof ~ span:first').text().strip()
        proof_value_text = page.find('#HeaderProofValue').text().strip()
        row.update({
            'url': url,
            'HeaderEntityNameEB': len(page.find('#HeaderEntityNameEB')),
            'HeaderProofValue': len(page.find('#HeaderProofValue')),
            'HeaderProof': len(page.find('#HeaderProof ~ span:first')),
            'HeaderProofValue_equals_HeaderProof': proof_text == proof_value_text,
            'HeaderFixtReport': len(page.find('#HeaderFixtReport')),
            'HeaderProofFormat': len(page.find("#HeaderProofFormat")),
            'notification_type': page.find('#HeaderFormNumber').text().strip(),
            'positions': get_positions_array(page),
            'alias_stats': collect_all_aliases(page),
        })
        return row
    except Exception as err:
        raise RuntimeError('Parsing Failed Unexpectedly on {}'.format(url)) from err
def process_row(row, *_):
    """Enrich *row* with header statistics and counts of every known
    ``fieldalias`` spelling variant found in its stored HTML copy.

    :param row: dict with an 's3_object_name' key.
    :return: the enriched row, or ``None`` when no stored copy exists.
    :raises RuntimeError: when parsing fails for any reason.
    """
    s3_object_name = row['s3_object_name']
    url = object_storage.urlfor(s3_object_name)
    try:
        if not object_storage.exists(s3_object_name):
            return None
        response = session.get(url)
        body = response.text
        if needs_decoding(body):
            body = response.content.decode('utf-8')
        page = pq(body, parser='html')
        proof_text = page.find('#HeaderProof ~ span:first').text().strip()
        proof_value_text = page.find('#HeaderProofValue').text().strip()
        row.update({
            'url': url,
            'HeaderEntityNameEB': len(page.find('#HeaderEntityNameEB')),
            'HeaderProofValue': len(page.find('#HeaderProofValue')),
            'HeaderProof': len(page.find('#HeaderProof ~ span:first')),
            'HeaderProofValue_equals_HeaderProof': proof_text == proof_value_text,
            'notification_type': page.find('#HeaderFormNumber').text().strip(),
            'HeaderFixtReport': len(page.find('#HeaderFixtReport')),
            'HeaderProofFormat': len(page.find("#HeaderProofFormat")),
            # Date-of-appointment spelling variants:
            'TaarichTchilatHaCehuna': len(page.find("[fieldalias=TaarichTchilatHaCehuna]")),
            'TaarichTchilatCehuna': len(page.find("[fieldalias=TaarichTchilatCehuna]")),
            'TaarichTehilatCehuna': len(page.find("[fieldalias=TaarichTehilatCehuna]")),
            'TaarichTchilatHaKehuna': len(page.find("[fieldalias=TaarichTchilatHaKehuna]")),
            'TaarichTchilatKehuna': len(page.find("[fieldalias=TaarichTchilatKehuna]")),
            'TaarichTehilatKehuna': len(page.find("[fieldalias=TaarichTehilatKehuna]")),
            'Gender': len(page.find("[fieldalias=Gender]")),
            'gender': page.find("[fieldalias=Gender]").text().strip(),
            # Name spelling variants:
            'Shem': len(page.find("[fieldalias=Shem]")),
            'ShemPratiVeMishpacha': len(page.find("[fieldalias=ShemPratiVeMishpacha]")),
            'ShemPriatiVeMishpacha': len(page.find("[fieldalias=ShemPriatiVeMishpacha]")),
            'ShemMishpahaVePrati': len(page.find("[fieldalias=ShemMishpahaVePrati]")),
            'ShemRoeCheshbon': len(page.find("[fieldalias=ShemRoeCheshbon]")),
            'ShemRoehHeshbon': len(page.find("[fieldalias=ShemRoehHeshbon]")),
            'Accountant': len(page.find("[fieldalias=Accountant]")),
            # Role spelling variants:
            'Tapkid': len(page.find("[fieldalias=Tapkid]")),
            'Tafkid': len(page.find("[fieldalias=Tafkid]")),
            'HaTafkidLoMuna': len(page.find("[fieldalias=HaTafkidLoMuna]")),
            'TeurTafkid': len(page.find("[fieldalias=TeurTafkid]")),
            'LeloTeur': len(page.find("[fieldalias=LeloTeur]")),
            'TeurHaTafkidLoMuna': len(page.find("[fieldalias=TeurHaTafkidLoMuna]")),
            'full_name': all_aliases_as_string(
                page,
                ['Shem', 'ShemPratiVeMishpacha', 'ShemPriatiVeMishpacha',
                 'ShemMishpahaVePrati']),
            'positions': get_positions_array(page),
        })
        return row
    except Exception as err:
        raise RuntimeError('Parsing Failed Unexpectedly on {}'.format(url)) from err
# Fragment of a larger routine (enclosing def not visible here): stamps the
# scraper revision on the report, optionally swaps in a known-good fixed file,
# and mirrors http(s) report files into the 'spending-reports' storage prefix.
report['revision'] = REVISION
time.sleep(1)  # throttle requests to the source site
url_to_use = report_url
if url_to_use in url_to_fixed_file:
    url_to_use = url_to_fixed_file[url_to_use]
    logging.info("Using fixed file: %s", url_to_use)
if url_to_use.startswith('http'):
    # Short title hash disambiguates reports sharing the same metadata fields.
    hash = hashlib.md5(
        report['report-title'].encode('utf8')).hexdigest()[:4]
    obj_name = "{report-year}-{report-period}-{report-publisher}-{report-subunit}-{report-date}".format(
        **report)
    obj_name += '-' + hash
    _, ext = os.path.splitext(url_to_use)
    obj_name += ext
    obj_name = os.path.join('spending-reports', obj_name)
    if not object_storage.exists(obj_name):
        tmp = tempfile.NamedTemporaryFile()
        try:
            # verify=False: TLS verification deliberately skipped for this host
            stream = requests.get(url_to_use, stream=True, verify=False).raw
        except:
            logging.exception('Failed to load data from %s', url_to_use)
            # NOTE(review): execution falls through with `stream` unbound, so
            # the next line raises NameError — confirm whether this should
            # abort (return/continue/raise) instead.
        # decode_content=True transparently un-gzips the raw stream on read.
        stream.read = functools.partial(stream.read, decode_content=True)
        shutil.copyfileobj(stream, tmp)
        tmp.flush()
        url_to_use = object_storage.write(obj_name, file_name=tmp.name,
                                          create_bucket=False)