def parse_message(message: Dict) -> Tuple[str, Any]:
    """
    Parses a message and ensures it has a single variant.

    Args:
        message: The message to parse

    Returns:
        The message variant and data content

    Raises:
        IndexError: If the message doesn't have exactly 1 key
    """
    keys: List[str] = list(message)

    if len(keys) > 1:
        raise IndexError

    variant: str = keys[0]
    data: Any = message[variant]

    log.debug(f"Parsed a message with variant={variant} and data={data}")

    return (variant, data)
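# Usage sketch for parse_message (the message shapes here are hypothetical and only
# illustrate the contract; parse_message and its imports are assumed to be in scope):
# a single-variant dict unpacks into (variant, data), anything else raises IndexError.
variant, data = parse_message({"Challenge": {"nonce": "abc123"}})
assert variant == "Challenge" and data == {"nonce": "abc123"}

try:
    parse_message({"Challenge": {}, "AccessToken": {}})
except IndexError:
    pass  # expected: a message must carry exactly one variant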
def create_static_pages(output_dir):
    '''Generates a static page from each of the files contained in `content/pages/`.'''
    template = env.get_template("page.html")
    for f in glob.glob("content/pages/*.md"):
        page_name = f.split("/")[-1].replace(".md", "")
        target_dir = os.path.join(output_dir, "%s/" % page_name)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        target = os.path.join(target_dir, "index.html")

        context = {}
        md_content = codecs.open(f, 'r', 'utf-8').read()
        context["content"] = markdown.markdown(md_content, output_format="html5", encoding="UTF-8")
        contents = template.render(**context)

        f = codecs.open(target, 'w', 'utf-8')
        f.write(contents)
        f.close()
        log.debug("Created static page '%s'." % page_name)

    # Content images
    if os.path.exists("content/media"):
        media_dir = os.path.join(output_dir, "media")
        if os.path.exists(media_dir):
            shutil.rmtree(media_dir)
        shutil.copytree("content/media", media_dir)
def create_static_pages(output_dir):
    '''Generates a static page from each of the files contained in `content/pages/`.'''
    template = env.get_template("page.html")
    for f in glob.glob("content/pages/*.md"):
        page_name = f.split("/")[-1].replace(".md", "")
        target_dir = os.path.join(output_dir, "%s/" % page_name)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        target = os.path.join(target_dir, "index.html")

        md_content = markdown.markdown(codecs.open(f, 'r', 'utf-8').read(),
                                       output_format="html5", encoding="UTF-8")
        context = dict({"content": md_content, "pagename": page_name}, **global_context)
        contents = template.render(**context)

        f = codecs.open(target, 'w', 'utf-8')
        f.write(contents)
        f.close()
        log.debug("Created static page '%s'." % page_name)

    # Content images
    if os.path.exists("content/media"):
        media_dir = os.path.join(output_dir, "media")
        if os.path.exists(media_dir):
            shutil.rmtree(media_dir)
        shutil.copytree("content/media", media_dir)
def _read_message(self) -> Dict:
    size_bytes = self._sock.recv(4)

    if size_bytes == b"":
        log.error("Empty Message: Closing")
        self._sock.close()
        sys.exit(1)

    size = struct.unpack(">I", size_bytes)[0]
    log.debug("Message size: %d", size)

    if size > 4096:
        remaining_size = size
        buf: List[int] = []

        while remaining_size > 0:
            # Never ask for more than is left in this frame, and count what was
            # actually received (recv may return fewer bytes than requested)
            chunk = self._sock.recv(min(remaining_size, 4096))
            buf.extend(chunk)
            remaining_size -= len(chunk)

        return json.loads(bytes(buf))

    message: Dict = json.loads(self._sock.recv(size))

    # Error handle
    if "Server" in message.keys():
        # There has been an error in communication
        if "text" in message["Server"].keys():
            payload: Dict = json.loads(message["Server"]["text"])
            code = message["Server"]["code"]
            self._handle_server_error(code, payload)

    log.info(message)

    return message
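# Wire-format sketch (illustrative only, not part of the original module): _read_message
# expects each frame to be a 4-byte big-endian length prefix followed by that many bytes
# of UTF-8 encoded JSON, which is the format _send_message below produces. A hypothetical
# encoder for tests could look like this:
import json
import struct


def encode_frame(message: dict) -> bytes:
    payload = json.dumps(message).encode("utf-8")
    return struct.pack(">I", len(payload)) + payload


# e.g. encode_frame({"Alive": {}}) yields b"\x00\x00\x00\x0d" + b'{"Alive": {}}'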
def test_output():
    # All of these just need to output without errors.
    from zenlog import log
    log.debug("A quirky message only developers care about")
    log.info("Curious users might want to know this")
    log.warn("Something is wrong and any user should be informed")
    log.warning("Something is wrong and any user should be informed")
    log.error("Serious stuff, this is red for a reason")
    log.critical("OH NO everything is on fire")
    log.c("OH NO everything is on fire")
    log.crit("OH NO everything is on fire")
def create_api(packages, output_dir, repo_dir):
    '''Generates a static API containing all the datapackage.json files of the
    contained datasets.

    Accepts a list of pkg_info dicts, which are generated with the
    process_datapackage function.'''
    all_metadata = []
    for pkg_info in packages:
        pkg_dir = os.path.join(repo_dir, pkg_info['name'])
        all_metadata.append(json.loads(open(os.path.join(pkg_dir, "datapackage.json")).read()))
    with open(os.path.join(output_dir, 'api.json'), 'w') as api_file:
        json.dump(all_metadata, api_file)
    log.debug("Created api.json.")
def create_index_page(packages, output_dir):
    '''Generates the index page with the list of available packages.

    Accepts a list of pkg_info dicts, which are generated with the
    process_datapackage function.'''
    template = env.get_template("list.html")
    target = "index.html"
    # Merge global context with local variables (http://stackoverflow.com/a/1552420/122400)
    context = dict({"datapackages": packages}, **global_context)
    contents = template.render(**context)
    f = codecs.open(os.path.join(output_dir, target), 'w', 'utf-8')
    f.write(contents)
    f.close()
    log.debug("Created index.html.")
def create_contact_page(output_dir, contact_email=""):
    '''Creates a contact form page.'''
    template = env.get_template("contact.html")
    target_dir = os.path.join(output_dir, "contacto/")
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    target = os.path.join(target_dir, "index.html")

    context = {}
    context["contact_email"] = contact_email
    contents = template.render(**context)

    f = codecs.open(target, 'w', 'utf-8')
    f.write(contents)
    f.close()
    log.debug("Created contact page.")
def create_index_page(packages, output_dir):
    '''Generates the index page with the list of available packages.

    Accepts a list of pkg_info dicts, which are generated with the
    process_datapackage function.'''
    template = env.get_template("list.html")
    target = "index.html"
    context = {"datapackages": packages,
               "welcome_text": markdown.markdown(codecs.open("content/welcome_text.md", 'r', 'utf-8').read(),
                                                 output_format="html5",
                                                 encoding="UTF-8"),
               }
    contents = template.render(**context)
    f = codecs.open(os.path.join(output_dir, target), 'w', 'utf-8')
    f.write(contents)
    f.close()
    log.debug("Created index.html.")
def create_contact_page(output_dir, contact_email=""):
    '''Creates a contact form page.'''
    template = env.get_template("contact.html")
    target_dir = os.path.join(output_dir, "contact/")
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    target = os.path.join(target_dir, "index.html")

    context = {}
    context["contact_email"] = contact_email
    contents = template.render(**context)

    f = codecs.open(target, 'w', 'utf-8')
    f.write(contents)
    f.close()
    log.debug("Created contact page.")
def getpage(url):
    if not os.path.exists('cache'):
        log.info('Creating new cache/ folder.')
        os.mkdir('cache')
    # hash() returns an int, so convert it before building the cache path
    url_hash = str(hash(url))
    cache_file = 'cache/' + url_hash

    if os.path.exists(cache_file):
        log.debug("Cache hit for %s" % url)
        page = file_get_contents(cache_file)
    else:
        log.debug("Cache miss for %s" % url)
        page = urllib.urlopen(url).read()
        file_put_contents(cache_file, page, utf8=True)
    return page
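# Alternative cache-key sketch (an assumption, not the original approach): Python 3
# randomises the built-in hash() per process, so a content hash keeps cache file names
# stable between runs if that ever matters for this cache.
import hashlib


def url_cache_key(url):
    return hashlib.sha1(url.encode("utf-8")).hexdigest()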
def load_ground_truth_data(base_path: Union[None, Path, str] = None):
    if base_path is None:
        base_path = Path(__file__).parents[1].joinpath('webapp', 'data')
    else:
        base_path = Path(base_path)

    images_ids = sorted(p.stem for p in base_path.iterdir()
                        if p.is_file() and not p.stem.endswith('_gt'))
    images = []
    ground_truths = []

    for img_id in images_ids:
        img_name = str(base_path.joinpath(f'{img_id}.png'))
        gt_name = str(base_path.joinpath(f'{img_id}_gt.png'))

        # Read image to segment
        try:
            img_data = plt.imread(img_name)
            if np.amax(img_data) <= 1:
                log.debug(
                    f'Image {img_id}.png value range was converted from [0, 1] to [0, 255]'
                )
                img_data *= 255
            img_data = img_data.astype(np.uint8, copy=False)
        except FileNotFoundError:
            log.warning(f'Skipping since no file found with name {img_name}')
            images.append(None)
            ground_truths.append(None)
            continue

        if 2 < img_data.ndim:
            img_data = np.rint(
                ImageTools.rgb_to_grayscale(img_data.astype(
                    np.float64))).astype(np.uint8)

        assert np.amax(img_data) > 1
        images.append(img_data)

        # Read GT image
        gt_data = plt.imread(gt_name)
        if gt_data.ndim == 3:
            gt_data = gt_data[:, :, 0]
        ground_truths.append(gt_data > 0)

    return {
        img_id: (img, gt)
        for img_id, img, gt in zip(images_ids, images, ground_truths)
        if img is not None
    }
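# Usage sketch (the directory layout is an assumption; by default the loader looks in
# webapp/data next to the package). The result maps image id -> (uint8 grayscale image,
# boolean ground-truth mask), silently skipping ids whose image file is missing:
#
#     data = load_ground_truth_data('webapp/data')
#     for img_id, (img, gt) in data.items():
#         print(img_id, img.shape, img.dtype, gt.dtype)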
def process_dep(i):
    log.debug("Trying ID %d..." % i)
    url = FORMATTER_URL_IL % i
    soup = BeautifulSoup(getpage(url), "lxml")

    title = soup.find('span', id=RE_TITLE)
    if title:
        summary = soup.find('span', id=RE_SUMMARY)
        doc_url = soup.find('a', id=RE_DOCLINK)
        pdf_url = soup.find('a', id=RE_PDFLINK)
        eventdates = soup.findAll('span', id=RE_EVENTDATE)
        eventtypes = soup.findAll('span', id=RE_EVENTTYPE)
        eventinfos = soup.findAll('div', id=RE_EVENTINFO)
        dist_date = soup.find('span', id=RE_DISTDATE)
        authors = soup.findAll('a', id=RE_AUTHOR)
        parlgroup = soup.find('span', id=RE_PARLGROUP)

        row = {'title': title.text,
               'summary': summary.text,
               'id': i,
               'url': url,
               'authors': [a.text for a in authors]}
        if doc_url:
            row['doc_url'] = doc_url['href']
        if pdf_url:
            row['pdf_url'] = pdf_url['href']
        if dist_date:
            row['dist_date'] = dist_date.text
        if parlgroup:
            row['parlgroup'] = parlgroup.text

        for index, eventdate in enumerate(eventdates):
            event = {'date': eventdate.text}
            event['type'] = eventtypes[index].text.strip()
            info = eventinfos[index]
            if info.text:
                # TODO: Process this information
                event = parse_event_info(event, info)
            if not row.get('events'):
                row['events'] = []
            row['events'].append(event)

        log.info("Scraped initiative: %s" % title.text)
        return row
    else:
        return None
def create_dataset_page(pkg_info, output_dir):
    '''Generate a single dataset page.'''
    template = env.get_template("dataset.html")
    name = pkg_info["name"]
    if not os.path.exists(os.path.join(output_dir, name)):
        os.makedirs(os.path.join(output_dir, name))
    target = "%s/index.html" % (name)

    context = dict({"datapkg": pkg_info}, **global_context)
    contents = template.render(**context)

    f = codecs.open(os.path.join(output_dir, target), 'w', 'utf-8')
    f.write(contents)
    f.close()
    log.debug("Created %s." % target)
def process(self, t: ROSType, headers_file: str, original_file: str,
            ros_type_str: str) -> FuzzTarget:
    logging.debug(f"Processing {t.type_name} type")
    imports = "\n".join([
        f'#include "{headers_file}"',
        f'#include "{original_file}"',
    ])
    request_code = "\n".join(
        [self.fuzz_field(field) for field in t.fields])
    return FuzzTarget(
        imports=imports,
        client_name=FuzzTargetProcessor.normalize_client_name(t.type_name),
        request_code=request_code,
        node_type=ros_type_str,
    )
def _send_message(self, message: Union[Dict, str]):
    """
    Serialises a dictionary into JSON and sends it across the stream.

    Messages will be length prefixed before sending.

    Args:
        message: The message to send
    """
    readable: str = json.dumps(message) if isinstance(message, dict) else message
    log.debug(f"Sending message={readable} to the control layer")

    data: bytes = readable.encode("utf-8")
    length = (len(data)).to_bytes(4, byteorder="big")

    self.stream.send(length + data)
def create_dataset_page(pkg_info, output_dir):
    '''Generate a single dataset page.'''
    template = env.get_template("dataset.html")
    name = pkg_info["name"]
    if not os.path.exists(os.path.join(output_dir, name)):
        os.makedirs(os.path.join(output_dir, name))
    target = "%s/index.html" % (name)

    context = {"datapkg": pkg_info}
    context['welcome_text'] = markdown.markdown(codecs.open("content/welcome_text.md", 'r', 'utf-8').read(),
                                                output_format="html5", encoding="UTF-8")
    contents = template.render(**context)

    f = codecs.open(os.path.join(output_dir, target), 'w', 'utf-8')
    f.write(contents)
    f.close()
    log.debug("Created %s." % target)
def _message_control(self) -> None:
    response: Dict = self._read_message()
    log.debug("HEARTBEAT")

    if "Alive" in response.keys():
        # Write it back
        self._state = State.HEARTBEAT
        self._send_message(response)
    elif "JobConfig" in response.keys():
        log.info("RECEIVED JOB CONFIG")
        self._state = State.READ_JOB
        self._message_stack.append(response)
    elif "Dataset" in response.keys():
        log.info("RECEIVED DATASET")
        self._state = State.PROCESSING
        self._message_stack.append(response)
def play_by_station_uuid(self, _uuid):
    print(_uuid)
    # Pyradios by default doesn't let you search by UUID;
    # a trick is to call click_counter(uuid) directly to get the station info
    is_ok = "false"
    try:
        self.target_station = self.API.click_counter(_uuid)
        log.debug(self.target_station)
        is_ok = self.target_station["ok"]
    except Exception as e:
        log.error("Could not find a station by the UUID")
        sys.exit(0)

    self.API.search(name=self.target_station["name"], name_exact=True)

    # again, register a valid click
    if is_ok == "false":
        res = self.API.click_counter(self.target_station["stationuuid"])
        log.debug(res)
def verify(self):
    """
    Creates a new model for the user and authenticates it with the challenge
    response method.

    Raises:
        IndexError: If an invalid message is encountered
    """
    # Connect to the socket
    self._connect()

    message = {
        "NewModel": {
            "email": self.email,
            "password": self.password,
            "model_name": self.model_name,
        }
    }
    self._send_message(message)

    while True:
        # Read some data
        data = self._read_message()
        log.debug(f"Received data={data}")

        try:
            variant, data = parse_message(data)

            if variant == "Challenge":
                self.authenticate_challenge(data)
            elif variant == "AccessToken":
                self.display_access(data)
                self.save_access_tokens()
                break
            else:
                log.warn(
                    f"Encountered an unexpected message variant={variant}")
        except IndexError:
            log.error(f"Failed to parse a message from data={data}")
def station_validator(self):
    if len(self.response) == 0:
        log.error("No stations found by the name")
        sys.exit(0)

    if len(self.response) > 1:
        log.info("Multiple stations found by the name")
        for station in self.response:
            log.info("name: {} | id: {} | country: {}".format(
                station["name"], station["stationuuid"], station["country"]))
        sys.exit(0)

    if len(self.response) == 1:
        log.info("Station found: {}".format(self.response[0]["name"]))
        log.debug(self.response[0])
        self.target_station = self.response[0]
        self.API.click_counter(self.target_station["stationuuid"])
def create_dataset_page(pkg_info, output_dir):
    '''Generate a single dataset page.'''
    template = env.get_template("dataset.html")
    name = pkg_info["name"]
    if not os.path.exists(os.path.join(output_dir, name)):
        os.makedirs(os.path.join(output_dir, name))
    target = "%s/index.html" % (name)

    context = {"datapkg": pkg_info}
    context['welcome_text'] = markdown.markdown(
        codecs.open("content/welcome_text.md", 'r', 'utf-8').read(),
        output_format="html5", encoding="UTF-8")
    contents = template.render(**context)

    f = codecs.open(os.path.join(output_dir, target), 'w', 'utf-8')
    f.write(contents)
    f.close()
    log.debug("Created %s." % target)
def create_index_page(packages, output_dir):
    '''Generates the index page with the list of available packages.

    Accepts a list of pkg_info dicts, which are generated with the
    process_datapackage function.'''
    template = env.get_template("list.html")
    target = "index.html"
    context = {
        "datapackages": packages,
        "welcome_text": markdown.markdown(codecs.open("content/welcome_text.md", 'r', 'utf-8').read(),
                                          output_format="html5",
                                          encoding="UTF-8"),
    }
    contents = template.render(**context)
    f = codecs.open(os.path.join(output_dir, target), 'w', 'utf-8')
    f.write(contents)
    f.close()
    log.debug("Created index.html.")
def main():
    if not os.path.exists(dest):
        os.mkdir(dest)
        log.info("Directory 'imgs/' created.")
    mp_json = json.loads(open(mp_file, 'r').read())

    for mp_id in mp_json:
        url = pic_url_formatter % mp_id
        filename = '%s.jpg' % os.path.join(dest, mp_id)
        if os.path.exists(filename):
            log.debug("File for id %s already exists, skipping." % mp_id)
            continue
        log.info('Retrieving picture with id: %s' % mp_id)
        try:
            urlretrieve(url, filename)
        except IOError:
            log.error('Socket error! :(')

    log.info('Done. Now do find ./imgs/ -size -722c -exec rm {} \;')
    log.info('to clean up things.')
def _process_job(self) -> None:
    log.info("PROCESSING JOB")

    # Get message from message stack
    data: Dict = self._message_stack.pop()
    # Make sure the dataset is actually there
    assert "Dataset" in data

    # Get training and prediction datasets
    train = decode_and_decompress(data["Dataset"]["train"])
    predict = decode_and_decompress(data["Dataset"]["predict"])
    train_pd = pd.read_csv(io.StringIO(train))
    predict_pd = pd.read_csv(io.StringIO(predict))

    # Prepare the datasets for callback
    train_pd, predict_pd, predict_rids = prepare_datasets(
        train_pd, predict_pd)

    # Check the user has specified a callback here to satisfy mypy
    assert self.callback is not None
    predictions = self.callback(train_pd, predict_pd, self.recv_job_config)
    log.debug("Predictions: %s", predictions.head())

    # Attach record ids onto predictions
    predictions["record_id"] = predict_rids
    cols = predictions.columns.tolist()
    cols.insert(0, cols.pop())
    predictions = predictions[cols]
    assert len(predictions.index) == len(predict_pd.index)

    compressed_predictions: str = compress_and_encode(
        predictions.to_csv(index=False))
    message = {"Predictions": compressed_predictions}
    self._send_message(message)

    self._state = State.HEARTBEAT
def read_and_validate_yaml_file(path: str) -> dict:
    yaml_file_path = os.path.join(path, "fuzz.yaml")
    ensure_yaml_exists(yaml_file_path)
    yaml_obj = verify_yaml_file(yaml_file_path)

    if "TODO" in json.dumps(yaml_obj):
        logging.warning("The 'TODO' keyword was found in the yaml file\n"
                        "Did you forget to fill in the blanks?")

    services_keys = (yaml_obj["services"] if "services" in yaml_obj else {}).keys()
    topics_keys = (yaml_obj["topics"] if "topics" in yaml_obj else {}).keys()
    actions_keys = (yaml_obj["actions"] if "actions" in yaml_obj else {}).keys()

    logging.debug(
        f"{len(topics_keys)} topics detected: {', '.join([f'`{s}`' for s in topics_keys])}"
    )
    logging.debug(
        f"{len(services_keys)} services detected: {', '.join([f'`{s}`' for s in services_keys])}"
    )
    logging.debug(
        f"{len(actions_keys)} actions detected: {', '.join([f'`{s}`' for s in actions_keys])}"
    )
    return yaml_obj
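# Shape sketch of the parsed fuzz.yaml object this function works with. The top-level
# keys "services", "topics" and "actions" come from the code above; the entries inside
# them are made up for illustration and each section may simply be absent.
example_yaml_obj = {
    "services": {"/add_two_ints": {"type": "example_interfaces/srv/AddTwoInts"}},
    "topics": {},
    # "actions" omitted on purpose: the code above falls back to an empty dict
}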
def generate_cpp_file(fuzz_target: FuzzTarget, source_file: str, template_name: str):
    __location__ = os.path.realpath(
        os.path.join(os.getcwd(), os.path.dirname(__file__)))
    plain_source_file_name = Path(source_file).name
    without_extension = os.path.splitext(plain_source_file_name)[0]

    # Read template
    env = Environment(loader=FileSystemLoader(__location__))
    template = env.get_template(template_name)
    logging.debug("Template read")

    # Populate template
    template_arguments = fuzz_target.get_mapping()
    template_arguments["FILE_NAME"] = plain_source_file_name
    fuzzing_path = os.path.join(os.path.dirname(__file__), "fuzzing_api.hpp")
    template_arguments["FUZZING_API"] = open(fuzzing_path).read()
    template = template.render(template_arguments)
    logging.debug("Template populated")

    # Write the populated file
    full_path = os.path.join(os.path.dirname(source_file),
                             without_extension + "_generated.cpp")
    try:
        with open(full_path, "w") as fd:
            fd.write(template)
        logging.debug(
            f"Template written with {fuzz_target.client_name} client")
    except Exception:
        logging.error("Couldn't write generated file", exc_info=True)
    return full_path
def fetch_data_package(url, dir_name):
    rq = requests.get(url)
    if rq.status_code != 200:
        log.warn("Not authorized %d at %s" % (rq.status_code, url))
        return False
    spec = rq.json()

    # check for update
    dp_filename = os.path.join(dir_name, 'datapackage.json')
    if os.path.isfile(dp_filename):
        with open(dp_filename) as f:
            cached = json.load(f)
            if cached == spec:
                log.debug("No updates")
                return False

    # create a data folder
    data_folder = os.path.join(dir_name, 'data')
    if not os.path.isdir(dir_name):
        os.makedirs(data_folder)

    # download a copy of the datapackage
    download_file(dir_name, url, 'datapackage.json')

    for res in spec['resources']:
        if 'path' in res:
            # paths override urls, for local mirrors
            basepath = "/".join(url.split('/')[:-1]) + '/'
            fn = download_file(data_folder, basepath + res['path'])
        elif 'url' in res:
            # download resource from url
            fn = download_file(data_folder, res['url'])
        else:
            # skip this resource
            log.debug("Skipping: %s" % res)
            continue
        if 'title' in res:
            log.debug('Downloaded: %s - %s' % (res['title'], fn))
    return True
async def main(args):
    uri = f"ws://{args.base}"
    if args.port:
        uri = f"{uri}:{args.port}"
    log.info(f"Connecting to: {uri}")

    async with websockets.connect(uri) as websocket:
        registration = Registration(name=bot.name,
                                    gametype=args.gametype,
                                    bots=args.bots,
                                    runs=args.runs)
        message = Registration.Schema().dumps(registration)
        log.debug(f"Sending to the server: {message}")
        await websocket.send(message)

        greeting = await websocket.recv()
        log.info(f"< {greeting}")

        await play_game(websocket, args.gametype)
def __init__(self, URL):
    self.url = URL
    self.is_playing = False
    self.process = None
    log.debug("player: url => {}".format(self.url))

    self.process = Popen(
        ["ffplay", "-nodisp", "-nostats", "-loglevel", "error", self.url])
    log.debug("player: ffplay => PID {} initiated".format(self.process.pid))

    sleep(3)  # sleep for 3 seconds, waiting for ffplay to start properly

    if self.is_active():
        self.is_playing = True
        log.info("Radio started successfully")
    else:
        log.error("Radio could not be started, it may be a dead station")
        sys.exit(0)
def handle_message(self, topic, payload):
    log.debug("Trying to handle message on topic: " + str(topic) +
              " with payload: " + str(payload))
    self.check_sequence(topic, payload)

    if self.topics.get(topic) is None:
        self.handle_error(TopicError(topic))
        return
    else:
        topic = self.topics.get(topic)

    topic_type = type(topic)
    handler = self.message_handlers.get(topic_type)
    try:
        handler.handle_message(topic, payload)
    except Exception as err:
        self.handle_error(err)

    topic.payload = payload
    log.debug("Message on topic " + str(topic.topic) + " handled.")
async def _run_auction(
    room_key: RoomKey,
    room_info: RoomInfo,
    room: Union[SingleRoom, MultiRoom],
    websocket,
):
    if room_info.room_type is MultiRoom:
        assert isinstance(room, MultiRoom)
        log.debug("Running multiroom")
        for _ in range(room.runs):
            auctioneer = Auctioneer(
                room=room.bot_room,
                game_type=room_key.gametype,
                slowdown=0,
                verbose=True,
            )
            winners = await auctioneer.run_auction()
            for winner in winners:
                room.winners[winner] += 1
            log.debug(f"Winners: {room.winners}")

            message = ResetBot.Schema().dumps(ResetBot(reset="RESET"))
            await websocket.send(message)
    else:
        auctioneer = Auctioneer(
            room=room.bot_room,
            game_type=room_key.gametype,
            slowdown=0,
            verbose=True,
        )
        log.info("running normal room")
        room.winners = await auctioneer.run_auction()

    log.info(f"Winners: {room.winners}")
    room_info.has_run = True
def __init__(
    self,
    email: str,
    password: str,
    model_name: str,
    address: Tuple[str, int],
):
    log.debug(
        f"Initialising a new Authentication with email={email}, model_name={model_name}"
    )
    self.email: str = email
    self.password: str = password
    self.model_name: str = model_name
    self.access_token: Optional[str] = None
    self.model_id: Optional[str] = None
    self.stream = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    self.private_key = load_priv_key()
    self.address: Tuple[str, int] = address
def signal_handler(sig, frame):
    global player
    log.debug("You pressed Ctrl+C!")
    log.debug("Stopping the radio")
    if player.is_playing:
        player.stop()
    log.debug("Exiting now")
    sys.exit(0)
def fuzz_field(self, field: Field, parent="request", indent=1) -> str:
    logging.debug(f"Generating field {field.name}")
    fresh = self.get_fresh_variable()
    preindent = " " * indent
    res = preindent + f"// {field.name}\n"

    # Primitive type
    if field.type.is_primitive:
        cpp_type = FuzzTargetProcessor.PRIMITIVES_CPP_TYPES[
            field.type.type_name]
        res += preindent + f"{cpp_type} {fresh};\n"
        res += (
            preindent +
            f"if (!get{field.type.type_name.capitalize()}({fresh})) return;\n"
        )
    # Composite type
    else:
        res += preindent + f"{field.type.type_name} {fresh};\n"
        for subfield in field.type.fields:
            res += preindent + self.fuzz_field(
                subfield, parent=fresh, indent=indent + 1)

    res += preindent + f"{parent}->{field.name} = {fresh};\n"
    return res
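# Output sketch (approximate: the fresh variable name comes from get_fresh_variable()
# and the exact C++ type comes from PRIMITIVES_CPP_TYPES, neither of which is shown
# here). For a primitive field `data` of ROS type `int32`, fuzz_field would emit C++
# roughly like:
#
#     // data
#     int32_t field_0;
#     if (!getInt32(field_0)) return;
#     request->data = field_0;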
def generate(offline=False,
             fetch_only=False,
             output_dir=OUTPUT_DIR,
             theme_dir=os.path.join(THEMES_DIR, 'centraldedados'),
             repo_dir=REPO_DIR,
             config_file=CONFIG_FILE):
    '''Main function that takes care of the whole process.'''
    global env, packages

    # Read the config file
    parser = SafeConfigParser()
    parser.read(config_file)

    # Load the theme and set up Jinja
    theme_name = parser.get('ui', 'theme')
    theme_dir = os.path.join(THEMES_DIR, theme_name)
    template_dir = os.path.join(theme_dir, "templates")
    env = jinja2.Environment(loader=jinja2.FileSystemLoader([template_dir]))

    # Set up the output directory
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    # Set up the dir for storing repositories
    if not os.path.exists(repo_dir):
        log.debug("Directory %s doesn't exist, creating it." % repo_dir)
        os.mkdir(repo_dir)

    # Copy htaccess file
    shutil.copyfile(os.path.join(theme_dir, 'static/htaccess'), os.path.join(output_dir, ".htaccess"))

    # Static CSS files
    css_dir = os.path.join(output_dir, "css")
    if os.path.exists(css_dir):
        shutil.rmtree(css_dir)
    shutil.copytree(os.path.join(theme_dir, "static/css"), css_dir)

    # Static JavaScript files
    js_dir = os.path.join(output_dir, "js")
    if os.path.exists(js_dir):
        shutil.rmtree(js_dir)
    shutil.copytree(os.path.join(theme_dir, "static/js"), js_dir)

    # Theme images
    img_dir = os.path.join(output_dir, "img")
    if os.path.exists(img_dir):
        shutil.rmtree(img_dir)
    shutil.copytree(os.path.join(theme_dir, "static/img"), img_dir)

    # Fonts
    fonts_dir = os.path.join(output_dir, "fonts")
    if os.path.exists(fonts_dir):
        shutil.rmtree(fonts_dir)
    shutil.copytree(os.path.join(theme_dir, "static/fonts"), fonts_dir)

    if not parser.items('repositories'):
        log.critical('No repository data in settings.conf (does it even exist?). Cannot proceed :(')
        sys.exit()

    # go through each specified dataset
    for r in parser.items('repositories'):
        name, url = r
        dir_name = os.path.join(repo_dir, name)
        repo = None

        # do we have a local copy?
        if os.path.isdir(dir_name):
            if not os.path.isdir(os.path.join(dir_name, '.git')):
                if url.endswith(".json"):
                    log.info("%s: Data package, refreshing" % name)
                    updated = fetch_data_package(url, dir_name)
                else:
                    log.info('%s: Unsupported repo, skipping update' % name)
                    continue
            elif not offline:
                repo = git.Repo(dir_name)
                origin = repo.remotes.origin
                try:
                    origin.fetch()
                except AssertionError:
                    # usually this fails on the first run, try again
                    origin.fetch()
                except git.exc.GitCommandError:
                    log.critical("%s: Fetch error, this dataset will be left out." % name)
                    continue
                # see if we have updates
                if not local_and_remote_are_at_same_commit(repo, origin):
                    log.debug("%s: Repo has new commits, updating local copy." % name)
                    updated = True
                    # connection errors can also happen if fetch succeeds but pull fails
                    try:
                        result = origin.pull()[0]
                    except git.exc.GitCommandError:
                        log.critical("%s: Pull error, this dataset will be left out." % name)
                        continue
                    if result.flags & result.ERROR:
                        log.error("%s: Pull error, but going ahead." % name)
                        updated = False
                else:
                    log.info("%s: No changes." % name)
                    updated = False
            else:
                log.debug("%s: Offline mode, using cached version." % name)
                # we set updated to True in order to re-generate everything
                updated = True
                repo = git.Repo(dir_name)
            if fetch_only:
                # if the --fetch-only flag was set, skip to the next dataset
                continue
        else:
            if offline:
                log.warn("%s: No local cache, skipping." % name)
                continue
            else:
                if url.endswith(".git"):
                    # Handle GIT Repository URL
                    log.info("%s: New repo, cloning." % name)
                    try:
                        repo = git.Repo.clone_from(url, dir_name)
                        # For faster checkouts, one file at a time:
                        # repo = git.Repo.clone_from(url, dir_name, n=True, depth=1)
                        # repo.git.checkout("HEAD", "datapackage.json")
                    except git.exc.GitCommandError as inst:
                        log.warn("%s: skipping %s" % (inst, name))
                        continue
                    updated = True
                elif url.endswith(".json"):
                    # Handle Data Package URL
                    log.info("%s: New data package, fetching." % name)
                    updated = fetch_data_package(url, dir_name)
                else:
                    log.warn("Unsupported repository: %s" % url)

        # get datapackage metadata
        try:
            pkg_info = process_datapackage(name, repo_dir, url)
        except ParseException as inst:
            log.warn("%s: skipping %s" % (inst, name))
            continue

        # set last updated time based on last commit, comes in Unix timestamp format so we convert
        import datetime
        if repo is not None:
            d = repo.head.commit.committed_date
        else:
            d = int(time.mktime(time.localtime()))
        last_updated = datetime.datetime.fromtimestamp(int(d)).strftime('%Y-%m-%d %H:%M:%S')
        pkg_info['last_updated'] = last_updated

        # add it to the packages list for index page generation after the loop ends
        packages.append(pkg_info)

        # re-generate the dataset HTML pages
        create_dataset_page(pkg_info, output_dir)

        # if repo was updated, copy over CSV/JSON/* and ZIP files to the download dir
        # (we always generate them if offline)
        if updated or offline:
            create_dataset_page(pkg_info, output_dir)
            datafiles = pkg_info['datafiles']
            zipf = zipfile.ZipFile(os.path.join(output_dir, name + '.zip'), 'w')
            for d in datafiles:
                log.info("Copying %s" % d['path'])
                # copy file
                target = os.path.join(output_dir, os.path.basename(d['path']))
                shutil.copyfile(os.path.join(dir_name, d['path']), target)
                # generate JSON version of CSV
                if target.endswith('.csv'):
                    csv2json(target, target.replace(".csv", ".json"))
                # make zip file
                zipf.write(os.path.join(dir_name, d['path']), d['basename'], compress_type=zipfile.ZIP_DEFLATED)
            if 'readme_path' in pkg_info:
                try:
                    zipf.write(pkg_info['readme_path'], 'README.md')
                except OSError:
                    pass
            zipf.close()

    # HTML index with the list of available packages
    create_index_page(packages, output_dir)
    # Static JSON API of the data packages
    create_api(packages, output_dir, repo_dir)
    # Static pages
    create_static_pages(output_dir)
    # Contact page
    create_contact_page(output_dir, parser.get('credentials', 'contact_email'))

    log.info("All static content is ready inside '%s'." % OUTPUT_DIR)
def generate_site(fast_run):
    # flush output
    create_dir(OUTPUT_DIR)
    create_dir(os.path.join(OUTPUT_DIR, TRANSCRIPTS_PATH))
    create_dir(os.path.join(OUTPUT_DIR, MPS_PATH))
    create_dir(os.path.join(OUTPUT_DIR, MEDIA_PATH))

    # init Jinja
    env = jinja2.Environment(loader=jinja2.FileSystemLoader([TEMPLATE_DIR]),
                             extensions=['jinja2htmlcompress.SelectiveHTMLCompress'],
                             trim_blocks=True, lstrip_blocks=True)
    env.filters['date'] = format_date

    # generate pages
    log.info("Copying static files...")
    copy_tree(MEDIA_SOURCE_DIR, os.path.join(OUTPUT_DIR, MEDIA_PATH))

    log.info("Generating index...")
    render_template_into_file(env, 'index.html', 'index.html')

    log.info("Generating MP index...")
    mps = generate_mp_list()
    context = {"mps": mps}
    render_template_into_file(env, 'mp_list.html', "deputados/index.html", context)

    gov_data = get_gov_dataset()
    govpost_data = list(get_govpost_dataset())
    gov_mp_ids = [int(row[2]) for row in govpost_data if row[2]]
    date_data = get_date_dataset()

    log.info("Generating MP pages...")
    for mp in mps:
        id = int(mp['id'])
        mp['photo_url'] = PHOTO_URL_BASE + str(id) + ".jpg"
        # determine government posts
        if id in gov_mp_ids:
            mp['govposts'] = []
            govpost_rows = [row for row in govpost_data if row[2].strip() and int(row[2]) == id]
            for row in govpost_rows:
                gov_number = int(row[0])
                gov = None
                for r in gov_data:
                    if int(r[0]) == gov_number:
                        gov = {'number': r[0],
                               'start_date': dateparser.parse(r[1]),
                               'end_date': dateparser.parse(r[2])}
                        break
                if not gov:
                    log.critical("Gov not found!")
                mp['govposts'].append({
                    'post': row[3],
                    'start_date': dateparser.parse(row[4]),
                    'end_date': dateparser.parse(row[5]),
                    'gov': gov,
                })
        # parse dates
        for m in mp['mandates']:
            m['start_date'] = dateparser.parse(m['start_date'])
            m['end_date'] = dateparser.parse(m['end_date'])  # nice effect: if no end date, set to today

        context = {'mp': mp, 'l': None}
        filename = os.path.join(MPS_PATH, mp['slug'], 'index.html')
        render_template_into_file(env, 'mp_detail.html', filename, context)

    log.info("Generating session index...")
    datedict = generate_datedict()
    all_years = [y for y in datedict]
    for year_number in datedict:
        year = datedict[year_number]
        context = {'year': year,
                   'year_number': year_number,
                   'all_years': all_years,
                   'datedict': datedict,
                   }
        target_dir = os.path.join(TRANSCRIPTS_PATH + "%s/" % year_number)
        filename = target_dir + "index.html"
        # print filename
        render_template_into_file(env, 'day_list.html', filename, context)

    # get most recent year and make the session index
    y = all_years[-1]
    year = datedict[y]
    context = {'year': year,
               'year_number': year_number,
               'all_years': all_years,
               'datedict': datedict,
               }
    render_template_into_file(env, 'day_list.html', TRANSCRIPTS_PATH + 'index.html', context)

    log.info("Generating HTML session pages...")
    if fast_run:
        COUNTER = 0

    date_data.reverse()
    for leg, sess, num, d, dpub, page_start, page_end in date_data:
        dateobj = dateparser.parse(d)
        context = {'session_date': dateobj,
                   'year_number': year_number,
                   'text': get_session_text(leg, sess, num),
                   'monthnames': MESES,
                   'pdf_url': 'xpto',
                   }
        target_dir = "%s%d/%02d/%02d" % (TRANSCRIPTS_PATH, dateobj.year, dateobj.month, dateobj.day)
        if not os.path.exists(os.path.join(OUTPUT_DIR, target_dir)):
            create_dir(os.path.join(OUTPUT_DIR, target_dir))
        filename = "%s/index.html" % target_dir
        render_template_into_file(env, 'day_detail.html', filename, context)
        log.debug(d)

        if fast_run:
            COUNTER += 1
            if COUNTER > 20:
                break
def generate(offline, fetch_only):
    '''Main function that takes care of the whole process.'''
    # set up the output directory
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    # set up the dir for storing repositories
    if not os.path.exists(repo_dir):
        log.info("Directory %s doesn't exist, creating it." % repo_dir)
        os.mkdir(repo_dir)
    # create dir for dataset pages
    if not os.path.exists(os.path.join(output_dir, datasets_dir)):
        os.mkdir(os.path.join(output_dir, datasets_dir))
    # create download dir for zip and csv/json/* dataset files
    if not os.path.exists(os.path.join(output_dir, files_dir)):
        os.mkdir(os.path.join(output_dir, files_dir))

    # create static dirs
    # TODO: only update changed files -- right now we regenerate the whole static dir
    css_dir = os.path.join(output_dir, "css")
    js_dir = os.path.join(output_dir, "js")
    img_dir = os.path.join(output_dir, "img")
    fonts_dir = os.path.join(output_dir, "fonts")
    if os.path.exists(css_dir):
        shutil.rmtree(css_dir)
    shutil.copytree("static/css", css_dir)
    if os.path.exists(js_dir):
        shutil.rmtree(js_dir)
    shutil.copytree("static/js", js_dir)
    if os.path.exists(img_dir):
        shutil.rmtree(img_dir)
    shutil.copytree("static/img", img_dir)
    if os.path.exists(fonts_dir):
        shutil.rmtree(fonts_dir)
    shutil.copytree("static/fonts", fonts_dir)

    # read the config file to get the datasets we want to publish
    parser = SafeConfigParser()
    parser.read(config_file)
    packages = []

    if not parser.items('repositories'):
        log.critical('No repository data in settings.conf (does it even exist?). Cannot proceed :(')
        sys.exit()

    # go through each specified dataset
    for r in parser.items('repositories'):
        name, url = r
        dir_name = os.path.join(repo_dir, name)

        # do we have a local copy?
        if os.path.isdir(dir_name):
            if not offline:
                log.info("Checking for changes in repo '%s'..." % name)
                repo = git.Repo(dir_name)
                origin = repo.remotes.origin
                try:
                    origin.fetch()
                except AssertionError:
                    # usually this fails on the first run, try again
                    origin.fetch()
                except git.exc.GitCommandError:
                    log.critical("Fetch error connecting to repository, this dataset will be ignored and not listed in the index!")
                    continue
                # connection errors can also happen if fetch succeeds but pull fails
                try:
                    result = origin.pull()[0]
                except git.exc.GitCommandError:
                    log.critical("Pull error connecting to repository, this dataset will be ignored and not listed in the index!")
                    continue
                # we get specific flags for the results Git gave us
                # and we set the "updated" var in order to signal whether to
                # copy over the new files to the download dir or not
                if result.flags & result.HEAD_UPTODATE:
                    log.info("No new changes in repo '%s'." % name)
                    updated = False
                elif result.flags & result.ERROR:
                    log.error("Error pulling from repo '%s'!" % name)
                    updated = False
                else:
                    # TODO: figure out other git-python flags and return more
                    # informative log output
                    log.info("Repo changed, updating. (returned flags: %d)" % result.flags)
                    updated = True
            else:
                log.info("Offline mode, using cached version of package %s..." % name)
                # we set updated to True in order to re-generate everything
                # FIXME: See checksum of CSV files to make sure they're new before
                # marking updated as true
                updated = True
                repo = git.Repo(dir_name)
            if fetch_only:
                # if the --fetch-only flag was set, skip to the next dataset
                continue
        else:
            if offline:
                log.warn("Package %s specified in settings but no local cache, skipping..." % name)
                continue
            else:
                log.info("We don't have repo '%s', cloning..." % name)
                repo = git.Repo.clone_from(url, dir_name)
                updated = True

        # get datapackage metadata
        pkg_info = process_datapackage(name)

        # set last updated time based on last commit, comes in Unix timestamp format so we convert
        import datetime
        d = repo.head.commit.committed_date
        last_updated = datetime.datetime.fromtimestamp(int(d)).strftime('%Y-%m-%d %H:%M:%S')
        log.debug(last_updated)
        pkg_info['last_updated'] = last_updated

        # add it to the packages list for index page generation after the loop ends
        packages.append(pkg_info)

        # re-generate the dataset HTML pages
        create_dataset_page(pkg_info)

        # if repo was updated, copy over CSV/JSON/* and ZIP files to the download dir
        # (we always generate them if offline)
        if updated or offline:
            create_dataset_page(pkg_info)
            datafiles = pkg_info['datafiles']
            zipf = zipfile.ZipFile(os.path.join(output_dir, files_dir, name + '.zip'), 'w')
            for d in datafiles:
                # copy CSV file
                target = os.path.join(output_dir, files_dir, os.path.basename(d['path']))
                shutil.copyfile(os.path.join(dir_name, d['path']), target)
                # generate JSON version
                csv2json(target, target.replace(".csv", ".json"))
                # make zip file
                zipf.write(os.path.join(dir_name, d['path']), d['basename'], compress_type=zipfile.ZIP_DEFLATED)
            try:
                zipf.write(pkg_info['readme_path'], 'README.md')
            except OSError:
                pass
            zipf.close()

    # generate the HTML index with the list of available packages
    create_index_page(packages)
    # generate the static JSON API of the data packages
    create_api(packages)
# -*- coding: utf-8 -*-
import urllib2
import json

from zenlog import log

SOURCE_FILE = "banks.json"


def download_page(url):
    response = urllib2.urlopen(url)
    html = response.read()
    return html


f = open(SOURCE_FILE, 'r')
contents = f.read()
jsondata = json.loads(contents)

for item in jsondata:
    # extract the URL and the code/ID from the JSON
    url = item['url']
    cod = url.split('=')[-1]
    # fetch the page contents
    html = download_page(url)
    # save it into an html file
    filename = cod + ".html"
    outfile = open(filename, 'w')
    outfile.write(html)
    outfile.close()
    log.debug(u'Fetched ' + cod + ' :D')
from pprint import pprint

# Imports assumed by this fragment (not present in the original snippet):
from selenium import webdriver
import selenium.webdriver.support.ui as UI
from zenlog import log  # assumed: the other scrapers in this collection use zenlog

# driver = webdriver.PhantomJS()
driver = webdriver.Firefox()
driver.get("http://www.bportugal.pt/en-US/Supervisao/Pages/Instituicoesautorizadas.aspx")

# Get the institutions listed in the table, one per row
# click the number to go to the next page -- .MudarPagina strong + a
# every 10 pages click in the > arrow to advance -- .MudarPagina strong + span > a

bank_list = []
pagecount = 1
while True:
    log.debug("New loop, pagecount is " + str(pagecount))
    wait = UI.WebDriverWait(driver, 10)

    links = driver.find_elements_by_css_selector(".AreaResultados td a")
    log.debug("Found %d links..." % len(links))
    if len(links) == 0:
        from time import sleep
        sleep(3)
        links = driver.find_elements_by_css_selector(".AreaResultados td a")
        if len(links) == 0:
            log.error("No links found :(")
        else:
            log.debug("Yay, %d links!" % len(links))

    rows = driver.find_elements_by_css_selector(".AreaResultados tbody tr")
    # skip first row