def delete_instance(self, db_instance_id, timeout=60 * 15):
    inst = self.get_instance(instance_id=db_instance_id, tags=False)
    if inst is None:
        return True

    # if a delete is already in progress, just wait for it to finish
    if inst["DBInstanceStatus"] == "deleting":
        with Timer(timeout_seconds=timeout) as t:
            while True:
                if t.timeout:
                    return False
                inst = self.get_instance(instance_id=db_instance_id, tags=False)
                if inst is None:
                    return True
                time.sleep(15)

    self.rds_client.delete_db_instance(DBInstanceIdentifier=db_instance_id,
                                       SkipFinalSnapshot=True)

    with Timer(timeout_seconds=timeout) as t:
        while True:
            if t.timeout:
                return False
            instance = self.get_instance(instance_id=db_instance_id, tags=False)
            if instance is None:
                return True
            time.sleep(15)
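# NOTE: many snippets in this section poll inside a ``Timer`` context manager
# that exposes a boolean ``timeout`` flag, but the class itself is not shown
# here. The following is a minimal, hypothetical sketch consistent with the
# observed calls (``timeout_seconds``, an optional ``start`` argument, and a
# ``timeout`` property); it is NOT the original implementation. Other snippets
# below use differently shaped Timer helpers (a named timing block and a
# stopwatch with start()/end()); those are separate classes in their codebases.
import time


class Timer:
    def __init__(self, timeout_seconds, start=False):
        self.timeout_seconds = timeout_seconds
        self._started_at = time.time() if start else None

    def __enter__(self):
        # start the clock on entry if it was not started in __init__
        if self._started_at is None:
            self._started_at = time.time()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        return False  # never suppress exceptions

    @property
    def timeout(self):
        # True once the configured number of seconds has elapsed
        return time.time() - self._started_at > self.timeout_seconds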
def run():
    with Timer('Merging Comment and Post Data from Subreddits'):
        data_dir = '/home/hadoop/reddit_topic_model/app_module/data/'

        with open(data_dir + 'stashinvest/reddit_submissions.parquet', 'rb') as submissions_file:
            submissions = pd.read_parquet(submissions_file, engine='pyarrow')
        # drop posts made by the official account
        submissions.drop(submissions[submissions['author'] == 'stashofficial'].index,
                         axis=0, inplace=True)

        with open(data_dir + 'stashinvest/reddit_comments.parquet', 'rb') as comments_file:
            comments = pd.read_parquet(comments_file, engine='pyarrow')
        # drop comments made by the official account and deleted comments
        comments.drop(comments[comments['author'] == 'stashofficial'].index,
                      axis=0, inplace=True)
        comments.drop(comments[comments['body'] == '[deleted]'].index,
                      axis=0, inplace=True)

        merged = merge_parent_and_child_comments(submissions, comments)
        merged.to_pickle(data_dir + 'tmp/merged.pkl')

    print("\nSample")
    print(merged.head(), "\n")
def get_activation_key():
    key = None
    with Timer(timeout_seconds=1200, start=True) as activation_key_timer:
        while True:
            # noinspection PyBroadException,PyPep8
            try:
                # start with an initial wait as it takes time for the instance to initialize
                time.sleep(15)
                # get the key from the redirection result, see
                # https://docs.aws.amazon.com/storagegateway/latest/userguide/get-activation-key.html
                conn = http.client.HTTPConnection(instance_public_address)
                conn.request("GET", "?activationRegion={}".format(self.region))
                resp = conn.getresponse()
                if resp.status == 302:
                    if resp.getheader("Location") is not None:
                        # extract the activationKey query parameter from the Location header
                        key = [q[1]
                               for q in [h.split("=")
                                         for h in resp.getheader("Location").split('?')[1].split('&')]
                               if q[0] == 'activationKey'][0]
                        break
            except:
                pass
            if activation_key_timer.timeout:
                break
    return key
def delete_stack(self, timeout=900, empty_bucket_resources=False):
    if self.owned:
        if not self.is_stack_present():
            return True

        if empty_bucket_resources:
            # empty any S3 buckets owned by the stack so the delete does not fail
            s3 = S3()
            buckets = [
                r["PhysicalResourceId"]
                for r in self.cnf_service.describe(
                    services.cloudformation_service.STACK_RESOURCES,
                    StackName=self.stack_name,
                    region=self.region)
                if r["ResourceType"] == "AWS::S3::Bucket"
            ]
            for bucket in buckets:
                s3.empty_bucket(bucket)

        with Timer(timeout_seconds=timeout, start=True) as timer:
            self.cfn_client.delete_stack(StackName=self.stack_name)
            while self.is_stack_present() is True:
                if timer.timeout:
                    raise Exception("Timeout deleting stack {}".format(self.stack_name))
                time.sleep(20)
            return True
def wait_for_db_connections(self, db_instance_id, timeout):
    with Timer(timeout_seconds=timeout, start=True) as t:
        while not t.timeout:
            c = self.get_daily_database_connections(db_instance_id, 1)
            if len(c) > 0 and c[0] > 0:
                return True
            time.sleep(15)
    return False

def wait_for_volume_iops(self, volume_id, timeout, min_iops):
    with Timer(timeout, start=True) as t:
        while not t.timeout:
            iops = self.get_daily_volume_iops(volume_id, 1)
            if len(iops) > 0 and iops[0] >= min_iops:
                return True
            time.sleep(15)
    return False

def wait_for_system_ready(self, instance_id, timeout=600):
    with Timer(timeout_seconds=timeout, start=True) as t:
        while not t.timeout:
            status = self.get_system_status(instance_id=instance_id)
            if status is not None and status == "ok":
                return True
            time.sleep(10)
    return False

def wait_for_network_io(self, instance_id, timeout, io_mb):
    with Timer(timeout, start=True) as t:
        while not t.timeout:
            io = self.get_daily_network_io(instance_id, 1)
            if len(io) > 0 and io[0] >= io_mb * MEGA_BYTE:
                return True
            time.sleep(15)
    return False

def wait_for_cpu_load(self, instance_id, timeout, load):
    with Timer(timeout, start=True) as t:
        while not t.timeout:
            cpu = self.get_daily_cpu_utilization(instance_id, 1)
            if len(cpu) > 0 and cpu[0] >= load:
                return True
            time.sleep(15)
    return False
def run():
    raw_data = pd.read_pickle("./data/tmp/merged.pkl")

    with Timer("Preprocess Text"):
        processor = Preprocessor()
        prepared_df = processor.preprocess_df(df=raw_data, text_column="text")
        prepared_df.to_pickle("./data/tmp/preprocessed.pkl")

    print("\nSample")
    print(prepared_df.head(), "\n")
def run(env):
    with Timer('Updating Table with New Partition'):
        spark_session = SparkSession.builder.appName(
            "reddit").enableHiveSupport().getOrCreate()
        spark_session.sparkContext.setLogLevel("WARN")

        process_date = datetime.today().strftime("%Y-%m-%d")
        spark_session.sql(f"""
            ALTER TABLE source_social.reddit_scores
            ADD IF NOT EXISTS PARTITION(process_date='{process_date}')
            LOCATION "s3://stash-de-source-{env}/source_social.db/reddit_scores/process_date={process_date}"
        """)
def run(env: str = 'edge'):
    df = pd.read_pickle('./data/tmp/scored.pkl')

    with Timer("Add Sentiment Analyses"):
        with Timer('Append Vader Sentiment Scores'):
            df = get_vader_sentiment(df)

        # This takes approx. 2.5 hours... Not worth it at the moment
        # with Timer('Append Flair Sentiment Scores'):
        #     df = get_flair_sentiment(df)

        df = df[[
            'id', 'time', 'Topic: 1', 'Topic: 2',
            'Topic: 3', 'Topic: 4', 'Topic: 5', 'Topic: 6',
            'Topic', 'vader_neg', 'vader_neu', 'vader_pos',
            'vader_compound'
        ]]
        df.to_parquet(f's3://stash-de-source-{env}/source_social.db/reddit_scores/'
                      f'process_date={datetime.today().strftime("%Y-%m-%d")}/batch.parquet')

    print("\nSample")
    print(df.head(), "\n")
def create_stack(self,
                 template_body=None,
                 template_file=None,
                 iam_capability=False,
                 timeout=600,
                 tags=None,
                 params=None,
                 empty_existing_buckets=True):
    # exactly one of template_body and template_file must be given
    assert len([t for t in [template_body, template_file] if t is not None]) == 1

    if template_file is not None:
        with open(template_file, "rt") as f:
            template = "".join(f.readlines())
    else:
        template = template_body

    # delete any existing stack with the same name first
    self.delete_stack(empty_bucket_resources=empty_existing_buckets)

    args = {
        "StackName": self.stack_name,
        "TemplateBody": template,
        "Parameters": [] if params is None else [
            {"ParameterKey": p, "ParameterValue": params[p]} for p in params
        ],
        "Capabilities": ["CAPABILITY_NAMED_IAM"] if iam_capability else [],
        "Tags": [{"Key": t, "Value": tags[t]} for t in tags] if tags is not None else []
    }

    try:
        self._stack_id = self.cfn_client.create_stack(**args)["StackId"]
    except Exception as ex:
        print(ex)

    with Timer(timeout_seconds=timeout, start=True) as timer:
        while self.is_stack_in_status("CREATE_IN_PROGRESS") is True:
            time.sleep(20)
            if timer.timeout:
                raise Exception("Timeout creating stack {}".format(self.stack_name))

    if self.is_stack_in_status("CREATE_COMPLETE") is True:
        self.owned = True
        return
    else:
        raise ValueError("Stack did not create successfully")
def wait_for_image_not_longer_available(self, image_id, timeout=300):
    with Timer(timeout_seconds=timeout, start=True) as t:
        count = 0
        while not t.timeout:
            img = self.get_image(image_id)
            if img is None:
                # require three "not found" results before reporting success
                count += 1
                if count >= 3:
                    return True
            time.sleep(5)
    return False

def wait_for_image_available(self, image_id, timeout=300):
    with Timer(timeout_seconds=timeout, start=True) as t:
        count = 0
        while not t.timeout:
            img = self.get_image(image_id)
            if img is not None and img.get("State") == "available":
                count += 1
                if count >= 3:
                    return True
            time.sleep(5)
    return False

def wait_for_volume_state(self, volume_id, state, timeout=300):
    with Timer(timeout_seconds=timeout, start=True) as t:
        count = 0
        while not t.timeout:
            volume = self.get_volume(volume_id)
            if volume is not None and volume.get("State") == state:
                count += 1
                if count >= 3:
                    return True
            time.sleep(5)
    return False
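# A hypothetical usage sketch of the polling helpers above. The ``Ec2``
# wrapper construction matches the ``Ec2(region=...)`` calls seen elsewhere
# in this section; the region, availability zone, and volume parameters are
# illustrative assumptions, not values from the original code.
import boto3

ec2 = Ec2(region="us-east-1")
client = boto3.client("ec2", region_name="us-east-1")

# create a small volume, then wait until it reports the desired state
volume_id = client.create_volume(AvailabilityZone="us-east-1a",
                                 Size=8,
                                 VolumeType="gp3")["VolumeId"]
if not ec2.wait_for_volume_state(volume_id, state="available", timeout=300):
    raise Exception("volume {} did not become available".format(volume_id))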
def check_real(ltl_text, part_text, is_moore,
               ltl_to_atm: LTLToAutomaton,
               solver: SolverInterface,
               max_k: int,
               min_size, max_size,
               opt_level=0) -> LTS:
    """
    When opt_level > 0, the check becomes incomplete but stays sound:
    if it returns REAL, the specification is REAL.
    When max_k > 0, the UCW is reduced to a k-UCW.
    """
    timer = Timer()
    spec = parse_acacia_and_build_expr(ltl_text, part_text, ltl_to_atm, opt_level)

    logging.info("LTL formula size: %i", expr_size(spec.formula))

    timer.sec_restart()
    automaton = ltl_to_atm.convert(~spec.formula)
    logging.info('automaton size is: %i' % len(automaton.nodes))
    logging.debug('automaton (dot) is:\n' + automaton_to_dot.to_dot(automaton))
    logging.debug('automaton translation took (sec): %i' % timer.sec_restart())

    tau_desc = build_tau_desc(spec.inputs)
    desc_by_output = dict((o, build_output_desc(o, is_moore, spec.inputs))
                          for o in spec.outputs)

    if max_k == 0:
        logging.info("using CoBuchiEncoder")
        encoder = CoBuchiEncoder(automaton, tau_desc, spec.inputs,
                                 desc_by_output, range(max_size))
        model = model_searcher.search(min_size, max_size, encoder, solver)
    else:
        coreach_automaton = k_reduce(automaton, max_k)
        # with open('/tmp/orig.dot', 'w') as f:
        #     f.write(automaton_to_dot.to_dot(automaton))
        # with open('/tmp/red.dot', 'w') as f:
        #     f.write(automaton_to_dot.to_dot(coreach_automaton))
        # exit()
        logging.info("using CoReachEncoder")
        logging.info('co-reachability automaton size is: %i' % len(coreach_automaton.nodes))
        logging.debug('co-reachability automaton (dot) is:\n' +
                      automaton_to_dot.to_dot(coreach_automaton))
        encoder = CoreachEncoder(coreach_automaton, tau_desc, spec.inputs,
                                 desc_by_output, range(max_size), max_k)
        model = model_k_searcher.search(min_size, max_size, max_k, encoder, solver)

    logging.info('searching a model took (sec): %i' % timer.sec_restart())
    return model
def run():
    # Get the preprocessed dataset
    df = pd.read_pickle('./data/tmp/preprocessed.pkl')

    if os.path.isfile('./models/MALLET/mallet_model.pkl'):
        # Let's not do any model retraining without building in topic stability
        # constraints, e.g. number of docs or tokens now in different topics
        seen = False  # data we provide is new and unseen for the model
        with open('./models/MALLET/mallet_model.pkl', 'rb') as modelfile:
            topic_model = pickle.load(modelfile)
        with open('./models/MALLET/mallet_dict.pkl', 'rb') as dictfile:
            dictionary = pickle.load(dictfile)
        df['bow'] = df['tokens'].apply(dictionary.doc2bow)
    else:
        seen = True  # any data we provide is used to train the model
        with Timer('Train the LDA Model'):
            test_range = (5, 50)
            df, corpus, dictionary = get_corpus_and_dict(df, 'tokens')
            list_of_models, scores = topic_count_selection(
                dictionary, corpus, list(df['tokens']), test_range)
            plot_coherence(test_range, scores).savefig('./models/MALLET/ModelCoherence.png')

            # Let's save the model with the highest coherence
            num_topics = test_range[0] + scores.index(max(scores)) + 1
            topic_model = LdaMallet('/home/hadoop/Mallet-master/bin/mallet',
                                    corpus=corpus,
                                    num_topics=num_topics,
                                    id2word=dictionary,
                                    iterations=1000,
                                    prefix=f'{os.getcwd()}/models/MALLET/',
                                    random_seed=42)
            print(f"* Chosen Model with {num_topics} topics")

            with open('./models/MALLET/mallet_model.pkl', 'wb') as modelfile:
                topic_model.save(modelfile)
            with open('./models/MALLET/mallet_corpus.pkl', 'wb') as corpusfile:
                pickle.dump(corpus, corpusfile)
            with open('./models/MALLET/mallet_dict.pkl', 'wb') as dictfile:
                pickle.dump(dictionary, dictfile)

    df = get_topic_model_scores(df, topic_model, seen=seen)
    df.to_pickle('./data/tmp/scored.pkl')

    print("\nSample")
    print(df.head(), "\n")
def copy_image(self,
               image_id,
               destination_region,
               name,
               tags=None,
               description=None,
               wait_to_complete=300,
               encrypted=False):
    ec2_destination = Ec2(region=destination_region)
    ec2_destination_client = boto3.client("ec2", region_name=destination_region)

    args = {
        "SourceImageId": image_id,
        "SourceRegion": self.region,
        "Name": name,
        "Encrypted": encrypted
    }
    if description is not None:
        args["Description"] = description

    with Timer(timeout_seconds=wait_to_complete) as timer:
        # noinspection PyBroadException
        try:
            image_copy_id = ec2_destination_client.copy_image(**args)["ImageId"]
        except Exception:
            return None

        image_copy = ec2_destination.get_image(image_copy_id)

        # when not asked to wait, return whatever state the copy is in now
        if wait_to_complete == 0:
            if image_copy is not None:
                ec2_destination.create_tags(resource_ids=[image_copy["ImageId"]], tags=tags)
            return image_copy

        while True:
            if image_copy is not None and image_copy["State"] == "available":
                ec2_destination.create_tags(resource_ids=[image_copy["ImageId"]], tags=tags)
                return image_copy
            if timer.timeout:
                return None
            time.sleep(20)
            image_copy = ec2_destination.get_image(image_copy_id)
            if image_copy is not None and image_copy["State"] == "failed":
                return None
def create_image(self,
                 instance_id,
                 name,
                 tags=None,
                 description=None,
                 no_reboot=True,
                 wait_to_complete=600):
    args = {"InstanceId": instance_id, "Name": name, "NoReboot": no_reboot}
    if description is not None:
        args["Description"] = description

    with Timer(timeout_seconds=wait_to_complete) as timer:
        # noinspection PyBroadException
        try:
            image_id = self.ec2_client.create_image(**args)["ImageId"]
            image = self.get_image(image_id)

            if wait_to_complete == 0:
                self.create_tags([image_id], tags=tags)
                return self.get_image(image_id)

            while True:
                if image["State"] == "available":
                    self.create_tags([image_id], tags=tags)
                    # there may be a time lag between the image being created
                    # and it becoming visible in a new session
                    while True:
                        img = Ec2(self.region, session=boto3.Session()).get_image(image["ImageId"])
                        if img is not None:
                            return img
                        if timer.timeout:
                            raise Exception("Image created but not returned by describe function")
                        time.sleep(10)
                if timer.timeout:
                    return None
                time.sleep(20)
                image = self.get_image(image_id)
                if image["State"] == "failed":
                    return None
        except Exception as ex:
            print(ex)
            return None
def wait_until_cluster_status(self, db_cluster_id, status, timeout):
    if not isinstance(status, list):
        status = [status]
    with Timer(timeout_seconds=timeout) as t:
        while True:
            if t.timeout:
                return False
            time.sleep(15)
            current_status = self.get_cluster(cluster_id=db_cluster_id,
                                              tags=False).get("Status")
            if current_status in status:
                return True
def delete_cluster(self, db_cluster_id, timeout=60 * 15):
    cluster = self.get_cluster(cluster_id=db_cluster_id, tags=False)
    if cluster is None:
        return True

    # if a delete is already in progress, just wait for it to finish
    if cluster["Status"] == "deleting":
        with Timer(timeout_seconds=timeout) as t:
            while True:
                if t.timeout:
                    return False
                cluster = self.get_cluster(cluster_id=db_cluster_id, tags=False)
                if cluster is None:
                    return True
                time.sleep(15)

    # the cluster must be running before its member instances can be deleted
    self.start_cluster(db_cluster_id=db_cluster_id, timeout=timeout)
    for member_id in [m["DBInstanceIdentifier"]
                      for m in cluster.get("DBClusterMembers", [])]:
        self.delete_instance(member_id)

    self.rds_client.delete_db_cluster(DBClusterIdentifier=db_cluster_id,
                                      SkipFinalSnapshot=True)

    with Timer(timeout_seconds=timeout) as t:
        while True:
            if t.timeout:
                return False
            cluster = self.get_cluster(cluster_id=db_cluster_id, tags=False)
            if cluster is None:
                return True
            time.sleep(15)
def check_unreal(ltl_text, part_text, is_moore,
                 ltl_to_atm: LTLToAutomaton,
                 solver: SolverInterface,
                 max_k: int,
                 min_size, max_size,
                 opt_level=0) -> LTS:
    """
    Note that opt_level > 0 may introduce unsoundness: the check may report
    unrealizable although the specification is realizable.
    """
    timer = Timer()
    spec = parse_acacia_and_build_expr(ltl_text, part_text, ltl_to_atm, opt_level)

    logging.info("LTL formula size: %i", expr_size(spec.formula))

    timer.sec_restart()
    automaton = ltl_to_atm.convert(spec.formula)
    logging.info('(unreal) automaton size is: %i' % len(automaton.nodes))
    logging.debug('(unreal) automaton (dot) is:\n' + automaton_to_dot.to_dot(automaton))
    logging.debug('(unreal) automaton translation took (sec): %i' % timer.sec_restart())

    # note: inputs/outputs and machine type are reversed
    tau_desc = build_tau_desc(spec.outputs)
    desc_by_output = dict((i, build_output_desc(i, not is_moore, spec.outputs))
                          for i in spec.inputs)

    if max_k == 0:
        encoder = CoBuchiEncoder(automaton, tau_desc, spec.outputs,
                                 desc_by_output, range(max_size))
        model = model_searcher.search(min_size, max_size, encoder, solver)
    else:
        coreach_automaton = k_reduce(automaton, max_k)
        logging.info("(unreal) using CoReachEncoder")
        logging.info('(unreal) co-reachability automaton size is: %i' % len(coreach_automaton.nodes))
        logging.debug('(unreal) co-reachability automaton (dot) is:\n' +
                      automaton_to_dot.to_dot(coreach_automaton))
        encoder = CoreachEncoder(coreach_automaton, tau_desc, spec.outputs,
                                 desc_by_output, range(max_size), max_k)
        model = model_k_searcher.search(min_size, max_size, max_k, encoder, solver)

    logging.debug('(unreal) model_searcher.search took (sec): %i' % timer.sec_restart())
    return model
def delete_images(self, image_ids):
    for image_id in image_ids:
        snapshots = self.get_image_snapshots(image_id)
        self.deregister_image(image_id)
        if snapshots is not None:
            self.delete_snapshots(snapshots)
        # wait until the image is no longer returned before moving on
        with Timer(timeout_seconds=300) as timer:
            while True:
                image = self.get_image(image_id)
                if image is None:
                    break
                if timer.timeout:
                    raise Exception("Timeout deleting image {}".format(image_id))
                time.sleep(10)
def wait_until_not_longer_in_status(self, status, timeout=900):
    with Timer(timeout_seconds=timeout, start=True) as timer:
        while True:
            # refresh the status on every iteration
            current_status = self.get_stack_status()
            if isinstance(status, list):
                if current_status not in status:
                    break
            else:
                if current_status != status:
                    break
            if timer.timeout:
                raise Exception("Timeout waiting stack {} to get out of status {}".format(
                    self.stack_name, status))
            time.sleep(20)
def cache_ranks(cron_cursor):
    time = Timer()
    time.start()
    logger.info("Caching ranks... ")

    cron_cursor.execute("SELECT extID FROM users WHERE isBanned = 0 ORDER BY stars")
    Leaderboards = cron_cursor.fetchall()
    Leaderboards.reverse()

    Ranks.clear()
    UserRank = 0
    for User in Leaderboards:
        UserRank += 1
        Ranks[str(User[0])] = UserRank

    time.end()
    logger.info(f"Done! {time.ms_return()}ms")

def cache_comment_bans(cron_cursor):
    """Caches comment bans so a lookup doesn't have to be made."""
    time = Timer()
    time.start()
    logger.info("Caching comment bans...")

    timestamp = round(pytime.time())  # so expired bans don't get cached
    cron_cursor.execute(
        "SELECT accountID, endTimestamp, reason FROM commentbans WHERE endTimestamp > %s",
        (timestamp,))
    comment_bans = cron_cursor.fetchall()

    CommentBanCache.clear()
    for ban in comment_bans:
        CommentBanCache[ban[0]] = {
            "end_time": ban[1],
            "reason": ban[2]
        }

    time.end()
    logger.info(f"Done with {len(comment_bans)} comment bans cached! {time.ms_return()}ms")

def cron_thread():
    Log("Cron thread started!")
    time = Timer()
    while True:
        Log("Running cron!")
        time.start()
        cron_cursor = mydb.cursor()  # create a cursor specifically for cron jobs
        cache_user_ids(cron_cursor)
        cache_ranks(cron_cursor)
        calculate_cp(cron_cursor)
        max_star_count_ban(cron_cursor)
        cache_comment_bans(cron_cursor)
        cache_server_stats(cron_cursor)
        cron_cursor.close()  # close it after all jobs are done
        Log(f"Cron done! Took {round(time.end(), 2)}s")
        pytime.sleep(UserConfig["CronThreadDelay"])
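# The three cron helpers above use a stopwatch-style ``Timer`` with
# ``start()``, ``end()`` and ``ms_return()``, which is not included in this
# section. The sketch below is a hypothetical reconstruction that matches the
# observed usage, not the original implementation.
import time as pytime


class Timer:
    def __init__(self):
        self._start = None
        self._elapsed = 0.0

    def start(self):
        self._start = pytime.time()

    def end(self):
        # returns elapsed seconds, matching ``round(time.end(), 2)`` above
        self._elapsed = pytime.time() - self._start
        return self._elapsed

    def ms_return(self):
        # elapsed milliseconds, rounded for log output
        return round(self._elapsed * 1000, 2)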
def copy_snapshot(self,
                  snapshot_id,
                  destination_region,
                  tags=None,
                  description=None,
                  wait_to_complete=300):
    ec2_destination = Ec2(region=destination_region)
    ec2_destination_client = boto3.client("ec2", region_name=destination_region)

    args = {"SourceSnapshotId": snapshot_id, "SourceRegion": self.region}
    if description is not None:
        args["Description"] = description

    with Timer(timeout_seconds=wait_to_complete) as timer:
        # retry the copy request while the snapshot request limit is exceeded
        while True:
            try:
                if timer.timeout:
                    return None
                snapshot_copy_id = ec2_destination_client.copy_snapshot(**args)["SnapshotId"]
                break
            except Exception as ex:
                if self.snapshot_request_limit_exceeded(ex):
                    time.sleep(20)

        snapshot = ec2_destination.get_snapshot(snapshot_copy_id)

        # when not asked to wait, return whatever state the copy is in now
        if wait_to_complete == 0:
            if snapshot is not None:
                ec2_destination.create_tags(resource_ids=[snapshot["SnapshotId"]], tags=tags)
            return snapshot

        while True:
            if snapshot["State"] == "completed":
                ec2_destination.create_tags(resource_ids=[snapshot["SnapshotId"]], tags=tags)
                return snapshot
            if timer.timeout:
                return None
            time.sleep(20)
            snapshot = ec2_destination.get_snapshot(snapshot_copy_id)
            if snapshot["State"] == "error":
                return None
def terminate_instance(self, instance_id, wait_to_complete=300):
    status = self.get_instance_status(instance_id)
    if status in [None, "terminated"]:
        return True

    self.ec2_client.terminate_instances(InstanceIds=[instance_id])
    time.sleep(15)

    with Timer(timeout_seconds=wait_to_complete) as timer:
        while True:
            status = self.get_instance_status(instance_id)
            if status == "terminated":
                return True
            if timer.timeout:
                return False
            time.sleep(10)