def multiprocess_overlapping_cases(data: pd.DataFrame, job_num: int):
    """
    Compute parallel executed case instances in given dataset. The computation
    will be split into the given number of jobs.

    :param data: case log
    :param job_num: number of jobs
    """
    print_time('parallel cases %s' % len(data))
    # get split points
    steps = get_steps(data, job_num)
    print(steps)

    jobs = []
    out_q = Queue()

    # start all jobs
    for idx, r in enumerate(steps):
        p = Process(target=overlapping_cases, args=(data, idx + 1, r, out_q))
        jobs.append(p)
        p.start()

    # collect results
    res = {}
    for i in range(len(steps)):
        res.update(out_q.get())

    # collect processes
    for job in jobs:
        job.join()

    # update case log
    for k, v in res.items():
        data.at[k, 'overlapping_cases'] = v
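# Note: `get_steps` is referenced above but its definition is not included in this
# snippet. As a rough, hypothetical sketch (not the original implementation), each step
# could be a [start, end) positional slice of the DataFrame, matching how the workers
# later index `data[range[0]:range[1]]`:
import math

import pandas as pd


def get_steps_sketch(data: pd.DataFrame, job_num: int) -> list:
    """Split the row range of `data` into roughly equal [start, end) slices."""
    size = len(data)
    chunk = max(1, math.ceil(size / job_num))
    return [[start, min(start + chunk, size)] for start in range(0, size, chunk)]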
def _optimize_model(self, arg_values: argparse.Namespace) \
        -> Iterable[GoalEncState]:
    with print_time("Loading data", guard=arg_values.verbose):
        if arg_values.start_from:
            _, (arg_values, unparsed_args, (metadata, state)) = \
                torch.load(arg_values.start_from)
            _, tokenized_goals, outputs = \
                goals_to_total_distances_tensors_with_meta(
                    extract_dataloader_args(arg_values),
                    str(arg_values.scrape_file), metadata)
        else:
            metadata, tokenized_goals, outputs = \
                goals_to_total_distances_tensors(
                    extract_dataloader_args(arg_values),
                    str(arg_values.scrape_file))
    with print_time("Converting data to tensors", guard=arg_values.verbose):
        tensors = [pad_sequence([torch.LongTensor(tok_goal)
                                 for tok_goal in tokenized_goals],
                                batch_first=True),
                   torch.FloatTensor(outputs)]
    with print_time("Building the model", guard=arg_values.verbose):
        model = self._get_model(arg_values, goal_enc_get_num_tokens(metadata))
    if arg_values.start_from:
        self.load_saved_state(arg_values, unparsed_args, state)
    return ((metadata, state) for state in
            optimize_checkpoints(tensors, arg_values, model,
                                 lambda batch_tensors, model:
                                 self._get_batch_prediction_loss(arg_values,
                                                                 batch_tensors,
                                                                 model)))
def parallel_activities(data: pd.DataFrame, delta: datetime.timedelta,
                        instance, range: list, out_q: Queue):
    """
    This function is called by multiprocess_parallel_activities(). It computes
    the number of parallel executed activity instances in a given subset.

    :param data: dataset
    :param delta: time interval
    :param instance: current instance
    :param range: subset
    :param out_q: output queue
    """
    # total number of activity instances
    size = len(data)
    # collect results
    r = {}
    for idx, row in data[range[0]:range[1]].iterrows():
        exclude_cases = data.loc[(data['start'] > row['end'] + delta)
                                 | (data['end'] < row['start'] - delta)]
        counter = size - len(exclude_cases)
        r[idx] = counter
    print_time('instance: %s' % instance, False)
    out_q.put(r)
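# Hedged illustration (toy data, not part of the original module): for a given row, the
# exclusion filter above keeps only instances whose [start, end] window misses
# [row.start - delta, row.end + delta] entirely, so `size - len(exclude_cases)` counts
# the overlapping instances, including the row itself.
import datetime

import pandas as pd

toy = pd.DataFrame({
    'start': pd.to_datetime(['2021-01-01 08:00', '2021-01-01 09:00', '2021-01-05 08:00']),
    'end':   pd.to_datetime(['2021-01-01 10:00', '2021-01-01 11:00', '2021-01-05 09:00']),
})
delta = datetime.timedelta(hours=1)
row = toy.loc[0]
exclude = toy.loc[(toy['start'] > row['end'] + delta) | (toy['end'] < row['start'] - delta)]
print(len(toy) - len(exclude))  # 2: rows 0 and 1 overlap, row 2 does not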
def get_tokens(args: List[str]):
    parser = argparse.ArgumentParser(description="Pick a set of tokens")
    parser.add_argument("--type", choices=["mixed"], default="mixed")
    parser.add_argument("-v", "--verbose", action='count', default=0)
    parser.add_argument("-n", "--num-keywords", type=int, default=120)
    parser.add_argument("-s", "--num-samples", type=int, default=2000)
    parser.add_argument("-j", "--num-threads", type=int, default=None)
    parser.add_argument("scrapefile", type=Path2)
    parser.add_argument("dest")
    arg_values = parser.parse_args(args)

    with print_time("Reading scraped data", guard=arg_values.verbose):
        raw_data = list(data.read_text_data(arg_values.scrapefile))

    embedding = SimpleEmbedding()
    subset = data.RawDataset(random.sample(raw_data, arg_values.num_samples))
    relevance_pairs = [
        (context.focused_goal,
         embedding.encode_token(serapi_instance.get_stem(tactic)))
        for relevant_lemmas, prev_tactics, context, tactic in subset
    ]

    with print_time("Calculating keywords", guard=arg_values.verbose):
        keywords = get_relevant_k_keywords2(relevance_pairs,
                                            arg_values.num_keywords,
                                            arg_values.num_threads)

    with (open(arg_values.dest, mode='w') if arg_values.dest != "-"
          else contextlib.nullcontext(sys.stdout)) as f:
        for keyword in keywords:
            f.write(keyword + "\n")
def train(self, checkpoint_path=None, weights_only=False):
    print('Starting training')
    print(datetime.datetime.now())
    start_time = datetime.datetime.now()

    # Load pretrained parameters if desired
    if checkpoint_path is not None:
        self.load_checkpoint(checkpoint_path, weights_only)
        if weights_only:
            self.initialize_visualizations()
    else:
        # Initialize any training visualizations
        self.initialize_visualizations()

    # Train for specified number of epochs
    for self.epoch in range(self.epoch, self.num_epochs):
        epoch_start_time = datetime.datetime.now()

        # Increment the LR scheduler
        if self.scheduler is not None:
            self.scheduler.step()

        # Run an epoch of training
        self.train_one_epoch()

        epoch_end_time = datetime.datetime.now()
        total_seconds = (epoch_end_time - epoch_start_time).seconds
        util.print_time('Epoch', total_seconds)

        if self.epoch % self.validation_freq == 0:
            self.validate()
            if self.lin_rms_sq_error_meter.avg <= self.lin_rms_sq_error \
                    and self.loss.avg <= self.loss_error:
                self.save_checkpoint()
                self.lin_rms_sq_error = self.lin_rms_sq_error_meter.avg
                self.loss_error = self.loss.avg
            self.visualize_metrics()

    end_time = datetime.datetime.now()
    seconds = (end_time - start_time).seconds
    util.print_time('Training', seconds)
def set_activity_instances(df: pd.DataFrame):
    """
    This function maps events and activity instances based on a first-come,
    first-served approach.

    :param df: event log
    """
    print_time('set activity instance')
    end_transitions = [
        'autoskip', 'manualskip', 'complete', 'withdraw', 'ate_abort', 'pi_abort'
    ]
    trace = df.loc[0, 'caseID']
    activity = 0
    instances = {}
    for idx, row in df.iterrows():
        # reset open instances at each new case
        if row['caseID'] != trace:
            trace = row['caseID']
            instances = {}
        # first occurrence of an activity name opens a new instance
        if row['name'] not in instances.keys():
            activity += 1
            instances[row['name']] = activity
        df.at[idx, 'activity_instance'] = instances[row['name']]
        # an end transition closes the open instance for this activity name
        if row['transition'].lower() in end_transitions:
            instances.pop(row['name'], None)
    print_time('set activity instance', False)
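# Hedged toy example (assumes this module's print_time is importable; not part of the
# original file): for a single case whose 'A' events arrive as
# start, start, complete, start, complete, the first-come, first-served rule above
# assigns activity instances 1, 1, 1, 2, 2: the first 'complete' closes instance 1,
# so the following 'start' opens instance 2.
if __name__ == '__main__':
    toy = pd.DataFrame({
        'caseID': ['c1'] * 5,
        'name': ['A'] * 5,
        'transition': ['start', 'start', 'complete', 'start', 'complete'],
    })
    set_activity_instances(toy)
    print(toy['activity_instance'].tolist())  # [1.0, 1.0, 1.0, 2.0, 2.0]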
def import_report(file: str):
    """
    This function is used to import a conformance report.

    :param file: conformance report
    :return: pandas DataFrame, containing raw cost per case
    """
    print_time('Import report')
    df = pd.read_csv(file, sep=',', header=[0])
    costs = []
    cases = []
    for idx, row in enumerate(df.loc[:, 'Case IDs']):
        raw_cost = df.loc[idx, 'Raw Fitness Cost']
        # update alignment id in case table
        for case in df.loc[idx, 'Case IDs'].split('|'):
            cases.append(case)
            costs.append(float(raw_cost))
    print_time('Import report', False)
    df = pd.DataFrame({'case': cases, 'response': costs})
    return df
def verify_candidates(candidates, user_movies_matrix, start_time):
    print("\nVerifying candidates...")
    count = 0
    print("Number of buckets in total: " + str(len(candidates)))
    for cnr, candidate_group in enumerate(candidates):
        # print("Number of candidates in bucket " + str(cnr) + ": " + str(len(candidate_group)))
        for cnr1, candidate1 in enumerate(candidate_group):
            for cnr2 in range(cnr1 + 1, len(candidate_group)):
                candidate2 = list(candidate_group)[cnr2]
                jsim = sim.jaccard(user_movies_matrix[candidate1],
                                   user_movies_matrix[candidate2])
                if jsim >= 0.50:
                    print("Number of candidates in bucket " + str(cnr) + ": "
                          + str(len(candidate_group)))
                    count = count + 1
                    print((candidate1, candidate2))
                    print("Similarity: " + str(
                        sim.jaccard(user_movies_matrix[candidate1],
                                    user_movies_matrix[candidate2])))
                    print("Found until now: " + str(count))
                    util.print_time(start_time)
                    print()
        # print()
    print(count)
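# `sim.jaccard` is called above but not defined in this snippet. A hypothetical stand-in
# (an assumption, not the original implementation), treating each user row as a set-like
# collection of movie ids, might be:
def jaccard_sketch(a, b) -> float:
    """Jaccard similarity |A & B| / |A | B| of two collections of movie ids."""
    sa, sb = set(a), set(b)
    if not sa and not sb:
        return 0.0
    return len(sa & sb) / len(sa | sb)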
def open_page(self, url, wait_time=0):
    print_time(f"Opening url {url}")
    self.driver.get(url)
    if (wait_time > 0):
        self.driver.implicitly_wait(wait_time)
    print_time("Page loaded")
def multiprocess_overlapping_events(log_data: pd.DataFrame, job_num: int):
    """
    Compute parallel executed events of given dataset. The computation will be
    split into the given number of jobs.

    :param log_data: event log
    :param job_num: number of jobs
    """
    print_time('parallel events')
    # get all event names
    event_names = log_data['name'].unique()
    # convert variable type of timestamp
    log_data['timestamp'] = pd.to_datetime(log_data.loc[:, 'timestamp'],
                                           format='%Y-%m-%d %H:%M:%S')
    # iterate through all event names
    for e in event_names:
        print_time('calculate parallel events for %s' % e)
        # get subset
        sub_data = log_data.loc[log_data['name'] == e]
        sub_data = sub_data.sort_values(by=['timestamp'])
        steps = get_steps(sub_data, job_num)
        print(steps)

        # collect jobs and results
        jobs = []
        out_q = Queue()
        # set time interval (= theta)
        delta = datetime.timedelta(days=1)

        # start different jobs
        for idx, r in enumerate(steps):
            p = Process(target=overlapping_events,
                        args=(sub_data, delta, idx + 1, r, out_q))
            jobs.append(p)
            p.start()

        # collect results
        res = {}
        for i in range(len(steps)):
            res.update(out_q.get())

        # collect processes
        for job in jobs:
            job.join()

        for k, v in res.items():
            log_data.at[k, 'parallel_events'] = v
def multiprocess_parallel_event_sets(data: pd.DataFrame, job_num: int):
    """
    Compute parallel executed event sets of given dataset. The computation will
    be split into the given number of jobs.

    :param data: event log
    :param job_num: number of jobs
    """
    print_time('parallel event sets')
    data['parallel_sets'] = [0] * len(data)
    set_names = data['set_name'].unique()
    print('Set names: %s' % set_names)
    # iterate through sets
    for s in set_names:
        print_time('calculate parallel events sets for set_%s' % s)
        sub_data = data.loc[data['set_name'] == s]
        # get split points
        steps = get_steps(sub_data, job_num)
        print(steps)

        # collect jobs and results
        jobs = []
        out_q = Queue()
        # set time interval (= theta)
        delta = datetime.timedelta(days=1)

        # start jobs
        for idx, r in enumerate(steps):
            p = Process(target=parallel_event_sets,
                        args=(sub_data, delta, idx + 1, r, out_q))
            jobs.append(p)
            p.start()

        # collect results
        res = {}
        for i in range(len(steps)):
            res.update(out_q.get())

        # collect processes
        for job in jobs:
            job.join()

        # update data
        for k, v in res.items():
            data.at[k, 'parallel_sets'] = v
def build_tag_to_pos(Y):
    tag_to_pos = {}
    i = 0
    print_time("building build_tag_to_pos...")
    for s in Y:
        for t in s:
            if t not in tag_to_pos:
                tag_to_pos[t] = i
                i += 1
    pos_to_tag = {v: k for k, v in tag_to_pos.items()}
    return tag_to_pos, pos_to_tag
def reinforce_training_worker(args: argparse.Namespace,
                              initial_buffer_size: int,
                              lock: Lock,
                              namespace: multiprocessing.managers.Namespace,
                              samples: Queue[LabeledTransition]):
    last_trained_at = 0
    samples_retrieved = 0
    memory: List[LabeledTransition] = []
    while True:
        if samples_retrieved - last_trained_at < args.train_every_min:
            next_sample = samples.get()
            memory.append(next_sample)
            samples_retrieved += 1
            continue
        else:
            try:
                next_sample = samples.get(timeout=.01)
                memory.append(next_sample)
                samples_retrieved += 1
                if samples_retrieved - last_trained_at > args.train_every_max:
                    eprint("Forcing training", guard=args.verbose >= 2)
                else:
                    continue
            except queue.Empty:
                pass
        if len(memory) > args.buffer_max_size:
            memory = random.sample(memory,
                                   args.buffer_max_size - args.train_every_max)
            # del memory[0:args.train_every_max+1]
        if samples_retrieved - last_trained_at >= args.train_every_min:
            last_trained_at = samples_retrieved
            transition_samples = sample_batch(memory, args.batch_size)
            with lock:
                eprint(f"Locked in training thread for {len(memory)} samples",
                       guard=args.verbose >= 2)
                q_estimator = namespace.estimator
                predictor = namespace.predictor
                with print_time("Assigning scores", guard=args.verbose >= 2):
                    training_samples = assign_scores(args,
                                                     q_estimator,
                                                     predictor,
                                                     transition_samples)
                with print_time("Training", guard=args.verbose >= 2):
                    q_estimator.train(training_samples,
                                      show_loss=args.show_loss)
                q_estimator.save_weights(args.out_weights, args)
                namespace.estimator = q_estimator
            eprint("Unlocked in training thread", guard=args.verbose >= 2)
    pass
def build_word_to_pos(X, padding="EOC"):
    word_to_pos = {}
    word_to_pos[padding] = 0
    i = 1
    print_time("building build_word_to_pos...")
    for s in X:
        for w in s:
            if w not in word_to_pos:
                word_to_pos[w] = i
                i += 1
    pos_to_word = {v: k for k, v in word_to_pos.items()}
    return word_to_pos, pos_to_word
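# Hedged usage sketch (toy data, not from the original project; assumes print_time is
# available): both helpers build first-seen-order index maps over nested sequences.
if __name__ == '__main__':
    X = [["the", "dog", "barks"], ["the", "cat"]]
    word_to_pos, pos_to_word = build_word_to_pos(X)
    print(word_to_pos)     # {'EOC': 0, 'the': 1, 'dog': 2, 'barks': 3, 'cat': 4}
    print(pos_to_word[2])  # dog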
def fit(self, dataset, num_epochs, tolerance=0.0001):
    """
    Parameters
    ----------
    dataset: Dataset with the sequences and tags
    num_epochs: int
        Number of epochs that the model will be trained

    Returns
    -------
    Nothing. The method only changes self.parameters.
    """
    print_time("Starting training...")
    self.tolerance = tolerance
    if self.fitted:
        print("\n\tWarning: Model already trained")

    if len(self.acc_per_epoch) == 0:
        prev_acc = 0
    else:
        prev_acc = self.acc_per_epoch[-1]

    for epoch in range(num_epochs):
        acc = self.fit_epoch(dataset)
        print_time("Epoch: %i Accuracy: %f" % (epoch, acc))
        self.acc_per_epoch.append(acc)
        if abs(acc - prev_acc) < self.tolerance:
            print("Stopped by tolerance!")
            break
        prev_acc = acc

    if self.averaged:
        new_w = 0
        for old_w in self.params_per_epoch:
            new_w += old_w
        new_w /= len(self.params_per_epoch)
        self.parameters = new_w

    self.fitted = True
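# Hedged aside (not from the original project): the `averaged` branch above is plain
# parameter averaging, i.e. the final weights are the mean of the per-epoch weight
# vectors. With numpy arrays the same computation is:
import numpy as np

params_per_epoch = [np.array([1.0, 2.0]), np.array([3.0, 4.0]), np.array([5.0, 6.0])]
averaged = sum(params_per_epoch) / len(params_per_epoch)
print(averaged)  # [3. 4.]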
def multiprocess_parallel_activities(data: pd.DataFrame, job_num: int):
    """
    Compute parallel executed activity instances in given dataset. The
    computation will be split into the given number of jobs.

    :param data: activity log
    :param job_num: number of jobs
    """
    print_time('parallel activities')
    activity_name = data['name'].unique()
    # compute for all activities
    for a in activity_name:
        print_time('calculate parallel activities for activity %s' % a)
        # get subset
        sub_data = data.loc[data['name'] == a]
        steps = get_steps(sub_data, job_num)
        print(steps)

        jobs = []
        out_q = Queue()
        # set time interval (=theta)
        delta = datetime.timedelta(days=1)

        # start all jobs
        for idx, r in enumerate(steps):
            p = Process(target=parallel_activities,
                        args=(sub_data, delta, idx + 1, r, out_q))
            jobs.append(p)
            p.start()

        # collect results
        res = {}
        for i in range(len(steps)):
            res.update(out_q.get())

        # collect processes
        for job in jobs:
            job.join()

        # update DataFrame
        for k, v in res.items():
            data.at[k, 'parallel_activities'] = v
def print_report(nr_found, nr_buckets, start_time, matrix_shape, k, b, r, seed):
    print("______________________________________________\n")
    print("Final Report")
    print("------------\n")
    print("Found pairs in total: " + str(nr_found))
    print("Number of buckets in total: " + str(nr_buckets))
    util.print_time(start_time)
    print()
    print("For:")
    print('Users: {1} | Movies: {0}'.format(str(matrix_shape[0]),
                                            str(matrix_shape[1])))
    print()
    print("With parameters:")
    print('Sig. Length: {0} | Bands: {1} | Rows: {2} | Seed: {3}'.format(
        k, b, r, seed))
def callback(self):
    print('==')
    global first
    if self.img_ready():
        with print_time('get_img'):
            surface, img_file = self.get_img()
        with print_time('gui'):
            self.set_display(surface)
            self.draw_line((self.xmid_px, 0), (self.xmid_px, self.height_px))
        if do_face:
            with print_time('get_face'):
                result = self.get_face(img_file)
            if result:
                (mouth_x, mouth_y), depth = result
                if first:
                    print(f'Field of view: {self.width_px}x{self.height_px} px')
                    print(f'Field of view: '
                          f'{self.width_deg:.0f}x{self.height_deg:.0f} deg')
                    print(f'degrees: '
                          f'{self.deg_per_px_x:.3f} {self.deg_per_px_y:.3f}')
                    first = False
                self.draw_dot((mouth_x, mouth_y))
                with print_time('do_servo'):
                    if do_servo:
                        altitude = (self.ymid_px - mouth_y) * self.deg_per_px_y
                        self.turn(self.xmid_px - mouth_x)
                        self.aim(altitude, depth)
                        self.got_target()
                        if do_shoot:
                            self.maybe_fire()
            else:
                self.cancel_target()
    else:
        print('image not ready')
def extract_data_from_table(self, template, pop=[], remove_last=False):
    if (not template):
        raise Exception("No template added to parser")
    print_time("Extracting data from table")
    data = []
    rows = self.soup.find_all("tr")
    if (len(pop) > 0):
        # list.sort() returns None, so build the sorted index list explicitly
        sorted_pop = sorted(pop, reverse=True)
        for index in sorted_pop:
            print_time(f"Removing row by index {index}")
            rows.pop(index)  # Remove table header
    if (remove_last):
        print_time(f"Removing last element")
        rows.pop()
    current = 1
    log_counter = 0
    elements = len(rows)
    # Iterate over every row in the table
    for row in rows:
        log_counter += 1
        if (log_counter == self.log_each_n or current == elements):
            print_time(f"Parsing row {current} of {elements}")
            log_counter = 0
        new_dict = {}
        columns = row.find_all("td")
        # check the template type itself instead of comparing it to the string "dict"
        if (isinstance(template, dict)):
            for key, value in template.items():
                new_dict[value] = columns[key].text
        else:
            for index, value in enumerate(template):
                new_dict[value] = columns[index].text
        data.append(new_dict)
        current += 1
    print_time("Parser complete")
    return data
def _optimize_model(self, arg_values: argparse.Namespace) \
        -> Iterable[FeaturesDNNEvaluatorState]:
    with print_time("Loading data", guard=arg_values.verbose):
        if arg_values.start_from:
            _, (arg_values, unparsed_args,
                (picklable_token_map, state)) = \
                torch.load(arg_values.start_from)
            token_map = tmap_from_picklable(picklable_token_map)
            _, word_features_data, vec_features_data, outputs, \
                word_features_vocab_sizes, vec_features_size = \
                features_to_total_distances_tensors_with_map(
                    extract_dataloader_args(arg_values),
                    str(arg_values.scrape_file), token_map)
        else:
            token_map, word_features_data, vec_features_data, outputs, \
                word_features_vocab_sizes, vec_features_size = \
                features_to_total_distances_tensors(
                    extract_dataloader_args(arg_values),
                    str(arg_values.scrape_file))
    # eprint(f"word data: {word_features_data[:10]}")
    # eprint(f"vec data: {vec_features_data[:10]}")
    # eprint(f"outputs: {outputs[:100]}")
    with print_time("Converting data to tensors", guard=arg_values.verbose):
        tensors = [torch.LongTensor(word_features_data),
                   torch.FloatTensor(vec_features_data),
                   torch.FloatTensor(outputs)]
    with print_time("Building the model", guard=arg_values.verbose):
        model = self._get_model(arg_values,
                                word_features_vocab_sizes,
                                vec_features_size)
    if arg_values.start_from:
        self.load_saved_state(arg_values, unparsed_args, state)
    return ((tmap_to_picklable(token_map), state)
            for state in optimize_checkpoints(
                tensors, arg_values, model,
                lambda batch_tensors, model:
                self._get_batch_prediction_loss(arg_values,
                                                batch_tensors,
                                                model)))
def mark_outlier(data: pd.DataFrame, base_path: str):
    """
    This function calls the LOF computation for each feature.

    :param data: case log
    :param base_path: path to save plots
    """
    print_time('events: mark outlier')
    columns = []
    for c in list(data.columns):
        # ignore selected columns
        if 'case' in c or 'involved' in c:
            pass
        else:
            columns.append(c)
    # call LOF computation
    for c in columns:
        do_LOF(data, c, base_path, plot=False)
def mark_outlier(data: pd.DataFrame, base_path: str):
    """
    This function calls the LOF computation for each feature.

    :param data: case log
    :param base_path: path to save plots
    """
    print_time('case mark outlier')
    columns = []
    for c in list(data.columns):
        # keep only selected columns
        if 'case_duration' in c or 'overlapping_cases' in c:
            columns.append(c)
        else:
            pass
    # call LOF computation
    for c in columns:
        do_LOF(data, c, base_path, plot=True)
def verify_partial_candidates(candidate_group, user_movies_matrix, bucket_nr,
                              nr_found, start_time):
    for cnr1, candidate1 in enumerate(candidate_group):
        for cnr2 in range(cnr1 + 1, len(candidate_group)):
            candidate2 = list(candidate_group)[cnr2]
            jsim = sim.jaccard(user_movies_matrix[candidate1],
                               user_movies_matrix[candidate2])
            if jsim >= 0.50:
                pair = sorted((candidate1, candidate2))
                data.save_pair(pair)
                print("\tFound similar pair: " + str(pair))
                print("\tSimilarity: " + str(
                    sim.jaccard(user_movies_matrix[candidate1],
                                user_movies_matrix[candidate2])))
                print("\tBucket number: " + str(bucket_nr))
                print("\tNumber of candidates in the bucket: "
                      + str(len(candidate_group)))
                nr_found[0] = nr_found[0] + 1
                print("\tFound until now: " + str(nr_found[0]))
                util.print_time(start_time, "\t")
                print()
def find_cluster(data: pd.DataFrame, base_path: str, plot: bool):
    """
    This function calls the k-means clustering for all features.

    :param data: case log
    :param base_path: path to save plots and cluster information
    :param plot: whether to generate plots or not
    """
    print_time('case kmeans')
    columns = []
    for c in list(data.columns):
        # keep only selected columns
        if 'case_duration' in c or 'overlapping_cases' in c:
            columns.append(c)
        else:
            pass
    # call k-means computation
    for c in columns:
        do_kmeans(data, c, base_path, plot)
def find_cluster(data: pd.DataFrame, base_path: str, plot: bool):
    """
    This function calls the k-means clustering for all features.

    :param data: case log
    :param base_path: path to save plots and cluster information
    :param plot: whether to generate plots or not
    """
    print_time('activities kmeans')
    columns = []
    for c in list(data.columns):
        # ignore selected columns
        if 'case' in c or 'involved' in c or 'weekday' in c or 'weekend' in c \
                or 'start_am' in c:
            pass
        else:
            columns.append(c)
    # call k-means computation
    for c in columns:
        do_kmeans(data, c, base_path, plot)
def run(bolt_path, plink_path, bfile, num_people, pheno_path, pheno_col, out_id):
    info = (f'=> generating subset individual data\n'
            f'bolt_path: {bolt_path}\n'
            f'plink_path: {plink_path}\n'
            f'bfile: {bfile}\n'
            f'num_people: {num_people}\n'
            f'pheno_filename: {pheno_path}\n'
            f'pheno_col: {pheno_col}\n'
            f'out: {out_id}\n')
    print(info)
    sys.stdout.flush()

    file_cache_out_path = os.path.join('file_cache', f'{out_id}_{num_people}')
    _, pheno_temp = generate_subset(plink_path=plink_path,
                                    bfile=bfile,
                                    num_people=num_people,
                                    out=file_cache_out_path,
                                    pheno_path=pheno_path)
    print(f'subset pheno file: {pheno_temp}')

    print('=> assigning the SNP components by chromosome')
    sys.stdout.flush()
    snp_assignment_filename = file_cache_out_path + '.snps_assignment'
    partition(file_cache_out_path + '.bim', snp_assignment_filename)

    print('=> running BOLT-REML')
    print_time()
    sys.stdout.flush()
    dt = bench_bolt_reml(bolt_path, snp_assignment_filename,
                         file_cache_out_path + '.bed',
                         file_cache_out_path + '.bim',
                         file_cache_out_path + '.fam',
                         pheno_temp, pheno_col)

    log_path_prefix = os.path.join('output', f'{out_id}_{num_people}')
    print(f'log_path_prefix: {log_path_prefix}')
    with open(f'{log_path_prefix}.bench', 'w') as file:
        file.write(info)
        file.write(f'BOLT-REML took {dt} sec\n')
    print_time()
def test():
    if (len(sys.argv) < 3):
        raise Exception("Script must be called with two arguments, the path to "
                        "chromedriver and the path to firebase config")
    chromedriver = sys.argv[1]
    elapsed = Elapsed()
    scraper = Scraper(chromedriver, headless=True)
    test_url = "https://96hpr.csb.app"
    try:
        scraper.open_page(test_url)
        html = scraper.get_outerhtml(By.XPATH, "/html/body/div/div/table/tbody")
        parsed = Parser(html, log_each_n=10)
        template = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L",
                    "M", "N", "O"]
        parsed.extract_data_from_table(template, [0], True)
        print_time(f"Extracted data")
    finally:
        scraper.close()
        elapsed.end()
def multiprocess_aggregation(data: pd.DataFrame, job_num: int):
    """
    Aggregate enriched event log to case log. The computation will be split into
    the given number of jobs.

    :param data: event log
    :param job_num: number of jobs
    :return: case log
    """
    print_time('aggregate events')
    print('Length dataset %s' % len(data))
    case_ids = data['caseID'].unique()
    steps = get_steps_seq(case_ids, data, job_num)
    print(steps)
    events = data['name'].unique()
    # feature types
    feature = [
        'abs_lag', 'pre_lag', 'post_lag', 'weekday', 'weekend', 'parallel_events'
    ]

    # blueprint to collect single events
    case_tmp = {}
    for e in events:
        case_tmp[e] = {'counter': 0}
        for f in feature:
            case_tmp[e][f] = 0

    jobs = []
    out_q = Queue()

    # start all jobs
    for idx, r in enumerate(steps):
        p = Process(target=aggregate, args=(data, case_tmp, r, out_q, idx))
        jobs.append(p)
        p.start()

    # collect results
    res = {}
    for i in range(len(steps)):
        res.update(out_q.get())

    for job in jobs:
        job.join()

    print_time('aggregate events', start=False)
    print_time('merge results')
    # return case log
    return pd.DataFrame(list(res.values()))
def multiprocess_aggregation(data: pd.DataFrame, job_num: int):
    """
    Aggregate enriched activity log to case log. The computation will be split
    into the given number of jobs.

    :param data: enriched activity log
    :param job_num: number of jobs
    :return: case log
    """
    print_time('aggregate activities')
    case_ids = data['caseID'].unique()
    steps = get_steps_seq(case_ids, data, job_num)
    # prepare names
    activity_names = data['name'].unique()
    activity_names = ['_'.join(['act', str(i)]) for i in activity_names]
    feature = [
        'abs_lag', 'duration', 'start_am', 'weekday', 'parallel_activities'
    ]

    # generate blueprint
    case_tmp = {}
    for a in activity_names:
        case_tmp[a] = {'counter': 0}
        for f in feature:
            case_tmp[a][f] = 0

    jobs = []
    out_q = Queue()

    # start all jobs
    for idx, r in enumerate(steps):
        p = Process(target=aggregate, args=(data, case_tmp, r, out_q, idx))
        jobs.append(p)
        p.start()

    # merge jobs and collect results
    res = {}
    for i in range(len(steps)):
        res.update(out_q.get())

    for job in jobs:
        job.join()

    print_time('aggregate activities', start=False)
    print_time('merge results')
    return pd.DataFrame(list(res.values()))
def multiprocess_time_feature(log_data: pd.DataFrame, job_num: int):
    """
    Compute basic features for given dataset. The computation will be split into
    the given number of jobs.

    :param log_data: event log
    :param job_num: number of jobs
    """
    print_time('time features')
    case_ids = log_data['caseID'].unique()
    # convert variable type of timestamp
    log_data['timestamp'] = pd.to_datetime(log_data.loc[:, 'timestamp'],
                                           utc=True,
                                           format='%Y-%m-%d %H:%M:%S')
    # get split points
    steps = get_steps_seq(case_ids, log_data, job_num)

    jobs = []
    out_q = Queue()

    # start jobs
    for idx, r in enumerate(steps):
        p = Process(target=multi_time_feature, args=(log_data, r, out_q, idx))
        jobs.append(p)
        p.start()

    # collect results
    res = {}
    for i in range(len(steps)):
        res.update(out_q.get())

    for job in jobs:
        job.join()

    print_time('time features', start=False)
    print_time('merge results')
    # add features to DataFrame
    for idx, data in res.items():
        for attr, v in data.items():
            log_data.at[idx, attr] = v
import argparse
import sys
import time

import numpy as np
from sklearn.datasets import dump_svmlight_file, load_svmlight_file

from util import print_time


def main():
    parser = argparse.ArgumentParser(
        description="Merge multiple svm format features.")
    parser.add_argument("-i", nargs="*", required=True, dest="input_filename",
                        help="Specify input file path (accept multiple inputs)")
    parser.add_argument("-o", required=True, dest="output_filename",
                        help="Specify output file path")
    opts = parser.parse_args(sys.argv[1:])

    all_X = []
    for fileName in opts.input_filename:
        print("Loading " + fileName + " ...")
        X, y = load_svmlight_file(fileName)
        print(X.shape)
        all_X.append(X.todense())
    X = np.concatenate(all_X, axis=1)

    print("Saving " + opts.output_filename + " ...")
    print(X.shape)
    dump_svmlight_file(X, y, opts.output_filename)


if __name__ == "__main__":
    ts = time.time()
    main()
    te = time.time()
    print_time(ts, te)
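# Hypothetical invocation (the script and file names are placeholders, not from the
# original project):
#   python merge_features.py -i features_a.svm features_b.svm -o merged.svm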
    psi_ez_x[-1,:-1,:] = -pml_cb[-1]*(hy[-1,:-1,:] - hy[-3,:-1,:])
    for i in xrange(-2,-npml-1,-1):
        psi_ez_x[i,:-1,:] = pml_ca[i]*psi_ez_x[i+1,:-1,:] - pml_cb[i]*(hy[i,:-1,:] - hy[i-2,:-1,:])
    ez[-npml-2:-2,:-1,:] += 0.5*psi_ez_x[:,:-1,:]

    # for source
    ez[270,ny/2,1] += np.sin(2*np.pi*frequency*dt*tstep)

    # for pbc
    update_pbc_e('z', *em_arrays[:-3])

    update_h(*em_arrays)

    # for pml
    psi_hy_x[-1,:,1:] = -pml_cb[-1]*(ez[-1,:,1:] - ez[-3,:,1:])
    for i in xrange(-2,-npml-1,-1):
        psi_hy_x[i,:,1:] = pml_ca[i]*psi_hy_x[i+1,:,1:] - pml_cb[i]*(ez[i,:,1:] - ez[i-2,:,1:])
    hy[-npml-1:-1,:,1:] += 0.5*psi_hy_x[:,:,1:]

    # for pbc
    update_pbc_h('z', *em_arrays[3:])

    if tstep%tgap == 0:
        print_time(tstep)
        #print hx[260:280,190:210,1]
        #print psi_ez_x[:,ny/2,nz/2]
        im.set_array(ez[:,:,nz/2].T**2)
        plt.draw()
        #savefig('./png/%.5d.png' % tstep)

print ''