Example No. 1
def multiprocess_overlapping_cases(data: pd.DataFrame, job_num: int):
    """
	Compute the number of case instances executed in parallel in the given
	dataset. The computation is split across the given number of jobs.

	:param data: case log
	:param job_num: number of jobs
	"""
    print_time('parallel cases %s' % len(data))

    # get split points
    steps = get_steps(data, job_num)

    print(steps)
    jobs = []
    out_q = Queue()

    # start all jobs
    for idx, r in enumerate(steps):
        p = Process(target=overlapping_cases, args=(data, idx + 1, r, out_q))
        jobs.append(p)
        p.start()

    # collect results
    res = {}
    for i in range(len(steps)):
        res.update(out_q.get())

    # collect processes
    for job in jobs:
        job.join()

    # update case log
    for k, v in res.items():
        data.at[k, 'overlapping_cases'] = v
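
A note on helpers: get_steps and the worker overlapping_cases are not part of this
listing. Below is a minimal, hypothetical sketch of get_steps, assuming it splits the
DataFrame's rows into job_num roughly equal (start, end) slices that each worker then
consumes via data[r[0]:r[1]]. Each worker is expected to put a {row_index: value} dict
on out_q, which the dispatcher merges with res.update(out_q.get()).

import math
from typing import List, Tuple

import pandas as pd


def get_steps(data: pd.DataFrame, job_num: int) -> List[Tuple[int, int]]:
    """Hypothetical reconstruction: split rows into job_num half-open slices."""
    chunk = max(1, math.ceil(len(data) / job_num))
    return [(start, min(start + chunk, len(data)))
            for start in range(0, len(data), chunk)]
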
Example No. 2
    def _optimize_model(self, arg_values: argparse.Namespace) -> \
        Iterable[GoalEncState]:
        with print_time("Loading data", guard=arg_values.verbose):
            if arg_values.start_from:
                _, (arg_values, unparsed_args, (metadata, state)) = \
                    torch.load(arg_values.start_from)
                _, tokenized_goals, outputs = \
                    goals_to_total_distances_tensors_with_meta(
                        extract_dataloader_args(arg_values),
                        str(arg_values.scrape_file), metadata)
            else:
                metadata, tokenized_goals, outputs = \
                    goals_to_total_distances_tensors(
                        extract_dataloader_args(arg_values),
                        str(arg_values.scrape_file))

        with print_time("Converting data to tensors", guard=arg_values.verbose):
            tensors = [pad_sequence([torch.LongTensor(tok_goal)
                                     for tok_goal in tokenized_goals],
                                     batch_first=True),
                       torch.FloatTensor(outputs)]

        with print_time("Building the model", guard=arg_values.verbose):
            model = self._get_model(arg_values, goal_enc_get_num_tokens(metadata))

            if arg_values.start_from:
                self.load_saved_state(arg_values, unparsed_args, state)

        return ((metadata, state) for state in
                optimize_checkpoints(tensors, arg_values, model,
                                     lambda batch_tensors, model:
                                     self._get_batch_prediction_loss(arg_values,
                                                                     batch_tensors,
                                                                     model)))
Example No. 3
def parallel_activities(data: pd.DataFrame, delta: datetime.timedelta,
                        instance, range: list, out_q: Queue):
    """
	This function is called by multiprocess_parallel_activities(). It computes
	the number of parallel executed activity instances in a given subset.

	:param data: dataset
	:param delta: time interval
	:param instance: current instance
	:param range: subset
	:param out_q: output queue
	"""

    # total number of activity instances
    size = len(data)

    # collect results
    r = {}

    for idx, row in data[range[0]:range[1]].iterrows():
        exclude_cases = data.loc[(data['start'] > row['end'] + delta) |
                                 (data['end'] < row['start'] - delta)]
        counter = size - len(exclude_cases)
        r[idx] = counter

    print_time('instance: %s' % instance, False)
    out_q.put(r)
Example No. 4
def get_tokens(args: List[str]):
    parser = argparse.ArgumentParser(description="Pick a set of tokens")
    parser.add_argument("--type", choices=["mixed"], default="mixed")
    parser.add_argument("-v", "--verbose", action='count', default=0)
    parser.add_argument("-n", "--num-keywords", type=int, default=120)
    parser.add_argument("-s", "--num-samples", type=int, default=2000)
    parser.add_argument("-j", "--num-threads", type=int, default=None)
    parser.add_argument("scrapefile", type=Path2)
    parser.add_argument("dest")
    arg_values = parser.parse_args(args)

    with print_time("Reading scraped data", guard=arg_values.verbose):
        raw_data = list(data.read_text_data(arg_values.scrapefile))
    embedding = SimpleEmbedding()
    subset = data.RawDataset(random.sample(raw_data, arg_values.num_samples))
    relevance_pairs = [
        (context.focused_goal,
         embedding.encode_token(serapi_instance.get_stem(tactic)))
        for relevant_lemmas, prev_tactics, context, tactic in subset
    ]
    with print_time("Calculating keywords", guard=arg_values.verbose):
        keywords = get_relevant_k_keywords2(relevance_pairs,
                                            arg_values.num_keywords,
                                            arg_values.num_threads)

    with (open(arg_values.dest, mode='w') if arg_values.dest != "-" else
          contextlib.nullcontext(sys.stdout)) as f:
        for keyword in keywords:
            f.write(keyword + "\n")
Example No. 5
    def train(self, checkpoint_path=None, weights_only=False):
        print('Starting training')
        print(datetime.datetime.now())
        start_time = datetime.datetime.now()
        # Load pretrained parameters if desired
        if checkpoint_path is not None:
            self.load_checkpoint(checkpoint_path, weights_only)
            if weights_only:
                self.initialize_visualizations()
        else:
            # Initialize any training visualizations
            self.initialize_visualizations()

        # Train for specified number of epochs
        for self.epoch in range(self.epoch, self.num_epochs):
            epoch_start_time = datetime.datetime.now()
            # Increment the LR scheduler
            if self.scheduler is not None:
                self.scheduler.step()
            # Run an epoch of training
            self.train_one_epoch()
            epoch_end_time = datetime.datetime.now()
            total_seconds = (epoch_end_time - epoch_start_time).seconds
            util.print_time('Epoch', total_seconds)
            if self.epoch % self.validation_freq == 0:
                self.validate()
                if self.lin_rms_sq_error_meter.avg <= self.lin_rms_sq_error and self.loss.avg <= self.loss_error:
                    self.save_checkpoint()
                    self.lin_rms_sq_error = self.lin_rms_sq_error_meter.avg
                    self.loss_error = self.loss.avg
                self.visualize_metrics()
        end_time = datetime.datetime.now()
        seconds = (end_time - start_time).seconds
        util.print_time('Training', seconds)
Example No. 6
def set_activity_instances(df: pd.DataFrame):
    """
	This function maps events to activity instances using a first-come,
	first-served approach.

	:param df: event log
	"""
    print_time('set activity instance')
    end_transitions = [
        'autoskip', 'manualskip', 'complete', 'withdraw', 'ate_abort',
        'pi_abort'
    ]

    trace = df.loc[0, 'caseID']
    activity = 0
    instances = {}
    for idx, row in df.iterrows():
        if row['caseID'] != trace:
            trace = row['caseID']
            instances = {}

        if row['name'] not in instances.keys():
            activity += 1
            instances[row['name']] = activity

        df.at[idx, 'activity_instance'] = instances[row['name']]

        if row['transition'].lower() in end_transitions:
            instances.pop(row['name'], None)

    print_time('set activity instance', False)
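
For illustration, a tiny hypothetical event log shows how the first-come, first-served
mapping assigns activity_instance ids. The data is made up, and this assumes
set_activity_instances and its print_time helper are importable.

import pandas as pd

# Toy log: two 'A' instances in case c1; the second starts after the first completes.
df = pd.DataFrame({
    'caseID':     ['c1', 'c1', 'c1', 'c2'],
    'name':       ['A', 'A', 'A', 'A'],
    'transition': ['start', 'complete', 'start', 'start'],
})
set_activity_instances(df)
# The first 'start' and its 'complete' share instance 1, the next 'start' opens
# instance 2, and case c2 gets a fresh instance 3.
print(df['activity_instance'].tolist())  # [1.0, 1.0, 2.0, 3.0]
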
Example No. 7
def import_report(file: str):
    """
	This function is used to import a conformance report.

	:param file: conformance report
	:return: pandas DataFrame, containing raw cost per case
	"""

    print_time('Import report')
    df = pd.read_csv(file, sep=',', header=[0])

    costs = []
    cases = []

    for idx, row in enumerate(df.loc[:, 'Case IDs']):

        raw_cost = df.loc[idx, 'Raw Fitness Cost']

        # update alignment id in case table
        for case in df.loc[idx, 'Case IDs'].split('|'):
            cases.append(case)
            costs.append(float(raw_cost))

    print_time('Import report', False)

    df = pd.DataFrame({'case': cases, 'response': costs})
    return df
Example No. 8
def verify_candidates(candidates, user_movies_matrix, start_time):
    print("\nVerifying candidates...")
    count = 0

    print("Number of buckets in total: " + str(len(candidates)))
    for cnr, candidate_group in enumerate(candidates):
        # print("Number of candidates in bucket " + str(cnr) + ": " + str(len(candidate_group)))
        for cnr1, candidate1 in enumerate(candidate_group):
            for cnr2 in range(cnr1 + 1, len(candidate_group)):
                candidate2 = list(candidate_group)[cnr2]
                jsim = sim.jaccard(user_movies_matrix[candidate1],
                                   user_movies_matrix[candidate2])
                if jsim >= 0.50:
                    print("Number of candidates in bucket " + str(cnr) + ": " +
                          str(len(candidate_group)))
                    count = count + 1
                    print((candidate1, candidate2))
                    print("Similarity: " + str(
                        sim.jaccard(user_movies_matrix[candidate1],
                                    user_movies_matrix[candidate2])))
                    print("Found until now: " + str(count))
                    util.print_time(start_time)
                    print()
        # print()

    print(count)
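
The sim.jaccard helper is not shown in this listing. A minimal sketch of the usual
Jaccard similarity over binary rating vectors, which is presumably what the 0.50
threshold above is applied to (function and argument names are assumptions):

import numpy as np


def jaccard(u, v) -> float:
    """Jaccard similarity of two binary vectors: |u AND v| / |u OR v|."""
    u = np.asarray(u, dtype=bool)
    v = np.asarray(v, dtype=bool)
    union = np.logical_or(u, v).sum()
    if union == 0:
        return 0.0
    return float(np.logical_and(u, v).sum()) / float(union)
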
Example No. 9
    def open_page(self, url, wait_time=0):
        print_time(f"Opening url {url}")
        self.driver.get(url)

        if (wait_time > 0):
            self.driver.implicitly_wait(wait_time)

        print_time("Page loaded")
Example No. 10
def multiprocess_overlapping_events(log_data: pd.DataFrame, job_num: int):
    """
	Compute parallel executed events of given dataset. The computation will be
	split into the given number of jobs.

	:param log_data: event log
	:param job_num: number of jobs
	"""

    print_time('parallel events')

    # get all event names
    event_names = log_data['name'].unique()

    # convert variable type of timestamp
    log_data['timestamp'] = pd.to_datetime(log_data.loc[:, 'timestamp'],
                                           format='%Y-%m-%d %H:%M:%S')

    # iterate through all event names
    for e in event_names:
        print_time('calculate parallel events for %s' % e)

        # get subset
        sub_data = log_data.loc[log_data['name'] == e]
        sub_data = sub_data.sort_values(by=['timestamp'])

        steps = get_steps(sub_data, job_num)
        print(steps)

        # collect jobs and results
        jobs = []
        out_q = Queue()

        # set time interval (= theta)
        delta = datetime.timedelta(days=1)

        # start different jobs
        for idx, r in enumerate(steps):
            p = Process(target=overlapping_events,
                        args=(sub_data, delta, idx + 1, r, out_q))
            jobs.append(p)
            p.start()

        # collect results
        res = {}
        for i in range(len(steps)):
            res.update(out_q.get())

        # collect processes
        for job in jobs:
            job.join()

        for k, v in res.items():
            log_data.at[k, 'parallel_events'] = v
Example No. 11
def multiprocess_parallel_event_sets(data: pd.DataFrame, job_num: int):
	"""
	Compute the number of event sets executed in parallel in the given
	dataset. The computation is split across the given number of jobs.

	:param data: event log
	:param job_num: number of jobs
	"""

	print_time('parallel event sets')

	data['parallel_sets'] = [0] * len(data)

	set_names = data['set_name'].unique()
	print('Set names: %s' % set_names)

	# iterate through sets
	for s in set_names:
		print_time('calculate parallel events sets for set_%s' % s)
		sub_data = data.loc[data['set_name'] == s]

		# get split points
		steps = get_steps(sub_data, job_num)
		print(steps)

		# collect jobs and results
		jobs = []
		out_q = Queue()

		# set time interval (= theta)
		delta = datetime.timedelta(days=1)

		# start jobs
		for idx, r in enumerate(steps):
			p = Process(target=parallel_event_sets, args=(
				sub_data, delta,
				idx + 1,
				r, out_q))
			jobs.append(p)
			p.start()

		# collect results
		res = {}
		for i in range(len(steps)):
			res.update(out_q.get())

		# collect processes
		for job in jobs:
			job.join()

		# update data
		for k, v in res.items():
			data.at[k, 'parallel_sets'] = v
Example No. 12
    def build_tag_to_pos(Y):
        tag_to_pos = {}
        i = 0
        print_time("building build_tag_to_pos...")
        for s in Y:
            for t in s:
                if t not in tag_to_pos:
                    tag_to_pos[t] = i
                    i += 1
        pos_to_tag = {v: k for k, v in tag_to_pos.items()}

        return tag_to_pos, pos_to_tag
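
A quick usage example for the index maps. The tag sequences are made up, and this
assumes build_tag_to_pos is exposed as a static helper and that print_time is available.

Y = [['DET', 'NOUN', 'VERB'], ['NOUN', 'VERB']]
tag_to_pos, pos_to_tag = build_tag_to_pos(Y)
print(tag_to_pos)  # {'DET': 0, 'NOUN': 1, 'VERB': 2}
print(pos_to_tag)  # {0: 'DET', 1: 'NOUN', 2: 'VERB'}
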
Example No. 13
def reinforce_training_worker(args: argparse.Namespace,
                              initial_buffer_size: int,
                              lock: Lock,
                              namespace: multiprocessing.managers.Namespace,
                              samples: Queue[LabeledTransition]):
    last_trained_at = 0
    samples_retrieved = 0
    memory: List[LabeledTransition] = []
    while True:
        if samples_retrieved - last_trained_at < args.train_every_min:
            next_sample = samples.get()
            memory.append(next_sample)
            samples_retrieved += 1
            continue
        else:
            try:
                next_sample = samples.get(timeout=.01)
                memory.append(next_sample)
                samples_retrieved += 1
                if samples_retrieved - last_trained_at > args.train_every_max:
                    eprint("Forcing training", guard=args.verbose >= 2)
                else:
                    continue
            except queue.Empty:
                pass
        if len(memory) > args.buffer_max_size:
            memory = random.sample(memory, args.buffer_max_size -
                                   args.train_every_max)
            # del memory[0:args.train_every_max+1]
        if samples_retrieved - last_trained_at >= args.train_every_min:
            last_trained_at = samples_retrieved
            transition_samples = sample_batch(memory, args.batch_size)
            with lock:
                eprint(
                    f"Locked in training thread for {len(memory)} samples",
                    guard=args.verbose >= 2)
                q_estimator = namespace.estimator
                predictor = namespace.predictor
                with print_time("Assigning scores", guard=args.verbose >= 2):
                    training_samples = assign_scores(args,
                                                     q_estimator,
                                                     predictor,
                                                     transition_samples)
                with print_time("Training", guard=args.verbose >= 2):
                    q_estimator.train(training_samples,
                                      show_loss=args.show_loss)
                q_estimator.save_weights(args.out_weights, args)
                namespace.estimator = q_estimator
                eprint("Unlocked in training thread",
                       guard=args.verbose >= 2)

    pass
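
sample_batch is not part of this listing; presumably it draws a uniform random batch
from the replay memory, along the lines of this sketch (the LabeledTransition type
comes from the surrounding project):

import random
from typing import List


def sample_batch(memory: List['LabeledTransition'],
                 batch_size: int) -> List['LabeledTransition']:
    # Uniformly sample up to batch_size transitions without replacement.
    return random.sample(memory, min(batch_size, len(memory)))
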
Example No. 14
    def build_word_to_pos(X, padding="EOC"):
        word_to_pos = {}
        word_to_pos[padding] = 0
        i = 1
        print_time("building build_word_to_pos...")
        for s in X:
            for w in s:
                if w not in word_to_pos:
                    word_to_pos[w] = i
                    i += 1

        pos_to_word = {v: k for k, v in word_to_pos.items()}
        return word_to_pos, pos_to_word
Example No. 15
    def fit(self, dataset, num_epochs, tolerance=0.0001):
        """
        Parameters
        ----------

        dataset:
        Dataset with the sequences and tags

        num_epochs: int
        Number of epochs that the model will be trained


        Returns
        --------

        Nothing. The method only changes self.parameters.
        """

        print_time("Starting training...")
        self.tolerance = tolerance

        if self.fitted:
            print("\n\tWarning: Model already trained")

        if len(self.acc_per_epoch) == 0:
            prev_acc = 0
        else:
            prev_acc = self.acc_per_epoch[-1]

        for epoch in range(num_epochs):
            acc = self.fit_epoch(dataset)
            print_time("Epoch: %i Accuracy: %f" % (epoch, acc))
            self.acc_per_epoch.append(acc)
            if abs(acc-prev_acc) < self.tolerance:
                print("Stopped by tolerance!")
                break
            prev_acc = acc


        if self.averaged:
            new_w = 0
            for old_w in self.params_per_epoch:
                new_w += old_w
            new_w /= len(self.params_per_epoch)
            self.parameters = new_w

        self.fitted = True
Example No. 16
def multiprocess_parallel_activities(data: pd.DataFrame, job_num: int):
    """
	Compute the number of activity instances executed in parallel in the given
	dataset. The computation is split across the given number of jobs.

	:param data: activity log
	:param job_num: number of jobs
	"""
    print_time('parallel activities')
    activity_name = data['name'].unique()

    # compute for all activities
    for a in activity_name:
        print_time('calculate parallel activities for activity %s' % a)

        # get subset
        sub_data = data.loc[data['name'] == a]

        steps = get_steps(sub_data, job_num)
        print(steps)

        jobs = []
        out_q = Queue()

        # set time interval (=theta)
        delta = datetime.timedelta(days=1)

        # start all jobs
        for idx, r in enumerate(steps):
            p = Process(target=parallel_activities,
                        args=(sub_data, delta, idx + 1, r, out_q))
            jobs.append(p)
            p.start()

        # collect results
        res = {}
        for i in range(len(steps)):
            res.update(out_q.get())

        # collect processes
        for job in jobs:
            job.join()

        # update DataFrame
        for k, v in res.items():
            data.at[k, 'parallel_activities'] = v
Example No. 17
def print_report(nr_found, nr_buckets, start_time, matrix_shape, k, b, r,
                 seed):
    print("______________________________________________\n")

    print("Final Report")
    print("------------\n")
    print("Found pairs in total: " + str(nr_found))
    print("Number of buckets in total: " + str(nr_buckets))
    util.print_time(start_time)
    print()
    print("For:")
    print('Users: {1} | Movies: {0}'.format(str(matrix_shape[0]),
                                            str(matrix_shape[1])))
    print()
    print("With parameters:")
    print('Sig. Length: {0} | Bands: {1} | Rows: {2} | Seed: {3}'.format(
        k, b, r, seed))
Example No. 18
    def callback(self):
        print('==')
        global first
        if self.img_ready():
            with print_time('get_img'):
                surface, img_file = self.get_img()

            with print_time('gui'):
                self.set_display(surface)
                self.draw_line((self.xmid_px, 0),
                               (self.xmid_px, self.height_px))
            if do_face:
                with print_time('get_face'):
                    result = self.get_face(img_file)
                if result:
                    (mouth_x, mouth_y), depth = result

                    if first:
                        print(
                            f'Field of view: {self.width_px}x{self.height_px} px'
                        )
                        print(
                            f'Field of view: {self.width_deg:.0f}x{self.height_deg:.0f} deg'
                        )
                        print(
                            f'degrees: {self.deg_per_px_x:.3f} {self.deg_per_px_y:.3f}'
                        )
                        first = False

                    self.draw_dot((mouth_x, mouth_y))

                    with print_time('do_servo'):
                        if do_servo:
                            altitude = (self.ymid_px -
                                        mouth_y) * self.deg_per_px_y
                            self.turn(self.xmid_px - mouth_x)
                            self.aim(altitude, depth)
                            self.got_target()
                            if do_shoot:
                                self.maybe_fire()
                else:
                    self.cancel_target()
        else:
            print('image not ready')
Example No. 19
    def extract_data_from_table(self, template, pop=[], remove_last=False):
        if (not template):
            raise Exception("No template added to parser")

        print_time("Extracting data from table")
        data = []
        rows = self.soup.find_all("tr")

        if len(pop) > 0:
            # list.sort() returns None; sort a copy so iteration works and the
            # caller's list is left untouched
            sorted_pop = sorted(pop, reverse=True)
            for index in sorted_pop:
                print_time(f"Removing row by index {index}")
                rows.pop(index)  # e.g. remove the table header at index 0

        if (remove_last):
            print_time(f"Removing last element")
            rows.pop()

        current = 1
        log_counter = 0
        elements = len(rows)

        #Iterate over every row in the table
        for row in rows:
            log_counter += 1
            if (log_counter == self.log_each_n or current == elements):
                print_time(f"Parsing row {current} of {elements}")
                log_counter = 0

            new_dict = {}
            columns = row.find_all("td")

            if isinstance(template, dict):
                for key, value in template.items():
                    new_dict[value] = columns[key].text
            else:
                for index, value in enumerate(template):
                    new_dict[value] = columns[index].text

            data.append(new_dict)
            current += 1

        print_time("Parser complete")
        return data
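
Usage, for context: the template is either a dict mapping column positions to keys or
a plain list of keys applied in order. The HTML and names below are illustrative only;
the Parser constructor call follows the test() example further down and is assumed to
build self.soup from the given HTML.

html = '''
<table>
  <tr><td>Ada</td><td>1815</td></tr>
  <tr><td>Alan</td><td>1912</td></tr>
</table>
'''
parser = Parser(html, log_each_n=10)
rows = parser.extract_data_from_table(['name', 'year'])
# -> [{'name': 'Ada', 'year': '1815'}, {'name': 'Alan', 'year': '1912'}]
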
Example No. 20
    def _optimize_model(
            self, arg_values: argparse.Namespace
    ) -> Iterable[FeaturesDNNEvaluatorState]:
        with print_time("Loading data", guard=arg_values.verbose):
            if arg_values.start_from:
                _, (arg_values, unparsed_args,
                    (picklable_token_map,
                     state)) = torch.load(arg_values.start_from)
                token_map = tmap_from_picklable(picklable_token_map)
                _, word_features_data, vec_features_data, outputs,\
                    word_features_vocab_sizes, vec_features_size = features_to_total_distances_tensors_with_map(
                        extract_dataloader_args(arg_values),
                        str(arg_values.scrape_file), token_map)
            else:
                token_map, word_features_data, vec_features_data, outputs, \
                    word_features_vocab_sizes, vec_features_size = features_to_total_distances_tensors(
                        extract_dataloader_args(arg_values),
                        str(arg_values.scrape_file))

        # eprint(f"word data: {word_features_data[:10]}")
        # eprint(f"vec data: {vec_features_data[:10]}")
        # eprint(f"outputs: {outputs[:100]}")

        with print_time("Converting data to tensors",
                        guard=arg_values.verbose):
            tensors = [
                torch.LongTensor(word_features_data),
                torch.FloatTensor(vec_features_data),
                torch.FloatTensor(outputs)
            ]

        with print_time("Building the model", guard=arg_values.verbose):
            model = self._get_model(arg_values, word_features_vocab_sizes,
                                    vec_features_size)
            if arg_values.start_from:
                self.load_saved_state(arg_values, unparsed_args, state)

        return (
            (tmap_to_picklable(token_map), state)
            for state in optimize_checkpoints(
                tensors, arg_values, model,
                lambda batch_tensors, model:
                self._get_batch_prediction_loss(arg_values, batch_tensors,
                                                model)))
Example No. 21
def mark_outlier(data: pd.DataFrame, base_path: str):
    """
	This function calls the LOF computation for each feature.

	:param data: case log
	:param base_path: path to save plots
	"""
    print_time('events: mark outlier')
    columns = []

    for c in list(data.columns):
        # ignore selected columns
        if 'case' in c or 'involved' in c:
            pass
        else:
            columns.append(c)

    # call LOF computation
    for c in columns:
        do_LOF(data, c, base_path, plot=False)
Example No. 22
def mark_outlier(data: pd.DataFrame, base_path: str):
    """
	This function calls the LOF computation for each feature.

	:param data: case log
	:param base_path: path to location to save plots
	"""
    print_time('case mark outlier')
    columns = []

    for c in list(data.columns):
        # ignore selected columns
        if 'case_duration' in c or 'overlapping_cases' in c:
            columns.append(c)
        else:
            pass

    # call LOF computation
    for c in columns:
        do_LOF(data, c, base_path, plot=True)
Example No. 23
def verify_partial_candidates(candidate_group, user_movies_matrix, bucket_nr,
                              nr_found, start_time):
    for cnr1, candidate1 in enumerate(candidate_group):
        for cnr2 in range(cnr1 + 1, len(candidate_group)):
            candidate2 = list(candidate_group)[cnr2]
            jsim = sim.jaccard(user_movies_matrix[candidate1],
                               user_movies_matrix[candidate2])
            if jsim >= 0.50:
                pair = sorted((candidate1, candidate2))
                data.save_pair(pair)
                print("\tFound similar pair: " + str(pair))
                print("\tSimilarity: " + str(
                    sim.jaccard(user_movies_matrix[candidate1],
                                user_movies_matrix[candidate2])))
                print("\tBucket number: " + str(bucket_nr))
                print("\tNumber of candidates in the bucket: " +
                      str(len(candidate_group)))
                nr_found[0] = nr_found[0] + 1
                print("\tFound until now: " + str(nr_found[0]))
                util.print_time(start_time, "\t")
                print()
Example No. 24
def find_cluster(data: pd.DataFrame, base_path: str, plot: bool):
    """
	This function calls the k-means clustering for all features.

	:param data: case log
	:param base_path: path to save plots and cluster information
	:param plot: whether to generate plots or not
	"""
    print_time('case kmeans')
    columns = []

    for c in list(data.columns):
        # ignore selected columns
        if 'case_duration' in c or 'overlapping_cases' in c:
            columns.append(c)
        else:
            pass

    # call k-means computation
    for c in columns:
        do_kmeans(data, c, base_path, plot)
Example No. 25
def find_cluster(data: pd.DataFrame, base_path: str, plot: bool):
    """
	This function calls the k-means clustering for all features.

	:param data: case log
	:param base_path: path to save plots and cluster information
	:param plot: whether to generate plots or not
	"""
    print_time('activities kmeans')
    columns = []

    for c in list(data.columns):
        # ignore selected columns
        if 'case' in c or 'involved' in c or 'weekday' in c or 'weekend' in c \
          or 'start_am' in c:
            pass
        else:
            columns.append(c)

    # call k-means computation
    for c in columns:
        do_kmeans(data, c, base_path, plot)
Example No. 26
def run(bolt_path, plink_path, bfile, num_people, pheno_path, pheno_col, out_id):
    info = (f'=> generating subset individual data\n'
            f'bolt_path: {bolt_path}\n'
            f'plink_path: {plink_path}\n'
            f'bfile: {bfile}\n'
            f'num_people: {num_people}\n'
            f'pheno_filename: {pheno_path}\n'
            f'pheno_col: {pheno_col}\n'
            f'out: {out_id}\n')
    print(info)
    sys.stdout.flush()

    file_cache_out_path = os.path.join('file_cache', f'{out_id}_{num_people}')
    _, pheno_temp = generate_subset(plink_path=plink_path, bfile=bfile, num_people=num_people, out=file_cache_out_path,
                                    pheno_path=pheno_path)
    print(f'subset pheno file: {pheno_temp}')

    print('=> assigning the SNP components by chromosome')
    sys.stdout.flush()

    snp_assignment_filename = file_cache_out_path + '.snps_assignment'
    partition(file_cache_out_path + '.bim', snp_assignment_filename)

    print('=> running BOLT-REML')
    print_time()
    sys.stdout.flush()
    dt = bench_bolt_reml(bolt_path, snp_assignment_filename,
                         file_cache_out_path + '.bed',
                         file_cache_out_path + '.bim',
                         file_cache_out_path + '.fam',
                         pheno_temp, pheno_col)

    log_path_prefix = os.path.join('output', f'{out_id}_{num_people}')
    print(f'log_path_prefix: {log_path_prefix}')

    with open(f'{log_path_prefix}.bench', 'w') as file:
        file.write(info)
        file.write(f'BOLT-REML took {dt} sec\n')
    print_time()
Example No. 27
def test():
    if len(sys.argv) < 3:
        raise Exception("Script must be called with two arguments, "
                        "the path to chromedriver and the path to firebase config")

    chromedriver = sys.argv[1]

    elapsed = Elapsed()

    scraper = Scraper(chromedriver, headless=True)
    test_url = "https://96hpr.csb.app"

    try:
        scraper.open_page(test_url)
        html = scraper.get_outerhtml(
            By.XPATH, "/html/body/div/div/table/tbody")
        parsed = Parser(html, log_each_n=10)
        template = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O"]
        parsed.extract_data_from_table(template, [0], True)
        print_time(f"Extracted data")
    finally:
        scraper.close()
        elapsed.end()
Example No. 28
def multiprocess_aggregation(data: pd.DataFrame, job_num: int):
    """
	Aggregate enriched event log to case log. The computation will be
	split into the given number of jobs.

	:param data: event log
	:param job_num: number of jobs
	:return: case log
	"""
    print_time('aggregate events')
    print('Length of dataset: %s' % len(data))

    case_ids = data['caseID'].unique()

    steps = get_steps_seq(case_ids, data, job_num)
    print(steps)

    events = data['name'].unique()

    # feature types
    feature = [
        'abs_lag', 'pre_lag', 'post_lag', 'weekday', 'weekend',
        'parallel_events'
    ]

    # blueprint to collect single events
    case_tmp = {}
    for e in events:
        case_tmp[e] = {'counter': 0}
        for f in feature:
            case_tmp[e][f] = 0

    jobs = []
    out_q = Queue()

    # start all jobs
    for idx, r in enumerate(steps):
        p = Process(target=aggregate, args=(data, case_tmp, r, out_q, idx))
        jobs.append(p)
        p.start()

    # collect results
    res = {}
    for i in range(len(steps)):
        res.update(out_q.get())

    for job in jobs:
        job.join()

    print_time('time features', start=False)
    print_time('merge results')

    # return case log
    return pd.DataFrame(list(res.values()))
Example No. 29
def multiprocess_aggregation(data: pd.DataFrame, job_num: int):
    """
	Aggregate enriched activity log to case log. The computation will be
	split into the given number of jobs.

	:param data: enriched activity log
	:param job_num: number of jobs
	:return: case log
	"""
    print_time('aggregate activities')

    case_ids = data['caseID'].unique()
    steps = get_steps_seq(case_ids, data, job_num)

    # prepare names
    activity_names = data['name'].unique()
    activity_names = ['_'.join(['act', str(i)]) for i in activity_names]

    feature = [
        'abs_lag', 'duration', 'start_am', 'weekday', 'parallel_activities'
    ]

    # generate blueprint
    case_tmp = {}
    for a in activity_names:
        case_tmp[a] = {'counter': 0}
        for f in feature:
            case_tmp[a][f] = 0

    jobs = []
    out_q = Queue()

    # start all jobs
    for idx, r in enumerate(steps):
        p = Process(target=aggregate, args=(data, case_tmp, r, out_q, idx))
        jobs.append(p)
        p.start()

    # merge jobs and collect results
    res = {}
    for i in range(len(steps)):
        res.update(out_q.get())

    for job in jobs:
        job.join()

    print_time('aggregate activities', start=False)
    print_time('merge results')

    return pd.DataFrame(list(res.values()))
Example No. 30
def multiprocess_time_feature(log_data: pd.DataFrame, job_num: int):
    """
	Compute basic features for given dataset. The computation will be split
	into the given number of jobs.

	:param log_data: event log
	:param job_num: number of jobs
	"""
    print_time('time features')
    case_ids = log_data['caseID'].unique()

    # convert variable type of timestamp
    log_data['timestamp'] = pd.to_datetime(log_data.loc[:, 'timestamp'],
                                           utc=True,
                                           format='%Y-%m-%d '
                                           '%H:%M:%S')

    # get split points
    steps = get_steps_seq(case_ids, log_data, job_num)

    jobs = []
    out_q = Queue()

    # start jobs
    for idx, r in enumerate(steps):
        p = Process(target=multi_time_feature, args=(log_data, r, out_q, idx))
        jobs.append(p)
        p.start()

    # collect results
    res = {}
    for i in range(len(steps)):
        res.update(out_q.get())

    for job in jobs:
        job.join()

    print_time('time features', start=False)
    print_time('merge results')

    # add features to DataFrame
    for idx, data in res.items():
        for attr, v in data.items():
            log_data.at[idx, attr] = v
Example No. 31
import argparse
import sys
import time

import numpy as np
# Assumed source of the svmlight helpers; the original imports are not shown.
from sklearn.datasets import dump_svmlight_file, load_svmlight_file

from util import print_time


def main():

    parser = argparse.ArgumentParser(description="Merge multiple svm format features.")
    parser.add_argument(
        "-i", nargs="*", required=True, dest="input_filename", help="Specify input file path (accept multiple inputs)"
    )
    parser.add_argument("-o", required=True, dest="output_filename", help="Specify output file path")
    opts = parser.parse_args(sys.argv[1:])

    all_X = []
    for fileName in opts.input_filename:
        print "Loading " + fileName + " ..."
        X, y = load_svmlight_file(fileName)
        print X.shape
        all_X.append(X.todense())

    X = np.concatenate(all_X, axis=1)
    print "Saving " + opts.output_filename + " ..."
    print X.shape
    dump_svmlight_file(X, y, opts.output_filename)


if __name__ == "__main__":
    ts = time.time()
    main()
    te = time.time()
    print_time(ts, te)
Example No. 32
	psi_ez_x[-1,:-1,:] = -pml_cb[-1]*(hy[-1,:-1,:] - hy[-3,:-1,:])
	for i in xrange(-2,-npml-1,-1):
		psi_ez_x[i,:-1,:] = pml_ca[i]*psi_ez_x[i+1,:-1,:] - pml_cb[i]*(hy[i,:-1,:] - hy[i-2,:-1,:])
	ez[-npml-2:-2,:-1,:] += 0.5*psi_ez_x[:,:-1,:]

	# for source
	ez[270,ny/2,1] += np.sin(2*np.pi*frequency*dt*tstep)
	# for pbc
	update_pbc_e('z', *em_arrays[:-3])


	update_h(*em_arrays)
	# for pml
	psi_hy_x[-1,:,1:] = -pml_cb[-1]*(ez[-1,:,1:] - ez[-3,:,1:])
	for i in xrange(-2,-npml-1,-1):
		psi_hy_x[i,:,1:] = pml_ca[i]*psi_hy_x[i+1,:,1:] - pml_cb[i]*(ez[i,:,1:] - ez[i-2,:,1:])
	hy[-npml-1:-1,:,1:] += 0.5*psi_hy_x[:,:,1:]
	# for pbc
	update_pbc_h('z', *em_arrays[3:])


	if tstep%tgap == 0:
		print_time(tstep)
		#print hx[260:280,190:210,1]
		#print psi_ez_x[:,ny/2,nz/2]
		im.set_array(ez[:,:,nz/2].T**2)
		plt.draw()
		#savefig('./png/%.5d.png' % tstep) 

print ''