def printnl(nl2, np2, l, v, atmi):
    """Append to 'nl2s.dat' the entries of one slice of ``nl2`` that are absent from ``l``.

    The slice is ``nl2[np2[atmi]:np2[atmi + 1]]``; its distinct values that do
    not occur in ``l`` are written, in sorted order, as a Python list literal
    followed by a separator line.

    :param nl2: sliceable sequence of values.
    :param np2: indexable sequence of offsets into ``nl2``.
    :param l: iterable of values to exclude.
    :param v: unused.
    :param atmi: index into ``np2`` selecting the slice.
    """
    start, stop = np2[atmi], np2[atmi + 1]
    remaining = SortedSet(nl2[start:stop]).difference(l)
    # Append mode: successive calls accumulate in the same file.
    with open('nl2s.dat', "a") as out:
        out.write(str(list(remaining)) + '\n \n')
def extract_FL1L(self):
    """Build ``self.F_L1L``, the arcs that attach each length-one-loop task to a place.

    For every task in ``self.L1L`` two sets are collected over ``self.T_pr``:
    the transitions ``x`` with ``relations[x][task] == RIGHT_CAUSALITY`` and
    the transitions ``x`` with ``relations[task][x] == RIGHT_CAUSALITY``.
    A place named ``'p<n>'`` (n counts tasks from 1) is then linked to the task
    with ``(task, place)`` when the first set has members missing from the
    second, and with ``(place, task)`` when the second has members missing
    from the first.

    Progress and the final arc set are printed to stdout.
    """
    self.F_L1L = SortedSet()
    for place_no, loop_task in enumerate(self.L1L, start=1):
        before = SortedSet()
        after = SortedSet()
        for other in self.T_pr:
            if self.relations[other][loop_task] == Relations.RIGHT_CAUSALITY:
                print("for transition ", loop_task, " : ", other)
                before.add(other)
            if self.relations[loop_task][other] == Relations.RIGHT_CAUSALITY:
                print("for transition ", loop_task, " : ", other)
                after.add(other)

        # Background (from the original note): length-one loops in sound
        # SWF-nets are handled in the pre- and post-processing phases of
        # process mining.  A length-one-loop task t is recognised by the
        # substring "tt" in a loop-complete event log.  The place p that t
        # must be connected to is found by looking at the transitions that are
        # directly followed by t but do not directly follow t (p is an output
        # place of those) and the transitions that directly follow t but are
        # not directly followed by t (p is an input place of those).
        print(len(before) == len(after))
        place = 'p' + str(place_no)

        # The arc only involves the loop task itself, so it is enough to know
        # whether each one-sided difference is non-empty.
        # NOTE(review): the transitions in the differences are never used in
        # the arcs themselves - confirm that is intended.
        if before.difference(after):
            # Add input places
            self.F_L1L.add((loop_task, place))
        if after.difference(before):
            # Add output place
            self.F_L1L.add((place, loop_task))

    print(self.F_L1L)
def test_difference():
    """difference() with several iterables yields a new set and leaves the source untouched."""
    source = SortedSet(range(100), load=7)
    remainder = source.difference(range(0, 10), range(10, 20))

    # The source still holds 0..99 at matching positions.
    for position in range(100):
        assert source[position] == position

    # The result holds 20..99, i.e. everything outside the two removed ranges.
    for position in range(80):
        assert remainder[position] == position + 20
def test_difference():
    """difference() with several iterables yields a new set and leaves the source untouched."""
    source = SortedSet(range(100))
    # Re-initialise the set with load value 7 before exercising it.
    source._reset(7)
    remainder = source.difference(range(0, 10), range(10, 20))

    # The source still holds 0..99 at matching positions.
    for position in range(100):
        assert source[position] == position

    # The result holds 20..99, i.e. everything outside the two removed ranges.
    for position in range(80):
        assert remainder[position] == position + 20
class Index:
    """Inverted index from keyword to the sorted set of units that contain it."""

    def __init__(self, tokenizer=None):
        """Create an empty index.

        :param tokenizer: callable turning a query string into a sequence of
            tokens; the module default tokenizer is used when omitted.
        """
        self.postings = SortedDict()    # keyword -> SortedSet of units
        self.unit_list = SortedSet()    # every indexed unit
        self.unit_count = 0             # number of accepted add() calls
        self.tokenizer = __default_tokenizer__ if tokenizer is None else tokenizer

    def add(self, unit: "Unit"):
        """Index ``unit`` under each of its non-empty keywords.

        Units without keywords are ignored and do not affect ``count()``.
        """
        keywords = unit.keywords()
        if len(keywords) == 0:
            return
        self.unit_count += 1
        self.unit_list.add(unit)
        for word in keywords:
            if not word:
                continue
            if word in self.postings:
                self.postings[word].add(unit)
            else:
                self.postings[word] = SortedSet([unit])

    def count(self):
        """Return how many units with at least one keyword were added."""
        return self.unit_count

    def _operand(self, tokens, i):
        """Resolve the operand starting at ``tokens[i]``.

        Returns ``(matches, index)`` where ``index`` is the position of the
        last token consumed.  A leading ``'not'`` complements the following
        term against all units (or yields all units when that term is missing
        or unknown).  A plain term that is unknown - or an ``i`` past the end
        of ``tokens`` - yields ``None``.
        """
        if i < len(tokens) and tokens[i] == 'not':
            i += 1
            if i < len(tokens) and tokens[i] in self.postings:
                return self.unit_list.difference(self.postings[tokens[i]]), i
            return self.unit_list, i
        if i < len(tokens) and tokens[i] in self.postings:
            return self.postings[tokens[i]], i
        return None, i

    def search(self, query):
        """Searches given query inside the index.

        Terms are evaluated left to right.  Adjacent terms without an operator
        are combined with a union.  Unknown terms are skipped, and an operator
        whose left side has produced nothing yet leaves the result unchanged.
        A dangling ``'and'``/``'or'`` at the end of the query is ignored.

        :param `query`: String to search. Can contain operators
            `('and', 'or', 'not')` to refine results.
        :returns: a set of the units that satisfy the query, or ``None`` when
            nothing matched.  For a single-term query this is the index's own
            posting set, so callers should not mutate it.
        """
        tokens = self.tokenizer(query)
        result = None
        i = 0
        while i < len(tokens):
            sub_result, i = self._operand(tokens, i)
            if i < len(tokens) and tokens[i] in ('and', 'or'):
                operator = tokens[i]
                # _operand is bounds-safe, so a trailing operator simply
                # resolves to None instead of raising IndexError.
                sub_result, i = self._operand(tokens, i + 1)
                if result is not None and sub_result is not None:
                    if operator == 'and':
                        result = result.intersection(sub_result)
                    else:
                        result = result.union(sub_result)
            elif result is None:
                result = sub_result
            elif sub_result is not None:
                result = result.union(sub_result)
            i += 1
        return result

    def keywords(self):
        """Return a view of every indexed keyword."""
        return self.postings.keys()

    def __getitem__(self, word):
        """Return the units indexed under ``word``, or ``None`` if there are none."""
        return self.postings[word] if word in self.postings else None
class SparseTimeSeriesDataSet:
    """A dataset for sparse time series that need to be kept in sync in time.

    Every series is stored twice: the raw rows as supplied (``all_raw_data``)
    and a ``SortedDict`` keyed by integer timestamp
    (``timestamp_indexed_data``).  ``unique_timestamps`` is the shared, sorted
    set of timestamps the series are aligned against.
    """

    def __init__(self, unique_timestamps=None, minimum_time_between_timestamps=None, mode='strict'):
        """Create the dataset.

        :param unique_timestamps: optional iterable of allowed timestamps.
        :param minimum_time_between_timestamps: optional minimum gap required
            between consecutive timestamps; ``None`` disables the check.
        :param mode: how ``add`` treats timestamps that are not already
            allowed - ``'strict'`` (reject), ``'remove_difference'`` (drop
            them from the new series) or ``'union'`` (allow them).
        :raises InvalidTimestampsInDataError: if the supplied timestamps
            violate the minimum gap.
        """
        if unique_timestamps is not None:
            self.unique_timestamps = SortedSet(unique_timestamps)
        else:
            self.unique_timestamps = SortedSet()
        self.mode = mode
        self.all_raw_data = {}  # id -> raw rows exactly as passed to add()
        self.timestamp_indexed_data = {}  # id -> SortedDict(timestamp -> value(s))
        self.minimum_time_between_timestamps = minimum_time_between_timestamps
        self.check_minimum_timestamp_interval()

    def __len__(self):
        """Return the number of shared timestamps."""
        return len(self.unique_timestamps)

    @classmethod
    def sample_data_at_intervals(cls, start_timestamp, end_timestamp, interval, data):
        """Resample ``data`` every ``interval`` from start to end (inclusive).

        Each sample is a copy of the latest row at or before the sample time,
        with its first element replaced by the sample time, so a missing
        datapoint is filled by extending the previous one.

        NOTE(review): rows are looked up in ``data`` by their position in the
        sorted timestamp list, so ``data`` is presumably already ordered by
        timestamp - confirm against callers.
        """
        timestamps = SortedList([row[0] for row in data])
        start_timestamp = int(start_timestamp)
        end_timestamp = int(end_timestamp)
        # The data has to cover the whole requested window.
        assert(timestamps[0] <= start_timestamp)
        assert(timestamps[-1] >= end_timestamp)
        sampled_data = []
        for timestamp in range(start_timestamp, end_timestamp + 1, interval):
            index = timestamps.bisect_right(timestamp) - 1
            new_datapoint = data[index].copy()
            new_datapoint[0] = timestamp
            sampled_data.append(new_datapoint)
        return sampled_data

    @property
    def ids(self):
        """Ids of every series whose raw data has been stored."""
        return list(self.all_raw_data.keys())

    @property
    def first_timestamp(self):
        """Smallest shared timestamp."""
        return self.unique_timestamps[0]

    def first_timestamp_for_id(self, id):
        """Timestamp of the first raw row stored for ``id``."""
        return self.all_raw_data[id][0][0]

    @property
    def last_timestamp(self):
        """Largest shared timestamp."""
        return self.unique_timestamps[-1]

    def last_timestamp_for_id(self, id):
        """Timestamp of the last raw row stored for ``id``."""
        return self.all_raw_data[id][-1][0]

    def first_unpadded_index_for_id(self, id):
        """Position, within the shared timestamps, of the first raw row of ``id``."""
        first_timestamp = self.first_timestamp_for_id(id)
        return self.unique_timestamps.index(first_timestamp)

    def last_unpadded_index_for_id(self, id):
        """Position, within the shared timestamps, of the last raw row of ``id``."""
        last_timestamp = self.last_timestamp_for_id(id)
        return self.unique_timestamps.index(last_timestamp)

    def check_minimum_timestamp_interval(self):
        """Verify that consecutive shared timestamps are far enough apart.

        Does nothing when no minimum gap is configured.

        :raises InvalidTimestampsInDataError: if two neighbouring timestamps
            are closer than ``minimum_time_between_timestamps``.
        """
        minimum_gap = self.minimum_time_between_timestamps
        if minimum_gap is None:
            return
        # The first timestamp has no predecessor, so it is never checked
        # against an artificial origin.
        prev_timestamp = None
        for timestamp in self.unique_timestamps:
            if prev_timestamp is not None and timestamp - prev_timestamp < minimum_gap:
                raise InvalidTimestampsInDataError("Found timestamps that have less than the required {} between them".format(minimum_gap))
            prev_timestamp = timestamp

    def add(self, id: str, data):
        """Add the series ``data`` under ``id``.

        Each row is ``[timestamp, value]`` or, for multidimensional data,
        ``[timestamp, v1, v2, ...]``.  Nothing is stored unless the data passes
        validation, so a failed call can be retried.

        :raises ValueError: if ``data`` is empty.
        :raises InvalidTimestampsInDataError: in ``'strict'`` mode when the
            data has timestamps outside the allowed set, or in ``'union'``
            mode when the merged timestamps violate the minimum gap.
        :raises NotEnoughInputData: if no rows remain after filtering.
        """
        if len(data) == 0:
            raise ValueError("Tried to add empty data for id {}".format(id))
        if id in self.all_raw_data and self.all_raw_data[id] == data:
            print("Data for id {} already added.".format(id))
            return

        if len(data[0]) > 2:
            # we have multidimensional data
            timestamp_indexed_data = SortedDict([[int(x[0]), x[1:]] for x in data])
        else:
            timestamp_indexed_data = SortedDict([[int(x[0]), x[1]] for x in data])

        # NOTE(review): keys above are int(x[0]) while this set holds the raw
        # x[0]; the two only agree when raw timestamps compare equal to ints.
        new_timestamps = {x[0] for x in data}
        difference = new_timestamps.difference(self.unique_timestamps)

        if self.mode == 'strict':
            if len(difference) != 0:
                raise InvalidTimestampsInDataError("Tried to add new data with id {} that includes timestamps that are not in the set of allowed timestamps. "
                                                   "Difference = {}".format(id, difference))
        elif self.mode == 'remove_difference':
            for timestamp_to_remove in difference:
                del timestamp_indexed_data[timestamp_to_remove]
        elif self.mode == 'union':
            previous_timestamps = self.unique_timestamps
            self.unique_timestamps = previous_timestamps.union(new_timestamps)
            try:
                self.check_minimum_timestamp_interval()
            except Exception:
                # Roll back so a rejected series does not leave the shared
                # timestamps in an invalid state.
                self.unique_timestamps = previous_timestamps
                raise

        if len(timestamp_indexed_data) == 0:
            raise NotEnoughInputData("The data being added has zero length. "
                                     "If the mode is remove_difference, then this means that the new data has no timestamps in common with the required timestamps")

        # Commit only after every check has passed.
        self.all_raw_data[id] = data
        self.timestamp_indexed_data[id] = timestamp_indexed_data

    def get_left_and_right_padding_required(self, ids):
        """Return ``[left, right]`` for each id in ``ids``.

        ``left`` is the number of shared timestamps before the series' first
        raw row and ``right`` the number after its last raw row.
        """
        padding_required = []
        for id in ids:
            first_timestamp_for_id = self.first_timestamp_for_id(id)
            last_timestamp_for_id = self.last_timestamp_for_id(id)
            left_padding = self.unique_timestamps.index(first_timestamp_for_id)
            right_padding = len(self) - self.unique_timestamps.index(last_timestamp_for_id) - 1
            # Sanity checks: the computed offsets must point back at the
            # series' own first and last timestamps.
            assert(self.all_raw_data[id][0][0] == self.unique_timestamps[left_padding])
            assert(self.all_raw_data[id][-1][0] == self.unique_timestamps[-(right_padding + 1)])
            padding_required.append([left_padding, right_padding])
        return padding_required

    def get_data_extend_missing_internal(self, id: str):
        """Return the series for ``id`` with internal gaps filled.

        This doesn't pad the left or right of the data, but it fills any
        shared timestamp missing strictly inside the series with the previous
        value.  Rows come back as ``[timestamp, *values]`` lists for
        list/tuple values, otherwise as ``(timestamp, value)`` tuples.

        NOTE(review): the fill is written into the stored series itself, not
        into a copy - confirm callers expect that.
        """
        timestamp_indexed_data = self.timestamp_indexed_data[id]
        timestamps_in_this_data = set(timestamp_indexed_data.keys())
        missing_timestamps = self.unique_timestamps - timestamps_in_this_data
        for timestamp in missing_timestamps:
            entry_index = timestamp_indexed_data.bisect_right(timestamp)
            if entry_index != 0 and entry_index < len(timestamp_indexed_data):
                # only pad in the middle of the data and not at the end
                current_padded_value = timestamp_indexed_data.peekitem(entry_index - 1)[1]
                timestamp_indexed_data[timestamp] = current_padded_value

        first_value = timestamp_indexed_data.peekitem(0)[1]
        if isinstance(first_value, (list, tuple)):
            return [[item[0], *item[1]] for item in timestamp_indexed_data.items()]
        return list(timestamp_indexed_data.items())

    def get_padded_data_in_sync(self, padding_val="extend"):
        """Return every series padded to cover all shared timestamps.

        Missing values in the middle or at the end are always filled by
        extending the previous value.  ``padding_val`` determines how to pad
        the beginning, where there is no value before it: ``'extend'`` reuses
        the series' first value, anything else is used as the fill value.

        NOTE(review): the returned mapping holds the stored series objects
        themselves, so the padding is also written into
        ``self.timestamp_indexed_data`` - confirm callers expect that.
        """
        padded_timestamp_indexed_data = {}
        for series_id, series in self.timestamp_indexed_data.items():
            padded_timestamp_indexed_data[series_id] = series
            timestamps_in_this_data = set(series.keys())
            missing_timestamps = self.unique_timestamps - timestamps_in_this_data
            for timestamp in missing_timestamps:
                entry_index = series.bisect_right(timestamp)
                if entry_index == 0:
                    if padding_val == 'extend':
                        current_padded_value = series.peekitem(entry_index)[1]
                    else:
                        current_padded_value = padding_val
                else:
                    current_padded_value = series.peekitem(entry_index - 1)[1]
                series[timestamp] = current_padded_value
        return padded_timestamp_indexed_data

    def get_start_and_end_index_for_concat_data(self, keys):
        """Return ``[start, stop]`` offsets of each known key's series in the concatenated data.

        Offsets are cumulative in the order of ``keys``.  Keys without stored
        data are skipped with a printed warning.
        """
        start_stop = []
        current_position = 0
        for id in keys:
            if id in self.timestamp_indexed_data:
                length_of_data = len(self.timestamp_indexed_data[id])
                start_stop.append([current_position, current_position + length_of_data])
                # Advance past this series so the next one starts where it ends.
                current_position += length_of_data
            else:
                print("warning: tried to concat data for keys {} but key {} is missing".format(keys, id))
        return start_stop

    def concat_data_unpadded(self, keys, as_numpy=True, with_timestamps=True):
        """Concatenate the stored series for ``keys`` in order.

        Keys without stored data are skipped with a printed warning.

        :param as_numpy: return a NumPy array when true, a nested list otherwise.
        :param with_timestamps: include the timestamp with each row when true,
            values only otherwise.
        """
        data_to_concat = []
        for id in keys:
            if id in self.timestamp_indexed_data:
                series = self.timestamp_indexed_data[id]
                if with_timestamps:
                    data_to_concat.append(np.squeeze(series.items()[:]))
                else:
                    data_to_concat.append(np.squeeze(series.values()[:]))
            else:
                print("warning: tried to concat data for keys {} but key {} is missing".format(keys, id))

        concatenated = np.concatenate(data_to_concat)
        return concatenated if as_numpy else concatenated.tolist()