def mine_string_patterns(doc):
    """Mine closed frequent token sequences from the lines of one document.

    ``doc`` is a pair ``(identifier, lines)``.  Every run of digits is removed
    from each line, the line is stripped and split on single spaces, and the
    empty pieces are discarded.  Tokens are encoded as integers in order of
    first appearance and handed to ``PrefixSpan``; patterns of length 2 to 10
    occurring at least twice are mined with ``closed=True``.

    Returns ``(identifier, results)`` where ``results`` is a list of
    ``[pattern, frequency]`` pairs and ``pattern`` is the space-joined token
    string of the mined sequence.
    """
    doc_id, lines = doc

    # Tokenise each line: drop digit runs, keep non-empty space-separated pieces.
    token_lists = [
        [piece for piece in re.sub(r'\d+', '', raw).strip().split(' ') if piece]
        for raw in lines
    ]

    # Give every distinct token the next free integer code.
    wordmap = {}  # type: Dict[str, int]
    for tokens in token_lists:
        for token in tokens:
            if token not in wordmap:
                wordmap[token] = len(wordmap)

    encoded = [[wordmap[token] for token in tokens] for tokens in token_lists]

    miner = PrefixSpan(encoded)
    code_to_token = invert(wordmap)
    miner.minlen = 2
    miner.maxlen = 10

    # No custom sort key, upper bound or filter is supplied to the miner.
    mined = miner.frequent(
        2,
        closed=True,
        generator=False,
        key=None,
        bound=None,
        filter=None,
    )

    results = []
    for freq, patt in mined:
        results.append([' '.join(code_to_token[code] for code in patt), freq])
    return doc_id, results
def one_stacking_period():
    """Mine frequent symbol patterns per user using one stacking period.

    Reads the CSV at ``FILE`` (the first row is skipped as a header) and
    builds one symbol sequence per user, where the user is column 2.  For
    every row with a non-empty user, column 3 is added to a running timer:

    * a row whose column 6 equals ``"0"`` appends ``"c"`` then ``"a"`` and
      resets the timer to zero;
    * whenever the timer exceeds ``PERIOD * 1000`` a ``"c"`` is appended and
      one period is subtracted from the timer.

    Each sequence is printed with ``" -1 "`` between symbols and a trailing
    ``" -2"``.  The sequences are then mined with ``PrefixSpan`` (pattern
    length 3 to 8) and the top-20 patterns are printed three times: with
    default options, with ``closed=True`` and with ``generator=True``.

    Rows with an empty user column contribute no symbols but still end the
    current user's run.  If a user appears in several separate runs, the
    last run replaces the earlier ones.
    """
    sequences = {}
    # csv expects files opened with newline="" so it can handle line endings.
    with open(FILE, newline="") as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader, None)  # skip the header row
        curr_usr = None
        events = []
        curr_time = 0
        # NOTE(review): the timer is not reset when the user changes -- confirm
        # that carrying it over between users is intended.
        aversion = "c"
        for row in reader:
            user = row[2]
            if user != curr_usr:
                # Store the finished run under the user it belongs to; the
                # initial state and blank-user runs hold no events.
                if curr_usr:
                    sequences[curr_usr] = events
                curr_usr = user
                events = []
            if user == "":
                continue
            curr_time += int(row[3])
            if row[6] == "0":
                # NOTE(review): `aversion` is reset to "c" on every path, so
                # this check appears to be always true -- confirm intent.
                if aversion == "c":
                    events.append(aversion)
                aversion = "a"
                events.append(aversion)
                aversion = "c"
                curr_time = 0
            if curr_time > PERIOD * 1000:
                events.append(aversion)
                curr_time -= PERIOD * 1000
                aversion = "c"
        # The last user has no following row to trigger the flush above.
        if curr_usr:
            sequences[curr_usr] = events

    symbol_sequences = list(sequences.values())
    for sequence in symbol_sequences:
        print(" -1 ".join(sequence) + " -2")

    ps = PrefixSpan(symbol_sequences)
    print("one stacking period \n\n")
    ps.minlen = 3
    ps.maxlen = 8
    for options in ({}, {"closed": True}, {"generator": True}):
        for found in ps.topk(20, **options):
            print(found)
        print("\n")
    print("\n\n\n")
def find_clusters_names(labels, features):
    """Derive one name per cluster from the index entries of its members.

    For every cluster id from ``0`` to ``max(labels)`` the index entries of
    the rows in ``features`` whose ``'labels'`` column equals that id are
    collected.  Each entry is split on ``"::"`` and the words of its last
    component (split on single spaces) are appended to the parts.  The
    resulting sequences are mined with ``PrefixSpan`` for patterns of exactly
    length 4, keeping only patterns for which ``diversity_score(patt)`` is at
    least ``len(patt)``.

    Returns a list with ``create_str`` applied to the pattern of the first of
    the top-5 results of each cluster.  Indexing that first result raises
    ``IndexError`` when a cluster yields no pattern.

    Assumes ``features`` is a pandas-like frame with a string index --
    TODO confirm against the caller.
    """
    cluster_count = max(labels) + 1

    # Phase 1: turn every member's index entry into a token sequence.
    groups = []
    for label in range(cluster_count):
        members = features[features['labels'] == label].index.tolist()
        sequences = []
        for entry in members:
            parts = entry.split("::")
            sequences.append(parts + parts[-1].split(" "))
        groups.append(sequences)

    def is_diverse(patt, matches):
        return diversity_score(patt) >= len(patt)

    # Phase 2: mine each cluster for its best length-4 patterns.
    res = []
    for sequences in groups:
        miner = PrefixSpan(sequences)
        miner.maxlen = 4
        miner.minlen = 4
        res.append(miner.topk(5, filter=is_diverse))

    # Phase 3: name each cluster after its top-ranked pattern.
    return [create_str(found[0][1]) for found in res]
def raw_data():
    """Mine frequent symbol patterns per user from the raw rows.

    Reads the CSV at ``FILE`` (the first row is skipped as a header) and
    builds one symbol sequence per user, where the user is column 2.  Every
    row with a non-empty user appends ``"a"`` when column 6 equals ``"0"``
    and ``"c"`` otherwise.

    Each sequence is printed with ``" -1 "`` between symbols and a trailing
    ``" -2"``.  The sequences are then mined with ``PrefixSpan`` (pattern
    length 3 to 8) and the top-20 patterns are printed three times: with
    default options, with ``closed=True`` and with ``generator=True``.

    Rows with an empty user column contribute no symbols but still end the
    current user's run.  If a user appears in several separate runs, the
    last run replaces the earlier ones.
    """
    sequences = {}
    # csv expects files opened with newline="" so it can handle line endings.
    with open(FILE, newline="") as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader, None)  # skip the header row
        curr_usr = None
        events = []
        for row in reader:
            user = row[2]
            if user != curr_usr:
                # Store the finished run under the user it belongs to; the
                # initial state and blank-user runs hold no events.
                if curr_usr:
                    sequences[curr_usr] = events
                curr_usr = user
                events = []
            if user == "":
                continue
            events.append("a" if row[6] == "0" else "c")
        # The last user has no following row to trigger the flush above.
        if curr_usr:
            sequences[curr_usr] = events

    symbol_sequences = list(sequences.values())
    for sequence in symbol_sequences:
        print(" -1 ".join(sequence) + " -2")

    ps = PrefixSpan(symbol_sequences)
    print("raw data \n\n")
    ps.minlen = 3
    ps.maxlen = 8
    for options in ({}, {"closed": True}, {"generator": True}):
        for found in ps.topk(20, **options):
            print(found)
        print("\n")
    print("\n\n\n")
def _classify_aversion_events(events, contact_points):
    """Turn one user's raw ``[x, y, flag]`` events into direction symbols.

    ``contact_points`` is the list of ``[x, y]`` positions whose column-wise
    mean serves as the reference point.  Events whose flag is not ``"0"``
    map to ``"c"``.  For a flag of ``"0"`` the absolute x/y deviations from
    the mean are compared:

    * ``"f"`` when the two relative deviations differ by less than
      ``treshold``, or both exceed ``treshold2``;
    * ``"l"`` / ``"r"`` when the x deviation is larger and x lies below /
      above the mean;
    * ``"u"`` / ``"d"`` otherwise, when y lies below / above the mean;
    * ``"c"`` when the compared coordinate equals the mean exactly.

    Presumably the letters stand for front/left/right/up/down -- verify.
    """
    if not events:
        # Nothing to classify; also avoids averaging an empty list.
        return []
    mean = np.average(contact_points, axis=0)
    symbols = []
    for x, y, flag in events:
        symbol = "c"
        if flag == "0":
            dx, dy = (np.abs(value - centre) for value, centre in zip((x, y), mean))
            ratio_x = (dx + mean[0]) / mean[0]
            ratio_y = (dy + mean[1]) / mean[1]
            if np.abs(ratio_x - ratio_y) < treshold or (
                ratio_x > treshold2 and ratio_y > treshold2
            ):
                symbol = "f"
            elif dx - dy > 0:
                if x < mean[0]:
                    symbol = "l"
                if x > mean[0]:
                    symbol = "r"
            else:
                if y < mean[1]:
                    symbol = "u"
                if y > mean[1]:
                    symbol = "d"
        symbols.append(symbol)
    return symbols


def aversion_direction_one_stacking_period():
    """Mine frequent direction patterns per user using one stacking period.

    Reads the CSV at ``FILE`` (the first row is skipped as a header) and
    collects raw events per user, where the user is column 2.  Columns 4 and
    5 are parsed as floats after replacing ``","`` with ``"."``.  For every
    row with a non-empty user:

    * column 6 equal to ``"1"`` adds the position to the user's reference
      points;
    * column 3 is added to a running timer;
    * column 6 equal to ``"0"`` records a placeholder event followed by the
      row's position, and resets the timer;
    * whenever the timer exceeds ``PERIOD * 1000`` a placeholder event is
      recorded and one period is subtracted from the timer.

    When a user's run ends, the events are converted to symbols relative to
    the mean of that user's reference points.  Each symbol sequence is
    printed with ``" -1 "`` between symbols and a trailing ``" -2"``.  The
    sequences are then mined with ``PrefixSpan`` (pattern length 3 to 8) and
    the top-20 patterns are printed three times: with default options, with
    ``closed=True`` and with ``generator=True``.

    Rows with an empty user column contribute no events but still end the
    current user's run.  If a user appears in several separate runs, the
    last run replaces the earlier ones.
    """
    sequences = {}
    # csv expects files opened with newline="" so it can handle line endings.
    with open(FILE, newline="") as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader, None)  # skip the header row
        curr_usr = None
        contact_points = []
        curr_time = 0
        # NOTE(review): the timer is not reset when the user changes -- confirm
        # that carrying it over between users is intended.
        aversion = [0.0, 0.0, "1"]
        events = []
        for row in reader:
            user = row[2]
            if user != curr_usr:
                # Classify the finished run; the initial state and blank-user
                # runs hold no events and are not stored.
                if curr_usr:
                    sequences[curr_usr] = _classify_aversion_events(
                        events, contact_points
                    )
                curr_usr = user
                events = []
                contact_points = []
            if user == "":
                continue
            if row[6] == "1":
                contact_points.append([
                    float(row[4].replace(",", ".")),
                    float(row[5].replace(",", ".")),
                ])
            curr_time += int(row[3])
            if row[6] == "0":
                # NOTE(review): `aversion` is reset to the placeholder on every
                # path, so this check appears to be always true -- confirm.
                if aversion[2] == "1":
                    events.append(aversion)
                aversion = [
                    float(row[4].replace(",", ".")),
                    float(row[5].replace(",", ".")),
                    row[6],
                ]
                events.append(aversion)
                aversion = [0.0, 0.0, "1"]
                curr_time = 0
            if curr_time > PERIOD * 1000:
                events.append(aversion)
                curr_time -= PERIOD * 1000
                aversion = [0.0, 0.0, "1"]
        # The last user has no following row to trigger the flush above.
        if curr_usr:
            sequences[curr_usr] = _classify_aversion_events(events, contact_points)

    symbol_sequences = list(sequences.values())
    for sequence in symbol_sequences:
        print(" -1 ".join(sequence) + " -2")

    ps = PrefixSpan(symbol_sequences)
    print("aversion direction one stacking period \n\n")
    ps.minlen = 3
    ps.maxlen = 8
    for options in ({}, {"closed": True}, {"generator": True}):
        for found in ps.topk(20, **options):
            print(found)
        print("\n")
    print("\n\n\n")
discrete_time = [] with open(filename, "r", encoding="utf-8") as weights_file: print(f"Reading file {filename}") for i, weights_triple in enumerate(weights_file): current_weights = weights_triple.replace(",", ".").split("\t") weights.append(int(current_weights[1])) discrete_time_base = int(current_weights[0].strip()) discrete_time.append(discrete_time_base) curr_frequency = int(current_weights[3].strip()) frequency.append(curr_frequency) for k in range(0, curr_frequency): weights.append(int(current_weights[1])) discrete_time_base += 1 discrete_time.append(discrete_time_base) if limit is not None and (i == limit or discrete_time_base >= limit): print("Limit reached") break return discrete_time, weights if __name__ == '__main__': basedir = "C:/Users/havar/Home/cache_simulation_results/" _t, _w = _read_db(basedir + "scaled_w_01.csv") data = list(chunks(_w, 1000)) ps = PrefixSpan(data) ps.minlen = 5 ps.maxlen = 100 print(ps.frequent(5, closed=True))