def one_stacking_period():
    dict = {}
    with open(FILE) as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(spamreader, None)
        curr_usr = "******"
        temp = []
        curr_time = 0
        aversion = "c"
        for row in spamreader:
            if not curr_usr == row[2]:
                curr_usr = row[2]
                dict[curr_usr] = temp
                temp = []
            if row[2] == "":
                continue
            curr_time += int(row[3])
            if row[6] == "0":
                if aversion == "c":
                    temp.append(aversion)
                aversion = "a"
                temp.append(aversion)
                aversion = "c"
                curr_time = 0
            if curr_time > PERIOD * 1000:
                temp.append(aversion)
                curr_time = curr_time - (PERIOD * 1000)
                aversion = "c"

    for i in list(dict.values()):
        print(" -1 ".join(i) + " -2")
    # print(dict.values())
    ps = PrefixSpan(list(dict.values()))
    print("one stacking period \n\n")
    ps.minlen = 3
    ps.maxlen = 8
    for i in ps.topk(20):
        print(i)
    print("\n")
    for i in ps.topk(20, closed=True):
        print(i)
    print("\n")
    for i in ps.topk(20, generator=True):
        print(i)
    print("\n")
    # for i in ps.frequent(2):
    #     print(i)
    print("\n\n\n")
def base_sequence(dataset, num):
    data = []
    for i in range(0, num):
        item = dataset[i].split("\t")[0].lower()
        data.append(item)
    ps_base = PS(data)
    base_sequence = get_longest(ps_base.topk(1000))
    return base_sequence
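# Note: `PS` and `get_longest` are not defined in this snippet; `PS` is presumably an
# alias for prefixspan.PrefixSpan. A minimal sketch of what `get_longest` might look
# like, assuming it simply picks the longest pattern among the (support, pattern)
# tuples returned by topk() -- an illustrative guess, not the original helper:
def get_longest(patterns):
    # Each entry is a (support, pattern) tuple; return the longest pattern.
    if not patterns:
        return []
    return max(patterns, key=lambda sp: len(sp[1]))[1]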
def raw_data():
    dict = {}
    with open(FILE) as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(spamreader, None)
        curr_usr = "******"
        temp = []
        for row in spamreader:
            if not curr_usr == row[2]:
                curr_usr = row[2]
                dict[curr_usr] = temp
                temp = []
            if row[2] == "":
                continue
            if row[6] == "0":
                temp.append("a")
            else:
                temp.append("c")

    for i in list(dict.values()):
        print(" -1 ".join(i) + " -2")
    # print(dict.values())
    ps = PrefixSpan(list(dict.values()))
    print("raw data \n\n")
    ps.minlen = 3
    ps.maxlen = 8
    for i in ps.topk(20):
        print(i)
    print("\n")
    for i in ps.topk(20, closed=True):
        print(i)
    print("\n")
    for i in ps.topk(20, generator=True):
        print(i)
    print("\n")
    # for i in ps.frequent(2):
    #     print(i)
    print("\n\n\n")
def mine_frequent_span(log):
    input = []
    different_events = set()
    for trace in log:
        trace_events = []
        for event in trace:
            event_attribs = event.get_attributes()
            event_name = str(event_attribs["concept:name"])
            if "lifecycle:transition" in event_attribs:
                event_name += "-" + str(event_attribs["lifecycle:transition"])
            trace_events.append(event_name)
            different_events.add(event_name)
        input.append(trace_events)

    # Encode input
    encoding = {}
    decoding = {}
    for i, event in enumerate(different_events):
        encoding[event] = i
        decoding[i] = event

    # Encode traces
    minimum_size = 5
    encoded = [[encoding[event] for event in sublist] for sublist in input]
    ps = PrefixSpan(encoded)
    outputs = ps.topk(10000)
    decoded_output = list(
        reversed(
            sorted([(sublist[0], [decoding[output] for output in sublist[1]]) for sublist in outputs],
                   key=lambda x: x[0])))
    # print(decoded_output)
    to_file = "\n".join(map(str, decoded_output))
    with open("frequent_subs.txt", "w") as f:
        f.write(to_file)
def find_clusters_names(labels, features):
    groups = [[] for i in range(0, max(labels) + 1)]
    for i in range(0, max(labels) + 1):
        groups[i] = features[features['labels'] == i].index
        groups[i] = groups[i].tolist()
    for group in groups:
        for i in range(0, len(group)):
            group[i] = group[i].split("::")
            group[i] = group[i] + group[i][len(group[i]) - 1].split(" ")
    res = []
    for group in groups:
        prefix = PrefixSpan(group)
        prefix.maxlen = 4
        prefix.minlen = 4
        res.append(prefix.topk(5, filter=lambda patt, matches: diversity_score(patt) >= len(patt)))
    return [create_str(res[i][0][1]) for i in range(0, len(res))]
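# Note: `diversity_score` and `create_str` are not shown in this snippet. One plausible
# reading is that the filter keeps only patterns made of distinct items and that
# `create_str` joins a pattern back into a readable label; the sketch below is an
# assumption, not the original implementation:
def diversity_score(patt):
    # Number of distinct items; equals len(patt) only when no item repeats.
    return len(set(patt))

def create_str(patt):
    # Join the pattern items into a single cluster name.
    return " ".join(str(item) for item in patt)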
def find_patterns(self):
    print(self.sampling_type)
    db = self.data
    ps = PrefixSpan(db)
    n_items = len(db)
    result = None
    opts = {
        "closed": self.closed,
        # Somehow does not work
        # "generator": self.generator
    }
    from pprint import pprint
    pprint(opts)
    if self.sampling_type:
        result = ps.topk(self.k, **opts)
    else:
        print("Support value:", self.min_support)
        print("Size:", n_items)
        print("Support:", n_items * self.min_support / 100)
        result = ps.frequent((self.min_support * n_items / 100.0), **opts)

    self.table.model().clear()
    model = QStandardItemModel(self.table)
    model.clear()
    for col, label in enumerate(["Support", "Pattern"]):
        item = QStandardItem(label)
        model.setHorizontalHeaderItem(col, item)

    sequences = []
    for support, pattern in result:
        if len(pattern) < self.min_len:
            continue
        support /= n_items
        sequences.append((support, pattern))
        sitem = self.NumericItem(support)
        pitem = QStandardItem(str(pattern))
        model.appendRow([sitem, pitem])

    self.Outputs.object.send(sequences)
    self.table.setModel(model)
class PrefixSpanManager:
    """
    Utility class for working with prefixspan

    Parameters:
        * sax_engine: SaxEngine
            SAX preprocessing instance
        * export: Boolean
            Whether or not the data has already been exported to the expected format

    Variables:
        * se_instance: SaxEngine
            The SAX engine instance
        * data: Array[]
            The data in SAX format
    """
    def __init__(self, sax_engine, export=True):
        self.se_instance = sax_engine
        self.data = sax_engine.sax_data
        self.process_data = []
        self.ps = None
        self.ploter = Plot(self)
        if export:
            self.export_format()

    def run(self):
        """
        Creates the PrefixSpan instance from the preprocessed data
        """
        self.ps = PrefixSpan(self.process_data)

    def export_format(self):
        """
        Reshapes the data to match what the PrefixSpan instance expects
        """
        tmp = []
        for elmt in self.data:
            tmp.append(elmt.ravel())
        self.process_data = tmp

    def topk(self, n, c=True):
        """
        Returns the most frequent patterns (highest support), closed ones by default

        Parameters:
            * n: int
                Number of patterns to return

        Returns:
            List of frequent patterns
        """
        return self.ps.topk(n, closed=c)

    def frequent(self, n):
        """
        Returns the patterns with support at least n

        Parameters:
            * n: int
                Minimal support

        Returns:
            List of patterns with minimal support n
        """
        return self.ps.frequent(n)

    def plot(self, l):
        self.ploter.plot_prefixspan(l)
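# A minimal usage sketch for PrefixSpanManager, assuming the `Plot` helper used in
# __init__ is importable and that the SAX engine only needs to expose `sax_data` as a
# list of numpy arrays. `FakeSaxEngine` below is illustrative, not part of the class:
import numpy as np

class FakeSaxEngine:
    # Stand-in providing only the attribute PrefixSpanManager actually reads.
    def __init__(self, sax_data):
        self.sax_data = sax_data

engine = FakeSaxEngine([np.array(list("abcab")), np.array(list("abcc"))])
manager = PrefixSpanManager(engine)   # export_format() flattens each array with ravel()
manager.run()                         # builds the underlying PrefixSpan instance
print(manager.topk(3))                # closed patterns by default
print(manager.frequent(2))            # patterns with support >= 2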
def main():
    dblp_data = pd.read_csv(r'DBLP_Dataset.csv', encoding="ISO-8859-1")
    author_title = dblp_data
    dataset = author_title.to_numpy()
    list1 = dataset[:, 2].tolist()

    # convert authors to lower case
    list2 = []
    for i in list1:
        sublist = i.lower().split()
        list2.append(sublist)

    te = TransactionEncoder()
    te_ary = te.fit(list2).transform(list2)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent = fpgrowth(df, min_support=0.001, use_colnames=True)
    frequent = frequent[frequent['itemsets'].str.len() > 1]

    freqauth_list = []
    for i in frequent['itemsets']:
        freqauth_list.append([x for x in i])

    freqauth_dict = {}
    for i in freqauth_list:
        title_idx_sublist = []
        for idx, j in enumerate(list2):
            if set(i).issubset(j):
                title_idx_sublist.append(idx)
        freqauth_dict.update({tuple(i): title_idx_sublist})

    freqauth_title_dict = {}
    kstem = ks.PyKrovetzStemmer()
    for key, value in freqauth_dict.items():
        title_df = author_title.iloc[value]['title']
        title_sublist = list(title_df)
        title_sublists = []
        temp_list = []
        for x in title_sublist:
            tempx = re.sub(r'[.]', '', x)
            temp_list = re.sub(r'[^\x00-\x7F]+', '', tempx).lower().split()
            temp_list2 = []
            if isinstance(temp_list, list):
                temp_list2.append([kstem.stem(z) for z in temp_list if not z in stopwordlist])
                title_sublists.extend(temp_list2)
            else:
                if not temp_list in stopwordlist:
                    title_sublists.extend([kstem.stem(temp_list)])
        freqauth_title_dict.update({key: title_sublists})

    # Closed / Top k titles of frequent authors
    freqauth_title_dict_closed = {}
    for k, v in freqauth_title_dict.items():
        ps = PrefixSpan(v)
        closed_Seq_pattern = ps.topk(5, closed=True)
        freqauth_title_dict_closed.update({k: closed_Seq_pattern})

    # To get frequent author's context indicators
    frequentlist = freqauth_list
    cleanedList = list2
    new_author_list = []
    for i in range(0, len(frequentlist)):
        temp_author_list = []
        authorlist = list(frequentlist[i])
        found = 0
        for k in range(0, len(cleanedList)):
            for j in range(0, len(authorlist)):
                if (authorlist[j] in (cleanedList[k])):
                    found = 1
                else:
                    found = 0
                    break
            if found == 1:
                for jj in range(0, len(authorlist)):
                    if (authorlist[jj] in (cleanedList[k])):
                        cleanedList[k].remove(authorlist[jj])
                temp_author_list.append(cleanedList[k])
        new_author_list.append(temp_author_list)

    context_indicator_list = []
    for i in range(0, len(new_author_list)):
        te = TransactionEncoder()
        te_ary = te.fit(new_author_list[i]).transform(new_author_list[i])
        df = pd.DataFrame(te_ary, columns=te.columns_)
        frequent_author_list = fpgrowth(df, min_support=0.5, use_colnames=True)

        supp = frequent_author_list.support.unique()  # all unique support count

        # Dictionary storing itemset with same support count key
        freq_dic = {}
        for i in range(len(supp)):
            inset = list(frequent_author_list.loc[frequent_author_list.support == supp[i]]['itemsets'])
            freq_dic[supp[i]] = inset

        # Dictionary storing itemset with support count <= key
        freq_dic2 = {}
        for i in range(len(supp)):
            inset2 = list(frequent_author_list.loc[frequent_author_list.support <= supp[i]]['itemsets'])
            freq_dic2[supp[i]] = inset2

        # Find Closed frequent itemset
        close_freq = []
        for index, row in frequent_author_list.iterrows():
            isclose = True
            cli = row['itemsets']
            cls = row['support']
            checkset = freq_dic[cls]
            for i in checkset:
                if (cli != i):
                    if (frozenset.issubset(cli, i)):
                        isclose = False
                        break
            if (isclose):
                close_freq.append([x for x in (row['itemsets'])])
        context_indicator_list.append(close_freq)

    freqauth_context_ind_dict = {}
    for authpair, titlelist in freqauth_title_dict_closed.items():
        cleantitlelist = []
        for i in titlelist:
            if isinstance(i, tuple):
                if isinstance(i[1], list):
                    listtostring = ' '.join(i[1])
                    cleantitlelist.append(listtostring)
        freqauth_context_ind_dict.update({authpair: cleantitlelist})

    # Merging both titles and Context indicator author for frequent pattern authors
    for idx, key in enumerate(freqauth_context_ind_dict):
        newval = []
        if len(context_indicator_list[idx]) > 0:
            for i in context_indicator_list[idx]:
                if len(i) > 0:
                    tempstr = '&'.join(i)
                    newval = freqauth_context_ind_dict[key]
                    newval.append(tempstr)
                    freqauth_context_ind_dict.update({key: newval})

    # Context Indicator Weighting
    CI_list = list(freqauth_context_ind_dict.values())
    freqauth_context_in_weights = {}
    for key, value in freqauth_context_ind_dict.items():
        freq_auth_CI_list = value
        length_of_CI = len(value)
        temp_dict = {}
        for i in freq_auth_CI_list:
            count_tmp = 0
            for j in CI_list:
                if (i in (j)):
                    count_tmp += 1
            weight = round(1 - ((count_tmp - 1) / count_tmp), 2)
            if (weight > 0.1):
                temp_dict.update({i: weight})
        sorted_weights_dict = sorted(temp_dict.items(), key=lambda x: x[1], reverse=True)
        freqauth_context_in_weights.update({key: sorted_weights_dict})

    freq_auth_transactions = {}
    list_of_freq_auth = list(freqauth_context_in_weights.keys())
    for i in range(0, len(freqauth_title_dict)):
        temp_dict = {}
        title_list = freqauth_title_dict.get(list_of_freq_auth[i])
        CI_list = freqauth_context_in_weights[list_of_freq_auth[i]]
        CI_list_auth = []
        for n, c in enumerate(CI_list):
            CI_list_auth.append(c[0])
        for j in range(0, len(title_list)):
            cos_sim = cos_similarity(CI_list_auth, title_list[j])
            cos_sim = round(cos_sim, 3)
            t_title = ' '.join(freqauth_title_dict[list_of_freq_auth[i]][j])
            temp_dict.update({t_title: cos_sim})
        sorted_title_dict = sorted(temp_dict.items(), key=lambda x: x[1], reverse=True)
        t_len = len(list(temp_dict.values()))
        max_len = t_len
        if (t_len > 4):
            max_len = 4
        sorted_title_dict1 = dict(list(sorted_title_dict)[0:max_len])
        freq_auth_transactions.update({list_of_freq_auth[i]: sorted_title_dict1})

    # To find the strongest SSP - Match against similarity of the context units
    freq_auth_SSPs = {}
    list_of_freq_auth = list(freqauth_context_ind_dict.keys())
    list_of_freq_auth_CI = list(freqauth_context_ind_dict.values())
    len_list_of_freq_auth_CI = len(list_of_freq_auth_CI)
    context_indicator_similarity = np.zeros([len_list_of_freq_auth_CI, len_list_of_freq_auth_CI], dtype=float)
    for i in range(0, len_list_of_freq_auth_CI):
        for j in range(0, len_list_of_freq_auth_CI):
            cos_sim = cos_similarity(list_of_freq_auth_CI[i], list_of_freq_auth_CI[j])
            cos_sim = round(cos_sim, 3)
            if (i != j):
                context_indicator_similarity[i][j] = cos_sim
                context_indicator_similarity[j][i] = cos_sim

    context_indicator_similarity_idx = np.zeros([len_list_of_freq_auth_CI, 3], dtype=int)
    for i in range(0, len(context_indicator_similarity)):
        context_indicator_similarity_idx[i] = np.argsort(context_indicator_similarity[i])[-3:]

    SSP_Author_List = []
    for i in range(0, len(list_of_freq_auth)):
        temp_author_list_ssp = []
        for j in range(0, len(context_indicator_similarity_idx[i])):
            temp_author_list_ssp.append(list_of_freq_auth[context_indicator_similarity_idx[i][j]])
        SSP_Author_List.append(temp_author_list_ssp)

    SSP_Title_List = []
    CI_list_title = list(freqauth_title_dict_closed.values())
    CI_list1 = []
    for i in (CI_list_title):
        temp_list3 = []
        for j in i:
            CI_str = ' '.join(j[1])
            temp_list3.append(CI_str)
        CI_list1.append(list(set(temp_list3)))

    for i in range(0, len(CI_list1)):
        temp_title_list_ssp = []
        for j in range(0, len(context_indicator_similarity_idx[i])):
            ssp_str = CI_list1[context_indicator_similarity_idx[i][j]]
            temp_title_list_ssp.extend(ssp_str)
        SSP_Title_List.append(list(set(temp_title_list_ssp)))

    # Write the output to a CSV file
    # a) list_of_freq_auth
    # b) list_of_freq_auth_CI / freqauth_context_in_weights
    # c) freq_auth_transactions
    # d) SSP_Author_List
    # e) SSP_Title_List
    # for i in range(0, frequent_author_list):
    #     print(len(SSP_Title_List))
    #     print(SSP_Title_List)
    titles_list_with_weight = list(freq_auth_transactions.values())

    # Joining SSP authors
    SSP_authors_formatted = []
    for i in range(0, len(SSP_Author_List)):
        temp_list = []
        for j in range(0, len(SSP_Author_List[i])):
            authors = '&'.join(list(SSP_Author_List[i][j]))
            temp_list.append(authors)
        SSP_authors_formatted.append(temp_list)

    with open("./output.txt", 'w', encoding="utf-8") as f:
        f.write('Pattern' + '||' + 'Context Indicator' + '||' + 'Transaction 1' + '||'
                + 'Transaction 2' + '||' + 'Transaction 3' + '||' + 'Transaction 4' + '||'
                + 'SSP - Co-Author' + '||' + 'SSP - Title' + '\n')
        for i in range(0, len(list_of_freq_auth)):
            authors = ' '.join(list(list_of_freq_auth[i]))
            f.write(authors + '||')
            Context_indicators = '; '.join(list_of_freq_auth_CI[i])
            f.write(Context_indicators + '||')
            for j in (titles_list_with_weight[i].keys()):
                f.write(j + '||')
            ssp_authors = '; '.join(SSP_authors_formatted[i])
            f.write(ssp_authors + '||')
            ssp_titles = '; '.join(SSP_Title_List[i])
            f.write(ssp_titles)
            f.write('\n')
def aversion_direction_one_stacking_period():
    dict = {}
    with open(FILE) as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(spamreader, None)
        curr_usr = "******"
        avg = []
        curr_time = 0
        aversion = [0.0, 0.0, "1"]
        temp = []
        for row in spamreader:
            if not curr_usr == row[2]:
                mean = np.average(avg, axis=0)
                t = []
                for i in temp:
                    res = "c"
                    if i[2] == "0":
                        diff = [np.abs(a - b) for a, b in zip(i[0:2], mean)]
                        # if np.abs((diff[0] / mean[0]) - (diff[1] / mean[1])) < treshold:
                        if (np.abs(((diff[0] + mean[0]) / mean[0]) - ((diff[1] + mean[1]) / mean[1])) < treshold) or \
                                (((diff[0] + mean[0]) / mean[0]) > treshold2 and (
                                    ((diff[1] + mean[1]) / mean[1]) > treshold2)):
                            res = "f"
                        elif diff[0] - diff[1] > 0:
                            if i[0] < mean[0]:
                                res = "l"
                            if i[0] > mean[0]:
                                res = "r"
                        else:
                            if i[1] < mean[1]:
                                res = "u"
                            if i[1] > mean[1]:
                                res = "d"
                    t.append(res)
                dict[curr_usr] = t
                curr_usr = row[2]
                temp = []
                avg = []
            if row[2] == "":
                continue
            if row[6] == "1":
                avg.append([
                    float(row[4].replace(",", ".")),
                    float(row[5].replace(",", "."))
                ])
            curr_time += int(row[3])
            if row[6] == "0":
                if aversion[2] == "1":
                    temp.append(aversion)
                aversion = [
                    float(row[4].replace(",", ".")),
                    float(row[5].replace(",", ".")),
                    row[6]
                ]
                temp.append(aversion)
                aversion = [0.0, 0.0, "1"]
                curr_time = 0
            if curr_time > PERIOD * 1000:
                temp.append(aversion)
                curr_time = curr_time - (PERIOD * 1000)
                aversion = [0.0, 0.0, "1"]

    for i in list(dict.values()):
        print(" -1 ".join(i) + " -2")
    # print(dict.values())
    ps = PrefixSpan(list(dict.values()))
    print("aversion direction one stacking period \n\n")
    ps.minlen = 3
    ps.maxlen = 8
    for i in ps.topk(20):
        print(i)
    print("\n")
    for i in ps.topk(20, closed=True):
        print(i)
    print("\n")
    for i in ps.topk(20, generator=True):
        print(i)
    print("\n")
    # for i in ps.frequent(2):
    #     print(i)
    print("\n\n\n")
def get_common(sequence, data):
    data = data.split("\t")[0].lower()
    data = [sequence, data]
    ps = PS(data)
    common = get_longest(ps.topk(1000))
    return common
def recommend(trainingset=l_good, s_group=s_good, student=s_good[0], path_length=9, rl=resourcelist):
    # Give this student's own learning log more weight by appending extra copies of it
    for i in range(30):
        trainingset.append(trainingset[s_group.index(student)])
    ps = PrefixSpan(trainingset)
    pattern = ps.topk(1000, filter=lambda patt, matches: len(patt) > 1)  # pattern length should be bigger than 1
    pattern_time = {}  # stores every bigram pattern with its accumulated frequency
    for i, element in enumerate(pattern):
        l_s = []  # store the bigrams found in this element
        s = ""
        for i in range(len(element[1])):
            if i == 0:
                s = str(element[1][i])
            else:
                l_s.append(s + "," + str(element[1][i]))
                s = str(element[1][i])
        for j in l_s:
            if j in pattern_time.keys():
                pattern_time[j] += element[0]
            else:
                pattern_time[j] = element[0]
    # order the patterns as a list
    pattern_time = sorted(pattern_time.items(), key=lambda pattern_time: pattern_time[1], reverse=True)
    print("pattern with time:", pattern_time)
    # delete repeated parts
    # print(len(pattern_time))
    """
    Deduplication. We can't delete items of a list inside the for loop
    (it causes an 'index out of range' problem), so we store the repeated
    entries and delete them afterwards.
    """
    delete_indice = []
    for k1 in range(len(pattern_time)):
        starter = pattern_time[k1][0].split(",")[0]
        ender = pattern_time[k1][0].split(",")[1]
        if starter == ender:
            delete_indice.append(k1)
        if pattern_time[k1] == pattern_time[-1]:
            break
        for k2 in range(k1 + 1, len(pattern_time)):
            # print(pattern_time[k2])
            temps_start = pattern_time[k2][0].split(",")[0]
            temps_end = pattern_time[k2][0].split(",")[1]
            if starter == temps_start:
                delete_indice.append(pattern_time[k2])
            if ender == pattern_time[k2][0].split(",")[1]:
                delete_indice.append(pattern_time[k2])
    for i in set(delete_indice):
        if i in pattern_time:
            pattern_time.remove(i)
    """
    Organise the path from the pattern list.
    We first find the head, then complete the path.
    """
    element = []
    pattern_result = [x[0] for x in pattern_time]  # drop the pattern counts, keep the patterns
    # print("unique pattern:", pattern_result)
    store = []
    for i in range(len(pattern_result)):
        for j in range(len(pattern_result)):
            if i == j:
                continue
            if pattern_result[i].split(",")[0] in pattern_result[j]:
                store.append(pattern_result[i])
    path = list(set(pattern_result).difference(set(store)))[0]
    print("begin_node of path:", path)
    compt = 0
    c_b = 0
    l_change = 2
    while compt < path_length - 2:  # the first node has two elements, so we add path_length - 2
        c_b += 1
        for i in pattern_result:
            if i.split(",")[0] == path.split(",")[-1]:
                path += "," + i.split(",")[-1]
                compt += 1
        if l_change == len(path):
            c_b += 1
        else:
            l_change = len(path)
        if c_b > 100000:
            break
    print("path:", path)
    return path
def loadFile(filepath):
    "Read one sequence (a Python list literal) per line into a database"
    db = []
    file = open(filepath)
    line = file.readline()
    while line:
        db.append(eval(line))
        line = file.readline()
    return db


def generateFilename(tno, cno):
    "Build the filename from the team number and the cluster number"
    return "Team" + str(tno) + "Cluster" + str(cno) + ".txt"


path = "Cluster/"
list_p = []
# tno stands for team number
# cno stands for cluster number
for tno in range(1, 3):
    for cno in range(0, 5):
        filepath = path + generateFilename(tno, cno)
        db = loadFile(filepath)
        ps = PrefixSpan(db)
        for x in range(0, 10):
            list_p.append(Pattern(ps.topk(10)[x][0], ps.topk(10)[x][1]))
        # Print the 10 most frequent patterns in the current cluster
        # print(ps.topk(10))

# Sort the patterns by score
list_p = sorted(list_p, key=lambda x: x.score, reverse=True)
print("#######################################")
for x in list_p:
    print("score:", x.score, "freq:", x.freq)
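# The `Pattern` class used above is not included in this fragment. A minimal stand-in
# consistent with how it is used (built from a (frequency, pattern) pair, exposing
# `freq` and `score`); the scoring rule here (frequency weighted by pattern length)
# is an assumption, the original may compute `score` differently:
class Pattern:
    def __init__(self, freq, pattern):
        self.freq = freq
        self.pattern = pattern
        self.score = freq * len(pattern)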