def fit(self, X, y=None):
    import pandas as pd
    from prefixspan import PrefixSpan

    l = list()
    p = PrefixSpan(X)
    for i in range(2, self.__frequent + 1):
        l.extend(p.frequent(i))
    df = pd.DataFrame(columns=["secuencia", "support", "tam"])
    for i, j in enumerate(l):
        df.loc[i] = [j[1], j[0] / len(X), len(j[1])]
    df = df.sort_values("tam", ascending=True)
    df.drop("tam", axis=1, inplace=True)
    df = df[df["support"] >= self.__minSupport]
    df = df.reset_index(drop=True)
    for i in df.iterrows():
        node = self.root
        for pos, j in enumerate(i[1]["secuencia"]):
            if node.existChildren(j):
                node = node.getChildren(j)
                if pos == len(i[1]["secuencia"]) - 1:
                    node.setSupport(i[1]["support"])
            else:
                child = nodo(se=j, su=i[1]["support"])
                node.addChild(j, child)
                node = child
    return self
def sequence_mining(min_support, token):
    current_dir = os.path.dirname(os.path.abspath(__file__))
    data_path = os.path.join(current_dir, 'dataset',
                             'upload_sequence_processed-{}.txt'.format(token))
    db = []
    with open(data_path, 'r') as f:
        file = reader(f, delimiter=' ', quotechar='\r')
        for row in file:
            db.append([int(item) for item in row])
    row_count = len(db)
    # Require an absolute support of at least 2 sequences.
    if min_support * row_count < 2:
        if row_count != 0:
            min_support = 2 / row_count
    ps = PrefixSpan(db)
    all_sequence = ps.frequent(row_count * min_support)
    all_sequence_num = len(all_sequence)
    return all_sequence_num, all_sequence
def score_session(self, items: List[int], items_to_score: List[int],
                  relevant_sessions_indices: Set[int]):
    scores = self.items_to_scores.get(str(items))
    if scores is None:
        self.competence = []
        self.last = len(items)
        self.bcs_sigma = (self.last - 1) / (2 * np.sqrt(2 * np.log(2)))
        if self.bcs_sigma == 0:
            self.bcs_sigma = 0.1
        self.bcs_a = 1 / (self.bcs_sigma * np.sqrt(2 * np.pi))
        total_bcs_weight = sum([self.bcs_weight(i + 1) for i, x in enumerate(items)])
        relevant_sessions = [self.db[i] for i in relevant_sessions_indices]
        for session in relevant_sessions:
            lcs = get_longest_common_subsequence(session, items)
            lcs_indices = get_indices(items, lcs)
            bcs = sum([self.bcs_weight(x) for x in lcs_indices]) / total_bcs_weight
            fes_last = len(session)
            self.lcs_last = get_indices(session, lcs)[-1]
            self.fes_sigma = (fes_last - self.lcs_last) / (2 * np.sqrt(2 * np.log(2)))
            if self.fes_sigma == 0:
                self.fes_sigma = 0.1
            self.fes_a = 1 / (self.fes_sigma * np.sqrt(2 * np.pi))
            cni = session[self.lcs_last:]
            unique_cni = set(cni)
            fes = sum([self.fes_weight(cni.index(x) + 1) for x in unique_cni]) / len(items)
            self.competence.append(
                0 if bcs == 0 or fes == 0 else (bcs * fes) / (1 / 2 * (bcs + fes)))
        # mine patterns
        self.total_weight = sum(self.competence)
        ps = PrefixSpan(relevant_sessions)
        patterns = ps.frequent(self.delta, key=self.pattern_key, bound=self.pattern_key)
        scores = self.score_items(patterns)
        self.items_to_scores.update({str(items): scores})
    predictions = np.zeros(len(items_to_score))
    mask = np.isin(items_to_score, list(scores.keys()))
    scored_items = items_to_score[mask]
    values = [scores[x] for x in scored_items]
    predictions[mask] = values
    return pd.Series(data=predictions, index=items_to_score)
def base_sequence(dataset, num):
    data = []
    for i in range(0, num):
        item = dataset[i].split("\t")[0].lower()
        data.append(item)
    ps_base = PS(data)
    base_sequence = get_longest(ps_base.topk(1000))
    return base_sequence
def mine_string_patterns(doc):
    id, lines = doc
    docs = []
    for i, line in enumerate(lines):
        lr = []
        line = re.sub(r'\d+', '', line)
        toks = line.strip().split(' ')
        for t in toks:
            if t:
                lr.append(t)
        docs.append(lr)

    wordmap = {}  # type: Dict[str, int]  # problematic!
    idx = 0
    for doc in docs:
        for tok in doc:
            if tok not in wordmap:
                wordmap[tok] = idx
                idx += 1

    doc_vecs = []
    for doc in docs:
        doc_vec = []
        for tok in doc:
            doc_vec.append(wordmap[tok])
        doc_vecs.append(doc_vec)

    db = doc_vecs
    ps = PrefixSpan(db)
    invwordmap = invert(wordmap)

    func = ps.frequent
    # lambda function for sorting
    key = None
    # upper bound
    bound = None
    # filter lambda function
    filter = None
    threshold = 2
    closed = True
    generator = False
    ps.minlen = 2
    ps.maxlen = 10

    results = []
    for freq, patt in func(threshold, closed=closed, generator=generator,
                           key=key, bound=bound, filter=filter):
        pattern = ' '.join((invwordmap[i] for i in patt))
        results.append([pattern, freq])
    return id, results
def nlp_FreqSubsequenceMining(classfication, MINSUP=3, CLOSED=False, GENERATOR=True):
    Sequences.clear()
    print("Analyzing %s..." % classfication)
    nlp = spacy.load("en_core_web_sm")

    # Read from raw data file, then convert it to NodeID-sequences.
    file = open("./DATA-Train/DATA-%s" % classfication)
    while 1:
        # Read a conf-id and a description from the file.
        line = file.readline()
        if not line:
            break
        try:
            conf_id, conf_desc_text_raw = line.split('\t')
        except ValueError:
            print(line)
        doc = nlp(conf_desc_text_raw.strip())
        doc = MergeWords(doc)
        Sequences.append([
            MapDict[(  # root.dep_,
                token.pos_,
                MyWordTags(token.text))] for token in doc
            if (token.pos_, MyWordTags(token.text)) != ('PUNCT', 'OTHER')
        ])
    size = len(Sequences)
    mean_len = np.mean([len(s) for s in Sequences])
    print("Config & Desc: %d\nMean length: %.1f" % (size, mean_len))

    # Mining FreqSeqs from those NodeID-sequences.
    # FreqSeqs = ((sup, [seq]), (sup, [seq]), ...)
    FreqSeqs = PrefixSpan(Sequences)
    tmp = FreqSeqs.frequent(int(MINSUP), closed=CLOSED, generator=GENERATOR)
    res = {}
    for FreqSeq in tmp:
        res[tuple(FreqSeq[1])] = FreqSeq[0]
    print("Frequent Sub-sequences: %d\n" % len(res))
    # FreqSeqs with support number
    return res
def generate_rules(changes_sets, threshold):
    ps = PrefixSpan(changes_sets)
    print("Start rule generation")
    freq_seqs = ps.frequent(minsup=threshold, closed=True)
    # Keep only patterns that mix added ("+") and removed ("-") tokens.
    freq_seqs = [
        x for x in freq_seqs
        if any([y.startswith("+") for y in x[1]])
        and any([y.startswith("-") for y in x[1]])
    ]
    freq_seqs = sorted(freq_seqs, reverse=True)
    return freq_seqs
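# Minimal usage sketch for generate_rules() above; the toy change sets and the
# threshold of 2 are invented for illustration only.
toy_changes_sets = [
    ["-foo", "+bar", "+baz"],
    ["-foo", "+bar"],
    ["-qux", "+bar", "-foo"],
]
# Returns (support, pattern) pairs that contain at least one "+" and one "-" token,
# sorted by support in descending order.
print(generate_rules(toy_changes_sets, threshold=2))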
def sequence_mining(min_support):
    current_dir = os.path.dirname(os.path.abspath(__file__))
    data_path = os.path.join(current_dir, 'dataset', 'upload_sequence_processed.txt')
    db = []
    with open(data_path, 'r') as f:
        file = reader(f, delimiter=' ', quotechar='\r')
        for row in file:
            db.append(row)
    row_count = len(db)
    ps = PrefixSpan(db)
    all_sequence = ps.frequent(row_count * min_support)
    all_sequence_len = len(all_sequence)
    return all_sequence_len, all_sequence
def mine_frequent_span(log):
    input = []
    different_events = set()
    for trace in log:
        trace_events = []
        for event in trace:
            event_attribs = event.get_attributes()
            event_name = str(event_attribs["concept:name"])
            if "lifecycle:transition" in event_attribs:
                event_name += "-" + str(event_attribs["lifecycle:transition"])
            trace_events.append(event_name)
            different_events.add(event_name)
        input.append(trace_events)

    # Encode input
    encoding = {}
    decoding = {}
    for i, event in enumerate(different_events):
        encoding[event] = i
        decoding[i] = event

    # Encode traces
    minimum_size = 5
    encoded = [[encoding[event] for event in sublist] for sublist in input]
    ps = PrefixSpan(encoded)
    outputs = ps.topk(10000)
    decoded_output = list(
        reversed(
            sorted([(sublist[0], [decoding[output] for output in sublist[1]])
                    for sublist in outputs],
                   key=lambda x: x[0])))
    # print(decoded_output)
    to_file = "\n".join(map(str, decoded_output))
    with open("frequent_subs.txt", "w") as f:
        f.write(to_file)
def find_clusters_names(labels, features):
    groups = [[] for i in range(0, max(labels) + 1)]
    for i in range(0, max(labels) + 1):
        groups[i] = features[features['labels'] == i].index
        groups[i] = groups[i].tolist()
    for group in groups:
        for i in range(0, len(group)):
            group[i] = group[i].split("::")
            group[i] = group[i] + group[i][len(group[i]) - 1].split(" ")
    res = []
    for group in groups:
        prefix = PrefixSpan(group)
        prefix.maxlen = 4
        prefix.minlen = 4
        res.append(prefix.topk(5, filter=lambda patt, matches: diversity_score(patt) >= len(patt)))
    return [create_str(res[i][0][1]) for i in range(0, len(res))]
def find_patterns(self):
    print(self.sampling_type)
    db = self.data
    ps = PrefixSpan(db)
    n_items = len(db)
    result = None
    opts = {
        "closed": self.closed,
        # Somehow does not work
        # "generator": self.generator
    }
    from pprint import pprint
    pprint(opts)
    if self.sampling_type:
        result = ps.topk(self.k, **opts)
    else:
        print("Support value:", self.min_support)
        print("Size:", n_items)
        print("Support:", n_items * self.min_support / 100)
        result = ps.frequent((self.min_support * n_items / 100.0), **opts)

    self.table.model().clear()
    model = QStandardItemModel(self.table)
    model.clear()
    for col, label in enumerate(["Support", "Pattern"]):
        item = QStandardItem(label)
        model.setHorizontalHeaderItem(col, item)
    sequences = []
    for support, pattern in result:
        if len(pattern) < self.min_len:
            continue
        support /= n_items
        sequences.append((support, pattern))
        sitem = self.NumericItem(support)
        pitem = QStandardItem(str(pattern))
        model.appendRow([sitem, pitem])
    self.Outputs.object.send(sequences)
    self.table.setModel(model)
def compute_prefix_span(self):
    r'''Accepts a list of lists representing sequences and a minimum support,
    and returns the output of the PrefixSpan algorithm.

    Parameters
    ----------
    database: (list of lists)
        The "database" (list) of sequences.
    min_support: (int)
        The minimum relative support for PrefixSpan.

    Returns
    -------
    prefix_span: (list of tuples)
        Output of PrefixSpan.frequent. List of tuples of the form
        (frequency, sequence), where sequence is a list representing the
        sequence from the database.
    '''
    ps = PrefixSpan(self.database)
    prefix_span = ps.frequent(self.min_support)
    return prefix_span
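# Standalone illustration of the call made in compute_prefix_span() above; the sequence
# database and the minimum support value are made up for this example.
from prefixspan import PrefixSpan

example_database = [[1, 2, 3], [1, 3, 2], [2, 3, 1]]
example_min_support = 2
ps_example = PrefixSpan(example_database)
# Each result is a (frequency, sequence) tuple, e.g. (3, [3]) because [3] occurs in all
# three sequences.
print(ps_example.frequent(example_min_support))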
def one_stacking_period():
    dict = {}
    with open(FILE) as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(spamreader, None)
        curr_usr = "******"
        temp = []
        curr_time = 0
        aversion = "c"
        for row in spamreader:
            if not curr_usr == row[2]:
                curr_usr = row[2]
                dict[curr_usr] = temp
                temp = []
            if row[2] == "":
                continue
            curr_time += int(row[3])
            if row[6] == "0":
                if aversion == "c":
                    temp.append(aversion)
                    aversion = "a"
                temp.append(aversion)
                aversion = "c"
                curr_time = 0
            if curr_time > PERIOD * 1000:
                temp.append(aversion)
                curr_time = curr_time - (PERIOD * 1000)
                aversion = "c"
    for i in list(dict.values()):
        print(" -1 ".join(i) + " -2")
    # print(dict.values())
    ps = PrefixSpan(list(dict.values()))
    print("one stacking period \n\n")
    ps.minlen = 3
    ps.maxlen = 8
    for i in ps.topk(20):
        print(i)
    print("\n")
    for i in ps.topk(20, closed=True):
        print(i)
    print("\n")
    for i in ps.topk(20, generator=True):
        print(i)
    print("\n")
    # for i in ps.frequent(2):
    #     print(i)
    print("\n\n\n")
def spatronesintax(libro, posv=True):
    # Start: load the text column.
    df = libro
    # df = pd.read_excel('../Visualization/Relatos_Benvenutto.xlsx')
    tes = df['Texto'].tolist()
    for i in range(len(tes)):
        tes[i] = tes[i] + '.\n'
    tes = ''.join(tes)
    # o = re.sub('…|[.]{3}', '.', tes)
    o = re.sub('[“]+|[”]+|["]+', '', tes)
    listaprueba = sent_tokenize(o)
    listapos = []
    for i in listaprueba:
        i = i.strip()
        doc = nlp(i)
        listapos.append(doc)
    oye = separo(listapos, posv)
    listanum, pola = labeltonum(oye)
    # dfl = pd.DataFrame(listalen)
    # dfl['ok'] = listanum
    ps = PrefixSpan(listanum)
    lista = ps.frequent(int(len(oye) * 0.5))
    lista2 = []
    for i in lista:
        if len(i[1]) > 5:
            lista2.append(i)
    df2 = correr(lista2, listanum)
    listatrans = []
    for i in df2['indis']:
        listaux2 = []
        for j in i:
            listaux2.append(pola[j])
        listatrans.append(listaux2)
    df2['transformer'] = listatrans
    df2.to_excel('pospattern.xlsx', index=False)
def raw_data():
    dict = {}
    with open(FILE) as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(spamreader, None)
        curr_usr = "******"
        temp = []
        for row in spamreader:
            if not curr_usr == row[2]:
                curr_usr = row[2]
                dict[curr_usr] = temp
                temp = []
            if row[2] == "":
                continue
            if row[6] == "0":
                temp.append("a")
            else:
                temp.append("c")
    for i in list(dict.values()):
        print(" -1 ".join(i) + " -2")
    # print(dict.values())
    ps = PrefixSpan(list(dict.values()))
    print("raw data \n\n")
    ps.minlen = 3
    ps.maxlen = 8
    for i in ps.topk(20):
        print(i)
    print("\n")
    for i in ps.topk(20, closed=True):
        print(i)
    print("\n")
    for i in ps.topk(20, generator=True):
        print(i)
    print("\n")
    # for i in ps.frequent(2):
    #     print(i)
    print("\n\n\n")
NUM_EXAMPLES = 200
MIN_FREQ = 25
MIN_LEN = 5
MIN_DIST = 3

data_generator = ExamplesGenerator(seq_len=SEQ_LEN, vocab_size=VOCAB_SIZE, seed=111,
                                   multiple_patterns=multiple_patterns)
data_sequences = [next(data_generator()) for _ in range(NUM_EXAMPLES)]
positive_sequences = [s[0] for s in data_sequences if s[1] == 1]
negative_sequences = [s[0] for s in data_sequences if s[1] == 0]

positive_seq = PrefixSpan(positive_sequences).frequent(MIN_FREQ)
long_seq = [s for s in positive_seq if len(s[1]) >= MIN_LEN]
seq_by_freq = sorted(long_seq, key=lambda x: x[0], reverse=True)


def distance_from_seqs(s, s_list: list):
    """Return the distance (number of differing tokens) between the sequence s
    and the list of sequences s_list."""
    if not s_list:
        s_list = [[]]
    dist_per_seq = [len(set(s) - set(s2)) for s2 in s_list]
    return min(dist_per_seq)


most_freq_seq = []
for s in seq_by_freq:
#! python3
# -*- coding:utf-8 -*-
__author__ = "yoyo"

from prefixspan import PrefixSpan as PS
import os.path as path

data_dir = "./dataset/vocabulary/"
filename = "GRE_pure.txt"

if __name__ == "__main__":
    filepath = path.join(data_dir, filename)
    f = open(filepath)
    vocabulary = f.read()
    vocabulary = vocabulary.split("\n")
    f.close()
    ps = PS(vocabulary)
    for sequence in ps.frequent(3):
        if len(sequence[1]) >= 4:
            print(sequence)
import pickle

from prefixspan import PrefixSpan

with open("../data/objects/paths", "rb") as f:
    paths = pickle.load(f)

ps = PrefixSpan(paths)
freqs = ps.frequent(2)

with open("../data/objects/freqs", "wb") as f:
    pickle.dump(freqs, f)
class PrefixSpanManager:
    """
    Helper class around PrefixSpan.

    Parameters:
        * sax_engine: SaxEngine
            SAX preprocessing instance
        * export: Boolean
            Whether the data has already been exported to the expected format

    Variables:
        * se_instance: SaxEngine
            The SAX class instance
        * data: Array[]
            The data in SAX format
    """
    def __init__(self, sax_engine, export=True):
        self.se_instance = sax_engine
        self.data = sax_engine.sax_data
        self.process_data = []
        self.ps = None
        self.ploter = Plot(self)
        if export:
            self.export_format()

    def run(self):
        """
        Create the PrefixSpan instance from the preprocessed data
        """
        self.ps = PrefixSpan(self.process_data)

    def export_format(self):
        """
        Reshape the data to match what the PrefixSpan instance expects
        """
        tmp = []
        for elmt in self.data:
            tmp.append(elmt.ravel())
        self.process_data = tmp

    def topk(self, n, c=True):
        """
        Return the most frequent patterns (highest support), closed ones by default

        Parameters:
            * n: int
                Number of patterns to return
        Returns:
            List of frequent patterns
        """
        return self.ps.topk(n, closed=c)

    def frequent(self, n):
        """
        Return the patterns with minimal support n

        Parameters:
            * n: int
                Minimal support
        Returns:
            List of patterns with minimal support n
        """
        return self.ps.frequent(n)

    def plot(self, l):
        self.ploter.plot_prefixspan(l)
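# Hypothetical usage of PrefixSpanManager above. SaxEngine and Plot come from the
# surrounding project, so this only sketches the intended call order.
#
#   engine = SaxEngine(...)              # produces engine.sax_data
#   manager = PrefixSpanManager(engine)  # export_format() runs automatically
#   manager.run()                        # builds the underlying PrefixSpan instance
#   print(manager.topk(10))              # 10 most frequent closed patterns
#   print(manager.frequent(5))           # patterns with support >= 5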
def main():
    dblp_data = pd.read_csv(r'DBLP_Dataset.csv', encoding="ISO-8859-1")
    author_title = dblp_data
    dataset = author_title.to_numpy()
    list1 = dataset[:, 2].tolist()

    # convert authors to lower case
    list2 = []
    for i in list1:
        sublist = i.lower().split()
        list2.append(sublist)

    te = TransactionEncoder()
    te_ary = te.fit(list2).transform(list2)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent = fpgrowth(df, min_support=0.001, use_colnames=True)
    frequent = frequent[frequent['itemsets'].str.len() > 1]

    freqauth_list = []
    for i in frequent['itemsets']:
        freqauth_list.append([x for x in i])

    freqauth_dict = {}
    for i in freqauth_list:
        title_idx_sublist = []
        for idx, j in enumerate(list2):
            if set(i).issubset(j):
                title_idx_sublist.append(idx)
        freqauth_dict.update({tuple(i): title_idx_sublist})

    freqauth_title_dict = {}
    kstem = ks.PyKrovetzStemmer()
    for key, value in freqauth_dict.items():
        title_df = author_title.iloc[value]['title']
        title_sublist = list(title_df)
        title_sublists = []
        temp_list = []
        for x in title_sublist:
            tempx = re.sub(r'[.]', '', x)
            temp_list = re.sub(r'[^\x00-\x7F]+', '', tempx).lower().split()
            temp_list2 = []
            if isinstance(temp_list, list):
                temp_list2.append([kstem.stem(z) for z in temp_list if not z in stopwordlist])
                title_sublists.extend(temp_list2)
            else:
                if not temp_list in stopwordlist:
                    title_sublists.extend([kstem.stem(temp_list)])
        freqauth_title_dict.update({key: title_sublists})

    # Closed / Top k titles of frequent authors
    freqauth_title_dict_closed = {}
    for k, v in freqauth_title_dict.items():
        ps = PrefixSpan(v)
        closed_Seq_pattern = ps.topk(5, closed=True)
        freqauth_title_dict_closed.update({k: closed_Seq_pattern})

    # To get frequent authors' context indicators
    frequentlist = freqauth_list
    cleanedList = list2
    new_author_list = []
    for i in range(0, len(frequentlist)):
        temp_author_list = []
        authorlist = list(frequentlist[i])
        found = 0
        for k in range(0, len(cleanedList)):
            for j in range(0, len(authorlist)):
                if (authorlist[j] in (cleanedList[k])):
                    found = 1
                else:
                    found = 0
                    break
            if found == 1:
                for jj in range(0, len(authorlist)):
                    if (authorlist[jj] in (cleanedList[k])):
                        cleanedList[k].remove(authorlist[jj])
                temp_author_list.append(cleanedList[k])
        new_author_list.append(temp_author_list)

    context_indicator_list = []
    for i in range(0, len(new_author_list)):
        te = TransactionEncoder()
        te_ary = te.fit(new_author_list[i]).transform(new_author_list[i])
        df = pd.DataFrame(te_ary, columns=te.columns_)
        frequent_author_list = fpgrowth(df, min_support=0.5, use_colnames=True)
        supp = frequent_author_list.support.unique()  # all unique support counts

        # Dictionary storing itemsets with the same support count as key
        freq_dic = {}
        for i in range(len(supp)):
            inset = list(frequent_author_list.loc[frequent_author_list.support == supp[i]]['itemsets'])
            freq_dic[supp[i]] = inset

        # Dictionary storing itemsets with support count <= key
        freq_dic2 = {}
        for i in range(len(supp)):
            inset2 = list(frequent_author_list.loc[frequent_author_list.support <= supp[i]]['itemsets'])
            freq_dic2[supp[i]] = inset2

        # Find closed frequent itemsets
        close_freq = []
        for index, row in frequent_author_list.iterrows():
            isclose = True
            cli = row['itemsets']
            cls = row['support']
            checkset = freq_dic[cls]
            for i in checkset:
                if (cli != i):
                    if (frozenset.issubset(cli, i)):
                        isclose = False
                        break
            if (isclose):
                close_freq.append([x for x in (row['itemsets'])])
        context_indicator_list.append(close_freq)

    freqauth_context_ind_dict = {}
    for authpair, titlelist in freqauth_title_dict_closed.items():
        cleantitlelist = []
        for i in titlelist:
            if isinstance(i, tuple):
                if isinstance(i[1], list):
                    listtostring = ' '.join(i[1])
                    cleantitlelist.append(listtostring)
        freqauth_context_ind_dict.update({authpair: cleantitlelist})

    # Merge both titles and context-indicator authors for frequent pattern authors
    for idx, key in enumerate(freqauth_context_ind_dict):
        newval = []
        if len(context_indicator_list[idx]) > 0:
            for i in context_indicator_list[idx]:
                if len(i) > 0:
                    tempstr = '&'.join(i)
                    newval = freqauth_context_ind_dict[key]
                    newval.append(tempstr)
            freqauth_context_ind_dict.update({key: newval})

    # Context Indicator Weighting
    CI_list = list(freqauth_context_ind_dict.values())
    freqauth_context_in_weights = {}
    for key, value in freqauth_context_ind_dict.items():
        freq_auth_CI_list = value
        length_of_CI = len(value)
        temp_dict = {}
        for i in freq_auth_CI_list:
            count_tmp = 0
            for j in CI_list:
                if (i in (j)):
                    count_tmp += 1
            weight = round(1 - ((count_tmp - 1) / count_tmp), 2)
            if (weight > 0.1):
                temp_dict.update({i: weight})
        sorted_weights_dict = sorted(temp_dict.items(), key=lambda x: x[1], reverse=True)
        freqauth_context_in_weights.update({key: sorted_weights_dict})

    freq_auth_transactions = {}
    list_of_freq_auth = list(freqauth_context_in_weights.keys())
    for i in range(0, len(freqauth_title_dict)):
        temp_dict = {}
        title_list = freqauth_title_dict.get(list_of_freq_auth[i])
        CI_list = freqauth_context_in_weights[list_of_freq_auth[i]]
        CI_list_auth = []
        for n, c in enumerate(CI_list):
            CI_list_auth.append(c[0])
        for j in range(0, len(title_list)):
            cos_sim = cos_similarity(CI_list_auth, title_list[j])
            cos_sim = round(cos_sim, 3)
            t_title = ' '.join(freqauth_title_dict[list_of_freq_auth[i]][j])
            temp_dict.update({t_title: cos_sim})
        sorted_title_dict = sorted(temp_dict.items(), key=lambda x: x[1], reverse=True)
        t_len = len(list(temp_dict.values()))
        max_len = t_len
        if (t_len > 4):
            max_len = 4
        sorted_title_dict1 = dict(list(sorted_title_dict)[0:max_len])
        freq_auth_transactions.update({list_of_freq_auth[i]: sorted_title_dict1})

    # To find the strongest SSP - match against the similarity of the context units
    freq_auth_SSPs = {}
    list_of_freq_auth = list(freqauth_context_ind_dict.keys())
    list_of_freq_auth_CI = list(freqauth_context_ind_dict.values())
    len_list_of_freq_auth_CI = len(list_of_freq_auth_CI)
    context_indicator_similarity = np.zeros(
        [len_list_of_freq_auth_CI, len_list_of_freq_auth_CI], dtype=float)
    for i in range(0, len_list_of_freq_auth_CI):
        for j in range(0, len_list_of_freq_auth_CI):
            cos_sim = cos_similarity(list_of_freq_auth_CI[i], list_of_freq_auth_CI[j])
            cos_sim = round(cos_sim, 3)
            if (i != j):
                context_indicator_similarity[i][j] = cos_sim
                context_indicator_similarity[j][i] = cos_sim

    context_indicator_similarity_idx = np.zeros([len_list_of_freq_auth_CI, 3], dtype=int)
    for i in range(0, len(context_indicator_similarity)):
        context_indicator_similarity_idx[i] = np.argsort(context_indicator_similarity[i])[-3:]

    SSP_Author_List = []
    for i in range(0, len(list_of_freq_auth)):
        temp_author_list_ssp = []
        for j in range(0, len(context_indicator_similarity_idx[i])):
            temp_author_list_ssp.append(list_of_freq_auth[context_indicator_similarity_idx[i][j]])
        SSP_Author_List.append(temp_author_list_ssp)

    SSP_Title_List = []
    CI_list_title = list(freqauth_title_dict_closed.values())
    CI_list1 = []
    for i in (CI_list_title):
        temp_list3 = []
        for j in i:
            CI_str = ' '.join(j[1])
            temp_list3.append(CI_str)
        CI_list1.append(list(set(temp_list3)))
    for i in range(0, len(CI_list1)):
        temp_title_list_ssp = []
        for j in range(0, len(context_indicator_similarity_idx[i])):
            ssp_str = CI_list1[context_indicator_similarity_idx[i][j]]
            temp_title_list_ssp.extend(ssp_str)
        SSP_Title_List.append(list(set(temp_title_list_ssp)))

    # Write the output to a CSV file:
    #   a) list_of_freq_auth
    #   b) list_of_freq_auth_CI / freqauth_context_in_weights
    #   c) freq_auth_transactions
    #   d) SSP_Author_List
    #   e) SSP_Title_List
    titles_list_with_weight = list(freq_auth_transactions.values())

    # Joining SSP authors
    SSP_authors_formatted = []
    for i in range(0, len(SSP_Author_List)):
        temp_list = []
        for j in range(0, len(SSP_Author_List[i])):
            authors = '&'.join(list(SSP_Author_List[i][j]))
            temp_list.append(authors)
        SSP_authors_formatted.append(temp_list)

    with open("./output.txt", 'w', encoding="utf-8") as f:
        f.write('Pattern' + '||' + 'Context Indicator' + '||' + 'Transaction 1' + '||' +
                'Transaction 2' + '||' + 'Transaction 3' + '||' + 'Transaction 4' + '||' +
                'SSP - Co-Author' + '||' + 'SSP - Title' + '\n')
        for i in range(0, len(list_of_freq_auth)):
            authors = ' '.join(list(list_of_freq_auth[i]))
            f.write(authors + '||')
            Context_indicators = '; '.join(list_of_freq_auth_CI[i])
            f.write(Context_indicators + '||')
            for j in (titles_list_with_weight[i].keys()):
                f.write(j + '||')
            ssp_authors = '; '.join(SSP_authors_formatted[i])
            f.write(ssp_authors + '||')
            ssp_titles = '; '.join(SSP_Title_List[i])
            f.write(ssp_titles)
            f.write('\n')
def apply(grouped_stream, all_labels, parameters=None):
    """
    Applies the PrefixSpan algorithm

    Parameters
    -------------
    grouped_stream
        Grouped stream
    all_labels
        Indexed labels
    parameters
        All the parameters of the algorithm

    Returns
    --------------
    frequents
        List containing frequent itemsets as label indexes
    frequents_label
        List containing frequent itemsets as labels
    frequents_encodings
        List containing frequent itemsets as word encodings
    frequents_occurrences
        List containing all the sequences of events associated to the corresponding itemset
    """
    if parameters is None:
        parameters = {}
    final_label_idx = parameters[FINAL_LABEL_IDX] if FINAL_LABEL_IDX in parameters else DEFAULT_FINAL_LABEL_IDX
    m = parameters[M] if M in parameters else DEFAULT_M

    data = [[y[final_label_idx] for y in x] for x in grouped_stream]
    ps = PrefixSpan(data)
    frequents = [x[1] for x in ps.frequent(m)]
    frequents_label = [" ".join([all_labels[y] for y in x]) for x in frequents]

    F = tempfile.NamedTemporaryFile(suffix='.txt')
    F.close()
    F2 = open(F.name, "w")
    for label in frequents_label:
        F2.write(label + "\n")
    F2.close()
    model = fasttext.train_unsupervised(F.name)

    frequents_encodings = []
    for i in range(len(frequents)):
        phrase = [x for x in frequents_label[i].split() if x in model.words]
        v = None
        for w in phrase:
            if v is None:
                v = model.get_word_vector(w)
            else:
                v = v + model.get_word_vector(w)
        frequents_encodings.append(v)

    frequents_occurrences = []
    for f in frequents:
        frequents_occurrences.append([])
        for g in grouped_stream:
            d = [x[final_label_idx] for x in g]
            # + 1 so that matches ending at the last event are also checked
            for i in range(len(d) - len(f) + 1):
                if d[i] == f[0] and d[i + len(f) - 1] == f[len(f) - 1]:
                    if d[i:i + len(f)] == f:
                        frequents_occurrences[-1].append(g[i:i + len(f)])

    return frequents, frequents_label, frequents_encodings, frequents_occurrences
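# Hypothetical illustration of the PrefixSpan step inside apply() above: each event is
# assumed to be a tuple whose final_label_idx-th entry is its label index, and the
# fasttext encoding / occurrence collection parts are omitted. The data layout is made up.
from prefixspan import PrefixSpan

toy_grouped_stream = [
    [(0, 5), (0, 7), (0, 5)],   # one group of events: (case id, label index)
    [(1, 5), (1, 7)],
]
toy_final_label_idx = 1
toy_data = [[y[toy_final_label_idx] for y in x] for x in toy_grouped_stream]
toy_frequents = [x[1] for x in PrefixSpan(toy_data).frequent(2)]
print(toy_frequents)  # e.g. [[5], [5, 7], [7]]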
OUTPUT_JSON_NAME = "data/rules/" + owner + "_" + repo + "_" + lang + ".json"

with open(INPUT_JSON_NAME, mode='r', encoding='utf-8') as f:
    changes_sets = load(f)

changes = [x["changes_set"] for x in changes_sets]
new_changes = []
for tokens in changes:
    new_tokens = [
        x for x in tokens
        if not x.endswith("\n") and not x.endswith(" ")
    ]
    if new_tokens != [] and new_tokens not in new_changes:
        new_changes.append(new_tokens)

print("Start rule generation")
ps = PrefixSpan(new_changes)
freq_seqs = ps.frequent(minsup=int(len(new_changes) * 0.1), closed=True)
# freq_seqs = PrefixSpan_frequent(
#     ps, minsup=int(len(new_changes) * 0.1), closed=True)
freq_seqs = [
    x for x in freq_seqs
    if any([y.startswith("+") for y in x[1]])
    and any([y.startswith("-") for y in x[1]])
]
freq_seqs = sorted(freq_seqs, reverse=True)

with open(OUTPUT_JSON_NAME, mode='w', encoding='utf-8') as f:
    dump(freq_seqs, f, indent=1)
def get_common(sequence, data):
    data = data.split("\t")[0].lower()
    data = [sequence, data]
    ps = PS(data)
    common = get_longest(ps.topk(1000))
    return common
def recommend(trainingset=l_good, s_group=s_good, student=s_good[0], path_length=9, rl=resourcelist):
    # Give this student's learning log more influence by repeating it.
    for i in range(30):
        trainingset.append(trainingset[s_group.index(student)])
    ps = PrefixSpan(trainingset)
    # pattern length should be bigger than 1
    pattern = ps.topk(1000, filter=lambda patt, matches: len(patt) > 1)
    pattern_time = {}  # stores every length-2 transition with its accumulated support
    for i, element in enumerate(pattern):
        l_s = []  # transitions contained in this pattern
        s = ""
        for i in range(len(element[1])):
            if i == 0:
                s = str(element[1][i])
            else:
                l_s.append(s + "," + str(element[1][i]))
                s = str(element[1][i])
        for j in l_s:
            if j in pattern_time.keys():
                pattern_time[j] += element[0]
            else:
                pattern_time[j] = element[0]
    # order transitions by accumulated support
    pattern_time = sorted(pattern_time.items(), key=lambda pattern_time: pattern_time[1], reverse=True)
    print("pattern with time:", pattern_time)

    """
    Deduplication. We can't delete items of a list inside the for loop
    (it would raise an 'index out of range' problem), so we store the
    repeated entries and delete them afterwards.
    """
    delete_indice = []
    for k1 in range(len(pattern_time)):
        starter = pattern_time[k1][0].split(",")[0]
        ender = pattern_time[k1][0].split(",")[1]
        if starter == ender:
            delete_indice.append(k1)
        if pattern_time[k1] == pattern_time[-1]:
            break
        for k2 in range(k1 + 1, len(pattern_time)):
            temps_start = pattern_time[k2][0].split(",")[0]
            temps_end = pattern_time[k2][0].split(",")[1]
            if starter == temps_start:
                delete_indice.append(pattern_time[k2])
            if ender == pattern_time[k2][0].split(",")[1]:
                delete_indice.append(pattern_time[k2])
    for i in set(delete_indice):
        if i in pattern_time:
            pattern_time.remove(i)

    """
    Organise the path from the pattern list: first find the head,
    then finish the path.
    """
    element = []
    pattern_result = [x[0] for x in pattern_time]  # drop support counts, keep patterns
    store = []
    for i in range(len(pattern_result)):
        for j in range(len(pattern_result)):
            if i == j:
                continue
            if pattern_result[i].split(",")[0] in pattern_result[j]:
                store.append(pattern_result[i])
    path = list(set(pattern_result).difference(set(store)))[0]
    print("begin_node of path:", path)

    compt = 0
    c_b = 0
    l_change = 2
    while compt < path_length - 2:  # the first node has two elements, so add path_length - 2 more
        c_b += 1
        for i in pattern_result:
            if i.split(",")[0] == path.split(",")[-1]:
                path += "," + i.split(",")[-1]
                compt += 1
        if l_change == len(path):
            c_b += 1
        else:
            l_change = len(path)
        if c_b > 100000:
            break
    print("path:", path)
    return path
    discrete_time = []
    with open(filename, "r", encoding="utf-8") as weights_file:
        print(f"Reading file {filename}")
        for i, weights_triple in enumerate(weights_file):
            current_weights = weights_triple.replace(",", ".").split("\t")
            weights.append(int(current_weights[1]))
            discrete_time_base = int(current_weights[0].strip())
            discrete_time.append(discrete_time_base)
            curr_frequency = int(current_weights[3].strip())
            frequency.append(curr_frequency)
            for k in range(0, curr_frequency):
                weights.append(int(current_weights[1]))
                discrete_time_base += 1
                discrete_time.append(discrete_time_base)
            if limit is not None and (i == limit or discrete_time_base >= limit):
                print("Limit reached")
                break
    return discrete_time, weights


if __name__ == '__main__':
    basedir = "C:/Users/havar/Home/cache_simulation_results/"
    _t, _w = _read_db(basedir + "scaled_w_01.csv")
    data = list(chunks(_w, 1000))
    ps = PrefixSpan(data)
    ps.minlen = 5
    ps.maxlen = 100
    print(ps.frequent(5, closed=True))
from prefixspan import PrefixSpan

db = [
    [0, 1, 2, 3, 4],
    [1, 1, 1, 3, 4],
    [2, 1, 2, 2, 0],
    [1, 1, 1, 2, 2],
]

ps = PrefixSpan(db)
print(ps.frequent(2))
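# Follow-up to the snippet above: the same PrefixSpan object can also return the k most
# frequent patterns via topk(). Results are (support, pattern) pairs, e.g. (4, [1])
# because [1] occurs in all four sequences.
print(ps.topk(5))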
def aversion_direction_one_stacking_period():
    dict = {}
    with open(FILE) as csvfile:
        spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(spamreader, None)
        curr_usr = "******"
        avg = []
        curr_time = 0
        aversion = [0.0, 0.0, "1"]
        temp = []
        for row in spamreader:
            if not curr_usr == row[2]:
                mean = np.average(avg, axis=0)
                t = []
                for i in temp:
                    res = "c"
                    if i[2] == "0":
                        diff = [np.abs(a - b) for a, b in zip(i[0:2], mean)]
                        # if np.abs((diff[0] / mean[0]) - (diff[1] / mean[1])) < treshold:
                        if (np.abs(((diff[0] + mean[0]) / mean[0]) - ((diff[1] + mean[1]) / mean[1])) < treshold) or (
                                ((diff[0] + mean[0]) / mean[0]) > treshold2 and
                                ((diff[1] + mean[1]) / mean[1]) > treshold2):
                            res = "f"
                        elif diff[0] - diff[1] > 0:
                            if i[0] < mean[0]:
                                res = "l"
                            if i[0] > mean[0]:
                                res = "r"
                        else:
                            if i[1] < mean[1]:
                                res = "u"
                            if i[1] > mean[1]:
                                res = "d"
                    t.append(res)
                dict[curr_usr] = t
                curr_usr = row[2]
                temp = []
                avg = []
            if row[2] == "":
                continue
            if row[6] == "1":
                avg.append([
                    float(row[4].replace(",", ".")),
                    float(row[5].replace(",", "."))
                ])
            curr_time += int(row[3])
            if row[6] == "0":
                if aversion[2] == "1":
                    temp.append(aversion)
                    aversion = [
                        float(row[4].replace(",", ".")),
                        float(row[5].replace(",", ".")),
                        row[6]
                    ]
                temp.append(aversion)
                aversion = [0.0, 0.0, "1"]
                curr_time = 0
            if curr_time > PERIOD * 1000:
                temp.append(aversion)
                curr_time = curr_time - (PERIOD * 1000)
                aversion = [0.0, 0.0, "1"]
    for i in list(dict.values()):
        print(" -1 ".join(i) + " -2")
    # print(dict.values())
    ps = PrefixSpan(list(dict.values()))
    print("aversion direction one stacking period \n\n")
    ps.minlen = 3
    ps.maxlen = 8
    for i in ps.topk(20):
        print(i)
    print("\n")
    for i in ps.topk(20, closed=True):
        print(i)
    print("\n")
    for i in ps.topk(20, generator=True):
        print(i)
    print("\n")
    # for i in ps.frequent(2):
    #     print(i)
    print("\n\n\n")
    while line:
        db.append(eval(line))
        line = file.readline()
    return db


def generateFilename(tno, cno):
    "Build the file name from the team number and the cluster number"
    return "Team" + str(tno) + "Cluster" + str(cno) + ".txt"


path = "Cluster/"
list_p = []
# tno stands for team number
# cno stands for cluster number
for tno in range(1, 3):
    for cno in range(0, 5):
        filepath = path + generateFilename(tno, cno)
        db = loadFile(filepath)
        ps = PrefixSpan(db)
        for x in range(0, 10):
            list_p.append(Pattern(ps.topk(10)[x][0], ps.topk(10)[x][1]))
        # Print the 10 most frequent patterns in the current cluster
        # print(ps.topk(10))

# Sort by score
list_p = sorted(list_p, key=lambda x: x.score, reverse=True)
print("#######################################")
for x in list_p:
    print("score:", x.score, "freq:", x.freq)
def preprocess_dataset(path_):
    # Data extraction -----------------------------------------------------------------------------
    path = path_
    date_column = 'Date-heure UTC (événement)'
    action_column = 'Pages'
    identity_col = 'Visiteurs uniques ID'
    dataset = pd.read_csv(path, sep=';', parse_dates=[date_column])
    dataset[date_column] = pd.to_datetime(dataset[date_column], errors='coerce')
    dataset = dataset.dropna(subset=[date_column])
    dataset = dataset[dataset[action_column] != '-']
    dataset.index = dataset[date_column]
    dataset.drop(columns=date_column, inplace=True)
    dataset.sort_index(ascending=True, inplace=True)

    # 'particulier::compte::compte-conseil univers de besoin'
    valeurs_interdites = [
        'particulier::acces-CR::acces-CR-store locator trouver ma CR 50',
        'particulier::particulier-accueil particuliers et BP'
    ]
    dataset_after = dataset[~dataset[action_column].isin(valeurs_interdites)]
    person_list = dataset_after[identity_col].unique()
    print("The number of recorded visitors is: " + str(len(person_list)))
    # liste_pages_dataset = dataset[action_column].unique().tolist()
    # Comparison of the number of visitors before/after: 77406 - 24682 = 52724 people
    # 52724 people only visit the pages listed in valeurs_interdites...

    List_actions = []
    # start = time.time()
    # The authorized inactivity time defines the maximal inactivity before we consider
    # that the client opened two distinct sessions.
    authorized_inactivity_time = datetime.timedelta(minutes=30)
    for i in range(0, len(person_list)):
        personne = person_list[i]
        subdata = dataset_after[dataset_after[identity_col] == personne]
        start = 0
        for j in range(0, len(subdata.index) - 1):
            duree = subdata.index[j + 1] - subdata.index[j]
            if duree > authorized_inactivity_time:
                actions = subdata[action_column].iloc[start:j + 1].tolist()
                start = j + 1
                List_actions.append(actions)
        actions = subdata[action_column].iloc[start:len(subdata.index)].tolist()
        List_actions.append(actions)

    # First PrefixSpan pass to find the relevant journeys -----------------------------------------
    # (the prefixspan package exposes the minlen/maxlen attributes)
    first_search = PrefixSpan(List_actions)
    first_search.minlen = 2
    first_search.maxlen = 7
    results_search1 = first_search.frequent(
        15, filter=lambda patt, matches: diversity_score(patt) >= len(patt))
    results_search1.sort(key=lambda x: -x[0])

    # Second pass to get the list of length-2 transitions and their counts ------------------------
    second_search = PrefixSpan(List_actions)
    second_search.minlen = 2
    second_search.maxlen = 2
    filter_list = compute_transitions_list(results_search1)
    results_search2 = second_search.frequent(
        5, filter=lambda patt, matches: patt in filter_list)
    results_search2.sort(key=lambda x: -x[0])

    # Sankey diagram -------------------------------------------------------------------------------
    liste_resfinal = results_search2
    labels = []
    sources = []
    targets = []
    values = []
    links = []
    # A link only appears in the graph if it constitutes more than rate% of the incoming/outgoing
    # traffic of the two nodes involved in the link.
    rate = 0.11
    for match in liste_resfinal:
        if len(match[1]) == second_search.minlen:
            pattern = match[1]
        else:
            pattern = match[1][len(match[1]) - 2:len(match[1])]
        for label in pattern:
            renamed = rename(label)
            if renamed not in labels:
                labels.append(renamed)
            if match[1].index(label) < len(match[1]) - 1:
                targetted = rename(match[1][match[1].index(label) + 1])
                res_exit = 0
                res_entry = 0
                res_incoming = 0
                res_ongoing = 0
                if labels.index(renamed) in targets:
                    for i in range(0, len(targets)):
                        x = targets[i]
                        if x == labels.index(renamed):
                            res_exit += values[i]
                if labels.index(renamed) in sources:
                    for i in range(0, len(sources)):
                        if sources[i] == labels.index(renamed):
                            res_incoming += values[i]
                if targetted in labels:
                    if labels.index(targetted) in sources:
                        for i in range(0, len(sources)):
                            x = sources[i]
                            if x == labels.index(targetted):
                                res_entry += values[i]
                    if labels.index(targetted) in targets:
                        for i in range(0, len(targets)):
                            if targets[i] == labels.index(targetted):
                                res_ongoing += values[i]
                if (renamed, targetted) not in links:
                    if match[0] > rate * res_exit and match[0] > rate * res_entry and \
                            match[0] > rate * res_ongoing and match[0] > rate * res_incoming:
                        if ((targetted, renamed) in links):
                            if values[links.index((targetted, renamed))] <= match[0]:
                                links.append((renamed, targetted))
                                sources.append(labels.index(renamed))
                                if targetted in labels:
                                    targets.append(labels.index(targetted))
                                else:
                                    labels.append(targetted)
                                    targets.append(labels.index(targetted))
                                sources.pop(links.index((targetted, renamed)))
                                targets.pop(links.index((targetted, renamed)))
                                values.pop(links.index((targetted, renamed)))
                                links.pop(links.index((targetted, renamed)))
                                values.append(match[0])
                        else:
                            links.append((renamed, targetted))
                            sources.append(labels.index(renamed))
                            if targetted in labels:
                                targets.append(labels.index(targetted))
                            else:
                                labels.append(targetted)
                                targets.append(labels.index(targetted))
                            values.append(match[0])
                else:
                    values[links.index((renamed, targetted))] += match[0]

    global_matrix = generate_global_matrix(List_actions, labels)
    return [labels, links, values, global_matrix, liste_resfinal, List_actions]
    # pk.dump(labels, open('Listedespages.pkl', 'wb'))
    # pk.dump(links, open('Listofedges.pkl', 'wb'))
    # pk.dump(values, open('Listofvalues.pkl', 'wb'))
    # pk.dump((subgraphs[23], list_subsources[23], list_subtargets[23], list_subvalues[23]),
    #         open('subgraphtotestonclustering.pkl', 'wb'))