def test_relim(self):
    ts1 = perftesting.get_default_transactions()
    relim_input = itemmining.get_relim_input(ts1)
    report = itemmining.relim(relim_input, 2)
    self.assertEqual(17, len(report))
    self.assertEqual(6, report[frozenset(['b', 'd'])])

    ts2 = perftesting.get_default_transactions_alt()
    relim_input = itemmining.get_relim_input(ts2)
    report = itemmining.relim(relim_input, 2)
    self.assertEqual(19, len(report))
    self.assertEqual(5, report[frozenset(['a', 'b'])])
def draw():
    d2wl = get_health_data_2d(0, 2, "mining-data/Health Data.csv")

    # frequent
    transactions = [round(x[1] * 1) / 1 for x in d2wl[1]]
    transactions = zip(transactions)
    relim_input = itemmining.get_relim_input(transactions)
    report = itemmining.relim(relim_input, min_support=2)
    report_keys = report.keys()
    report_values = report.values()
    joint_sort_report = zip(report_values, report_keys)
    joint_sort_report.sort(reverse=True)
    print('Health analyse - walk&run:relim frequence:')
    f = open('result/Health analyse - walk&run - frequent(relim method).txt', 'w')
    print >> f, joint_sort_report
    f.close()

    ks.draw_2d(d2wl[1], 5, 'Health analyse - walk&run', d2wl[0][0], d2wl[0][1],
               is_time_sequence=True, marker='x', save_path='result/walk&run')

    d2wl = get_health_data_2d(0, 2, "mining-data/Sleep Analysis.csv")

    # frequent
    transactions = [round(x[1] * 1) / 1 for x in d2wl[1]]
    transactions = zip(transactions)
    relim_input = itemmining.get_relim_input(transactions)
    report = itemmining.relim(relim_input, min_support=2)
    report_keys = report.keys()
    a = []
    for x in report_keys:
        for i in x:
            a.append(i)
    a.sort()
    report_keys = [frozenset([x]) for x in a]
    report_sort = [{k: report[k]} for k in report_keys]
    print('Health analyse - sleep:relim frequence:')
    f = open('result/Health analyse - sleep - frequent(relim method).txt', 'w')
    print >> f, report_sort
    f.close()

    ks.draw_2d(d2wl[1], 5, 'Health analyse - sleep', d2wl[0][0], d2wl[0][1],
               is_time_sequence=True, marker='x', save_path='result/sleep')
def __findFS(df, sup):
    """Find the "frequent" itemsets for the database."""
    if type(df) == pd.core.frame.DataFrame:
        L = df.shape[0]
        tss = []
        for i in range(L):
            tss.append(df.columns[df.iloc[i] > 0].tolist())
        relim = itemmining.get_relim_input(tss)
    else:
        relim = itemmining.get_relim_input(df)
    report = itemmining.relim(relim, min_support=sup)
    return report
def associationRules(transactions, userid, followed=(), c_userid=None):
    relim_input = itemmining.get_relim_input(transactions)
    item_sets = itemmining.relim(relim_input, min_support=2)
    rules = assocrules.mine_assoc_rules(item_sets, min_support=2, min_confidence=0.5)
    recom_user = {}
    for rule_user in rules:
        if userid in rule_user[0] and not any(
                map(rule_user[1].__contains__, followed)) and not c_userid in rule_user[1]:
            # support
            support = rule_user[2] / len(transactions)
            # lift: a value greater than 1 means B becomes more likely when A occurs
            lift = (rule_user[3] / support, )
            if lift[0] <= 1:
                continue
            rule_user += lift
            recom_user[rule_user[1]] = rule_user[4]
    recom_user_sorted = sorted(recom_user.items(), key=lambda x: x[1], reverse=True)
    print("*" * 100)
    print("User recommendations (basket analysis)")
    print(recom_user_sorted)
    print("*" * 100)
    rcom_userid_list = set()
    for rcom_userid in recom_user_sorted:
        rcom_userid_list = rcom_userid_list.union(rcom_userid[0])
    return list(rcom_userid_list)
def getCooccur(ts, groups, reverse, min_s=2, min_c=0.5):
    crules = dict()
    result = dict()
    relim_input = itemmining.get_relim_input(ts)
    itemset = itemmining.relim(relim_input, min_support=min_s)
    rules = assocrules.mine_assoc_rules(itemset, min_support=min_s,
                                        min_confidence=min_c)
    # Now calculate the best rule for each cis
    # Clean the rules
    for rule in rules:
        if len(rule[0]) > 1:
            continue
        else:
            if not rule[0] in crules.keys():
                crules[rule[0]] = dict()
            for elem in rule[1]:
                crules[rule[0]][elem] = rule[3]
    for x in reverse.keys():
        if not frozenset({x}) in crules.keys():
            continue
        for y in reverse.keys():
            if not y in crules[frozenset({x})].keys():
                continue
            result[(x, y)] = crules[frozenset({x})][y]
    return result
def getFrequentItems(transactions):
    # Get frequent itemsets for the given transactions.
    relim_input = itemmining.get_relim_input(transactions)  # restructure transactions into relim input
    item_sets = itemmining.relim(relim_input, MINSUP)  # get itemsets with minimum support
    results = []
    for k, v in item_sets.items():  # collect the itemsets as plain lists
        results.append(list(k))
    return results
def printPyminingResult(transactions, support, confidence):
    print '\n\nPymining algorithm:'
    relim_input = itemmining.get_relim_input(transactions)
    item_sets = itemmining.relim(relim_input, min_support=support * 196)
    print 'Frequent item set:( size:', len(item_sets), ')'
    for key in item_sets:
        print '[',
        for keys in key:
            print keys, ',',
        print '], ',
    rules = assocrules.mine_assoc_rules(item_sets, min_support=support * 196,
                                        min_confidence=confidence)
    print '\n\nRules:'
    for rule in rules:
        print '[',
        for _ in rule[0]:
            print _,
            if (len(rule[0]) > 1):
                print ',',
        print '->',
        for _ in rule[1]:
            print _,
            if (len(rule[1]) > 1):
                print ',',
        print '], confidence:', rule[3], ', support:', rule[2] / float(196)
def freqPaatterns(transactions, len, outfName):
    # Note: the parameter `len` shadows the builtin len(); it holds the number of transactions.
    print "fp mining"
    relim_input = itemmining.get_relim_input(transactions)
    report = itemmining.relim(relim_input, min_support=len * 0.01)
    with open(outfName, "w") as fout:
        for fp in report.items():
            # print "%s\t%f\n" % (";".join(fp[0]), fp[1] / float(len))
            fout.write("%s\t%f\n" % (";".join(fp[0]), fp[1] / float(len)))
def find_frequent_artists(sample_clusters):
    """
    Finds frequent artists from a sample cluster object of users, cluster labels, and artist data
    """
    print("Finding frequent item sets")
    print(sample_clusters)

    # sample cluster data on 5000 random american users, k = 10 for k means, and top 5 artists
    frequent_artist_dict = {}
    for cluster, user_data in sample_clusters:
        print("\nFinding frequent artists for cluster {}".format(cluster))
        num_users = len(user_data.user_id)

        # calculates the minimum support of artists according to some proportion of users
        # ex: pass in 10, so min support is num users / 10, or 10% of users
        # for some reason we can't import this number as a parameter...?
        min_sup = math.floor(num_users / 5)
        if min_sup == 0:
            min_sup = 1

        # this is for humongous clusters where a large minimum support ( > 300 ) doesn't really make sense
        # for the Last.fm data set
        # if num_users > 1000:
        #     min_sup = num_users/20
        # print("min sup: ", min_sup)
        # print("number of users: {}".format(num_users))

        # create a list of "transactions" for frequent mining from the top artists for the current user
        transactions = (list(user_data.top_artists))
        relim_input = itemmining.get_relim_input(transactions)

        # the report stores each frequent item as a dictionary of the form:
        # frozenset(artist id, artist name) : count
        report = itemmining.relim(relim_input, min_support=min_sup)

        # each frequent item is stored as a frozen set
        # process each frozen set item by converting it into a list and accessing the data
        # (through the 0th index, because it's a list with just 1 element)
        # then grabbing just the artist name through the 1st index
        # (because it is the 2nd item in the (artist ID, artist name) tuple for each frozen set)
        report = [(list(item)[0][1], report[item]) for item in report if len(item) == 1]

        # sort the report object in reverse order so the highest played artists are first
        report = sorted(report, key=lambda tup: tup[1], reverse=True)
        # print(report)

        # store the report list for the cluster number in the frequent artist dictionary
        frequent_artist_dict[cluster] = report

    return frequent_artist_dict
def frequent_itemsets(self, min_support=2):
    """Mine frequent item sets (FIM) using the RELIM algorithm.

    :param min_support: Minimum number of occurrences an itemset must have to be returned.
    :returns: A mapping of itemsets to their support count.
    :rtype: dict(set=int)
    """
    relim_input = itemmining.get_relim_input(list(self.groups()))
    return itemmining.relim(relim_input, min_support=min_support)
def freq_items(self):
    # get_relim_input() takes a list of transactions (and an optional key function)
    # and returns the data structure used as input to the relim algorithm.
    relim_input = itemmining.get_relim_input(self.transactions)
    # relim() uses that input plus a minimum support to return the frequent item
    # sets appearing in the transactions, via Recursive Elimination.
    item_sets = itemmining.relim(relim_input, self.min_sup)
    return item_sets
def testDefaultSupportConf(self):
    ts1 = perftesting.get_default_transactions()
    relim_input = itemmining.get_relim_input(ts1)
    report = itemmining.relim(relim_input, 2)
    rules = assocrules.mine_assoc_rules(report, min_support=2)
    self.assertEqual(23, len(rules))
    a_rule = (frozenset(['b', 'e']), frozenset(['d']), 2, 1.0)
    self.assertTrue(a_rule in rules)

    ts2 = perftesting.get_default_transactions_alt()
    relim_input = itemmining.get_relim_input(ts2)
    report = itemmining.relim(relim_input, 2)
    rules = assocrules.mine_assoc_rules(report, min_support=2)
    self.assertEqual(29, len(rules))
    a_rule = (frozenset(['e']), frozenset(['a', 'd']), 2, 2.0/3.0)
    self.assertTrue(a_rule in rules)
def testDefaultSupportConf(self):
    ts1 = perftesting.get_default_transactions()
    relim_input = itemmining.get_relim_input(ts1)
    report = itemmining.relim(relim_input, 2)
    rules = assocrules.mine_assoc_rules(report, min_support=2)
    self.assertEqual(20, len(rules))
    a_rule = (frozenset(['b', 'e']), frozenset(['d']), 2, 1.0)
    self.assertTrue(a_rule in rules)

    ts2 = perftesting.get_default_transactions_alt()
    relim_input = itemmining.get_relim_input(ts2)
    report = itemmining.relim(relim_input, 2)
    rules = assocrules.mine_assoc_rules(report, min_support=2)
    self.assertEqual(20, len(rules))
    a_rule = (frozenset(['e']), frozenset(['a', 'd']), 2, 2.0/3.0)
    self.assertTrue(a_rule in rules)
def testSupport5(self):
    ts1 = perftesting.get_default_transactions()
    relim_input = itemmining.get_relim_input(ts1)
    report = itemmining.relim(relim_input, 5)
    rules = assocrules.mine_assoc_rules(report, min_support=5)
    self.assertEqual(2, len(rules))
    a_rule = (frozenset(['d']), frozenset(['b']), 6, 0.75)
    self.assertTrue(a_rule in rules)
def testConfidence075(self):
    ts1 = perftesting.get_default_transactions()
    relim_input = itemmining.get_relim_input(ts1)
    report = itemmining.relim(relim_input, 2)
    rules = assocrules.mine_assoc_rules(report, min_support=2, min_confidence=0.75)
    self.assertEqual(8, len(rules))
    a_rule = (frozenset(["b"]), frozenset(["d"]), 6, 0.75)
    self.assertTrue(a_rule in rules)
def freq_items(self):
    """
    Use the itemmining tools to perform relim mining.

    Returns:
        item_sets: frequent item sets
    """
    relim_input = itemmining.get_relim_input(self.transactions)
    item_sets = itemmining.relim(relim_input, self.min_sup)
    return item_sets
def get_association_rules(seqs, min_support=2):
    transactions = list(seqs)
    # print transactions
    relim_input = itemmining.get_relim_input(transactions)
    item_sets = itemmining.relim(relim_input, min_support=min_support)
    rules = assocrules.mine_assoc_rules(item_sets, min_support=min_support, min_confidence=0.5)
    # print(rules)
    return rules
def find_freq_sets():
    """Match"""
    print "\n", my.TS_START, my.TS_WINDOW, "\n"
    SQL = """SELECT timestamp, text \
        FROM {rel_tweet} \
        WHERE timestamp BETWEEN '{ts_start}' AND timestamp '{ts_start}' + INTERVAL '{window} days'
        """.format(rel_tweet=my.REL_TWEET, ts_start=my.TS_START, window=my.TS_WINDOW)
    print "Querying DB..."
    con = psycopg2.connect(my.DB_CONN_STRING)
    cur = con.cursor()
    cur.execute(SQL)
    recs = cur.fetchall()
    con.close()
    print "{count} records retrieved.".format(count=len(recs))

    global sw
    sw = stopwords.words("english")
    sw.extend(my.STOPWORDS)
    sw = list(set(sw))

    global tokens
    with open("data/" + my.DATA_FOLDER + "tokens.json", "rb") as fp:
        tokens = sorted(anyjson.loads(fp.read()))
    print len(tokens)

    pool = Pool(processes=my.PROCESSES)
    tweets = pool.map(_trim_tweet, recs)
    tweets = filter(None, tweets)
    tweets_len = len(tweets)
    recs = None
    print "{count} tokenized tweets to be processed.".format(count=len(tweets))

    # Frequent itemset mining
    relim_input = itemmining.get_relim_input(tweets)
    tweets = None
    print "Generated Relim input."
    sets = itemmining.relim(relim_input, min_support=int(math.sqrt(tweets_len)))
    relim_input = None
    print len(sets), min(sets.values()), max(sets.values())

    sets = sorted(sets.items(), key=lambda x: x[1], reverse=True)
    texts = []
    for s, f in sets[: my.K]:
        txt = " ".join(tokens[i] for i in list(s))
        texts.append(txt)

    filename = "data/" + my.DATA_FOLDER + "frequent_sets"
    with open(filename + "_all" + ".pickle", "wb") as fp:
        pickle.dump(sets, fp)
    with open(filename + ".pickle", "wb") as fp:
        pickle.dump(sets[: my.K], fp)
    with open(filename + ".txt", "wb") as fp:
        fp.write("\n".join(texts))
def test_relim(should_print=False, ts=None, support=2):
    if ts is None:
        ts = get_default_transactions()
    relim_input = get_relim_input(ts, lambda e: e)
    fis = set()
    report = {}
    n = _relim(relim_input, fis, report, support)
    if should_print:
        print(n)
        print(report)
    return (n, report)
def testConfidenceForComplexRules(self): transab = (("a", "b"),) * 1000 transac = (("a", "c"),) * 1000 transabc = (("a", "b", "c"),) * 5 trans = transab + transac + transabc relim_input = itemmining.get_relim_input(trans) report = itemmining.relim(relim_input, 5) rules = assocrules.mine_assoc_rules(report, min_support=5, min_confidence=0.9) self.assertEqual(3, len(rules)) a_rule = (frozenset(["b", "c"]), frozenset(["a"]), 5, 1.0) self.assertTrue(a_rule in rules)
def testConfidenceForComplexRules(self):
    transab = (('a', 'b'), ) * 1000
    transac = (('a', 'c'), ) * 1000
    transabc = (('a', 'b', 'c'), ) * 5
    trans = transab + transac + transabc
    relim_input = itemmining.get_relim_input(trans)
    report = itemmining.relim(relim_input, 5)
    rules = assocrules.mine_assoc_rules(
        report, min_support=5, min_confidence=0.9)
    self.assertEqual(3, len(rules))
    a_rule = (frozenset(['b', 'c']), frozenset(['a']), 5, 1.0)
    self.assertTrue(a_rule in rules)
def frequentSet(input):
    relim_input = itemmining.get_relim_input(input)
    report = itemmining.relim(relim_input, min_support=MIN_SUPPORT)
    return report

    # The early return above bypasses the filtering below; the dead code is kept for reference.
    filteredResult = []
    for item in report:
        for key in item:
            if validDependent(list(key)):
                filteredResult.append(item)
    return (0, filteredResult)
def _freq_phrases(id):
    '''Map function'''
    with open('data/' + my.DATA_FOLDER + 'sets.json', 'rb') as fp:
        sets = anyjson.loads(fp.read())
    keywords = tuple(itertools.chain(*sets[str(id)]))

    data_path = 'data/' + my.DATA_FOLDER + 'data/'
    with open(data_path + str(id) + '.txt', 'rb') as fp:
        cr = csv.reader(fp, delimiter=',')
        tweets = [row[1].split() for row in cr]
    tweets_len = len(tweets)
    print 'Tweets:', tweets_len
    if tweets_len < 5000:
        return

    fd = FreqDist(tuple(itertools.chain(*tweets)))
    vocab = fd.keys()[:-fd.Nr(1)]
    for w in keywords:
        if w in vocab:
            vocab.remove(w)
    print 'Tokens:', fd.N(), ',', fd.B(), '-', fd.Nr(1), '=', len(vocab)

    path = 'data/' + my.DATA_FOLDER + 'frequent_tokens/'
    if not os.path.exists(path):
        os.makedirs(path)
    with open(path + str(id) + '.txt', 'wb') as fp:
        fp.write('\n'.join(vocab))

    words = dict((w, vocab.index(w)) for w in vocab)
    tweets = tuple(tuple(words[w] for w in tw if w in words)
                   for tw in tweets)

    relim_input = itemmining.get_relim_input(tweets)
    print 'Generated Relim input... Min support:', math.sqrt(tweets_len)
    sets = itemmining.relim(relim_input, min_support=int(math.sqrt(tweets_len)))
    print len(sets), min(sets.values()), max(sets.values())

    sets = sorted(sets.items(), key=lambda x: x[1], reverse=True)
    texts = []
    for s, f in sets[:1000]:
        txt = ' '.join(vocab[i] for i in tuple(s))
        texts.append(txt)

    path = 'data/' + my.DATA_FOLDER + 'frequent_phrases/'
    if not os.path.exists(path):
        os.makedirs(path)
    with open(path + str(id) + '_all' + '.pickle', 'wb') as fp:
        pickle.dump(sets, fp)
    with open(path + str(id) + '.pickle', 'wb') as fp:
        pickle.dump(sets[:my.K], fp)
    with open(path + str(id) + '.txt', 'wb') as fp:
        fp.write('\n'.join(texts))
def mine_rules_relim(self, baskets):
    print "preparing itemset"
    relim_input = itemmining.get_relim_input(baskets)
    print "finding frequent itemsets"
    self.item_sets = itemmining.relim(relim_input, min_support=len(baskets) * self.min_support)
    print "finding association rules"
    self.rules = assocrules.mine_assoc_rules(self.item_sets, len(baskets),
                                             min_support=self.min_support,
                                             min_confidence=self.min_confidence,
                                             min_lift=self.min_lift)
    # sort by support
    self.nonmax_suppression()
    self.rules = sorted(self.rules, key=lambda x: -x[2])
def fun1():
    transactions = (('a', 'b', 'c'), ('b'), ('a'), ('a', 'c', 'd'), ('b', 'c'), ('b', 'c'))
    relim_input = itemmining.get_relim_input(transactions)
    report = itemmining.relim(relim_input, min_support=2)
    print("Here is the default transactions data.\n{}".format(transactions))
    time.sleep(0.5)
    print("See the default frequent itemsets?[y/n]")
    p1 = input()
    if p1 == 'y':
        print(report)
        input("Press any button to return to CONTENT")
    else:
        input("Thank you for visiting, press any button to return to CONTENT")
def getAssoc(transactions, min_s=2, min_c=0.5):
    '''
    Mine association rules for each key of `transactions` and return a dict
    mapping each key to its list of rules.
    '''
    result = dict()
    for key in transactions.keys():
        relim_input = itemmining.get_relim_input(transactions[key])
        itemset = itemmining.relim(relim_input, min_support=min_s)
        result[key] = assocrules.mine_assoc_rules(itemset, min_support=min_s,
                                                  min_confidence=min_c)
    return result
def execute(self, data_source):
    import csv
    transactions = []
    with open(data_source, 'r') as f:
        reader = csv.reader(f)
        transactions = list(reader)
    # print(transactions)
    # transactions = [['a', 'b', 'c'], ['b'], ['a'], ['a', 'c', 'd'], ['b', 'c'], ['b', 'c']]
    # print(type(transactions))
    relim_input = itemmining.get_relim_input(transactions)
    report = itemmining.relim(relim_input, min_support=self.support.get())
    result = ""
    for itemset, count in report.items():
        result = result + ", ".join(itemset) + ": " + str(count) + "\n"
    # print(report)
    return result
def find_similarity_sets(segmented_df, minimal_support):
    global TRANSACTIONS, ITEMSETS, FINAL_ROWS
    TRANSACTIONS = []
    FINAL_ROWS = {}

    print("Getting going")
    segmented_df = segmented_df.groupby("time_group").apply(squash)

    print("FIM...")
    transactions = tuple(TRANSACTIONS)  # perftesting.get_default_transactions()
    print("Got " + str(len(transactions)) + " transactions.")
    relim_input = itemmining.get_relim_input(transactions)
    ITEMSETS = itemmining.relim(relim_input, min_support=minimal_support)

    # clean for closed frequent patterns
    itemsets1 = []
    print("Closing the patterns...")
    for s in ITEMSETS:
        can_add = True
        for j in range(len(itemsets1)):
            if set(s).issubset(itemsets1[j]):
                can_add = False
                break
            if set(s).issuperset(itemsets1[j]):
                itemsets1[j] = set(s)
                can_add = False
                break
        if can_add:
            itemsets1.append(s)
    ITEMSETS = itemsets1

    # per itemset determine rows
    print("Per Window go...")
    segmented_df.index = range(len(segmented_df))
    max_idx = segmented_df["time_group"].max()
    segmented_df = segmented_df.groupby("time_group").apply(
        lambda x: put_to_final(x, max_idx))

    # write result
    res_df = pandas.DataFrame(
        [[str(a) for a in r[:-2]] + [r[-2], r[-1]] for r in list(FINAL_ROWS.values())],
        columns=["indices", "values", "length_indices", "length_content"])
    res_df = res_df[res_df["length_content"] >= minimal_support]
    return res_df.groupby("indices").apply(lambda x: x.iloc[-1])
def mine_patterns(data, MINING_METHOD, CONFUSION_MATRIX):
    if (MINING_METHOD == 'seq_mining'):
        mined_patterns = {
            KEY: sorted(seqmining.freq_seq_enum(
                [data[KEY][trace_id] for trace_id in data[KEY]], min_support=2))
            for KEY in CONFUSION_MATRIX
        }
    if (MINING_METHOD == 'item_mining'):
        mined_patterns_to_be_preprocessed = {
            KEY: itemmining.relim(itemmining.get_relim_input(
                [data[KEY][trace_id] for trace_id in data[KEY]]), min_support=2)
            for KEY in CONFUSION_MATRIX
        }
        mined_patterns = {
            KEY: [
                (tuple(element), mined_patterns_to_be_preprocessed[KEY][element])
                for element in mined_patterns_to_be_preprocessed[KEY]]
            for KEY in CONFUSION_MATRIX
        }
    return mined_patterns
def execute(self, data_source):
    import csv
    transactions = []
    with open(data_source, 'r') as f:
        reader = csv.reader(f)
        transactions = list(reader)
    # print(transactions)
    # transactions = [['a', 'b', 'c'], ['b'], ['a'], ['a', 'c', 'd'], ['b', 'c'], ['b', 'c']]
    # print(type(transactions))
    relim_input = itemmining.get_relim_input(transactions)
    item_sets = itemmining.relim(relim_input, min_support=self.support.get())
    rules = assocrules.mine_assoc_rules(item_sets, min_support=self.support.get(),
                                        min_confidence=self.confidence.get_float())
    result = ""
    for rule in rules:
        print(rule[0])
        result = result + ", ".join(rule[0]) + " => " + ", ".join(rule[1]) + "\n"
        # result = result + ", ".join(rule[0]) + " => " + ", ".join(rule[1]) + ": " + str(rule[2]) + ", " + str(rule[3]) + "\n"
    # print(report)
    return result
def association_rules(data, min_support, min_confidence):
    """
    Generates association rules from crawled data
    """
    badges = data['badges']
    transactions = data['transactions']

    # pymining only works, if the identifiers are one character strings :(
    transactions = tuple(tuple(chr(b) for b in t) for t in transactions)

    # pymining dance
    relim_input = itemmining.get_relim_input(transactions)
    item_sets = itemmining.relim(relim_input, min_support=min_support)
    rules = assocrules.mine_assoc_rules(item_sets, min_support=min_support,
                                        min_confidence=min_confidence)

    # translate identifiers back to badge names
    rules = [[frozenset(badges[ord(b)] for b in r[0]),
              frozenset(badges[ord(b)] for b in r[1]),
              r[2], r[3]] for r in rules]
    return rules
def fun2():
    transactions = (('a', 'b', 'c'), ('b'), ('a'), ('a', 'c', 'd'), ('b', 'c'), ('b', 'c'))
    relim_input = itemmining.get_relim_input(transactions)
    item_sets = itemmining.relim(relim_input, min_support=2)
    rules = assocrules.mine_assoc_rules(item_sets, min_support=2, min_confidence=0.5)
    print("The default transactions data is:")
    print(transactions)
    time.sleep(0.5)
    input("Press any button to continue...")
    print(
        "Here are the association rules we have mined. Each frozenset is a pattern in the transactions"
    )
    time.sleep(1)
    print(rules)
    print(
        "\nNote: (frozenset({'e'}), frozenset({'b', 'd'}), 2, 1.0) means:\n"
        "  e -> b, d with support 2 and confidence 1.0"
    )
    input("Press any button to return to CONTENT")
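# The note above describes the rule format. As a minimal, self-contained sketch
# (not taken from any of the snippets here), each rule returned by
# assocrules.mine_assoc_rules is a 4-tuple (antecedent, consequent, support,
# confidence); the unpacking names below are illustrative only.
from pymining import itemmining, assocrules

transactions = (('a', 'b', 'c'), ('b',), ('a',), ('a', 'c', 'd'), ('b', 'c'), ('b', 'c'))
relim_input = itemmining.get_relim_input(transactions)
item_sets = itemmining.relim(relim_input, min_support=2)
rules = assocrules.mine_assoc_rules(item_sets, min_support=2, min_confidence=0.5)

# Print each rule in "antecedent -> consequent" form with its support and confidence.
for antecedent, consequent, support, confidence in rules:
    print("{} -> {} (support={}, confidence={:.2f})".format(
        set(antecedent), set(consequent), support, confidence))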
def getAssoc2(ts, groups, reverse, min_s=2, min_c=0.5):
    crules = dict()
    result = dict()
    relim_input = itemmining.get_relim_input(ts)
    itemset = itemmining.relim(relim_input, min_support=min_s)
    rules = assocrules.mine_assoc_rules(itemset, min_support=min_s,
                                        min_confidence=min_c)
    # Now calculate the best rule for each cis
    # Clean the rules
    for rule in rules:
        if len(rule[0]) > 1:
            continue
        else:
            if rule[0] in crules.keys():
                if (len(rule[1]) + 1) * rule[3] <= crules[rule[0]]:
                    continue
            crules[rule[0]] = (len(rule[1]) + 1) * rule[3]
    for cis in groups.keys():
        key = frozenset({groups[cis]})
        if key in crules.keys():
            result[cis] = crules[key]
    return result
def _build_phrase(self):
    input_list = {}
    for post in self.valid:
        for cate in post["category"]:
            if cate not in input_list:
                input_list[cate] = []
            input_list[cate].append(post["title"] + post["content"])
    phrases = {}
    for cate, posts in input_list.items():
        relim_input = itemmining.get_relim_input(posts)
        fis = itemmining.relim(relim_input, min_support=2)
        phrases[cate] = {}
        for phrase, count in fis.items():
            new_p = list(phrase)
            if len(new_p) >= 2:
                phrases[cate]['_'.join(new_p)] = count
        print(cate)
        print(phrases[cate])
    phrase_json = open(os.path.join("../data/sto/phrase_dict.json"), 'w')
    # json.dump takes the object first, then the file handle
    json.dump(phrases, phrase_json)
    phrase_json.close()
    return phrases
def _execute(self):
    self.transactions = mongoComputeHashTagItemSets(self.name)
    relim_input = itemmining.get_relim_input(self.transactions)
    self.item_sets = itemmining.relim(relim_input, self.min_support)
    self.rules = assocrules.mine_assoc_rules(self.item_sets, self.min_support, self.min_confidence)
#%%
sorted_rules = sorted(rules, key=lambda tup: tup[2], reverse=True)
sorted_rules
top_rules = sorted_rules[:10]

def get_product_names(list_ids):
    return tuple([df_fruits.loc[id, 'product_name'] for id in list_ids])

top_rules_names = [tuple(list(map(get_product_names, rule[:2])) + [rule[2]]) for rule in top_rules]

#%% Fast implemented
# This takes long
from pymining import itemmining, assocrules
transactions = aisle_orders['products']
item_sets = itemmining.get_relim_input(transactions)

#%%
min_supp = SUPPORT_THRESHOLD * NUMBER_ORDERS_AISLE
item_sets = itemmining.relim(item_sets, min_support=min_supp)

#%%
thresholds = [0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]
times = []
max_lengths = []
numbers = []
for t in thresholds:
    start = time.time()
    rules = assocrules.mine_assoc_rules(item_sets, min_support=min_supp, min_confidence=t)
    execution_time = time.time() - start
    times.append(execution_time)
def FreqentPattern(fileloc):
    megawordlist = ReadProcessedSent(fileloc)
    relim_input = itemmining.get_relim_input(megawordlist)
    patternlist = itemmining.relim(relim_input, min_support=0.007 * len(megawordlist))
    return patternlist
def get_relim_input(self):
    list_of_neighbors = self.g.neighborhood(vertices=self.g.vs, order=1)
    return itemmining.get_relim_input(list_of_neighbors)
def Apriori_three(data_tuple):
    transactions = perftesting.get_default_transactions()
    relim_input = itemmining.get_relim_input(transactions)
    item_sets = itemmining.relim(relim_input, min_support=50)  # minimum support
    rules = assocrules.mine_assoc_rules(item_sets, min_support=2, min_confidence=0.5)
    print(rules)
def Apriori_tow(data_tuple):
    transactions = data_tuple
    relim_input = itemmining.get_relim_input(transactions)
    item_sets = itemmining.relim(relim_input, min_support=100)
    rules = assocrules.mine_assoc_rules(item_sets, min_support=100, min_confidence=0.5)
    print(rules)
def Apriori_one(data_tuple):
    relim = itemmining.get_relim_input(data_tuple)
    report = itemmining.relim(relim, min_support=100)  # minimum support
    print(report)
    break

rulepitlens = []
ruledurlens = []
rulepitdurlens = []
patpitlens = []
patdurlens = []
patpitdurlens = []

# 20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1
for sup in [
        30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14,
        13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2
]:
    print(sup)
    relim_input = itemmining.get_relim_input(allpit)
    print("length relim_input:" + str(len(relim_input)))
    for item in relim_input:
        print("length item:" + str(len(item)))
        print(item)
    item_sets = itemmining.relim(relim_input, min_support=sup)
    print("length item_sets:" + str(item_sets))
    # rules = assocrules.mine_assoc_rules(item_sets, min_support=sup, min_confidence=0.5)
    # print(len(rules))
    # print((rules))
    # rulepitlens.append(len(rules))
    # print(nonsense)
    # relim_input = itemmining.get_relim_input(durfam)
def freq_items(self):
    relim_input = itemmining.get_relim_input(self.transactions)
    item_sets = itemmining.relim(relim_input, self.min_sup)
    return item_sets
# print(gramsstring)
# print(len(gramsstring))
for things in grams:
    inputlist.append(things)

print("input item counts:" + str(len(inputlist)))
# for inputentry in inputlist:
#     print(inputentry)
# print(inputlist)

# 20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1
# ,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2
for sup in [30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15,
            14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2]:
    # print(sup)
    relim_input = itemmining.get_relim_input(inputlist)
    # print("length relim_input:" + str(len(relim_input)))
    # for item in relim_input:
    #     print("length item:" + str(len(item)))
    #     print(item)
    item_sets = itemmining.relim(relim_input, min_support=sup)
    # print("length item_sets:" + str(len(item_sets)))
    itempitlens.append(len(item_sets))
    # print(item_sets)
    rules = assocrules.mine_assoc_rules(item_sets, min_support=sup, min_confidence=0)
    # print(len(rules))
    # print((rules))
    rulepitlens.append(len(rules))
def twitter_analysis(path, files):
    tweets = []
    words = []
    bigrams = []
    hashtags = []
    mentions = []
    twitterpic = []
    instagrampic = []
    otherUrls = []
    positive_terms = []
    negative_terms = []
    usefulwords = []

    # read input file and save tweets
    f = open(str(path) + '/tweets/' + str(files) + '.txt_parsed.txt_final.txt')
    for line in f:
        if line.startswith('@@@'):
            try:
                tweet = line.strip().lower().split('\t')[3]
            except IndexError:
                tweet = " "
            tweets.append(tweet)

            # words
            terms = tweet.split()
            words.append(terms)

            ## useful words ##
            usefulword = []
            for term in terms:
                if term in english_stopwords:
                    continue
                else:
                    usefulword.append(term)
            usefulwordt = tuple(usefulword)
            usefulwords.append(usefulwordt)
            usefulwordst = tuple(usefulwords)

            # two grams
            twograms = ngrams(terms, 2)
            tgs = []
            for twogram in twograms:
                joined = '_'.join(twogram)
                tgs.append(joined)
            ## the original code returned a generator, so I changed it ##
            bigrams.append(tgs)

            # hash tags
            myHashtags = re.findall('#[^ ]+', tweet)
            hashtags.append(myHashtags)

            # mentions
            myMentions = re.findall('@[^ ]+', tweet)
            mentions.append(myMentions)

            # twitter pic
            myTp = re.findall('pic.twitter.com/[^ ]+', tweet)
            twitterpic.append(myTp)

            # instagram pic
            myIp = re.findall('http://instagram.com/p/[^ ]+', tweet)
            instagrampic.append(myIp)

            # other Urls
            otherUrl = re.findall('http://[^ ]+', tweet)
            other = []
            for Url in otherUrl:
                if "http://instagram.com/p/" not in Url:
                    other.append(Url)
            otherUrls.append(other)

            # positive words
            myPos = []
            for term in terms:
                if term in english_stopwords:
                    continue
                if term in pos:
                    myPos.append(term)
            positive_terms.append(myPos)

            # negative words
            myNeg = []
            for term in terms:
                if term in english_stopwords:
                    continue
                if term in neg:
                    myNeg.append(term)
            negative_terms.append(myNeg)

    ## twitter_analysis('/Users/yuehan/Desktop/twitter','tial.txt_parsed.txt_final.txt') ## This is for demo analysis

    ## save csv files ##
    newpath = str(path) + '/text_results/'
    if not os.path.exists(newpath):
        os.makedirs(newpath)
    with open(str(path) + '/text_results/' + str(files) + '_textresults.csv', 'wb') as f1:
        w = csv.writer(f1)
        row1 = ['tweets', 'words', 'bigrams', 'hashtags', 'mentions', 'twitterpic',
                'instagrampic', 'otherUrls', 'positive_terms', 'negative_terms']
        w.writerow(row1)
        for v in range(0, len(tweets)):
            tweetss = tweets[v]
            wordss = words[v]
            wordss = ','.join(wordss)
            bigramss = bigrams[v]
            bigramss = ','.join(bigramss)
            hashtagss = hashtags[v]
            hashtagss = ','.join(hashtagss)
            mentionss = mentions[v]
            mentionss = ','.join(mentionss)
            twitterpics = twitterpic[v]
            twitterpics = ','.join(twitterpics)
            instagrampics = instagrampic[v]
            instagrampics = ','.join(instagrampics)
            otherUrlss = otherUrls[v]
            otherUrlss = ','.join(otherUrlss)
            positive_termss = positive_terms[v]
            positive_termss = ','.join(positive_termss)
            negative_termss = negative_terms[v]
            negative_termss = ','.join(negative_termss)
            w.writerow([tweetss, wordss, bigramss, hashtagss, mentionss, twitterpics,
                        instagrampics, otherUrlss, positive_termss, negative_termss])

    ## find frequent item sets (those which show up more than 3 times; I tried with 5 and some will have blank sets)
    relim_input = itemmining.get_relim_input(usefulwordst)
    report = itemmining.relim(relim_input, min_support=3)
    ## print report.keys()
    newpath = str(path) + '/frequentsets/'
    if not os.path.exists(newpath):
        os.makedirs(newpath)
    writer = csv.writer(open(str(path) + '/frequentsets/' + str(files) + '_frequentsets.csv', 'wb'))
    for key, value in report.items():
        if "', '" in str(key):
            key = str(key)
            key = key.replace("frozenset(['", "")
            key = key.replace("'])", "")
            key = key.replace('frozenset(["', '')
            key = key.replace('"])', '')
            key = key.replace("',", ",")
            key = key.replace('",', ',')
            key = key.replace(', "', ', ')
            key = key.replace(", '", ", ")
            writer.writerow([key, value])
        else:
            pass
import pandas as pd
import numpy as np
from pymining import seqmining, itemmining, assocrules, perftesting
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

studydf = pd.read_csv("studydf.csv")
violationdf = studydf[['INSPECTION DATE', 'VIOLATION CODE']].reset_index()
violationdf['VIOLATION CODE'] = violationdf['VIOLATION CODE'].astype('str')
plotseries = violationdf['VIOLATION CODE'].value_counts().iloc[0:20]
ax = sns.barplot(y=plotseries.index, x=plotseries.values, palette="Blues_d")

testdf = violationdf.groupby(['CAMIS'])['VIOLATION CODE'].apply(list)
minelist = testdf.tolist()[0:10]
minelist = tuple(tuple(x) for x in minelist)

relim_input = itemmining.get_relim_input(minelist)
item_sets = itemmining.relim(relim_input, min_support=2)
rules = assocrules.mine_assoc_rules(item_sets, min_support=2, min_confidence=0.5)
print rules

freq_seqs = seqmining.freq_seq_enum(minelist, 2)
print freq_seqs

rules2 = assocrules.mine_assoc_rules(item_sets, min_support=1, min_confidence=0.5)
print rules2
print("---------------------------") print("---------------------------") print("Eigenvector centrality (a measure of the influence of a node in a network)") print(sorted(list(nx.eigenvector_centrality(G).items()),key=operator.itemgetter(1),reverse=True)) print("---------------------------") print("---------------------------") print("Katz centrality (relative influence of a node)") print(sorted(list(nx.katz_centrality_numpy(G).items()),key=operator.itemgetter(1),reverse=True)) print("---------------------------") def sumupRelationship(relationship_mat,subject): relationship_mat.drop([subject],inplace=True).sort_values(inplace=True) friends = [] addAllNodes(nodes) buildAllGroupLink(groups) #drawNetwork() relations = buildRelationshipMat(plot=False,half=False) sumupRelationship(relations,"Micoud") #displayDegrees() #clustering() relim_input = itemmining.get_relim_input(groups_tuples) report = itemmining.relim(relim_input, 2) #http://www.cl.cam.ac.uk/~cm542/teaching/2011/stna-pdfs/stna-lecture11.pdf
#!/usr/bin/env python

from pymining import itemmining, assocrules

data = (('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'),
        ('a', 'f', 'g'),
        ('b', 'd', 'e', 'f', 'j'),
        ('a', 'b', 'd', 'i', 'k'),
        ('a', 'b', 'e', 'g'))
min_sup = 3
min_conf = 0.5

# get frequent itemsets using pymining
relim_input = itemmining.get_relim_input(data)
frequent_itemsets = itemmining.relim(relim_input, min_sup)

# get association rules using pymining
results = assocrules.mine_assoc_rules(frequent_itemsets, min_sup, min_conf)

for key in frequent_itemsets.keys():
    print(str(key) + " : " + str(frequent_itemsets[key]))
for key in results:
    print(str(key))
def frequency_item_set(self, columns=None, support=0.1, rules=False, confidence=0.8, engine='pymining'):
    """
    Use frequent item set mining to find subgroups where data goes missing together.

    Parameters:
    ----------
    columns: list, default None
        Subset of the columns you want to use.
    support: float, default 0.1
        Minimum support to use while item set mining. Too small values can break memory.
        Support should be greater than zero and less than 1.
    rules: bool, default False
        Whether association rules should be mined. If True, the method returns two
        dataframes instead of one.
    confidence: float, default 0.8
        Minimum confidence for rules being mined. Should be between 0 and 1.
    engine: {'pymining'}
        Only one engine is being supported right now.

    Returns:
    -------
    item_sets_df, rules_df : DataFrame, DataFrame
        Tabulated results for itemsets and association rules mined.
    """
    from pymining import itemmining, assocrules

    if support <= 0 or support > 1:
        # support should be between zero and one
        print('Support has to be between 0 and 1')
        return
    if confidence < 0 or confidence > 1:
        # confidence can be zero
        print('Confidence has to be between 0 and 1')
        return

    mf_ = self._masked_missframe(where=None, columns=columns, how='any')

    # Converting all missing values to 1, and non-missing to nan.
    bench = pd.DataFrame(np.where(mf_, 1, np.nan), columns=mf_.columns)
    # Replacing 1's with the index of the column they belong to.
    # Converting to numbers instead of column names for supposed performance boost.
    bench = bench * list(range(0, mf_.columns.shape[0]))

    rows = bench.values
    transactions = []
    for row in rows:
        # Removing the nans in each row and compressing the rows.
        # (nan, 1, nan, 3) --> (1, 3)
        transactions.append(tuple(row[~np.isnan(row)]))

    # Converting float threshold to represent number of rows.
    support = int(support * mf_.shape[0])

    relim_input = itemmining.get_relim_input(transactions)
    item_sets = itemmining.relim(relim_input, min_support=support)

    # Converting to DataFrames and getting column names back.
    item_sets_df = pd.DataFrame({'Itemset': list(item_sets.keys()),
                                 'Support': list(item_sets.values())})
    item_sets_df.Itemset = item_sets_df.Itemset.apply(lambda x: mf_.columns[list(x)].tolist())

    # For now the same support is being used in FIM and association rules.
    rules = assocrules.mine_assoc_rules(item_sets, min_support=support, min_confidence=confidence)

    # Converting rules to DataFrame and getting column names back.
    rules_df = pd.DataFrame(rules, columns=['X =>', 'Y', 'Support', 'Confidence'])
    rules_df['X =>'] = rules_df['X =>'].apply(lambda x: mf_.columns[list(x)].tolist())
    rules_df['Y'] = rules_df['Y'].apply(lambda x: mf_.columns[list(x)].tolist())

    return item_sets_df, rules_df
for key, value in arr_backup.items():
    if len(value) > 1:
        # Just in case ALL elements are the same in the list - since a "set" does not
        # allow for duplicates, we use this construct here
        if len(set(value)) == 1:
            support = len(value)
            # converting the 1st character of the company name back to upper case
            for i in range(0, len(value)):
                value[i] = str(value[i][0]).upper() + str(value[i][1:])
            company.append(value)
        elif len(set(value)) > 1:
            list6 = []
            for m in range(0, len(value)):
                list6.append(str(value[m]).split())
            transactions = list6
            support = len(value)
            relim_input = itemmining.get_relim_input(transactions)
            report = itemmining.relim(relim_input, support)
            c = report.keys()
            c.sort()
            m = 0
            flag = 0
            for m in range(0, len(value)):
                for n in range(0, len(list(c[-1]))):
                    if re.search(value[m], list(c[-1])[n]):
                        flag = 1
                    else:
                        flag = 0
                company.append([str(value[m][0]).upper() + value[m][1:]] * support)
                break
    elif len(value) == 1:
        try:
def find_frequent_itemsets(transactions, support):
    relim_input = itemmining.get_relim_input(transactions)
    item_sets = itemmining.relim(relim_input, min_support=support)
    return item_sets
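# A minimal, self-contained usage sketch of the relim pipeline wrapped by the
# helper above. The toy transactions here are illustrative and not taken from
# any of the snippets' datasets.
from pymining import itemmining

transactions = (('a', 'b'), ('b', 'c'), ('a', 'b', 'c'), ('b',))

# Equivalent to find_frequent_itemsets(transactions, 2) above.
relim_input = itemmining.get_relim_input(transactions)
item_sets = itemmining.relim(relim_input, min_support=2)

# relim returns a dict mapping each frequent itemset (a frozenset) to its
# support count, e.g. frozenset({'b'}) -> 4 for this toy data.
print(item_sets)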