Example #1
    def test_relim(self):
        ts1 = perftesting.get_default_transactions()
        relim_input = itemmining.get_relim_input(ts1)
        report = itemmining.relim(relim_input, 2)
        self.assertEqual(17, len(report))
        self.assertEqual(6, report[frozenset(['b', 'd'])])

        ts2 = perftesting.get_default_transactions_alt()
        relim_input = itemmining.get_relim_input(ts2)
        report = itemmining.relim(relim_input, 2)
        self.assertEqual(19, len(report))
        self.assertEqual(5, report[frozenset(['a', 'b'])])
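For reference, a minimal sketch of the pipeline these assertions exercise, using toy data instead of the perftesting fixtures; relim returns a dict that maps each frequent itemset, as a frozenset, to its support count:

from pymining import itemmining

# toy transactions; items can be any hashable values
transactions = (('a', 'b'), ('b', 'd'), ('b', 'd'))
relim_input = itemmining.get_relim_input(transactions)
report = itemmining.relim(relim_input, 2)  # min_support=2
assert report[frozenset(['b', 'd'])] == 2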
Example #3
def draw():
    d2wl = get_health_data_2d(0, 2, "mining-data/Health Data.csv")
    # frequent
    transactions = [round(x[1] * 1) / 1 for x in d2wl[1]]
    transactions = [(t,) for t in transactions]  # each value becomes a 1-item transaction
    relim_input = itemmining.get_relim_input(transactions)
    report = itemmining.relim(relim_input, min_support=2)
    joint_sort_report = sorted(zip(report.values(), report.keys()),
                               reverse=True)
    print('Health analyse - walk&run: relim frequencies:')
    with open('result/Health analyse - walk&run - frequent(relim method).txt',
              'w') as f:
        print(joint_sort_report, file=f)
    ks.draw_2d(d2wl[1],
               5,
               'Health analyse - walk&run',
               d2wl[0][0],
               d2wl[0][1],
               is_time_sequence=True,
               marker='x',
               save_path='result/walk&run')

    d2wl = get_health_data_2d(0, 2, "mining-data/Sleep Analysis.csv")
    # frequent
    transactions = [round(x[1] * 1) / 1 for x in d2wl[1]]
    transactions = [(t,) for t in transactions]  # each value becomes a 1-item transaction
    relim_input = itemmining.get_relim_input(transactions)
    report = itemmining.relim(relim_input, min_support=2)

    report_keys = report.keys()
    a = []
    for x in report_keys:
        for i in x:
            a.append(i)
    a.sort()
    report_keys = [frozenset([x]) for x in a]
    report_sort = [{k: report[k]} for k in report_keys]

    print('Health analyse - sleep: relim frequencies:')
    with open('result/Health analyse - sleep - frequent(relim method).txt',
              'w') as f:
        print(report_sort, file=f)
    ks.draw_2d(d2wl[1],
               5,
               'Health analyse - sleep',
               d2wl[0][0],
               d2wl[0][1],
               is_time_sequence=True,
               marker='x',
               save_path='result/sleep')
Example #4
def __findFS(df, sup):
    """
    This function is used to find the "frequent" itemsets
    for the database
    """
    if type(df) == pd.core.frame.DataFrame:
        L = df.shape[0]
        tss = []
        for i in range(L):
            tss.append(df.columns[df.iloc[i] > 0].tolist())
        relim = itemmining.get_relim_input(tss)
    else:
        relim = itemmining.get_relim_input(df)
    report = itemmining.relim(relim, min_support=sup)
    return report
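A hedged usage sketch of the pipeline __findFS wraps, assuming a one-hot style DataFrame where a positive cell means the item occurs in that row's transaction (the data is made up for illustration; __findFS itself is name-mangled, so its DataFrame branch is shown inline):

import pandas as pd
from pymining import itemmining

# hypothetical one-hot transaction matrix: rows are transactions, columns are items
df = pd.DataFrame({'a': [1, 1, 0], 'b': [1, 0, 1], 'c': [0, 1, 1]})
tss = [df.columns[df.iloc[i] > 0].tolist() for i in range(df.shape[0])]
relim_input = itemmining.get_relim_input(tss)
print(itemmining.relim(relim_input, min_support=2))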
Example #5
def associationRules(transactions, userid, followed=(), c_userid=None):
    relim_input = itemmining.get_relim_input(transactions)
    item_sets = itemmining.relim(relim_input, min_support=2)
    rules = assocrules.mine_assoc_rules(item_sets,
                                        min_support=2,
                                        min_confidence=0.5)

    recom_user = {}
    for rule_user in rules:
        if userid in rule_user[0] and not any(
                map(rule_user[1].__contains__,
                    followed)) and not c_userid in rule_user[1]:
            # support
            support = rule_user[2] / len(transactions)
            # lift: if greater than 1, B becomes more likely to occur when A occurs
            lift = (rule_user[3] / support, )
            if lift[0] <= 1:
                continue
            rule_user += lift
            recom_user[rule_user[1]] = rule_user[4]
    recom_user_sorted = sorted(recom_user.items(),
                               key=lambda x: x[1],
                               reverse=True)
    print("*" * 100)
    print("ユーザーレコメンド(バスケット分析)")
    print(recom_user_sorted)
    print("*" * 100)
    rcom_userid_list = set()
    for rcom_userid in recom_user_sorted:
        rcom_userid_list = rcom_userid_list.union(rcom_userid[0])
    return list(rcom_userid_list)
Example #6
def getCooccur(ts, groups, reverse, min_s=2, min_c=0.5):
    crules = dict()
    result = dict()
    relim_input = itemmining.get_relim_input(ts)
    itemset = itemmining.relim(relim_input, min_support=min_s)
    rules = assocrules.mine_assoc_rules(itemset, min_support=min_s,
                                        min_confidence=min_c)
    # Now calculate the best rule for each cis
    # Clean the rules
    for rule in rules:
        if len(rule[0]) > 1:
            continue
        if rule[0] not in crules:
            crules[rule[0]] = dict()
            for elem in rule[1]:
                crules[rule[0]][elem] = rule[3]
    for x in reverse.keys():
        if frozenset({x}) not in crules:
            continue
        for y in reverse.keys():
            if y not in crules[frozenset({x})]:
                continue
            result[(x, y)] = crules[frozenset({x})][y]

    return result
Example #7
def getFrequentItems(transactions): #function to get frequent itemsets based on given transactions
    relim_input = itemmining.get_relim_input(transactions) #restructure transactions into relim input 
    item_sets = itemmining.relim(relim_input, MINSUP) #get itemsets with minimum support 
    results = []
    for k, v in item_sets.items(): #return results
        results.append(list(k))
    return results
def printPyminingResult(transactions, support, confidence):
    print('\n\nPymining algorithm:')
    relim_input = itemmining.get_relim_input(transactions)
    item_sets = itemmining.relim(relim_input, min_support=support * 196)

    print('Frequent item set: ( size:', len(item_sets), ')')
    for key in item_sets:
        print('[', ', '.join(str(k) for k in key), '],', end=' ')
    print()

    rules = assocrules.mine_assoc_rules(item_sets,
                                        min_support=support * 196,
                                        min_confidence=confidence)
    print('\n\nRules:')
    for rule in rules:
        lhs = ', '.join(str(x) for x in rule[0])
        rhs = ', '.join(str(x) for x in rule[1])
        print('[', lhs, '->', rhs, '], confidence:', rule[3],
              ', support:', rule[2] / 196.0)
Example #9
def freqPatterns(transactions, n, outfName):  # renamed: fixed typo, and `len` must not shadow the builtin
    print("fp mining")
    relim_input = itemmining.get_relim_input(transactions)
    report = itemmining.relim(relim_input, min_support=n * 0.01)
    with open(outfName, "w") as fout:
        for fp in report.items():
            fout.write("%s\t%f\n" % (";".join(fp[0]), fp[1] / float(n)))
def find_frequent_artists(sample_clusters):
    """
     Finds frequent artists from a sample cluster object of users, cluster labels, and artist data
    """

    print("Finding frequent item sets")
    print(sample_clusters)

    # sample cluster data on 5000 random american users, k = 10 for k means, and top 5 artists
    frequent_artist_dict = {}

    for cluster, user_data in sample_clusters:

        print("\nFinding frequent artists for cluster {}".format(cluster))

        num_users = len(user_data.user_id)

        # calculates the minimum support of artists according to some proportion of users
        # ex: pass in 10, so min support is num users / 10, or 10% of users
        # for some reason we can't import this number as a parameter...?
        min_sup = math.floor(num_users / 5)

        if min_sup == 0:
            min_sup = 1

        # this is for humongous clusters where a large minimum support ( > 300 ) doesn't really make sense
        # for the Last.fm data set
        # if num_users > 1000:
        #     min_sup = num_users/20

        # print("min sup: ", min_sup)
        # print("number of users: {}".format(num_users))

        # create a list of "transactions" for frequent mining from the top artists for the current user
        transactions = (list(user_data.top_artists))
        relim_input = itemmining.get_relim_input(transactions)

        # the report stores each frequent item as a dictionary of the form:
        # frozenset(artist id, artist name) : count
        report = itemmining.relim(relim_input, min_support=min_sup)

        # each frequent item is stored as a frozen set
        # process each frozen set item by converting it into a list and accessing the data
        # (through the 0th index, because it's a list with just 1 element)
        # then grabbing just the artist name through the 1st index
        # (because it is the 2nd item in the (artist ID, artist name) tuple for each frozen set

        report = [(list(item)[0][1], report[item]) for item in report
                  if len(item) == 1]

        # sort the report object in reverse order so the highest played artists are first
        report = sorted(report, key=lambda tup: tup[1], reverse=True)
        # print(report)

        # store the report list for the cluster number in the frequent artist dictionary
        frequent_artist_dict[cluster] = report

    return frequent_artist_dict
Example #11
    def frequent_itemsets(self, min_support=2):
        """Mine frequent item sets (FIM) using the RELIM algorithm.

        :param min_support: Minimum count of occurences an itemset must have to be returned.
        :returns: A mapping of itemsets to their supportcount.
        :rtype: dict(set=int)
        """
        relim_input = itemmining.get_relim_input(list(self.groups()))
        return itemmining.relim(relim_input, min_support=min_support)
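A sketch of calling this method; the enclosing class is not shown, so obj and its groups() method are assumptions (groups() is expected to yield collections of items):

# hypothetical instance of the enclosing class
itemsets = obj.frequent_itemsets(min_support=3)
for itemset, count in sorted(itemsets.items(), key=lambda kv: -kv[1]):
    print(sorted(itemset), count)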
Example #12
File: qq.py Project: ilemfans/COMP4331
 def freq_items(self):
     #Give this function a list of transactions and a key function, returns a data
     #structure used as the input of the relim algorithm.
     relim_input = itemmining.get_relim_input(self.transactions)
     #use the input from get_relim_input() and minimum support
     #to return frequent item sets of items appearing in a list of transactions
     #based on Recursive Elimination
     item_sets = itemmining.relim(relim_input, self.min_sup)
     return item_sets
Example #13
    def testDefaultSupportConf(self):
        ts1 = perftesting.get_default_transactions()
        relim_input = itemmining.get_relim_input(ts1)
        report = itemmining.relim(relim_input, 2)
        rules = assocrules.mine_assoc_rules(report, min_support=2)
        self.assertEqual(23, len(rules))

        a_rule = (frozenset(['b', 'e']), frozenset(['d']), 2, 1.0)
        self.assertTrue(a_rule in rules)

        ts2 = perftesting.get_default_transactions_alt()
        relim_input = itemmining.get_relim_input(ts2)
        report = itemmining.relim(relim_input, 2)
        rules = assocrules.mine_assoc_rules(report, min_support=2)
        self.assertEqual(29, len(rules))

        a_rule = (frozenset(['e']), frozenset(['a', 'd']), 2, 2.0/3.0)
        self.assertTrue(a_rule in rules)
Example #14
    def testDefaultSupportConf(self):
        ts1 = perftesting.get_default_transactions()
        relim_input = itemmining.get_relim_input(ts1)
        report = itemmining.relim(relim_input, 2)
        rules = assocrules.mine_assoc_rules(report, min_support=2)
        self.assertEqual(20, len(rules))

        a_rule = (frozenset(['b', 'e']), frozenset(['d']), 2, 1.0)
        self.assertTrue(a_rule in rules)

        ts2 = perftesting.get_default_transactions_alt()
        relim_input = itemmining.get_relim_input(ts2)
        report = itemmining.relim(relim_input, 2)
        rules = assocrules.mine_assoc_rules(report, min_support=2)
        self.assertEqual(20, len(rules))

        a_rule = (frozenset(['e']), frozenset(['a', 'd']), 2, 2.0/3.0)
        self.assertTrue(a_rule in rules)
Example #15
    def testSupport5(self):
        ts1 = perftesting.get_default_transactions()
        relim_input = itemmining.get_relim_input(ts1)
        report = itemmining.relim(relim_input, 5)
        rules = assocrules.mine_assoc_rules(report, min_support=5)
        self.assertEqual(2, len(rules))

        a_rule = (frozenset(['d']), frozenset(['b']), 6, 0.75)
        self.assertTrue(a_rule in rules)
Example #16
    def testConfidence075(self):
        ts1 = perftesting.get_default_transactions()
        relim_input = itemmining.get_relim_input(ts1)
        report = itemmining.relim(relim_input, 2)
        rules = assocrules.mine_assoc_rules(report, min_support=2, min_confidence=0.75)
        self.assertEqual(8, len(rules))

        a_rule = (frozenset(["b"]), frozenset(["d"]), 6, 0.75)
        self.assertTrue(a_rule in rules)
Example #18
 def freq_items(self):
     """
     Import the itemming tools and perform relim mining
     Returns:
         item_sets: frequent item sets
     """
     relim_input = itemmining.get_relim_input(self.transactions)
     item_sets = itemmining.relim(relim_input, self.min_sup)
     return item_sets
Example #19
def get_association_rules(seqs, min_support=2):
    transactions = list(seqs)

    # print transactions
    relim_input = itemmining.get_relim_input(transactions)
    item_sets = itemmining.relim(relim_input, min_support=min_support)
    rules = assocrules.mine_assoc_rules(item_sets, min_support=min_support, min_confidence=0.5)
    # print(rules)

    return rules
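The mined rules come back as (antecedent, consequent, support, confidence) tuples, so they can be unpacked directly; a small usage sketch with made-up transactions:

seqs = [('milk', 'bread'), ('milk', 'bread', 'eggs'), ('bread', 'eggs')]
for lhs, rhs, support, confidence in get_association_rules(seqs, min_support=2):
    print(set(lhs), '->', set(rhs), 'support:', support, 'confidence:', confidence)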
Example #20
def find_freq_sets():
    """Match"""
    print "\n", my.TS_START, my.TS_WINDOW, "\n"
    SQL = """SELECT timestamp, text\
			FROM {rel_tweet} \
			WHERE timestamp BETWEEN '{ts_start}'
				AND timestamp '{ts_start}' + INTERVAL '{window} days'
			""".format(
        rel_tweet=my.REL_TWEET, ts_start=my.TS_START, window=my.TS_WINDOW
    )

    print "Querying DB..."
    con = psycopg2.connect(my.DB_CONN_STRING)
    cur = con.cursor()
    cur.execute(SQL)
    recs = cur.fetchall()
    con.close()
    print "{count} records retrieved.".format(count=len(recs))

    global sw
    sw = stopwords.words("english")
    sw.extend(my.STOPWORDS)
    sw = list(set(sw))

    global tokens
    with open("data/" + my.DATA_FOLDER + "tokens.json", "rb") as fp:
        tokens = sorted(anyjson.loads(fp.read()))
    print(len(tokens))
    pool = Pool(processes=my.PROCESSES)
    tweets = pool.map(_trim_tweet, recs)
    tweets = list(filter(None, tweets))  # drop empty results; list() needed under Python 3
    tweets_len = len(tweets)
    recs = None
    print("{count} tokenized tweets to be processed.".format(count=len(tweets)))

    # Frequent itemset mining
    relim_input = itemmining.get_relim_input(tweets)
    tweets = None
    print "Generated Relim input."
    sets = itemmining.relim(relim_input, min_support=int(math.sqrt(tweets_len)))
    relim_input = None
    print(len(sets), min(sets.values()), max(sets.values()))
    sets = sorted(sets.items(), key=lambda x: x[1], reverse=True)
    texts = []
    for s, f in sets[: my.K]:
        txt = " ".join(tokens[i] for i in list(s))
        texts.append(txt)

    filename = "data/" + my.DATA_FOLDER + "frequent_sets"
    with open(filename + "_all" + ".pickle", "wb") as fp:
        pickle.dump(sets, fp)
    with open(filename + ".pickle", "wb") as fp:
        pickle.dump(sets[: my.K], fp)
    with open(filename + ".txt", "wb") as fp:
        fp.write("\n".join(texts))
Example #21
def test_relim(should_print=False, ts=None, support=2):
    if ts is None:
        ts = get_default_transactions()
    relim_input = get_relim_input(ts, lambda e: e)
    fis = set()
    report = {}
    n = _relim(relim_input, fis, report, support)
    if should_print:
        print(n)
        print(report)
    return (n, report)
Example #23
    def testConfidenceForComplexRules(self):
        transab = (("a", "b"),) * 1000
        transac = (("a", "c"),) * 1000
        transabc = (("a", "b", "c"),) * 5
        trans = transab + transac + transabc
        relim_input = itemmining.get_relim_input(trans)
        report = itemmining.relim(relim_input, 5)
        rules = assocrules.mine_assoc_rules(report, min_support=5, min_confidence=0.9)
        self.assertEqual(3, len(rules))

        a_rule = (frozenset(["b", "c"]), frozenset(["a"]), 5, 1.0)
        self.assertTrue(a_rule in rules)
Example #24
    def testConfidenceForComplexRules(self):
        transab = (('a', 'b'), ) * 1000
        transac = (('a', 'c'), ) * 1000
        transabc = (('a', 'b', 'c'), ) * 5
        trans = transab + transac + transabc
        relim_input = itemmining.get_relim_input(trans)
        report = itemmining.relim(relim_input, 5)
        rules = assocrules.mine_assoc_rules(
            report, min_support=5, min_confidence=0.9)
        self.assertEqual(3, len(rules))

        a_rule = (frozenset(['b', 'c']), frozenset(['a']), 5, 1.0)
        self.assertTrue(a_rule in rules)
Example #25
def frequentSet(transactions):
    relim_input = itemmining.get_relim_input(transactions)
    report = itemmining.relim(relim_input, min_support=MIN_SUPPORT)

    return report

    # Unreachable: the early return above intentionally bypasses the filtering below.
    filteredResult = []
    for item in report:
        for key in item:
            if validDependent(list(key)):
                filteredResult.append(item)

    return (0, filteredResult)
Example #26
File: freq.py Project: nbir/gambit-scripts
def _freq_phrases(id):
	'''Map function'''
	with open('data/' + my.DATA_FOLDER + 'sets.json', 'rb') as fp:
		sets = anyjson.loads(fp.read())
	keywords = tuple(itertools.chain(*sets[str(id)]))

	data_path = 'data/' + my.DATA_FOLDER + 'data/'
	with open(data_path + str(id) + '.txt', 'rb') as fp:
		cr = csv.reader(fp, delimiter=',')
		tweets = [row[1].split() for row in cr]
	tweets_len = len(tweets)
	print 'Tweets:', tweets_len
	if tweets_len < 5000: return

	fd = FreqDist(tuple(itertools.chain(*tweets)))
	vocab = fd.keys()[:-fd.Nr(1)]
	for w in keywords:
		if w in vocab: vocab.remove(w)
	print 'Tokens:', fd.N(), ',', fd.B(), '-', fd.Nr(1), '=', len(vocab)

	path = 'data/' + my.DATA_FOLDER + 'frequent_tokens/'
	if not os.path.exists(path): os.makedirs(path)
	with open(path + str(id) + '.txt', 'wb') as fp:
		fp.write('\n'.join(vocab))
	
	words = dict((w, vocab.index(w)) for w in vocab)
	tweets = tuple(tuple(words[w] for w in tw if w in words) \
									for tw in tweets)
	
	relim_input = itemmining.get_relim_input(tweets)
	print 'Generated Relim input... Min support:', math.sqrt(tweets_len)
	sets = itemmining.relim(relim_input, 
							min_support=int(math.sqrt(tweets_len)))
	print len(sets), min(sets.values()), max(sets.values())
	sets = sorted(sets.items(), key=lambda x:x[1], reverse=True)
	
	texts = []
	for s, f in sets[:1000]:
		txt = ' '.join(vocab[i] for i in tuple(s))
		texts.append(txt)

	path = 'data/' + my.DATA_FOLDER + 'frequent_phrases/'
	if not os.path.exists(path): os.makedirs(path)
	with open(path + str(id) + '_all' + '.pickle', 'wb') as fp:
		pickle.dump(sets, fp)
	with open(path + str(id) + '.pickle', 'wb') as fp:
		pickle.dump(sets[:my.K], fp)
	with open(path + str(id) + '.txt', 'wb') as fp:
		fp.write('\n'.join(texts))
Example #27
 def mine_rules_relim(self, baskets):
     print("preparing itemset")
     relim_input = itemmining.get_relim_input(baskets)

     print("finding frequent itemsets")
     self.item_sets = itemmining.relim(relim_input, min_support=len(baskets) * self.min_support)

     print("finding association rules")
     self.rules = assocrules.mine_assoc_rules(self.item_sets, len(baskets),
         min_support=self.min_support, min_confidence=self.min_confidence,
         min_lift=self.min_lift)

     # sort by support
     self.nonmax_suppression()
     self.rules = sorted(self.rules, key=lambda x: -x[2])
Example #28
def fun1():
    transactions = (('a', 'b', 'c'), ('b',), ('a',), ('a', 'c', 'd'),
                    ('b', 'c'), ('b', 'c'))  # note: ('b',) not ('b'), which is just a string
    relim_input = itemmining.get_relim_input(transactions)
    report = itemmining.relim(relim_input, min_support=2)
    print("Here is the default transactions data.\n{}".format(transactions))

    time.sleep(0.5)
    print("See the default frequent itemsets?[y/n]")
    p1 = input()
    if p1 == 'y':
        print(report)
        input("Press any button to return to CONTENT")
    else:
        input("Thank you for visiting, press Any button to return to CONTENT")
Example #29
def getAssoc(transactions, min_s=2, min_c=0.5):
    '''
    For each key in transactions, return the association rules as
    (antecedent, consequent, support, confidence) tuples.
    '''
    result = dict()
    for key in transactions.keys():
        relim_input = itemmining.get_relim_input(transactions[key])
        itemset = itemmining.relim(relim_input, min_support=min_s)
        result[key] = assocrules.mine_assoc_rules(itemset, min_support=min_s,
                                                  min_confidence=min_c)

    return result
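A hedged usage sketch, assuming transactions is a dict that maps some key (a group id, say) to a list of item tuples:

transactions = {
    'groupA': [('a', 'b'), ('a', 'b', 'c'), ('b', 'c')],
    'groupB': [('x', 'y'), ('x', 'y')],
}
for key, rules in getAssoc(transactions, min_s=2, min_c=0.5).items():
    print(key, rules)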
Example #30
File: assoc.py Project: ersushil/infotron
    def execute(self,data_source):
        import csv
        transactions = []
        with open(data_source, 'r') as f:
            reader = csv.reader(f)
            transactions = list(reader)
        # print(transactions)
        # transactions = [['a', 'b', 'c'], ['b'], ['a'], ['a', 'c', 'd'], ['b', 'c'], ['b', 'c']]
        # print(type(transactions))
        relim_input = itemmining.get_relim_input(transactions)
        report = itemmining.relim(relim_input, min_support = self.support.get())

        result = ""
        for itemset, count in report.items():
            result = result + ", ".join(itemset) + ": " + str(count) + "\n"
        # print(report)
        return result
Example #31
def find_similarity_sets(segmented_df, minimal_support):
    global TRANSACTIONS, ITEMSETS, FINAL_ROWS
    TRANSACTIONS = []
    FINAL_ROWS = {}
    print("Getting going")
    segmented_df = segmented_df.groupby("time_group").apply(squash)

    print("FIM...")
    transactions = tuple(TRANSACTIONS)  #perftesting.get_default_transactions()
    print("Got " + str(len(transactions)) + " transactions.")
    relim_input = itemmining.get_relim_input(transactions)
    ITEMSETS = itemmining.relim(relim_input, min_support=minimal_support)

    # clean for closed frequent patterns
    itemsets1 = []
    print("Closing the patterns...")
    for s in ITEMSETS:
        can_add = True
        for j in range(len(itemsets1)):
            if set(s).issubset(itemsets1[j]):
                can_add = False
                break
            if set(s).issuperset(itemsets1[j]):
                itemsets1[j] = set(s)
                can_add = False
                break
        if can_add:
            itemsets1.append(s)
    ITEMSETS = itemsets1

    # per itemset determine rows
    print("Per Window go...")
    segmented_df.index = range(len(segmented_df))
    max_idx = segmented_df["time_group"].max()
    segmented_df = segmented_df.groupby("time_group").apply(
        lambda x: put_to_final(x, max_idx))

    # write result
    res_df = pandas.DataFrame(
        [[str(a) for a in r[:-2]] + [r[-2], r[-1]]
         for r in list(FINAL_ROWS.values())],
        columns=["indices", "values", "length_indices", "length_content"])
    res_df = res_df[res_df["length_content"] >= minimal_support]

    return res_df.groupby("indices").apply(lambda x: x.iloc[-1])
Example #32
def mine_patterns(data, MINING_METHOD, CONFUSION_MATRIX):
    if (MINING_METHOD == 'seq_mining'):
        mined_patterns = {
            KEY: sorted(seqmining.freq_seq_enum([data[KEY][trace_id] for trace_id in data[KEY]], min_support=2))
            for KEY in CONFUSION_MATRIX
        }
    if (MINING_METHOD == 'item_mining'):
        mined_patterns_to_be_preprocessed = {
            KEY: itemmining.relim(itemmining.get_relim_input([data[KEY][trace_id] for trace_id in data[KEY]]), min_support=2)
            for KEY in CONFUSION_MATRIX
        }

        mined_patterns = {
            KEY: [
                (tuple(element), mined_patterns_to_be_preprocessed[KEY][element])
                for element in mined_patterns_to_be_preprocessed[KEY]]
            for KEY in CONFUSION_MATRIX
        }
    return mined_patterns
Example #33
File: assoc.py Project: ersushil/infotron
    def execute(self,data_source):
        import csv
        transactions = []
        with open(data_source, 'r') as f:
            reader = csv.reader(f)
            transactions = list(reader)
        # print(transactions)
        # transactions = [['a', 'b', 'c'], ['b'], ['a'], ['a', 'c', 'd'], ['b', 'c'], ['b', 'c']]
        # print(type(transactions))
        relim_input = itemmining.get_relim_input(transactions)
        item_sets = itemmining.relim(relim_input, min_support = self.support.get())
        rules = assocrules.mine_assoc_rules(item_sets, min_support=self.support.get(), min_confidence=self.confidence.get_float())
        result = ""
        for rule in rules:
            print(rule[0])
            result = result + ", ".join(rule[0]) + " => " + ", ".join(rule[1]) + "\n"

            # result = result + ", ".join(rule[0]) + " => " + ", ".join(rule[1]) + ": " + str(rule[2]) + ", " + str(rule[3]) + "\n"
        # print(report)
        return result
Example #34
File: miner.py Project: hroncok/badger
def association_rules(data, min_support, min_confidence):
    """
    Generates association rules from crawled data
    """
    badges = data['badges']
    transactions = data['transactions']

    # pymining only works, if the identifiers are one character strings :(
    transactions = tuple(tuple(chr(b) for b in t) for t in transactions)

    # pymining dance
    relim_input = itemmining.get_relim_input(transactions)
    item_sets = itemmining.relim(relim_input, min_support=min_support)
    rules = assocrules.mine_assoc_rules(item_sets, min_support=min_support,
                                        min_confidence=min_confidence)

    # translate identifiers back to badge names
    rules = [[frozenset(badges[ord(b)] for b in r[0]),
              frozenset(badges[ord(b)] for b in r[1]),
              r[2], r[3]] for r in rules]
    return rules
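A sketch of the call with made-up crawl data; the transactions hold badge indices, which the function routes through chr/ord so every item pymining sees is a one-character string (the exact data shape is an assumption read off the function body):

badges = ['helper', 'writer', 'reviewer']
data = {'badges': badges, 'transactions': [(0, 1), (0, 2), (0, 1, 2)]}
for lhs, rhs, support, confidence in association_rules(data, min_support=2, min_confidence=0.5):
    print(set(lhs), '->', set(rhs), support, confidence)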
Example #35
def fun2():
    transactions = (('a', 'b', 'c'), ('b',), ('a',), ('a', 'c', 'd'),
                    ('b', 'c'), ('b', 'c'))  # note: ('b',) not ('b'), which is just a string
    relim_input = itemmining.get_relim_input(transactions)
    item_sets = itemmining.relim(relim_input, min_support=2)
    rules = assocrules.mine_assoc_rules(item_sets,
                                        min_support=2,
                                        min_confidence=0.5)
    print("The default transactions data is:")
    print(transactions)

    time.sleep(0.5)
    input("Press any button to continue...")
    print(
        "Here is the association rules we have mined. Frozenset means the pattern in the transactions"
    )
    time.sleep(1)
    print(rules)
    print(
        "\nNote: (frozenset({'e'}), frozenset({'b', 'd'}), 2, 1.0) means:\n # e -> b, d with support 2 and confidence 1.0"
    )
    input("Press Any button to return to CONTENT")
Example #36
def getAssoc2(ts, groups, reverse, min_s=2, min_c=0.5):
    crules = dict()
    result = dict()
    relim_input = itemmining.get_relim_input(ts)
    itemset = itemmining.relim(relim_input, min_support=min_s)
    rules = assocrules.mine_assoc_rules(itemset, min_support=min_s,
                                        min_confidence=min_c)
    # Now calculate the best rule for each cis:
    # keep the best-scoring rule per single-item antecedent
    # (the original guard only updated antecedents already present,
    # so crules stayed empty forever)
    for rule in rules:
        if len(rule[0]) > 1:
            continue
        score = (len(rule[1]) + 1) * rule[3]
        if rule[0] not in crules or score > crules[rule[0]]:
            crules[rule[0]] = score
    for cis in groups.keys():
        key = frozenset({groups[cis]})
        if key in crules:
            result[cis] = crules[key]

    return result
Example #37
    def _build_phrase(self):
        input_list = {}
        for post in self.valid:
            for cate in post["category"]:
                if cate not in input_list:
                    input_list[cate] = []
                input_list[cate].append(post["title"] + post["content"])

        phrases = {}
        for cate, posts in input_list.items():
            relim_input = itemmining.get_relim_input(posts)
            fis = itemmining.relim(relim_input, min_support=2)
            phrases[cate] = {}
            for phrase, count in fis.items():
                new_p = list(phrase)
                if len(new_p) >= 2:
                    phrases[cate]['_'.join(new_p)] = count
            print(cate)
            print(phrases[cate])

        with open(os.path.join("../data/sto/phrase_dict.json"), 'w') as phrase_json:
            json.dump(phrases, phrase_json)  # json.dump takes (obj, file), not (file, obj)
        return phrases
Example #38
File: mining.py Project: cevaris/nebula
 def _execute(self):
     
     self.transactions = mongoComputeHashTagItemSets(self.name)
     relim_input = itemmining.get_relim_input(self.transactions)
     self.item_sets = itemmining.relim(relim_input, self.min_support)
     self.rules = assocrules.mine_assoc_rules(self.item_sets, self.min_support, self.min_confidence)
Example #39
#%%
sorted_rules = sorted(rules, key=lambda tup: tup[2], reverse=True)
sorted_rules

top_rules = sorted_rules[:10]

def get_product_names(list_ids):
    return tuple([df_fruits.loc[id, 'product_name'] for id in list_ids])

top_rules_names = [tuple(list(map(get_product_names, rule[:2]))+ [rule[2]]) for rule in top_rules]
#%% Fast implemented

#This takes long
from pymining import itemmining, assocrules
transactions = aisle_orders['products']
relim_input = itemmining.get_relim_input(transactions)
#%%

min_supp = SUPPORT_THRESHOLD * NUMBER_ORDERS_AISLE
item_sets = itemmining.relim(relim_input, min_support=min_supp)

#%%
thresholds = [0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]
times = []
max_lengths = []
numbers = []
for t in thresholds:
    start = time.time()          
    rules = assocrules.mine_assoc_rules(item_sets, min_support=min_supp, min_confidence=t)
    execution_time = time.time() - start
    times.append(execution_time)
def FrequentPattern(fileloc):  # renamed from "FreqentPattern" (typo)

    megawordlist = ReadProcessedSent(fileloc)
    relim_input = itemmining.get_relim_input(megawordlist)
    patternlist = itemmining.relim(relim_input, min_support=0.007 * len(megawordlist))
    return patternlist
 def get_relim_input(self):
     list_of_neighbors = self.g.neighborhood(vertices=self.g.vs,order=1)
     return itemmining.get_relim_input(list_of_neighbors)
Example #42
def Apriori_three(data_tuple):
    transactions = perftesting.get_default_transactions()
    relim_input = itemmining.get_relim_input(transactions)
    item_sets = itemmining.relim(relim_input, min_support=50)  # minimum support
    rules = assocrules.mine_assoc_rules(item_sets, min_support=2, min_confidence=0.5)
    print(rules)
Example #43
def Apriori_tow(data_tuple):
    transactions = data_tuple
    relim_input = itemmining.get_relim_input(transactions)
    item_sets = itemmining.relim(relim_input, min_support=100)
    rules = assocrules.mine_assoc_rules(item_sets, min_support=100, min_confidence=0.5)
    print(rules)
Example #44
def Apriori_one(data_tuple):
    relim = itemmining.get_relim_input(data_tuple)
    report = itemmining.relim(relim, min_support=100)  # minimum support
    print(report)
Example #45
                break
rulepitlens = []
ruledurlens = []
rulepitdurlens = []

patpitlens = []
patdurlens = []
patpitdurlens = []
#20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1
for sup in [
        30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13,
        12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2
]:
    print(sup)

    relim_input = itemmining.get_relim_input(allpit)
    print("length relim_input:" + str(len(relim_input)))
    for item in relim_input:
        print("length item:" + str(len(item)))
        print(item)
    item_sets = itemmining.relim(relim_input, min_support=sup)
    print("length item_sets:" + str(item_sets))

    #rules = assocrules.mine_assoc_rules(item_sets, min_support=sup, min_confidence=0.5)

    #print(len(rules))
    # print((rules))
    #rulepitlens.append(len(rules))

    # print(nonsense)
    # relim_input = itemmining.get_relim_input(durfam)
    def freq_items(self):

        relim_input = itemmining.get_relim_input(self.transactions)
        item_sets = itemmining.relim(relim_input, self.min_sup)
        return item_sets
Example #47
    # print(gramsstring)
    # print(len(gramsstring))
    for things in grams:
        inputlist.append(things)

    print("input item counts:"+str(len(inputlist)))
    # for inputentry in inputlist:
    #     print(inputentry)
    # print(inputlist)
    #20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1
    # ,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2
    for sup in [30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2]:
        # print(sup)


        relim_input = itemmining.get_relim_input(inputlist)
        # print("length relim_input:" + str(len(relim_input)))
        # for item in relim_input:
        #     print("length item:"+str(len(item)))
            # print(item)
        item_sets = itemmining.relim(relim_input, min_support=sup)
        # print("length item_sets:"+str(len(item_sets)))
        itempitlens.append(len(item_sets))
        # print(item_sets)

        rules = assocrules.mine_assoc_rules(item_sets, min_support=sup, min_confidence=0)

        # print(len(rules))
        # print((rules))
        rulepitlens.append(len(rules))
Example #48
def twitter_analysis(path,files):
    tweets=[]
    words=[]
    bigrams=[]
    hashtags=[]
    mentions=[]
    twitterpic=[]
    instagrampic=[]
    otherUrls=[]
    positive_terms=[]
    negative_terms=[]
    usefulwords=[]

    #read input files and save tweets#
    f=open(str(path)+'/tweets/'+str(files)+'.txt_parsed.txt_final.txt')
    for line in f:
        if line.startswith('@@@'):
            try:
                tweet=line.strip().lower().split('\t')[3]
            except IndexError:
                tweet=" "
            tweets.append(tweet)
            
            #words#
            terms=tweet.split()
            words.append(terms)

            ##useful words##
            usefulword=[]
            for term in terms:
                if term in english_stopwords: continue          
                else: usefulword.append(term)
            usefulwordt=tuple(usefulword)
            usefulwords.append(usefulwordt)            
            usefulwordst=tuple(usefulwords)
            
            #two grams#
            twograms=ngrams(terms, 2)
            tgs=[]
            for twogram in twograms:
                joined='_'.join(twogram)
                tgs.append(joined)  ##the original code returned a generator, so I changed it##
            bigrams.append(tgs)

            #hash tags#
            myHashtags=re.findall('#[^ ]+',tweet)
            hashtags.append(myHashtags)

            #mentions#
            myMentions=re.findall('@[^ ]+',tweet)
            mentions.append(myMentions)

            #twitter pic#
            myTp=re.findall('pic.twitter.com/[^ ]+',tweet)
            twitterpic.append(myTp)

            #instagram pic#
            myIp=re.findall('http://instagram.com/p/[^ ]+',tweet)
            instagrampic.append(myIp)            
            
            #other Url#
            otherUrl=re.findall('http://[^ ]+',tweet)
            other=[]
            for Url in otherUrl:
                if "http://instagram.com/p/" not in Url:
                    other.append(Url)
            otherUrls.append(other)
          
            #positive words#
            myPos=[]
            for term in terms:
                if term in english_stopwords: continue          
                if term in pos:myPos.append(term)
            positive_terms.append(myPos)

            #negative words#
            myNeg=[]
            for term in terms:
                if term in english_stopwords: continue          
                if term in neg:myNeg.append(term)
            negative_terms.append(myNeg)

##twitter_analysis('/Users/yuehan/Desktop/twitter','tial.txt_parsed.txt_final.txt')##This is for demo analysis

    ##save csv files##
    newpath = str(path)+'/text_results/'
    if not os.path.exists(newpath): os.makedirs(newpath)
    
    with open(str(path)+'/text_results/'+str(files)+'_textresults.csv','wb') as f1:
        w=csv.writer(f1)
        row1=['tweets','words','bigrams','hashtags','mentions','twitterpic','instagrampic','otherUrls','positive_terms','negative_terms']
        w.writerow(row1)
        for v in range(0,len(tweets)):
            tweetss=tweets[v]
            wordss=words[v]
            wordss=','.join(wordss)
            bigramss=bigrams[v]
            bigramss=','.join(bigramss)
            hashtagss=hashtags[v]
            hashtagss=','.join(hashtagss)
            mentionss=mentions[v]
            mentionss=','.join(mentionss)
            twitterpics=twitterpic[v]
            twitterpics=','.join(twitterpics)
            instagrampics=instagrampic[v]
            instagrampics=','.join(instagrampics)
            otherUrlss=otherUrls[v]
            otherUrlss=','.join(otherUrlss)
            positive_termss=positive_terms[v]
            positive_termss=','.join(positive_termss)
            negative_termss=negative_terms[v]
            negative_termss=','.join(negative_termss)

            w.writerow([tweetss,wordss,bigramss,hashtagss,mentionss,twitterpics,instagrampics,otherUrlss,positive_termss,negative_termss])

    ##find frequent item sets (those appearing at least 3 times; I tried 5 and some files produced blank sets)
    relim_input = itemmining.get_relim_input(usefulwordst)
    report = itemmining.relim(relim_input, min_support=3)
    ##print report.keys()
    newpath = str(path)+'/frequentsets/'
    if not os.path.exists(newpath): os.makedirs(newpath)
    writer=csv.writer(open(str(path)+'/frequentsets/'+str(files)+'_frequentsets.csv','wb'))
    for key,value in report.items():
        if "', '" in str(key):
            key=str(key)
            key=key.replace("frozenset(['","")
            key=key.replace("'])","")
            key=key.replace('frozenset(["','')
            key=key.replace('"])','')
            key=key.replace("',",",")
            key=key.replace('",',',')
            key=key.replace(', "',', ')
            key=key.replace(", '",", ")
            writer.writerow([key,value])
        else:
            pass
Example #49
import pandas as pd
import numpy as np
from pymining import seqmining, itemmining, assocrules, perftesting
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

studydf = pd.read_csv("studydf.csv")
violationdf = studydf[['INSPECTION DATE','VIOLATION CODE']].reset_index()
violationdf['VIOLATION CODE'] = violationdf['VIOLATION CODE'].astype('str')
plotseries = violationdf['VIOLATION CODE'].value_counts().iloc[0:20]
ax = sns.barplot(y=plotseries.index, x=plotseries.values, palette="Blues_d")
testdf = violationdf.groupby(['CAMIS'])['VIOLATION CODE'].apply(list)
minelist = testdf.tolist()[0:10]
minelist = tuple(tuple(x) for x in minelist)
relim_input = itemmining.get_relim_input(minelist)
item_sets = itemmining.relim(relim_input, min_support=2)
rules = assocrules.mine_assoc_rules(item_sets, min_support=2, min_confidence=0.5)
print(rules)
freq_seqs = seqmining.freq_seq_enum(minelist, 2)
print(freq_seqs)
rules2 = assocrules.mine_assoc_rules(item_sets, min_support=1, min_confidence=0.5)
print(rules2)
Example #50
	print("---------------------------")
	print("---------------------------")
	print("Eigenvector centrality (a measure of the influence of a node in a network)")
	print(sorted(list(nx.eigenvector_centrality(G).items()),key=operator.itemgetter(1),reverse=True))
	print("---------------------------")
	print("---------------------------")
	print("Katz centrality (relative influence of a node)")
	print(sorted(list(nx.katz_centrality_numpy(G).items()),key=operator.itemgetter(1),reverse=True))
	print("---------------------------")

def sumupRelationship(relationship_mat, subject):
	# drop/sort_values with inplace=True return None, so the calls cannot be chained
	relationship_mat = relationship_mat.drop([subject]).sort_values()
	friends = []


addAllNodes(nodes)
buildAllGroupLink(groups)
#drawNetwork()
relations = buildRelationshipMat(plot=False,half=False)
sumupRelationship(relations,"Micoud")
#displayDegrees()
#clustering()



relim_input = itemmining.get_relim_input(groups_tuples)
report = itemmining.relim(relim_input, 2)


#http://www.cl.cam.ac.uk/~cm542/teaching/2011/stna-pdfs/stna-lecture11.pdf
Example #51
#!/usr/bin/env python

from pymining import itemmining, assocrules

data = (('a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'), ('a', 'f', 'g'),
        ('b', 'd', 'e', 'f', 'j'), ('a', 'b', 'd', 'i', 'k'), ('a', 'b', 'e',
                                                               'g'))

min_sup = 3
min_conf = 0.5

# get frequent itemsets using pymining
relim_input = itemmining.get_relim_input(data)
frequent_itemsets = itemmining.relim(relim_input, min_sup)

# get association rules using pymining
results = assocrules.mine_assoc_rules(frequent_itemsets, min_sup, min_conf)

for key in frequent_itemsets.keys():
    print(str(key) + " : " + str(frequent_itemsets[key]))

for key in results:
    print(str(key))
Example #52
    def frequency_item_set(self, columns = None, support = 0.1, rules = False, confidence = 0.8, engine = 'pymining'):
        """
        Use frequency item set mining to find subgroups where data goes 
        missing together.
        
        Parameters:
        ----------
        columns: list, default None
            Subset of the columns you want to use.
        
        support: float, default 0.1
            Minimum support to use while item set mining. Too small values can break memory.
            Support should be greater than zero and at most 1.

        rules: bool, default False
            Whether association rules should be mined. If True, the method returns two
            dataframes instead of one.

        confidence: float, default 0.8
            Minimum confidence for rules being mined. Should be between 0 and 1.

        engine: {'pymining'}
            Only one engine is supported right now.
        
       
        Returns:
        -------
        item_sets_df, rules_df : DataFrame, DataFrame
            Tabulated results for itemsets and association rules mined. 
            
        """ 

        from pymining import itemmining, assocrules

        if support <= 0 or support > 1:  # support should be between zero and one
            print('Support has to be between 0 and 1')
            return

        if confidence < 0 or confidence > 1:  # confidence can be zero
            print('Confidence has to be between 0 and 1')
            return
            


        mf_ = self._masked_missframe(where = None, columns = columns, how = 'any')
        
        # Converting all missing values to 1, and non-missing to nan.
        bench = pd.DataFrame(np.where(mf_, 1, np.nan), columns = mf_.columns)

        # Replacing 1's with the index of the column they belong to.
        # Converting to numbers instead of column names for supposed performance boost.
        bench = bench * list(range(0, mf_.columns.shape[0]))

        rows = bench.values
        transactions = []
        for row in rows:
            # Removing the nans in each row and compressing the rows.
            # (nan, 1, nan, 3) --> (1, 3)
            transactions.append(tuple(row[~np.isnan(row)]))

        # Converting float threshold to represent number of rows.
        support = int(support*mf_.shape[0])

        relim_input = itemmining.get_relim_input(transactions)
        item_sets = itemmining.relim(relim_input, min_support=support)
        
        # Converting to DataFrames and getting columns names back.
        item_sets_df = pd.DataFrame({'Itemset':list(item_sets.keys()), 'Support': list(item_sets.values())})
        item_sets_df.Itemset = item_sets_df.Itemset.apply(lambda x: mf_.columns[list(x)].tolist())

        
        # For now the same supports being used in FIM and Association Rules.
        rules = assocrules.mine_assoc_rules(item_sets, min_support=support, min_confidence=confidence)

        rules_df = pd.DataFrame(rules, columns = ['X =>', 'Y', 'Support', 'Confidence'])
        # Converting rules to DataFrame and getting columns names back.
        rules_df['X =>'] = rules_df['X =>'].apply(lambda x: mf_.columns[list(x)].tolist())
        rules_df['Y'] = rules_df['Y'].apply(lambda x: mf_.columns[list(x)].tolist())
        
        return item_sets_df, rules_df
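A hedged usage sketch; mf stands in for an instance of the (unshown) class that defines frequency_item_set over a DataFrame with missing values:

# hypothetical instance of the surrounding class
item_sets_df, rules_df = mf.frequency_item_set(support=0.2, rules=True, confidence=0.9)
print(item_sets_df.sort_values('Support', ascending=False).head())
print(rules_df.head())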
for key, value in arr_backup.items():
    if len(value)>1:
        # JUST in case ALL elements are same in the list/set - since a "set" does not allow for duplicates, we use this construct in here
        if len(set(value))==1:
                support = len(value)
                #converting to 1st charater in the company name back to upper case
                for i in range(0,len(value)):
                    value[i]=str(value[i][0]).upper()+str(value[i][1:])
                company.append(value)
        elif len(set(value))>1:
            list6=[]
            for m in range(0, len(value)):
                list6.append(str(value[m]).split())
            transactions=list6
            support = len(value)
            relim_input = itemmining.get_relim_input(transactions)
            report = itemmining.relim(relim_input, support)
            c = sorted(report.keys())  # keys() is a view in Python 3 and cannot be sorted in place
            m=0
            flag=0
            for m in range(0, len(value)):
                for n in range(0,len(list(c[-1]))):
                    if re.search(value[m],list(c[-1])[n]):
                        flag=1
                    else :
                        flag =0
                company.append([str(value[m][0]).upper()+value[m][1:]]*support)
                break
    elif len(value)==1:
        try:
Example #54
def find_frequent_itemsets(transactions, support):
    relim_input = itemmining.get_relim_input(transactions)
    item_sets = itemmining.relim(relim_input, min_support=support)
    return item_sets
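A quick usage sketch for this wrapper on toy transactions:

from pymining import itemmining

transactions = (('a', 'b'), ('a', 'c'), ('a', 'b', 'c'))
print(find_frequent_itemsets(transactions, support=2))
# e.g. {frozenset({'a'}): 3, frozenset({'a', 'b'}): 2, frozenset({'a', 'c'}): 2, ...}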