Example #1
def fi(data):
    print("Using apriori for fim : ")
    freq_list = fim.apriori(tracts=data, supp=5)
    print("The frequent item list is : ")
    print(freq_list)
    rules = fim.apriori(tracts=data, target='r', eval='c', report='c')
    print("The rules are : ")
    print(rules)
    rules = fim.apriori(tracts=data, target='r', eval='l', report='l')
    print("The rules are (evaluated with lift): ")
    print(rules)
    print("lfi using apriori : ")
    lfi(freq_list)

    print("Using fp-growth for fim : ")
    freq_list = fim.fpgrowth(tracts=data, supp=5)
    print("The frequent item list is : ")
    print(freq_list)
    rules = fim.fpgrowth(tracts=data,
                         target='r',
                         eval='c',
                         report='c',
                         conf=60)
    print("The rules are (evaluated with confidence): ")
    print(rules)
    rules = fim.fpgrowth(tracts=data,
                         target='r',
                         eval='l',
                         report='l',
                         conf=60)
    print("The rules are (evaluated with lift): ")
    print(rules)

    print("lfi using fpgrowth is : ")
    lfi(freq_list)
def generate_associations(transactions, min_sup, problems, methods, datasets):
    apriori_patterns = apriori(transactions, supp=-min_sup)
    print '-------- Apriori --------'
    output = []
    for (pattern, support) in sorted(apriori_patterns, key=lambda x: -x[1]):
        print pattern, support
    print 'Number of patterns:', len(apriori_patterns)

    rules = apriori(transactions, target='r', supp=-5, conf=90, report='sc')
    print '-------- One-to-Many Association Rules --------'
    counter = 0
    for (ruleleft, ruleright, support,
         confidence) in sorted(rules, key=lambda x: x[0]):
        if ruleleft in datasets:
            for rule in ruleright:
                if rule not in datasets:
                    counter += 1
                    print ruleleft, '-->', ruleright, support, confidence
        elif ruleleft in problems:
            for rule in ruleright:
                if rule not in problems:
                    counter += 1
                    print ruleleft, '-->', ruleright, support, confidence
        elif ruleleft in methods:
            for rule in ruleright:
                if rule not in methods:
                    counter += 1
                    print ruleleft, '-->', ruleright, support, confidence
        #print ruleleft,'-->',ruleright,support,confidence
    print 'Number of rules:', len(rules)
Example #3
def main():
    ###############################################################################
    # Some basic data analysis
    ###############################################################################

    data = read_file("/Users/zxj/cs535/data/marketing.data",
                     lambda x: x.split(","))
    frequent_itemset = apriori(data, supp=-3, zmin=2, target='s', report='a')
    rules = apriori(data, supp=-3, zmin=2, target='r', report='rCL')
    print("Frequent itemsets are: ")
    print(frequent_itemset)
    print("Rules are:")
    print(rules)
Example #4
    def mine_frequent_itemsets(self, pandas_df, minsup):
        txns_classless = TransactionDB.from_DataFrame(pandas_df.iloc[:, :-1])

        frequent_itemsets = fim.apriori(txns_classless.string_representation,
                                        supp=minsup * 100,
                                        report="s")

        return frequent_itemsets
Example #5
def get_onetomany_rules(transactions, output_file):
    # One-to-Many Association Rules (ResponseBot 8)
    rules = apriori(transactions, target='r', supp=-1000, conf=70, report='sc')
    #output
    for (ruleleft, ruleright, support,
         confidence) in sorted(rules, key=lambda x: x[0]):
        #p = ','.join(pattern)
        output_file.write('{} --> {} {} {}\n'.format(
            ruleleft, ruleright, support, confidence))
def run_fim_apriori(df, minsup):
    print("running fim apriori function")
    processed_df = process_dataset(df)
    print("dataset processed")
    result_raw = fim.apriori(processed_df, supp=(minsup * 100))
    print("apriori run")
    result = list(map(lambda i: list(i[0]), result_raw))
    print("apriori results processed")
    return result
def Apriori_main(data_fname, minSupport, out_fname='Apriori_results.txt'):
    lines, tid = readDataset(data_fname)
    t1 = clock()
    temp_freq = apriori(tid,
                        target='s',
                        supp=float(minSupport * 100),
                        conf=100)
    CPU_time = clock() - t1
    freq_items = convert2dic(temp_freq, lines)
    printResults(data_fname, minSupport, CPU_time, freq_items, out_fname)
    return (freq_items, CPU_time)
Example #8
def do_apriori(transactions, output_file):
    print("Performing Apriori...")
    # http://www.borgelt.net/pyfim.html
    patterns = apriori(transactions,
                       supp=-1000)  # +: percentage -: absolute number
    for (pattern, support) in sorted(patterns, key=lambda x: -x[1]):
        #pattern is a tuple
        if len(set(pattern)) <= 1: continue
        p = ','.join(pattern)
        output_file.write('{} {} \n'.format(p, str(support)))
    print('Number of patterns: {}'.format(len(patterns)))
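The inline comment above states pyfim's sign convention for supp: a positive value is a minimum support in percent, a negative value an absolute number of transactions. A minimal sketch of both forms on a toy transaction list (assuming only that the fim package is installed):

# Sketch: illustrates the supp sign convention described in do_apriori.
import fim

toy_tracts = [[1, 2, 3], [1, 2], [2, 3], [1, 3], [1, 2, 3]]
# positive supp: itemsets contained in at least 40% of the transactions
print(fim.apriori(toy_tracts, supp=40, zmin=2))
# negative supp: itemsets contained in at least 3 transactions
print(fim.apriori(toy_tracts, supp=-3, zmin=2))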
Example #9
def run_apriori(transactions, min_sup, author_number):
    apriori_patterns = apriori(transactions, supp=-min_sup)
    print '-------- Apriori --------'
    output = []
    for (pattern, support) in sorted(apriori_patterns, key=lambda x: -x[1]):
        if len(pattern) < author_number:
            continue
        print pattern, support
        output.append([pattern, support])
    print 'Number of patterns:', len(apriori_patterns)
    return output
Example #10
def find_by_year(years):
	print 'For year %s:' %years
	t_years = get_transactions('%s.csv' %years)
	report_years = fim.apriori(t_years,supp=0,conf=0,zmax=4)
	result_years = {}
	for i in range(1,5):
		result_years[i] = {}
	for words,counts in report_years:
		result_years[len(words)][words] = counts[0]
	for i in result_years:
		print 'Top 20 most frequent patterns with length %d' %i
		print_top_n(result_years[i],20)	
Example #11
def _generate_rules_for_user(recs, settings):
    tracts = []
    for rec in recs:
        if rec.selected and rec.forecast:
            tracts.append(_process_recommendations(rec, settings))

    metadata = {'total_trips': len(tracts)}

    if not tracts:
        return None, metadata

    return [listify(rule) for rule in apriori(tracts, supp=-1)], metadata
Example #12
def pass_one(inputfile, support):
    result = []
    p = 0.05 # 0.6
    baskets = generate_baskets(inputfile)
    sample_baskets = sampling(baskets, p)
    result.append(len(sample_baskets)*1./len(baskets))
    adjusted_support = 0.9 * p * support # use lower threshold
    frequent_items = [x[0] for x in fim.apriori(sample_baskets, supp=adjusted_support)]
    result.append(frequent_items)
    print frequent_items
    negative_border_items = generate_negative_border(frequent_items, sample_baskets)
    print negative_border_items
    result.append(negative_border_items)
    return result
Example #13
 def mine(self, df, supp, zmin, zmax, is_count=False):
     """return association rules from netflow df
     
     Arguments:
         df {dataframe} -- netflow dataframe
         supp {number} -- minimum support (percentage, or absolute count when is_count is True)
         zmin {number} -- minimum number of items per itemset
         zmax {number} -- maximum number of items per itemset
     
     Keyword Arguments:
         is_count {bool} -- true if minsup is absolute number, else percentage (default: {False})
     """
     transacts = self.netflow_to_transc(df)
     supp = -supp if is_count else supp  # (positive: percentage, negative: absolute number)
     return apriori(transacts, target='m', supp=supp, zmin=zmin, zmax=zmax)
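The docstring above notes that is_count switches supp between a percentage and an absolute count; the method simply negates supp before calling apriori with target='m' (maximal itemsets). A hedged sketch of the equivalent direct calls, with a hypothetical transaction list standing in for self.netflow_to_transc(df):

# Sketch: the direct fim.apriori calls that mine() wraps (toy data, assumed values).
import fim

transacts = [["tcp", "80"], ["tcp", "443"], ["udp", "53"], ["tcp", "80"]]
# is_count=False: supp stays a percentage
maximal_pct = fim.apriori(transacts, target='m', supp=50, zmin=1, zmax=2)
# is_count=True: supp is negated, i.e. treated as an absolute transaction count
maximal_abs = fim.apriori(transacts, target='m', supp=-2, zmin=1, zmax=2)
print(maximal_pct, maximal_abs)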
def run_fim_apriori(df: pd.DataFrame, min_support_thr: float) -> List[Transaction]:
    try:
        import fim
    except Exception as e:
        raise e

    print("running fim apriori function")
    dataset_transactions: List[Transaction] = dataframe_to_list_of_transactions(df)
    print("dataset processed")

    frequent_itemsets_raw = fim.apriori(dataset_transactions, supp=(min_support_thr * 100))
    print("apriori run")

    frequent_itemsets: List[Transaction] = list(map(lambda i: list(i[0]), frequent_itemsets_raw))
    print("apriori results processed")
    return frequent_itemsets
Example #15
def pass_one(transactions, support, epsilon, delta, mu):
    result = []
    n = calculate_bound(epsilon, delta)
    adjusted_support = support - lower_by(n, mu)

    sampled_transactions = random.sample(transactions, n) 

    # TODO why?
    result.append(len(sampled_transactions)*1./len(transactions))

    frequent_items =\
        [x[0] for x in\
                apriori(sampled_transactions, supp=adjusted_support*100)]

    result.append(frequent_items)

    negative_border_items = generate_negative_border(frequent_items, sampled_transactions)
    result.append(negative_border_items)

    return result
def generateCARs(transactionDB: TransactionDB,
                 support: float = 1,
                 confidence: float = 50,
                 maxlen: int = 10,
                 **kwargs):
    """Function for generating ClassAssociationRules from a TransactionDB

    Parameters
    ----------
    transactionDB : TransactionDB

    support : float
        minimum support in percent if positive,
        absolute minimum support if negative

    confidence : float
        minimum confidence in percent if positive,
        absolute minimum confidence if negative

    maxlen : int
        maximum length of mined rules

    **kwargs :
        arbitrary number of arguments that will be
        provided to the fim.apriori function

    Returns
    -------
    list of CARs

    """
    appear = transactionDB.appeardict

    rules = fim.apriori(transactionDB.string_representation,
                        supp=support,
                        conf=confidence,
                        mode="o",
                        target="r",
                        report="sc",
                        appear=appear,
                        **kwargs,
                        zmax=maxlen)

    return createCARs(rules)
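The docstring spells out how support, confidence and maxlen map onto the fim.apriori call. A hypothetical usage sketch, assuming the pyarc-style TransactionDB from Example #4 and the createCARs helper are importable from the surrounding project, with the class label in the last DataFrame column:

# Hypothetical usage; TransactionDB and createCARs come from the surrounding project.
import pandas as pd

df = pd.DataFrame({
    "outlook": ["sunny", "sunny", "rainy", "rainy"],
    "windy":   ["no",    "yes",   "yes",   "no"],
    "play":    ["yes",   "no",    "no",    "yes"],  # class column last
})
txns = TransactionDB.from_DataFrame(df)
cars = generateCARs(txns, support=25, confidence=50, maxlen=3)
for car in cars:
    print(car)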
Example #17
from sys import argv
from fim import fim, apriori, eclat, fpgrowth

tid = int(argv[1])
if tid < -2:
    print(fpgrowth.__doc__)
elif tid < -1:
    print(eclat.__doc__)
elif tid < 0:
    print(apriori.__doc__)
else:
    tracts = [[1, 2, 3], [1, 4, 5], [2, 3, 4], [1, 2, 3, 4], [2, 3], [1, 2, 4],
              [4, 5], [1, 2, 3, 4], [3, 4, 5], [1, 2, 3]]
    print('transactions:')
    for t in tracts:
        print(t)
    if tid < 1:
        print('apriori(tracts, supp=-3, zmin=2):')
        for r in apriori(tracts, supp=-3, zmin=2):
            print(r)
    elif tid < 2:
        print('eclat(tracts, supp=-3, zmin=2):')
        for r in eclat(tracts, supp=-3, zmin=2):
            print(r)
    elif tid < 3:
        print('fpgrowth(tracts, supp=-3, zmin=2):')
        for r in fpgrowth(tracts, supp=-3, zmin=2):
            print(r)
    else:
        print('fim(tracts, supp=-3, zmin=2, report=\'#\'):')
        for r in fim(tracts, supp=-3, zmin=2, report='#'):
            print(r)
Example #18
#SUPPORT = -3
#ZMIN = 2
SUPPORT = -0.01 * len(data)
ZMIN = 1
#ZMAX = 5
CONF = 80
LIFT = 1.03
ITEMSET_REPORT = 'a'
#RULES_REPORT='rCL'
RULES_REPORT = 'C'

#frequent_itemset = apriori(data, supp=SUPPORT, zmin=ZMIN, conf=CONF, eval='l', thresh=LIFT, target='s', report=ITEMSET_REPORT)
frequent_itemset = apriori(data,
                           supp=SUPPORT,
                           zmin=ZMIN,
                           target='s',
                           report=ITEMSET_REPORT)

#rules = apriori(data, supp=SUPPORT, zmin=ZMIN, conf=CONF, eval='l', thresh=LIFT, target='r', report=RULES_REPORT)
rules = apriori(data, supp=SUPPORT, zmin=ZMIN, target='r', report=RULES_REPORT)

#for item in data:
#	print(item)
print('========================')
#for itemset in frequent_itemset: print(itemset)
for i in range(5):
    print(random.choice(frequent_itemset))
print('------------------------')
#for rule in rules: print(rule)
for i in range(5):
    print(random.choice(rules))
Example #19
        transactions.append([x for x in line.rstrip('\n').split(',') if filter_fun_str(x) ])

trcount = len(transactions)
print(trcount)
transactions = filter(lambda x: len(x) >= 1, transactions)
print(len(transactions))

def ele_to_str(ele):
    global db
    return util.ele_to_str(db, ele)

sets = map(set, transactions)
print('running algorithm', file=sys.stderr)
before = time.time()
if algo == "apriori":
    s = fim.apriori(transactions, supp=2)
    s = sorted(s, key=lambda x:x[1])
    for items,supp in s:
        items = map(ele_to_str, items)
        print(u"{} items: {}".format(supp/float(trcount), "|".join(items)).encode('utf-8'))
if algo == "eclat":
    s = fim.eclat(transactions, supp=2)
    s = sorted(s, key=lambda x:x[1])
    for items,supp in s:
        items = map(ele_to_str, items)
        print(u"{} items: {}".format(supp, "|".join(items)).encode('utf-8'))
elif algo == "eclat-rules":
    rules = fim.eclat(transactions, target='r', report='aC')
    rules = sorted(rules, key = lambda x: x[3])
    for consequence, condition, support_count, confidence_percent in rules:
        condition = map(ele_to_str, condition)
Example #20
__author__ = 'ssatpati'

import re
from fim import apriori

baskets = []
with open('ProductPurchaseData.txt', 'r') as f:
    for line in f:
        items = re.split(r'\s', line)
        items.sort()
        baskets.append(items)

for r in apriori(baskets, target='r', zmax=2, supp= -100, report='c', eval='c', conf=90):
    if r[0]:
        print '%s\t%s\t%s' % (r[0], r[1], r[2])
Example #21
import fim
import sys

from data import load_transactions

if __name__ == '__main__':
    filename = sys.argv[1]
    threshold = int(sys.argv[2])

    transactions = load_transactions(filename)
    fis = fim.apriori(transactions)

    print fis
__author__ = 'Daniel Bernardes, Mamadou Diaby, Raphael Fournier, Francoise Fogelman Soulie and Emmanuel Viennet'
__version__ = '1.0'

import sys, fim

if __name__ == "__main__":

    if len(sys.argv) > 1:
        threshold = 100 * float(sys.argv[1])
    else:
        threshold = 1  # default: 1%

    fin = sys.stdin
    profiles = []
    for line in fin:
        tokens = line.split()
        profiles.append(map(int, tokens[1:]))
    fin.close()

    confidence = fim.apriori(profiles,
                             max=2,
                             supp=threshold,
                             report='e',
                             eval='c',
                             thresh=threshold)

    for triplet in confidence:
        print '{0:d} {1:d} {2:.5f}'.format(triplet[0][0], triplet[0][1],
                                           triplet[1][0])
Example #23
# edges
pos={}
pos.update(pos_a)
pos.update(pos_b)
nx.draw_networkx_edges(g,pos,edgelist=nx.edges(g),width=1,alpha=0.8,edge_color='g')
nx.draw_networkx_labels(g,pos,font_size=10,font_family='sans-serif')

plt.title('Graph representation')
plt.show()


###############################################################################
# Now do rule finding
###############################################################################

frequent_itemset = apriori(data, supp=-3, zmin=2, target='s', report='a')
rules = apriori(data, supp=-3, zmin=2, target='r', report='rCL')

print(frequent_itemset)
print(rules)



frequent_itemset_1=[]
frequent_itemset_2=[]
frequent_itemset_3=[]
'''
original apriori, generate 1 item
''' 
def apriori(data, zmin, target, report, supp=3):
    frequent_itemset=[]
Example #24
                transaction.add(words[i] + '_' + words[i + 1])
                i += 2
                continue
            #if words[i] in stopwords or len(words[i]) == 1 or words[i].isdigit() or not words[i].isalnum() or len(words[i]) > 25:
            #   i += 1
            #   continue
            if not check_string_guality(words[i]):
                i += 1
                continue
            transaction.add(words[i])
            i += 1
        if not transaction: continue
        author_transactions.append(list(transaction))
    # output
    #for transaction in author_transactions:
    #   print transaction

    print "Done with author transactions"

    #patterns = apriori(author_lists, supp=-12)
    patterns = apriori(author_transactions, supp=-10)
    print '-------- Author Affiliation Apriori --------'
    #for (pattern,support) in sorted(patterns,key=lambda x:-x[1]):
    #   if len(pattern) <= 1: continue
    #  print pattern,support
    for (pattern, support) in sorted(patterns, key=lambda x: -x[1]):
        #pattern is a tuple
        if len(pattern) <= 1: continue
        output_file.write('{} {} \n'.format(pattern, str(support)))
    print 'Number of patterns:', len(patterns)
plt.subplot(121)
nx.draw_networkx_nodes(g,pos_a,nodelist=a,node_color='r',node_size=300,alpha=0.8)
nx.draw_networkx_nodes(g,pos_b,nodelist=b,node_color='b',node_size=300,alpha=0.8)

# edges
pos={}
pos.update(pos_a)
pos.update(pos_b)
nx.draw_networkx_edges(g,pos,edgelist=nx.edges(g),width=1,alpha=0.8,edge_color='g')
nx.draw_networkx_labels(g,pos,font_size=10,font_family='sans-serif')

plt.title('Graph representation')
plt.show()

'''
###############################################################################
# Now do rule finding
###############################################################################

#frequent_itemset = apriori(data, supp=10, zmin=2, target='s', report='a')
rules = apriori(data, supp=10, zmin=2, zmax=5, target='r', report='SCl')

#print(frequent_itemset)
#print(rules)

###############################################################################
#sort the result
r = sorted(rules, reverse=True, key=lambda x: x[4])
print(r)
print(len(r))
Example #26
    def build(self):
        """
        Takes variables from constructor and outputs
        anomaly scores for each row/observation as a pandas data frame
        """

        # create variables which hold number of rows and columns
        rows = len(self.data.index)
        cols = len(self.data.columns)

        # default value of mlen parameter is equal to number of columns
        if self.mlen == 0.5:
            self.mlen = cols

        # adding column name to each row
        data2 = pd.DataFrame({col: str(col) + '='
                              for col in self.data},
                             index=self.data.index) + self.data.astype(str)

        # transforming dataset to list of lists
        records = []
        for i in range(0, rows):
            records.append([str(data2.values[i, j]) for j in range(0, cols)])

        # creating transaction dataset
        print("Creating transactions from a dataset")
        t = time.process_time()
        te = TransactionEncoder()
        oht_ary = te.fit(records).transform(records, sparse=True)
        elapsed_time = time.process_time() - t
        print("Transactions created in: " + str(elapsed_time))
        # creating sparse data frame from transaction encoder
        sparse_df = pd.SparseDataFrame(oht_ary,
                                       columns=te.columns_,
                                       default_fill_value=False)

        # using apriori to find frequent itemsets
        supp = self.support / 100
        print("Running apriori with settings: support={}, maxlen={}".format(
            supp, self.mlen))
        t = time.process_time()
        apr = fim.apriori(records,
                          target="s",
                          supp=self.support,
                          zmax=self.mlen,
                          report="s")
        elapsed_time = time.process_time() - t
        print("Apriori finished in: " + str(elapsed_time))

        # adding new column length of the rule
        frequent_itemsets = pd.DataFrame(apr)
        frequent_itemsets['length'] = frequent_itemsets[0].apply(
            lambda x: len(x))
        print(frequent_itemsets.index)
        # creating a numpy array of lengths and qualities so operation such as multiplication can be done
        fiLengths = np.array([frequent_itemsets['length']], np.int8)
        fiQualities = np.array([frequent_itemsets[1]], np.float16)

        # converting itemsets to frozensets so subsetting can be done
        print("Converting to datasets frozensets and computing coverages")
        t = time.process_time()
        items_list = []
        fi = frequent_itemsets[0]
        for i in fi:
            items_frozen = frozenset(i)
            items_list.append(items_frozen)

        # converting transactions to frozensets
        transactions = []
        for i in records:
            i = frozenset(i)
            transactions.append(i)

        # list that will temporarily store coverages
        tmp = []
        print("Computing coverages")
        # comparing each transaction with itemsets
        for i in items_list:
            for i2 in transactions:
                if i.issubset(i2):
                    tmp.append(1)
                else:
                    tmp.append(0)

        # converting coverages to numpy array
        coverages = np.array([tmp])
        elapsed_time = time.process_time() - t
        print("Computing coverages finished in: " + str(elapsed_time))
        # converting coverages to valid shape and creating transpose matrix
        fiCoverages = coverages.reshape(len(frequent_itemsets), rows)
        fiCoveragesT = np.array(np.transpose(fiCoverages))
        fiQualitiesT = np.transpose(fiQualities)

        # compute basic score for each coverage
        t = time.process_time()
        print("Computing results for each coverage")
        result = np.array(1 / (fiLengths * np.transpose(fiQualities)),
                          dtype=np.float16)
        print(result)
        elapsed_time = time.process_time() - t
        print("Computing results finished in: " + str(elapsed_time))
        # create matrix with results on diagonal
        result2 = np.diagonal(result)
        shape = (len(frequent_itemsets), len(frequent_itemsets))

        # it was necessary to create matrix with zeros to have matrix with particular shape with values only on the diagonal
        diagonalHelper = np.zeros(shape)
        np.fill_diagonal(diagonalHelper, result2)

        # matrix multiplication
        print("Computing individual scores")
        scores = np.array(np.matmul(fiCoveragesT, diagonalHelper))
        print("Done")
        # prepare  items for subsetting
        data_items = sparse_df.columns.values.tolist()

        dataItems = pd.DataFrame(data_items)

        # coverage of each data item
        dataItemsList = []

        # converting to frozenset so subsetting can be done
        for i in range(0, len(dataItems.values)):
            dataItemsList.append(
                frozenset([str(dataItems.values[i, j]) for j in range(0, 1)]))

        dataItemsCoverage = []

        # subsetting columns with items
        for i in dataItemsList:
            for i2 in items_list:
                if i2.issubset(i):
                    dataItemsCoverage.append(1)
                else:
                    dataItemsCoverage.append(0)

        # converting coverages to numpy array
        dataItemsCoverageArr = np.array([dataItemsCoverage])

        tmp4 = dataItemsCoverageArr.reshape(len(dataItems.values),
                                            len(frequent_itemsets))

        # variable that stores sum of columns
        print("Computing penalizations")
        t = time.process_time()
        colSums = np.array(self.data.count(axis=1))

        # variable that stores sum of rows
        rowSums = np.array([fiCoveragesT.sum(axis=1)])

        # preparing parts of the equation
        part1 = np.matmul(fiCoveragesT, np.transpose(tmp4))

        part2 = part1.sum(axis=1)

        # compute how many items of each transaction is not covered by appropriate frequent itemsets
        fiC = colSums - part2
        elapsed_time = time.process_time() - t
        print("Computing penalizations finished in: " + str(elapsed_time))

        # compute final score as a mean value of scores and penalizations: (sum of scores + penalization*number of transactions)/(number of scores + penalization)
        print("Computing scores for each row")
        t = time.process_time()
        scorings = (scores.sum(axis=1) + fiC * rows) / (rowSums + fiC)
        elapsed_time = time.process_time() - t
        print("Computing final scores finished in: " + str(elapsed_time))
        # creating pandas data frame with Scores column
        columnOutput = ["Scores"]
        output = pd.DataFrame(data=np.transpose(scorings),
                              index=data2.values,
                              columns=columnOutput,
                              dtype=object)

        # print anomaly scores for each row/observation
        print(output)

        # returns maximum value of anomaly scores
        print(output[output['Scores'] == output['Scores'].max()])

        print(fiC)
        return output
hotel = json.load(jsonHotel)
hotelAddress = hotel.get('HotelInfo').get('Address').encode('ascii', 'ignore').lower()
hotelAddress = re.split(r'<.[^>]+>([^<]*)<.[^>]+>', hotelAddress)
hotelName = hotel.get('HotelInfo').get('Name').encode('ascii','ignore').lower()
nameTokens = tokenizer.tokenize(hotelName)
stopset = set().union(stopset, nameTokens)
stopset = set().union(stopset, hotelAddress)
#tokens = word_tokenize(hotel.get('Reviews')[0].get('Content').encode('ascii', 'ignore'))

#print removeStopWords(tokens)

test = [('l1', 'l2', 'l5'), ('l2', 'l4'), ('l2', 'l3'), ('l1', 'l2', 'l4'), ('l1', 'l3'), ('l2', 'l3'), ('l1', 'l3'), ('l1', 'l2', 'l3', 'l5'),('l1', 'l2', 'l3')]
#trans = []
#
#for review in hotel.get('Reviews'):
#    content = review.get('Content').encode('ascii', 'ignore')
#    allTokens = tokenizer.tokenize(content.lower())
#    strippedTokens = removeStopWords(allTokens)
#    trans += [(strippedTokens)]
    
setmax = 2
support = 35
confidence = 50
#feats = apriori(trans, zmax= setmax,supp=support, conf = confidence, target='r', report = 'CS')
testFeats = apriori(test, zmin=2, zmax=setmax, supp=support, conf = confidence, target = 'r', report = 'Cabh')

#file = open('Hotel'+fileNum+'_z'+str(setmax)+'_s'+str(support)+'_c'+str(confidence)+'.txt', 'w')
#file.write(str(feats))
#file.close()
print testFeats
Example #28
        if words[i] in stopwordset or len(words[i]) == 1 or words[i].isdigit():
            i += 1
            continue
        transaction.add(words[i])
        i += 1
    transactions.append(list(transaction))
# output
for transaction in transactions:
    print transaction

# In[5]:

# http://www.borgelt.net/pyfim.html
from fim import apriori, fpgrowth

patterns = apriori(transactions, supp=-3)  # +: percentage -: absolute number
# output
print '-------- Apriori --------'
for (pattern, support) in sorted(patterns, key=lambda x: -x[1]):
    print pattern, support
print 'Number of patterns:', len(patterns)

# In[6]:

patterns = fpgrowth(transactions, supp=-3)
# output
print '-------- FP-Growth --------'
for (pattern, support) in sorted(patterns, key=lambda x: -x[1]):
    print pattern, support
print 'Number of patterns:', len(patterns)
Example #29
trcount = len(transactions)
print(trcount)
transactions = filter(lambda x: len(x) >= 1, transactions)
print(len(transactions))


def ele_to_str(ele):
    global db
    return util.ele_to_str(db, ele)


sets = map(set, transactions)
print('running algorithm', file=sys.stderr)
before = time.time()
if algo == "apriori":
    s = fim.apriori(transactions, supp=2)
    s = sorted(s, key=lambda x: x[1])
    for items, supp in s:
        items = map(ele_to_str, items)
        print(u"{} items: {}".format(supp / float(trcount),
                                     "|".join(items)).encode('utf-8'))
if algo == "eclat":
    s = fim.eclat(transactions, supp=2)
    s = sorted(s, key=lambda x: x[1])
    for items, supp in s:
        items = map(ele_to_str, items)
        print(u"{} items: {}".format(supp, "|".join(items)).encode('utf-8'))
elif algo == "eclat-rules":
    rules = fim.eclat(transactions, target='r', report='aC')
    rules = sorted(rules, key=lambda x: x[3])
    for consequence, condition, support_count, confidence_percent in rules:
Example #30
File: FIM.py  Project: designer357/Small
__author__ = 'chengmin'
import fim,os
#with open(os.getcwd()+'/APRIORIREADME.TXT',"w")as fout:
    #fout.write(fim.apriori.__doc__)
    #fout.write(str(help(fim.apriori)))
    #fout.write(fim.apriori)
tracts = [line.split() for line in open("FIMtest")]
print(fim.apriori(tracts))
Example #31
    df[i] = i + " " + df[i].astype(str)
df["credit_default"] = "default " + df["credit_default"].astype(str)

baskets = df.values.tolist()
'''
itemsets = apriori(baskets, supp=10, zmin=2, target='m')
print (len(itemsets))

foo = open("itemsets", "w")

for itemset in itemsets:
    foo.write(""+str(itemset))
    foo.write("\n\n")
'''

rules = apriori(baskets, supp=10, zmin=1, target='r', conf=40, report='ascl')

f = open("rules", "w")
count = 0
lista = list()
for rule in rules:
    if rule[5] > 2 and "female" in rule[1]:
        count += 1
        f.write("" + str(rule))
        f.write("\n\n")
        lista.append(rule[0])

print(count)
print(set(lista))
'''
#FREQUENT ITEMSET
Example #32
    def mbasket(self,
                data_p,
                support_par,
                confidence_par,
                method='apriori',
                lift_par=1.2):
        """
        :param data_p: tuple of (binary user-item pivot table, list of transactions)
        :param support_par: minimum support as a fraction
        :param confidence_par: minimum confidence as a fraction
        :param method: mining algorithm: 'apriori' (default), 'fpgrowth', 'eclat', 'relim', 'sam' or 'ista'
        :param lift_par: minimum lift used when filtering rules
        :return: [rule_cons, recom_new, mba_time, sum_recom_already_satisfied, run_time]
        """
        start0 = time()
        ## Apriori analysis + association rules creation
        # find association rules with default settings
        rules = pd.DataFrame()
        if method == 'fpgrowth':
            start = time()
            frequent_itemsets = pd.DataFrame(fpgrowth(data_p[1],
                                                      supp=support_par * 100,
                                                      zmin=1,
                                                      target='s',
                                                      report='s',
                                                      mode='o'),
                                             columns=['itemsets', 'support'])
            run_time = round(time() - start, 3)
            print("fpgrowth() -", run_time, "s")

        elif method == 'eclat':
            start = time()
            frequent_itemsets = pd.DataFrame(eclat(data_p[1],
                                                   supp=support_par * 100,
                                                   zmin=1,
                                                   target='s',
                                                   report='s',
                                                   mode='o'),
                                             columns=['itemsets', 'support'])
            run_time = round(time() - start, 3)
            print("eclat() -", run_time, "s")

        elif method == 'relim':
            start = time()
            frequent_itemsets = pd.DataFrame(relim(data_p[1],
                                                   supp=support_par * 100,
                                                   zmin=1,
                                                   target='s',
                                                   report='s'),
                                             columns=['itemsets', 'support'])
            run_time = round(time() - start, 3)
            print("relim() -", run_time, "s")

        elif method == 'sam':
            start = time()
            frequent_itemsets = pd.DataFrame(sam(data_p[1],
                                                 supp=support_par * 100,
                                                 zmin=1,
                                                 target='s',
                                                 report='s'),
                                             columns=['itemsets', 'support'])
            run_time = round(time() - start, 3)
            print("sam() -", run_time, "s")

        elif method == 'ista':
            start = time()
            frequent_itemsets = pd.DataFrame(ista(data_p[1],
                                                  supp=support_par * 100,
                                                  zmin=1,
                                                  report='s'),
                                             columns=['itemsets', 'support'])
            run_time = round(time() - start, 3)
            print("ista() -", run_time, "s")

        else:
            start = time()
            frequent_itemsets = pd.DataFrame(apriori(data_p[1],
                                                     supp=support_par * 100,
                                                     zmin=1,
                                                     target='s',
                                                     report='s',
                                                     mode='o'),
                                             columns=['itemsets', 'support'])
            run_time = round(time() - start, 3)
            rules = self.find_rules(frequent_itemsets, lift_par,
                                    confidence_par)
            print("apriori() -", run_time, "s")

        # users with antedecents from the rules calculated above
        if rules.shape[0] > 0:
            pivot_binary_tr = data_p[0].transpose()
            recom = {}
            pb = {}
            rules['antecedents'] = rules['antecedents'].apply(
                lambda x: frozenset(x))
            for user in pivot_binary_tr.columns:
                products_bought = pivot_binary_tr.index[pivot_binary_tr[user]
                                                        == 1]
                pb[user] = products_bought
                suitable_rules = []
                for ante in rules['antecedents'].iteritems():
                    if ante[1].issubset(products_bought):  # to be improved
                        suitable_rules.append(ante[0])
                recom[user] = suitable_rules

            recom = pd.DataFrame.from_dict(recom,
                                           orient='index').stack().reset_index(
                                               level=1,
                                               drop=True).reset_index()
            recom.columns = ['review_profilename', 'Rule']

            # products bought - to exclude these products from the recommendations
            pb2 = pd.DataFrame.from_dict(pb,
                                         orient='index').stack().reset_index(
                                             level=1, drop=True).reset_index()
            pb2.columns = ['review_profilename', 'antecedents1']

            rule_cons = rules[['antecedents', 'consequents']].reset_index()
            rule_cons['consequents'] = [
                i for i, *_ in rule_cons['consequents']
            ]  # change format from frozensets to normal
            rule_cons['antecedents'] = [
                list(i) for i in rule_cons['antecedents']
            ]
            rule_cons.columns = ['Rule', 'antecedents', 'consequents']
            recom = recom.merge(rule_cons, on='Rule')
            recom.drop_duplicates(['review_profilename', 'consequents'],
                                  keep='first',
                                  inplace=True)

            # exclude from recommendations products already bought
            recom_already_satisfied = pb2.merge(
                recom,
                left_on=['review_profilename', 'antecedents1'],
                right_on=['review_profilename', 'consequents'])
            recom_already_satisfied['beer_already_known'] = 1
            sum_recom_already_satisfied = recom_already_satisfied[
                'beer_already_known'].sum()

            recom_new = recom.merge(
                recom_already_satisfied[[
                    'review_profilename', 'Rule', 'consequents',
                    'beer_already_known'
                ]],
                on=['review_profilename', 'Rule', 'consequents'],
                how='left')
            recom_new = recom_new[recom_new['beer_already_known'] != 1][[
                'review_profilename', 'Rule', 'antecedents', 'consequents'
            ]]
        else:
            rule_cons = 0
            recom_new = 0
            sum_recom_already_satisfied = 0

        mba_time = round(time() - start0, 2)
        print("mbasket() -", mba_time, "s")

        return [
            rule_cons, recom_new, mba_time, sum_recom_already_satisfied,
            run_time
        ]
Example #33
    item_counts = {}
    frequencies = []
    rules = []

    # read input file by line and split to
    # store each line as list of items
    # fim apriori expects this data structure as input
    baskets = [
        line.split()
        for line in open('ProductPurchaseData.txt').read().strip().split('\n')
    ]

    # target = 's'       -> frequent item sets
    # supp   = negative  -> minimum support of an item set
    # zmax   = number    -> maximum number of items per item set
    item_sets = fim.apriori(baskets, target='s', supp=-100, zmax=2)

    for r in item_sets:
        # apriori reports in the format ((itemset), support)
        item_set, item_count = r
        # k = 1
        if len(item_set) == 1:
            item_counts[item_set[0]] = item_count
        # k = 2
        elif len(item_set) == 2:
            item1, item2 = item_set
            # lexicographical ordering of the rules
            # report the rule a->b but not the rule b->a
            if item1 < item2:
                frequencies.append(((item1, item2), float(item_count)))
Example #34
import sqlite3
from fim import apriori, fpgrowth

def extract_data(db):
    command = 'SELECT keyword, paper_id from Keywords'
    output = db.execute(command)
    transactions = {}
    for t in output:
        if t[0] not in transactions:
            transactions[t[1]] = [t[0]]
        else:
            transactions[t[1]].append(t[0])

    list_tran = []
    for t in transactions:
        list_tran.append(transactions[t])

    return list_tran

if __name__ == '__main__':
    conn = sqlite3.connect('data/database.db')
    c = conn.cursor()
    transactions = extract_data(c)
    print 'loaded data'
    apriori_patterns = apriori(transactions, supp=-7)
    print '-------- Apriori --------'
    output = []
    for (pattern,support) in sorted(apriori_patterns,key=lambda x:-x[1]):
        if len(pattern) > 1:
            print pattern,support
    print 'Number of patterns:',len(apriori_patterns)
def BBMax_Accuracy_main(fname1, fname2, fname3, sup, m_time):
    global tid
    global lines
    change_raw_data = 0
    
    lines,tid = readDataset(fname3)
    abs_supp = ceil(sup*lines-0.5)

    F = readLargeData(fname1)
    
    S = minSet(readSensitiveSet(fname2))
    SS = supersets(S, F.keys())
    
    Rev_Fd = list(set(F) - SS)
    start_time = clock()   
    
    Rev_pos_bord = convert2frozen_m(apriori(Rev_Fd, target = 'm', supp = float(0.0), conf=100))
    
    sens_ind = []
    for i in xrange(lines):
        flag = True
        for itemset in S:
            if itemset.issubset(tid[i]):
                sens_ind.append(i)
                flag = False
                break

        if flag:
            for itemset in Rev_pos_bord:
                if itemset.issubset(tid[i]):
                    sens_ind.append(i)
                    break

    
    sens_ind = list(set(sens_ind))
    N = len(sens_ind)
    
    cpx = cplex.Cplex()
    cpx.set_results_stream(None)
    cpx.objective.set_sense(cpx.objective.sense.minimize)
    cpx.variables.add(obj = (1,)*N + (lines,)*len(Rev_pos_bord), lb =(0,)*(N+len(Rev_pos_bord)),
                      ub=(1,)*N+(cplex.infinity,)*len(Rev_pos_bord),
                      types=(cpx.variables.type.integer,)*(N+len(Rev_pos_bord)))

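    # For each sensitive itemset, enough of its supporting (sanitizable) transactions must be
    # selected so that its support drops below the threshold abs_supp
    # (hence the right-hand side cur_supp - abs_supp + 1).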
    for itemset in S:
        ind = []
        cur_supp = 0
        for i in xrange(N):
            if itemset.issubset(tid[sens_ind[i]]):
                ind.append(i)
                cur_supp += 1
##        print(ind)
##        print(itemset)
##        print("GreaterEq than ",cur_supp - abs_supp + 1)
        cpx.linear_constraints.add(lin_expr = [SparsePair(ind = ind, val=(1,)*len(ind))],
            senses=["G"], rhs=[cur_supp - abs_supp + 1])


    rpb_c = 0       
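    # For each itemset on the revised positive border, roughly: try to keep it frequent. The
    # number of selected transactions supporting it may exceed cur_supp - abs_supp only via the
    # slack variable, which is penalized with weight `lines` in the objective.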
    for itemset in Rev_pos_bord:
        ind = []
        cur_supp = 0
        for i in range(N):
            if itemset.issubset(tid[sens_ind[i]]):
                ind.append(i)
                cur_supp += 1

        
        ind.append(N+rpb_c)
        rpb_c += 1
##        print(ind)
##        print(itemset)
##        print("LessEq than ",cur_supp - abs_supp)
        cpx.linear_constraints.add(lin_expr = [SparsePair(ind = ind, val=(1,)*(len(ind)-1)+(-1,))],
            senses=["L"], rhs=[cur_supp - abs_supp])

    cpx.parameters.mip.pool.relgap.set(0)
##    cpx.parameters.preprocessing.presolve.set(cpx.parameters.preprocessing.presolve.values.off)
##    cpx.populate_solution_pool()
    cpx.solve()
    if any([i for i in map(int, cpx.solution.get_values())[lines:(lines+len(Rev_pos_bord))]]):
        print("System would be infeasible!!")

    
    print("Number of solutions: ", cpx.solution.pool.get_num())
    
##    print(map(int, cpx.solution.get_values()))
##    print("Objective: ", cpx.solution.get_objective_value())
    for i in get_indices(map(int, cpx.solution.get_values())[0:N], 1):
        
        temp_set = set()
        for itemset in S:
            if itemset.issubset(tid[sens_ind[i]]):
                temp_set.add(itemset)

        while len(temp_set) > 0:
            item_dic = {}
            for itemset in temp_set:
                for item in itemset:
                    if item not in item_dic:
                        item_dic[item] = 0

                    item_dic[item] += 1
            max_val = 0
            for item, freq in item_dic.items():
                if max_val < freq:
                    max_val = freq
                    element = frozenset([item])

            if item_dic.values().count(max_val) > 1:
                candidates = [frozenset([item])  for item, freq in item_dic.items() if freq==max_val]
                element = candidates[randrange(0, len(candidates))]

            tid[sens_ind[i]] = tid[sens_ind[i]] - element
            change_raw_data += 1
            for itemset in temp_set:
                if element.issubset(itemset):
                    temp_set = temp_set - set([itemset])
    
    exec_time=((clock()-start_time))
    total_time = exec_time + m_time,"sec"
    exec_time = exec_time,"sec"
    cpx = None

    ######----create out files-----######
    out_file = open('BBMax_Accuracy_results.txt', 'w')
    out_file2 = open('BBMax_Accuracy_visible.txt','w')
    print('Border-Based Max-Accuracy Results\n---------------\n',file = out_file2)
    print('\nThe Sanitized DB is:\n',file = out_file2)
    for i in xrange(lines):
        k = ' '.join(sorted(tid[i]))
        z = '{'+ k + '}'
        print(k, file = out_file)
        print(z, file = out_file2)
    
    out_file.close()
        
    print(file = out_file2)
    m_time = m_time, "sec"
    print('changes in raw data:', change_raw_data, file = out_file2)
    print('data min. alg. time = ', m_time, file = out_file2)
    print('hiding alg. time = ', exec_time, file = out_file2)
    print('total execution time = ', total_time, file = out_file2)
    out_file2.close()
    
    return(tid, change_raw_data, Rev_Fd)
            for sentences in content:            
                for word, pos in sentences:
                    if word.isalpha() and word.lower() not in stopset:
                        trimmedTokens += [(word.lower(), pos)]
                trans += [lemmatize.getLemmas(trimmedTokens)]
                trimmedTokens = []
            
    lemmatize.saveLemmaDict()                
    freqDict = cityFreq(city, corpus_path,  files)
        
    TotalHotels = freqDict['TotalHotels']
    TotalReviews = freqDict['TotalReviews']
        
    support = ceil(0.01*TotalReviews)/len(trans)*25
    
    feats = apriori(trans, zmin = setmin, zmax= setmax,supp=support, conf = confidence, target='r', report = 'CS')

    for rule in feats:
        ruleSupp = rule[2][1]
        ruleConf = rule[2][0]
        word1 = rule[0]
        word2 = rule[1][0]
        if not wordLookup.has_key(word1):
            addWord(word1, i)
            i+=1
        if not wordLookup.has_key(word2):
            addWord(word2, i)
            i+=1
        if len(rule[1])==2:
            word3 = rule[1][1]
            if not wordLookup.has_key(word3):
Example #37
def associationMining(papers):

    textFolder = 'data/text/'
    support = 9
    confidence = 10
    rules = {}

    # Create stopwords list
    stopwordsFile = open('stopwords.txt', 'r')
    stopwords = set()

    for line in stopwordsFile:
        word = line.strip('\r\n').lower()
        stopwords.add(word)

    stopwordsFile.close()

    transactions = []

    for key, value in papers.items():
        if 'folder' in papers[key] and 'filename' in papers[key]:

            # Get candidates
            candidates = []
            dataFile = open(textFolder + papers[key]['folder'] +
                            papers[key]['filename'])

            for line in dataFile:
                text = line.strip('\r\n')
                words = easy_tokenizer(text)
                candidates.append(words)

            dataFile.close()

            # Compute words dict
            wordDict = {}

            for words in candidates:
                for word in words:
                    if word in stopwords or len(word) == 1 or word.isdigit():
                        continue
                    if word not in wordDict:
                        wordDict[word] = 0
                    wordDict[word] += 1

            # Compute bigrams
            bigrams = {}
            L = 0

            for words in candidates:
                n = len(words)
                L += n
                for i in range(0, n - 1):
                    if words[i] in wordDict and words[i + 1] in wordDict:
                        bigram = words[i] + '_' + words[i + 1]
                        if bigram not in bigrams:
                            # bigram's count, first word's count, second word's count, significance score
                            bigrams[bigram] = [
                                0, wordDict[words[i]], wordDict[words[i + 1]],
                                0.0
                            ]
                        bigrams[bigram][0] += 1

            # Readjust bigrams scores
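            # The score is z-score-like: (observed bigram count minus the count expected under
            # independence, count(w1) * count(w2) / L) divided by sqrt(observed bigram count).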
            for bigram in bigrams:
                bigrams[bigram][3] = (1.0 * bigrams[bigram][0] -  \
                 1.0 * bigrams[bigram][1] * bigrams[bigram][2]/L) / \
                 ((1.0 * bigrams[bigram][0])**0.5)

            # Compute transactions
            bigramDict = {}

            for bigram in bigrams:
                if bigrams[bigram][0] > 1:
                    first, second = bigram.split('_')
                    if first not in bigramDict:
                        bigramDict[first] = set()
                    bigramDict[first].add(second)

            # Compute quality entities
            transactions = []
            for words in candidates:
                transaction = set()  # set of words/bigrams
                n = len(words)
                i = 0
                while i < n:
                    if words[i] in bigramDict and i + 1 < n and words[
                            i + 1] in bigramDict[words[i]]:
                        transaction.add(words[i] + '_' + words[i + 1])
                        i += 2
                        continue
                    if words[i] in stopwords or len(
                            words[i]) == 1 or words[i].isdigit():
                        i += 1
                        continue
                    transaction.add(words[i])
                    i += 1
                transactions.append(list(transaction))

    rules = apriori(transactions,
                    target='r',
                    supp=support,
                    conf=confidence,
                    report='sc')

    print '--------- One-to-Many Association Rules ------------'
    for left, right, support, confidence in sorted(rules, key=lambda x: x[0]):
        print left, '-->', right, support, confidence
    print 'Number of rules: ', len(rules)
Example #38
    print(fpgrowth.__doc__)
elif tid < -1:
    print(eclat.__doc__)
elif tid <  0:
    print(apriori.__doc__)
else:
    tracts = [ [ 1, 2, 3 ],
               [ 1, 4, 5 ],
               [ 2, 3, 4 ],
               [ 1, 2, 3, 4 ],
               [ 2, 3 ],
               [ 1, 2, 4 ],
               [ 4, 5 ],
               [ 1, 2, 3, 4 ],
               [ 3, 4, 5 ],
               [ 1, 2, 3 ] ]
    print('transactions:')
    for t in tracts: print(t)
    if tid < 1:
        print('apriori(tracts, supp=-3, zmin=2):')
        for r in apriori(tracts, supp=-3, zmin=2): print(r)
    elif tid < 2:
        print('eclat(tracts, supp=-3, zmin=2):')
        for r in eclat(tracts, supp=-3, zmin=2): print(r)
    elif tid < 3:
        print('fpgrowth(tracts, supp=-3, zmin=2):')
        for r in fpgrowth(tracts, supp=-3, zmin=2): print(r)
    else:
        print('fim(tracts, supp=-3, zmin=2, report=\'#\'):')
        for r in fim(tracts, supp=-3, zmin=2, report='#'): print(r)
import fim

try:
    item_counts = {}
    frequencies = []
    rules = []

    # read input file by line and split to 
    # store each line as list of items
    # fim apriori expects this data structure as input
    baskets = [ line.split() for line in open('ProductPurchaseData.txt').read().strip().split('\n')]
    
    # target = 's'       -> frequent item sets
    # supp   = negative  -> minimum support of an item set
    # zmax   = number    -> maximum number of items per item set
    item_sets = fim.apriori(baskets, target='s', supp=-100, zmax=2)
    
    for r in item_sets:
        # apriori reports in the format ((itemset), support)
        item_set, item_count = r
        # k = 1
        if len(item_set) == 1:
            item_counts[item_set[0]] = item_count
        # k = 2
        elif len(item_set) == 2:
            item1, item2 = item_set
            # lexicographical ordering of the rules
            # report the rule a->b but not the rule b->a 
            if item1 < item2:
                frequencies.append(((item1, item2), float(item_count)))