Example no. 1
def task5(pid_to_keyword):
    word_to_type = {}
    pid_to_word = {}
    paper_list = []

    with open("entity_types.txt", "r") as f:
        for line in f:
            word, e_type = line.strip('\n').split(',')
            word_to_type[word] = e_type

    for pid in pid_to_keyword.keys():
        for kw in pid_to_keyword[pid]:
            if kw in word_to_type.keys():
                if pid not in pid_to_word.keys():
                    pid_to_word[pid] = set()
                pid_to_word[pid].add(kw)

    for pid in pid_to_word.keys():
        paper_list.append(pid_to_word[pid])

    result = fp_growth.find_frequent_itemsets(paper_list, 10)
    for item in result:
        if (len(item) != 2
                or (word_to_type[item[0]] != "METHOD"
                    and word_to_type[item[0]] != "PROBLEM")
                or (word_to_type[item[1]] != "METHOD"
                    and word_to_type[item[1]] != "PROBLEM")):
            continue
        if word_to_type[item[0]] != word_to_type[item[1]]:
            continue
        print(item)
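
A small worked sketch of the type filter above: only 2-item itemsets whose entities both map to METHOD or PROBLEM, and to the same type, are printed. The word_to_type mapping below is illustrative, not from the original entity_types.txt.

word_to_type = {"cnn": "METHOD", "svm": "METHOD", "ner": "PROBLEM"}
for item in [["cnn", "svm"], ["cnn", "ner"], ["cnn"]]:
    keep = (len(item) == 2
            and word_to_type.get(item[0]) in ("METHOD", "PROBLEM")
            and word_to_type.get(item[1]) in ("METHOD", "PROBLEM")
            and word_to_type[item[0]] == word_to_type[item[1]])
    print(item, keep)  # only ['cnn', 'svm'] passes the filter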
Example no. 2
def ship_log_analyzer(log_array, customer):
    array = []
    with open("shipping_analysis.csv") as file:
        for line in file:
            array.append(line.strip('\n').split(','))
    items = find_frequent_itemsets(array, 50)
    items = [item for item in items if customer in item]
    for item in items:
        print(item)
    out_array = log_array[:]
    for log_item in log_array:
        refs = log_item[1].pop(2) + ' ' + log_item[1].pop(1)
        refs = refs.split(' ')
        log_item[1] += refs
        print("Item: ", log_item[1])
        distance = 0
        for item in items:
            #print(item)
            #print(len(set(log_item[1]).intersection(item)))
            if len(set(log_item[1]).intersection(item)) > distance:
                distance = len(set(log_item[1]).intersection(item))
                print("Rule: ", item, " | Intersect: ",
                      len(set(log_item[1]).intersection(item)),
                      " | New Distance: ", distance)
        if distance < 3:
            out_array.remove(log_item)
    print(out_array)
    return out_array
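
A small worked sketch of the overlap ("distance") measure used above: an entry survives only if it shares at least 3 tokens with some frequent itemset. The token lists below are invented.

log_tokens = ["acme", "priority", "express", "ref42"]
rule = ["acme", "priority", "express", "overnight"]
overlap = len(set(log_tokens).intersection(rule))
print(overlap)  # 3 shared tokens, so this entry would be kept (distance >= 3)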
Example no. 3
def fp_growth(windows, min_support, iterations=0):
    from fp_growth import find_frequent_itemsets
    itemsets = []

    if 0 < min_support < 1:
        new_support = math.ceil(min_support * len(windows))
        logger.info(
            "Min support %s%% of %s: %s",
            min_support * 100,
            len(windows),
            new_support)
        min_support = new_support

    itemset_gen = find_frequent_itemsets(windows, min_support)
    if iterations > 1:
        for x in xrange(0, iterations):
            template_ids = frozenset(next(itemset_gen))
            itemsets.append(template_ids)
    else:
        for itemset in itemset_gen:
            template_ids = frozenset(itemset)
            itemsets.append(template_ids)

    logger.info("Removing subsets from fp_growth output...")
    if len(itemsets):
        itemsets = get_nonsubsets(itemsets)

    ret = [Event(id=str(uuid.uuid4()), template_ids=template_ids)
           for template_ids in itemsets]
    return ret
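
A minimal sketch of the fractional-support conversion in the helper above, assuming the same math.ceil rule; the windows and the 0.5 threshold are illustrative.

import math

windows = [["a", "b"], ["a", "c"], ["b", "c"], ["a", "b", "c"]]
min_support = 0.5  # requested as a fraction of the window count
if 0 < min_support < 1:
    min_support = math.ceil(min_support * len(windows))  # 0.5 * 4 -> 2
print(min_support)  # 2, i.e. an absolute count, as passed on to find_frequent_itemsets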
Example no. 4
    def testDuplicate(self):
        raw = '25,52,274;71;71,274;52;25,52;274,71'
        transactions = [line.split(',') for line in raw.split(';')]

        itemsets = list(fp_growth.find_frequent_itemsets(transactions, 2))
        self.assertEqual([['25'], ['52', '25'], ['274'], ['71'], ['52']],
            itemsets)
Example no. 6
def printFre(vips, vipNos, type, support, per):
    # store each VIP's info
    vipPlus = []
    for i in range(len(vips)):
        vipPlus.append(tolist(vips.get_group(vipNos[i]), type, per))
    frequent_items = find_frequent_itemsets(vipPlus, support)
    #
    return list(frequent_items)
Example no. 7
def testFrequency():
    from fp_growth import find_frequent_itemsets
    k = []

    for itemset, support in find_frequent_itemsets(lll, 0.7, True):
        print itemset, support

        k.append([itemset, support])

    print k
Example no. 9
    def testDuplicate(self):
        raw = '25,52,274;71;71,274;52;25,52;274,71'
        transactions = [line.split(',') for line in raw.split(';')]

        itemsets = list(fp_growth.find_frequent_itemsets(transactions, 2))
        # Python 2 - dictionary is sorted by key value (?)
        # self.assertEqual([['25'], ['52', '25'], ['274'], ['71'], ['52']], itemsets)
        # Python 3 - dictionary is sorted by insertion order (?)
        self.assertEqual([['52'], ['274'], ['25'], ['52', '25'], ['71']],
                         itemsets)
Example no. 10
def log_frequent_pattern(df_with_log,
                         support_value,
                         min_pattern_len,
                         tokenizer='WITHOUT_LAYER'):
    """
    Find and show frequent patterns of logs from the input DataFrame.

    :param df_with_log:     input DataFrame with a column named factor
    :param support_value:   support value for the FP-growth algorithm
    :param min_pattern_len: minimum pattern length, in tokens
    :param tokenizer:       which tokenizer to use
    :return: DataFrame with new columns summarizing frequent pattern info
    """
    if tokenizer not in ['WITH_LAYER', 'WITHOUT_LAYER']:
        print("tokenizer must be chosen from 'WITH_LAYER' or 'WITHOUT_LAYER'")
        return

    df = df_with_log.copy()
    try:
        logs = df.factor.copy()  # pd.series
    except AttributeError as e:
        print(e)
        print("The input df must contain a column named 'factor', " +
              "which gives string representations of random generated factor.")
        return

    # tokenize each log in logs
    for index, log in logs.iteritems():
        if tokenizer == 'WITHOUT_LAYER':
            logs.set_value(index, log_tokenize_without_layer(log))
        elif tokenizer == 'WITH_LAYER':
            logs.set_value(index, log_tokenize_with_layer(log))
    logs_as_list = list(logs)

    # find frequent_pattern
    frequent_pattern = list(find_frequent_itemsets(logs_as_list,
                                                   support_value))

    filtered_frequent_pattern = filter(lambda fp: len(fp) >= min_pattern_len,
                                       frequent_pattern)

    # construct the DataFrame
    for fp in filtered_frequent_pattern:
        fp_exist_list = list()
        for log in logs:
            if log_contain_pattern(log, fp):
                fp_exist_list.append(1)
            else:
                fp_exist_list.append(0)
        df[','.join(fp)] = pd.Series(fp_exist_list, index=df.index)

    return df
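
A hedged usage sketch for log_frequent_pattern above. The DataFrame contents and parameter values are invented, and it assumes the helpers it calls (log_tokenize_without_layer, log_contain_pattern) and the older pandas API used by the snippet (Series.set_value / iteritems) are available.

import pandas as pd

# hypothetical input: one factor string per row, in a column named 'factor'
df = pd.DataFrame({"factor": ["a b c", "a b d", "a c d"]})

annotated = log_frequent_pattern(df, support_value=2, min_pattern_len=2)
if annotated is not None:
    # the result keeps the original columns and adds one 0/1 column per frequent pattern
    print(annotated.columns.tolist())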
Example no. 11
	def fp_growth(self):
		self.assocItems= []
		minsup= 5
		# transactions= top 9 users
		transactions= [Person.users[Person.users.index(user[0])].items for user in self.neighbors[:10] if not user[0]==self.id]
		for itemset in find_frequent_itemsets(transactions, minsup):
			# if the itemset has length at least 2, is not a subset of the user's items,
			# and has at least one intersection with them
			interc= intersection(self.items, itemset)
			if len(itemset)>=2 and not issubset(self.items, itemset) and interc:
				# format -> (what you have, what you should read)
				recItems= set(itemset)-interc
				pairs= {"own": interc, "rec": recItems}
				self.assocItems.append(pairs)
Example no. 12
    def handle_noargs(self, **options):
        support = 0
        all_items = []
        for transaction in Transaction.objects.all():
            items = TransactionItem.objects.filter(
                transaction=transaction).values_list('item__id', flat=True)
            all_items.append(map(str, items))

        while True:
            items = {}
            itemsets = []
            for itemset, support in find_frequent_itemsets(
                    all_items, support, True):
                itemsets.append(itemset)
                for item in itemset:
                    if item not in items:
                        items[item] = []

            for index in items.keys():
                for itemset in itemsets:
                    if index in itemset:
                        for item in itemset:
                            if item != index and item not in items[index]:
                                items[index].append(item)

            for main_item in items.keys():
                if len(items[main_item]) > 0:
                    for frequent_item in items[main_item]:
                        try:
                            item_set = FrequentItem.objects.get(
                                main_item__id=int(main_item),
                                frequent_item__id=int(frequent_item))
                        except:
                            item_set = FrequentItem()
                            item_set.main_item = Item.objects.get(
                                id=int(main_item))
                            item_set.frequent_item = Item.objects.get(
                                id=int(frequent_item))

                        item_set.support = support
                        item_set.save()

            print 'SUPPORT:', support
            print 'ITEMS:', items
            support += 1
            if len(items) < 1:
                break
Example no. 13
    def genAssociations(self):

        for item in find_frequent_itemsets(self.transList, self.minSup):
            if len(item) in self.F:
                self.F[len(item)].append(tuple(item))
            else:
                self.F[len(item)] = [tuple(item)]

            set_item = set(item)
            for t in self.transList:
                if set_item.issubset(set(t)):
                    if tuple(item) in self.freqList:
                        self.freqList[tuple(item)] +=1
                    else:
                        self.freqList[tuple(item)] = 1

        return self.F
Example no. 14
def getFpgrowth(sourcePath, seporator):
    f1 = open(sourcePath)
    retDict = {}
    transactions = []
    for line in f1:
        line = line.strip('\n')
        # transactions.append([r.encode('utf-8') for r in line.split(' ')])
        transactions.append(line.split(seporator))
    frequentSet = fp_growth.find_frequent_itemsets(transactions, 10, include_support=True)
    for item in frequentSet:
        if not item:
            break
        if len(item[0]) == 2 and '' not in item[0] and ' ' not in item[0]:
            for i in (0, 1):
                retDict[item[0][i]] = 1
    f1.close()
    return retDict
Example no. 15
    def solve(self):

        os.system('pip install fp-growth')
        import fp_growth as fg

        data_set = []
        with open('A.csv', 'r') as f:
            lines = f.readlines()
            for line in lines:
                data_set.append(line.strip().split(","))

        freq_items = fg.find_frequent_itemsets(data_set,
                                               len(data_set) * 0.45,
                                               include_support=True)

        L = []
        for (items, count) in freq_items:
            s_items = set(items)
            if ('republican0' in s_items
                    or 'democrat0' in s_items) and len(s_items) > 1:
                L.append((items, count))

        rules = []

        for (items, count) in L:
            items.sort()
            rule_lens = len(items) - 1

            for i in range(1, 2**rule_lens):
                left = {items[0]}
                right = set()
                bin_form = bin_digits(i, rule_lens)
                for j in range(rule_lens):
                    if bin_form[j] == '1':
                        right.add(items[j + 1])
                    else:
                        left.add(items[j + 1])

                left_count = 0
                for entry in data_set:
                    if left.issubset(entry):
                        left_count += 1
                if count * 1.0 / left_count >= 0.9:
                    rules.append([list(left), list(right)])

        return rules
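
bin_digits is used but not defined in the snippet above; a plausible stand-in (an assumption, not the original helper) returns the binary form of i zero-padded to the requested width, which is what the j-indexed loop expects.

def bin_digits(i, width):
    # hypothetical reconstruction: bin_digits(3, 4) -> '0011'
    return format(i, 'b').zfill(width)

print(bin_digits(3, 4))  # '0011'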
Example no. 16
    def calc_fp_growth(self, attrivute_list, minimum_support=2, num_combo=2):
        result = dict()
        tmp_dict = dict()
        if len(attrivute_list) > 0:
            frequent_itemsets = fpg.find_frequent_itemsets(
                attrivute_list, minimum_support, include_support=True)
            for itemset, support in frequent_itemsets:
                if len(itemset) == num_combo:
                    if itemset[0] > itemset[1]:
                        tname = itemset[1] + '_' + itemset[0]
                    else:
                        tname = itemset[0] + '_' + itemset[1]
                    if itemset[0] not in tmp_dict:
                        tmp_dict[itemset[0]] = []
                    tmp_dict[itemset[0]].append(tname)
                    if itemset[1] not in tmp_dict:
                        tmp_dict[itemset[1]] = []
                    tmp_dict[itemset[1]].append(tname)
                    result[tname] = support
        return result, tmp_dict
Example no. 17
File: csv.py  Project: siliconer/yk
def csv_read():  # core function
	with open('E:/yk/test/yk--cz.csv', 'rb') as csvfile:  # read the csv file
		csvreader = _csv.reader(csvfile, delimiter=' ', quotechar='|')
		negative_chinese = '\xe5\x90\xa6'
		postive_chinese = '\xe6\x98\xaf'
		billvalue = ''
		targetvalue = ''  # initialize as an empty string
		index_number = 0 
		yangka_data = [['0' for i in range(0,2)] for j in range(0,90000)]
		fpgrowth_yangka = codecs.open("E:/yk/test/yangka.txt","w")  # file the mined records will be written to; 'w' means write

		for row in csvreader:
			element = str(', '.join(row)) 
			_element = str(', '.join(row).decode("gb2312") )  # each _element looks like "否,否,否,否,否,否,1,0,0,1,2,2"
			# bill_one, bill_two, bill_three ... indicate whether the bill for January, February, March ... was flagged as "three-none" ('否' = no, '是' = yes)
			bill_one = _element.split(',')[0]  # splitting _element on ',', the first field of "否,否,否,否,否,否,1,0,0,1,2,2" is '否'
			bill_two =  _element.split(',')[1]
			bill_three =  _element.split(',')[2]
			bill_four =  _element.split(',')[3]
			bill_five =  _element.split(',')[4]
			bill_six =  _element.split(',')[5]	
			# target_one, target_two, target_three ... give the card-maintenance ("yangka") target for January, February, March ...: '0', '1' or '2'
			target_one = _element.split(',')[6]
			target_two = _element.split(',')[7]
			target_three =_element.split(',')[8]
			target_four = _element.split(',')[9]
			target_five = _element.split(',')[10]
			target_six = _element.split(',')[11]

			print _element
			# from each month's status (three-none billed or not) and target obtained above, build a status-change value and a target-change value
			billvalue =  billvalue_make(bill_one,bill_two,bill_three,bill_four,bill_five,bill_six)
			targetvalue = targetvalue_make(target_one,target_two,target_three,target_four,target_five,target_six)
			# after generating the values, store them in the yangka_data array: column 1 is the status-change value, column 2 the target-change value; index_number is incremented each time as a cursor
			yangka_data[index_number][0] = billvalue
			yangka_data[index_number][1] = targetvalue
			index_number = index_number + 1 
		# print yangka_data
		# Core part: the fp_growth frequent-itemset mining package must be installed before running this script. Each itemset it yields is a mined item list such as ['00000', '30288'], and support is its occurrence count, e.g. 2782; together they form the record written to the file, e.g. "['00000', '30288'] 2782". The number 500 below is the minimum occurrence count for a mined record: a pattern like ['00000', '30288'] is ignored if it appears fewer than 500 times and considered otherwise; here it appears 2782 times, so it is kept.
		for (itemset,support) in find_frequent_itemsets(yangka_data,500,True):
			print>>fpgrowth_yangka,itemset,support  # write the mined records to the file yangka.txt
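
A minimal sketch of the find_frequent_itemsets call described in the comment above (python-fp-growth, https://github.com/enaeseth/python-fp-growth); the transactions and the threshold of 2 are illustrative, unlike the real run's threshold of 500.

from fp_growth import find_frequent_itemsets

transactions = [['00000', '30288'], ['00000', '30288'], ['00000', '11111']]
for itemset, support in find_frequent_itemsets(transactions, 2, include_support=True):
    print(itemset, support)  # e.g. ['00000', '30288'] with support 2, among other itemsets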
Example no. 18
    def fp_growth(self, filepath, minsup):
        import csv
        import unicodecsv
        from fp_growth import find_frequent_itemsets

        formattedpath = filepath + '.format.csv'
        with open(formattedpath, 'wb') as outputfile:
            writer = unicodecsv.writer(outputfile, delimiter='\t', encoding='utf-8')
        
            segmentor = segment()
            with open(filepath) as inputfile:
                for transaction in csv.reader(inputfile, delimiter='\t'):

                    assert len(transaction) == 1, "Invalid"
                    writer.writerow(segmentor.char_segment(transaction[0]))

        finalresult = {}
        with open(formattedpath) as inputfile:
            for itemset, support in find_frequent_itemsets(csv.reader(inputfile, delimiter='\t'), minsup, True):
                finalresult[', '.join(itemset)] = support
        return finalresult
Example no. 19
def extractPatternFromCohorte(cohorte, minsup, tag):
    """
	-> Store all frequent pattern (i.e set of items present more than minsup in 
	   cohorte)
	-> cohorte is an array of array, discrete value (obtain via the assemble_CohorteFromAllFiles
	   function)
	-> minsup is an int between 0 and 100 (% of support)
	-> tag is a string, insert in the output file name
	"""
    saveFileName = "DATA/PATTERN/" + str(tag) + "_pattern_" + str(
        minsup) + ".csv"
    numberOfPatient = len(cohorte)
    minsup = int(minsup)
    minimumSupport = (minsup * numberOfPatient) / 100
    patternFile = open(saveFileName, "w")
    for itemset in find_frequent_itemsets(cohorte, minimumSupport):
        line = ""
        for element in itemset:
            line = line + str(element) + ";"
        line = line[:-1]
        patternFile.write(line + "\n")
    patternFile.close()
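
A hedged usage sketch for extractPatternFromCohorte above; the cohorte rows are invented, and it assumes the DATA/PATTERN/ directory already exists.

# hypothetical discrete cohorte: one list of items per patient
cohorte = [
    ["geneA_high", "geneB_low"],
    ["geneA_high", "geneB_low"],
    ["geneA_low", "geneB_low"],
]
extractPatternFromCohorte(cohorte, minsup=50, tag="demo")
# writes DATA/PATTERN/demo_pattern_50.csv, one ';'-separated itemset per line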
Example no. 20
def recommend_recipes(user,n):
    
    if user not in dataset.users:
        ret = [
            ("Bourbon Chicken", "http://pictures.food.com/api/file/a2ZX1DphTjK0YuAi916b-149-bourbon-chicken.jpg"),
            ("To Die for Crock Pot Roast", "http://img.food.com/img/recipes/27/20/8/large/picVfzLZo.jpg"),
            ("Crock-Pot Chicken With Black Beans & Cream Cheese", "http://img.food.com/img/recipes/89/20/4/large/picec1bG3.jpg"),
            ("Creamy Cajun Chicken Pasta", "http://img.food.com/img/recipes/39/08/7/large/piccZDaro.jpg"),
            ("\"Whatever Floats Your Boat\" Brownies!", "http://img.food.com/img/recipes/32/20/4/large/picblOl7e.jpg"),
            ("Best Ever Banana Cake With Cream Cheese Frosting", "http://img.food.com/img/recipes/67/25/6/large/pichIPBA2.jpg"),
            ("Pancakes", "http://img.food.com/img/recipes/25/69/0/large/piciUoO07.jpg"),
        ]
        return ret[:n]
    item = cPickle.load(open("similItem.pkl"))
    item.setdataset(dataset)
    items = item.recommend(user,n+1)

    cont = cPickle.load(open("similCont.pkl"))
    cont.setdataset(dataset)
    conts  = cont.recommend(user,n+1)

    recipes = dataset.getItemsForUser(user)

    alpha = 5 * (n/2)
    transactions = []
    for recipe in recipes:
        transactions.append(dataset.recipes_ingredients[recipe] * dataset.matrix[user][recipe])
    f = find_frequent_itemsets(transactions,alpha)
    freq = []
    for i in f:
        if len(freq) >= n-1:
            break
        freq.append(i)
    out = conts[:len(freq)+1]
    out.extend(items)
    ret = []
    i = 0
    while len(ret) < n and i < len(out):
        try:
            ret.append((out[i], dataset.imgs[out[i]]))
        except Exception:
            pass
        i += 1
    return ret
Example no. 21
    #path = raw_input("Please, write the path to the CSV file \n")
    path = r"C:\Users\migue\Documents\UC3M\TU Graz\Bachelor thesis\Data\despacho_liencres_out.csv"

    # read CSV file and store it
    #instances = pd.read_csv(path, sep=';')
    #print("csv read")
    #instances.to_dict('records')
    #print("Dictionary created")
    instances = []
    with open(path, 'rb') as f:
        reader = csv.reader(f)
        instances = list(reader)
    print("List created")
    frequent_items = pd.DataFrame()
    #minsup=0.15
    for idx, itemset in enumerate(find_frequent_itemsets(instances, 4)):
        frequent_items = frequent_items.append([itemset])
        if (idx % 100 == 0):
            print(idx)
    print(frequent_items)

    # ----------------------------------------------------------------------------------------------
    #path = input("Please, write the path to the CSV file \n")
    #csvout = "C:\\Users\\migue\\Documents\\UC3M\\TU Graz\\Bachelor thesis\\Data\\" + input("Please, write the name of the CSV output file \n")

    ## open files & create outfile
    #counter = 0

    #sound = []
    #barometer = []
    #temperature = []
Example no. 22
from fp_growth import find_frequent_itemsets
transaction = []
minsup = 2
transactions = [[1, 2, 5], [2, 4], [2, 3], [1, 2, 4], [1, 3], [2, 3], [1, 3],
                [1, 2, 3, 5], [1, 2, 3]]
for itemset in find_frequent_itemsets(transactions, minsup):
    print itemset
def find_global_freq_itemsets(transactions, minsup):
	global_freq_itemset = []
	for itemset in find_frequent_itemsets(transactions, minsup):
		global_freq_itemset.append(itemset)
	return global_freq_itemset
Example no. 24
            print(freqSet - conseq, '-->', conseq, 'conf:', conf)
            brl.append((freqSet - conseq, conseq, conf))
            prunedH.append(conseq)
    return prunedH


def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    m = len(H[0])
    if (len(freqSet) > (m + 1)):  #try further merging
        Hmp1 = aprioriGen(H, m + 1)  #create Hm+1 new candidates
        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
        if (len(Hmp1) > 1):  #need at least two sets to merge
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)


def pntRules(ruleList, itemMeaning):
    for ruleTup in ruleList:
        for item in ruleTup[0]:
            print(itemMeaning[item])
        print("           -------->")
        for item in ruleTup[1]:
            print(itemMeaning[item])
        print("confidence: %f" % ruleTup[2])
        print()  #print a blank line


A, B = apriori(loadDataSet(), 0.4)
print(A, B)

print(list(find_frequent_itemsets(loadDataSet(), 0.2, True)))
Example no. 25
                # Assign to event format
                # EventFormat['WHERE'] = Where
                # EventFormat['WHO'] = Who
                # EventFormat['WHEN'] = When
                # EventFormat['TOPIC'] = TopicResults

                #print >> outputFile, EventFormat

                #convert set to a list
                outputStringList = list(itemSet)
                if len(outputStringList) >= 3 and len(Where)>0 and len(Who)>0:
                    outputString = ';'.join(outputStringList)
                    print outputString
                    csvWriter.writerow([outputString])


    f = open(OutputFileName)
    try:
        for itemset, support in find_frequent_itemsets(csv.reader(f), 2, True):
            print '{' + ', '.join(itemset) + '} ' + str(support)
    finally:
        f.close()


except mdb.Error, e:
    print "Error %d: %s" % (e.args[0],e.args[1])
    sys.exit(1)

finally:
    if con:
        con.close()
Example no. 26
def main():
    # TODO get the current directory
    path = os.path.split(os.path.realpath(__file__))[0]
    path.decode('gbk')
    # instantiate the logger class
    logger1 = logger(path)
    # get the logger object from the instance
    log = logger1.logger
    # TODO start running
    log.info('run begin')

    # TODO create the results directory
    resultPath = path + u'\\result'
    mkdir(resultPath, log)

    # TODO read the configuration file
    log.info('read configuration file begin')
    cf = ConfigParser()
    localPath = path + u'\\test.ini'
    cf.read(localPath)
    database = cf.get(u'db', 'database')
    password = cf.get(u'db', 'password')
    user = cf.get(u'db', 'user')
    port = cf.getint(u'db', 'port')
    host = cf.get(u'db', 'host')
    log.info('read configuration file end')

    # TODO start the analysis
    log.info('read data begin')
    x = getdata(host, port, user, password, database)
    path1 = path + u'\\drugloc.xlsx'
    path2 = path + u'\\druglocgroup.xlsx'
    drugloc = pd.read_excel(path1)
    druglocgroup = pd.read_excel(path2)
    log.info('Read data end')

    log.info('Begin analysis')
    hoslist = [
        '攀钢集团总医院密地院区', '攀钢集团总医院长寿路院区', '攀枝花市中心医院', '中国十九冶集团有限公司职工医院',
        '攀枝花煤业(集团)有限责任公司总医院', '米易县人民医院', '攀枝花市中西医结合医院', '攀枝花市第二人民医院'
    ]
    fredegree = 0.2
    for hos in hoslist:
        log.info('this is hospital %s' % hos)
        xx = x[x['kb01_ckb519'] == hos]
        # build the drug-group codes
        data = pd.merge(xx,
                        drugloc,
                        left_on='ka20_ake001',
                        right_on='drugId',
                        how='left')
        data_1 = pd.merge(data,
                          druglocgroup,
                          left_on='locId',
                          right_on='locId',
                          how='left')
        # drop rows where drugGroupId is null
        data_2 = data_1[(data_1['drugGroupId'].isnull() == False)
                        & (data_1['kc22_aaz217'].isnull() == False)]
        data_3 = data_2[['kc22_aaz217', 'drugGroupId']]
        data_4 = data_3.drop_duplicates()
        # find the frequent drugs
        druglist = getitems(data_4, 'kc22_aaz217', 'drugGroupId')
        transactions = [line.split(',') for line in druglist]
        itemsets = list(
            fp_growth.find_frequent_itemsets(transactions,
                                             fredegree * len(transactions)))
        # remove frequent itemsets that are duplicates up to ordering, i.e. (a,b) vs (b,a)
        for i in range(len(itemsets)):
            for j in range(i + 1, len(itemsets)):
                if j < len(itemsets):
                    if (len(itemsets[i]) > 1) & (set(itemsets[i]) == set(
                            itemsets[j])):
                        itemsets.remove(itemsets[i])
        # drop frequent 1-itemsets
        table = []
        num = []
        for items in itemsets:
            if len(items) > 1:
                table.append(items)
                num.append(len(items))
        freitem = pd.DataFrame({'fre': table, 'num': num})
        freitem.sort_values(by=['num'], ascending=[False], inplace=True)
        maxfre = max(freitem['num'])
        # for the largest frequent n-itemsets, take the intersection/union of their items
        freitems = freitem[freitem['num'] == maxfre]
        unionfre = set(freitems.iloc[0, 0]).union(*freitems.iloc[1:, 0])

        # compute weights: average daily usage frequency, average cost, cost ratio, etc.
        # compute average daily usage frequency
        drugcount = data_2['kc22_aaz217'].groupby(
            [data_2['kc22_aaz217'],
             data_2['drugGroupId']]).count().reset_index()
        drugcount.rename(columns={0: 'count'}, inplace=True)
        days = data_2[['kc22_aaz217', 'days']].drop_duplicates()
        days.replace(0, 1, inplace=True)
        drug = pd.merge(drugcount,
                        days,
                        left_on='kc22_aaz217',
                        right_on='kc22_aaz217',
                        how='left')
        drug['meanCount'] = drug['count'] / drug['days']
        drugmeandays = drug['meanCount'].groupby(
            drug['drugGroupId']).mean().reset_index()
        # compute average cost
        drugcost = data_2['kc22_ckc526'].groupby(
            [data_2['kc22_aaz217'],
             data_2['drugGroupId']]).sum().reset_index()
        drugcost.rename(columns={'kc22_ckc526': 'cost'}, inplace=True)
        drugmeancost = drugcost['cost'].groupby(
            drug['drugGroupId']).mean().reset_index()
        # compute average drug price
        drugprice = data_2['kc22_cke521'].groupby(
            data_2['drugGroupId']).mean().reset_index()
        drugprice.rename(columns={'kc22_cke521': 'price'}, inplace=True)
        # compute the cost ratio
        sumcost = data_2['kc22_ckc526'].groupby(
            data_2['kc22_aaz217']).sum().reset_index()
        sumcost.rename(columns={'kc22_ckc526': 'sumcost'}, inplace=True)
        drugratio = pd.merge(drugcost,
                             sumcost,
                             left_on='kc22_aaz217',
                             right_on='kc22_aaz217',
                             how='left')
        drugratio['ratio'] = drugratio['cost'] / drugratio['sumcost']
        ratio = drugratio['ratio'].groupby(
            drugratio['drugGroupId']).mean().reset_index()

        # check how many drugs each of these drug groups corresponds to
        druggroup = data_2['kc22_cke521'].groupby([
            data_2['drugGroupId'], data_2['drugGroupName'],
            data_2['ka20_ake002']
        ]).mean().reset_index()
        drugnum = druggroup['ka20_ake002'].groupby(
            druggroup['drugGroupId']).count().reset_index()
        drugnum.rename(columns={0: 'drugnum'}, inplace=True)
        # find which drug has the highest price
        druggroup.sort_values(by=['kc22_cke521'],
                              ascending=[False],
                              inplace=True)
        drugmaxprice = druggroup.drop_duplicates(subset=['drugGroupId'],
                                                 keep='first')
        # find which drug occurs most frequently
        drugmaxfre = data_2['kc22_aaz217'].groupby([
            data_2['drugGroupId'], data_2['drugGroupName'],
            data_2['ka20_ake002']
        ]).count().reset_index()
        drugmaxfre.rename(columns={0: 'drugmaxfre'}, inplace=True)
        drugmaxfre.sort_values(by=['drugmaxfre'],
                               ascending=[False],
                               inplace=True)
        drugmaxfre = drugmaxfre.drop_duplicates(subset=['drugmaxfre'],
                                                keep='first')

        drugname = []
        meandayfre = []
        meancost = []
        meanprice = []
        costratio = []
        drugnums = []
        drugpricemax = []
        drugfremax = []
        for i in unionfre:
            drugname.append(
                list(data_2[data_2['drugGroupId'] == i]['drugGroupName'])[0])
            meandayfre.append(
                list(drugmeandays[drugmeandays['drugGroupId'] == i]
                     ['meanCount'])[0])
            meancost.append(
                list(
                    drugmeancost[drugmeancost['drugGroupId'] == i]['cost'])[0])
            meanprice.append(
                list(drugprice[drugprice['drugGroupId'] == i]['price'])[0])
            costratio.append(
                list(ratio[ratio['drugGroupId'] == i]['ratio'])[0])
            drugnums.append(
                list(drugnum[drugnum['drugGroupId'] == i]['drugnum'])[0])
            drugpricemax.append(
                list(drugmaxprice[drugmaxprice['drugGroupId'] == i]
                     ['ka20_ake002'])[0])
            drugfremax.append(
                list(drugmaxfre[drugmaxfre['drugGroupId'] == i]['ka20_ake002'])
                [0])
        data = {
            'drugname': drugname,
            'meandayfre': meandayfre,
            'meancost': meancost,
            'meanprice': meanprice,
            'costratio': costratio,
            'drugnums': drugnums,
            'drugpricemax': drugpricemax,
            'drugfremax': drugfremax
        }
        df = pd.DataFrame(data,
                          columns=[
                              'drugname', 'meandayfre', 'meancost',
                              'meanprice', 'costratio', 'drugnums',
                              'drugpricemax', 'drugfremax'
                          ])

        pathfile = resultPath + '\\result_' + hos + '.csv'
        log.info('The result is saved in %s' % pathfile)
        df.to_csv(pathfile, encoding='gbk')
    log.info('End analysis')
    log.info('run end')
Example no. 27
#print(len(high_ach_ordered_list))

plot_centroids(np.transpose(km_4.cluster_centers_), correct_order)
#For k = 5
# plot_centroids(np.transpose(km_5.cluster_centers_), correct_order)

# #run silhouette plots on both to determine which is best
# silhouette_plots(x_normalized[0::5],5) #running only on 30000 items as the dataset is too big !
# silhouette_plots(x_normalized[30000:30000+30000],3)

########################################## ASSOCIATION CODE ###########################################################
# build the FP-tree; the min_sup that works is 50%, which is way too low
cluster_list = [item['clusters'] for item in high_ach_ordered_list]
# root = FPtree_construction(cluster_list,0.50)
#print(cluster_list)
patterns = find_frequent_itemsets(cluster_list, 2000, include_support=True)

for items in patterns:
    print items

############################################ CLASSIFICATION CODE #######################################################

clf = RandomForestClassifier(random_state=255)
student_features = []
student_label = []
incomplete_students = []
for student in student_cluster:
    clusters = student_cluster[student]
    if clusters["result"] is None:
        incomplete_features = []
        incomplete_features.append(clusters[0])
Example no. 28
with open('AssocationMatrixCtrl.csv', 'rb') as f:
    reader = csv.reader(f, delimiter=',')
    for row in reader:
        # print row
        for item in row:
            # print item
            my_list = item.split(",")

            my_tuple = tuple(my_list)
            transactions.append(my_list)

tupleTransac = tuple(tuple(x) for x in transactions)
# print tupleTransac
# dataframe = pd.DataFrame(transactions)
# print dataframe

# print transactions
# for transaction in transactions:
#     print transaction
# with open
freqItemsets = []
report = find_frequent_itemsets(transactions, 4)
for itemset in report:
    freqItemsets.append(itemset)
print len(freqItemsets)
dataFrame = pd.DataFrame(freqItemsets)
dataFrame.to_csv('fpgrowth.csv')

# relim_input = itemmining.get_relim_input(tupleTransac)
# report = itemmining.relim(relim_input, min_support=2)
# print report
def generateFrequentItemsets(minsup):
	# print BV.items
	positiveTransactions = []
	negativeTransactions = []
	positiveFrequentItemsets = []
	negativeFrequentItemsets = []
	positiveSupports = []
	negativeSupports = []
	for item in BV.items:
		transaction = []
		count = 0
		if item[-1] == 1:
			for attribute in item[:-1]:
				if attribute == 0.0:
					transaction.append(count * 2)
				else:
					transaction.append(count * 2 + 1)
				count = count + 1
			positiveTransactions.append(transaction)
		else:
			for attribute in item[:-1]:
				if attribute == 0.0:
					transaction.append(count * 2)
				else:
					transaction.append(count * 2 + 1)
				count = count + 1
			negativeTransactions.append(transaction)
	# print "here\n\n"
	# print len(positiveTransactions)
	# print len(negativeTransactions)
	for positiveFrequentItemset, positiveSupport in find_frequent_itemsets(positiveTransactions, int(minsup * len(positiveTransactions)), True):
		positiveFrequentItemsets.append(positiveFrequentItemset)
		positiveSupports.append(positiveSupport)
	for negativeFrequentItemset, negativeSupport in find_frequent_itemsets(negativeTransactions, int(minsup * len(negativeTransactions)), True):
		negativeFrequentItemsets.append(negativeFrequentItemset)
		negativeSupports.append(negativeSupport)
	# print "FP done\n\n"
	# for itemset in positiveFrequentItemsets:
	# 	print itemset
	# print "negative"
	# for itemset in negativeFrequentItemsets:
	# 	print itemset
	# print "done"
	print len(positiveFrequentItemsets)
	print len(negativeFrequentItemsets)
	positiveDict = {}
	negativeDict = {}
	removePositive = {}
	removeNegative = {}
	idx = 0
	for itemset in positiveFrequentItemsets:
		num = 0
		for item in itemset:
			num = num + 2**item
		# print "positive num"
		# print num
		useless = False
		for key in positiveDict:
			if key & num == num:
				useless = True
			elif key & num == key:
				removePositive[key] = 1
		if useless == False:
			positiveDict[num] = positiveSupports[idx]
		idx = idx + 1
	idx = 0
	for itemset in negativeFrequentItemsets:
		num = 0
		for item in itemset:
			num = num + 2**item
		useless = False
		for key in negativeDict:
			if key & num == num:
				useless = True
			elif key & num == key:
				removeNegative[key] = 1
		# if num in positiveDict and positiveDict[num] != 1 and useless == False:
		if useless == False:
			negativeDict[num] = negativeSupports[idx]
		# print positiveDict
		# print negativeDict
		for key in removePositive:
			if key in positiveDict:
				del positiveDict[key]
		for key in removeNegative:
			if key in negativeDict:
				del negativeDict[key]
	# print "dumping\n\n"
	json.dump(positiveDict, open("positiveFrequentItemsets.txt","w"))
	json.dump(negativeDict, open("negativeFrequentItemsets.txt","w"))
	json.dump(len(positiveTransactions), open("numOfPosResults.txt","w"))
	json.dump(len(negativeTransactions), open("numOfNegResults.txt","w"))
	# for itemset in find_frequent_itemsets(transactions, minsup):
	#     print itemset
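
A small worked sketch of the bitmask encoding used in generateFrequentItemsets above: an itemset becomes the sum of 2**item, and (key & num) == num then tests whether itemset num is contained in itemset key. The itemsets below are illustrative.

def to_mask(itemset):
    # same encoding as above: one bit per attribute id
    mask = 0
    for item in itemset:
        mask += 2 ** item
    return mask

small = to_mask([1, 3])     # 0b1010
large = to_mask([1, 3, 6])  # 0b1001010
print((large & small) == small)  # True: [1, 3] is contained in [1, 3, 6]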
Example no. 30
def main():
    # TODO get the current directory
    path = os.path.split(os.path.realpath(__file__))[0]
    path.decode('gbk')
    # instantiate the logger class
    logger1 = logger(path)
    # get the logger object from the instance
    log = logger1.logger
    # TODO start running
    log.info('run begin')

    # TODO create the results directory
    resultPath = path + u'\\result'
    mkdir(resultPath, log)

    # TODO read the configuration file
    log.info('read configuration file begin')
    cf = ConfigParser()
    localPath = path + u'\\config.ini'
    cf.read(localPath)
    freqs = cf.getint('parameter', 'freqs')
    username = cf.get('database', 'username')
    password = cf.get('database','password')
    tns = cf.get('database', 'tns')
    databasename = cf.get('database', 'databasename')
    table = cf.get('database', 'table')
    tabledetail = cf.get('database', 'tabledetail')
    log.info("Read configuration file end")

    # TODO read the data
    log.info('Read data begin')
    datapath = path + u'\\data'
    listDir = os.listdir(datapath)
    filePath = os.path.join(datapath,listDir[0])
    df, rawdata = readdata(filePath)
    log.info('Read Data end')

    # TODO start the analysis
    items = getitems(rawdata)
    print(len(items))
    transactions = [line.split(',') for line in items]
    result = []
    itemsets = []
    for itemset, support in fp_growth.find_frequent_itemsets(transactions, freqs, True):
        result.append((itemset, support))
    results = sorted(result, key=lambda i: len(i[0]), reverse=True)

    # drop frequent 1-itemsets
    resl = []
    for itemset, support in results:
        if len(itemset) > 1:
            resl.append((itemset, support))
    # print resl
    print(len(resl))

    # remove all subsets
    res1 = sorted(resl, key=lambda i: len(i[0]), reverse=True)
    res2 = [resl[0][0]]
    frItems = [resl[0]]
    for i, support in resl:
        TF = []
        for j in res2:
            TF.append(str(np.in1d(i, j).all()))
        if 'True' in TF:
            continue
        else:
            res2.append(i)
            frItems.append((i, support))
    print(len(frItems))
    print frItems

    lists = []
    for index,(i,support) in enumerate(frItems):
        df_person = df[df['indv_id'].isin(i)]
        df_person['patientCount'] = len(i)
        df_person['id'] = shortuuid.uuid()
        df_person['preNum'] = len(i)
        # find the admission times when they were frequently admitted together
        time = df_person['indv_id'].groupby(df_person['intime']).count().reset_index()
        time.rename(columns={'indv_id':'times'},inplace=True)
        presonNum = len(i)-1
        time = time[time['times']>presonNum]
        timelist = time['intime']
        df_person = df_person[df_person['intime'].isin(timelist)]
        df_person['sumCost'] = sum(df_person[u'医疗总费用'])
        df_person['sumClaimCost'] = sum(df_person[u'报销金额'])
        df_person['fre'] = len(df_person)/len(i)
        lists.append(df_person)
        i.append(support)
    result = pd.concat(lists)
    result['department']= ''
    result['doctor'] = ''
    print result.head()

    # TODO output
    filename1 = u'\\su_illegal_seek_exception.csv'
    filename2 = u'\\su_illegal_seek_exception_detail.csv'
    outputfile1 = resultPath + filename1
    outputfile2 = resultPath + filename2

    # main table output
    res1 = result[['id',u'患者姓名']].drop_duplicates()
    res1 = res1[u'患者姓名'].groupby(res1['id']).apply(lambda x: ','.join(x)).reset_index()
    res2 = result[['id', 'intime']].drop_duplicates()
    res2 = res2['intime'].groupby(res2['id']).apply(lambda x: ','.join(x)).reset_index()
    result2 = pd.merge(res1,res2,on='id',how='left')
    res3 = result[['id', u'医院名称']].drop_duplicates()
    res3 = res3[u'医院名称'].groupby(res3['id']).apply(lambda x: ','.join(x)).reset_index()
    result2 = pd.merge(result2, res3, on='id', how='left')
    res4 = result[['id',u'医院名称', u'医院所在分中心',]].drop_duplicates()
    res4 = res4[u'医院所在分中心'].groupby(res4['id']).apply(lambda x: ','.join(x)).reset_index()
    result2 = pd.merge(result2, res4, on='id', how='left')
    res5 = result[['id', u'医院等级',u'医院名称']].drop_duplicates()
    res5 = res5[u'医院等级'].groupby(res5['id']).apply(lambda x: ','.join(x)).reset_index()
    result2 = pd.merge(result2, res5, on='id', how='left')

    result2.rename(columns={u'患者姓名':'involvedPatientNames', 'intime':'inHosTime',u'医院名称':'hospitalName',u'医院等级':'hospitalLevel',u'医院所在分中心':'hospitalArea'},inplace=True)
    result3 = result[['id','sumCost','sumClaimCost','doctor','fre','patientCount']]

    su_illegal_seek_exception = pd.merge(result2,result3,left_on='id',right_on='id',how='left')
    su_illegal_seek_exception.drop_duplicates(inplace=True)
    su_illegal_seek_exception.rename(columns={'sumCost':'allMoney','sumClaimCost':'bcMoney','doctor':'doctorName','fre':'frequency'},inplace=True)
    su_illegal_seek_exception.to_csv(outputfile1,encoding='gb18030',index=False)
Example no. 31
    def get_freq_itemsets(self, min_sup, max_len):

        fi = []
        if self.path == '../datasets-space/kosarak-full-space.data':
            f = open('kosarak-minsupp-0.6-percent.txt')

            for line in f:
                tokens = line.split(',')
                itemsetstr = tokens[0]
                itemsetstr = itemsetstr.strip('[]')

                itemset = []
                for item in itemsetstr.split():
                    itemset.append(int(item))

                sup = int(tokens[1])
                if sup < min_sup:
                    break

                if len(itemset) <= max_len:
                    fi.append(self.Itemset(itemset, sup))

            f.close()
            return fi

        if self.path == '../datasets-space/aol-full-space.data':
            f = open('aol-minsupp-0.2-percent.txt')

            for line in f:

                tokens = line.split()
                itemset = []
                for item in tokens[:-2]:
                    itemset.append(int(item))

                sup = int(tokens[-2])

                if sup < min_sup:
                    break

                if len(itemset) <= max_len:
                    fi.append(self.Itemset(itemset, sup))

            #print "test loading aol, fi = ", fi

            f.close()
            return fi

        if max_len == 1:
            x = 0
            for item, coverage in self.items.iteritems():
                sup = len(coverage)

                if sup >= min_sup:
                    x += 1
                    fi.append(self.Itemset([
                        item,
                    ], sup))
        else:
            x = 0
            for itemset, sup in find_frequent_itemsets(self.transactions,
                                                       min_sup,
                                                       include_support=True):
                if len(itemset) > max_len:
                    continue
                fi.append(self.Itemset(itemset, sup))
                x += 1


#				if x % 100 == 0:
#logging.debug('%d frequent itemsets obtained'%x)

#		logging.debug('Total for min sup %d = %d frequent itemsets'%(min_sup,x))

        fi.sort(key=lambda (i): i.sup, reverse=True)

        return fi
Example no. 32
def getFrequentItem(filepath):
    labelDic={}
    dicfile=open('alllabelDic.pkl','rb')
    labelDic=pickle.load(dicfile)
    invertlabelDic=dict(izip(labelDic.itervalues(),labelDic.iterkeys()))
    # for k in invertlabelDic:
    #     print invertlabelDic[k]
    #
    # for k in labelDic:
    #     print labelDic[k]

    dic_labelTag=[]
    # labelTag=np.loadtxt(filepath,dtype=str,delimiter=',',usecols=(4,))
    # dic_labelTag=[]
    # for item in labelTag:
    #     if item in labelDic.keys():
    #         #print (labelDic[item])
    #         dic_labelTag.append(labelDic[item])
    # print(dic_labelTag)
    # frequentSet=[]  #frequentSet is the set of frequent item,like:[[['1'], 4], [['2', '1'], 4]] first is frequent tag,second is the support dgree.
    # for itemset, support in find_frequent_itemsets(dic_labelTag, 4, True):
    #     #frequentSet.append([itemset,support])
    #     print itemset,support


    labelTag=np.loadtxt(filepath,dtype=str,delimiter=',',usecols=(3,4))
    time=1
    while time<=48:
        tempLabel=[]
        for item in labelTag:

            labtltime = str2timeNum(item[0])  # here labeltime is a number, as we cut each hour into two pieces of time (30 min)
            if labtltime>= (time-1)*30 and labtltime<=time*30:
                if item[1] in labelDic.keys():
                    tempLabel.append(labelDic[item[1]])
                else:
                    print item[1],'-------------------------------------'
                    tempLabel.append('999999999')
        if tempLabel:
            dic_labelTag.append(tempLabel)
        time+=1

    #print(dic_labelTag)
    # dic_labelTag=[]
    # for item in labelTag:
    #     if item in labelDic.keys():
    #         #print (labelDic[item])
    #         dic_labelTag.append(labelDic[item])
    # print(dic_labelTag)
    frequentSet = []  # frequentSet is the list of frequent items, like [[['1'], 4], [['2', '1'], 4]]: the first element is the frequent tag set, the second is the support degree
    for itemset, support in find_frequent_itemsets(dic_labelTag,0.2, True):
        frequentSet.append([itemset,support])


    savefile=open(filepath.replace('RCed_stoppoint.txt','itemfrequence.txt'),'w')
    for item, support in sorted(frequentSet, key=lambda (item, support): support):
        #print item,support
        if len(item)==1:
            savefile.write(invertlabelDic[item[0]])
            savefile.write('\n')
        else:
            for index in range(len(item)-1):
                savefile.write(invertlabelDic[item[index]])
                savefile.write(',')
            savefile.write(invertlabelDic[item[len(item)-1]]    )
            savefile.write('\n')
    savefile.close()

    return dic_labelTag,frequentSet
Example no. 33

#-------------------------------------------------------
def parse_file(path):
    '''Parse the json file and create a dict'''
    data = dict()
    with open(path, 'r') as infile:
        data = json.load(infile)
    infile.close()
    print('Total No of transactions: ', len(data))
    trans_list = list()
    for i in range(len(data)):
        trans_list.append(list(map(str, data[i]['Items'])))

    return trans_list


#----------------------------------------------------------
if __name__ == "__main__":
    transactions = parse_file('trans_data.json')
    result = []
    transac = transactions[0:20]
    minsup = int(input('Enter minimum support: '))
    for itemset, support in find_frequent_itemsets(
            transac, minsup,
            True):  # 2nd parameter is the minimum support value.
        result.append((itemset, support))

    result = sorted(result, key=lambda i: i[0])
    for itemset, support in result:
        print(str(itemset) + ' ' + str(support))
Example no. 34
    for conseq in H:
        conf = supportData[freqSet]/supportData[freqSet-conseq] #calc confidence
        if conf >= minConf: 
            print(freqSet-conseq,'-->',conseq,'conf:',conf)
            brl.append((freqSet-conseq, conseq, conf))
            prunedH.append(conseq)
    return prunedH

def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    m = len(H[0])
    if (len(freqSet) > (m + 1)): #try further merging
        Hmp1 = aprioriGen(H, m+1)#create Hm+1 new candidates
        Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
        if (len(Hmp1) > 1):    #need at least two sets to merge
            rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)
            
def pntRules(ruleList, itemMeaning):
    for ruleTup in ruleList:
        for item in ruleTup[0]:
            print(itemMeaning[item])
        print("           -------->")
        for item in ruleTup[1]:
            print(itemMeaning[item])
        print("confidence: %f" % ruleTup[2])
        print()       #print a blank line
        
A, B = apriori(loadDataSet(), 0.4)
print(A, B)

print (list(find_frequent_itemsets(loadDataSet(), 0.2, True)))
Example no. 35
def find_frequent_pattern(read_filename1, read_filename2, write_filename1, write_filename2, write_filename3):
    
    # support threshold for frequent itemset mining
    ##### this value can be adjusted to produce different results for comparison experiments
    minimun_support = 25
    #####
    
    # frequent patterns and their corresponding lengths and supports
    frequent_patterns = []  # 2-D list of ints
    length_all = []
    support_all = []
    
    word_list = []
    f0 = open(read_filename2, 'rb')
    line = f0.readline()
    while line:
        word_list.append(line.split()[0])
        line = f0.readline()
    f0.close()
    
    '''
    # mine frequent itemsets and obtain their supports
    # frequent itemset mining uses the FP-Growth algorithm
    # see https://github.com/enaeseth/python-fp-growth
    '''
    trans = generate_transactions(read_filename1, word_list)
    
    # find_frequent_itemsets returns a "generator"
    for each, support in find_frequent_itemsets(trans, minimun_support, include_support=True):
        each.sort()
               
        frequent_patterns.append(each)
        length_all.append(len(each))
        support_all.append(support)
        
    print 'Total frequent patterns: %d' % len(frequent_patterns)           
    
    # sort frequent patterns by length, longest first
    fl = zip(frequent_patterns, length_all, support_all)
    fl1 = sorted(fl, key = itemgetter(1), reverse = True)
    
    frequent_patterns = []
    result_length_support = []
    
    # filter the frequent itemsets
    for each in fl1:
        
        tag = 0
        for each1 in frequent_patterns:
            if len(set(each[0]) & set(each1)) == len(each[0]):
                tag = 1
                break
            elif np.true_divide(len(set(each[0]) & set(each1)), len(set(each[0]) | set(each1))) > 0.4999:
                ##### this similarity threshold can be adjusted for comparison experiments
                #####
                tag = 1
                break
            else:
                pass
            
        if tag == 0:               
            frequent_patterns.append(each[0])
            result_length_support.append(str(each[1]) + " " + str(each[2]))
    
    #result_length_support = []
    
    real_word_trans = []
    trans_to_string = []
    
    for each in frequent_patterns:
        trans_to_string.append(" ".join([str(x) for x in each]))
        real_word_list = map_trans_to_word(each, word_list)
        real_word_trans.append(" ".join(real_word_list))
    
    quick_write_list_to_text(trans_to_string, write_filename1)
    quick_write_list_to_text(result_length_support, write_filename2)
    quick_write_list_to_text(real_word_trans, write_filename3)
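
A small worked sketch of the filtering test applied above: a candidate pattern is dropped if it is contained in an already-kept pattern, or if its Jaccard-style overlap with one exceeds roughly 0.5. The patterns below are illustrative.

import numpy as np

kept = [1, 2, 3]        # an already accepted pattern
candidate = [2, 3, 4]
inter = set(candidate) & set(kept)   # {2, 3}
union = set(candidate) | set(kept)   # {1, 2, 3, 4}
contained = len(inter) == len(candidate)                        # False
too_similar = np.true_divide(len(inter), len(union)) > 0.4999   # 2/4 = 0.5 -> True
print(contained, too_similar)  # the candidate would be filtered out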
Example no. 36
        filtered = [w for w in result
                    if w not in english_stopwords
                    and re.match(r'^\d+$', w) is None]
        yield filtered


def content_from_file(filename):
    for x in transform_data(read_json(filename),
                            get_item=itemgetter('object')):
        yield x


if __name__ == '__main__':
    p = ArgumentParser()
    p.add_argument('input', nargs=1)
    p.add_argument('-s', '--minsup', type=int, default=1000,
                   help='Minimum itemset support')
    p.add_argument('-c', '--category', type=str, default="spam",
                   help='Class to perform search on')
    args = p.parse_args()

    input = args.input[0]
    if isdir(input):
        content_gen = content_from_dir(input, category=args.category)
    else:
        content_gen = content_from_file(input)

    for itemset, support in \
            find_frequent_itemsets(content_gen, args.minsup, True):
        print str(support) + ' ' + ' '.join(itemset)
Example no. 37
def spur(read_directory, write_directory1, write_directory2):
    '''
    SPUR compression
    Summarization via Pattern Utility and Ranking
    Summarize a batch of transactions with low compression ratio and high quality.

    Xintian Yang, Amol Ghoting, Yiye Ruan, A Framework for Summarizing and Analyzing Twitter Feeds, KDD'12, August 12–16, 2012, Beijing, China.

    :param read_directory: directory of VSM files
    :param write_directory1: directory for the compressed results
    :param write_directory2: directory for the compression-ratio output
    '''

    # support threshold for frequent itemset mining
    minimun_support = 60

    # false-positive rate
    f = 0.1

    # compression ratios
    ratio = []

    # compression times
    compress_time = []

    # total number of files
    file_number = sum([len(files) for root, dirs, files in os.walk(read_directory)])

    # loop over all VSM files
    for i in range(file_number):
        print 'Batch: %d' % (i + 1)
        start = time.clock()
        
        '''
        # mine frequent itemsets and obtain their supports
        # frequent itemset mining uses the FP-Growth algorithm
        # see https://github.com/enaeseth/python-fp-growth
        '''
        o_trans, trans_size = generate_transactions(read_directory + '/' + str(i + 1) + '.txt')
        
        # compression budget: the upper bound is 0.6 of the total number of items in the original transactions
        #M = 0.7 * trans_size
        M = 85000
        # frequent patterns and their corresponding lengths and supports
        frequent_patterns = []  # 2-D list of ints
        length_all = []
        support_all = []
        
        # find_frequent_itemsets returns a "generator"
        for each, support in find_frequent_itemsets(o_trans, minimun_support, include_support=True):
            each.sort()
            frequent_patterns.append(each)
            length_all.append(len(each))
            support_all.append(support)
        
        print len(frequent_patterns)
        # sort frequent patterns by length, longest first
        fl = zip(frequent_patterns, length_all, support_all)
        fl1 = sorted(fl, key = itemgetter(1), reverse = True)
        
        '''
        # to make it easier to represent the original transactions, each frequent pattern is represented by a string used as its id
        # each pattern id has the form "p*", where * is a number starting from 0
        '''
        # dict mapping a pattern id to the items of that pattern
        # type: 'id': int[item]
        id_pattern_dict = {}
        # dict mapping a pattern id to the pattern's length
        # type: 'id': int
        pattern_length_dict = {}
        # dict mapping a pattern id to the pattern's support
        # type: 'id': int
        pattern_support_dict = {}
        
        id1 = 0
        for each in fl1:
            id_pattern_dict['p' + str(id1)] = each[0]
            pattern_length_dict['p' + str(id1)] = each[1]
            pattern_support_dict['p' + str(id1)] = each[2]         
            id1 += 1

        # dict mapping a pattern id to the transactions that contain the pattern
        # transactions are numbered (as ints) by their order in the original data
        # type: 'id': int[trans]
        pattern_trans_dict = {}
        for each in id_pattern_dict.keys():
            value_list = []
            for j in range(len(o_trans)):
                if set(id_pattern_dict[each]).issubset(o_trans[j]):  # no need to convert to a set later
                    value_list.append(j)
            
            pattern_trans_dict[each] = value_list
        
        # sub-patterns of each frequent pattern, excluding the pattern itself
        # type: 'id': str[id]
        sub_pattern_dict = {}
        # super-patterns of each frequent pattern, excluding the pattern itself
        # type: 'id': str[id]
        super_pattern_dict = {}
        # patterns that overlap each frequent pattern but fall into neither of the two cases above
        # type: 'id': str[id]
        overlap_pattern_dict = {}
        for each in id_pattern_dict.keys():
            value_list1 = []
            value_list2 = []
            value_list3 = []
            for each1 in id_pattern_dict.keys():
                if each != each1:
                    intersection = set(id_pattern_dict[each1]) & set(id_pattern_dict[each])
                    if intersection == set():
                        pass
                    elif set(id_pattern_dict[each1]) == intersection:
                        value_list1.append(each1)
                    elif set(id_pattern_dict[each]) == intersection:
                        value_list2.append(each1)
                    else:
                        value_list3.append(each1)
                else:
                    pass
            
            sub_pattern_dict[each] = value_list1
            super_pattern_dict[each] = value_list2
            overlap_pattern_dict[each] = value_list3
        
        '''
        Initialize the utility values.
        Returns a dict mapping a pattern id to that pattern's utility,
        and a dict mapping a pattern id to the list of transactions containing the pattern.
        '''
        pattern_utility , pattern_coverage_set = utility_f(id_pattern_dict, pattern_trans_dict, pattern_support_dict, sub_pattern_dict, f)
        
        #获取utility值最大的pattern
        max_index = np.argmax(pattern_utility.values())
        Q_top = pattern_utility.keys()[max_index]
        
        # Copy of pattern_utility
        Q_utility = pattern_utility.copy()
        
        '''
        Rewrite the original transactions with the selected patterns, choosing
        greedily by utility and updating the remaining utility values as we go.
        '''
        #current_size = trans_size
        current_size = 0
        iter_count = 0
        
        while current_size < M:
            # The currently selected pattern
            this_pattern = Q_top
            
            if Q_utility[this_pattern] >= 0.0:
                
                '''
                Rewrite the original transactions with the current frequent
                pattern this_pattern (this_pattern is the pattern's key, a string).
                '''
                replace_trans_with_pattern(o_trans, this_pattern, id_pattern_dict[this_pattern], pattern_coverage_set[this_pattern])
                # o_trans has now been modified
                # Note: from this point on o_trans contains both ints and string ids
                
                '''
                After the current pattern has been applied, update the utility
                values of the remaining patterns.
                '''
                for each1 in super_pattern_dict[this_pattern]:
                    covered_set = set(pattern_coverage_set[each1]) & set(pattern_coverage_set[this_pattern])
                    pattern_utility[each1] = pattern_utility[each1] - len(id_pattern_dict[this_pattern]) * len(covered_set)
                    if each1 in Q_utility.keys():
                        Q_utility[each1] = pattern_utility[each1]
                
                for each2 in sub_pattern_dict[this_pattern]:
                    covered_set = set(pattern_coverage_set[each2]) & set(pattern_coverage_set[this_pattern])
                    pattern_utility[each2] = pattern_utility[each2] - (len(id_pattern_dict[each2]) - 1) * len(covered_set)
                    
                    if each2 in Q_utility.keys():
                        Q_utility[each2] = pattern_utility[each2]
                    
                    pattern_coverage_set[each2] = [x for x in pattern_coverage_set[each2] if x not in covered_set]
                    if (len(pattern_coverage_set[each2]) == 0) and (each2 in Q_utility.keys()):
                        del Q_utility[each2]
                
                for each3 in overlap_pattern_dict[this_pattern]:
                    covered_set = set(pattern_coverage_set[each3]) & set(pattern_coverage_set[this_pattern])
                    pattern_utility[each3] = pattern_utility[each3] - len(covered_set) * len(set(id_pattern_dict[each3]) & set(id_pattern_dict[this_pattern]))
                    if each3 in Q_utility.keys():
                        Q_utility[each3] = pattern_utility[each3]
                
                #if len(pattern_coverage_set[this_pattern]) == 0:
                    #flag += 1
                #else:
                    #flag = 0
                    
                current_size = current_size + len(pattern_coverage_set[this_pattern])
                iter_count += 1
                if iter_count >= 50000:
                    break
                
                #if flag == 3:
                    #break
                #current_size = np.sum([len(x) for x in o_trans])
                #print current_size
                
                # The current pattern has been applied; remove it from the queue
                del Q_utility[this_pattern]
                
                # Re-select the remaining pattern with the largest utility value
                if Q_utility != {}:
                    max_index = np.argmax(Q_utility.values())
                    Q_top = Q_utility.keys()[max_index]
                else:
                    break
                            
            else:
                break  
        
        #final_size = np.sum([len(x) for x in o_trans])

        final_size = current_size
        print 'Final size: ', final_size
        this_ratio = np.true_divide(final_size, trans_size)
        print 'Ratio: ', this_ratio
        
        ratio.append(str(this_ratio))
        
        interval = time.clock() - start
        print 'Time: %f' % interval
        compress_time.append(str(interval))
        
        write_list_to_text_by_row(o_trans, write_directory1 + '/' + str(i + 1) + '.txt') 
    
    quick_write_list_to_text(ratio, write_directory2 + '/ratio.txt')
    quick_write_list_to_text(compress_time, write_directory2 + '/compress_time.txt')
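The compression loop above relies on replace_trans_with_pattern, which is defined elsewhere in the original code. As a minimal, hypothetical sketch, assuming the helper simply substitutes the pattern's items with the string pattern id in every covered transaction (which would explain why o_trans afterwards mixes ints and string ids):

def replace_trans_with_pattern(o_trans, pattern_id, pattern_items, covered_trans):
    # Hypothetical sketch: in every transaction covered by this pattern,
    # drop the pattern's items and append the string pattern id instead.
    pattern_items = set(pattern_items)
    for t in covered_trans:
        o_trans[t] = [item for item in o_trans[t] if item not in pattern_items]
        o_trans[t].append(pattern_id)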
Exemplo n.º 38
0
def find_frequent_pattern(read_filename1, read_filename2, write_filename1,
                          write_filename2, write_filename3):

    # Minimum support for frequent itemset mining
    ##### This value can be tuned to produce different results for experimental comparison
    minimun_support = 25
    #####

    # Lists of frequent patterns and their corresponding lengths and supports
    frequent_patterns = []  # 2-D list of ints
    length_all = []
    support_all = []

    word_list = []
    f0 = open(read_filename2, 'rb')
    line = f0.readline()
    while line:
        word_list.append(line.split()[0])
        line = f0.readline()
    f0.close()
    '''
    Mine the frequent itemsets and obtain their supports.
    Frequent itemset mining uses the FP-Growth algorithm.
    Reference: https://github.com/enaeseth/python-fp-growth
    '''
    trans = generate_transactions(read_filename1, word_list)

    # find_frequent_itemsets returns a generator
    for each, support in find_frequent_itemsets(trans,
                                                minimun_support,
                                                include_support=True):
        each.sort()

        frequent_patterns.append(each)
        length_all.append(len(each))
        support_all.append(support)

    print 'Total frequent patterns: %d' % len(frequent_patterns)

    # Sort frequent patterns by length, longest first
    fl = zip(frequent_patterns, length_all, support_all)
    fl1 = sorted(fl, key=itemgetter(1), reverse=True)

    frequent_patterns = []
    result_length_support = []

    # Filter the frequent itemsets: drop subsets and near-duplicates of patterns already kept
    for each in fl1:

        tag = 0
        for each1 in frequent_patterns:
            if len(set(each[0]) & set(each1)) == len(each[0]):
                tag = 1
                break
            elif np.true_divide(len(set(each[0]) & set(each1)),
                                len(set(each[0]) | set(each1))) > 0.4999:
                ##### The similarity threshold can be tuned for experimental comparison
                #####
                tag = 1
                break
            else:
                pass

        if tag == 0:
            frequent_patterns.append(each[0])
            result_length_support.append(str(each[1]) + " " + str(each[2]))

    #result_length_support = []

    real_word_trans = []
    trans_to_string = []

    for each in frequent_patterns:
        trans_to_string.append(" ".join([str(x) for x in each]))
        real_word_list = map_trans_to_word(each, word_list)
        real_word_trans.append(" ".join(real_word_list))

    quick_write_list_to_text(trans_to_string, write_filename1)
    quick_write_list_to_text(result_length_support, write_filename2)
    quick_write_list_to_text(real_word_trans, write_filename3)
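The filtering loop above keeps a pattern only if it is neither a subset of, nor too similar (Jaccard similarity above roughly 0.5) to, any pattern kept so far. A standalone sketch of the same idea; the function and parameter names here are illustrative, not from the original code:

def filter_patterns(patterns_by_length_desc, max_jaccard=0.4999):
    # patterns_by_length_desc: frequent patterns sorted by length, longest first
    kept = []
    for pattern in patterns_by_length_desc:
        p = set(pattern)
        redundant = False
        for q in kept:
            inter = len(p & set(q))
            union = len(p | set(q))
            if inter == len(p) or float(inter) / union > max_jaccard:
                redundant = True
                break
        if not redundant:
            kept.append(pattern)
    return kept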
Exemplo n.º 39
0
def searchForPattern(cohorte, maxTry, maxNumberOfFrequentPattern,
                     patternSaveFileName):
    """
	-> Generate pattern (i.e frequent itemsets) from cohorte, with maxTry.
	   results are saved in a .csv file (patternSaveFileName).
	-> cohorte is a list of list
	-> maxTry is an int
	-> patternSaveFileName is a string, save file should be located in
	   DATA/PATTERN folder.
	-> maxNumberOfFrequentPattern is an int, the max number of frequent pattern to Generate
	   (setup ti avoid memory problem)

	-> TODO:
		- re-check the algorithm
		- limit the retrieval of the same patterns
		- clean doublon in patternSaveFileName

	"""

    # Parameter initialisation
    minsup = len(cohorte)
    minLenOfPattern = len(cohorte[0])
    numberOfTry = 0
    tunningPatternLen = 1
    tunnigMinSup = 0
    triedToIcreaseMinLenPattern = 0  # must be initialised before its first use below
    pattern_save = open(patternSaveFileName, "a")
    pattern_save.close()

    while True:

        ######################################
        # check the number of attempts made  #
        ######################################
        if (numberOfTry >= maxTry):
            break

        ###########################
        # Pattern generation      #
        ###########################
        listOffrequentItemset = []
        for itemset in find_frequent_itemsets(cohorte, minsup):
            listOffrequentItemset.append(itemset)

        ######################################################
        # check the number of patterns: if no pattern was    #
        # generated, lower the minsup value used to generate #
        # the patterns                                       #
        ######################################################
        if (len(listOffrequentItemset) > 0):
            listOfItemSize = []
            print "Found " + str(
                len(listOffrequentItemset
                    )) + " frequent itemsets with minsup = " + str(minsup)

            ##############################################################
            # Write the patterns to a save file: each line corresponds   #
            # to one pattern, the pattern's elements are separated by    #
            # ';', and the last field of the line is the minsup used to  #
            # generate the pattern (i.e. the pattern's support)          #
            ##############################################################
            pattern_save = open(patternSaveFileName, "a")
            for element in listOffrequentItemset:
                lineToWrite = ""
                for item in element:
                    lineToWrite = lineToWrite + item + ";"
                listOfItemSize.append(len(element))
                lineToWrite = lineToWrite + str(minsup)
                pattern_save.write(lineToWrite + "\n")
            pattern_save.close()

            if (len(listOffrequentItemset) > maxNumberOfFrequentPattern):
                print "max number of patterns reached, cancel mining"
                break

            ######################################################
            # check the pattern sizes: if the largest pattern    #
            # does not pass the check, alternately adjust the    #
            # minsup value and the expected pattern length       #
            ######################################################
            maxSize = max(listOfItemSize)
            if (maxSize < minLenOfPattern):
                if (tunnigMinSup):
                    tunnigMinSup = 0
                    tunningPatternLen = 1
                    minsup = minsup - 1
                    triedToIcreaseMinLenPattern = 0
                elif (tunningPatternLen):
                    tunningPatternLen = 0
                    tunnigMinSup = 1
                    minLenOfPattern = minLenOfPattern - 1
            else:
                if (not triedToIcreaseMinLenPattern):

                    ###################################################
                    # the pattern size is good, but we have only just #
                    # changed minsup, so increase the expected        #
                    # pattern length to see whether a larger pattern  #
                    # can be caught                                   #
                    ###################################################
                    minLenOfPattern = minLenOfPattern + 1
                    triedToIcreaseMinLenPattern = 1

                else:
                    ######################################################
                    # the pattern size is good and we have already tried #
                    # to increase the expected pattern length, so stop   #
                    # the search here                                    #
                    ######################################################
                    print "found a good pattern"
                    break

        else:
            minsup = minsup - 1
        numberOfTry += 1
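Stripped of the length-tuning bookkeeping, the core of searchForPattern lowers minsup until FP-Growth returns at least one itemset or the attempt budget is exhausted. A simplified, hypothetical sketch of that core loop:

from fp_growth import find_frequent_itemsets

def mine_with_relaxed_minsup(cohorte, maxTry):
    # Hypothetical simplification: start with minsup equal to the cohort size
    # and decrease it until something frequent is found or maxTry runs out.
    minsup = len(cohorte)
    for _ in range(maxTry):
        itemsets = list(find_frequent_itemsets(cohorte, minsup))
        if itemsets:
            return itemsets, minsup
        minsup -= 1
        if minsup < 1:
            break
    return [], minsup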
Exemplo n.º 40
0
for i in range(numOfFreSet):
    if len(sortedFreSet[i][0]) <= 1:
        sortedFreSet = sortedFreSet[:i]
        break

# for i in range(numOfFreSet):
items = sortedFreSet[0][0]
sup = sortedFreSet[0][1]
itemList = []
for i in range(1,len(items)):
    itemList += list(itertools.combinations(items, i))
print(itemList)
dict = {}
for i in range(len(itemList)):
    count = 0
    for j in range(numOfData):
        if set(itemList[i]).issubset(data[j]):
            count += 1
    dict[itemList[i]] = count

data_1 = data[0]
print(set(data_1))
print(set(itemList[0]))
'''

import fp_growth as fpg

fre = fpg.find_frequent_itemsets(data, 0.01, True)
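# NOTE: in python-fp-growth the minimum support is an absolute transaction count,
# not a fraction, so passing 0.01 behaves like a support threshold of one
# transaction; a 1% relative threshold would need something like
# int(math.ceil(0.01 * len(data))).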
print(list(fre))
Exemplo n.º 41
0
def enaeseth_fpgrowth(minsup, item_no):
    start = datetime.now()
    transactions, y_res = merge_data(item_no)
    for itemset in find_frequent_itemsets(transactions, minsup):
        print(itemset)
    print(datetime.now() - start)
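Because find_frequent_itemsets returns a generator, the output can be capped instead of printed in full when minsup is small. A small illustrative variation (the helper name here is made up):

from itertools import islice
from fp_growth import find_frequent_itemsets

def preview_itemsets(transactions, minsup, limit=20):
    # Print only the first `limit` frequent itemsets produced by the generator.
    for itemset in islice(find_frequent_itemsets(transactions, minsup), limit):
        print(itemset)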
Exemplo n.º 42
0
        items = line.split()
        # pulling a single line or transaction and putting it in items using spaces as delimiters
        for item in items:
            transaction.append(item)
            # appending each item in items to the list transaction to form the list of items in a single transaction
        transactions.append(transaction)
        # appends each transaction into transactions forming a list of lists
        transaction = []
        # resetting transaction for the next line

from fp_growth import find_frequent_itemsets
# importing the fp-tree library

frequent_sets = []

for itemset in find_frequent_itemsets(transactions, 100, include_support=True):
    # mining the itemsets that satisfy a min-sup of 100 from our transaction list
    frequent_sets.append(itemset[1])
    # appends the itemset's support to the list frequent_sets
    frequent_sets.append(itemset[0])
    # appends the itemset to the list frequent_sets

print frequent_sets
# prints the frequent sets; items at odd indexes in the list are the frequent itemsets,
# and the element at the index just before each itemset is that itemset's support.
# The csv file is formatted with indexes k and k+1 as support/itemset pairs to make finding the closed and maximal sets easier.

print len(frequent_sets) / 2
# print the number of frequent sets found

print datetime.now() - startTime
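The alternating support/itemset layout described above is meant to make it easier to identify the closed and maximal sets afterwards. As an illustrative sketch that assumes exactly that layout (supports at even indexes, itemsets at odd indexes), the maximal itemsets could be recovered like this:

def maximal_itemsets(frequent_sets):
    # frequent_sets alternates support (even index) and itemset (odd index).
    # An itemset is maximal if no other frequent itemset is a proper superset of it.
    itemsets = [set(frequent_sets[i]) for i in range(1, len(frequent_sets), 2)]
    maximal = []
    for i, s in enumerate(itemsets):
        if not any(i != j and s < t for j, t in enumerate(itemsets)):
            maximal.append(sorted(s))
    return maximal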
Exemplo n.º 43
0
def csv_read():
    current_dir = os.getcwd()
    counter = 0
    csv_file = current_dir + '/my/yk--my.csv'
    yangka_txt = current_dir + '/yangka.txt'
    file_yangka = codecs.open(yangka_txt, "w")

    with open(csv_file, 'rb') as csvfile:
        csvreader = _csv.reader(csvfile, delimiter=' ', quotechar='|')
        negative_chinese = '\xe5\x90\xa6'
        postive_chinese = '\xe6\x98\xaf'
        billvalue = ''
        targetvalue = ''
        index_number = 0
        yangka_data = [['0' for i in range(0, 2)] for j in range(0, 90000)]

        for row in csvreader:
            element = str(', '.join(row))
            _element = str(', '.join(row).decode("gb2312"))

            bill_one = _element.split(',')[0]
            bill_two = _element.split(',')[1]
            bill_three = _element.split(',')[2]
            bill_four = _element.split(',')[3]
            bill_five = _element.split(',')[4]
            bill_six = _element.split(',')[5]
            target_one = _element.split(',')[6]
            target_two = _element.split(',')[7]
            target_three = _element.split(',')[8]
            target_four = _element.split(',')[9]
            target_five = _element.split(',')[10]
            target_six = _element.split(',')[11]

            print _element
            # billvalue =  billvalue_make(bill_one,bill_two,bill_three,bill_four,bill_five,bill_six)
            # targetvalue = targetvalue_make(target_one,target_two,target_three,target_four,target_five,target_six)
            billvalue = billvalue_make1(bill_one, bill_two, bill_three,
                                        bill_four)
            targetvalue = targetvalue_make1(target_one, target_two,
                                            target_three, target_four)
            yangka_data[index_number][0] = billvalue
            yangka_data[index_number][1] = targetvalue
            index_number = index_number + 1

        for (itemset,
             support) in find_frequent_itemsets(yangka_data, 100, True):
            print >> file_yangka, itemset, support

        # temp_array: temporary array used to read the raw lines generated by fp_growth and then adjust their item order
        # example: [T1001, 'BYYYY'] 528 -> ['BYYYY', 'T1001'] 528
        temp_array = ['0' for i in range(0, 10000)]
        temp_array_indexer = 0
        file_data = open(yangka_txt)
        for data in file_data:
            print 'data ' + str(data.replace('\n', ' ').replace('\r', ''))
            if len(str(data.split(']')[0])) > 8:
                # adjust_write_sequence: [T1001, 'BYYYY'] 528 -> ['BYYYY', 'T1001'] 528
                # .replace('\n', ' ').replace('\r', '') strips the newline at the end of each line of yangka.txt
                temp_array[temp_array_indexer] = adjust_write_sequence(
                    data).replace('\n', ' ').replace('\r', '')
                temp_array_indexer = temp_array_indexer + 1
                counter = counter + 1

        # read the data from temp_array, sort it and write it to yangka_sort.txt
        yanka_sort_file = current_dir + '/yangka_sort.txt'
        yangka_sort = codecs.open(yanka_sort_file, "w")
        yangka_data_mining = [['0' for i in range(0, 2)]
                              for j in range(0, counter)]
        index_counter = 0
        for m in range(0, len(temp_array)):
            data = str(temp_array[m])  # read data from temp_array
            if data != '0':
                yangka_data_mining[index_counter][0] = data.split(
                    ']')[0] + ']'  # pattern, such as '[T1001,'BYYYY']'
                yangka_data_mining[index_counter][1] = int(data.split(
                    ']')[1])  # the pattern's occurrence count, such as '528'
                index_counter = index_counter + 1
        yangka_data_mining.sort(key=operator.itemgetter(1),
                                reverse=True)  # sort them.
        for yangka_data in yangka_data_mining:
            yangka_data[1] = str(yangka_data[1])
        for data in yangka_data_mining:
            print >> yangka_sort, data  # write into file  yangka_sort.txt
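adjust_write_sequence is defined elsewhere in the original code; judging from the comments ([T1001, 'BYYYY'] 528 -> ['BYYYY', 'T1001'] 528), it only reorders the items inside the brackets so that the bill pattern comes first. A hypothetical sketch consistent with that comment:

def adjust_write_sequence(line):
    # Hypothetical sketch: turn a line like "['T1001', 'BYYYY'] 528" into
    # "['BYYYY', 'T1001'] 528" by sorting the two items inside the brackets
    # (the bill pattern 'B...' then sorts in front of the target 'T...').
    itemset_part, support_part = line.split(']', 1)
    items = sorted(x.strip(" '[") for x in itemset_part.split(','))
    return str(items) + support_part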
Exemplo n.º 44
0
	#with open("./training_filenames","r") as f:
		#for line in f.readlines():
			#trainFileName_lang[line.split()[0]]=line.split()[1]
	#f = codecs.open("./allLangFiles/"+str(trainFileName_lang[lang]),'r',encoding='utf-8')
	#filewords=f.read()
	#corpus = [SnowballStemmer(languageMapping[lang]).stem(word) for word in re.split(';.?',filewords)]
	#train=corpus
	#fdist = nltk.FreqDist(w for w in corpus)
	#vocabulary = set(map(lambda x: x[0], filter(lambda x: x[1] >= 5, fdist.iteritems())))
	#train = map(lambda x: x if x in vocabulary else "*unknown*", train)
	#estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2) 
	#lm2 = NgramModel(2, train, estimator=estimator)
	#lm3=NgramModel(3, train, estimator=estimator)
	g = codecs.open('./fpg'+"_"+str(lang)+".txt",'w',encoding='utf-8')
	frequent_pattern_list=[]
	for itemset,support in find_frequent_itemsets(line_list_swr_stem_ofwr,100,True):
		#if len(itemset)== 2 :
			#print (lm2.prob(itemset[0],itemset[1:])
			#print str(itemset[0])+' ####'+str(itemset[1:])
		for item in itemset:
			g.write(item+" " )
		g.write(': ' + str(support)+' \n')
		#elif len(itemset)== 3 :
			#print (lm3.prob(itemset[0],itemset[1:])
			#print str(itemset[0])+' ####'+str(itemset[1:])
		#	g.write(str(itemset) + ' : ' + str(support)+' \n' ) # + str(lm3.prob(itemset[0],itemset[1:]))+' \n')
		#else: 
		#	g.write(str(itemset) + ' : ' + str(support)+' \n' )
		#frequent_pattern_list.append(str(itemset))
	#dictionary = corpora.Dictionary(frequent_pattern_list)
	#corpus = [dictionary.doc2bow(text) for text in frequent_pattern_list]
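The loop above writes each frequent itemset as space-separated items followed by ': <support>'. A small illustrative sketch for reading such a file back into (items, support) pairs; the function name and parsing assumptions are mine, not from the original code:

import codecs

def read_fpg_output(path):
    # Parse lines of the form "item1 item2 : 123 " back into (items, support) pairs.
    results = []
    with codecs.open(path, 'r', encoding='utf-8') as g:
        for line in g:
            if ':' not in line:
                continue
            items_part, support_part = line.rsplit(':', 1)
            results.append((items_part.split(), int(support_part)))
    return results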
Exemplo n.º 45
0
    def testDuplicate(self):
        raw = "25,52,274;71;71,274;52;25,52;274,71"
        transactions = [line.split(",") for line in raw.split(";")]

        itemsets = list(fp_growth.find_frequent_itemsets(transactions, 2))
        self.assertEqual([["25"], ["52", "25"], ["274"], ["71"], ["52"]], itemsets)