import pandas as pd
import orangecontrib.associate.fpgrowth as oaf  # assumption: the orange3-associate add-on used throughout these examples


def ResultDFToSave(rules):  # build and return the corresponding DataFrame from the rules produced by Orange3 association analysis
    returnRules = []
    for i in rules:
        temList = []
        temStr = ''
        for j in i[0]:  # handle the first frozenset (the antecedent)
            temStr = temStr + str(j) + '&'
        temStr = temStr[:-1]
        temStr = temStr + ' ==> '
        for j in i[1]:
            temStr = temStr + str(j) + '&'
        temStr = temStr[:-1]
        temList.append(temStr)
        temList.append(i[2])
        temList.append(i[3])
        temList.append(i[4])
        temList.append(i[5])
        temList.append(i[6])
        temList.append(i[7])
        returnRules.append(temList)
    return pd.DataFrame(returnRules,
                        columns=('规则', '项集出现数目', '置信度', '覆盖度', '力度', '提升度',
                                 '利用度'))


if __name__ == '__main__':
    supportRate = 0.02
    confidenceRate = 0.5
    itemsets = dict(oaf.frequent_itemsets(listToAnalysis, supportRate))
    rules = oaf.association_rules(itemsets, confidenceRate)
    rules = list(rules)
    regularNum = len(rules)
    printRules = dealRules(rules)
    result = list(oaf.rules_stats(
        rules, itemsets, len(listToAnalysis)))  # NOTE: this call consumes rules; rules is used up afterwards!
    printResult = dealResult(result)

    #################################################  save the results to an Excel file
    dfToSave = ResultDFToSave(result)
    saveRegularName = str(supportRate) + '支持度_' + str(
        confidenceRate) + '置信度_产生了' + str(regularNum) + '条规则' + '.xlsx'
    dfToSave.to_excel(saveRegularName)

    #######################################################  count the rules produced under different support and confidence thresholds
    listTable = []
    supportRate = 0.01
    confidenceRate = 0.1
    for i in range(9):
        support = supportRate * (i + 1)
        listS = []
        for j in range(9):
            confidence = confidenceRate * (j + 1)
            itemsets = dict(oaf.frequent_itemsets(listToAnalysis, support))
            rules = list(oaf.association_rules(itemsets, confidence))
            listS.append(len(rules))
        listTable.append(listS)
    dfList = pd.DataFrame(listTable,
                          index=[supportRate * (i + 1) for i in range(9)],
                          columns=[confidenceRate * (i + 1) for i in range(9)])
    dfList.to_excel('regularNum.xlsx')
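
As a minimal, self-contained sketch of the same pipeline (the toy transactions, thresholds and English column names below are made up for illustration; frequent_itemsets, association_rules and rules_stats are the orangecontrib.associate.fpgrowth functions used throughout these examples):

import pandas as pd
import orangecontrib.associate.fpgrowth as oaf

transactions = [[0, 1, 2], [0, 2], [1, 2], [0, 1]]  # made-up integer-encoded transactions
itemsets = dict(oaf.frequent_itemsets(transactions, 0.5))  # min support 50%
rules = list(oaf.association_rules(itemsets, 0.5))         # min confidence 50%
stats = list(oaf.rules_stats(rules, itemsets, len(transactions)))
# each stats row: (antecedent, consequent, support, confidence,
#                  coverage, strength, lift, leverage)
df = pd.DataFrame(
    [('&'.join(map(str, l)) + ' ==> ' + '&'.join(map(str, r)),
      supp, conf, covr, strg, lift, levr)
     for l, r, supp, conf, covr, strg, lift, levr in stats],
    columns=('rule', 'support', 'confidence', 'coverage',
             'strength', 'lift', 'leverage'))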
Example #2
        temList.append(i[7])
        returnRules.append(temList)
    return pd.DataFrame(returnRules,
                        columns=('规则', '项集出现数目', '置信度', '支持度', '力度', '提升度',
                                 '利用度'))


if __name__ == '__main__':
    supportRate = 0.004
    confidenceRate = 0.6
    itemsets = dict(oaf.frequent_itemsets(ryzd, supportRate))
    rules = oaf.association_rules(itemsets, confidenceRate)
    rules = list(rules)
    regularNum = len(rules)
    printRules = dealRules(rules)
    result = list(oaf.rules_stats(rules, itemsets,
                                  len(ryzd)))  # NOTE: this call consumes rules; rules is used up afterwards!
    printResult = dealResult(result)

    #################################################
    # save the results to an Excel file
    dfToSave = ResultDFToSave(result)
    dfToSave.to_excel(r'C:\Users\Administrator\Desktop\2.xlsx')

    #######################################################
    # count the rules produced under different support and confidence thresholds

    listTable = []
    supportRate = 0.001
    confidenceRate = 0.1
    for i in range(9):
        support = supportRate * (i + 1)
    def find_rules(self):
        if self.data is None: return
        data = self.data
        self.table.model().clear()

        n_examples = len(data)
        NumericItem = self.NumericItem
        StandardItem = self.StandardItem
        filterSearch = self.filterSearch
        itemsetMin = self.filterAntecedentMin + self.filterConsequentMin
        itemsetMax = self.filterAntecedentMax + self.filterConsequentMax
        isSizeMatch = self.isSizeMatch
        isRegexMatch = self.isRegexMatch

        X, mapping = OneHot.encode(data, self.classify)
        self.onehot_mapping = mapping
        names = {item: '{}={}'.format(var.name, val)
                 for item, var, val in OneHot.decode(mapping, data, mapping)}

        # Items that consequent must include if classifying
        class_items = {item
                       for item, var, val in OneHot.decode(mapping, data, mapping)
                       if var is data.domain.class_var} if self.classify else set()
        assert bool(class_items) == bool(self.classify)

        model = QStandardItemModel(self.table)
        for col, (label, tooltip) in enumerate([("Supp", "Support"),
                                                ("Conf", "Confidence (support / antecedent support)"),
                                                ("Covr", "Coverage (antecedent support / number of examples)"),
                                                ("Strg", "Strength (consequent support / antecedent support)"),
                                                ("Lift", "Lift (number of examples * confidence / consequent support)"),
                                                ("Levr", "Leverage ((support * number of examples - antecedent support * consequent support) / (number of examples)²)"),
                                                ("Antecedent", None),
                                                ("", None),
                                                ("Consequent", None)]):
            item = QStandardItem(label)
            item.setToolTip(tooltip)
            model.setHorizontalHeaderItem(col, item)

        #~ # Aggregate rules by common (support,confidence) for scatterplot
        #~ scatter_agg = defaultdict(list)

        # Find itemsets
        nRules = 0
        itemsets = {}
        progress = gui.ProgressBar(self, self.maxRules + 1)
        for itemset, support in frequent_itemsets(X, self.minSupport / 100):
            itemsets[itemset] = support

            if class_items and not class_items & itemset:
                continue

            # Filter itemset by joined filters before descending into it
            itemset_str = ' '.join(names[i] for i in itemset)
            if (filterSearch and
                (len(itemset) < itemsetMin or
                 itemsetMax < len(itemset) or
                 not isRegexMatch(itemset_str, itemset_str))):
                continue

            for rule in gen_assoc_rules(itemsets,
                                        self.minConfidence / 100,
                                        itemset):
                (left, right), support, confidence = rule

                if class_items and right - class_items:
                    continue
                if filterSearch and not isSizeMatch(len(left), len(right)):
                    continue
                left_str = ' '.join(names[i] for i in sorted(left))
                right_str = ' '.join(names[i] for i in sorted(right))
                if filterSearch and not isRegexMatch(left_str, right_str):
                    continue

                # All filters matched, calculate stats and add table row
                _, _, _, coverage, strength, lift, leverage = next(
                    rules_stats((rule,), itemsets, n_examples))

                support_item = NumericItem(support / n_examples)
                # Set row data on first column
                support_item.setData((itemset - class_items,
                                      class_items and (class_items & itemset).pop()),
                                     self.ROW_DATA_ROLE)
                left_item = StandardItem(left_str, len(left))
                left_item.setTextAlignment(Qt.AlignRight | Qt.AlignVCenter)
                model.appendRow([support_item,
                                 NumericItem(confidence),
                                 NumericItem(coverage),
                                 NumericItem(strength),
                                 NumericItem(lift),
                                 NumericItem(leverage),
                                 left_item,
                                 StandardItem('→'),
                                 StandardItem(right_str, len(right))])
                #~ scatter_agg[(round(support / n_examples, 2), round(confidence, 2))].append((left, right))
                nRules += 1
                progress.advance()
                if nRules >= self.maxRules:
                    break
            if nRules >= self.maxRules:
                break

        # Populate the TableView
        table = self.table
        table.setHidden(True)
        table.setSortingEnabled(False)
        proxy_model = self.proxy_model
        proxy_model.setSourceModel(model)
        table.setModel(proxy_model)
        for i in range(model.columnCount()):
            table.resizeColumnToContents(i)
        table.setSortingEnabled(True)
        table.setHidden(False)
        progress.finish()

        self.nRules = nRules
        self.nFilteredRules = proxy_model.rowCount()  # TODO: continue; also add in owitemsets
        self.nSelectedRules = 0
        self.nSelectedExamples = 0
    def find_rules(self):
        if self.data is None or not len(self.data):
            return
        if self._is_running:
            return
        self._is_running = True
        data = self.data
        self.table.model().clear()

        n_examples = len(data)
        NumericItem = self.NumericItem
        StandardItem = self.StandardItem
        filterSearch = self.filterSearch
        itemsetMin = self.filterAntecedentMin + self.filterConsequentMin
        itemsetMax = self.filterAntecedentMax + self.filterConsequentMax
        isSizeMatch = self.isSizeMatch
        isRegexMatch = self.isRegexMatch

        X, mapping = OneHot.encode(data, self.classify)
        self.error(911)
        if X is None:
            self.error(911, 'Need some discrete data to work with.')

        self.onehot_mapping = mapping
        ITEM_FMT = '{}' if issparse(data.X) else '{}={}'
        names = {
            item:
            ('{}={}' if var is data.domain.class_var else ITEM_FMT).format(
                var.name, val)
            for item, var, val in OneHot.decode(mapping, data, mapping)
        }
        # Items that consequent must include if classifying
        class_items = {
            item
            for item, var, val in OneHot.decode(mapping, data, mapping)
            if var is data.domain.class_var
        } if self.classify else set()
        assert bool(class_items) == bool(self.classify)

        model = QStandardItemModel(self.table)
        for col, (label, tooltip) in enumerate([
            ("Supp", "Support"),
            ("Conf", "Confidence (support / antecedent support)"),
            ("Covr", "Coverage (antecedent support / number of examples)"),
            ("Strg", "Strength (consequent support / antecedent support)"),
            ("Lift",
             "Lift (number of examples * confidence / consequent support)"),
            ("Levr",
             "Leverage ((support * number of examples - antecedent support * consequent support) / (number of examples)²)"
             ), ("Antecedent", None), ("", None), ("Consequent", None)
        ]):
            item = QStandardItem(label)
            item.setToolTip(tooltip)
            model.setHorizontalHeaderItem(col, item)

        #~ # Aggregate rules by common (support,confidence) for scatterplot
        #~ scatter_agg = defaultdict(list)

        # Find itemsets
        nRules = 0
        itemsets = {}
        with self.progressBar(self.maxRules + 1) as progress:
            for itemset, support in frequent_itemsets(X,
                                                      self.minSupport / 100):
                itemsets[itemset] = support

                if class_items and not class_items & itemset:
                    continue

                # Filter itemset by joined filters before descending into it
                itemset_str = ' '.join(names[i] for i in itemset)
                if (filterSearch and
                    (len(itemset) < itemsetMin or itemsetMax < len(itemset)
                     or not isRegexMatch(itemset_str, itemset_str))):
                    continue

                for rule in association_rules(itemsets,
                                              self.minConfidence / 100,
                                              itemset):
                    left, right, support, confidence = rule

                    if class_items and right - class_items:
                        continue
                    if filterSearch and not isSizeMatch(len(left), len(right)):
                        continue
                    left_str = ', '.join(names[i] for i in sorted(left))
                    right_str = ', '.join(names[i] for i in sorted(right))
                    if filterSearch and not isRegexMatch(left_str, right_str):
                        continue

                    # All filters matched, calculate stats and add table row
                    _, _, _, _, coverage, strength, lift, leverage = next(
                        rules_stats((rule, ), itemsets, n_examples))

                    support_item = NumericItem(support / n_examples)
                    # Set row data on first column
                    support_item.setData(
                        (itemset - class_items, class_items and
                         (class_items & itemset).pop()), self.ROW_DATA_ROLE)
                    left_item = StandardItem(left_str, len(left))
                    left_item.setTextAlignment(Qt.AlignRight | Qt.AlignVCenter)
                    model.appendRow([
                        support_item,
                        NumericItem(confidence),
                        NumericItem(coverage),
                        NumericItem(strength),
                        NumericItem(lift),
                        NumericItem(leverage), left_item,
                        StandardItem('→'),
                        StandardItem(right_str, len(right))
                    ])
                    #~ scatter_agg[(round(support / n_examples, 2), round(confidence, 2))].append((left, right))
                    nRules += 1
                    progress.advance()
                    if nRules >= self.maxRules:
                        break
                if nRules >= self.maxRules:
                    break

        # Populate the TableView
        table = self.table
        table.setHidden(True)
        table.setSortingEnabled(False)
        proxy_model = self.proxy_model
        proxy_model.setSourceModel(model)
        table.setModel(proxy_model)
        for i in range(model.columnCount()):
            table.resizeColumnToContents(i)
        table.setSortingEnabled(True)
        table.setHidden(False)

        self.nRules = nRules
        self.nFilteredRules = proxy_model.rowCount()  # TODO: continue; also add in owitemsets
        self.nSelectedRules = 0
        self.nSelectedExamples = 0
        self._is_running = False
    often,
    .01,
)  # the confidence threshold is set here: frozenset({'肺炎'})
rules = list(rules)


def dealResult(rules):
    returnRules = []
    for i in rules:
        temStr = ''
        for j in i[0]:  # handle the first frozenset (the antecedent)
            temStr = temStr + j + '&'
        temStr = temStr[:-1]
        temStr = temStr + ' ==> '
        for j in i[1]:
            temStr = temStr + j + '&'
        temStr = temStr[:-1]
        temStr = temStr + ';' + '\t' + str(i[2]) + ';' + '\t' + str(
            i[3]) + ';' + '\t' + str(i[4]) + ';' + '\t' + str(
                i[5]) + ';' + '\t' + str(i[6]) + ';' + '\t' + str(i[7])
        #        print(temStr)
        returnRules.append(temStr)
    return returnRules


printRules = dealResult(rules)
print(printRules)
result = list(oaf.rules_stats(rules, often, len(ryzd)))
printResult = dealResult(result)
print(printResult)
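
For illustration, feeding dealResult() a single hypothetical tuple in the shape that oaf.rules_stats() yields (every item and number below is made up) shows the string format it builds:

fake_stats = [(frozenset({'发热'}), frozenset({'肺炎'}),
               12, 0.75, 0.16, 4.7, 3.1, 0.08)]
print(dealResult(fake_stats))
# ['发热 ==> 肺炎;\t12;\t0.75;\t0.16;\t4.7;\t3.1;\t0.08']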
    def doAnalysize(self,
                    pd_data,
                    category,
                    supportRate=0.02,
                    confidenceRate=0.5,
                    savepath=r'C:\Users\Administrator\Desktop'):
        # initialize the user-dictionary path
        savepath = savepath + "\\" + category
        if not os.path.exists(savepath):
            os.makedirs(savepath)
        initpath = "tmall\\spiders\\DataAnalysize\\jiebaInit\\" + category + ".txt"
        jieba.load_userdict(initpath)
        pd_data['ratecontent_list'] = pd_data.apply(
            lambda r: list(jieba.cut(r['rateContent'])), axis=1)

        aim_list = []
        with open(initpath, 'r', encoding="utf-8") as f:
            for line in f.readlines():
                aim_list.append(line.strip('\n'))
        pd_data['aim_list'] = pd_data.apply(lambda r: list(
            set(r['ratecontent_list']).intersection(set(aim_list))),
                                            axis=1)
        simple_aimdata = []
        pd_data.apply(lambda r: simple_aimdata.append(r['aim_list'])
                      if not r['aim_list'] == [] else 1,
                      axis=1)
        wordcloudlist = []
        for item in simple_aimdata:
            for i in item:
                wordcloudlist.append(i)
        # generate a word cloud for each analysis category
        self.everyWordCloud(wordcloudlist, savepath)

        # the operations above yield the target list: simple_aimdata
        strSet = set(functools.reduce(lambda a, b: a + b, simple_aimdata))
        strEncode = dict(zip(strSet, range(
            len(strSet))))  # encoding dict, e.g. {'甜腻': 6, '鱼腥味': 53, ...}
        strDecode = dict(
            zip(strEncode.values(),
                strEncode.keys()))  # decoding dict, e.g. {6: '甜腻', 53: '鱼腥味', ...}
        listToAnalysis_int = [
            list(map(lambda item: strEncode[item], row))
            for row in simple_aimdata
        ]
        # run the association analysis
        itemsets = dict(oaf.frequent_itemsets(listToAnalysis_int, supportRate))
        # print("itemsets : ")
        # print(itemsets)
        rules = oaf.association_rules(itemsets, confidenceRate)
        rules = list(rules)
        regularNum = len(rules)
        printRules = self.dealRules(rules, strDecode)  # print this variable to inspect the generated rules
        # print(printRules)
        result = list(oaf.rules_stats(
            rules, itemsets,
            len(listToAnalysis_int)))  # NOTE: this call consumes rules; rules is used up afterwards!
        # print(result)
        printResult = self.dealResult(result, strDecode)  # print this variable to inspect the results
        # print(printResult)

        #################################################  save the results to an Excel file
        # save rules to excel
        dfToSave = self.ResultDFToSave(result, strDecode)
        saveRegularName = savepath + "\\" + str(supportRate) + '支持度_' + str(
            confidenceRate) + '置信度_产生了' + str(regularNum) + '条规则' + '.xlsx'
        dfToSave.to_excel(saveRegularName)
        # save itemsets to excel
        self.saveItemSets(itemsets, strDecode, savepath)

        #######################################################  count the rules produced under different support and confidence thresholds
        listTable = []
        supportRate = 0.01
        confidenceRate = 0.1
        for i in range(9):
            support = supportRate * (i + 1)
            listS = []
            for j in range(9):
                confidence = confidenceRate * (j + 1)
                itemsets = dict(
                    oaf.frequent_itemsets(listToAnalysis_int, support))
                rules = list(oaf.association_rules(itemsets, confidence))
                listS.append(len(rules))
            listTable.append(listS)
        dfList = pd.DataFrame(
            listTable,
            index=[supportRate * (i + 1) for i in range(9)],
            columns=[confidenceRate * (i + 1) for i in range(9)])
        dfList.to_excel(savepath + "\\" + 'regularNum.xlsx')
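
The string-to-integer encoding above is the recurring trick in these examples: words are mapped to integer item ids before mining, and the decode dictionary turns the results back into words. A condensed round-trip sketch (the toy tokenized reviews are made up; oaf is the same orangecontrib.associate.fpgrowth import assumed above):

comments = [['甜腻', '鱼腥味'], ['甜腻'], ['鱼腥味', '甜腻']]  # made-up tokenized reviews
strSet = set(w for row in comments for w in row)
strEncode = dict(zip(strSet, range(len(strSet))))  # word -> integer id
strDecode = {v: k for k, v in strEncode.items()}   # integer id -> word
encoded = [[strEncode[w] for w in row] for row in comments]
itemsets = dict(oaf.frequent_itemsets(encoded, 0.5))
readable = {frozenset(strDecode[i] for i in k): v for k, v in itemsets.items()}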
    for i in rules:
        temList = []
        temStr = ''
        for j in i[0]:  # handle the first frozenset (the antecedent)
            temStr = temStr + str(j) + '&'
        temStr = temStr[:-1]
        temStr = temStr + ' ==> '
        for j in i[1]:
            temStr = temStr + str(j) + '&'
        temStr = temStr[:-1]
        temList.append(temStr)
        temList.append(i[2])
        temList.append(i[3])
        temList.append(i[4])
        temList.append(i[5])
        temList.append(i[6])
        temList.append(i[7])
        returnRules.append(temList)
    return pd.DataFrame(returnRules,
                        columns=('规则', '项集出现数目', '置信度', '覆盖度', '力度', '提升度',
                                 '利用度'))


often = dict(oaf.frequent_itemsets(ryzd, .02))
rules = oaf.association_rules(often, .5)  # the confidence threshold is set here
rules = list(rules)
printRules = dealRules(rules)
result = list(oaf.rules_stats(rules, often,
                              len(ryzd)))  # NOTE: this call consumes rules; rules is used up afterwards!
printResult = dealResult(result)
print(printResult)
Example #8
def associateRules(support=0.02, confidence=0.5):
    support = 0.15     # note: the default arguments are immediately overridden here
    confidence = 0.15  # (and overridden again below, from Information.json)
    try:
        with open('filelocation.json') as f_obj:
            fileInput = json.load(f_obj)
    except:
        with open('errorFlag.json', 'w') as e_obj:
            json.dump("File open process failed", e_obj)
        return
    filename = fileInput

    dfar = pd.read_csv(filename)
    tag = list(dfar.columns.values)
    listToAnalysis = []  # final result

    for item in range(1, len(tag) - 1):  # iterate over the columns
        imax = max(list(dfar[tag[item]]))  # upper bound
        imin = min(list(dfar[tag[item]]))  # lower bound

        ijc = imax - imin  # range (max - min)
        l = ijc / 4

        i1 = imin + l
        i2 = i1 + l
        i3 = i2 + l

        listToStore = []

        for i in range(dfar.shape[0]):
            s = dfar.iloc[i][tag[item]]

            if s >= i3 and s <= imax:
                ss = tag[item] + str(i3) + '-' + str(imax)
            elif s >= i2:
                ss = tag[item] + str(i2) + '-' + str(i3)
            elif s >= i1:
                ss = tag[item] + str(i1) + '-' + str(i2)
            elif s >= imin:
                ss = tag[item] + str(imin) + '-' + str(i1)
            listToStore.append(ss)

        listToAnalysis.append(listToStore.copy())

    listToAnalysis2 = []
    ll = len(listToAnalysis[0])

    # transpose: regroup the per-column lists into per-row transactions
    for ii in range(ll):
        ltmp = []
        for it in listToAnalysis:
            ltmp.append(it[ii])
        listToAnalysis2.append(ltmp.copy())

    # build the encoding and decoding dictionaries
    what = functools.reduce(lambda a, b: a + b, listToAnalysis2)
    strSet = set(what)

    zz = zip(strSet, range(len(strSet)))
    strEncode = dict(zz)  # encoding dict

    strDecode = dict(zip(strEncode.values(), strEncode.keys()))  # decoding dict

    listToAnalysis_int = [
        list(map(lambda item: strEncode[item], row)) for row in listToAnalysis2
    ]

    with open('Information.json') as obj:
        infostring = json.load(obj)
    inforlist = infostring.split(' ')
    confidence = float(inforlist[0]) / float(100)
    support = float(inforlist[1]) / float(100)
    itemsets = dict(oaf.frequent_itemsets(listToAnalysis_int, support))
    # frequent itemsets

    rules = oaf.association_rules(itemsets, confidence)
    rules = list(rules)
    # association rules

    regularNum = len(rules)

    #printRules=dealResult(result,strDecode)
    #######
    #print("You will get ")
    #print(regularNum)
    #print("association rules when\n"+"SupportRate = ",end='')
    #print(support,end='')
    #print("ConfidenceRate = "+str(confidence))
    informationBack="You will get "+str(regularNum)+"association rules when\n"\
                                                    +"SupportRate = "+str(support)+" ConfidenceRate = "+str(confidence)
    with open('InformationBack.json', 'w') as inf:
        json.dump(informationBack, inf)
    result = list(oaf.rules_stats(rules, itemsets, len(listToAnalysis_int)))

    dfToSave = ResultDFToSave(result, strDecode)
    with open('arInteractiveText.json', 'w') as ij:
        json.dump(str(dfToSave), ij)
    saveRegularName = "Processed.xlsx"
    dfToSave.to_excel(saveRegularName)
    return regularNum
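
The column loop above is equal-width discretization: each numeric column is split into four bins of width (max - min) / 4, and every value is replaced by a 'column name + interval' label so it can be mined as a categorical item. A condensed sketch of the same idea (the column name and values are made up):

values = [3.2, 7.8, 5.1, 9.9, 4.4]  # toy numeric column
lo, hi = min(values), max(values)
width = (hi - lo) / 4
edges = [lo + width * k for k in range(4)]  # lower edges of the 4 bins

def bin_label(s, col='price'):
    # walk the bins from the top down, as the cascade of elifs above does
    for a in reversed(edges):
        if s >= a:
            return col + str(a) + '-' + str(a + width)

labels = [bin_label(s) for s in values]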
Example #9
        listToAnalysis.append(listToStore.copy())
        listToStore.clear()
    # encode: convert the strings in listToAnalysis to integers
    strSet = set(functools.reduce(lambda a, b: a + b, listToAnalysis))
    strEncode = dict(zip(strSet, range(len(strSet))))  # encoding dict, e.g. {'ArticleTag_BS': 6, 'Country_Argentina': 53, ...}
    strDecode = dict(zip(strEncode.values(), strEncode.keys()))  # decoding dict, e.g. {6: 'ArticleTag_BS', 53: 'Country_Argentina', ...}
    listToAnalysis_int = [list(map(lambda item: strEncode[item], row)) for row in listToAnalysis]
    # run the association analysis
    supportRate = 0.02
    confidenceRate = 0.5
    itemsets = dict(oaf.frequent_itemsets(listToAnalysis_int, supportRate))        
    rules = oaf.association_rules(itemsets, confidenceRate)
    rules = list(rules)
    regularNum = len(rules)
    printRules = dealRules(rules, strDecode)  # print this variable to inspect the generated rules
    result = list(oaf.rules_stats(rules, itemsets, len(listToAnalysis_int)))  # NOTE: this call consumes rules; rules is used up afterwards!
    printResult = dealResult(result, strDecode)  # print this variable to inspect the results
    
#################################################  save the results to an Excel file
    dfToSave = ResultDFToSave(result,strDecode)
    saveRegularName = str(supportRate)+'支持度_'+str(confidenceRate)+'置信度_产生了'+str(regularNum)+'条规则'+'.xlsx'
    dfToSave.to_excel(saveRegularName)

#######################################################  count the rules produced under different support and confidence thresholds
    listTable = []
    supportRate = 0.01
    confidenceRate = 0.1
    for i in range(9):
        support = supportRate*(i+1)
        listS = []
        for j in range(9):
Example #10

def ResultDFToSave(rules):  # build and return the corresponding DataFrame from the rules produced by Orange3 association analysis
    returnRules = []
    for i in rules:
        temList = []
        temStr = ''
        for j in i[0]:  # handle the first frozenset (the antecedent)
            temStr = temStr + str(j) + '&'
        temStr = temStr[:-1]
        temStr = temStr + ' ==> '
        for j in i[1]:
            temStr = temStr + str(j) + '&'
        temStr = temStr[:-1]
        temList.append(temStr)
        temList.append(i[2])
        temList.append(i[3])
        temList.append(i[4])
        temList.append(i[5])
        temList.append(i[6])
        temList.append(i[7])
        returnRules.append(temList)
    return pd.DataFrame(returnRules,
                        columns=('规则', '项集出现数目', '置信度', '覆盖度', '力度', '提升度',
                                 '利用度'))


printRules = dealRules(rules)
result = list(oaf.rules_stats(rules, often,
                              len(listToAnalysis)))  # NOTE: this call consumes rules; rules is used up afterwards!
printResult = dealResult(result)
Example #11
    def find_rules(self):
        if self.data is None or not len(self.data):
            return
        if self._is_running:
            self._is_running = False
            return

        self.button.button.setText('Cancel')

        self._is_running = True
        data = self.data
        self.table.model().clear()

        n_examples = len(data)
        NumericItem = self.NumericItem
        StandardItem = self.StandardItem
        filterSearch = self.filterSearch
        itemsetMin = self.filterAntecedentMin + self.filterConsequentMin
        itemsetMax = self.filterAntecedentMax + self.filterConsequentMax
        isSizeMatch = self.isSizeMatch
        isRegexMatch = self.isRegexMatch

        X, mapping = OneHot.encode(data, self.classify)
        self.Error.need_discrete_data.clear()
        if X is None:
            self.Error.need_discrete_data()

        self.onehot_mapping = mapping
        ITEM_FMT = '{}' if issparse(data.X) else '{}={}'
        names = {item: ('{}={}' if var is data.domain.class_var else ITEM_FMT).format(var.name, val)
                 for item, var, val in OneHot.decode(mapping, data, mapping)}
        # Items that consequent must include if classifying
        class_items = {item
                       for item, var, val in OneHot.decode(mapping, data, mapping)
                       if var is data.domain.class_var} if self.classify else set()
        assert bool(class_items) == bool(self.classify)

        model = QStandardItemModel(self.table)
        for col, (label, _, tooltip) in enumerate(self.header):
            item = QStandardItem(label)
            item.setToolTip(tooltip)
            model.setHorizontalHeaderItem(col, item)

        # Find itemsets
        nRules = 0
        itemsets = {}
        ARROW_ITEM = StandardItem('→')
        ARROW_ITEM.setTextAlignment(Qt.AlignCenter)
        with self.progressBar(self.maxRules + 1) as progress:
            for itemset, support in frequent_itemsets(X, self.minSupport / 100):
                itemsets[itemset] = support

                if class_items and not class_items & itemset:
                    continue

                # Filter itemset by joined filters before descending into it
                itemset_str = ' '.join(names[i] for i in itemset)
                if (filterSearch and
                    (len(itemset) < itemsetMin or
                     itemsetMax < len(itemset) or
                     not isRegexMatch(itemset_str, itemset_str))):
                    continue

                for rule in association_rules(itemsets,
                                              self.minConfidence / 100,
                                              itemset):
                    left, right, support, confidence = rule

                    if class_items and right - class_items:
                        continue
                    if filterSearch and not isSizeMatch(len(left), len(right)):
                        continue
                    left_str = ', '.join(names[i] for i in sorted(left))
                    right_str = ', '.join(names[i] for i in sorted(right))
                    if filterSearch and not isRegexMatch(left_str, right_str):
                        continue

                    # All filters matched, calculate stats and add table row
                    _, _, _, _, coverage, strength, lift, leverage = next(
                        rules_stats((rule,), itemsets, n_examples))

                    support_item = NumericItem(support / n_examples)
                    # Set row data on first column
                    support_item.setData((itemset - class_items,
                                          class_items and (class_items & itemset).pop()),
                                         self.ROW_DATA_ROLE)
                    left_item = StandardItem(left_str, len(left))
                    left_item.setTextAlignment(Qt.AlignRight | Qt.AlignVCenter)
                    model.appendRow([support_item,
                                     NumericItem(confidence),
                                     NumericItem(coverage),
                                     NumericItem(strength),
                                     NumericItem(lift),
                                     NumericItem(leverage),
                                     left_item,
                                     ARROW_ITEM.clone(),
                                     StandardItem(right_str, len(right))])
                    nRules += 1
                    progress.advance()

                    if not self._is_running or nRules >= self.maxRules:
                        break

                qApp.processEvents()

                if not self._is_running or nRules >= self.maxRules:
                    break

        # Populate the TableView
        table = self.table
        table.setHidden(True)
        table.setSortingEnabled(False)
        proxy_model = self.proxy_model
        proxy_model.setSourceModel(model)
        table.setModel(proxy_model)
        for i in range(model.columnCount()):
            table.resizeColumnToContents(i)
        table.setSortingEnabled(True)
        table.setHidden(False)
        self.table_rules = proxy_model.get_data()
        if self.table_rules is not None:
            self.Outputs.rules.send(self.table_rules)

        self.button.button.setText('Find Rules')

        self.nRules = nRules
        self.nFilteredRules = proxy_model.rowCount()  # TODO: continue; also add in owitemsets
        self.nSelectedRules = 0
        self.nSelectedExamples = 0
        self._is_running = False
Example #12
# So the items ‘4’ and ‘25’ (the fifth and twenty-sixth columns of X) are the only items (and itemsets) that appear 10 or more times. Let’s check this:
print((X.sum(axis=0) >= 10).nonzero()[1])

# Conclusion: Given databases of uniformly distributed random data, there’s not much to work with.




# Examples with rules
np.random.seed(0)
N = 100
X = np.random.random((N, 100)) > .9

# Find all itemsets with at least 5% support:
itemsets = dict(ofpg.frequent_itemsets(X, .05))

# Generate all association rules from these itemsets with minimum 50% confidence:
rules = ofpg.association_rules(itemsets, .5)
rules = list(rules)

# Or only the rules for a particular itemset:
print(list(ofpg.association_rules(itemsets, .3, frozenset({75, 98}))))


# Examples of additional stats for rules generated by association_rules()
N = 30
X = np.random.random((N, 50)) > .9
itemsets = dict(ofpg.frequent_itemsets(X, .1))
rules = ofpg.association_rules(itemsets, .6)
print(list(ofpg.rules_stats(rules, itemsets, N)))
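
Each tuple that rules_stats() yields extends the rule with four extra measures; unpacking one row makes the field order explicit (the same order the examples above rely on; the print format is illustrative):

for left, right, supp, conf, covr, strg, lift, levr in ofpg.rules_stats(
        ofpg.association_rules(itemsets, .6), itemsets, N):
    print(set(left), '->', set(right),
          'supp=%s conf=%.2f covr=%.2f strg=%.2f lift=%.2f levr=%.3f'
          % (supp, conf, covr, strg, lift, levr))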