import pandas as pd
import orangecontrib.associate.fpgrowth as oaf


def ResultDFToSave(rules):
    # Build and return a DataFrame from the rules produced by Orange3's association analysis.
    returnRules = []
    for i in rules:
        temList = []
        temStr = ''
        for j in i[0]:  # format the antecedent frozenset
            temStr = temStr + str(j) + '&'
        temStr = temStr[:-1]
        temStr = temStr + ' ==> '
        for j in i[1]:  # format the consequent frozenset
            temStr = temStr + str(j) + '&'
        temStr = temStr[:-1]
        temList.append(temStr)
        temList.append(i[2])
        temList.append(i[3])
        temList.append(i[4])
        temList.append(i[5])
        temList.append(i[6])
        temList.append(i[7])
        returnRules.append(temList)
    return pd.DataFrame(returnRules,
                        columns=('规则', '项集出现数目', '置信度', '覆盖度', '力度', '提升度', '利用度'))


# `listToAnalysis`, `dealRules` and `dealResult` are assumed to be defined elsewhere in the original script.
supportRate = 0.02
confidenceRate = 0.5

itemsets = dict(oaf.frequent_itemsets(listToAnalysis, supportRate))
rules = list(oaf.association_rules(itemsets, confidenceRate))
regularNum = len(rules)
printRules = dealRules(rules)
# rules_stats() consumes the rules iterator, so pass the materialized list.
result = list(oaf.rules_stats(rules, itemsets, len(listToAnalysis)))
printResult = dealResult(result)

################################################# Save the results to an Excel file
dfToSave = ResultDFToSave(result)
saveRegularName = str(supportRate) + '支持度_' + str(confidenceRate) + '置信度_产生了' + str(regularNum) + '条规则' + '.xlsx'
dfToSave.to_excel(saveRegularName)

####################################################### Count the rules produced at different support/confidence thresholds
listTable = []
supportRate = 0.01
confidenceRate = 0.1
for i in range(9):
    support = supportRate * (i + 1)
    listS = []
    for j in range(9):
        confidence = confidenceRate * (j + 1)
        itemsets = dict(oaf.frequent_itemsets(listToAnalysis, support))
        rules = list(oaf.association_rules(itemsets, confidence))
        listS.append(len(rules))
    listTable.append(listS)
dfList = pd.DataFrame(listTable,
                      index=[supportRate * (i + 1) for i in range(9)],
                      columns=[confidenceRate * (i + 1) for i in range(9)])
dfList.to_excel('regularNum.xlsx')
def rules_extractor(X, profundidades=range(4), metric=0.3):
    res = {}
    for i in profundidades:
        T = transacciones_profundidad(X, i)
        itemsets = dict(fp.frequent_itemsets(T, metric))
        rules = [(P, Q, supp, conf)
                 for P, Q, supp, conf in fp.association_rules(itemsets, metric)]
        res[i] = (itemsets, rules)
    return res
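# A minimal usage sketch for rules_extractor(), assuming `fp` is
# orangecontrib.associate.fpgrowth. transacciones_profundidad() is not defined in the
# snippet above, so a trivial stand-in is stubbed here purely for illustration.
import orangecontrib.associate.fpgrowth as fp


def transacciones_profundidad(X, depth):
    # Hypothetical stand-in: the real helper presumably selects transactions by depth;
    # this one just returns them unchanged.
    return X


X = [['a', 'b', 'c'], ['a', 'b'], ['b', 'c'], ['a', 'c']]
for depth, (itemsets, rules) in rules_extractor(X, profundidades=range(2), metric=0.5).items():
    print(depth, len(itemsets), len(rules))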
def find_rules(self):
    if self.data is None or not len(self.data):
        return
    if self._is_running:
        return
    self._is_running = True
    data = self.data
    self.table.model().clear()

    n_examples = len(data)
    NumericItem = self.NumericItem
    StandardItem = self.StandardItem
    filterSearch = self.filterSearch
    itemsetMin = self.filterAntecedentMin + self.filterConsequentMin
    itemsetMax = self.filterAntecedentMax + self.filterConsequentMax
    isSizeMatch = self.isSizeMatch
    isRegexMatch = self.isRegexMatch

    X, mapping = OneHot.encode(data, self.classify)
    self.error(911)
    if X is None:
        self.error(911, 'Need some discrete data to work with.')

    self.onehot_mapping = mapping
    ITEM_FMT = '{}' if issparse(data.X) else '{}={}'
    names = {item: ('{}={}' if var is data.domain.class_var else ITEM_FMT).format(var.name, val)
             for item, var, val in OneHot.decode(mapping, data, mapping)}
    # Items that consequent must include if classifying
    class_items = {item
                   for item, var, val in OneHot.decode(mapping, data, mapping)
                   if var is data.domain.class_var} if self.classify else set()
    assert bool(class_items) == bool(self.classify)

    model = QStandardItemModel(self.table)
    for col, (label, tooltip) in enumerate([
            ("Supp", "Support"),
            ("Conf", "Confidence (support / antecedent support)"),
            ("Covr", "Coverage (antecedent support / number of examples)"),
            ("Strg", "Strength (consequent support / antecedent support)"),
            ("Lift", "Lift (number of examples * confidence / consequent support)"),
            ("Levr", "Leverage ((support * number of examples - antecedent support * consequent support) / (number of examples)²)"),
            ("Antecedent", None),
            ("", None),
            ("Consequent", None)]):
        item = QStandardItem(label)
        item.setToolTip(tooltip)
        model.setHorizontalHeaderItem(col, item)

    #~ # Aggregate rules by common (support,confidence) for scatterplot
    #~ scatter_agg = defaultdict(list)

    # Find itemsets
    nRules = 0
    itemsets = {}
    with self.progressBar(self.maxRules + 1) as progress:
        for itemset, support in frequent_itemsets(X, self.minSupport / 100):
            itemsets[itemset] = support
            if class_items and not class_items & itemset:
                continue
            # Filter itemset by joined filters before descending into it
            itemset_str = ' '.join(names[i] for i in itemset)
            if (filterSearch and
                    (len(itemset) < itemsetMin or
                     itemsetMax < len(itemset) or
                     not isRegexMatch(itemset_str, itemset_str))):
                continue

            for rule in association_rules(itemsets, self.minConfidence / 100, itemset):
                left, right, support, confidence = rule

                if class_items and right - class_items:
                    continue
                if filterSearch and not isSizeMatch(len(left), len(right)):
                    continue
                left_str = ', '.join(names[i] for i in sorted(left))
                right_str = ', '.join(names[i] for i in sorted(right))
                if filterSearch and not isRegexMatch(left_str, right_str):
                    continue

                # All filters matched, calculate stats and add table row
                _, _, _, _, coverage, strength, lift, leverage = next(
                    rules_stats((rule,), itemsets, n_examples))

                support_item = NumericItem(support / n_examples)
                # Set row data on first column
                support_item.setData((itemset - class_items,
                                      class_items and (class_items & itemset).pop()),
                                     self.ROW_DATA_ROLE)
                left_item = StandardItem(left_str, len(left))
                left_item.setTextAlignment(Qt.AlignRight | Qt.AlignVCenter)
                model.appendRow([support_item,
                                 NumericItem(confidence),
                                 NumericItem(coverage),
                                 NumericItem(strength),
                                 NumericItem(lift),
                                 NumericItem(leverage),
                                 left_item,
                                 StandardItem('→'),
                                 StandardItem(right_str, len(right))])
                #~ scatter_agg[(round(support / n_examples, 2), round(confidence, 2))].append((left, right))
                nRules += 1
                progress.advance()
                if nRules >= self.maxRules:
                    break
            if nRules >= self.maxRules:
                break

    # Populate the TableView
    table = self.table
    table.setHidden(True)
    table.setSortingEnabled(False)
    proxy_model = self.proxy_model
    proxy_model.setSourceModel(model)
    table.setModel(proxy_model)
    for i in range(model.columnCount()):
        table.resizeColumnToContents(i)
    table.setSortingEnabled(True)
    table.setHidden(False)

    self.nRules = nRules
    self.nFilteredRules = proxy_model.rowCount()  # TODO: continue; also add in owitemsets
    self.nSelectedRules = 0
    self.nSelectedExamples = 0
    self._is_running = False
import os, os.path, shutil
import codecs
import EMRdef
import re

emrtxts = EMRdef.txttq(u'D:\DeepLearning ER\EHRryzd')  # collect the .txt files in the directory
hxjb = open(r'D:\python\EMR\hxjbml.txt', errors="ignore")  # respiratory-disease term list
hxjbdic = hxjb.readlines()  # read the lines
ryzd = []
for emrtxt in emrtxts:
    f = open(emrtxt, 'r', errors="ignore")  # errors="ignore" for the Chinese text
    emrpath = os.path.basename(emrtxt)
    emrpath = os.path.splitext(emrpath)[0]
    line_out = []
    for line in f.readlines():
        line = re.sub('\n', '', line)
        line = re.sub(r'(.+?)肺炎', '肺炎', line)  # normalize every "...肺炎" mention to 肺炎 (pneumonia)
        for hxjbc in hxjbdic:  # search for each disease term
            hxjbc = re.sub('\n', '', hxjbc)
            if line.find(hxjbc) > -1:
                line_out.append(line)
    line_output = EMRdef.delre(line_out)
    ryzd.append(line_out)
    # line = '\n'.join(line_output)
    # EMRdef.text_create(r'D:\DeepLearning ER\EHRryzd2', '.txt', emrpath, line)

import orangecontrib.associate.fpgrowth as oaf

often = dict(oaf.frequent_itemsets(ryzd, .01))  # frequent itemsets (minimum support 1%)
rules = oaf.association_rules(often, .5)  # minimum confidence threshold
rules = list(rules)
print(rules)
for line in f.readlines():
    line = re.sub('\n', '', line)
    line = re.sub(r'(.+?)肺炎', '肺炎', line)  # normalize every "...肺炎" mention to 肺炎 (pneumonia)
    for hxjbc in hxjbdic:  # search for each disease term
        hxjbc = re.sub('\n', '', hxjbc)
        if line.find(hxjbc) > -1:
            line_out.append(line)
line_output = EMRdef.delre(line_out)
ryzd.append(line_out)
# line = '\n'.join(line_output)
# EMRdef.text_create(r'D:\DeepLearning ER\EHRryzd2', '.txt', emrpath, line)

import orangecontrib.associate.fpgrowth as oaf

often = dict(oaf.frequent_itemsets(ryzd, .02))  # frequent itemsets (minimum support 2%)
rules = oaf.association_rules(often, .25)  # minimum confidence threshold; frozenset({'肺炎'})
rules = list(rules)


def dealResult(rules):
    # Format each rule and its statistics as a tab-separated string.
    returnRules = []
    for i in rules:
        temStr = ''
        for j in i[0]:  # format the antecedent frozenset
            temStr = temStr + j + '&'
        temStr = temStr[:-1]
        temStr = temStr + ' ==> '
        for j in i[1]:  # format the consequent frozenset
            temStr = temStr + j + '&'
        temStr = temStr[:-1]
        temStr = (temStr + ';' + '\t' + str(i[2]) + ';' + '\t' + str(i[3]) + ';' + '\t' + str(i[4])
                  + ';' + '\t' + str(i[5]) + ';' + '\t' + str(i[6]) + ';' + '\t' + str(i[7]))
        # print(temStr)
def find_rules(self):
    if self.data is None:
        return
    if self._is_running:
        return
    self._is_running = True
    data = self.data
    self.table.model().clear()

    n_examples = len(data)
    NumericItem = self.NumericItem
    StandardItem = self.StandardItem
    filterSearch = self.filterSearch
    itemsetMin = self.filterAntecedentMin + self.filterConsequentMin
    itemsetMax = self.filterAntecedentMax + self.filterConsequentMax
    isSizeMatch = self.isSizeMatch
    isRegexMatch = self.isRegexMatch

    X, mapping = OneHot.encode(data, self.classify)
    self.onehot_mapping = mapping
    ITEM_FMT = '{}' if issparse(data.X) else '{}={}'
    names = {item: ('{}={}' if var is data.domain.class_var else ITEM_FMT).format(var.name, val)
             for item, var, val in OneHot.decode(mapping, data, mapping)}
    # Items that consequent must include if classifying
    class_items = {item
                   for item, var, val in OneHot.decode(mapping, data, mapping)
                   if var is data.domain.class_var} if self.classify else set()
    assert bool(class_items) == bool(self.classify)

    model = QStandardItemModel(self.table)
    for col, (label, tooltip) in enumerate([
            ("Supp", "Support"),
            ("Conf", "Confidence (support / antecedent support)"),
            ("Covr", "Coverage (antecedent support / number of examples)"),
            ("Strg", "Strength (consequent support / antecedent support)"),
            ("Lift", "Lift (number of examples * confidence / consequent support)"),
            ("Levr", "Leverage ((support * number of examples - antecedent support * consequent support) / (number of examples)²)"),
            ("Antecedent", None),
            ("", None),
            ("Consequent", None)]):
        item = QStandardItem(label)
        item.setToolTip(tooltip)
        model.setHorizontalHeaderItem(col, item)

    #~ # Aggregate rules by common (support,confidence) for scatterplot
    #~ scatter_agg = defaultdict(list)

    # Find itemsets
    nRules = 0
    itemsets = {}
    with self.progressBar(self.maxRules + 1) as progress:
        for itemset, support in frequent_itemsets(X, self.minSupport / 100):
            itemsets[itemset] = support
            if class_items and not class_items & itemset:
                continue
            # Filter itemset by joined filters before descending into it
            itemset_str = ' '.join(names[i] for i in itemset)
            if (filterSearch and
                    (len(itemset) < itemsetMin or
                     itemsetMax < len(itemset) or
                     not isRegexMatch(itemset_str, itemset_str))):
                continue

            for rule in association_rules(itemsets, self.minConfidence / 100, itemset):
                left, right, support, confidence = rule

                if class_items and right - class_items:
                    continue
                if filterSearch and not isSizeMatch(len(left), len(right)):
                    continue
                left_str = ', '.join(names[i] for i in sorted(left))
                right_str = ', '.join(names[i] for i in sorted(right))
                if filterSearch and not isRegexMatch(left_str, right_str):
                    continue

                # All filters matched, calculate stats and add table row
                _, _, _, _, coverage, strength, lift, leverage = next(
                    rules_stats((rule,), itemsets, n_examples))

                support_item = NumericItem(support / n_examples)
                # Set row data on first column
                support_item.setData((itemset - class_items,
                                      class_items and (class_items & itemset).pop()),
                                     self.ROW_DATA_ROLE)
                left_item = StandardItem(left_str, len(left))
                left_item.setTextAlignment(Qt.AlignRight | Qt.AlignVCenter)
                model.appendRow([support_item,
                                 NumericItem(confidence),
                                 NumericItem(coverage),
                                 NumericItem(strength),
                                 NumericItem(lift),
                                 NumericItem(leverage),
                                 left_item,
                                 StandardItem('→'),
                                 StandardItem(right_str, len(right))])
                #~ scatter_agg[(round(support / n_examples, 2), round(confidence, 2))].append((left, right))
                nRules += 1
                progress.advance()
                if nRules >= self.maxRules:
                    break
            if nRules >= self.maxRules:
                break

    # Populate the TableView
    table = self.table
    table.setHidden(True)
    table.setSortingEnabled(False)
    proxy_model = self.proxy_model
    proxy_model.setSourceModel(model)
    table.setModel(proxy_model)
    for i in range(model.columnCount()):
        table.resizeColumnToContents(i)
    table.setSortingEnabled(True)
    table.setHidden(False)

    self.nRules = nRules
    self.nFilteredRules = proxy_model.rowCount()  # TODO: continue; also add in owitemsets
    self.nSelectedRules = 0
    self.nSelectedExamples = 0
    self._is_running = False
def doAnalysize(self, pd_data, category, supportRate=0.02, confidenceRate=0.5,
                savepath=r'C:\Users\Administrator\Desktop'):
    # Initialize the user-dictionary path
    savepath = savepath + "\\" + category
    if not os.path.exists(savepath):
        os.makedirs(savepath)
    initpath = "tmall\\spiders\\DataAnalysize\\jiebaInit\\" + category + ".txt"
    jieba.load_userdict(initpath)

    pd_data['ratecontent_list'] = pd_data.apply(
        lambda r: list(jieba.cut(r['rateContent'])), axis=1)
    aim_list = []
    with open(initpath, 'r', encoding="utf-8") as f:
        for line in f.readlines():
            aim_list.append(line.strip('\n'))
    pd_data['aim_list'] = pd_data.apply(
        lambda r: list(set(r['ratecontent_list']).intersection(set(aim_list))), axis=1)
    simple_aimdata = []
    pd_data.apply(lambda r: simple_aimdata.append(r['aim_list'])
                  if not r['aim_list'] == [] else 1, axis=1)
    wordcloudlist = []
    for item in simple_aimdata:
        for i in item:
            wordcloudlist.append(i)
    # Generate a word cloud for each analysis
    self.everyWordCloud(wordcloudlist, savepath)

    # The two steps above yield the target list: simple_aimdata
    strSet = set(functools.reduce(lambda a, b: a + b, simple_aimdata))
    strEncode = dict(zip(strSet, range(len(strSet))))  # encoding dict, e.g. {'甜腻': 6, '鱼腥味': 53, ...}
    strDecode = dict(zip(strEncode.values(), strEncode.keys()))  # decoding dict, e.g. {6: '甜腻', 53: '鱼腥味', ...}
    listToAnalysis_int = [list(map(lambda item: strEncode[item], row))
                          for row in simple_aimdata]

    # Run the association analysis
    itemsets = dict(oaf.frequent_itemsets(listToAnalysis_int, supportRate))
    # print("itemsets : ")
    # print(itemsets)
    rules = list(oaf.association_rules(itemsets, confidenceRate))
    regularNum = len(rules)
    printRules = self.dealRules(rules, strDecode)  # print this variable to inspect the generated rules
    # print(printRules)
    # rules_stats() consumes the rules iterator, so pass the materialized list
    result = list(oaf.rules_stats(rules, itemsets, len(listToAnalysis_int)))
    # print(result)
    printResult = self.dealResult(result, strDecode)  # print this variable to inspect the results
    # print(printResult)

    ################################################# Save the rules to an Excel file
    dfToSave = self.ResultDFToSave(result, strDecode)
    saveRegularName = savepath + "\\" + str(supportRate) + '支持度_' + str(
        confidenceRate) + '置信度_产生了' + str(regularNum) + '条规则' + '.xlsx'
    dfToSave.to_excel(saveRegularName)
    # Save the itemsets to an Excel file
    self.saveItemSets(itemsets, strDecode, savepath)

    ####################################################### Count the rules produced at different support/confidence thresholds
    listTable = []
    supportRate = 0.01
    confidenceRate = 0.1
    for i in range(9):
        support = supportRate * (i + 1)
        listS = []
        for j in range(9):
            confidence = confidenceRate * (j + 1)
            itemsets = dict(oaf.frequent_itemsets(listToAnalysis_int, support))
            rules = list(oaf.association_rules(itemsets, confidence))
            listS.append(len(rules))
        listTable.append(listS)
    dfList = pd.DataFrame(listTable,
                          index=[supportRate * (i + 1) for i in range(9)],
                          columns=[confidenceRate * (i + 1) for i in range(9)])
    dfList.to_excel(savepath + "\\" + 'regularNum.xlsx')
def find_rules(self):
    if self.data is None or not len(self.data):
        return
    if self._is_running:
        self._is_running = False
        return
    self.button.button.setText('Cancel')
    self._is_running = True
    data = self.data
    self.table.model().clear()

    n_examples = len(data)
    NumericItem = self.NumericItem
    StandardItem = self.StandardItem
    filterSearch = self.filterSearch
    itemsetMin = self.filterAntecedentMin + self.filterConsequentMin
    itemsetMax = self.filterAntecedentMax + self.filterConsequentMax
    isSizeMatch = self.isSizeMatch
    isRegexMatch = self.isRegexMatch

    X, mapping = OneHot.encode(data, self.classify)
    self.Error.need_discrete_data.clear()
    if X is None:
        self.Error.need_discrete_data()

    self.onehot_mapping = mapping
    ITEM_FMT = '{}' if issparse(data.X) else '{}={}'
    names = {item: ('{}={}' if var is data.domain.class_var else ITEM_FMT).format(var.name, val)
             for item, var, val in OneHot.decode(mapping, data, mapping)}
    # Items that consequent must include if classifying
    class_items = {item
                   for item, var, val in OneHot.decode(mapping, data, mapping)
                   if var is data.domain.class_var} if self.classify else set()
    assert bool(class_items) == bool(self.classify)

    model = QStandardItemModel(self.table)
    for col, (label, _, tooltip) in enumerate(self.header):
        item = QStandardItem(label)
        item.setToolTip(tooltip)
        model.setHorizontalHeaderItem(col, item)

    # Find itemsets
    nRules = 0
    itemsets = {}
    ARROW_ITEM = StandardItem('→')
    ARROW_ITEM.setTextAlignment(Qt.AlignCenter)
    with self.progressBar(self.maxRules + 1) as progress:
        for itemset, support in frequent_itemsets(X, self.minSupport / 100):
            itemsets[itemset] = support
            if class_items and not class_items & itemset:
                continue
            # Filter itemset by joined filters before descending into it
            itemset_str = ' '.join(names[i] for i in itemset)
            if (filterSearch and
                    (len(itemset) < itemsetMin or
                     itemsetMax < len(itemset) or
                     not isRegexMatch(itemset_str, itemset_str))):
                continue

            for rule in association_rules(itemsets, self.minConfidence / 100, itemset):
                left, right, support, confidence = rule

                if class_items and right - class_items:
                    continue
                if filterSearch and not isSizeMatch(len(left), len(right)):
                    continue
                left_str = ', '.join(names[i] for i in sorted(left))
                right_str = ', '.join(names[i] for i in sorted(right))
                if filterSearch and not isRegexMatch(left_str, right_str):
                    continue

                # All filters matched, calculate stats and add table row
                _, _, _, _, coverage, strength, lift, leverage = next(
                    rules_stats((rule,), itemsets, n_examples))

                support_item = NumericItem(support / n_examples)
                # Set row data on first column
                support_item.setData((itemset - class_items,
                                      class_items and (class_items & itemset).pop()),
                                     self.ROW_DATA_ROLE)
                left_item = StandardItem(left_str, len(left))
                left_item.setTextAlignment(Qt.AlignRight | Qt.AlignVCenter)
                model.appendRow([support_item,
                                 NumericItem(confidence),
                                 NumericItem(coverage),
                                 NumericItem(strength),
                                 NumericItem(lift),
                                 NumericItem(leverage),
                                 left_item,
                                 ARROW_ITEM.clone(),
                                 StandardItem(right_str, len(right))])
                nRules += 1
                progress.advance()
                if not self._is_running or nRules >= self.maxRules:
                    break
            qApp.processEvents()
            if not self._is_running or nRules >= self.maxRules:
                break

    # Populate the TableView
    table = self.table
    table.setHidden(True)
    table.setSortingEnabled(False)
    proxy_model = self.proxy_model
    proxy_model.setSourceModel(model)
    table.setModel(proxy_model)
    for i in range(model.columnCount()):
        table.resizeColumnToContents(i)
    table.setSortingEnabled(True)
    table.setHidden(False)

    self.table_rules = proxy_model.get_data()
    if self.table_rules is not None:
        self.Outputs.rules.send(self.table_rules)
    self.button.button.setText('Find Rules')
    self.nRules = nRules
    self.nFilteredRules = proxy_model.rowCount()  # TODO: continue; also add in owitemsets
    self.nSelectedRules = 0
    self.nSelectedExamples = 0
    self._is_running = False
import os, os.path
import codecs
import EMRdef
import re

emrtxts = EMRdef.txttq(u'D:\DeepLearning ER\EHRryzd')  # collect the .txt files in the directory
hxjb = open(r'D:\python\EMR\hxjbml.txt', errors="ignore")  # respiratory-disease term list
hxjbdic = hxjb.readlines()  # read the lines
ryzd = []
for emrtxt in emrtxts:
    f = open(emrtxt, 'r', errors="ignore")  # errors="ignore" for the Chinese text
    emrpath = os.path.basename(emrtxt)
    emrpath = os.path.splitext(emrpath)[0]
    line_out = []
    for line in f.readlines():
        line = re.sub('\n', '', line)
        line = re.sub(r'(.+?)肺炎', '肺炎', line)  # normalize every "...肺炎" mention to 肺炎 (pneumonia)
        for hxjbc in hxjbdic:  # search for each disease term
            hxjbc = re.sub('\n', '', hxjbc)
            if line.find(hxjbc) > -1:
                line_out.append(line)
    line_output = EMRdef.delre(line_out)
    ryzd.append(line_out)
    # line = '\n'.join(line_output)
    # EMRdef.text_create(r'D:\DeepLearning ER\EHRryzd2', '.txt', emrpath, line)

import orangecontrib.associate.fpgrowth as oaf

often = dict(oaf.frequent_itemsets(ryzd, .01))  # frequent itemsets (minimum support 1%)
# The optional third argument of association_rules() must be a single frequent itemset
# (a frozenset), not the hxjbdic term list, so it is dropped here.
rules = oaf.association_rules(often, .5)  # minimum confidence threshold
rules = list(rules)
print(rules)
import os, os.path
import EMRdef
import re

emrtxts = EMRdef.txttq(u'D:\DeepLearning ER\EHRryzd')  # collect the .txt files in the directory
hxjb = open(r'D:\python\EMR\hxjbml.txt', errors="ignore")  # respiratory-disease term list
hxjbdic = hxjb.readlines()  # read the lines
ryzd = []
for emrtxt in emrtxts:
    f = open(emrtxt, 'r', errors="ignore")  # errors="ignore" for the Chinese text
    emrpath = os.path.basename(emrtxt)
    emrpath = os.path.splitext(emrpath)[0]
    line_out = []
    for line in f.readlines():
        line = re.sub('\n', '', line)
        line = re.sub(r'(.+?)肺炎', '肺炎', line)  # normalize every "...肺炎" mention to 肺炎 (pneumonia)
        for hxjbc in hxjbdic:  # search for each disease term
            hxjbc = re.sub('\n', '', hxjbc)
            if line.find(hxjbc) > -1:
                line_out.append(line)
    line_output = EMRdef.delre(line_out)
    ryzd.append(line_out)
    # line = '\n'.join(line_output)
    # EMRdef.text_create(r'D:\DeepLearning ER\EHRryzd2', '.txt', emrpath, line)

import orangecontrib.associate.fpgrowth as oaf

often = dict(oaf.frequent_itemsets(ryzd, .01))  # frequent itemsets (minimum support 1%)
# association_rules() takes the itemsets dict first and the confidence threshold second;
# the original call passed the arguments in the wrong order.
rules = oaf.association_rules(often, .5)
rules = list(rules)
print(rules)
import functools
import json

import pandas as pd
import orangecontrib.associate.fpgrowth as oaf


def associateRules(support=0.02, confidence=0.5):
    # NOTE: the defaults are overridden here and again below from Information.json
    support = 0.15
    confidence = 0.15
    try:
        with open('filelocation.json') as f_obj:
            fileInput = json.load(f_obj)
    except Exception:
        with open('errorFlag.json', 'w') as e_obj:
            json.dump("File open process failed", e_obj)
        return
    filename = fileInput
    dfar = pd.read_csv(filename)
    tag = list(dfar.columns.values)

    listToAnalysis = []  # final result
    for item in range(1, len(tag) - 1):  # iterate over the columns
        imax = max(list(dfar[tag[item]]))  # upper bound
        imin = min(list(dfar[tag[item]]))  # lower bound
        ijc = imax - imin  # range
        l = ijc / 4
        i1 = imin + l
        i2 = i1 + l
        i3 = i2 + l
        listToStore = []
        for i in range(dfar.shape[0]):
            s = dfar.iloc[i][tag[item]]
            if s >= i3 and s <= imax:
                ss = tag[item] + str(i3) + '-' + str(imax)
            elif s >= i2:
                ss = tag[item] + str(i2) + '-' + str(i3)
            elif s >= i1:
                ss = tag[item] + str(i1) + '-' + str(i2)
            elif s >= imin:
                ss = tag[item] + str(imin) + '-' + str(i1)
            listToStore.append(ss)
        listToAnalysis.append(listToStore.copy())

    # Transpose so that each row becomes one transaction
    listToAnalysis2 = []
    ll = len(listToAnalysis[0])
    for ii in range(ll):
        ltmp = []
        for it in listToAnalysis:
            ltmp.append(it[ii])
        listToAnalysis2.append(ltmp.copy())

    # Build the encoding and decoding dictionaries
    what = functools.reduce(lambda a, b: a + b, listToAnalysis2)
    strSet = set(what)
    zz = zip(strSet, range(len(strSet)))
    strEncode = dict(zz)  # encoding dict
    strDecode = dict(zip(strEncode.values(), strEncode.keys()))  # decoding dict
    listToAnalysis_int = [list(map(lambda item: strEncode[item], row))
                          for row in listToAnalysis2]

    with open('Information.json') as obj:
        infostring = json.load(obj)
    inforlist = infostring.split(' ')
    confidence = float(inforlist[0]) / float(100)
    support = float(inforlist[1]) / float(100)

    itemsets = dict(oaf.frequent_itemsets(listToAnalysis_int, support))  # frequent itemsets
    rules = list(oaf.association_rules(itemsets, confidence))  # association rules
    regularNum = len(rules)
    # printRules = dealResult(result, strDecode)
    # print("You will get ")
    # print(regularNum)
    # print("association rules when\n" + "SupportRate = ", end='')
    # print(support, end='')
    # print("ConfidenceRate = " + str(confidence))
    informationBack = "You will get " + str(regularNum) + " association rules when\n" \
                      + "SupportRate = " + str(support) + " ConfidenceRate = " + str(confidence)
    with open('InformationBack.json', 'w') as inf:
        json.dump(informationBack, inf)

    result = list(oaf.rules_stats(rules, itemsets, len(listToAnalysis_int)))
    # ResultDFToSave is assumed to be defined elsewhere (see the other snippets in this collection)
    dfToSave = ResultDFToSave(result, strDecode)
    with open('arInteractiveText.json', 'w') as ij:
        json.dump(str(dfToSave), ij)
    saveRegularName = "Processed.xlsx"
    dfToSave.to_excel(saveRegularName)
    return regularNum
print("FREQUENT PATTERN WITH MORE THAN 1 ITEM")
counter = 1
frequentItemSet.sort(key=lambda x: -x[1])  # sort itemsets by descending support
for itemSet in frequentItemSet:
    itemSet_list = list(itemSet[0])
    if len(itemSet_list) > 1:
        print("[" + str(counter) + "]")
        for item in itemSet_list:
            print(dictItemToDescription[dictKeyToItem[item]])
        print("Minimal Support = " + str(itemSet[1]))
        counter += 1
    if counter > 15:
        break

# search for association rules
associationRuleItemList = or3.association_rules(
    dict(or3.frequent_itemsets(ItemList, 0.02)), 0.001)
rules = list(associationRuleItemList)
rules.sort(key=lambda x: -x[3])  # sort rules by descending confidence (index 3)

print("10 ASSOCIATION RULE WITH GREATER SUPPORT")
counter = 1
for rule in rules:
    rule_list = list(rule)
    print("[" + str(counter) + "]")
    # NOTE: only the first item of the antecedent and of the consequent is printed
    print(str(dictItemToDescription[dictKeyToItem[list(rule[0])[0]]]) + " => " +
          str(dictItemToDescription[dictKeyToItem[list(rule[1])[0]]]))
    print("Minimal Support = " + str(rule[2]))
    print("Confidence = " + str(rule[3]))
    counter += 1
def asso_analysis(path, file_path):
    if not os.path.exists(root + '/asso_analysis/'):
        os.mkdir(root + '/asso_analysis/')
    if not os.path.exists(root + '/asso_analysis/err_label_clean.csv'):
        data = pd.read_csv(path + file_path, encoding='utf-8')
        room_list = data['PAR_ROOM'].unique().tolist()
        room_type_map = {}
        # Build a mapping from machine-room type to the room IDs of that type
        for r in room_list:
            r_info = data[data['PAR_ROOM'] == r].dropna(axis=1)
            if str(r_info.columns.tolist()) not in room_type_map.keys():
                room_type_map[str(r_info.columns.tolist())] = [r]
            else:
                room_type_map[str(r_info.columns.tolist())].append(r)
        # Build the labelled data
        label_df = pd.DataFrame()
        for k in room_type_map.keys():
            same_type_room = room_type_map.get(k)
            df = pd.DataFrame()
            for r in same_type_room:
                df = df.append(data[data['PAR_ROOM'] == r], ignore_index=True)
            df.dropna(axis=1, inplace=True)
            col = [c for c in df.columns.tolist()
                   if c not in ['TIME', 'PAR_ROOM', 'ALARM_CAUSE']]
            cur_label = pd.DataFrame()
            cur_label['before_err'] = (list(range(24, 0, -1)) * int(df.shape[0] / 24)
                                       + list(range(24, 24 - int(df.shape[0] % 24), -1)))
            cur_label['err_feature'] = to_label(col, df, cur_label['before_err'].values.tolist())
            cur_label['ALARM_CAUSE'] = df['ALARM_CAUSE']
            label_df = label_df.append(cur_label)
        label_df.dropna(inplace=True)
        label_df.to_csv(root + '/asso_analysis/err_label_clean.csv', index=False, encoding='utf-8')

    cur_cate = pd.read_csv(root + '/asso_analysis/err_label_clean.csv',
                           encoding='utf-8', low_memory=False)
    cur_cate.dropna(inplace=True)
    cate_dict = {'R_LOS': 161, 'NE_NOT_LOGIN': 161, 'High Temperature': 161, 'NE_COMMU_BREAK': 161,
                 'lossOfSignal': 161, 'R_LOF': 161, 'IN_PWR_HIGH': 161, 'POWERALM': 161,
                 'HARD_BAD': 161, 'NE_Backup_Failed': 161, 'Comms fail alarm': 161, 'FCS_ERR': 161,
                 'LSR_NO_FITED': 161, 'PKG_FAIL': 161, 'IN_PWR_FAIL': 161, 'BUS_ERR': 161,
                 'PLUGGABLE_TRANSCEIVER_DISMOUNT': 161, 'R_OOF': 161, 'PWR_MAJ_ALM': 161,
                 'Client Service Mismatch': 161, 'UNKNOWN_CARD': 161, 'OS-Optical_Power_High': 161,
                 'GNE_CONNECT_FAIL': 161,
                 'Replaceable Unit Problem': 162, 'Loss Of Signal': 162, 'LOS': 162, 'LOF': 162,
                 'IN_PWR_ABN': 162, 'OUT_PWR_ABN': 162, 'Underlying Resource Unavailable': 162,
                 'Loss Of Frame': 162, 'ME loss of communication': 162, 'COMMUN_FAIL': 162,
                 'TEMP_OVER': 162, 'BD_STATUS': 162, 'SUBCARD_ABN': 162, 'POWER_FAIL': 162,
                 'Duplicate Shelf Detected': 162, 'NE_DATA_INCONSISTENCY': 162, 'SYSBUS_FAIL': 162,
                 'SHELF_ABSENCE': 162, 'ABSENCE_WARNING': 162, 'POWER_ABNORMAL': 162,
                 'Bipolar Violations': 162, 'Transmitter Failure': 162, 'CHIP_FAIL': 162,
                 'BUS_ERROR': 162, 'LAPS_FAIL': 162,
                 'Degraded Signal': 163, 'Signal Degrade': 163, 'Internal Communication Problem': 163,
                 'RDI': 163, 'cntrlBusFail': 163, 'BD_NOT_INSTALLED': 163, 'FAN_FAIL': 163,
                 'SYN_BAD': 163, 'Circuit Pack Mismatch': 163, 'Fan Failed': 163,
                 'Replaceable Unit Missing': 163, 'Fuse Failure': 163, 'Battery Failure': 163,
                 'Temperature Out Of Range': 163, 'Power Failure - B': 163,
                 'Database Save and Restore Failed': 163, 'Cooling Fan Failure': 163,
                 'MIB backup misaligned': 164, 'Inside Failure': 164,
                 'Sfwr Environment Problem': 164, 'HouseKeeping': 164}
    err_type = ['161', '162', '163', '164']
    # err_type = cur_cate['ALARM_CAUSE'].unique().tolist()
    cur_cate['ALARM_CAUSE'] = cur_cate['ALARM_CAUSE'].apply(
        lambda x: str(cate_dict[x]) if x in cate_dict.keys() else "-1")
    cur_cate['err_feature'] = cur_cate['err_feature'].apply(lambda x: x.split("|"))
    err_feature = []
    last_before_err = 24
    items_dict = {'161': [], '162': [], '163': [], '164': []}
    for index, row in cur_cate.iterrows():
        err_feature.append(row['err_feature'])
        if last_before_err < row['before_err'] or index == cur_cate.shape[0] - 1:
            cause = cur_cate.loc[index - 1, 'ALARM_CAUSE']
            items_dict[cause] += err_feature
            err_feature.clear()
        last_before_err = row['before_err']

    d_itemsets = {}
    for c in err_type:
        # Frequent itemsets
        each_itemsets = dict(oaf.frequent_itemsets(items_dict[c], 0.0125))
        total = 0
        # Association rules
        for k in each_itemsets.keys():
            s = set(k)
            s.add(c)
            d_itemsets[frozenset(s)] = each_itemsets[k]
            if k not in d_itemsets:
                d_itemsets[k] = each_itemsets[k]
            else:
                d_itemsets[k] += each_itemsets[k]
            total += each_itemsets[k]
        d_itemsets[frozenset([c])] = total
    rules = list(oaf.association_rules(d_itemsets, 0.7))
    cur_result = pd.DataFrame(rule_process(rules, err_type), columns=('规则', '置信度'))
    cur_result.to_csv(root + '/asso_analysis/associate_analysis.csv',
                      encoding='utf-8', header=True, index=False)
import pandas
import pyodbc
from sklearn.preprocessing import MultiLabelBinarizer
from orangecontrib.associate.fpgrowth import frequent_itemsets, association_rules

suppParam = 0.1
confParam = 0.7

_conn = pyodbc.connect(
    "DRIVER={SQL Server};SERVER=(local)\sql2017;Database=PythonDemo;Trusted_Connection=yes;")
_sql = "SELECT [Departments] as [Values] FROM [dbo].[CombinedSets] WHERE StoreCode=20"
InputDataSet = pandas.read_sql_query(sql=_sql, con=_conn)

mlb = MultiLabelBinarizer(sparse_output=True)
X = mlb.fit_transform(InputDataSet["Values"].str.split(r",\s*")) > 0
classes = mlb.classes_

itemsets = dict(frequent_itemsets(X, suppParam))
rules = [[", ".join(classes[i] for i in P), classes[next(iter(Q))], supp, conf]
         for P, Q, supp, conf in association_rules(itemsets, confParam)]

OutputDataSet = pandas.DataFrame(rules, columns=["ante", "cons", "supp", "conf"])
rows = len(InputDataSet)
OutputDataSet["suppPCT"] = pandas.Series([(i / rows) for i in OutputDataSet["supp"]], dtype="float")
# sort_values returns a new DataFrame, so assign the result back
OutputDataSet = OutputDataSet.sort_values(["conf"], ascending=False)
print(OutputDataSet)
transaction.append(tag_numbers)
# print(transaction)

# df2 = pd.read_csv("/Users/ahmaddorri/Desktop/tag recomendation/data/mixed/youtube.words", header=None, sep=" ")
# print(df2.head())

sampleTransaction = np.random.choice(transaction, size=2000, replace=False).tolist()
# print(sampleTransaction)

import orangecontrib.associate.fpgrowth as org

T = [["unicef", "child", "united", "nation"],
     ["education", "child", "game", "math"],
     ["unicef", "education", "child", "job"]]
# freq_item = org.frequent_itemsets(T, 2)
itemsets = dict(org.frequent_itemsets(T, 1))
# print(list(freq_item))
print(itemsets)
print(len(itemsets))

rules = org.association_rules(itemsets, min_confidence=0.49)
rules = list(rules)
for r in rules:
    print(r)
    if "unicef" in r[0]:
        print(r[0])
import os, os.path
import codecs
import EMRdef
import re

emrtxts = EMRdef.txttq(u'D:\DeepLearning ER\EHRryzd')  # collect the .txt files in the directory
hxjb = open(r'D:\python\EMR\hxjbml.txt', errors="ignore")  # respiratory-disease term list
hxjbdic = hxjb.readlines()  # read the lines
ryzd = []
for emrtxt in emrtxts:
    f = open(emrtxt, 'r', errors="ignore")  # errors="ignore" for the Chinese text
    emrpath = os.path.basename(emrtxt)
    emrpath = os.path.splitext(emrpath)[0]
    line_out = []
    for line in f.readlines():
        line = re.sub('\n', '', line)
        line = re.sub(r'(.+?)肺炎', '肺炎', line)  # normalize every "...肺炎" mention to 肺炎 (pneumonia)
        for hxjbc in hxjbdic:  # search for each disease term
            hxjbc = re.sub('\n', '', hxjbc)
            if line.find(hxjbc) > -1:
                line_out.append(line)
    line_output = EMRdef.delre(line_out)
    ryzd.append(line_out)
    # line = '\n'.join(line_output)
    # EMRdef.text_create(r'D:\DeepLearning ER\EHRryzd2', '.txt', emrpath, line)

import orangecontrib.associate.fpgrowth as oaf

often = dict(oaf.frequent_itemsets(ryzd, .01))  # frequent itemsets (minimum support 1%)
# Minimum confidence threshold; only rules derived from the 肺炎 itemset are generated
rules = oaf.association_rules(often, .01, frozenset({'肺炎'}))
rules = list(rules)
print(rules)
import sys
import os, os.path, shutil
import codecs
import EMRdef
import re

# Keyword extraction; the keyword is 诊疗计划 (treatment plan)
emrtxts = EMRdef.txttq(u'D:\DeepLearning ER\EHR')  # collect the .txt files in the directory
pattern2 = r'。|:|“|”|;|,'  # split on punctuation
tgjc = []
for emrtxt in emrtxts:
    f = open(emrtxt, 'r', errors="ignore")  # errors="ignore" for the Chinese text
    emrtxt = os.path.basename(emrtxt)
    emrtxt_str = re.findall(r'(^.+?)\_', emrtxt)  # extract the ID
    emrtxt = "".join(emrtxt_str)  # join into a str
    # txtp = txtp.decode('utf-8')
    for line in f.readlines():
        line = re.sub(' ', '', line)  # remove spaces
        if line.find(u'体格检查') > -1:
            line = re.sub('体格检查:', '', line)
            f2_end = re.split(pattern2, line)
            tgjc.append(f2_end)
            f2_out = "\n".join(f2_end)  # join into a str
            # EMRdef.text_create(r'D:\DeepLearning ER\EHRtigejiancha', '.txt', emrtxt, f2_out)  # export
            # zljhs.append(emrtxt + ':' + line)
# EMRdef.text_save('D:\python\EMR\zljh.txt', zljhs)

'''------------------------------------------------------------------------------------------------------------'''

# Association-rule mining
import orangecontrib.associate.fpgrowth as oaf

# association_rules() expects the itemsets dict produced by frequent_itemsets(), not the
# raw transactions; the 0.05 support threshold below is an assumed placeholder.
itemsets = dict(oaf.frequent_itemsets(tgjc, 0.05))
rules = list(oaf.association_rules(itemsets, 0.2))  # minimum confidence threshold
def model(data, support=0.05, confidence=0.2):
    # Mine frequent itemsets at the given minimum support,
    # then derive association rules at the given minimum confidence.
    fre_ite = dict(oaf.frequent_itemsets(data, support))
    rules = oaf.association_rules(fre_ite, confidence)
    result = list(rules)
    return result
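# A minimal usage sketch for the model() helper above, assuming `oaf` is
# orangecontrib.associate.fpgrowth; the toy transactions are made up for illustration.
import orangecontrib.associate.fpgrowth as oaf

transactions = [
    ['milk', 'bread', 'butter'],
    ['milk', 'bread'],
    ['bread', 'butter'],
    ['milk', 'butter'],
]
# Each rule is (antecedent, consequent, support count, confidence).
for antecedent, consequent, support, confidence in model(transactions, support=0.5, confidence=0.6):
    print(set(antecedent), '=>', set(consequent), support, confidence)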
        temList.append(i[3])
        temList.append(i[4])
        temList.append(i[5])
        temList.append(i[6])
        temList.append(i[7])
        returnRules.append(temList)
    return pd.DataFrame(returnRules,
                        columns=('规则', '项集出现数目', '置信度', '支持度', '力度', '提升度', '利用度'))


if __name__ == '__main__':
    supportRate = 0.004
    confidenceRate = 0.6
    itemsets = dict(oaf.frequent_itemsets(ryzd, supportRate))
    rules = list(oaf.association_rules(itemsets, confidenceRate))
    regularNum = len(rules)
    printRules = dealRules(rules)
    # rules_stats() consumes the rules iterator, so pass the materialized list
    result = list(oaf.rules_stats(rules, itemsets, len(ryzd)))
    printResult = dealResult(result)

    #################################################
    # Save the results to an Excel file
    dfToSave = ResultDFToSave(result)
    dfToSave.to_excel(r'C:\Users\Administrator\Desktop\2.xlsx')

    #######################################################
    # Count the rules produced at different support/confidence thresholds
import numpy as np
import orangecontrib.associate.fpgrowth as ofpg

# So the items '4' and '25' (fifth and twenty-sixth columns of X) are the only items
# (and itemsets) that appear 10 or more times. Let's check this:
print((X.sum(axis=0) >= 10).nonzero()[1])
# Conclusion: given databases of uniformly distributed random data, there's not much to work with.

# Examples with rules
np.random.seed(0)
N = 100
X = np.random.random((N, 100)) > .9
# Find all itemsets with at least 5% support:
itemsets = dict(ofpg.frequent_itemsets(X, .05))
# Generate all association rules from these itemsets with minimum 50% confidence:
rules = ofpg.association_rules(itemsets, .5)
rules = list(rules)
# Or only the rules for a particular itemset:
print(list(ofpg.association_rules(itemsets, .3, frozenset({75, 98}))))

# Examples of additional stats for rules generated by association_rules()
N = 30
X = np.random.random((N, 50)) > .9
itemsets = dict(ofpg.frequent_itemsets(X, .1))
rules = ofpg.association_rules(itemsets, .6)
print(list(ofpg.rules_stats(rules, itemsets, N)))
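# rules_stats() extends each rule with coverage, strength, lift and leverage, in the same
# field order that the widget code above unpacks; a small sketch naming those fields
# (reusing the `itemsets` and `N` from the last example, with a fresh rules generator):
for ante, cons, supp, conf, coverage, strength, lift, leverage in ofpg.rules_stats(
        ofpg.association_rules(itemsets, .6), itemsets, N):
    print(sorted(ante), '->', sorted(cons),
          'supp=%d conf=%.2f lift=%.2f leverage=%.4f' % (supp, conf, lift, leverage))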