def findKeyAttrs(samples, protect_attr='', result_attr='class'):
    """Find the attributes that share an edge with ``result_attr``.

    A py-causal search is run over ``samples`` (FGES with the BDeu score;
    the ``bayesEst`` alternative is kept but disabled by the local
    ``causal`` switch). Every attribute connected to ``result_attr`` by an
    edge, whatever its direction, is collected as a key attribute.

    Args:
        samples (pandas DataFrame): data handed to the search; the FGES run
            is configured with ``dataType='discrete'``.
        protect_attr (str or iterable of str): attribute name(s) removed
            from the result. ``None`` is treated as "nothing to remove".
        result_attr (str): name of the outcome attribute.

    Returns:
        list of str: key attributes in the order their edges were reported,
        with the protected attributes removed.
    """
    import re

    # The VM has to be started before pycausal.search is imported.
    from pycausal.pycausal import pycausal as pc
    vm = pc()
    vm.start_vm()
    from pycausal import search as s

    # choose a causal mining algorithm
    causal = 'fges'
    if causal == 'bayes':
        # bayesEst: somewhat slow, extracts more key attributes
        graph = s.bayesEst(samples, depth=0, alpha=0.05, verbose=True)
    else:
        # Fast Greedy Equivalence Search: faster, yields fewer key attributes
        graph = s.tetradrunner()
        graph.getAlgorithmParameters(algoId='fges', scoreId='bdeu')
        graph.run(
            algoId='fges',
            dfs=samples,
            scoreId='bdeu',
            priorKnowledge=None,
            dataType='discrete',
            structurePrior=0.5,
            samplePrior=0.5,
            maxDegree=5,
            faithfulnessAssumed=True,
            verbose=False,
        )

    edges = graph.getEdges()
    print('edges', edges)

    # An edge is rendered as "<node> <connector> <node>", the connector being
    # a three character arrow such as -->, ---, --o, o-> or <->.
    # NOTE(review): edges carrying trailing annotations are skipped - confirm
    # that neither search used here emits them.
    connector = re.compile(r'\s+[<o-]-[>o-]\s+')

    key_attrs = []
    for edge in edges:
        endpoints = connector.split(edge.strip(), maxsplit=1)
        if len(endpoints) != 2:
            continue
        left, right = (name.strip() for name in endpoints)
        # Compare whole node names so that e.g. "subclass" is neither
        # mistaken for, nor mangled by, result_attr == "class".
        if left == result_attr:
            key_attrs.append(right)
        elif right == result_attr:
            key_attrs.append(left)

    # remove protect attrs
    if protect_attr is None:
        protected = set()
    elif isinstance(protect_attr, str):
        protected = {protect_attr}
    else:
        protected = set(protect_attr)
    key_attrs = [attr for attr in key_attrs if attr not in protected]

    print('key attributes', key_attrs)
    return key_attrs
def py_causal(
        sp_pro_deg_v,
        knowledge_path,
        bp=False,
        dataType="discrete",
        algoId="fges",
        scoreId="disc-bic-score",
        structurePrior=1.0,
        samplePrior=1.0,
        maxDegree=20,
        faithfulnessAssumed=True,
        symmetricFirstStep=True,
        verbose=True,
        numberResampling=100,
        percentResampleSize=90,
        resamplingEnsemble=1,
        addOriginalDataset=True,
        resamplingWithReplacement=True):
    """Run a py-causal tetradrunner search with knowledge read from a file.

    Args:
        sp_pro_deg_v: input data; a deep copy is searched so the caller's
            object is left untouched. Per the original notes this is a 2-D
            df indexed by sample ID whose columns are SGA and DEG names and
            whose values are 0 or 1.
        knowledge_path: path given to ``prior.knowledgeFromFile``.
        bp: when equal to ``True`` the resampling (bootstrap) parameters
            are forwarded to the search as well. Default is False.
        dataType, algoId, scoreId, structurePrior, samplePrior, maxDegree,
        faithfulnessAssumed, symmetricFirstStep, verbose: forwarded
            unchanged to ``tetradrunner.run``.
        numberResampling, percentResampleSize, resamplingEnsemble,
        addOriginalDataset, resamplingWithReplacement: forwarded only when
            bootstrapping is requested.

    Returns:
        node_l: result of ``tetrad.getNodes()``.
        edge_l: result of ``tetrad.getEdges()``.
        bic: result of ``tetrad.getTetradGraph()``; per the original notes
            it contains nodes, edges and the network score.
    """
    # work on a private copy of the input
    data = deepcopy(sp_pro_deg_v)

    # connect to java
    from pycausal.pycausal import pycausal as pc
    vm = pc()
    vm.start_vm(java_max_heap_size="1000M")

    # generate knowledge
    from pycausal import prior as p
    knowledge = p.knowledgeFromFile(knowledge_path)

    # search
    from pycausal import search as s
    tetrad = s.tetradrunner()

    run_kwargs = dict(
        dfs=data,
        priorKnowledge=knowledge,
        dataType=dataType,
        algoId=algoId,
        scoreId=scoreId,
        structurePrior=structurePrior,
        samplePrior=samplePrior,
        maxDegree=maxDegree,
        faithfulnessAssumed=faithfulnessAssumed,
        symmetricFirstStep=symmetricFirstStep,
        verbose=verbose,
    )
    # The equality test (rather than truthiness) is intentional: it keeps
    # the exact set of ``bp`` values that switch bootstrapping on.
    if bp == True:
        run_kwargs.update(
            numberResampling=numberResampling,
            percentResampleSize=percentResampleSize,
            resamplingEnsemble=resamplingEnsemble,
            addOriginalDataset=addOriginalDataset,
            resamplingWithReplacement=resamplingWithReplacement,
        )
    tetrad.run(**run_kwargs)

    return tetrad.getNodes(), tetrad.getEdges(), tetrad.getTetradGraph()
def init_causal_graph_dot_src(self, df, forbidden_edges, required_edges):
    """Learn a graph from ``df`` with FGES and store it on the instance.

    The search uses the 'sem-bic' score on continuous data, constrained by
    the given forbidden and required directed edges.

    Sets ``self.edges``, ``self.nodes``, ``self.dot_src_lines``,
    ``self.dot_src``, ``self.init_dot_src`` and ``self.uncolored_dot_src``.

    Args:
        df: data handed to ``tetradrunner.run`` as ``dfs``.
        forbidden_edges: passed to ``pr.knowledge`` as ``forbiddirect``.
        required_edges: passed to ``pr.knowledge`` as ``requiredirect``.
    """
    vm = pc()
    vm.start_vm()

    runner = s.tetradrunner()
    knowledge = pr.knowledge(
        forbiddirect=forbidden_edges,
        requiredirect=required_edges,
    )
    runner.run(
        algoId='fges',
        dfs=df,
        priorKnowledge=knowledge,
        scoreId='sem-bic',
        dataType='continuous',
        penaltyDiscount=2,
        maxDegree=-1,
        faithfulnessAssumed=True,
        verbose=True,
    )

    tetrad_graph = runner.getTetradGraph()
    raw_dot_src = vm.tetradGraphToDot(tetrad_graph)
    # The VM is left running on purpose; no stop_vm() call is made here.

    self.edges = runner.getEdges()
    self.nodes = runner.getNodes()

    # Normalise the DOT text: trim it, split it into lines, join it again.
    trimmed_dot_src = self.trim_init_src_string(raw_dot_src)
    self.dot_src_lines = self.dot_src_to_lines(trimmed_dot_src)
    self.dot_src = self.lines_to_dot_src(self.dot_src_lines)

    # The initial and the uncolored sources both start out as the current one.
    self.init_dot_src = self.dot_src
    self.uncolored_dot_src = self.init_dot_src
def learn_fci(self, df, tabu_edges):
    """Learn a causal graph from ``df`` with the FCI algorithm.

    The search uses the 'fisher-z-test' independence test. ``tabu_edges``
    are turned into forbidden directed edges and handed to the search as
    prior knowledge.

    Args:
        df: data handed to ``tetradrunner.run`` as ``dfs``.
        tabu_edges: iterable of edges to forbid, each converted with
            ``list(...)`` - presumably (source, target) pairs; verify
            against the caller. ``None`` means no forbidden edges.

    Returns:
        The edges reported by ``tetrad.getEdges()``.
    """
    from pycausal.pycausal import pycausal as pc
    from pycausal import search as s
    from pycausal import prior as p

    vm = pc()
    vm.start_vm()
    try:
        forbid = [] if tabu_edges is None else [list(edge) for edge in tabu_edges]
        prior = p.knowledge(forbiddirect=forbid)

        tetrad = s.tetradrunner()
        tetrad.getAlgorithmParameters(algoId='fci', testId='fisher-z-test')
        tetrad.run(
            algoId='fci',
            dfs=df,
            testId='fisher-z-test',
            # Without this the forbidden edges would never reach the search.
            priorKnowledge=prior,
            depth=-1,
            maxPathLength=-1,
            completeRuleSetUsed=False,
            verbose=False,
        )
        return tetrad.getEdges()
    finally:
        # Stop the VM on failure as well as on success.
        vm.stop_vm()
#!/usr/local/bin/python
# Example script: run FGES with the BDeu score on the audiology data and
# write the learned graph to 'fges-discrete.svg'.

import os
import pandas as pd
import pydot
from IPython.display import SVG

# The tab-separated input is read from <current working dir>/data/audiology.txt.
data_dir = os.path.join(os.getcwd(), 'data', 'audiology.txt')
df = pd.read_table(data_dir, sep="\t")

# The VM is started first; pycausal.search is imported only afterwards.
from pycausal.pycausal import pycausal as pc
pc = pc()
pc.start_vm(java_max_heap_size='100M')

from pycausal import search as s
tetrad = s.tetradrunner()
tetrad.run(
    algoId='fges',
    dfs=df,
    scoreId='bdeu',
    dataType='discrete',
    structurePrior=1.0,
    samplePrior=1.0,
    maxDegree=3,
    faithfulnessAssumed=True,
    verbose=True,
)

# The results are queried but not stored.
tetrad.getNodes()
tetrad.getEdges()

# Render the graph object returned by the runner to an SVG file.
dot = tetrad.getDot()
dot.write_svg('fges-discrete.svg')
#!/usr/local/bin/python
# Example script: run FGES with the BDeu score on the audiology data,
# convert the Tetrad graph to DOT text and write it to 'fges-discrete.svg'.

import os
import pandas as pd
import pydot
from IPython.display import SVG

# The tab-separated input is read from <current working dir>/data/audiology.txt.
data_dir = os.path.join(os.getcwd(), 'data', 'audiology.txt')
df = pd.read_table(data_dir, sep="\t")

# The VM is started first; pycausal.search is imported only afterwards.
from pycausal.pycausal import pycausal as pc
pc = pc()
pc.start_vm(java_max_heap_size='100M')

from pycausal import search as s
tetrad = s.tetradrunner()
tetrad.run(
    algoId='fges',
    dfs=df,
    scoreId='bdeu',
    dataType='discrete',
    structurePrior=1.0,
    samplePrior=1.0,
    maxDegree=3,
    faithfulnessAssumed=True,
    verbose=True,
)

# The results are queried but not stored.
tetrad.getNodes()
tetrad.getEdges()

# Tetrad graph -> DOT text -> pydot graphs; the first graph is written out.
dot_str = pc.tetradGraphToDot(tetrad.getTetradGraph())
graphs = pydot.graph_from_dot_data(dot_str)
graphs[0].write_svg('fges-discrete.svg')

# Shut the VM down once the output has been written.
pc.stop_vm()
def fges_stem(file_path, sys_iter, SGA_l, A_D):
    """Alternate an FGES search and the external TDI step for ``sys_iter`` rounds.

    Round ``i`` works in ``<file_path>/Output/run<i>``:

    1. Until ``completeMatrixn.csv`` exists there, read
       ``completeMatrix.csv``, run FGES (score 'bdeu-score', discrete data)
       under prior knowledge, then write ``Edge.csv`` and
       ``completeMatrixn.csv`` and append the printed graph to ``BIC.txt``.
    2. Collect the negative BIC values found in ``BIC.txt``.
    3. Create ``run<i+1>`` and keep launching
       ``./MCMC/inferSGAInNetwork_TDI.exe`` (pausing 20 s after each launch)
       until ``completeMatrix.csv`` shows up in it.
    4. Write every BIC value collected so far to ``Output/BIC.csv``.

    NOTE: step 3 never terminates if the executable fails to produce its
    output file.

    Args:
        file_path: root directory holding the ``Input`` and ``Output`` trees.
        sys_iter: number of rounds to run.
        SGA_l: list of cause (SGA) column names; every other column of the
            matrix is treated as a DEG.
        A_D: table indexed with the DEG names (``A_D[DEG_l]``) and passed on
            to ``create_knowledge`` - presumably a DataFrame; confirm with
            the caller.

    Returns:
        None. All results are written to disk.
    """
    BIC_l = [float(0)]
    SGA = pd.DataFrame(SGA_l)
    SGA.columns = ['cause gene name']
    A_D_i = A_D

    for i in range(sys_iter):
        print(i)
        run_dir = file_path + '/Output/run%i' % i
        bic_path = run_dir + '/BIC.txt'

        file_l = os.listdir(run_dir)
        while 'completeMatrixn.csv' not in file_l:
            df = pd.read_csv(run_dir + '/completeMatrix.csv', header=0, index_col=None)

            # The VM is started before the other pycausal modules are imported.
            from pycausal.pycausal import pycausal as pc
            vm = pc()
            vm.start_vm(java_max_heap_size='6400M')
            from pycausal import prior as p

            # Build the knowledge from the DEG and SGA lists.
            DEG_l = [x for x in df.columns if x not in SGA_l]
            A_D_i = A_D_i[DEG_l]
            forbid = create_knowledge(SGA, SGA_l, A_D_i)
            temporal = [SGA_l, p.ForbiddenWithin(DEG_l)]
            prior = p.knowledge(forbiddirect=forbid, addtemporal=temporal)

            from pycausal import search as s
            tetrad = s.tetradrunner()
            tetrad.getAlgorithmParameters(algoId='fges', scoreId='bdeu-score')
            tetrad.run(
                algoId='fges',
                dfs=df,
                scoreId='bdeu-score',
                priorKnowledge=prior,
                dataType='discrete',
                structurePrior=1.0,
                samplePrior=1.0,
                maxDegree=100,
                faithfulnessAssumed=True,
                verbose=True,
                symmetricFirstStep=True,
            )

            node_l = tetrad.getNodes()
            edge_l = tetrad.getEdges()

            # save Edge.csv: tokens 0 and 2 of "<node> <connector> <node>"
            # are the two endpoints; all edge kinds are kept.
            edge_split_l = [edge.split(' ') for edge in edge_l]
            edge_df = pd.DataFrame(edge_split_l).iloc[:, [0, 2]]
            edge_df.to_csv(run_dir + '/Edge.csv', index=False, header=False)

            # save completeMatrixn.csv: only the columns that are graph nodes
            new_df = df.loc[:, node_l]
            new_df.to_csv(run_dir + '/completeMatrixn.csv', index=False, header=True)

            # save BIC.txt; closing the file here guarantees the text is on
            # disk before it is read back below.
            with open(bic_path, 'a') as bic_out:
                print(tetrad.getTetradGraph(), file=bic_out)

            file_l = os.listdir(run_dir)

        # Collect the BIC values, which are used to verify convergence.
        with open(bic_path, 'r') as BIC_txt:
            for line in BIC_txt:
                if 'BIC: -' in line:
                    # strip() instead of dropping the last character, so a
                    # final line without a newline keeps its last digit.
                    BIC_l.append(float(line[5:].strip()))

        j = i + 1
        next_dir = file_path + '/Output/run%d' % j
        mk_dir(next_dir)
        next_file_l = os.listdir(next_dir)
        while 'completeMatrix.csv' not in next_file_l:
            exe_path = './MCMC/inferSGAInNetwork_TDI.exe'
            m_path = ' -m ' + run_dir + '/completeMatrixn.csv'
            i_path = ' -i ' + file_path + '/Input/S_A0.csv'
            e_path = ' -e ' + run_dir + '/Edge.csv'
            o_path = ' -o ' + file_path + '/Output/run%d/ -x 50' % j
            combine = exe_path + m_path + i_path + e_path + o_path
            os.system(combine)
            time.sleep(20)
            next_file_l = os.listdir(next_dir)

        pd.DataFrame(BIC_l).to_csv(file_path + '/Output/BIC.csv', index=False, header=False)