예제 #1
0
def findKeyAttrs(samples, protect_attr='', result_attr='class', causal='fges'):
    """Return the attributes that share an edge with ``result_attr`` in a learned causal graph.

    A graph is learned from ``samples`` with py-causal, every edge that has
    ``result_attr`` as one of its two endpoints is collected, and the
    attribute at the other endpoint is reported unless it is protected.

    Args:
        samples (pandas.DataFrame): data handed to the causal search; the
            FGES run treats it as discrete data (``dataType='discrete'``).
        protect_attr (str | iterable of str): attribute name(s) that must not
            appear in the result. The default ``''`` protects nothing.
        result_attr (str): name of the decision attribute whose neighbours
            are wanted.
        causal (str): ``'bayes'`` selects ``bayesEst`` (per the original
            author: slower, yields more key attributes); any other value
            selects FGES with the BDeu score (faster, fewer key attributes).

    Returns:
        list[str]: key attributes that are directly connected to
        ``result_attr``, in the order the edges were reported.
    """
    from pycausal.pycausal import pycausal as pycausal_cls
    jvm = pycausal_cls()
    jvm.start_vm()
    # The search module is imported only after the JVM is up; the original
    # author notes that this import order must be kept.
    from pycausal import search as s

    if causal == 'bayes':
        graph = s.bayesEst(samples, depth=0, alpha=0.05, verbose=True)
    else:
        graph = s.tetradrunner()
        graph.getAlgorithmParameters(algoId='fges', scoreId='bdeu')
        graph.run(algoId='fges',
                  dfs=samples,
                  scoreId='bdeu',
                  priorKnowledge=None,
                  dataType='discrete',
                  structurePrior=0.5,
                  samplePrior=0.5,
                  maxDegree=5,
                  faithfulnessAssumed=True,
                  verbose=False)

    edges = graph.getEdges()
    print('edges', edges)

    key_attrs = []
    for edge in edges:
        # Edges are expected as "<node> <arrow> <node>" (e.g. "a --> class").
        # Comparing whole tokens avoids matching attributes that merely
        # contain the result name and copes with any arrow style.
        tokens = edge.split()
        if len(tokens) < 3:
            continue
        left, right = tokens[0], tokens[2]
        if left == result_attr:
            key_attrs.append(right)
        elif right == result_attr:
            key_attrs.append(left)

    # Drop the protected attribute(s); accepts one name or a collection.
    if isinstance(protect_attr, str):
        protected = {protect_attr}
    else:
        protected = set(protect_attr)
    key_attrs = [attr for attr in key_attrs if attr not in protected]

    print('key attributes', key_attrs)
    return key_attrs
예제 #2
0
def py_causal(
    sp_pro_deg_v, knowledge_path, bp=False,
    dataType="discrete", algoId="fges",
    scoreId="disc-bic-score", structurePrior=1.0, samplePrior=1.0, maxDegree=20,
    faithfulnessAssumed=True, symmetricFirstStep=True, verbose=True,
    numberResampling=100, percentResampleSize=90, resamplingEnsemble=1,
    addOriginalDataset=True, resamplingWithReplacement=True):
    """Run a Tetrad search through py-causal with prior knowledge read from a file.

    Args:
        sp_pro_deg_v: input data passed to the search as ``dfs``. It is
            deep-copied first, so the caller's object is left untouched.
            (The original notes describe a 2-D data frame indexed by sample
            ID with 0/1 values.)
        knowledge_path: path of the knowledge file loaded with
            ``prior.knowledgeFromFile``.
        bp: when equal to ``True`` the resampling (bootstrap) options below
            are forwarded to the search as well. Default is False.
        dataType, algoId, scoreId, structurePrior, samplePrior, maxDegree,
        faithfulnessAssumed, symmetricFirstStep, verbose: forwarded
            unchanged to ``tetradrunner.run``.
        numberResampling, percentResampleSize, resamplingEnsemble,
        addOriginalDataset, resamplingWithReplacement: forwarded to
            ``tetradrunner.run`` only when ``bp`` is enabled.

    Returns:
        tuple: ``(node_l, edge_l, bic)`` which are the results of
        ``getNodes()``, ``getEdges()`` and ``getTetradGraph()`` on the runner.
    """
    # Work on a private copy so the input is never modified.
    data = deepcopy(sp_pro_deg_v)

    # Bring up the JVM before touching the other pycausal modules.
    from pycausal.pycausal import pycausal as pycausal_cls
    jvm = pycausal_cls()
    jvm.start_vm(java_max_heap_size="1000M")

    # Load the prior knowledge.
    from pycausal import prior as prior_mod
    knowledge = prior_mod.knowledgeFromFile(knowledge_path)

    # Prepare the search runner.
    from pycausal import search as search_mod
    runner = search_mod.tetradrunner()

    # Options shared by the plain and the bootstrap run.
    run_options = dict(
        dfs=data,
        priorKnowledge=knowledge,
        dataType=dataType,
        algoId=algoId,
        scoreId=scoreId,
        structurePrior=structurePrior,
        samplePrior=samplePrior,
        maxDegree=maxDegree,
        faithfulnessAssumed=faithfulnessAssumed,
        symmetricFirstStep=symmetricFirstStep,
        verbose=verbose,
    )
    # Equality test kept as in the original: only values equal to True
    # switch the resampling options on.
    if bp == True:
        run_options.update(
            numberResampling=numberResampling,
            percentResampleSize=percentResampleSize,
            resamplingEnsemble=resamplingEnsemble,
            addOriginalDataset=addOriginalDataset,
            resamplingWithReplacement=resamplingWithReplacement,
        )
    runner.run(**run_options)

    nodes = runner.getNodes()
    edges = runner.getEdges()
    tetrad_graph = runner.getTetradGraph()
    return nodes, edges, tetrad_graph
예제 #3
0
 def init_causal_graph_dot_src(self, df, forbidden_edges, required_edges):
     """Learn a graph with FGES and cache its edges, nodes and DOT source on ``self``.

     Args:
         df: data passed to the search as ``dfs``; the run uses
             ``dataType='continuous'`` with the ``sem-bic`` score.
         forbidden_edges: passed to ``pr.knowledge`` as ``forbiddirect``.
         required_edges: passed to ``pr.knowledge`` as ``requiredirect``.

     Side effects:
         Sets ``self.edges``, ``self.nodes``, ``self.dot_src_lines``,
         ``self.dot_src``, ``self.init_dot_src`` and
         ``self.uncolored_dot_src``. The JVM that is started here is not
         stopped.
     """
     jvm = pc()
     jvm.start_vm()

     runner = s.tetradrunner()
     knowledge = pr.knowledge(
         forbiddirect=forbidden_edges,
         requiredirect=required_edges,
     )
     runner.run(
         algoId='fges',
         dfs=df,
         priorKnowledge=knowledge,
         scoreId='sem-bic',
         dataType='continuous',
         penaltyDiscount=2,
         maxDegree=-1,
         faithfulnessAssumed=True,
         verbose=True,
     )

     # Convert the learned graph to DOT text before reading edges and nodes.
     raw_dot = jvm.tetradGraphToDot(runner.getTetradGraph())
     self.edges = runner.getEdges()
     self.nodes = runner.getNodes()

     # Normalise the DOT text: trim it, split it into lines, then rebuild it.
     trimmed_dot = self.trim_init_src_string(raw_dot)
     self.dot_src_lines = self.dot_src_to_lines(trimmed_dot)
     self.dot_src = self.lines_to_dot_src(self.dot_src_lines)

     # The initial and the uncolored sources both start as the rebuilt text.
     self.init_dot_src = self.dot_src
     self.uncolored_dot_src = self.init_dot_src
예제 #4
0
파일: causal_model.py 프로젝트: rahlk/CADET
 def learn_fci(self, df, tabu_edges):
     """Learn a causal structure from ``df`` with FCI and return its edges.

     Args:
         df: data passed to the search as ``dfs``; the run uses the
             ``fisher-z-test`` independence test.
         tabu_edges: iterable of edges that must not appear in the result.
             Each item is converted to a list and passed to
             ``prior.knowledge`` as ``forbiddirect``.

     Returns:
         The edges reported by ``tetradrunner.getEdges()``.

     Note:
         The JVM is started at the beginning and stopped again once the
         edges have been read.
     """
     from pycausal.pycausal import pycausal as pc
     from pycausal import search as s
     from pycausal import prior as p
     pc = pc()
     pc.start_vm()
     forbid = [list(i) for i in tabu_edges]
     prior = p.knowledge(forbiddirect=forbid)
     tetrad = s.tetradrunner()
     tetrad.getAlgorithmParameters(algoId='fci', testId='fisher-z-test')
     # The forbidden-edge knowledge must be handed to the search explicitly;
     # without priorKnowledge the tabu edges would have no effect.
     tetrad.run(algoId='fci',
                dfs=df,
                testId='fisher-z-test',
                priorKnowledge=prior,
                depth=-1,
                maxPathLength=-1,
                completeRuleSetUsed=False,
                verbose=False)
     # Read the edges while the JVM is still running.
     edges = tetrad.getEdges()
     pc.stop_vm()
     return edges
#!/usr/local/bin/python

import os
import pandas as pd
import pydot
from IPython.display import SVG

# Example script: run FGES on the audiology data set and save the learned
# graph as an SVG file.

# Despite its name, data_dir is the path of the data file itself
# (<cwd>/data/audiology.txt), read as tab-separated text.
data_dir = os.path.join(os.getcwd(), 'data', 'audiology.txt')
df = pd.read_table(data_dir, sep="\t")

# Start the JVM first. Note that `pc` is rebound from the pycausal class to
# an instance of it.
from pycausal.pycausal import pycausal as pc
pc = pc()
pc.start_vm(java_max_heap_size='100M')

# The search module is imported only after the JVM has been started.
from pycausal import search as s
tetrad = s.tetradrunner()
# FGES with the BDeu score on discrete data.
tetrad.run(algoId='fges',
           dfs=df,
           scoreId='bdeu',
           dataType='discrete',
           structurePrior=1.0,
           samplePrior=1.0,
           maxDegree=3,
           faithfulnessAssumed=True,
           verbose=True)

# The return values are discarded here; these calls only show anything when
# run interactively (e.g. in a notebook).
tetrad.getNodes()
tetrad.getEdges()

# Render the learned graph to an SVG file in the working directory.
# NOTE(review): the JVM is never stopped in this script — confirm intended.
dot = tetrad.getDot()
dot.write_svg('fges-discrete.svg')
#!/usr/local/bin/python


import os
import pandas as pd
import pydot
from IPython.display import SVG

# Example script: run FGES on the audiology data set, convert the learned
# graph to DOT text and save it as an SVG file via pydot.

# Despite its name, data_dir is the path of the data file itself
# (<cwd>/data/audiology.txt), read as tab-separated text.
data_dir = os.path.join(os.getcwd(), 'data', 'audiology.txt')
df = pd.read_table(data_dir, sep="\t")

# Start the JVM first. Note that `pc` is rebound from the pycausal class to
# an instance of it.
from pycausal.pycausal import pycausal as pc
pc = pc()
pc.start_vm(java_max_heap_size = '100M')

# The search module is imported only after the JVM has been started.
from pycausal import search as s
tetrad = s.tetradrunner()
# FGES with the BDeu score on discrete data.
tetrad.run(algoId = 'fges', dfs = df, scoreId = 'bdeu', dataType = 'discrete',
           structurePrior = 1.0, samplePrior = 1.0, maxDegree = 3, faithfulnessAssumed = True, verbose = True)

# The return values are discarded here; these calls only show anything when
# run interactively (e.g. in a notebook).
tetrad.getNodes()
tetrad.getEdges()

# Convert the Tetrad graph to DOT text, parse it with pydot and write the
# first parsed graph as SVG.
dot_str = pc.tetradGraphToDot(tetrad.getTetradGraph())
graphs = pydot.graph_from_dot_data(dot_str)
graphs[0].write_svg('fges-discrete.svg')

# Shut the JVM down once the output has been written.
pc.stop_vm()
예제 #7
0
def fges_stem(file_path, sys_iter, SGA_l, A_D):
    """Alternate an FGES structure search with the external TDI step for ``sys_iter`` rounds.

    Round ``i`` works in ``<file_path>/Output/run<i>``:

    1. If ``completeMatrixn.csv`` is missing, ``completeMatrix.csv`` is read,
       FGES (``bdeu-score``) is run with prior knowledge, and ``Edge.csv``
       and ``completeMatrixn.csv`` are written. The textual form of the
       learned graph is appended to ``BIC.txt``.
    2. Every ``BIC: -...`` line of ``BIC.txt`` is parsed and collected.
    3. ``run<i+1>`` is created and ``./MCMC/inferSGAInNetwork_TDI.exe`` is
       invoked until ``completeMatrix.csv`` shows up there. The collected
       BIC values are then written to ``<file_path>/Output/BIC.csv``.

    Args:
        file_path (str): base directory containing ``Input`` and ``Output``.
        sys_iter (int): number of rounds to run.
        SGA_l: list of cause gene names; columns of the data matrix that are
            not in this list are treated as DEGs.
        A_D: object indexed with the DEG column list (``A_D[DEG_l]``) and
            passed on to ``create_knowledge`` — presumably a pandas
            DataFrame; verify against the caller.

    Returns:
        None. All results are written to files.
    """
    BIC_l = [float(0)]

    SGA = pd.DataFrame(SGA_l)
    SGA.columns = ['cause gene name']
    A_D_i = A_D

    for i in range(sys_iter):
        print(i)
        run_dir = file_path + '/Output/run%i' % i

        # --- Step 1: structure search, skipped if its output already exists.
        file_l = os.listdir(run_dir)
        while 'completeMatrixn.csv' not in file_l:
            df = pd.read_csv(run_dir + '/completeMatrix.csv',
                             header=0,
                             index_col=None)

            from pycausal.pycausal import pycausal as pc
            pc = pc()
            pc.start_vm(java_max_heap_size='6400M')

            from pycausal import prior as p

            # Build the prior knowledge from the DEG and SGA lists.
            DEG_l = [x for x in df.columns if x not in SGA_l]
            A_D_i = A_D_i[DEG_l]
            forbid = create_knowledge(SGA, SGA_l, A_D_i)
            temporal = [SGA_l, p.ForbiddenWithin(DEG_l)]
            prior = p.knowledge(forbiddirect=forbid, addtemporal=temporal)

            from pycausal import search as s
            tetrad = s.tetradrunner()
            tetrad.getAlgorithmParameters(algoId='fges', scoreId='bdeu-score')

            tetrad.run(
                algoId='fges',
                dfs=df,
                scoreId='bdeu-score',
                priorKnowledge=prior,
                dataType='discrete',
                structurePrior=1.0,
                samplePrior=1.0,
                maxDegree=100,
                faithfulnessAssumed=True,
                verbose=True,
                symmetricFirstStep=True
            )

            node_l = tetrad.getNodes()
            edge_l = tetrad.getEdges()

            # Save Edge.csv: each edge string is split on single spaces and
            # the first and third fields (the two endpoints) are kept. Every
            # reported edge is written, whatever its arrow type.
            edge_split_l = [edge.split(' ') for edge in edge_l]
            edge_df = pd.DataFrame(edge_split_l).iloc[:, [0, 2]]
            edge_df.to_csv(run_dir + '/Edge.csv',
                           index=False,
                           header=False)

            # Save completeMatrixn.csv: the data restricted to graph nodes.
            new_df = df.loc[:, node_l]
            new_df.to_csv(run_dir + '/completeMatrixn.csv',
                          index=False,
                          header=True)

            # Append the graph text to BIC.txt. The file is closed right
            # away so that the read below sees the complete contents.
            graph_text = tetrad.getTetradGraph()
            with open(run_dir + '/BIC.txt', 'a') as bic_out:
                print(graph_text, file=bic_out)

            file_l = os.listdir(run_dir)

        # --- Step 2: collect the BIC values used to verify convergence.
        with open(run_dir + '/BIC.txt', 'r') as BIC_txt:
            for line in BIC_txt:
                if 'BIC: -' in line:
                    # Take everything after the label; stripping whitespace
                    # also works when the last line has no trailing newline.
                    BIC_l.append(float(line.split('BIC:', 1)[1].strip()))

        # --- Step 3: produce the input of the next round.
        j = i + 1
        next_dir = file_path + '/Output/run%d' % j
        mk_dir(next_dir)
        next_file_l = os.listdir(next_dir)
        # NOTE: this retries every 20 seconds for as long as the executable
        # does not produce completeMatrix.csv; there is no retry limit.
        while 'completeMatrix.csv' not in next_file_l:
            exe_path = './MCMC/inferSGAInNetwork_TDI.exe'
            m_path = ' -m ' + run_dir + '/completeMatrixn.csv'
            i_path = ' -i ' + file_path + '/Input/S_A0.csv'
            e_path = ' -e ' + run_dir + '/Edge.csv'
            o_path = ' -o ' + next_dir + '/ -x 50'
            combine = exe_path + m_path + i_path + e_path + o_path
            os.system(combine)
            time.sleep(20)
            next_file_l = os.listdir(next_dir)

        pd.DataFrame(BIC_l).to_csv(file_path + '/Output/BIC.csv',
                                   index=False,
                                   header=False)