def construct_graph(args):
    """Build a GraphFrame from whitespace-delimited vertex and edge files.

    Args:
        args: Object exposing ``v_input`` and ``e_input``, the paths (or
            file-like objects) handed to ``pandas.read_csv`` for the vertex
            and edge tables respectively.

    Returns:
        A ``GF.GraphFrame`` whose vertices carry ``id`` and ``cluster``
        columns and whose edges carry ``src`` and ``dst`` columns.

    Note:
        Relies on the module-level ``sql_context`` to create the Spark
        DataFrames. The logged node count is taken before duplicate ids are
        dropped, so it can exceed the number of vertices in the graph.
    """
    # Load pre-processed datasets.
    # ``sep=r"\s+"`` is the documented replacement for the deprecated
    # ``delim_whitespace=True`` and splits on any run of whitespace.
    v = pd.read_csv(args.v_input, index_col=False, sep=r"\s+")
    e = pd.read_csv(args.e_input, index_col=False, sep=r"\s+")

    logging.info(
        "Setting up graph data with {} nodes, {} edges and {} clusters".format(
            v.shape[0], e.shape[0], len(v["cluster"].unique())))

    # Convert the pandas dataframes into Spark SQL DataFrames for graphframes
    v_schema = StructType([
        StructField("id", IntegerType(), True),
        StructField("cluster", IntegerType(), True)
    ])
    # Keep a single row per vertex id.
    v = sql_context.createDataFrame(v, schema=v_schema).dropDuplicates(['id'])

    e_schema = StructType([
        StructField("src", IntegerType(), True),
        StructField("dst", IntegerType(), True)
    ])
    e = sql_context.createDataFrame(e, schema=e_schema)

    # Generate graph before simulation starts
    return GF.GraphFrame(v, e)
Пример #2
0
# Step 8-6-2 : Creating the DataFrame of edges of the given graph.

# Each tuple is one directed edge, written as (source vertex, destination vertex).
edgeDataList = [('A', 'C'), ('A', 'B'), ('B', 'A'), ('B', 'C'), ('B', 'G'),
                ('B', 'F'), ('C', 'A'), ('C', 'B'), ('C', 'F'), ('C', 'D'),
                ('D', 'C'), ('D', 'F'), ('D', 'E'), ('E', 'D'), ('E', 'F'),
                ('F', 'B'), ('F', 'C'), ('F', 'D'), ('F', 'E'), ('F', 'G'),
                ('G', 'B'), ('G', 'F')]

# Distribute the edge list as an RDD (4 is passed as the partition count).
edgeRDD = sc.parallelize(edgeDataList, 4)
# The take() results are not assigned; they are only useful for inspection
# in an interactive shell.
edgeRDD.take(4)
# Wrap every (src, dst) tuple in a Row so a schema can be applied to it.
edgeRDDRows = edgeRDD.map(lambda data: Row(data[0], data[1]))
edgeRDDRows.take(4)
# The edge DataFrame has two nullable string columns, 'src' and 'dst'.
sourceColumn = StructField('src', StringType(), True)
destinationColumn = StructField('dst', StringType(), True)
edgeSchema = StructType([sourceColumn, destinationColumn])
edgeDataFrame = sqlContext.createDataFrame(edgeRDDRows, edgeSchema)

edgeDataFrame.show(5)

# Step 8-6-3 : Creating the GraphFrame object.

import graphframes.graphframe as gfm
# NOTE: verticesDataFrame is not defined in this snippet; it is expected to
# have been created earlier.
ourGraph = gfm.GraphFrame(verticesDataFrame, edgeDataFrame)
ourGraph.vertices.show(5)
ourGraph.edges.show(5)

# Step 8-6-3 : Running the Breadth First Search algorithm.
# Search for paths from the vertex with id 'D' to the vertex with id 'G'.
bfsPath = ourGraph.bfs(fromExpr="id='D'", toExpr="id='G'")
bfsPath.show()
Пример #3
0
def simulate(g, p_is, p_id, p_ih, p_ir, p_hr, p_hd, t_latent, t_infectious,
             num_i_seeds, num_time_steps):
    """Run a compartmental epidemic simulation over the graph ``g``.

    Vertices are tracked in six Python lists (S, E, I, R, H, D). At every
    time step the helper functions ``H_flow``, ``I_flow``, ``E_flow`` and
    ``S_flow`` are invoked, after which the vertex columns ``state``,
    ``i_days`` and ``e_days`` are recomputed from the lists and the graph is
    rebuilt.

    Args:
        g: GraphFrame whose vertices have an ``id`` column.
        p_is, p_id, p_ih, p_ir, p_hr, p_hd: Rates forwarded unchanged to the
            flow helpers (presumably transition probabilities between the
            compartments named by the suffix letters -- confirm against the
            helpers).
        t_latent: Forwarded to ``E_flow``.
        t_infectious: Forwarded to ``S_flow``.
        num_i_seeds: Number of vertices sampled at random as initially "I".
        num_time_steps: Maximum number of steps to simulate.

    Returns:
        Tuple ``(nodes_counter, duration)``: a pandas DataFrame with columns
        ``n_s, n_e, n_i, n_r, n_h, n_d`` indexed by time step, and the number
        of steps that were executed.

    Raises:
        ValueError: From ``random.sample`` when ``num_i_seeds`` exceeds the
            number of vertices.

    Note:
        This function never rebinds the compartment lists, so it relies on
        the flow helpers mutating them in place.
    """
    # Use the graph's own edges when rebuilding it, instead of depending on a
    # module-level variable that happens to be named ``e``.
    edges = g.edges

    # Select the vertex ids as a list of plain Python values.
    nodes = g.vertices.select("id").toPandas()["id"].tolist()

    i_nodes = random.sample(nodes, num_i_seeds)
    s_nodes = list(set(nodes) - set(i_nodes))
    h_nodes = []
    e_nodes = []
    r_nodes = []
    d_nodes = []
    duration = 0

    def _compartment_sizes():
        # Sizes in the same order as the nodes_counter columns below.
        return [
            len(s_nodes),
            len(e_nodes),
            len(i_nodes),
            len(r_nodes),
            len(h_nodes),
            len(d_nodes),
        ]

    # Rows 0 .. num_time_steps - 1 are pre-allocated; the row for the last
    # step is appended by ``.loc`` enlargement inside the loop.
    nodes_counter = pd.DataFrame(
        columns=["n_{}".format(x) for x in ['s', 'e', 'i', 'r', 'h', 'd']],
        index=list(range(num_time_steps)))
    nodes_counter.loc[0] = _compartment_sizes()

    # ADD "neighbors" column: out-neighbors of every node (used in S_flow).
    # The right outer join keeps vertices that have no outgoing edge.
    neighbor = g.find("(a)-[e]->(b)").drop("e").groupBy('a.id').agg(
        collect_list('b.id').alias('neighbors'))
    g_neighbor = neighbor.join(g.vertices, ['id'], "right_outer")
    g = GF.GraphFrame(g_neighbor, edges)

    # ADD "state" column.
    # At t0 the I nodes are the sampled seeds and h_nodes is still empty;
    # every other node is assumed to be S.
    vertices = g.vertices
    g_temp = vertices.withColumn(
        "state",
        when(vertices["id"].isin(i_nodes), "I")
        .when(vertices["id"].isin(h_nodes), "H")
        .otherwise("S"))

    # ADD "i_days" and "e_days" columns.
    # At t0: i_days is 1 for all I nodes and 0 otherwise; e_days is 0.
    g_temp = g_temp.withColumn("e_days", lit(0))
    g_temp = g_temp.withColumn(
        "i_days",
        when(g_temp["id"].isin(i_nodes), lit(1)).otherwise(lit(0)))
    g = GF.GraphFrame(g_temp, edges)

    # TO DO: allow p_is to be a vector of rates (time-dependent)
    for step in range(1, num_time_steps + 1):
        H_flow(h_nodes, r_nodes, d_nodes, p_hr, p_hd)
        I_flow(i_nodes, r_nodes, h_nodes, d_nodes, p_id, p_ih, p_ir)
        new_I_nodes = E_flow(g, e_nodes, i_nodes, t_latent)
        new_E_nodes = S_flow(g, s_nodes, e_nodes, i_nodes, r_nodes, h_nodes,
                             d_nodes, p_is, p_id, p_ih, p_ir, t_infectious)

        # Update the state column from the compartment lists; a vertex found
        # in none of S/E/I/R/H is labelled D.
        vertices = g.vertices
        g_temp = vertices.withColumn(
            "state",
            when(vertices["id"].isin(s_nodes), "S")
            .when(vertices["id"].isin(e_nodes), "E")
            .when(vertices["id"].isin(i_nodes), "I")
            .when(vertices["id"].isin(r_nodes), "R")
            .when(vertices["id"].isin(h_nodes), "H")
            .otherwise("D"))

        # Update i_days: 1 for nodes that just became I, previous value + 1
        # for nodes that were already I, 0 for everything else.
        old_I_nodes = list(set(i_nodes) - set(new_I_nodes))
        g_temp = g_temp.withColumn(
            "i_days",
            when(g_temp["id"].isin(new_I_nodes), 1)
            .when(g_temp["id"].isin(old_I_nodes), g_temp["i_days"] + 1)
            .otherwise(0))

        # Same rule for e_days with the exposed compartment.
        old_E_nodes = list(set(e_nodes) - set(new_E_nodes))
        g_temp = g_temp.withColumn(
            "e_days",
            when(g_temp["id"].isin(new_E_nodes), 1)
            .when(g_temp["id"].isin(old_E_nodes), g_temp["e_days"] + 1)
            .otherwise(0))

        # Finished updating the vertices --> rebuild the graph.
        g = GF.GraphFrame(g_temp, edges)

        duration += 1

        # NOTE: on early termination the counts for this step are not
        # written to nodes_counter, although duration already includes it.
        if not i_nodes:
            print("TERMINATED: No more infectious nodes left to update")
            break

        nodes_counter.loc[step] = _compartment_sizes()

    return nodes_counter, duration
Пример #4
0
    StructField("sex", IntegerType(), True),
    StructField("cluster", IntegerType(), True)
])
# NOTE: de-duplicating the vertices on 'id' is deliberately disabled. Per the
# original author, the line below leads to a random number of nodes in the
# graph:
#v = sql_context.createDataFrame(v, schema = v_schema).dropDuplicates(['id'])

# Convert the vertex table into a Spark DataFrame using v_schema.
v = sql_context.createDataFrame(v, schema=v_schema)

# Edges are (src, dst) pairs of nullable integer vertex ids.
e_schema = StructType([
    StructField("src", IntegerType(), True),
    StructField("dst", IntegerType(), True)
])
e = sql_context.createDataFrame(e, schema=e_schema)

# Generate graph before simulation starts
g = GF.GraphFrame(v, e)

# Define parameters
# NOTE(review): presumably transition probabilities between the compartments
# named by the suffix letters, to be passed to the simulation -- the call
# site is not shown here; confirm.
p_is = 0.5
p_id = 0.0134
p_ih = 0.0678
p_ir = 0.3945
p_hr = 0.3945 / 2
p_hd = 0.0419

# NOTE(review): presumably durations expressed in time steps -- confirm.
t_latent = 5
t_infectious = 5

# Number of initially infectious seed nodes and number of simulated steps
# (as suggested by the names; confirm against the simulation entry point).
num_i_seeds = 20
num_time_steps = 3