def main():
    initialize_logger('./albertoEdgesenseLog')
    generated = datetime.now()
    
    users_resource, \
    nodes_resource, \
    comments_resource, \
    node_title_field, \
    timestep_size, \
    timestep_window, \
    timestep_count, \
    username, \
    password, \
    extraction_method, \
    admin_roles, \
    exclude_isolated, \
    dumpto, \
    create_datapackage, \
    datapackage_title, \
    license_type, \
    license_url, \
    destination_path = parse_options(sys.argv[1:])
    
    logging.info("Network processing - started")
    
    # Load the files
    allusers, allnodes, allcomments = load_files(users_resource, nodes_resource, comments_resource, username, password, extraction_method, dumpto, generated)
    
    # extract a normalized set of data
    nodes_map, posts_map, comments_map = eu.extract.normalized_data(allusers, allnodes, allcomments, node_title_field, admin_roles, exclude_isolated)

    # this is the network object
    # going forward it should be read from a serialized format to handle caching
    network = {}

    # Add some file metadata
    network['meta'] = {}
    # Timestamp of the file generation (to show in the dashboard)
    network['meta']['generated'] = int(generated.strftime("%s"))
        
    network['edges'] = extract_edges(nodes_map, comments_map)
    network['edges'] += extract_multiauthor_post_edges(nodes_map, posts_map)

    # filter out nodes that have not participated in the conversations
    inactive_nodes = [ v for v in nodes_map.values() if not v['active'] ]
    logging.info("inactive nodes: %(n)i" % {'n':len(inactive_nodes)})
    network['nodes'] = [ v for v in nodes_map.values() if v['active'] ]
    
    directed_multiedge_network = calculate_network_metrics(nodes_map, posts_map, comments_map, network, timestep_size, timestep_window, timestep_count)
    
    eu.resource.write_network(network, \
                     directed_multiedge_network, \
                     generated, \
                     create_datapackage, \
                     datapackage_title, \
                     license_type, \
                     license_url, \
                     destination_path)
    
    logging.info("Completed")  
Example #2
def main():
    initialize_logger('./log')
    
    generated = datetime.now()
    kind, source, outdir = parse_options(sys.argv[1:])
    logging.info("Parsing catalyst - Started")
    logging.info("Parsing catalyst - Source file: %(s)s" % {'s':source})
    logging.info("Parsing catalyst - Output directory: %(s)s" % {'s':outdir})
    logging.info("Parsing catalyst - Extraction Kind: %(s)s" % {'s':kind})
    
    # 1. load and parse the JSON file into an RDF graph
    
    graph = ec.inference.catalyst_graph_for(source)
    
    # 2. extract the users, nodes, comments from the graph
    if kind == 'simple':
        users,nodes,comments = ec.extract.simple.users_nodes_comments_from(graph)
    elif kind == 'excerpts':
        users,nodes,comments = ec.extract.excerpts.users_nodes_comments_from(graph)
    else:
        logging.info("Parsing catalyst - Extraction kind not supported")
        return
        
    # 3. sort the lists
    sorted_users = sorted(users, key=eu.sort_by('created'))
    sorted_nodes = sorted(nodes, key=eu.sort_by('created'))
    sorted_comments = sorted(comments, key=eu.sort_by('created'))
    
    # 4. save the files
    write_file(sorted_users, 'users.json', outdir)
    write_file(sorted_nodes, 'nodes.json', outdir)
    write_file(sorted_comments, 'comments.json', outdir)
    logging.info("Parsing catalyst - Completed")
Example #3
def main():
    initialize_logger('./log')
    
    generated = datetime.now()
    kind, source, destination_path, moderator, timestep_size, timestep_window, timestep_count, create_datapackage, license_type, license_url, datapackage_title = parse_options(sys.argv[1:])
    logging.info("Parsing catalyst - Started")
    logging.info("Parsing catalyst - Source file: %(s)s" % {'s':source})
    logging.info("Parsing catalyst - Output directory: %(s)s" % {'s':destination_path})
    logging.info("Parsing catalyst - Extraction Kind: %(s)s" % {'s':kind})
    
    # 1. load and parse the JSON file into an RDF graph
    
    graph = ec.inference.catalyst_graph_for(source)
    
    # 2. extract the users, nodes, comments from the graph
    use_posts = (kind == 'posts') or (kind == 'both')
    use_ideas = (kind == 'ideas') or (kind == 'both')
    assert use_ideas or use_posts, "kind must be ideas, posts or both"
    moderator_test = None
    if moderator:
        moderator_test = partial(ec.extract.is_moderator, graph, moderator_roles=(moderator,))
    network = ec.extract.ideas.graph_to_network(generated, graph, use_ideas, use_posts, moderator_test)
    
    directed_multiedge_network = calculate_network_metrics({}, {}, {}, network, timestep_size, timestep_window, timestep_count)
    
    eu.resource.write_network(network, \
                     directed_multiedge_network, \
                     generated, \
                     create_datapackage, \
                     datapackage_title, \
                     license_type, \
                     license_url, \
                     destination_path)

    logging.info("Parsing catalyst - Completed")
Example #4
def main(argv):
    initialize_logger('./log')

    users_resource, nodes_resource, comments_resource = parse_options(argv)
    
    logging.info("Network processing - started")  
    # load users
    jusers = eu.resource.load(users_resource)
    allusers = jusers['users']

    # load nodes
    jnodes = eu.resource.load(nodes_resource)
    allnodes = jnodes['nodes']

    # load comments
    jcomments = eu.resource.load(comments_resource)
    allcomments = jcomments['comments']

    logging.info("file loaded")  
    
    generated = datetime.now()
    
    network = build(allusers, allnodes, allcomments, generated)
    
    write_network(network, generated)
    
    logging.info("Completed")  
Example #5
def parse():
    node_title_field = 'uid'
    timestep_size = 60*60*24*7
    timestep_window = 1
    timestep_count = 20
    username = None
    password = None
    extraction_method = 'nested'
    admin_roles = set()
    exclude_isolated = False
    create_datapackage = False 
    license_type = None
    license_url = None
    datapackage_title = None
    kind = 'both'
    moderator = None
    generated = datetime.now()
    
    source_json = request.form['source'] if request.form.has_key('source') else None
    if not source_json:
        raise InvalidUsage('Missing parameters', status_code=400)
    
    initialize_logger('./log')
    
    logging.info("parse_source - Started")
    logging.info("parse_source - Source: %(s)s" % {'s':source_json})
    logging.info("parse_source - Extraction Kind: %(s)s" % {'s':kind})
    
    # 1. load and parse the JSON file into an RDF graph
    graph = ec.inference.catalyst_graph_for(source_json)
    
    # 2. extract the users, nodes, comments from the graph
    use_posts = (kind == 'posts') or (kind == 'both')
    use_ideas = (kind == 'ideas') or (kind == 'both')
    assert use_ideas or use_posts, "kind must be ideas, posts or both"
    moderator_test = None
    if moderator:
        moderator_test = partial(ec.extract.is_moderator, graph, moderator_roles=(moderator,))
    network = ec.extract.ideas.graph_to_network(generated, graph, use_ideas, use_posts, moderator_test)
    
    directed_multiedge_network = calculate_network_metrics({}, {}, {}, network, timestep_size, timestep_window, timestep_count)
    
    eu.resource.write_network(network, \
                     directed_multiedge_network, \
                     generated, \
                     create_datapackage, \
                     datapackage_title, \
                     license_type, \
                     license_url, \
                     destination_path)

    # return the result URL
    tag = generated.strftime('%Y-%m-%d-%H-%M-%S')
    base_path = os.path.join("/json/data", tag)
    result_path = os.path.join(base_path, "network.min.json")
    
    logging.info("Completed: %(s)s" % {'s':result_path})
    return jsonify({'last': tag, 'base_path': base_path, 'metrics': 'network.min.json', 'gexf': 'network.gexf', 'datapackage': 'datapackage.json' })
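
The parse view above reads a 'source' form field and answers with the locations of the generated files. A hedged client-side sketch (the /parse route, host and port are assumptions, not taken from the code) might be:

import requests

# hypothetical endpoint URL; adjust route, host and port to the actual deployment
response = requests.post('http://localhost:5000/parse',
                         data={'source': 'http://example.org/catalyst.json'})
result = response.json()
# e.g. build the URL of the metrics file the server reports back
metrics_url = result['base_path'] + '/' + result['metrics']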
Example #6
def main():
    initialize_logger("./log")
    generated = datetime.now()

    users_resource, nodes_resource, comments_resource, node_title_field, timestep_size, timestep_window, timestep_count, username, password, extraction_method, admin_roles, exclude_isolated, dumpto, create_datapackage, datapackage_title, license_type, license_url, destination_path = parse_options(
        sys.argv[1:]
    )

    logging.info("Network processing - started")

    # Load the files
    allusers, allnodes, allcomments = load_files(
        users_resource, nodes_resource, comments_resource, username, password, extraction_method, dumpto, generated
    )

    # extract a normalized set of data
    nodes_map, posts_map, comments_map = eu.extract.normalized_data(
        allusers, allnodes, allcomments, node_title_field, admin_roles, exclude_isolated
    )

    # this is the network object
    # going forward it should be read from a serialized format to handle caching
    network = {}

    # Add some file metadata
    network["meta"] = {}
    # Timestamp of the file generation (to show in the dashboard)
    network["meta"]["generated"] = int(time.mktime(generated.timetuple()))  # Windows-compatible

    network["edges"] = extract_edges(nodes_map, comments_map)
    network["edges"] += extract_multiauthor_post_edges(nodes_map, posts_map)

    # filter out nodes that have not participated in the conversations
    inactive_nodes = [v for v in nodes_map.values() if not v["active"]]
    logging.info("inactive nodes: %(n)i" % {"n": len(inactive_nodes)})
    network["nodes"] = [v for v in nodes_map.values() if v["active"]]

    directed_multiedge_network = calculate_network_metrics(
        nodes_map, posts_map, comments_map, network, timestep_size, timestep_window, timestep_count
    )

    eu.resource.write_network(
        network,
        directed_multiedge_network,
        generated,
        create_datapackage,
        datapackage_title,
        license_type,
        license_url,
        destination_path,
    )

    logging.info("Completed")
def main():
    initialize_logger('./log')

    generated = datetime.now()
    sources, outdir, moderators, charset, force_name_as_uid, debug = parse_options(
        sys.argv[1:])
    logging.info("Parsing mailinglist - Started")
    logging.info("Parsing mailinglist - Source files: %(s)s" %
                 {'s': repr(sources)})
    logging.info("Parsing mailinglist - Output directory: %(s)s" %
                 {'s': outdir})

    # 1. load and parse each file in a list of messages
    logging.info("Parsing mailinglist - Reading the files")
    messages = []
    for file in sources:
        mbox = mailbox.mbox(file)
        for msg in mbox:
            messages.append(emt.Message(msg))

    # 2. build the threaded containers
    logging.info("Parsing mailinglist - Threading the messages")
    subject_table = emt.thread(messages)
    root_containers = [ctr for (subj, ctr) in subject_table.items()]
    containers = emp.promote_none_root_set_children(root_containers)

    if force_name_as_uid:
        emp.force_name_as_address(containers)

    # Debug
    if debug:
        print('==== Message threads ====')
        for container in containers:
            emp.print_container(container)
        print('=========================')

    # 3. extract the users nodes comments and sort them
    logging.info("Parsing mailinglist - Extracting the data")
    users, nodes, comments = emp.users_nodes_comments_from(
        containers, moderators, charset)
    sorted_users = sorted(users, key=eu.sort_by('created'))
    sorted_nodes = sorted(nodes, key=eu.sort_by('created'))
    sorted_comments = sorted(comments, key=eu.sort_by('created'))

    # 4. save the files
    logging.info("Parsing mailinglist - Saving the files")
    write_file(sorted_users, 'users.json', outdir)
    write_file(sorted_nodes, 'nodes.json', outdir)
    write_file(sorted_comments, 'comments.json', outdir)

    logging.info("Parsing mailinglist - Completed")
Example #8
def main():
    initialize_logger('./log')

    generated = datetime.now()
    source_csv, source_dir, outdir, dumpto = parse_options(sys.argv[1:])
    logging.info("Parsing tweets - Started")
    logging.info("Parsing tweets - Output directory: %(s)s" % {'s': outdir})

    # 1. load and parse the CSV file into a list of records
    if dumpto:
        tag = datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + '.csv'
        dump_to = os.path.join(dumpto, tag)
    else:
        dump_to = None

    tweets = []
    if source_csv:
        tweets += et.parse.load_and_parse_csv(source_csv,
                                              sort_key='created_ts',
                                              dump_to=dump_to)

    if source_dir:
        tweets += et.parse.load_and_parse_from_dir(source_dir,
                                                   sort_key='created_ts',
                                                   dump_to=dump_to)

    # 2. extract the users from the tweets
    users = et.extract.extract_users(tweets)
    sorted_users = sorted(users, key=eu.sort_by('created'))
    # users = { 'users': [{'user': user_data} for user_data in users] }

    # 3. extract the nodes from the tweets
    nodes = et.extract.extract_nodes(tweets)
    sorted_nodes = sorted(nodes, key=eu.sort_by('created'))
    # nodes = { 'nodes': [{'node': node_data} for node_data in nodes] }

    # 4. extract the comments from the tweets
    comments = et.extract.extract_comments(tweets)
    sorted_comments = sorted(comments, key=eu.sort_by('created'))
    # comments = { 'comments': [{'comment': comment_data} for comment_data in comments] }

    # 5. save the files
    write_file(tweets, 'tweets.json', outdir)
    write_file(sorted_users, 'users.json', outdir)
    write_file(sorted_nodes, 'nodes.json', outdir)
    write_file(sorted_comments, 'comments.json', outdir)
    logging.info("Parsing tweets - Completed")
Example #9
def main():
    initialize_logger("./log")

    generated = datetime.now()
    sources, outdir, moderators, charset, force_name_as_uid, debug = parse_options(sys.argv[1:])
    logging.info("Parsing mailinglist - Started")
    logging.info("Parsing mailinglist - Source files: %(s)s" % {"s": repr(sources)})
    logging.info("Parsing mailinglist - Output directory: %(s)s" % {"s": outdir})

    # 1. load and parse each file in a list of messages
    logging.info("Parsing mailinglist - Reading the files")
    messages = []
    for file in sources:
        mbox = mailbox.mbox(file)
        for msg in mbox:
            messages.append(emt.Message(msg))

    # 2. build the threaded containers
    logging.info("Parsing mailinglist - Threading the messages")
    subject_table = emt.thread(messages)
    root_containers = [ctr for (subj, ctr) in subject_table.items()]
    containers = emp.promote_none_root_set_children(root_containers)

    if force_name_as_uid:
        emp.force_name_as_address(containers)

    # Debug
    if debug:
        print("==== Message threads ====")
        for container in containers:
            emp.print_container(container)
        print("=========================")

    # 3. extract the users nodes comments and sort them
    logging.info("Parsing mailinglist - Extracting the data")
    users, nodes, comments = emp.users_nodes_comments_from(containers, moderators, charset)
    sorted_users = sorted(users, key=eu.sort_by("created"))
    sorted_nodes = sorted(nodes, key=eu.sort_by("created"))
    sorted_comments = sorted(comments, key=eu.sort_by("created"))

    # 4. save the files
    logging.info("Parsing mailinglist - Saving the files")
    write_file(sorted_users, "users.json", outdir)
    write_file(sorted_nodes, "nodes.json", outdir)
    write_file(sorted_comments, "comments.json", outdir)

    logging.info("Parsing mailinglist - Completed")
Example #10
def main():
    initialize_logger('./log')
    
    generated = datetime.now()
    source_csv, source_dir, outdir, dumpto = parse_options(sys.argv[1:])
    logging.info("Parsing tweets - Started")
    logging.info("Parsing tweets - Output directory: %(s)s" % {'s':outdir})
    
    # 1. load and parse the CSV file into a list of records
    if dumpto:
        tag = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')+'.csv'
        dump_to = os.path.join(dumpto, tag)
    else:
        dump_to = None
    
    tweets = []
    if source_csv:
        tweets += et.parse.load_and_parse_csv(source_csv, sort_key='created_ts', dump_to=dump_to)

    if source_dir:
        tweets += et.parse.load_and_parse_from_dir(source_dir, sort_key='created_ts', dump_to=dump_to)
        
    # 2. extract the users from the tweets
    users = et.extract.extract_users(tweets)
    sorted_users = sorted(users, key=eu.sort_by('created'))
    # users = { 'users': [{'user': user_data} for user_data in users] }

    # 3. extract the nodes from the tweets
    nodes = et.extract.extract_nodes(tweets)
    sorted_nodes = sorted(nodes, key=eu.sort_by('created'))
    # nodes = { 'nodes': [{'node': node_data} for node_data in nodes] }
    
    # 4. extract the comments from the tweets
    comments = et.extract.extract_comments(tweets)
    sorted_comments = sorted(comments, key=eu.sort_by('created'))
    # comments = { 'comments': [{'comment': comment_data} for comment_data in comments] }
    
    # 5. save the files
    write_file(tweets, 'tweets.json', outdir)
    write_file(sorted_users, 'users.json', outdir)
    write_file(sorted_nodes, 'nodes.json', outdir)
    write_file(sorted_comments, 'comments.json', outdir)
    logging.info("Parsing tweets - Completed")
Example #11
def main():
    initialize_logger('./log')

    generated = datetime.now()
    kind, source, destination_path, moderator, timestep_size, timestep_window, timestep_count, create_datapackage, license_type, license_url, datapackage_title = parse_options(
        sys.argv[1:])
    logging.info("Parsing catalyst - Started")
    logging.info("Parsing catalyst - Source file: %(s)s" % {'s': source})
    logging.info("Parsing catalyst - Output directory: %(s)s" %
                 {'s': destination_path})
    logging.info("Parsing catalyst - Extraction Kind: %(s)s" % {'s': kind})

    # 1. load and parse the JSON file into an RDF graph

    graph = ec.inference.catalyst_graph_for(source)

    # 2. extract the users, nodes, comments from the graph
    use_posts = (kind == 'posts') or (kind == 'both')
    use_ideas = (kind == 'ideas') or (kind == 'both')
    assert use_ideas or use_posts, "kind must be ideas, posts or both"
    moderator_test = None
    if moderator:
        moderator_test = partial(ec.extract.is_moderator,
                                 graph,
                                 moderator_roles=(moderator, ))
    network = ec.extract.ideas.graph_to_network(generated, graph, use_ideas,
                                                use_posts, moderator_test)

    directed_multiedge_network = calculate_network_metrics({}, {}, {}, network,
                                                           timestep_size,
                                                           timestep_window,
                                                           timestep_count)

    eu.resource.write_network(network, \
                     directed_multiedge_network, \
                     generated, \
                     create_datapackage, \
                     datapackage_title, \
                     license_type, \
                     license_url, \
                     destination_path)

    logging.info("Parsing catalyst - Completed")
Example #12
def main(argv):
    initialize_logger('./log')

    source_path, output_filename = parse_options(argv)
    
    logging.info("Tutorial result processing - started")
    
    all_files = [ f for f in os.listdir(source_path) if os.path.isfile(os.path.join(source_path,f)) ]
    
    runs = {}
    timestamp = datetime.now()
    base_run_id = timestamp.strftime('%Y-%m-%d-%H-%M-%S')
    fake_run_id = 1
    for filename in all_files:
        logging.info("Tutorial result processing - loading:"+os.path.join(source_path,filename))
        f = open(os.path.join(source_path,filename), 'r')
        try:
            parsed = json.load(f)
            if parsed.has_key('run_id'):
                run_id = parsed['run_id']
            else:
                run_id = base_run_id+'--'+str(fake_run_id)
                fake_run_id += 1
            
            if not runs.has_key(run_id):
                runs[run_id] = {}
        
            run_obj = runs[run_id]
            run_obj['run_id'] = run_id
            if parsed.has_key('base'):
                run_obj['base'] = parsed['base']
                m = re.search(r'(\d\d\d\d)-(\d\d)-(\d\d)-\d\d-\d\d-\d\d$', parsed['base'])
                if m:
                    run_obj['date'] = m.group(1)+"-"+m.group(2)+"-"+m.group(3)
            if parsed.has_key('comments'):
                run_obj['comments'] = parsed['comments'].encode('utf-8').strip()
            # collect the tutorial answer results
            if parsed.has_key('answers'):
                for a in parsed['answers']:
                    run_obj[a['step']] = a['success']
            # collect the tutorial survey results
            if parsed.has_key('surveys'):
                for a in parsed['surveys']:
                    run_obj[a['step']] = a['value']
            
                
        except:
            logging.info("Tutorial result processing - error parsing:"+os.path.join(source_path,filename))
        
    
    # save the runs to a CSV file
    logging.info("Tutorial result processing - Writing:"+outuput_filename) 
    headers = [ 'run_id','base', 'date', \
                'betweenness_bin', 'relationship_percentage', \
                'posts_percentage', 'comments_share', \
                'modularity_increase', 'survey-1', \
                'survey-2', 'survey-3', 'survey-4', \
                'survey-5', 'comments']
    with open(output_filename, 'wb') as f:
        w = csv.DictWriter(f, headers)
        w.writeheader()
        w.writerows(runs.values())
        
    logging.info("Tutorial result processing - Completed")  
Example #13
def parse_options(argv):
    import getopt
    users_resource = 'users.json'
    nodes_resource = 'nodes.json'
    comments_resource = 'comments.json'
    node_title_field = 'uid'
    timestep_size = 60 * 60 * 24 * 7
    timestep_window = 1
    timestep_count = None
    username = None
    password = None
    extraction_method = 'nested'
    admin_roles = set()
    exclude_isolated = False
    dumpto = None
    basepath = os.path.dirname(__file__)
    destination_path = os.path.abspath(
        os.path.join(basepath, "..", "static", "json"))
    log_path = './log'
    create_datapackage = False
    datapackage_title = None
    license_type = None
    license_url = None
    site_url = None

    data = {}
    try:
        with open(argv[0], 'r') as datafile:
            data = json.load(datafile)
    except:
        print 'Error reading the parameters file'
        sys.exit(2)

    if not (data):
        print 'edgesense_drupal <path to the parameters file>'
        sys.exit()

    if data.has_key('users') and data['users']:
        users_resource = data['users']

    if data.has_key('nodes') and data['nodes']:
        nodes_resource = data['nodes']

    if data.has_key('comments') and data['comments']:
        comments_resource = data['comments']

    if data.has_key('node_title') and data['node_title']:
        node_title_field = data['node_title']

    if data.has_key('timestep_size') and data['timestep_size']:
        timestep_size = int(data['timestep_size'])

    if data.has_key('count_window') and data['count_window']:
        timestep_window = int(data['count_window'])

    if data.has_key('timestep_count') and data['timestep_count']:
        timestep_count = int(data['timestep_count'])

    if data.has_key('auth'):
        try:
            username = data['auth']['username']
        except:
            username = None
        try:
            password = data['auth']['password']
        except:
            password = None

    if data.has_key('extraction_method') and data['extraction_method']:
        extraction_method = data['extraction_method']

    if data.has_key('moderator_roles') and data['moderator_roles']:
        admin_roles = set([
            e.strip() for e in data['moderator_roles'].split(",") if e.strip()
        ])

    if data.has_key('exclude_isolated') and data['exclude_isolated']:
        exclude_isolated = True

    if data.has_key('dumpto') and data['dumpto']:
        dumpto = data['dumpto']

    if data.has_key('destination_path') and data['destination_path']:
        destination_path = data['destination_path']

    if data.has_key('log_path') and data['log_path']:
        log_path = os.path.join(data['log_path'])

    if data.has_key('datapackage'):
        try:
            license_type = data['datapackage']['license_type']
            license_url = data['datapackage']['license_url']
            if data['datapackage'].has_key('title'):
                datapackage_title = data['datapackage']['title']
            site_url = data['datapackage']['site_url']
            create_datapackage = True
        except:
            license_type = None
            license_url = None
            site_url = None
            create_datapackage = True

    # set up logging to file (edgesense.log in the same dir as the parameters file)
    initialize_logger(log_path,
                      file_level=logging.DEBUG,
                      console_level=logging.DEBUG,
                      file_mode='w')

    logging.info("parsing files %(u)s %(n)s %(c)s" % {
        'u': users_resource,
        'n': nodes_resource,
        'c': comments_resource
    })
    return (users_resource, nodes_resource, comments_resource,
            node_title_field, timestep_size, timestep_window, timestep_count,
            username, password, extraction_method, admin_roles,
            exclude_isolated, dumpto, destination_path, create_datapackage,
            datapackage_title, license_type, license_url, site_url)
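
parse_options reads all of its settings from a single JSON parameters file passed as the first command-line argument. A hedged example of such a file, limited to the keys the function actually looks up (every value below is illustrative, not taken from a real deployment), written out from Python:

import json

params = {
    "users": "users.json",
    "nodes": "nodes.json",
    "comments": "comments.json",
    "node_title": "uid",
    "timestep_size": 604800,            # one week, in seconds
    "count_window": 1,
    "timestep_count": 20,
    "auth": {"username": "admin", "password": "secret"},
    "extraction_method": "nested",
    "moderator_roles": "administrator, moderator",
    "exclude_isolated": True,
    "dumpto": "./dumps",
    "destination_path": "./static/json",
    "log_path": "./log",
    "datapackage": {
        "title": "Community conversation network",
        "license_type": "CC-BY",
        "license_url": "https://creativecommons.org/licenses/by/4.0/",
        "site_url": "http://example.org"
    }
}

with open('params.json', 'w') as f:
    json.dump(params, f, indent=2)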
Example #14
def main(debug=False):
    initialize_logger('./log')
    app.run(debug=debug, host=(None if debug else '0.0.0.0'))
Example #15
def main(argv):
    initialize_logger('./log')

    source_path, output_filename = parse_options(argv)

    logging.info("Tutorial result processing - started")

    all_files = [
        f for f in os.listdir(source_path)
        if os.path.isfile(os.path.join(source_path, f))
    ]

    runs = {}
    timestamp = datetime.now()
    base_run_id = timestamp.strftime('%Y-%m-%d-%H-%M-%S')
    fake_run_id = 1
    for filename in all_files:
        logging.info("Tutorial result processing - loading:" +
                     os.path.join(source_path, filename))
        f = open(os.path.join(source_path, filename), 'r')
        try:
            parsed = json.load(f)
            if parsed.has_key('run_id'):
                run_id = parsed['run_id']
            else:
                run_id = base_run_id + '--' + str(fake_run_id)
                fake_run_id += 1

            if not runs.has_key(run_id):
                runs[run_id] = {}

            run_obj = runs[run_id]
            run_obj['run_id'] = run_id
            if parsed.has_key('base'):
                run_obj['base'] = parsed['base']
                m = re.search(r'(\d\d\d\d)-(\d\d)-(\d\d)-\d\d-\d\d-\d\d$',
                              parsed['base'])
                if m:
                    run_obj['date'] = m.group(1) + "-" + m.group(
                        2) + "-" + m.group(3)
            if parsed.has_key('comments'):
                run_obj['comments'] = parsed['comments'].encode(
                    'utf-8').strip()
            # collect the tutorial answer results
            if parsed.has_key('answers'):
                for a in parsed['answers']:
                    run_obj[a['step']] = a['success']
            # collect the tutorial survey results
            if parsed.has_key('surveys'):
                for a in parsed['surveys']:
                    run_obj[a['step']] = a['value']

        except:
            logging.info("Tutorial result processing - error parsing:" +
                         os.path.join(source_path, filename))

    # save the runs to a CSV file
    logging.info("Tutorial result processing - Writing:" + outuput_filename)
    headers = [ 'run_id','base', 'date', \
                'betweenness_bin', 'relationship_percentage', \
                'posts_percentage', 'comments_share', \
                'modularity_increase', 'survey-1', \
                'survey-2', 'survey-3', 'survey-4', \
                'survey-5', 'comments']
    with open(output_filename, 'wb') as f:
        w = csv.DictWriter(f, headers)
        w.writeheader()
        w.writerows(runs.values())

    logging.info("Tutorial result processing - Completed")
Example #16
def parse_options(argv):
    import getopt
    users_resource = 'users.json'
    nodes_resource = 'nodes.json'
    comments_resource = 'comments.json'
    node_title_field = 'uid'
    timestep_size = 60*60*24*7
    timestep_window = 1
    timestep_count = None
    username = None
    password = None
    extraction_method = 'nested'
    admin_roles = set()
    exclude_isolated = False
    dumpto = None
    basepath = os.path.dirname(__file__)
    destination_path = os.path.abspath(os.path.join(basepath, "..", "static", "json"))
    log_path = './log'
    create_datapackage = False
    datapackage_title = None
    license_type = None
    license_url = None
    site_url = None
    
    data = {}            
    try:
        with open(argv[0], 'r') as datafile:
            data = json.load(datafile)
    except:
        print 'Error reading the parameters file'
        sys.exit(2)
    
    if not data:
        print 'edgesense_drupal <path to the parameters file>'
        sys.exit()

    if data.has_key('users') and data['users']:
        users_resource = data['users']

    if data.has_key('nodes') and data['nodes']:
        nodes_resource = data['nodes']

    if data.has_key('comments') and data['comments']:
        comments_resource = data['comments']

    if data.has_key('node_title') and data['node_title']:
        node_title_field = data['node_title']

    if data.has_key('timestep_size') and data['timestep_size']:
        timestep_size = int(data['timestep_size'])

    if data.has_key('count_window') and data['count_window']:
        timestep_window = int(data['count_window'])

    if data.has_key('timestep_count') and data['timestep_count']:
        timestep_count = int(data['timestep_count'])

    if data.has_key('auth'):
        try:
            username = data['auth']['username']
        except:
            username = None
        try:
            password = data['auth']['password']
        except:
            password = None

    if data.has_key('extraction_method') and data['extraction_method']:
        extraction_method = data['extraction_method']

    if data.has_key('moderator_roles') and data['moderator_roles']:
        admin_roles = set([e.strip() for e in data['moderator_roles'].split(",") if e.strip()])

    if data.has_key('exclude_isolated') and data['exclude_isolated']:
        exclude_isolated = True

    if data.has_key('dumpto') and data['dumpto']:
        dumpto = data['dumpto']

    if data.has_key('destination_path') and data['destination_path']:
        destination_path = data['destination_path']

    if data.has_key('log_path') and data['log_path']:
        log_path = os.path.join(data['log_path'])

    if data.has_key('datapackage'):
        try:
            license_type = data['datapackage']['license_type']
            license_url = data['datapackage']['license_url']
            if data['datapackage'].has_key('title'):
                datapackage_title = data['datapackage']['title']
            site_url = data['datapackage']['site_url']
            create_datapackage = True
        except:
            license_type = None
            license_url = None
            site_url = None
            create_datapackage = True

    # set up logging to file (edgesense.log in the same dir as the parameters file)
    initialize_logger(log_path, file_level=logging.DEBUG, console_level=logging.DEBUG, file_mode='w')

    logging.info("parsing files %(u)s %(n)s %(c)s" % {'u': users_resource, 'n': nodes_resource, 'c': comments_resource})       
    return (users_resource, 
            nodes_resource,
            comments_resource,
            node_title_field,
            timestep_size,
            timestep_window,
            timestep_count,
            username, password,
            extraction_method,
            admin_roles,
            exclude_isolated,
            dumpto,
            destination_path,
            create_datapackage,
            datapackage_title,
            license_type,
            license_url,
            site_url)
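Example #17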
def main():
    initialize_logger('./albertoEdgesenseLog')
    generated = datetime.now()

    users_resource, \
    nodes_resource, \
    comments_resource, \
    node_title_field, \
    timestep_size, \
    timestep_window, \
    timestep_count, \
    username, \
    password, \
    extraction_method, \
    admin_roles, \
    exclude_isolated, \
    dumpto, \
    create_datapackage, \
    datapackage_title, \
    license_type, \
    license_url, \
    destination_path = parse_options(sys.argv[1:])

    logging.info("Network processing - started")

    # Load the files
    allusers, allnodes, allcomments = load_files(users_resource,
                                                 nodes_resource,
                                                 comments_resource, username,
                                                 password, extraction_method,
                                                 dumpto, generated)

    # extract a normalized set of data
    nodes_map, posts_map, comments_map = eu.extract.normalized_data(
        allusers, allnodes, allcomments, node_title_field, admin_roles,
        exclude_isolated)

    # this is the network object
    # going forward it should be read from a serialized format to handle caching
    network = {}

    # Add some file metadata
    network['meta'] = {}
    # Timestamp of the file generation (to show in the dashboard)
    network['meta']['generated'] = int(generated.strftime("%s"))

    network['edges'] = extract_edges(nodes_map, comments_map)
    network['edges'] += extract_multiauthor_post_edges(nodes_map, posts_map)

    # filter out nodes that have not participated in the conversations
    inactive_nodes = [v for v in nodes_map.values() if not v['active']]
    logging.info("inactive nodes: %(n)i" % {'n': len(inactive_nodes)})
    network['nodes'] = [v for v in nodes_map.values() if v['active']]

    directed_multiedge_network = calculate_network_metrics(
        nodes_map, posts_map, comments_map, network, timestep_size,
        timestep_window, timestep_count)

    eu.resource.write_network(network, \
                     directed_multiedge_network, \
                     generated, \
                     create_datapackage, \
                     datapackage_title, \
                     license_type, \
                     license_url, \
                     destination_path)

    logging.info("Completed")
Example #18
def main(debug=False):
    initialize_logger('./log')
    app.run(debug=debug, host=(None if debug else '0.0.0.0'))