def main():
    initialize_logger('./log')

    generated = datetime.now()

    users_resource, nodes_resource, comments_resource, \
        node_title_field, \
        timestep_size, timestep_window, timestep_count, \
        username, password, \
        extraction_method, \
        admin_roles, \
        exclude_isolated, \
        dumpto, \
        create_datapackage, datapackage_title, \
        license_type, license_url, \
        destination_path = parse_options(sys.argv[1:])

    logging.info("Network processing - started")

    # Load the files
    allusers, allnodes, allcomments = load_files(users_resource, nodes_resource, comments_resource, username, password, extraction_method, dumpto, generated)

    # extract a normalized set of data
    nodes_map, posts_map, comments_map = eu.extract.normalized_data(allusers, allnodes, allcomments, node_title_field, admin_roles, exclude_isolated)

    # this is the network object
    # going forward it should be read from a serialized format to handle caching
    network = {}

    # Add some file metadata
    network['meta'] = {}
    # Timestamp of the file generation (to show in the dashboard);
    # time.mktime is used because strftime("%s") is not portable
    network['meta']['generated'] = int(time.mktime(generated.timetuple()))

    network['edges'] = extract_edges(nodes_map, comments_map)
    network['edges'] += extract_multiauthor_post_edges(nodes_map, posts_map)

    # filter out nodes that have not participated in the conversations
    inactive_nodes = [v for v in nodes_map.values() if not v['active']]
    logging.info("inactive nodes: %(n)i" % {'n': len(inactive_nodes)})
    network['nodes'] = [v for v in nodes_map.values() if v['active']]

    directed_multiedge_network = calculate_network_metrics(nodes_map, posts_map, comments_map, network, timestep_size, timestep_window, timestep_count)

    eu.resource.write_network(network,
                              directed_multiedge_network,
                              generated,
                              create_datapackage,
                              datapackage_title,
                              license_type,
                              license_url,
                              destination_path)

    logging.info("Completed")
def main():
    initialize_logger('./log')

    generated = datetime.now()

    kind, source, outdir = parse_options(sys.argv[1:])

    logging.info("Parsing catalyst - Started")
    logging.info("Parsing catalyst - Source file: %(s)s" % {'s': source})
    logging.info("Parsing catalyst - Output directory: %(s)s" % {'s': outdir})
    logging.info("Parsing catalyst - Extraction Kind: %(s)s" % {'s': kind})

    # 1. load and parse the JSON file into an RDF Graph
    graph = ec.inference.catalyst_graph_for(source)

    # 2. extract the users, nodes and comments from the graph
    if kind == 'simple':
        users, nodes, comments = ec.extract.simple.users_nodes_comments_from(graph)
    elif kind == 'excerpts':
        users, nodes, comments = ec.extract.excerpts.users_nodes_comments_from(graph)
    else:
        logging.info("Parsing catalyst - Extraction kind not supported")
        return

    # 3. sort the lists
    sorted_users = sorted(users, key=eu.sort_by('created'))
    sorted_nodes = sorted(nodes, key=eu.sort_by('created'))
    sorted_comments = sorted(comments, key=eu.sort_by('created'))

    # 4. save the files
    write_file(sorted_users, 'users.json', outdir)
    write_file(sorted_nodes, 'nodes.json', outdir)
    write_file(sorted_comments, 'comments.json', outdir)

    logging.info("Parsing catalyst - Completed")
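# A dispatch-table variant of the `kind` check in main() above (a sketch,
# assuming only the two extractor functions already named there); it makes
# adding a new extraction kind a one-line change.
EXTRACTORS = {
    'simple': ec.extract.simple.users_nodes_comments_from,
    'excerpts': ec.extract.excerpts.users_nodes_comments_from,
}

def extract_for(kind, graph):
    extractor = EXTRACTORS.get(kind)
    if extractor is None:
        raise ValueError("Extraction kind not supported: %s" % kind)
    return extractor(graph)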
def main():
    initialize_logger('./log')

    generated = datetime.now()

    kind, source, destination_path, moderator, \
        timestep_size, timestep_window, timestep_count, \
        create_datapackage, license_type, license_url, \
        datapackage_title = parse_options(sys.argv[1:])

    logging.info("Parsing catalyst - Started")
    logging.info("Parsing catalyst - Source file: %(s)s" % {'s': source})
    logging.info("Parsing catalyst - Output directory: %(s)s" % {'s': destination_path})
    logging.info("Parsing catalyst - Extraction Kind: %(s)s" % {'s': kind})

    # 1. load and parse the JSON file into an RDF Graph
    graph = ec.inference.catalyst_graph_for(source)

    # 2. extract the users, nodes and comments from the graph
    use_posts = (kind == 'posts') or (kind == 'both')
    use_ideas = (kind == 'ideas') or (kind == 'both')
    assert use_ideas or use_posts, "kind must be ideas, posts or both"
    moderator_test = None
    if moderator:
        moderator_test = partial(ec.extract.is_moderator, graph, moderator_roles=(moderator,))

    network = ec.extract.ideas.graph_to_network(generated, graph, use_ideas, use_posts, moderator_test)

    directed_multiedge_network = calculate_network_metrics({}, {}, {}, network, timestep_size, timestep_window, timestep_count)

    eu.resource.write_network(network,
                              directed_multiedge_network,
                              generated,
                              create_datapackage,
                              datapackage_title,
                              license_type,
                              license_url,
                              destination_path)

    logging.info("Parsing catalyst - Completed")
def main(argv):
    initialize_logger('./log')

    users_resource, nodes_resource, comments_resource = parse_options(argv)

    logging.info("Network processing - started")

    # load users
    jusers = eu.resource.load(users_resource)
    allusers = jusers['users']

    # load nodes
    jnodes = eu.resource.load(nodes_resource)
    allnodes = jnodes['nodes']

    # load comments
    jcomments = eu.resource.load(comments_resource)
    allcomments = jcomments['comments']

    logging.info("files loaded")

    generated = datetime.now()

    network = build(allusers, allnodes, allcomments, generated)

    write_network(network, generated)

    logging.info("Completed")
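# Entry-point sketch for the script above:
if __name__ == '__main__':
    main(sys.argv[1:])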
def parse():
    node_title_field = 'uid'
    timestep_size = 60 * 60 * 24 * 7
    timestep_window = 1
    timestep_count = 20
    username = None
    password = None
    extraction_method = 'nested'
    admin_roles = set()
    exclude_isolated = False
    create_datapackage = False
    license_type = None
    license_url = None
    datapackage_title = None
    kind = 'both'
    moderator = None

    generated = datetime.now()

    source_json = request.form.get('source')
    if not source_json:
        raise InvalidUsage('Missing parameters', status_code=400)

    initialize_logger('./log')

    logging.info("parse_source - Started")
    logging.info("parse_source - Source: %(s)s" % {'s': source_json})
    logging.info("parse_source - Extraction Kind: %(s)s" % {'s': kind})

    # 1. load and parse the JSON file into an RDF Graph
    graph = ec.inference.catalyst_graph_for(source_json)

    # 2. extract the users, nodes and comments from the graph
    use_posts = (kind == 'posts') or (kind == 'both')
    use_ideas = (kind == 'ideas') or (kind == 'both')
    assert use_ideas or use_posts, "kind must be ideas, posts or both"
    moderator_test = None
    if moderator:
        moderator_test = partial(ec.extract.is_moderator, graph, moderator_roles=(moderator,))

    network = ec.extract.ideas.graph_to_network(generated, graph, use_ideas, use_posts, moderator_test)

    directed_multiedge_network = calculate_network_metrics({}, {}, {}, network, timestep_size, timestep_window, timestep_count)

    # each run gets its own directory derived from the generation timestamp;
    # the original left destination_path undefined here, so it is assumed to
    # live under the static root that serves /json/data
    tag = generated.strftime('%Y-%m-%d-%H-%M-%S')
    base_path = os.path.join("/json/data", tag)
    destination_path = os.path.join('static', 'json', 'data', tag)

    eu.resource.write_network(network,
                              directed_multiedge_network,
                              generated,
                              create_datapackage,
                              datapackage_title,
                              license_type,
                              license_url,
                              destination_path)

    # return the result URL
    result_path = os.path.join(base_path, "network.min.json")
    logging.info("Completed: %(s)s" % {'s': result_path})
    return jsonify({
        'last': tag,
        'base_path': base_path,
        'metrics': 'network.min.json',
        'gexf': 'network.gexf',
        'datapackage': 'datapackage.json'
    })
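# Client-side sketch for the endpoint above (the route is not shown in this
# module; '/parse' and the port are assumptions):
import requests

resp = requests.post('http://localhost:5000/parse',
                     data={'source': 'http://example.org/catalyst.json'})
print(resp.json()['base_path'])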
def main(): initialize_logger("./log") generated = datetime.now() users_resource, nodes_resource, comments_resource, node_title_field, timestep_size, timestep_window, timestep_count, username, password, extraction_method, admin_roles, exclude_isolated, dumpto, create_datapackage, datapackage_title, license_type, license_url, destination_path = parse_options( sys.argv[1:] ) logging.info("Network processing - started") # Load the files allusers, allnodes, allcomments = load_files( users_resource, nodes_resource, comments_resource, username, password, extraction_method, dumpto, generated ) # extract a normalized set of data nodes_map, posts_map, comments_map = eu.extract.normalized_data( allusers, allnodes, allcomments, node_title_field, admin_roles, exclude_isolated ) # this is the network object # going forward it should be read from a serialized format to handle caching network = {} # Add some file metadata network["meta"] = {} # Timestamp of the file generation (to show in the dashboard) network["meta"]["generated"] = int(time.mktime(generated.timetuple())) # Windows-compatible network["edges"] = extract_edges(nodes_map, comments_map) network["edges"] += extract_multiauthor_post_edges(nodes_map, posts_map) # filter out nodes that have not participated to the full:conversations inactive_nodes = [v for v in nodes_map.values() if not v["active"]] logging.info("inactive nodes: %(n)i" % {"n": len(inactive_nodes)}) network["nodes"] = [v for v in nodes_map.values() if v["active"]] directed_multiedge_network = calculate_network_metrics( nodes_map, posts_map, comments_map, network, timestep_size, timestep_window, timestep_count ) eu.resource.write_network( network, directed_multiedge_network, generated, create_datapackage, datapackage_title, license_type, license_url, destination_path, ) logging.info("Completed")
def main():
    initialize_logger('./log')

    generated = datetime.now()

    sources, outdir, moderators, charset, force_name_as_uid, debug = parse_options(sys.argv[1:])

    logging.info("Parsing mailinglist - Started")
    logging.info("Parsing mailinglist - Source files: %(s)s" % {'s': repr(sources)})
    logging.info("Parsing mailinglist - Output directory: %(s)s" % {'s': outdir})

    # 1. load and parse each file into a list of messages
    logging.info("Parsing mailinglist - Reading the files")
    messages = []
    for source_file in sources:
        mbox = mailbox.mbox(source_file)
        for msg in mbox:
            messages.append(emt.Message(msg))

    # 2. build the threaded containers
    logging.info("Parsing mailinglist - Threading the messages")
    subject_table = emt.thread(messages)
    root_containers = [ctr for (subj, ctr) in subject_table.items()]
    containers = emp.promote_none_root_set_children(root_containers)
    if force_name_as_uid:
        emp.force_name_as_address(containers)

    # Debug
    if debug:
        print('==== Message threads ====')
        for container in containers:
            emp.print_container(container)
        print('=========================')

    # 3. extract the users, nodes and comments, and sort them
    logging.info("Parsing mailinglist - Extracting the data")
    users, nodes, comments = emp.users_nodes_comments_from(containers, moderators, charset)
    sorted_users = sorted(users, key=eu.sort_by('created'))
    sorted_nodes = sorted(nodes, key=eu.sort_by('created'))
    sorted_comments = sorted(comments, key=eu.sort_by('created'))

    # 4. save the files
    logging.info("Parsing mailinglist - Saving the files")
    write_file(sorted_users, 'users.json', outdir)
    write_file(sorted_nodes, 'nodes.json', outdir)
    write_file(sorted_comments, 'comments.json', outdir)

    logging.info("Parsing mailinglist - Completed")
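# eu.sort_by is used as a key factory throughout these scripts; a minimal
# sketch of the assumed helper (the real one lives in the `eu` utils module):
def sort_by(field):
    return lambda record: record[field]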
def main(): initialize_logger('./log') generated = datetime.now() source_csv, source_dir, outdir, dumpto = parse_options(sys.argv[1:]) logging.info("Parsing tweets - Started") logging.info("Parsing tweets - Output directory: %(s)s" % {'s': outdir}) # 1. load and parse the CSV file into a list of records if dumpto: tag = datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + '.csv' dump_to = os.path.join(dumpto, tag) else: dump_to = None tweets = [] if source_csv: tweets += et.parse.load_and_parse_csv(source_csv, sort_key='created_ts', dump_to=dump_to) if source_dir: tweets += et.parse.load_and_parse_from_dir(source_dir, sort_key='created_ts', dump_to=dump_to) # 2. extract the users from the tweets users = et.extract.extract_users(tweets) sorted_users = sorted(users, key=eu.sort_by('created')) # users = { 'users': [{'user': user_data} for user_data in users] } # 3. extract the nodes from the tweets nodes = et.extract.extract_nodes(tweets) sorted_nodes = sorted(nodes, key=eu.sort_by('created')) # nodes = { 'nodes': [{'node': node_data} for node_data in nodes] } # 4. extract the comments from the tweets comments = et.extract.extract_comments(tweets) sorted_comments = sorted(comments, key=eu.sort_by('created')) # comments = { 'comments': [{'comment': comment_data} for comment_data in comments] } # 5. saves the files write_file(tweets, 'tweets.json', outdir) write_file(sorted_users, 'users.json', outdir) write_file(sorted_nodes, 'nodes.json', outdir) write_file(sorted_comments, 'comments.json', outdir) logging.info("Parsing tweets - Completed")
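# write_file is called by the parsers above but not defined in this file; a
# minimal sketch of the assumed helper, wrapping the list under its plural
# name the way the loaders expect (e.g. {'users': [...]} read back as
# jusers['users']):
import json, os

def write_file(records, filename, outdir):
    kind = os.path.splitext(filename)[0]   # 'users.json' -> 'users'
    with open(os.path.join(outdir, filename), 'w') as f:
        json.dump({kind: records}, f, indent=2)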
def main(): initialize_logger("./log") generated = datetime.now() sources, outdir, moderators, charset, force_name_as_uid, debug = parse_options(sys.argv[1:]) logging.info("Parsing mailinglist - Started") logging.info("Parsing mailinglist - Source files: %(s)s" % {"s": repr(sources)}) logging.info("Parsing mailinglist - Output directory: %(s)s" % {"s": outdir}) # 1. load and parse each file in a list of messages logging.info("Parsing mailinglist - Reading the files") messages = [] for file in sources: mbox = mailbox.mbox(file) for msg in mbox: messages.append(emt.Message(msg)) # 2. build the threaded containers logging.info("Parsing mailinglist - Threading the messages") subject_table = emt.thread(messages) root_containers = [ctr for (subj, ctr) in subject_table.items()] containers = emp.promote_none_root_set_children(root_containers) if force_name_as_uid: emp.force_name_as_address(containers) # Debug if debug: print("==== Message threads ====") for container in containers: emp.print_container(container) print("=========================") # 3. extract the users nodes comments and sort them logging.info("Parsing mailinglist - Extracting the data") users, nodes, comments = emp.users_nodes_comments_from(containers, moderators, charset) sorted_users = sorted(users, key=eu.sort_by("created")) sorted_nodes = sorted(nodes, key=eu.sort_by("created")) sorted_comments = sorted(comments, key=eu.sort_by("created")) # 5. saves the files logging.info("Parsing mailinglist - Saving the files") write_file(sorted_users, "users.json", outdir) write_file(sorted_nodes, "nodes.json", outdir) write_file(sorted_comments, "comments.json", outdir) logging.info("Parsing mailinglist - Completed")
def main(): initialize_logger('./log') generated = datetime.now() source_csv, source_dir, outdir, dumpto = parse_options(sys.argv[1:]) logging.info("Parsing tweets - Started") logging.info("Parsing tweets - Output directory: %(s)s" % {'s':outdir}) # 1. load and parse the CSV file into a list of records if dumpto: tag = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')+'.csv' dump_to = os.path.join(dumpto, tag) else: dump_to = None tweets = [] if source_csv: tweets += et.parse.load_and_parse_csv(source_csv, sort_key='created_ts', dump_to=dump_to) if source_dir: tweets += et.parse.load_and_parse_from_dir(source_dir, sort_key='created_ts', dump_to=dump_to) # 2. extract the users from the tweets users = et.extract.extract_users(tweets) sorted_users = sorted(users, key=eu.sort_by('created')) # users = { 'users': [{'user': user_data} for user_data in users] } # 3. extract the nodes from the tweets nodes = et.extract.extract_nodes(tweets) sorted_nodes = sorted(nodes, key=eu.sort_by('created')) # nodes = { 'nodes': [{'node': node_data} for node_data in nodes] } # 4. extract the comments from the tweets comments = et.extract.extract_comments(tweets) sorted_comments = sorted(comments, key=eu.sort_by('created')) # comments = { 'comments': [{'comment': comment_data} for comment_data in comments] } # 5. saves the files write_file(tweets, 'tweets.json', outdir) write_file(sorted_users, 'users.json', outdir) write_file(sorted_nodes, 'nodes.json', outdir) write_file(sorted_comments, 'comments.json', outdir) logging.info("Parsing tweets - Completed")
def main(argv): initialize_logger('./log') source_path, outuput_filename = parse_options(argv) logging.info("Tutorial result processing - started") all_files = [ f for f in os.listdir(source_path) if os.path.isfile(os.path.join(source_path,f)) ] runs = {} timestamp = datetime.now() base_run_id = timestamp.strftime('%Y-%m-%d-%H-%M-%S') fake_run_id = 1 for filename in all_files: logging.info("Tutorial result processing - loading:"+os.path.join(source_path,filename)) f = open(os.path.join(source_path,filename), 'r') try: parsed = json.load(f) if parsed.has_key('run_id'): run_id = parsed['run_id'] else: run_id = base_run_id+'--'+str(fake_run_id) fake_run_id += 1 if not runs.has_key(run_id): runs[run_id] = {} run_obj = runs[run_id] run_obj['run_id'] = run_id if parsed.has_key('base'): run_obj['base'] = parsed['base'] m = re.search('(\d\d\d\d)-(\d\d)-(\d\d)-\d\d-\d\d-\d\d$', parsed['base']) if m: run_obj['date'] = m.group(1)+"-"+m.group(2)+"-"+m.group(3) if parsed.has_key('comments'): run_obj['comments'] = parsed['comments'].encode('utf-8').strip() # collect the tutorial answer results if parsed.has_key('answers'): for a in parsed['answers']: run_obj[a['step']] = a['success'] # collect the tutorial survey results if parsed.has_key('surveys'): for a in parsed['surveys']: run_obj[a['step']] = a['value'] except: logging.info("Tutorial result processing - error parsing:"+os.path.join(source_path,filename)) # save the runs to a CSV file logging.info("Tutorial result processing - Writing:"+outuput_filename) headers = [ 'run_id','base', 'date', \ 'betweenness_bin', 'relationship_percentage', \ 'posts_percentage', 'comments_share', \ 'modularity_increase', 'survey-1', \ 'survey-2', 'survey-3', 'survey-4', \ 'survey-5', 'comments'] with open(outuput_filename, 'wb') as f: w = csv.DictWriter(f, headers) w.writeheader() w.writerows(runs.values()) logging.info("Tutorial result processing - Completed")
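# Defensive variant of the CSV write in main() above (a sketch): DictWriter
# raises ValueError when a run contains step names outside the header list;
# extrasaction='ignore' drops the unexpected keys instead of failing.
import csv

def write_runs(runs, headers, output_filename):
    with open(output_filename, 'w', newline='') as f:
        w = csv.DictWriter(f, headers, extrasaction='ignore')
        w.writeheader()
        w.writerows(runs.values())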
def parse_options(argv):
    users_resource = 'users.json'
    nodes_resource = 'nodes.json'
    comments_resource = 'comments.json'
    node_title_field = 'uid'
    timestep_size = 60 * 60 * 24 * 7
    timestep_window = 1
    timestep_count = None
    username = None
    password = None
    extraction_method = 'nested'
    admin_roles = set()
    exclude_isolated = False
    dumpto = None
    basepath = os.path.dirname(__file__)
    destination_path = os.path.abspath(os.path.join(basepath, "..", "static", "json"))
    log_path = './log'
    create_datapackage = False
    datapackage_title = None
    license_type = None
    license_url = None
    site_url = None
    data = {}

    try:
        with open(argv[0], 'r') as datafile:
            data = json.load(datafile)
    except (IndexError, IOError, ValueError):
        print('Error reading the parameters file')
        sys.exit(2)

    if not data:
        print('edgesense_drupal <path to the parameters file>')
        sys.exit()

    if data.get('users'):
        users_resource = data['users']
    if data.get('nodes'):
        nodes_resource = data['nodes']
    if data.get('comments'):
        comments_resource = data['comments']
    if data.get('node_title'):
        node_title_field = data['node_title']
    if data.get('timestep_size'):
        timestep_size = int(data['timestep_size'])
    if data.get('count_window'):
        timestep_window = int(data['count_window'])
    if data.get('timestep_count'):
        timestep_count = int(data['timestep_count'])
    if 'auth' in data:
        auth = data['auth'] or {}
        username = auth.get('username')
        password = auth.get('password')
    if data.get('extraction_method'):
        extraction_method = data['extraction_method']
    if data.get('moderator_roles'):
        admin_roles = set(e.strip() for e in data['moderator_roles'].split(",") if e.strip())
    if data.get('exclude_isolated'):
        exclude_isolated = True
    if data.get('dumpto'):
        dumpto = data['dumpto']  # was data['extraction_method'], a copy-paste bug
    if data.get('destination_path'):
        destination_path = data['destination_path']
    if data.get('log_path'):
        log_path = data['log_path']
    if 'datapackage' in data:
        try:
            license_type = data['datapackage']['license_type']
            license_url = data['datapackage']['license_url']
            if 'title' in data['datapackage']:
                datapackage_title = data['datapackage']['title']
            site_url = data['datapackage']['site_url']
            create_datapackage = True
        except KeyError:
            # incomplete datapackage configuration: don't create one
            license_type = None
            license_url = None
            site_url = None
            create_datapackage = False

    # set up logging to file (edgesense.log in the same dir as the parameters file)
    initialize_logger(log_path, file_level=logging.DEBUG, console_level=logging.DEBUG, file_mode='w')

    logging.info("parsing files %(u)s %(n)s %(c)s" % {'u': users_resource, 'n': nodes_resource, 'c': comments_resource})

    return (users_resource, nodes_resource, comments_resource,
            node_title_field,
            timestep_size, timestep_window, timestep_count,
            username, password,
            extraction_method,
            admin_roles,
            exclude_isolated,
            dumpto,
            destination_path,
            create_datapackage, datapackage_title,
            license_type, license_url, site_url)
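# A sketch of a parameters file accepted by parse_options above. Every key is
# optional and falls back to the defaults in the function; the role names,
# credentials, license and URLs are hypothetical placeholders.
example_parameters = {
    "users": "users.json",
    "nodes": "nodes.json",
    "comments": "comments.json",
    "node_title": "uid",
    "timestep_size": 60 * 60 * 24 * 7,   # one week, in seconds
    "count_window": 1,
    "timestep_count": 20,
    "extraction_method": "nested",
    "moderator_roles": "administrator, moderator",
    "exclude_isolated": True,
    "auth": {"username": "user", "password": "secret"},
    "datapackage": {
        "license_type": "ODC-PDDL",
        "license_url": "http://opendatacommons.org/licenses/pddl/",
        "title": "Example community",
        "site_url": "http://example.org"
    }
}

import json
with open('parameters.json', 'w') as f:
    json.dump(example_parameters, f, indent=4)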
def main(debug=False):
    initialize_logger('./log')
    app.run(debug=debug, host=(None if debug else '0.0.0.0'))
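# Entry-point sketch: with debug=True the server binds to localhost with the
# reloader on; with debug=False it binds to all interfaces ('0.0.0.0').
if __name__ == '__main__':
    main(debug=True)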
def main(argv): initialize_logger('./log') source_path, outuput_filename = parse_options(argv) logging.info("Tutorial result processing - started") all_files = [ f for f in os.listdir(source_path) if os.path.isfile(os.path.join(source_path, f)) ] runs = {} timestamp = datetime.now() base_run_id = timestamp.strftime('%Y-%m-%d-%H-%M-%S') fake_run_id = 1 for filename in all_files: logging.info("Tutorial result processing - loading:" + os.path.join(source_path, filename)) f = open(os.path.join(source_path, filename), 'r') try: parsed = json.load(f) if parsed.has_key('run_id'): run_id = parsed['run_id'] else: run_id = base_run_id + '--' + str(fake_run_id) fake_run_id += 1 if not runs.has_key(run_id): runs[run_id] = {} run_obj = runs[run_id] run_obj['run_id'] = run_id if parsed.has_key('base'): run_obj['base'] = parsed['base'] m = re.search('(\d\d\d\d)-(\d\d)-(\d\d)-\d\d-\d\d-\d\d$', parsed['base']) if m: run_obj['date'] = m.group(1) + "-" + m.group( 2) + "-" + m.group(3) if parsed.has_key('comments'): run_obj['comments'] = parsed['comments'].encode( 'utf-8').strip() # collect the tutorial answer results if parsed.has_key('answers'): for a in parsed['answers']: run_obj[a['step']] = a['success'] # collect the tutorial survey results if parsed.has_key('surveys'): for a in parsed['surveys']: run_obj[a['step']] = a['value'] except: logging.info("Tutorial result processing - error parsing:" + os.path.join(source_path, filename)) # save the runs to a CSV file logging.info("Tutorial result processing - Writing:" + outuput_filename) headers = [ 'run_id','base', 'date', \ 'betweenness_bin', 'relationship_percentage', \ 'posts_percentage', 'comments_share', \ 'modularity_increase', 'survey-1', \ 'survey-2', 'survey-3', 'survey-4', \ 'survey-5', 'comments'] with open(outuput_filename, 'wb') as f: w = csv.DictWriter(f, headers) w.writeheader() w.writerows(runs.values()) logging.info("Tutorial result processing - Completed")