def main():
    initialize_logger('./log')

    generated = datetime.now()

    kind, source, outdir = parse_options(sys.argv[1:])

    logging.info("Parsing catalyst - Started")
    logging.info("Parsing catalyst - Source file: %(s)s" % {'s': source})
    logging.info("Parsing catalyst - Output directory: %(s)s" % {'s': outdir})
    logging.info("Parsing catalyst - Extraction Kind: %(s)s" % {'s': kind})

    # 1. load and parse the JSON file into an RDF Graph
    graph = ec.inference.catalyst_graph_for(source)

    # 2. extract the users, nodes and comments from the graph
    if kind == 'simple':
        users, nodes, comments = ec.extract.simple.users_nodes_comments_from(graph)
    elif kind == 'excerpts':
        users, nodes, comments = ec.extract.excerpts.users_nodes_comments_from(graph)
    else:
        logging.info("Parsing catalyst - Extraction kind not supported")
        return

    # 3. sort the lists
    sorted_users = sorted(users, key=eu.sort_by('created'))
    sorted_nodes = sorted(nodes, key=eu.sort_by('created'))
    sorted_comments = sorted(comments, key=eu.sort_by('created'))

    # 4. save the files
    write_file(sorted_users, 'users.json', outdir)
    write_file(sorted_nodes, 'nodes.json', outdir)
    write_file(sorted_comments, 'comments.json', outdir)

    logging.info("Parsing catalyst - Completed")
def convert_to_network(generated, graph, posts, creator_of_post, reply_of, moderator_test=None):
    all_creators = {creator_of_post.get(n, None) for n in posts}
    all_creators.discard(None)
    profile_of_account = {}
    nodes = [account_as_node(graph, account, profile_of_account, moderator_test)
             for account in all_creators]
    edges = []
    for post in posts:
        for i, replying in enumerate(reply_of.get(post, ())):
            post_id = stringify(post)
            if i:
                post_id = "%s__%d" % (post_id, i)
            edges.append(
                post_as_link(
                    graph,
                    post,
                    post_id,
                    replying,
                    profile_of_account[creator_of_post[post]],
                    profile_of_account[creator_of_post[replying]],
                    moderator_test,
                )
            )
    nodes.sort(key=eu.sort_by("created_ts"))
    edges.sort(key=eu.sort_by("ts"))
    # this is the network object;
    # going forward it should be read from a serialized format to handle caching
    return {"meta": {"generated": int(generated.strftime("%s"))}, "edges": edges, "nodes": nodes}
def parse_cif(source, kind):
    logging.info("parse_source - Started")
    logging.info("parse_source - Source: %(s)s" % {'s': source})
    logging.info("parse_source - Extraction Kind: %(s)s" % {'s': kind})

    # 1. load and parse the JSON file into an RDF Graph
    graph = ec.inference.catalyst_graph_for(source)

    # 2. extract the users, nodes and comments from the graph
    if kind == 'simple':
        users, nodes, comments = ec.extract.simple.users_nodes_comments_from(graph)
    elif kind == 'excerpts':
        users, nodes, comments = ec.extract.excerpts.users_nodes_comments_from(graph)
    else:
        logging.info("Parsing catalyst - Extraction kind not supported")
        return

    # 3. sort the lists
    sorted_users = sorted(users, key=eu.sort_by('created'))
    sorted_nodes = sorted(nodes, key=eu.sort_by('created'))
    sorted_comments = sorted(comments, key=eu.sort_by('created'))

    # 4. return the data
    logging.info("Parsing catalyst - Completed")
    return (sorted_users, sorted_nodes, sorted_comments)
def main():
    initialize_logger('./log')

    generated = datetime.now()

    sources, outdir, moderators, charset, force_name_as_uid, debug = parse_options(sys.argv[1:])

    logging.info("Parsing mailinglist - Started")
    logging.info("Parsing mailinglist - Source files: %(s)s" % {'s': repr(sources)})
    logging.info("Parsing mailinglist - Output directory: %(s)s" % {'s': outdir})

    # 1. load and parse each file into a list of messages
    logging.info("Parsing mailinglist - Reading the files")
    messages = []
    for file in sources:
        mbox = mailbox.mbox(file)
        for msg in mbox:
            messages.append(emt.Message(msg))

    # 2. build the threaded containers
    logging.info("Parsing mailinglist - Threading the messages")
    subject_table = emt.thread(messages)
    root_containers = [ctr for (subj, ctr) in subject_table.items()]
    containers = emp.promote_none_root_set_children(root_containers)

    if force_name_as_uid:
        emp.force_name_as_address(containers)

    # Debug
    if debug:
        print('==== Message threads ====')
        for container in containers:
            emp.print_container(container)
        print('=========================')

    # 3. extract the users, nodes and comments and sort them
    logging.info("Parsing mailinglist - Extracting the data")
    users, nodes, comments = emp.users_nodes_comments_from(containers, moderators, charset)
    sorted_users = sorted(users, key=eu.sort_by('created'))
    sorted_nodes = sorted(nodes, key=eu.sort_by('created'))
    sorted_comments = sorted(comments, key=eu.sort_by('created'))

    # 4. save the files
    logging.info("Parsing mailinglist - Saving the files")
    write_file(sorted_users, 'users.json', outdir)
    write_file(sorted_nodes, 'nodes.json', outdir)
    write_file(sorted_comments, 'comments.json', outdir)

    logging.info("Parsing mailinglist - Completed")
def extract_multiauthor_post_edges(nodes_map, posts_map):
    # build the list of edges
    edges_list = []

    # a post is 'multiauthor' if it carries an iterable 'all_authors' field with more than one author
    multiauthor_posts = [e for e in posts_map.values()
                         if e.get('all_authors', None)
                         and hasattr(e.get('all_authors', None), '__iter__')
                         and len(e.get('all_authors', None)) > 1]
    logging.info("%(v)i multiauthor posts on %(t)i total" % {'v': len(multiauthor_posts), 't': len(posts_map.values())})

    # build the posts network to use for metrics
    for post in multiauthor_posts:
        for authors in itertools.product(post['all_authors'], post['all_authors']):
            if authors[0] != authors[1]:
                link = {
                    'id': "{0}_{1}_{2}".format(authors[0], authors[1], post['created_ts']),
                    'source': authors[0],
                    'target': authors[1],
                    'ts': post['created_ts'],
                    'effort': post['length'],
                    'team': post['team']
                }
                # mark both endpoints as active; log any author that was linked but never mapped
                if authors[0] in nodes_map:
                    nodes_map[authors[0]]['active'] = True
                else:
                    logging.info("error: node %(n)s was linked but not found in the nodes_map" % {'n': authors[0]})
                if authors[1] in nodes_map:
                    nodes_map[authors[1]]['active'] = True
                else:
                    logging.info("error: node %(n)s was linked but not found in the nodes_map" % {'n': authors[1]})
                edges_list.append(link)

    return sorted(edges_list, key=eu.sort_by('ts'))
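# Note (illustration, not part of the original module): when `all_authors` holds distinct
# author ids, the product/inequality filter used above yields exactly the ordered pairs
# produced by itertools.permutations(authors, 2):
#
#   >>> import itertools
#   >>> authors = ['a', 'b', 'c']
#   >>> pairs = [p for p in itertools.product(authors, authors) if p[0] != p[1]]
#   >>> sorted(pairs) == sorted(itertools.permutations(authors, 2))
#   True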
def extract_edges(nodes_map, comments_map):
    # build the list of edges
    edges_list = []

    # a comment is 'valid' if it has a recipient and an author
    valid_comments = [e for e in comments_map.values()
                      if e.get('recipient_id', None) and e.get('author_id', None)]
    logging.info("%(v)i valid comments on %(t)i total" % {'v': len(valid_comments), 't': len(comments_map.values())})

    # build the comments network to use for metrics
    for comment in valid_comments:
        if comment['author_id'] in nodes_map:
            nodes_map[comment['author_id']]['active'] = True
        else:
            logging.info("error: node %(n)s was linked but not found in the nodes_map" % {'n': comment['author_id']})

        # a comment on a multi-author post produces one edge per author of that post
        if comment.get('post_all_authors', None) and hasattr(comment['post_all_authors'], '__iter__'):
            links = [make_edge(comment, recipient) for recipient in comment['post_all_authors']]
        else:
            links = [make_edge(comment, comment['recipient_id'])]

        for link in links:
            if link['target'] in nodes_map:
                nodes_map[link['target']]['active'] = True
            else:
                logging.info("error: node %(n)s was linked but not found in the nodes_map" % {'n': link['target']})
            edges_list.append(link)

    return sorted(edges_list, key=eu.sort_by('ts'))
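# `make_edge` is referenced above but not defined in this file. A minimal sketch of what
# it might look like, inferred from the edge dictionary built inline by the sibling
# extract_edges variant below (the field names and id format come from that code; the
# implementation itself is an assumption, not the project's actual helper):
def make_edge(comment, recipient):
    # build a single directed edge from the comment's author to the given recipient
    return {
        'id': "{0}_{1}_{2}".format(comment['author_id'], recipient, comment['created_ts']),
        'source': comment['author_id'],
        'target': recipient,
        'ts': comment['created_ts'],
        'effort': comment['length'],
        'team': comment['team']
    }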
def extract_edges(nodes_map, comments_map):
    # build the list of edges
    edges_list = []

    # a comment is 'valid' if it has a recipient and an author
    valid_comments = [e for e in comments_map.values()
                      if e.get('recipient_id', None) and e.get('author_id', None)]
    logging.info("%(v)i valid comments on %(t)i total" % {'v': len(valid_comments), 't': len(comments_map.values())})

    # build the whole network to use for metrics
    for comment in valid_comments:
        link = {
            'id': "{0}_{1}_{2}".format(comment['author_id'], comment['recipient_id'], comment['created_ts']),
            'source': comment['author_id'],
            'target': comment['recipient_id'],
            'ts': comment['created_ts'],
            'effort': comment['length'],
            'team': comment['team']
        }
        if comment['author_id'] in nodes_map:
            nodes_map[comment['author_id']]['active'] = True
        else:
            logging.info("error: node %(n)s was linked but not found in the nodes_map" % {'n': comment['author_id']})
        if comment['recipient_id'] in nodes_map:
            nodes_map[comment['recipient_id']]['active'] = True
        else:
            logging.info("error: node %(n)s was linked but not found in the nodes_map" % {'n': comment['recipient_id']})
        edges_list.append(link)

    return sorted(edges_list, key=eu.sort_by('ts'))
def main(): initialize_logger('./log') generated = datetime.now() source_csv, source_dir, outdir, dumpto = parse_options(sys.argv[1:]) logging.info("Parsing tweets - Started") logging.info("Parsing tweets - Output directory: %(s)s" % {'s': outdir}) # 1. load and parse the CSV file into a list of records if dumpto: tag = datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + '.csv' dump_to = os.path.join(dumpto, tag) else: dump_to = None tweets = [] if source_csv: tweets += et.parse.load_and_parse_csv(source_csv, sort_key='created_ts', dump_to=dump_to) if source_dir: tweets += et.parse.load_and_parse_from_dir(source_dir, sort_key='created_ts', dump_to=dump_to) # 2. extract the users from the tweets users = et.extract.extract_users(tweets) sorted_users = sorted(users, key=eu.sort_by('created')) # users = { 'users': [{'user': user_data} for user_data in users] } # 3. extract the nodes from the tweets nodes = et.extract.extract_nodes(tweets) sorted_nodes = sorted(nodes, key=eu.sort_by('created')) # nodes = { 'nodes': [{'node': node_data} for node_data in nodes] } # 4. extract the comments from the tweets comments = et.extract.extract_comments(tweets) sorted_comments = sorted(comments, key=eu.sort_by('created')) # comments = { 'comments': [{'comment': comment_data} for comment_data in comments] } # 5. saves the files write_file(tweets, 'tweets.json', outdir) write_file(sorted_users, 'users.json', outdir) write_file(sorted_nodes, 'nodes.json', outdir) write_file(sorted_comments, 'comments.json', outdir) logging.info("Parsing tweets - Completed")
def main(): initialize_logger("./log") generated = datetime.now() sources, outdir, moderators, charset, force_name_as_uid, debug = parse_options(sys.argv[1:]) logging.info("Parsing mailinglist - Started") logging.info("Parsing mailinglist - Source files: %(s)s" % {"s": repr(sources)}) logging.info("Parsing mailinglist - Output directory: %(s)s" % {"s": outdir}) # 1. load and parse each file in a list of messages logging.info("Parsing mailinglist - Reading the files") messages = [] for file in sources: mbox = mailbox.mbox(file) for msg in mbox: messages.append(emt.Message(msg)) # 2. build the threaded containers logging.info("Parsing mailinglist - Threading the messages") subject_table = emt.thread(messages) root_containers = [ctr for (subj, ctr) in subject_table.items()] containers = emp.promote_none_root_set_children(root_containers) if force_name_as_uid: emp.force_name_as_address(containers) # Debug if debug: print("==== Message threads ====") for container in containers: emp.print_container(container) print("=========================") # 3. extract the users nodes comments and sort them logging.info("Parsing mailinglist - Extracting the data") users, nodes, comments = emp.users_nodes_comments_from(containers, moderators, charset) sorted_users = sorted(users, key=eu.sort_by("created")) sorted_nodes = sorted(nodes, key=eu.sort_by("created")) sorted_comments = sorted(comments, key=eu.sort_by("created")) # 5. saves the files logging.info("Parsing mailinglist - Saving the files") write_file(sorted_users, "users.json", outdir) write_file(sorted_nodes, "nodes.json", outdir) write_file(sorted_comments, "comments.json", outdir) logging.info("Parsing mailinglist - Completed")
def compute_all_metrics(nodes_map, posts_map, comments_map, network, timesteps_range, timestep, timestep_window):
    metrics = {}
    # calculate the network metrics
    for ts in timesteps_range:
        metrics[ts] = metrics_for_ts(nodes_map, posts_map, comments_map, network, ts, timestep, timestep_window)
    return sorted(metrics.values(), key=sort_by('ts'))
def compute_all_metrics(nodes_map, posts_map, comments_map, network, timesteps_range, timestep, timestep_window):
    metrics = {}
    # calculate the network metrics
    for ts in timesteps_range:
        metrics[ts] = metrics_for_ts(nodes_map, posts_map, comments_map, network, ts, timestep, timestep_window)
    # skip timesteps for which no metrics could be computed
    return sorted([m for m in metrics.values() if m is not None], key=sort_by('ts'))
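# `sort_by` (also used as eu.sort_by elsewhere) is not defined in this file. A minimal
# sketch of a compatible key-function factory, assuming it simply looks up the given
# field on each dict being sorted (this implementation is an assumption, not the
# project's actual helper):
def sort_by(field):
    # return a key function that extracts `field` from each dict passed to sorted()
    return lambda item: item[field]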
def main(): initialize_logger('./log') generated = datetime.now() source_csv, source_dir, outdir, dumpto = parse_options(sys.argv[1:]) logging.info("Parsing tweets - Started") logging.info("Parsing tweets - Output directory: %(s)s" % {'s':outdir}) # 1. load and parse the CSV file into a list of records if dumpto: tag = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')+'.csv' dump_to = os.path.join(dumpto, tag) else: dump_to = None tweets = [] if source_csv: tweets += et.parse.load_and_parse_csv(source_csv, sort_key='created_ts', dump_to=dump_to) if source_dir: tweets += et.parse.load_and_parse_from_dir(source_dir, sort_key='created_ts', dump_to=dump_to) # 2. extract the users from the tweets users = et.extract.extract_users(tweets) sorted_users = sorted(users, key=eu.sort_by('created')) # users = { 'users': [{'user': user_data} for user_data in users] } # 3. extract the nodes from the tweets nodes = et.extract.extract_nodes(tweets) sorted_nodes = sorted(nodes, key=eu.sort_by('created')) # nodes = { 'nodes': [{'node': node_data} for node_data in nodes] } # 4. extract the comments from the tweets comments = et.extract.extract_comments(tweets) sorted_comments = sorted(comments, key=eu.sort_by('created')) # comments = { 'comments': [{'comment': comment_data} for comment_data in comments] } # 5. saves the files write_file(tweets, 'tweets.json', outdir) write_file(sorted_users, 'users.json', outdir) write_file(sorted_nodes, 'nodes.json', outdir) write_file(sorted_comments, 'comments.json', outdir) logging.info("Parsing tweets - Completed")
def load_and_parse_csv(source, sort_key=None, dump_to=None):
    raw_tweets = eu.resource.load_csv(source, dump_to=dump_to)
    tweets = [map_csv_data(t) for t in raw_tweets]
    tweets = [t for t in tweets if t]
    logging.info(
        "Parsing tweets - read %(t)i tweets in CSV, using %(v)i valid tweets" % {
            't': len(raw_tweets),
            'v': len(tweets)
        })
    return sorted(tweets, key=eu.sort_by(sort_key))
def load_and_parse_from_dir(source, sort_key=None, dump_to=None):
    raw_tweets = [
        json.load(open(f, 'r'))
        for f in glob.glob(os.path.join(source, "*.json"))
    ]
    tweets = [map_json_data(t) for t in raw_tweets]
    tweets = [t for t in tweets if t]
    logging.info(
        "Parsing tweets - read %(t)i tweets from JSON, using %(v)i valid tweets" % {
            't': len(raw_tweets),
            'v': len(tweets)
        })
    return sorted(tweets, key=eu.sort_by(sort_key))
def convert_to_network(generated, graph, posts, creator_of_post, reply_of, moderator_test=None):
    all_creators = {creator_of_post.get(n, None) for n in posts}
    all_creators.discard(None)
    profile_of_account = {}
    nodes = [
        account_as_node(graph, account, profile_of_account, moderator_test)
        for account in all_creators
    ]
    edges = []
    for post in posts:
        for i, replying in enumerate(reply_of.get(post, ())):
            post_id = stringify(post)
            if i:
                post_id = '%s__%d' % (post_id, i)
            edges.append(
                post_as_link(graph, post, post_id, replying,
                             profile_of_account[creator_of_post[post]],
                             profile_of_account[creator_of_post[replying]],
                             moderator_test))
    nodes.sort(key=eu.sort_by('created_ts'))
    edges.sort(key=eu.sort_by('ts'))
    # this is the network object;
    # going forward it should be read from a serialized format to handle caching
    return {
        'meta': {
            'generated': int(generated.strftime("%s"))
        },
        'edges': edges,
        'nodes': nodes
    }
def load_and_parse_from_dir(source, sort_key=None, dump_to=None):
    raw_tweets = [json.load(open(f, 'r'))
                  for f in glob.glob(os.path.join(source, "*.json"))]
    tweets = [map_json_data(t) for t in raw_tweets]
    tweets = [t for t in tweets if t]
    logging.info("Parsing tweets - read %(t)i tweets from JSON, using %(v)i valid tweets"
                 % {'t': len(raw_tweets), 'v': len(tweets)})
    return sorted(tweets, key=eu.sort_by(sort_key))
def load_and_parse_csv(source, sort_key=None, dump_to=None): raw_tweets = eu.resource.load_csv(source, dump_to=dump_to) tweets = [map_csv_data(t) for t in raw_tweets] tweets = [t for t in tweets if t] logging.info("Parsing tweets - read %(t)i tweets in CSV, using %(v)i valid tweets" % {'t': len(raw_tweets), 'v': len(tweets)}) return sorted(tweets, key=eu.sort_by(sort_key))