def make_artificial_data(
        # for main events
        n_events, event_size_mu, event_size_sigma,
        participant_mu, participant_sigma,
        # for minor events
        n_minor_events, minor_event_size_mu, minor_event_size_sigma,
        minor_event_participant_mu, minor_event_participant_sigma,
        # shared
        n_total_participants, min_time, max_time,
        event_duration_mu, event_duration_sigma,
        n_topics, topic_scaling_factor, topic_noise,
        n_noisy_interactions, n_noisy_interactions_fraction,
        alpha, tau,
        forward_proba, reply_proba, create_new_proba,
        dist_func):
    """Generate main events, minor events and noisy interactions.

    Main and minor events both come from `random_events`; the taboo topics
    returned by the main-event call are handed to the minor-event call and
    to `random_noisy_interactions`.  Every interaction (the node attribute
    dicts of all events plus the noisy ones) gets a sequential
    `message_id`, and its `topics` value is converted with `.tolist()`.

    Returns a 3-tuple:
      - the main events relabeled so that node labels are message ids,
      - the list of all interactions (main + minor + noisy, in that order),
      - `get_gen_cand_tree_params(e)` for every (un-relabeled) main event,
        computed after `IU.assign_edge_weights(e, dist_func)` was applied.
    """
    # Trailing positional arguments that both `random_events` calls share.
    common_args = (
        min_time, max_time,
        event_duration_mu, event_duration_sigma,
        n_topics, topic_scaling_factor, topic_noise,
        alpha, tau,
        forward_proba, reply_proba, create_new_proba
    )

    events, taboo_topics = random_events(
        n_events, event_size_mu, event_size_sigma,
        n_total_participants, participant_mu, participant_sigma,
        *common_args,
        accumulate_taboo=True
    )
    minor_events, _ = random_events(
        n_minor_events, minor_event_size_mu, minor_event_size_sigma,
        n_total_participants,
        minor_event_participant_mu, minor_event_participant_sigma,
        *common_args,
        taboo_topics=taboo_topics,
        accumulate_taboo=False
    )

    # The amount of noise is derived from the node count of the main events.
    main_node_count = 0
    for event in events:
        for _node in event:
            main_node_count += 1
    n_noisy_interactions, _ = get_number_and_percentage(
        main_node_count,
        n_noisy_interactions,
        n_noisy_interactions_fraction
    )
    noisy_interactions = random_noisy_interactions(
        n_noisy_interactions,
        min_time, max_time,
        n_total_participants,
        n_topics, topic_noise,
        taboo_topics
    )

    event_interactions = []
    for event in events:
        for node in event.nodes_iter():
            event_interactions.append(event.node[node])

    minor_event_interactions = []
    for event in minor_events:
        for node in event.nodes_iter():
            minor_event_interactions.append(event.node[node])

    all_interactions = (event_interactions
                        + minor_event_interactions
                        + noisy_interactions)

    # add interaction id
    for msg_id, interaction in enumerate(all_interactions):
        interaction['message_id'] = msg_id
        interaction['topics'] = interaction['topics'].tolist()

    # relabel the nodes so that each node is named by its message id
    relabeled_events = [
        nx.relabel_nodes(
            event,
            {node: event.node[node]['message_id']
             for node in event.nodes_iter()}
        )
        for event in events
    ]

    # Edge weights go onto the original events, after the relabeled
    # copies above have been made.
    for event in events:
        IU.assign_edge_weights(event, dist_func)

    gen_cand_trees_params = []
    for event in events:
        gen_cand_trees_params.append(get_gen_cand_tree_params(event))

    return relabeled_events, all_interactions, gen_cand_trees_params
def run(gen_tree_func,
        msg_ids_path,
        root_sampling_method='random',
        interaction_path=os.path.join(CURDIR, 'data/enron.json'),
        lda_model_path=os.path.join(CURDIR, 'models/model-4-50.lda'),
        corpus_dict_path=os.path.join(CURDIR, 'models/dictionary.pkl'),
        meta_graph_pkl_path_prefix=os.path.join(CURDIR, 'data/enron'),
        meta_graph_pkl_suffix='',
        cand_tree_number=None,  # higher priority than percentage
        cand_tree_percent=0.1,
        result_pkl_path_prefix=os.path.join(CURDIR, 'tmp/results'),
        result_suffix='',
        all_paths_pkl_prefix='',
        all_paths_pkl_suffix='',
        true_events_path='',
        meta_graph_kws={
            'dist_func': cosine,
            'preprune_secs': timedelta(weeks=4),
            'distance_weights': {'topics': 0.2, 'bow': 0.8},
            # 'timestamp_converter': lambda s: s
        },
        gen_tree_kws={
            'timespan': timedelta(weeks=4),
            'U': 0.5,
            'dijkstra': False
        },
        convert_time=True,
        roots=None,
        calculate_graph=False,
        given_topics=False,
        print_summary=False,
        should_binarize_dag=False):
    """Build (or load) the meta graph, compute candidate trees, pickle them.

    Args:
        gen_tree_func: forwarded to `calc_tree`.
        msg_ids_path: text file whose stripped lines become `msg_ids`;
            only read when the meta graph is (re)computed.
        root_sampling_method: 'random' selects `RandomSampler`,
            'upperbound' selects `UBSampler`, any other value selects
            `AdaptiveSampler`.  Not consulted when `roots` is given.
        interaction_path: `.json` file (parsed as one document, falling
            back to `load_json_by_line` on `ValueError`) or `.pkl` file.
        lda_model_path, corpus_dict_path: loaded through gensim unless
            `given_topics` is true.
        meta_graph_pkl_path_prefix, meta_graph_pkl_suffix: combined with
            `experiment_signature(**meta_graph_kws)` into the meta graph
            pickle path.
        cand_tree_number, cand_tree_percent: resolved through
            `get_number_and_percentage` against the graph's node count
            when no `roots` are given.
        result_pkl_path_prefix, result_suffix: prefix/suffix of the path
            the trees are pickled to.
        all_paths_pkl_prefix, all_paths_pkl_suffix: prefix/suffix of the
            path the returned dict is pickled to.
        true_events_path: only recorded in the returned dict.
        meta_graph_kws: forwarded to `IU.get_topic_meta_graph`; must have
            a 'preprune_secs' entry (a `timedelta` is converted to
            seconds on a deep copy).
        gen_tree_kws: must have 'timespan' and 'U'; forwarded to
            `calc_tree`.
        convert_time, given_topics: forwarded to
            `IU.get_topic_meta_graph`.
        roots: when truthy, `DeterministicSampler` is used and one tree
            per root is attempted.
        calculate_graph: recompute the meta graph even if its pickle
            already exists.
        print_summary: log `get_summary(g)` at debug level; also
            forwarded to `calc_tree`.
        should_binarize_dag: forwarded to `calc_tree`.

    Returns:
        dict with the keys 'interactions', 'meta_graph', 'result',
        'true_events' and 'self', all of them paths.

    Raises:
        ValueError: `interaction_path` ends with neither `.json` nor
            `.pkl`.
        AssertionError: the meta graph has no nodes.
    """
    # NOTE(review): `meta_graph_kws` / `gen_tree_kws` are mutable defaults.
    # This function never mutates them itself (the former is deep-copied
    # before modification); whether `calc_tree` mutates `gen_tree_kws`
    # cannot be told from here -- confirm before relying on it.
    if isinstance(gen_tree_kws['timespan'], timedelta):
        timespan = gen_tree_kws['timespan'].total_seconds()
    else:
        timespan = gen_tree_kws['timespan']
    U = gen_tree_kws['U']

    if interaction_path.endswith(".json"):
        try:
            with open(interaction_path) as f:
                interactions = json.load(f)
        except ValueError:
            # not a single JSON document, try one JSON object per line
            interactions = load_json_by_line(interaction_path)
    elif interaction_path.endswith(".pkl"):
        # pickles are binary data: open in binary mode and close the handle
        with open(interaction_path, 'rb') as f:
            interactions = pickle.load(f)
    else:
        raise ValueError("invalid path extension: {}".format(interaction_path))

    logger.info('loading lda from {}'.format(lda_model_path))
    if not given_topics:
        lda_model = gensim.models.wrappers.LdaMallet.load(
            os.path.join(CURDIR, lda_model_path)
        )
        dictionary = gensim.corpora.dictionary.Dictionary.load(
            os.path.join(CURDIR, corpus_dict_path)
        )
    else:
        lda_model = None
        dictionary = None

    meta_graph_pkl_path = "{}--{}{}.pkl".format(
        meta_graph_pkl_path_prefix,
        experiment_signature(**meta_graph_kws),
        meta_graph_pkl_suffix
    )
    logger.info('meta_graph_pkl_path: {}'.format(meta_graph_pkl_path))

    if calculate_graph or not os.path.exists(meta_graph_pkl_path):
        # we want to calculate the graph or
        # it's not there so we have to
        logger.info('calculating meta_graph...')
        meta_graph_kws_copied = copy.deepcopy(meta_graph_kws)
        with open(msg_ids_path) as f:
            msg_ids = [l.strip() for l in f]

        if isinstance(meta_graph_kws_copied['preprune_secs'], timedelta):
            meta_graph_kws_copied['preprune_secs'] = \
                meta_graph_kws['preprune_secs'].total_seconds()
        g = IU.get_topic_meta_graph(
            interactions,
            msg_ids=msg_ids,
            lda_model=lda_model,
            dictionary=dictionary,
            undirected=False,  # deprecated
            given_topics=given_topics,
            decompose_interactions=False,
            convert_time=convert_time,
            **meta_graph_kws_copied
        )

        logger.info('pickling...')
        nx.write_gpickle(
            IU.compactize_meta_graph(g, map_nodes=False),
            meta_graph_pkl_path
        )
    else:
        logger.info('loading pickle...')
        g = nx.read_gpickle(meta_graph_pkl_path)

    if print_summary:
        logger.debug(get_summary(g))

    assert g.number_of_nodes() > 0, 'empty graph!'

    if not roots:
        cand_tree_number, cand_tree_percent = get_number_and_percentage(
            g.number_of_nodes(),
            cand_tree_number,
            cand_tree_percent
        )
        if root_sampling_method == 'random':
            root_sampler = RandomSampler(g, timespan)
        elif root_sampling_method == 'upperbound':
            root_sampler = UBSampler(g, U, timespan)
        else:
            logger.info('init AdaptiveSampler...')
            root_sampler = AdaptiveSampler(g, U, timespan)
    else:
        logger.info('Roots given')
        cand_tree_number = len(roots)
        root_sampler = DeterministicSampler(g, roots, timespan)

    logger.info('#roots: {}'.format(cand_tree_number))
    logger.info('#cand_tree_percent: {}'.format(
        cand_tree_number / float(g.number_of_nodes()))
    )

    trees = []
    dags = []
    for i in xrange(cand_tree_number):
        logger.info("sampling root...")
        try:
            root, dag = root_sampler.take()
        except IndexError:
            logger.warning('not enough root to take, terminate')
            break
        dags.append(dag)
        start = datetime.now()
        tree = calc_tree(i, root, dag, U,
                         gen_tree_func,
                         gen_tree_kws,
                         print_summary,
                         should_binarize_dag=should_binarize_dag)
        tree.graph['calculation_time'] = \
            (datetime.now() - start).total_seconds()
        trees.append(tree)

        logger.info("updating sampler states...")
        root_sampler.update(root, tree)

    def make_detailed_path(prefix, suffix):
        # Encode both keyword sets and the sampling setup in the file name.
        return "{}--{}----{}----{}{}.pkl".format(
            prefix,
            experiment_signature(**gen_tree_kws),
            experiment_signature(**meta_graph_kws),
            experiment_signature(
                cand_tree_percent=cand_tree_percent,
                root_sampling=root_sampling_method
            ),
            suffix
        )

    result_pkl_path = make_detailed_path(result_pkl_path_prefix,
                                         result_suffix)
    logger.info('result_pkl_path: {}'.format(result_pkl_path))
    # HIGHEST_PROTOCOL is a binary protocol, hence the 'wb' mode.
    with open(result_pkl_path, 'wb') as f:
        pickle.dump(trees, f, protocol=pickle.HIGHEST_PROTOCOL)

    if False:
        # for debugging purpose
        with open(result_pkl_path + '.dag', 'wb') as f:
            pickle.dump(dags, f, protocol=pickle.HIGHEST_PROTOCOL)

    all_paths_pkl_path = make_detailed_path(all_paths_pkl_prefix,
                                            all_paths_pkl_suffix)
    logger.info('Dumping the paths info to {}'.format(all_paths_pkl_path))
    paths_dict = {'interactions': interaction_path,
                  'meta_graph': meta_graph_pkl_path,
                  'result': result_pkl_path,
                  'true_events': true_events_path,
                  'self': all_paths_pkl_path
    }
    with open(all_paths_pkl_path, 'wb') as f:
        pickle.dump(paths_dict, f)
    return paths_dict
def make_artificial_data(
        # for main events
        n_events,
        event_size_mu,
        event_size_sigma,
        participant_mu,
        participant_sigma,
        # for minor events
        n_minor_events,
        minor_event_size_mu,
        minor_event_size_sigma,
        minor_event_participant_mu,
        minor_event_participant_sigma,
        # shared
        n_total_participants,
        min_time,
        max_time,
        event_duration_mu,
        event_duration_sigma,
        n_topics,
        topic_scaling_factor,
        topic_noise,
        n_noisy_interactions,
        n_noisy_interactions_fraction,
        alpha,
        tau,
        forward_proba,
        reply_proba,
        create_new_proba,
        dist_func):
    """Create artificial events plus noise and package them for later use.

    Steps, in order:
      1. main events via `random_events` (taboo topics accumulated),
      2. minor events via `random_events`, given those taboo topics,
      3. noisy interactions via `random_noisy_interactions`; their number
         comes from `get_number_and_percentage` applied to the node count
         of the main events,
      4. every interaction receives a running `message_id` and its
         `topics` entry is replaced by `topics.tolist()`,
      5. main events are relabeled by message id (as new graphs),
      6. `IU.assign_edge_weights` is applied to the original main events.

    Returns `(relabeled_events, all_interactions, gen_cand_trees_params)`
    where the last item holds `get_gen_cand_tree_params(e)` for each
    original main event.
    """
    def node_payloads(graphs):
        # attribute dict of every node of every graph, in iteration order
        return [g.node[n] for g in graphs for n in g.nodes_iter()]

    events, taboo_topics = random_events(
        n_events, event_size_mu, event_size_sigma,
        n_total_participants, participant_mu, participant_sigma,
        min_time, max_time, event_duration_mu, event_duration_sigma,
        n_topics, topic_scaling_factor, topic_noise,
        alpha, tau,
        forward_proba, reply_proba, create_new_proba,
        accumulate_taboo=True)

    minor_events, _ = random_events(
        n_minor_events, minor_event_size_mu, minor_event_size_sigma,
        n_total_participants,
        minor_event_participant_mu, minor_event_participant_sigma,
        min_time, max_time, event_duration_mu, event_duration_sigma,
        n_topics, topic_scaling_factor, topic_noise,
        alpha, tau,
        forward_proba, reply_proba, create_new_proba,
        taboo_topics=taboo_topics,
        accumulate_taboo=False)

    # one unit per node of every main event
    n_event_nodes = sum(1 for ev in events for _ in ev)
    number_and_percentage = get_number_and_percentage(
        n_event_nodes, n_noisy_interactions, n_noisy_interactions_fraction)
    n_noisy_interactions = number_and_percentage[0]

    noisy_interactions = random_noisy_interactions(
        n_noisy_interactions, min_time, max_time, n_total_participants,
        n_topics, topic_noise, taboo_topics)

    all_interactions = (node_payloads(events)
                        + node_payloads(minor_events)
                        + noisy_interactions)

    # add interaction id
    for idx, record in enumerate(all_interactions):
        record['message_id'] = idx
        record['topics'] = record['topics'].tolist()

    # relabel the nodes
    relabeled_events = []
    for ev in events:
        old_to_new = {}
        for n in ev.nodes_iter():
            old_to_new[n] = ev.node[n]['message_id']
        relabeled_events.append(nx.relabel_nodes(ev, old_to_new))

    # weights are assigned on the original graphs, not the relabeled ones
    for ev in events:
        IU.assign_edge_weights(ev, dist_func)

    gen_cand_trees_params = [get_gen_cand_tree_params(ev) for ev in events]
    return relabeled_events, all_interactions, gen_cand_trees_params
def run(
        gen_tree_func,
        msg_ids_path,
        root_sampling_method='random',
        interaction_path=os.path.join(CURDIR, 'data/enron.json'),
        lda_model_path=os.path.join(CURDIR, 'models/model-4-50.lda'),
        corpus_dict_path=os.path.join(CURDIR, 'models/dictionary.pkl'),
        meta_graph_pkl_path_prefix=os.path.join(CURDIR, 'data/enron'),
        meta_graph_pkl_suffix='',
        cand_tree_number=None,  # higher priority than percentage
        cand_tree_percent=0.1,
        result_pkl_path_prefix=os.path.join(CURDIR, 'tmp/results'),
        result_suffix='',
        all_paths_pkl_prefix='',
        all_paths_pkl_suffix='',
        true_events_path='',
        meta_graph_kws={
            'dist_func': cosine,
            'preprune_secs': timedelta(weeks=4),
            'distance_weights': {
                'topics': 0.2,
                'bow': 0.8
            },
            # 'timestamp_converter': lambda s: s
        },
        gen_tree_kws={
            'timespan': timedelta(weeks=4),
            'U': 0.5,
            'dijkstra': False
        },
        convert_time=True,
        roots=None,
        calculate_graph=False,
        given_topics=False,
        print_summary=False,
        should_binarize_dag=False):
    """Run one experiment: get the meta graph, grow trees, dump results.

    The meta graph is read from its pickle when that file exists and
    `calculate_graph` is false; otherwise it is computed with
    `IU.get_topic_meta_graph` and written with `nx.write_gpickle`.  Roots
    are then drawn from a sampler and `calc_tree` is called once per root
    until `cand_tree_number` trees exist or the sampler raises
    `IndexError`.

    Args:
        gen_tree_func: forwarded to `calc_tree`.
        msg_ids_path: file with one message id per line (lines are
            stripped); read only when the meta graph is computed.
        root_sampling_method: 'random' -> `RandomSampler`, 'upperbound'
            -> `UBSampler`, otherwise `AdaptiveSampler`; unused when
            `roots` is truthy.
        interaction_path: path ending in `.json` or `.pkl`.
        lda_model_path, corpus_dict_path: gensim model / dictionary
            files, loaded only when `given_topics` is false.
        meta_graph_pkl_path_prefix, meta_graph_pkl_suffix: parts of the
            meta graph pickle path.
        cand_tree_number, cand_tree_percent: passed to
            `get_number_and_percentage` with the node count when `roots`
            is falsy.
        result_pkl_path_prefix, result_suffix: parts of the tree pickle
            path.
        all_paths_pkl_prefix, all_paths_pkl_suffix: parts of the path the
            returned dict is pickled to.
        true_events_path: stored verbatim in the returned dict.
        meta_graph_kws: keyword arguments for `IU.get_topic_meta_graph`;
            'preprune_secs' is required and a `timedelta` value is turned
            into seconds on a deep copy.
        gen_tree_kws: must contain 'timespan' and 'U'; passed to
            `calc_tree`.
        convert_time, given_topics: forwarded to
            `IU.get_topic_meta_graph`.
        roots: explicit roots for `DeterministicSampler`.
        calculate_graph: force recomputation of the meta graph.
        print_summary: log `get_summary(g)`; also forwarded to
            `calc_tree`.
        should_binarize_dag: forwarded to `calc_tree`.

    Returns:
        dict mapping 'interactions', 'meta_graph', 'result',
        'true_events' and 'self' to the corresponding paths.

    Raises:
        ValueError: unsupported extension of `interaction_path`.
        AssertionError: the meta graph is empty.
    """
    # NOTE(review): the dict defaults are shared between calls.  Nothing in
    # this body mutates them (`meta_graph_kws` is deep-copied first), but
    # `calc_tree` receives `gen_tree_kws` directly -- verify it does not
    # modify it.
    timespan = gen_tree_kws['timespan']
    if isinstance(timespan, timedelta):
        timespan = timespan.total_seconds()
    U = gen_tree_kws['U']

    if interaction_path.endswith(".json"):
        try:
            with open(interaction_path) as json_file:
                interactions = json.load(json_file)
        except ValueError:
            # fall back to the line-delimited JSON loader
            interactions = load_json_by_line(interaction_path)
    elif interaction_path.endswith(".pkl"):
        # binary mode + context manager: no leaked handle, no newline
        # translation of pickle data
        with open(interaction_path, 'rb') as pkl_file:
            interactions = pickle.load(pkl_file)
    else:
        raise ValueError("invalid path extension: {}".format(interaction_path))

    logger.info('loading lda from {}'.format(lda_model_path))
    if not given_topics:
        lda_model = gensim.models.wrappers.LdaMallet.load(
            os.path.join(CURDIR, lda_model_path))
        dictionary = gensim.corpora.dictionary.Dictionary.load(
            os.path.join(CURDIR, corpus_dict_path))
    else:
        lda_model = None
        dictionary = None

    meta_graph_pkl_path = "{}--{}{}.pkl".format(
        meta_graph_pkl_path_prefix,
        experiment_signature(**meta_graph_kws),
        meta_graph_pkl_suffix)
    logger.info('meta_graph_pkl_path: {}'.format(meta_graph_pkl_path))

    if calculate_graph or not os.path.exists(meta_graph_pkl_path):
        # we want to calculate the graph or
        # it's not there so we have to
        logger.info('calculating meta_graph...')
        meta_graph_kws_copied = copy.deepcopy(meta_graph_kws)
        with open(msg_ids_path) as f:
            msg_ids = [l.strip() for l in f]

        if isinstance(meta_graph_kws_copied['preprune_secs'], timedelta):
            meta_graph_kws_copied['preprune_secs'] = meta_graph_kws[
                'preprune_secs'].total_seconds()
        g = IU.get_topic_meta_graph(
            interactions,
            msg_ids=msg_ids,
            lda_model=lda_model,
            dictionary=dictionary,
            undirected=False,  # deprecated
            given_topics=given_topics,
            decompose_interactions=False,
            convert_time=convert_time,
            **meta_graph_kws_copied)

        logger.info('pickling...')
        nx.write_gpickle(IU.compactize_meta_graph(g, map_nodes=False),
                         meta_graph_pkl_path)
    else:
        logger.info('loading pickle...')
        g = nx.read_gpickle(meta_graph_pkl_path)

    if print_summary:
        logger.debug(get_summary(g))

    assert g.number_of_nodes() > 0, 'empty graph!'

    if not roots:
        cand_tree_number, cand_tree_percent = get_number_and_percentage(
            g.number_of_nodes(), cand_tree_number, cand_tree_percent)
        if root_sampling_method == 'random':
            root_sampler = RandomSampler(g, timespan)
        elif root_sampling_method == 'upperbound':
            root_sampler = UBSampler(g, U, timespan)
        else:
            logger.info('init AdaptiveSampler...')
            root_sampler = AdaptiveSampler(g, U, timespan)
    else:
        logger.info('Roots given')
        cand_tree_number = len(roots)
        root_sampler = DeterministicSampler(g, roots, timespan)

    logger.info('#roots: {}'.format(cand_tree_number))
    logger.info('#cand_tree_percent: {}'.format(
        cand_tree_number / float(g.number_of_nodes())))

    trees = []
    dags = []
    for i in xrange(cand_tree_number):
        logger.info("sampling root...")
        try:
            root, dag = root_sampler.take()
        except IndexError:
            logger.warning('not enough root to take, terminate')
            break
        dags.append(dag)
        start = datetime.now()
        tree = calc_tree(i, root, dag, U,
                         gen_tree_func,
                         gen_tree_kws,
                         print_summary,
                         should_binarize_dag=should_binarize_dag)
        tree.graph['calculation_time'] = (
            datetime.now() - start).total_seconds()
        trees.append(tree)

        logger.info("updating sampler states...")
        root_sampler.update(root, tree)

    def make_detailed_path(prefix, suffix):
        # File name carries the signatures of both keyword sets and of the
        # sampling configuration.
        return "{}--{}----{}----{}{}.pkl".format(
            prefix,
            experiment_signature(**gen_tree_kws),
            experiment_signature(**meta_graph_kws),
            experiment_signature(cand_tree_percent=cand_tree_percent,
                                 root_sampling=root_sampling_method),
            suffix)

    result_pkl_path = make_detailed_path(result_pkl_path_prefix,
                                         result_suffix)
    logger.info('result_pkl_path: {}'.format(result_pkl_path))
    # binary protocol -> binary file mode
    with open(result_pkl_path, 'wb') as pkl_file:
        pickle.dump(trees, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

    if False:
        # for debugging purpose
        with open(result_pkl_path + '.dag', 'wb') as pkl_file:
            pickle.dump(dags, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

    all_paths_pkl_path = make_detailed_path(all_paths_pkl_prefix,
                                            all_paths_pkl_suffix)
    logger.info('Dumping the paths info to {}'.format(all_paths_pkl_path))
    paths_dict = {
        'interactions': interaction_path,
        'meta_graph': meta_graph_pkl_path,
        'result': result_pkl_path,
        'true_events': true_events_path,
        'self': all_paths_pkl_path
    }
    with open(all_paths_pkl_path, 'wb') as pkl_file:
        pickle.dump(paths_dict, pkl_file)
    return paths_dict