def make_detailed_path(prefix, suffix):
    """Return a ``.pkl`` path that embeds the experiment signatures.

    The result has the shape
    ``<prefix>--<sig1>----<sig2>----<sig3><suffix>.pkl`` where the three
    signatures are produced by ``experiment_signature`` from, in order,
    ``gen_tree_kws``, ``meta_graph_kws`` and the pair
    (``cand_tree_percent``, ``root_sampling_method``).

    Those four names are not parameters; they are looked up in the
    surrounding scope at call time.
    """
    tree_sig = experiment_signature(**gen_tree_kws)
    graph_sig = experiment_signature(**meta_graph_kws)
    sampling_sig = experiment_signature(
        cand_tree_percent=cand_tree_percent,
        root_sampling=root_sampling_method
    )
    return "{}--{}----{}----{}{}.pkl".format(
        prefix, tree_sig, graph_sig, sampling_sig, suffix
    )
def main():
    """Command-line entry point: generate synthetic interaction data.

    Parses the command-line options, seeds both ``random`` and
    ``np.random`` with ``--random_seed``, and forwards every remaining
    option (everything except ``output_dir``, ``result_suffix`` and
    ``random_seed``) to ``make_artificial_data`` together with
    ``dist_func=cosine``.

    Three files are written under ``--output_dir``; ``<sig>`` is the value
    returned by ``experiment_signature`` for the noisy-interaction fraction
    and the event size:

    * ``events--<sig><suffix>.pkl`` (via ``nx.write_gpickle``)
    * ``interactions--<sig><suffix>.json``
    * ``gen_cand_tree_params--<sig><suffix>.pkl``
    """
    import ujson as json
    import argparse
    from pprint import pprint

    parser = argparse.ArgumentParser('Make sythetic interaction data')
    parser.add_argument('--n_events', type=int, default=10)
    parser.add_argument('--event_size_mu', type=int, default=40)
    parser.add_argument('--event_size_sigma', type=int, default=5)
    parser.add_argument('--participant_mu', type=int, default=5)
    parser.add_argument('--participant_sigma', type=float, default=3)

    parser.add_argument('--n_minor_events', type=int, default=0)
    parser.add_argument('--minor_event_size_mu', type=int, default=10)
    parser.add_argument('--minor_event_size_sigma', type=int, default=1)
    parser.add_argument('--minor_event_participant_mu', type=int, default=4)
    parser.add_argument('--minor_event_participant_sigma',
                        type=float, default=0.1)

    parser.add_argument('--n_total_participants', type=int, default=50)
    parser.add_argument('--min_time', type=int, default=10)
    parser.add_argument('--max_time', type=int, default=1100)
    parser.add_argument('--event_duration_mu', type=int, default=100)
    parser.add_argument('--event_duration_sigma', type=int, default=1)
    parser.add_argument('--n_topics', type=int, default=10)
    parser.add_argument('--topic_scaling_factor', type=float, default=0.5)
    parser.add_argument('--topic_noise', type=float, default=0.1)
    parser.add_argument('--n_noisy_interactions', type=int, default=None)
    parser.add_argument('--n_noisy_interactions_fraction',
                        type=float, default=0.1)
    parser.add_argument('--output_dir', type=str, default='data/synthetic')
    parser.add_argument('--alpha', type=float, default=1.0)
    parser.add_argument('--tau', type=float, default=0.8)
    parser.add_argument('--forward_proba', type=float, default=0.3)
    parser.add_argument('--reply_proba', type=float, default=0.5)
    parser.add_argument('--create_new_proba', type=float, default=0.2)
    parser.add_argument('--result_suffix', default='')
    parser.add_argument('--random_seed', type=int, default=None)

    args = parser.parse_args()

    random.seed(args.random_seed)
    np.random.seed(args.random_seed)

    pprint(vars(args))

    result_suffix = args.result_suffix
    output_dir = args.output_dir

    # vars() returns the namespace's live __dict__; copy it so that dropping
    # the options the generator does not accept leaves `args` untouched.
    args_dict = dict(vars(args))
    for key in ('output_dir', 'result_suffix', 'random_seed'):
        del args_dict[key]

    events, interactions, gen_cand_tree_params = make_artificial_data(
        dist_func=cosine,
        **args_dict
    )
    sig = experiment_signature(
        n_noisy_interactions_fraction=args.n_noisy_interactions_fraction,
        event_size=args.event_size_mu,
    )

    nx.write_gpickle(
        events,
        '{}/events--{}{}.pkl'.format(output_dir, sig, result_suffix)
    )

    # Context managers make sure the data is flushed and the handles are
    # closed even if serialisation raises.
    interactions_path = '{}/interactions--{}{}.json'.format(
        output_dir, sig, result_suffix
    )
    with open(interactions_path, 'w') as f:
        json.dump(interactions, f)

    # Pickle streams are bytes, so the file is opened in binary mode.
    params_path = '{}/gen_cand_tree_params--{}{}.pkl'.format(
        output_dir, sig, result_suffix
    )
    with open(params_path, 'wb') as f:
        pkl.dump(gen_cand_tree_params, f)
def main():
    """Generate synthetic interaction data from command-line options.

    Steps performed:

    1. Parse the options declared below.
    2. Seed ``random`` and ``np.random`` with ``--random_seed``.
    3. Call ``make_artificial_data(dist_func=cosine, ...)`` with every
       parsed option except ``output_dir``, ``result_suffix`` and
       ``random_seed``.
    4. Write the three returned objects under ``--output_dir`` as
       ``events--<sig><suffix>.pkl`` (``nx.write_gpickle``),
       ``interactions--<sig><suffix>.json`` and
       ``gen_cand_tree_params--<sig><suffix>.pkl``, where ``<sig>`` comes
       from ``experiment_signature``.
    """
    import ujson as json
    import argparse
    from pprint import pprint

    parser = argparse.ArgumentParser('Make sythetic interaction data')
    parser.add_argument('--n_events', type=int, default=10)
    parser.add_argument('--event_size_mu', type=int, default=40)
    parser.add_argument('--event_size_sigma', type=int, default=5)
    parser.add_argument('--participant_mu', type=int, default=5)
    parser.add_argument('--participant_sigma', type=float, default=3)

    parser.add_argument('--n_minor_events', type=int, default=0)
    parser.add_argument('--minor_event_size_mu', type=int, default=10)
    parser.add_argument('--minor_event_size_sigma', type=int, default=1)
    parser.add_argument('--minor_event_participant_mu', type=int, default=4)
    parser.add_argument('--minor_event_participant_sigma',
                        type=float, default=0.1)

    parser.add_argument('--n_total_participants', type=int, default=50)
    parser.add_argument('--min_time', type=int, default=10)
    parser.add_argument('--max_time', type=int, default=1100)
    parser.add_argument('--event_duration_mu', type=int, default=100)
    parser.add_argument('--event_duration_sigma', type=int, default=1)
    parser.add_argument('--n_topics', type=int, default=10)
    parser.add_argument('--topic_scaling_factor', type=float, default=0.5)
    parser.add_argument('--topic_noise', type=float, default=0.1)
    parser.add_argument('--n_noisy_interactions', type=int, default=None)
    parser.add_argument('--n_noisy_interactions_fraction',
                        type=float, default=0.1)
    parser.add_argument('--output_dir', type=str, default='data/synthetic')
    parser.add_argument('--alpha', type=float, default=1.0)
    parser.add_argument('--tau', type=float, default=0.8)
    parser.add_argument('--forward_proba', type=float, default=0.3)
    parser.add_argument('--reply_proba', type=float, default=0.5)
    parser.add_argument('--create_new_proba', type=float, default=0.2)
    parser.add_argument('--result_suffix', default='')
    parser.add_argument('--random_seed', type=int, default=None)

    args = parser.parse_args()

    random.seed(args.random_seed)
    np.random.seed(args.random_seed)

    pprint(vars(args))

    result_suffix = args.result_suffix
    output_dir = args.output_dir

    # Build the generator kwargs from a copy: vars(args) is the namespace's
    # own __dict__, so deleting from it directly would remove attributes
    # from `args`.
    non_generator_options = ('output_dir', 'result_suffix', 'random_seed')
    args_dict = dict(vars(args))
    for option in non_generator_options:
        del args_dict[option]

    events, interactions, gen_cand_tree_params = make_artificial_data(
        dist_func=cosine,
        **args_dict)
    sig = experiment_signature(
        n_noisy_interactions_fraction=args.n_noisy_interactions_fraction,
        event_size=args.event_size_mu,
    )

    nx.write_gpickle(
        events,
        '{}/events--{}{}.pkl'.format(output_dir, sig, result_suffix))

    # Use `with` so each handle is flushed and closed deterministically.
    with open('{}/interactions--{}{}.json'.format(
            output_dir, sig, result_suffix), 'w') as json_file:
        json.dump(interactions, json_file)

    # Binary mode: pickle output is a byte stream.
    with open('{}/gen_cand_tree_params--{}{}.pkl'.format(
            output_dir, sig, result_suffix), 'wb') as pkl_file:
        pkl.dump(gen_cand_tree_params, pkl_file)
def run(gen_tree_func,
        msg_ids_path,
        root_sampling_method='random',
        interaction_path=os.path.join(CURDIR, 'data/enron.json'),
        lda_model_path=os.path.join(CURDIR, 'models/model-4-50.lda'),
        corpus_dict_path=os.path.join(CURDIR, 'models/dictionary.pkl'),
        meta_graph_pkl_path_prefix=os.path.join(CURDIR, 'data/enron'),
        meta_graph_pkl_suffix='',
        cand_tree_number=None,  # higher priority than percentage
        cand_tree_percent=0.1,
        result_pkl_path_prefix=os.path.join(CURDIR, 'tmp/results'),
        result_suffix='',
        all_paths_pkl_prefix='',
        all_paths_pkl_suffix='',
        true_events_path='',
        meta_graph_kws={
            'dist_func': cosine,
            'preprune_secs': timedelta(weeks=4),
            'distance_weights': {'topics': 0.2, 'bow': 0.8},
            # 'timestamp_converter': lambda s: s
        },
        gen_tree_kws={
            'timespan': timedelta(weeks=4),
            'U': 0.5,
            'dijkstra': False
        },
        convert_time=True,
        roots=None,
        calculate_graph=False,
        given_topics=False,
        print_summary=False,
        should_binarize_dag=False):
    """Build or load the meta graph, compute candidate trees, pickle results.

    Pipeline:

    1. Load the interactions from ``interaction_path`` (``.json`` or
       ``.pkl``; a JSON file that fails to parse as one document is re-read
       with ``load_json_by_line``).
    2. Unless ``given_topics`` is true, load the LDA model and dictionary.
    3. Load the meta graph pickle if it exists and ``calculate_graph`` is
       false; otherwise compute it with ``IU.get_topic_meta_graph`` using
       the message ids listed one per line in ``msg_ids_path`` and pickle
       it.
    4. Pick a root sampler (``'random'``, ``'upperbound'``, anything else
       selects ``AdaptiveSampler``; explicit ``roots`` select
       ``DeterministicSampler``) and compute one tree per sampled root with
       ``calc_tree``. Sampling stops early when the sampler raises
       ``IndexError``.
    5. Pickle the list of trees and a dictionary of the file paths involved.

    NOTE: ``meta_graph_kws`` and ``gen_tree_kws`` have mutable dict
    defaults that are shared between calls. This function only reads them
    (``meta_graph_kws`` is deep-copied before modification), but
    ``gen_tree_kws`` is handed to ``calc_tree`` as-is.

    Returns:
        dict with keys ``'interactions'``, ``'meta_graph'``, ``'result'``,
        ``'true_events'`` and ``'self'``, each mapping to a path.

    Raises:
        ValueError: if ``interaction_path`` ends with neither ``.json`` nor
            ``.pkl``.
        AssertionError: if the meta graph has no nodes.
    """
    if isinstance(gen_tree_kws['timespan'], timedelta):
        timespan = gen_tree_kws['timespan'].total_seconds()
    else:
        timespan = gen_tree_kws['timespan']
    U = gen_tree_kws['U']

    if interaction_path.endswith(".json"):
        try:
            with open(interaction_path) as f:
                interactions = json.load(f)
        except ValueError:
            # Not a single JSON document; fall back to the line-wise loader.
            interactions = load_json_by_line(interaction_path)
    elif interaction_path.endswith(".pkl"):
        # Pickle data is a byte stream, hence binary mode.
        with open(interaction_path, 'rb') as f:
            interactions = pickle.load(f)
    else:
        raise ValueError("invalid path extension: {}".format(interaction_path))

    logger.info('loading lda from {}'.format(lda_model_path))
    if not given_topics:
        lda_model = gensim.models.wrappers.LdaMallet.load(
            os.path.join(CURDIR, lda_model_path)
        )
        dictionary = gensim.corpora.dictionary.Dictionary.load(
            os.path.join(CURDIR, corpus_dict_path)
        )
    else:
        lda_model = None
        dictionary = None

    meta_graph_pkl_path = "{}--{}{}.pkl".format(
        meta_graph_pkl_path_prefix,
        experiment_signature(**meta_graph_kws),
        meta_graph_pkl_suffix
    )
    logger.info('meta_graph_pkl_path: {}'.format(meta_graph_pkl_path))

    if calculate_graph or not os.path.exists(meta_graph_pkl_path):
        # we want to calculate the graph or
        # it's not there so we have to
        logger.info('calculating meta_graph...')
        meta_graph_kws_copied = copy.deepcopy(meta_graph_kws)
        with open(msg_ids_path) as f:
            msg_ids = [l.strip() for l in f]

        if isinstance(meta_graph_kws_copied['preprune_secs'], timedelta):
            meta_graph_kws_copied['preprune_secs'] = \
                meta_graph_kws['preprune_secs'].total_seconds()
        g = IU.get_topic_meta_graph(
            interactions,
            msg_ids=msg_ids,
            lda_model=lda_model,
            dictionary=dictionary,
            undirected=False,  # deprecated
            given_topics=given_topics,
            decompose_interactions=False,
            convert_time=convert_time,
            **meta_graph_kws_copied
        )

        logger.info('pickling...')
        nx.write_gpickle(
            IU.compactize_meta_graph(g, map_nodes=False),
            meta_graph_pkl_path
        )
    else:
        logger.info('loading pickle...')
        g = nx.read_gpickle(meta_graph_pkl_path)

    if print_summary:
        logger.debug(get_summary(g))

    assert g.number_of_nodes() > 0, 'empty graph!'

    if not roots:
        cand_tree_number, cand_tree_percent = get_number_and_percentage(
            g.number_of_nodes(),
            cand_tree_number,
            cand_tree_percent
        )
        if root_sampling_method == 'random':
            root_sampler = RandomSampler(g, timespan)
        elif root_sampling_method == 'upperbound':
            root_sampler = UBSampler(g, U, timespan)
        else:
            logger.info('init AdaptiveSampler...')
            root_sampler = AdaptiveSampler(g, U, timespan)
    else:
        logger.info('Roots given')
        cand_tree_number = len(roots)
        root_sampler = DeterministicSampler(g, roots, timespan)

    logger.info('#roots: {}'.format(cand_tree_number))
    logger.info('#cand_tree_percent: {}'.format(
        cand_tree_number / float(g.number_of_nodes()))
    )

    trees = []
    dags = []
    for i in xrange(cand_tree_number):
        logger.info("sampling root...")
        try:
            root, dag = root_sampler.take()
        except IndexError:
            logger.warn('not enough root to take, terminate')
            break
        dags.append(dag)
        start = datetime.now()
        tree = calc_tree(i, root, dag, U,
                         gen_tree_func,
                         gen_tree_kws,
                         print_summary,
                         should_binarize_dag=should_binarize_dag)
        # Record the wall-clock time spent computing this tree.
        tree.graph['calculation_time'] = \
            (datetime.now() - start).total_seconds()
        trees.append(tree)

        logger.info("updating sampler states...")
        root_sampler.update(root, tree)

    def make_detailed_path(prefix, suffix):
        # Embed the signatures of the tree, meta graph and sampling
        # settings in the file name.
        return "{}--{}----{}----{}{}.pkl".format(
            prefix,
            experiment_signature(**gen_tree_kws),
            experiment_signature(**meta_graph_kws),
            experiment_signature(
                cand_tree_percent=cand_tree_percent,
                root_sampling=root_sampling_method
            ),
            suffix
        )

    result_pkl_path = make_detailed_path(result_pkl_path_prefix,
                                         result_suffix)
    logger.info('result_pkl_path: {}'.format(result_pkl_path))
    # HIGHEST_PROTOCOL is a binary format, so the file must be opened 'wb'.
    with open(result_pkl_path, 'wb') as f:
        pickle.dump(trees, f, protocol=pickle.HIGHEST_PROTOCOL)

    if False:
        # for debugging purpose
        with open(result_pkl_path + '.dag', 'wb') as f:
            pickle.dump(dags, f, protocol=pickle.HIGHEST_PROTOCOL)

    all_paths_pkl_path = make_detailed_path(all_paths_pkl_prefix,
                                            all_paths_pkl_suffix)
    logger.info('Dumping the paths info to {}'.format(all_paths_pkl_path))
    paths_dict = {'interactions': interaction_path,
                  'meta_graph': meta_graph_pkl_path,
                  'result': result_pkl_path,
                  'true_events': true_events_path,
                  'self': all_paths_pkl_path
                  }
    with open(all_paths_pkl_path, 'wb') as f:
        pickle.dump(paths_dict, f)
    return paths_dict
def run(
        gen_tree_func,
        msg_ids_path,
        root_sampling_method='random',
        interaction_path=os.path.join(CURDIR, 'data/enron.json'),
        lda_model_path=os.path.join(CURDIR, 'models/model-4-50.lda'),
        corpus_dict_path=os.path.join(CURDIR, 'models/dictionary.pkl'),
        meta_graph_pkl_path_prefix=os.path.join(CURDIR, 'data/enron'),
        meta_graph_pkl_suffix='',
        cand_tree_number=None,  # higher priority than percentage
        cand_tree_percent=0.1,
        result_pkl_path_prefix=os.path.join(CURDIR, 'tmp/results'),
        result_suffix='',
        all_paths_pkl_prefix='',
        all_paths_pkl_suffix='',
        true_events_path='',
        meta_graph_kws={
            'dist_func': cosine,
            'preprune_secs': timedelta(weeks=4),
            'distance_weights': {
                'topics': 0.2,
                'bow': 0.8
            },
            # 'timestamp_converter': lambda s: s
        },
        gen_tree_kws={
            'timespan': timedelta(weeks=4),
            'U': 0.5,
            'dijkstra': False
        },
        convert_time=True,
        roots=None,
        calculate_graph=False,
        given_topics=False,
        print_summary=False,
        should_binarize_dag=False):
    """Compute candidate trees over the interaction meta graph and save them.

    The interactions are read from ``interaction_path`` (``.json``, with a
    fallback to ``load_json_by_line`` on ``ValueError``, or ``.pkl``). The
    meta graph is loaded from its pickle when that file exists and
    ``calculate_graph`` is false; otherwise it is built with
    ``IU.get_topic_meta_graph`` from the message ids in ``msg_ids_path``
    and then pickled.

    Roots are drawn from a sampler chosen by ``root_sampling_method``
    (``'random'`` -> ``RandomSampler``, ``'upperbound'`` -> ``UBSampler``,
    otherwise ``AdaptiveSampler``), or from ``DeterministicSampler`` when
    ``roots`` is given. For every root, ``calc_tree`` is called and its
    elapsed time is stored in ``tree.graph['calculation_time']``. The loop
    ends early if the sampler raises ``IndexError``.

    The trees and a dictionary of the relevant paths are pickled to files
    whose names embed the experiment signatures.

    NOTE: the dict defaults of ``meta_graph_kws`` and ``gen_tree_kws`` are
    shared across calls. They are not modified here (``meta_graph_kws`` is
    deep-copied first), but ``gen_tree_kws`` is passed on to ``calc_tree``.

    Returns:
        dict mapping ``'interactions'``, ``'meta_graph'``, ``'result'``,
        ``'true_events'`` and ``'self'`` to the corresponding paths.

    Raises:
        ValueError: when ``interaction_path`` has an extension other than
            ``.json`` or ``.pkl``.
        AssertionError: when the meta graph contains no nodes.
    """
    if isinstance(gen_tree_kws['timespan'], timedelta):
        timespan = gen_tree_kws['timespan'].total_seconds()
    else:
        timespan = gen_tree_kws['timespan']
    U = gen_tree_kws['U']

    if interaction_path.endswith(".json"):
        try:
            with open(interaction_path) as interactions_file:
                interactions = json.load(interactions_file)
        except ValueError:
            # The file is not one JSON document; try the line-wise loader.
            interactions = load_json_by_line(interaction_path)
    elif interaction_path.endswith(".pkl"):
        # Pickles are byte streams: read them in binary mode.
        with open(interaction_path, 'rb') as interactions_file:
            interactions = pickle.load(interactions_file)
    else:
        raise ValueError("invalid path extension: {}".format(interaction_path))

    logger.info('loading lda from {}'.format(lda_model_path))
    if not given_topics:
        lda_model = gensim.models.wrappers.LdaMallet.load(
            os.path.join(CURDIR, lda_model_path))
        dictionary = gensim.corpora.dictionary.Dictionary.load(
            os.path.join(CURDIR, corpus_dict_path))
    else:
        lda_model = None
        dictionary = None

    meta_graph_pkl_path = "{}--{}{}.pkl".format(
        meta_graph_pkl_path_prefix,
        experiment_signature(**meta_graph_kws),
        meta_graph_pkl_suffix)
    logger.info('meta_graph_pkl_path: {}'.format(meta_graph_pkl_path))

    if calculate_graph or not os.path.exists(meta_graph_pkl_path):
        # we want to calculate the graph or
        # it's not there so we have to
        logger.info('calculating meta_graph...')
        meta_graph_kws_copied = copy.deepcopy(meta_graph_kws)
        with open(msg_ids_path) as f:
            msg_ids = [l.strip() for l in f]

        if isinstance(meta_graph_kws_copied['preprune_secs'], timedelta):
            meta_graph_kws_copied['preprune_secs'] = meta_graph_kws[
                'preprune_secs'].total_seconds()
        g = IU.get_topic_meta_graph(
            interactions,
            msg_ids=msg_ids,
            lda_model=lda_model,
            dictionary=dictionary,
            undirected=False,  # deprecated
            given_topics=given_topics,
            decompose_interactions=False,
            convert_time=convert_time,
            **meta_graph_kws_copied)

        logger.info('pickling...')
        nx.write_gpickle(IU.compactize_meta_graph(g, map_nodes=False),
                         meta_graph_pkl_path)
    else:
        logger.info('loading pickle...')
        g = nx.read_gpickle(meta_graph_pkl_path)

    if print_summary:
        logger.debug(get_summary(g))

    assert g.number_of_nodes() > 0, 'empty graph!'

    if not roots:
        cand_tree_number, cand_tree_percent = get_number_and_percentage(
            g.number_of_nodes(), cand_tree_number, cand_tree_percent)
        if root_sampling_method == 'random':
            root_sampler = RandomSampler(g, timespan)
        elif root_sampling_method == 'upperbound':
            root_sampler = UBSampler(g, U, timespan)
        else:
            logger.info('init AdaptiveSampler...')
            root_sampler = AdaptiveSampler(g, U, timespan)
    else:
        logger.info('Roots given')
        cand_tree_number = len(roots)
        root_sampler = DeterministicSampler(g, roots, timespan)

    logger.info('#roots: {}'.format(cand_tree_number))
    logger.info('#cand_tree_percent: {}'.format(cand_tree_number /
                                                float(g.number_of_nodes())))

    trees = []
    dags = []
    for i in xrange(cand_tree_number):
        logger.info("sampling root...")
        try:
            root, dag = root_sampler.take()
        except IndexError:
            logger.warn('not enough root to take, terminate')
            break
        dags.append(dag)
        start = datetime.now()
        tree = calc_tree(i, root, dag, U,
                         gen_tree_func,
                         gen_tree_kws,
                         print_summary,
                         should_binarize_dag=should_binarize_dag)
        # Store how long this tree took to compute, in seconds.
        tree.graph['calculation_time'] = (
            datetime.now() - start).total_seconds()
        trees.append(tree)

        logger.info("updating sampler states...")
        root_sampler.update(root, tree)

    def make_detailed_path(prefix, suffix):
        # File name carries the signatures of the tree, meta graph and
        # sampling settings.
        return "{}--{}----{}----{}{}.pkl".format(
            prefix,
            experiment_signature(**gen_tree_kws),
            experiment_signature(**meta_graph_kws),
            experiment_signature(cand_tree_percent=cand_tree_percent,
                                 root_sampling=root_sampling_method),
            suffix)

    result_pkl_path = make_detailed_path(result_pkl_path_prefix,
                                         result_suffix)
    logger.info('result_pkl_path: {}'.format(result_pkl_path))
    # HIGHEST_PROTOCOL is binary, so write through a 'wb' handle and close
    # it deterministically.
    with open(result_pkl_path, 'wb') as result_file:
        pickle.dump(trees, result_file, protocol=pickle.HIGHEST_PROTOCOL)

    if False:
        # for debugging purpose
        with open(result_pkl_path + '.dag', 'wb') as dag_file:
            pickle.dump(dags, dag_file, protocol=pickle.HIGHEST_PROTOCOL)

    all_paths_pkl_path = make_detailed_path(all_paths_pkl_prefix,
                                            all_paths_pkl_suffix)
    logger.info('Dumping the paths info to {}'.format(all_paths_pkl_path))
    paths_dict = {
        'interactions': interaction_path,
        'meta_graph': meta_graph_pkl_path,
        'result': result_pkl_path,
        'true_events': true_events_path,
        'self': all_paths_pkl_path
    }
    with open(all_paths_pkl_path, 'wb') as paths_file:
        pickle.dump(paths_dict, paths_file)
    return paths_dict
def make_detailed_path(prefix, suffix):
    """Compose a signature-tagged ``.pkl`` file path.

    The path is ``prefix``, followed by the ``experiment_signature`` of
    ``gen_tree_kws``, of ``meta_graph_kws`` and of the candidate-tree
    percentage / root sampling method, followed by ``suffix`` and the
    ``.pkl`` extension. The signature inputs are free variables resolved
    from the surrounding scope when the function is called.
    """
    template = "{}--{}----{}----{}{}.pkl"
    # Tuple elements are evaluated left to right, matching the original
    # argument order.
    fields = (
        prefix,
        experiment_signature(**gen_tree_kws),
        experiment_signature(**meta_graph_kws),
        experiment_signature(cand_tree_percent=cand_tree_percent,
                             root_sampling=root_sampling_method),
        suffix,
    )
    return template.format(*fields)