Example #1
def make_detailed_path(prefix, suffix):
    # Nested helper from run() (see Example #4): gen_tree_kws, meta_graph_kws,
    # cand_tree_percent and root_sampling_method come from the enclosing scope.
    return "{}--{}----{}----{}{}.pkl".format(
        prefix,
        experiment_signature(**gen_tree_kws),
        experiment_signature(**meta_graph_kws),
        experiment_signature(
            cand_tree_percent=cand_tree_percent,
            root_sampling=root_sampling_method
        ),
        suffix
    )
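A minimal sketch of the path this helper builds, assuming a stub experiment_signature that joins keyword arguments (the project defines the real one) and hypothetical values for the closed-over variables:

def experiment_signature(**kws):
    # Stub for illustration only; the project defines the real helper.
    return ','.join('{}={}'.format(k, v) for k, v in sorted(kws.items()))

gen_tree_kws = {'U': 0.5, 'dijkstra': False}       # hypothetical values
meta_graph_kws = {'preprune_secs': 2419200.0}      # 4 weeks in seconds
cand_tree_percent = 0.1
root_sampling_method = 'random'

def make_detailed_path(prefix, suffix):
    return "{}--{}----{}----{}{}.pkl".format(
        prefix,
        experiment_signature(**gen_tree_kws),
        experiment_signature(**meta_graph_kws),
        experiment_signature(cand_tree_percent=cand_tree_percent,
                             root_sampling=root_sampling_method),
        suffix)

print(make_detailed_path('tmp/results', '--trial1'))
# tmp/results--U=0.5,dijkstra=False----preprune_secs=2419200.0----cand_tree_percent=0.1,root_sampling=random--trial1.pkl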
Example #2
def main():
    import ujson as json
    import argparse
    import random
    import cPickle as pkl  # assumed; Python 2 codebase (xrange is used elsewhere)
    import numpy as np
    import networkx as nx
    from pprint import pprint
    from scipy.spatial.distance import cosine  # assumed source of `cosine`

    # make_artificial_data and experiment_signature are assumed to be
    # module-level helpers defined elsewhere in the project.

    parser = argparse.ArgumentParser('Make synthetic interaction data')
    parser.add_argument('--n_events', type=int, default=10)
    parser.add_argument('--event_size_mu', type=int, default=40)
    parser.add_argument('--event_size_sigma', type=int, default=5)
    parser.add_argument('--participant_mu', type=int, default=5)
    parser.add_argument('--participant_sigma', type=float, default=3)

    parser.add_argument('--n_minor_events', type=int, default=0)
    parser.add_argument('--minor_event_size_mu', type=int, default=10)
    parser.add_argument('--minor_event_size_sigma', type=int, default=1)
    parser.add_argument('--minor_event_participant_mu', type=int, default=4)
    parser.add_argument('--minor_event_participant_sigma', type=float, default=0.1)

    parser.add_argument('--n_total_participants', type=int, default=50)
    parser.add_argument('--min_time', type=int, default=10)
    parser.add_argument('--max_time', type=int, default=1100)
    parser.add_argument('--event_duration_mu', type=int, default=100)
    parser.add_argument('--event_duration_sigma', type=int, default=1)

    parser.add_argument('--n_topics', type=int, default=10)
    parser.add_argument('--topic_scaling_factor', type=float, default=0.5)
    parser.add_argument('--topic_noise', type=float, default=0.1)

    parser.add_argument('--n_noisy_interactions', type=int, default=None)
    parser.add_argument('--n_noisy_interactions_fraction',
                        type=float, default=0.1)
    parser.add_argument('--output_dir', type=str, default='data/synthetic')

    parser.add_argument('--alpha',
                        type=float, default=1.0)
    parser.add_argument('--tau',
                        type=float, default=0.8)
    parser.add_argument('--forward_proba',
                        type=float, default=0.3)
    parser.add_argument('--reply_proba',
                        type=float, default=0.5)
    parser.add_argument('--create_new_proba',
                        type=float, default=0.2)

    parser.add_argument('--result_suffix',
                        default='')
    parser.add_argument('--random_seed',
                        type=int,
                        default=None)

    args = parser.parse_args()

    random.seed(args.random_seed)
    np.random.seed(args.random_seed)

    pprint(vars(args))
    result_suffix = args.result_suffix
    output_dir = args.output_dir

    args_dict = vars(args)
    del args_dict['output_dir']
    del args_dict['result_suffix']
    del args_dict['random_seed']

    events, interactions, gen_cand_tree_params = make_artificial_data(
        dist_func=cosine,
        **args_dict
    )
    sig = experiment_signature(
        n_noisy_interactions_fraction=args.n_noisy_interactions_fraction,
        event_size=args.event_size_mu,
    )
    nx.write_gpickle(events,
                     '{}/events--{}{}.pkl'.format(output_dir, sig,
                                                  result_suffix)
    )
    json.dump(interactions,
              open('{}/interactions--{}{}.json'.format(output_dir, sig,
                                                       result_suffix),
                   'w'))
    pkl.dump(gen_cand_tree_params,
             open('{}/gen_cand_tree_params--{}{}.pkl'.format(output_dir, sig,
                                                             result_suffix),
                  'wb'))  # binary mode for pickle
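Since every parameter goes through argparse, the script can also be driven programmatically by patching sys.argv before calling main(); a small sketch (the script name is hypothetical):

import sys

# Hypothetical script name; --random_seed makes the run reproducible.
sys.argv = ['make_synthetic_data.py',
            '--n_events', '20',
            '--event_size_mu', '30',
            '--random_seed', '42',
            '--output_dir', 'data/synthetic']
main()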
Example #3
def main():
    import ujson as json
    import argparse
    import random
    import cPickle as pkl  # assumed; Python 2 codebase (xrange is used elsewhere)
    import numpy as np
    import networkx as nx
    from pprint import pprint
    from scipy.spatial.distance import cosine  # assumed source of `cosine`

    # make_artificial_data and experiment_signature are assumed to be
    # module-level helpers defined elsewhere in the project.

    parser = argparse.ArgumentParser('Make synthetic interaction data')
    parser.add_argument('--n_events', type=int, default=10)
    parser.add_argument('--event_size_mu', type=int, default=40)
    parser.add_argument('--event_size_sigma', type=int, default=5)
    parser.add_argument('--participant_mu', type=int, default=5)
    parser.add_argument('--participant_sigma', type=float, default=3)

    parser.add_argument('--n_minor_events', type=int, default=0)
    parser.add_argument('--minor_event_size_mu', type=int, default=10)
    parser.add_argument('--minor_event_size_sigma', type=int, default=1)
    parser.add_argument('--minor_event_participant_mu', type=int, default=4)
    parser.add_argument('--minor_event_participant_sigma',
                        type=float,
                        default=0.1)

    parser.add_argument('--n_total_participants', type=int, default=50)
    parser.add_argument('--min_time', type=int, default=10)
    parser.add_argument('--max_time', type=int, default=1100)
    parser.add_argument('--event_duration_mu', type=int, default=100)
    parser.add_argument('--event_duration_sigma', type=int, default=1)

    parser.add_argument('--n_topics', type=int, default=10)
    parser.add_argument('--topic_scaling_factor', type=float, default=0.5)
    parser.add_argument('--topic_noise', type=float, default=0.1)

    parser.add_argument('--n_noisy_interactions', type=int, default=None)
    parser.add_argument('--n_noisy_interactions_fraction',
                        type=float,
                        default=0.1)
    parser.add_argument('--output_dir', type=str, default='data/synthetic')

    parser.add_argument('--alpha', type=float, default=1.0)
    parser.add_argument('--tau', type=float, default=0.8)
    parser.add_argument('--forward_proba', type=float, default=0.3)
    parser.add_argument('--reply_proba', type=float, default=0.5)
    parser.add_argument('--create_new_proba', type=float, default=0.2)

    parser.add_argument('--result_suffix', default='')
    parser.add_argument('--random_seed', type=int, default=None)

    args = parser.parse_args()

    random.seed(args.random_seed)
    np.random.seed(args.random_seed)

    pprint(vars(args))
    result_suffix = args.result_suffix
    output_dir = args.output_dir

    args_dict = vars(args)
    del args_dict['output_dir']
    del args_dict['result_suffix']
    del args_dict['random_seed']

    events, interactions, gen_cand_tree_params = make_artificial_data(
        dist_func=cosine, **args_dict)
    sig = experiment_signature(
        n_noisy_interactions_fraction=args.n_noisy_interactions_fraction,
        event_size=args.event_size_mu,
    )
    nx.write_gpickle(
        events, '{}/events--{}{}.pkl'.format(output_dir, sig, result_suffix))
    json.dump(
        interactions,
        open(
            '{}/interactions--{}{}.json'.format(output_dir, sig,
                                                result_suffix), 'w'))
    pkl.dump(
        gen_cand_tree_params,
        open(
            '{}/gen_cand_tree_params--{}{}.pkl'.format(output_dir, sig,
                                                       result_suffix), 'wb'))
Example #4
def run(gen_tree_func,
        msg_ids_path,
        root_sampling_method='random',
        interaction_path=os.path.join(CURDIR, 'data/enron.json'),
        lda_model_path=os.path.join(CURDIR, 'models/model-4-50.lda'),
        corpus_dict_path=os.path.join(CURDIR, 'models/dictionary.pkl'),
        meta_graph_pkl_path_prefix=os.path.join(CURDIR, 'data/enron'),
        meta_graph_pkl_suffix='',
        cand_tree_number=None,  # higher priority than percentage
        cand_tree_percent=0.1,
        result_pkl_path_prefix=os.path.join(CURDIR, 'tmp/results'),
        result_suffix='',
        all_paths_pkl_prefix='',
        all_paths_pkl_suffix='',
        true_events_path='',
        meta_graph_kws={
            'dist_func': cosine,
            'preprune_secs': timedelta(weeks=4),
            'distance_weights': {'topics': 0.2,
                                 'bow': 0.8},
            # 'timestamp_converter': lambda s: s
        },
        gen_tree_kws={
            'timespan': timedelta(weeks=4),
            'U': 0.5,
            'dijkstra': False
        },
        convert_time=True,
        roots=None,
        calculate_graph=False,
        given_topics=False,
        print_summary=False,
        should_binarize_dag=False):
    if isinstance(gen_tree_kws['timespan'], timedelta):
        timespan = gen_tree_kws['timespan'].total_seconds()
    else:
        timespan = gen_tree_kws['timespan']
    U = gen_tree_kws['U']
        
    if interaction_path.endswith(".json"):
        try:
            interactions = json.load(open(interaction_path))
        except ValueError:
            interactions = load_json_by_line(interaction_path)
    elif interaction_path.endswith(".pkl"):
        interactions = pickle.load(open(interaction_path, 'rb'))
    else:
        raise ValueError("invalid path extension: {}".format(interaction_path))

    logger.info('loading lda from {}'.format(lda_model_path))
    if not given_topics:
        lda_model = gensim.models.wrappers.LdaMallet.load(
            os.path.join(CURDIR, lda_model_path)
        )
        dictionary = gensim.corpora.dictionary.Dictionary.load(
            os.path.join(CURDIR, corpus_dict_path)
        )
    else:
        lda_model = None
        dictionary = None

    meta_graph_pkl_path = "{}--{}{}.pkl".format(
        meta_graph_pkl_path_prefix,
        experiment_signature(**meta_graph_kws),
        meta_graph_pkl_suffix
    )
    logger.info('meta_graph_pkl_path: {}'.format(meta_graph_pkl_path))

    if calculate_graph or not os.path.exists(meta_graph_pkl_path):
        # either recomputation was requested or no cached meta-graph exists
        logger.info('calculating meta_graph...')
        meta_graph_kws_copied = copy.deepcopy(meta_graph_kws)
        with open(msg_ids_path) as f:
            msg_ids = [l.strip() for l in f]

        if isinstance(meta_graph_kws_copied['preprune_secs'], timedelta):
            meta_graph_kws_copied['preprune_secs'] = \
                meta_graph_kws_copied['preprune_secs'].total_seconds()
        g = IU.get_topic_meta_graph(
            interactions,
            msg_ids=msg_ids,
            lda_model=lda_model,
            dictionary=dictionary,
            undirected=False,  # deprecated
            given_topics=given_topics,
            decompose_interactions=False,
            convert_time=convert_time,
            **meta_graph_kws_copied
        )

        logger.info('pickling...')
        nx.write_gpickle(
            IU.compactize_meta_graph(g, map_nodes=False),
            meta_graph_pkl_path
        )
    else:
        logger.info('loading pickle...')
        g = nx.read_gpickle(meta_graph_pkl_path)
        
    if print_summary:
        logger.debug(get_summary(g))

    assert g.number_of_nodes() > 0, 'empty graph!'

    if not roots:
        cand_tree_number, cand_tree_percent = get_number_and_percentage(
            g.number_of_nodes(),
            cand_tree_number,
            cand_tree_percent
        )
        if root_sampling_method == 'random':
            root_sampler = RandomSampler(g, timespan)
        elif root_sampling_method == 'upperbound':
            root_sampler = UBSampler(g, U, timespan)
        else:
            logger.info('init AdaptiveSampler...')
            root_sampler = AdaptiveSampler(g, U, timespan)
    else:
        logger.info('Roots given')
        cand_tree_number = len(roots)
        root_sampler = DeterministicSampler(g, roots, timespan)
    
    logger.info('#roots: {}'.format(cand_tree_number))
    logger.info('#cand_tree_percent: {}'.format(
        cand_tree_number / float(g.number_of_nodes()))
    )

    trees = []
    dags = []
    for i in xrange(cand_tree_number):
        logger.info("sampling root...")
        try:
            root, dag = root_sampler.take()
        except IndexError:
            logger.warning('not enough roots to take, terminating')
            break
        dags.append(dag)
        
        start = datetime.now()
        tree = calc_tree(i, root, dag, U,
                         gen_tree_func,
                         gen_tree_kws,
                         print_summary,
                         should_binarize_dag=should_binarize_dag)
        tree.graph['calculation_time'] = (datetime.now() - start).total_seconds()
        
        trees.append(tree)

        logger.info("updating sampler states...")
        root_sampler.update(root, tree)

    def make_detailed_path(prefix, suffix):
        return "{}--{}----{}----{}{}.pkl".format(
            prefix,
            experiment_signature(**gen_tree_kws),
            experiment_signature(**meta_graph_kws),
            experiment_signature(
                cand_tree_percent=cand_tree_percent,
                root_sampling=root_sampling_method
            ),
            suffix
        )
    result_pkl_path = make_detailed_path(result_pkl_path_prefix,
                                         result_suffix)

    logger.info('result_pkl_path: {}'.format(result_pkl_path))
    pickle.dump(trees,
                open(result_pkl_path, 'wb'),
                protocol=pickle.HIGHEST_PROTOCOL)
    if False:
        # for debugging purposes
        pickle.dump(dags,
                    open(result_pkl_path + '.dag', 'wb'),
                    protocol=pickle.HIGHEST_PROTOCOL)
    
    all_paths_pkl_path = make_detailed_path(all_paths_pkl_prefix,
                                            all_paths_pkl_suffix)
    logger.info('Dumping the paths info to {}'.format(all_paths_pkl_path))
    paths_dict = {'interactions': interaction_path,
                  'meta_graph': meta_graph_pkl_path,
                  'result': result_pkl_path,
                  'true_events': true_events_path,
                  'self': all_paths_pkl_path}
    pickle.dump(
        paths_dict,
        open(all_paths_pkl_path, 'wb')
    )
    return paths_dict
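run() relies on a small implicit sampler interface: take() returns a (root, dag) pair and raises IndexError when the candidate pool is exhausted, and update(root, tree) lets adaptive samplers adjust after each tree is built. A minimal stand-in honoring that contract, for illustration only (the real RandomSampler/UBSampler/AdaptiveSampler also build time-constrained DAGs and score candidates):

class MinimalRootSampler(object):
    # Illustrative stand-in; only the interface run() depends on is sketched.
    def __init__(self, g, timespan):
        self._candidates = list(g.nodes())  # hypothetical candidate pool
        self._timespan = timespan

    def take(self):
        # pop() raises IndexError when empty, which run() uses to terminate.
        root = self._candidates.pop()
        dag = None  # the real samplers return a time-constrained DAG here
        return root, dag

    def update(self, root, tree):
        pass  # adaptive samplers reweight remaining candidates here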
def run(
        gen_tree_func,
        msg_ids_path,
        root_sampling_method='random',
        interaction_path=os.path.join(CURDIR, 'data/enron.json'),
        lda_model_path=os.path.join(CURDIR, 'models/model-4-50.lda'),
        corpus_dict_path=os.path.join(CURDIR, 'models/dictionary.pkl'),
        meta_graph_pkl_path_prefix=os.path.join(CURDIR, 'data/enron'),
        meta_graph_pkl_suffix='',
        cand_tree_number=None,  # higher priority than percentage
        cand_tree_percent=0.1,
        result_pkl_path_prefix=os.path.join(CURDIR, 'tmp/results'),
        result_suffix='',
        all_paths_pkl_prefix='',
        all_paths_pkl_suffix='',
        true_events_path='',
        meta_graph_kws={
            'dist_func': cosine,
            'preprune_secs': timedelta(weeks=4),
            'distance_weights': {
                'topics': 0.2,
                'bow': 0.8
            },
            # 'timestamp_converter': lambda s: s
        },
        gen_tree_kws={
            'timespan': timedelta(weeks=4),
            'U': 0.5,
            'dijkstra': False
        },
        convert_time=True,
        roots=None,
        calculate_graph=False,
        given_topics=False,
        print_summary=False,
        should_binarize_dag=False):
    if isinstance(gen_tree_kws['timespan'], timedelta):
        timespan = gen_tree_kws['timespan'].total_seconds()
    else:
        timespan = gen_tree_kws['timespan']
    U = gen_tree_kws['U']

    if interaction_path.endswith(".json"):
        try:
            interactions = json.load(open(interaction_path))
        except ValueError:
            interactions = load_json_by_line(interaction_path)
    elif interaction_path.endswith(".pkl"):
        interactions = pickle.load(open(interaction_path, 'rb'))
    else:
        raise ValueError("invalid path extension: {}".format(interaction_path))

    logger.info('loading lda from {}'.format(lda_model_path))
    if not given_topics:
        lda_model = gensim.models.wrappers.LdaMallet.load(
            os.path.join(CURDIR, lda_model_path))
        dictionary = gensim.corpora.dictionary.Dictionary.load(
            os.path.join(CURDIR, corpus_dict_path))
    else:
        lda_model = None
        dictionary = None

    meta_graph_pkl_path = "{}--{}{}.pkl".format(
        meta_graph_pkl_path_prefix, experiment_signature(**meta_graph_kws),
        meta_graph_pkl_suffix)
    logger.info('meta_graph_pkl_path: {}'.format(meta_graph_pkl_path))

    if calculate_graph or not os.path.exists(meta_graph_pkl_path):
        # either recomputation was requested or no cached meta-graph exists
        logger.info('calculating meta_graph...')
        meta_graph_kws_copied = copy.deepcopy(meta_graph_kws)
        with open(msg_ids_path) as f:
            msg_ids = [l.strip() for l in f]

        if isinstance(meta_graph_kws_copied['preprune_secs'], timedelta):
            meta_graph_kws_copied['preprune_secs'] = \
                meta_graph_kws_copied['preprune_secs'].total_seconds()
        g = IU.get_topic_meta_graph(
            interactions,
            msg_ids=msg_ids,
            lda_model=lda_model,
            dictionary=dictionary,
            undirected=False,  # deprecated
            given_topics=given_topics,
            decompose_interactions=False,
            convert_time=convert_time,
            **meta_graph_kws_copied)

        logger.info('pickling...')
        nx.write_gpickle(IU.compactize_meta_graph(g, map_nodes=False),
                         meta_graph_pkl_path)
    else:
        logger.info('loading pickle...')
        g = nx.read_gpickle(meta_graph_pkl_path)

    if print_summary:
        logger.debug(get_summary(g))

    assert g.number_of_nodes() > 0, 'empty graph!'

    if not roots:
        cand_tree_number, cand_tree_percent = get_number_and_percentage(
            g.number_of_nodes(), cand_tree_number, cand_tree_percent)
        if root_sampling_method == 'random':
            root_sampler = RandomSampler(g, timespan)
        elif root_sampling_method == 'upperbound':
            root_sampler = UBSampler(g, U, timespan)
        else:
            logger.info('init AdaptiveSampler...')
            root_sampler = AdaptiveSampler(g, U, timespan)
    else:
        logger.info('Roots given')
        cand_tree_number = len(roots)
        root_sampler = DeterministicSampler(g, roots, timespan)

    logger.info('#roots: {}'.format(cand_tree_number))
    logger.info('#cand_tree_percent: {}'.format(cand_tree_number /
                                                float(g.number_of_nodes())))

    trees = []
    dags = []
    for i in xrange(cand_tree_number):
        logger.info("sampling root...")
        try:
            root, dag = root_sampler.take()
        except IndexError:
            logger.warning('not enough roots to take, terminating')
            break
        dags.append(dag)

        start = datetime.now()
        tree = calc_tree(i,
                         root,
                         dag,
                         U,
                         gen_tree_func,
                         gen_tree_kws,
                         print_summary,
                         should_binarize_dag=should_binarize_dag)
        tree.graph['calculation_time'] = (datetime.now() -
                                          start).total_seconds()

        trees.append(tree)

        logger.info("updating sampler states...")
        root_sampler.update(root, tree)

    def make_detailed_path(prefix, suffix):
        return "{}--{}----{}----{}{}.pkl".format(
            prefix, experiment_signature(**gen_tree_kws),
            experiment_signature(**meta_graph_kws),
            experiment_signature(cand_tree_percent=cand_tree_percent,
                                 root_sampling=root_sampling_method), suffix)

    result_pkl_path = make_detailed_path(result_pkl_path_prefix, result_suffix)

    logger.info('result_pkl_path: {}'.format(result_pkl_path))
    pickle.dump(trees,
                open(result_pkl_path, 'wb'),
                protocol=pickle.HIGHEST_PROTOCOL)
    if False:
        # for debugging purposes
        pickle.dump(dags,
                    open(result_pkl_path + '.dag', 'wb'),
                    protocol=pickle.HIGHEST_PROTOCOL)

    all_paths_pkl_path = make_detailed_path(all_paths_pkl_prefix,
                                            all_paths_pkl_suffix)
    logger.info('Dumping the paths info to {}'.format(all_paths_pkl_path))
    paths_dict = {
        'interactions': interaction_path,
        'meta_graph': meta_graph_pkl_path,
        'result': result_pkl_path,
        'true_events': true_events_path,
        'self': all_paths_pkl_path
    }
    pickle.dump(paths_dict, open(all_paths_pkl_path, 'wb'))
    return paths_dict
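A hedged invocation sketch for run(); some_gen_tree_func, the message-id path and the sampling settings below are placeholders, and the project's data and model files are assumed to be in place:

from datetime import timedelta

def some_gen_tree_func(*args, **kwargs):
    raise NotImplementedError('placeholder; supply the project tree-growing routine')

paths = run(
    gen_tree_func=some_gen_tree_func,
    msg_ids_path='data/msg_ids.txt',   # placeholder path
    root_sampling_method='adaptive',   # anything but 'random'/'upperbound' uses AdaptiveSampler
    cand_tree_percent=0.05,
    gen_tree_kws={'timespan': timedelta(weeks=2), 'U': 1.0, 'dijkstra': False},
    calculate_graph=True,
)
print(paths['result'])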