Example #1
def make_artificial_data(
        # for main events
        n_events,
        event_size_mu, event_size_sigma,
        participant_mu, participant_sigma,
        # for minor events
        n_minor_events,
        minor_event_size_mu, minor_event_size_sigma,
        minor_event_participant_mu, minor_event_participant_sigma,
        # shared
        n_total_participants,
        min_time, max_time, event_duration_mu, event_duration_sigma,
        n_topics, topic_scaling_factor, topic_noise,
        n_noisy_interactions, n_noisy_interactions_fraction,
        alpha, tau,
        forward_proba,
        reply_proba,
        create_new_proba,
        dist_func):
    # main events accumulate taboo topics that later steps must avoid
    events, taboo_topics = random_events(
        n_events, event_size_mu, event_size_sigma,
        n_total_participants, participant_mu, participant_sigma,
        min_time, max_time, event_duration_mu, event_duration_sigma,
        n_topics, topic_scaling_factor, topic_noise,
        alpha, tau,
        forward_proba,
        reply_proba,
        create_new_proba,
        accumulate_taboo=True
    )

    # minor events reuse the shared parameters but avoid the accumulated taboo topics
    minor_events, _ = random_events(
        n_minor_events, minor_event_size_mu, minor_event_size_sigma,
        n_total_participants, minor_event_participant_mu,
        minor_event_participant_sigma,
        min_time, max_time, event_duration_mu, event_duration_sigma,
        n_topics, topic_scaling_factor, topic_noise,
        alpha, tau,
        forward_proba,
        reply_proba,
        create_new_proba,
        taboo_topics=taboo_topics,
        accumulate_taboo=False
    )
    
    # resolve the noisy-interaction count from an absolute number or a fraction of event interactions
    (n_noisy_interactions, _) = get_number_and_percentage(
        sum([1 for e in events for _ in e]),
        n_noisy_interactions, n_noisy_interactions_fraction
    )
    noisy_interactions = random_noisy_interactions(
        n_noisy_interactions,
        min_time, max_time,
        n_total_participants,
        n_topics, topic_noise,
        taboo_topics
    )

    # collect the per-node interaction dicts from every generated graph
    event_interactions = [e.node[n] for e in events
                          for n in e.nodes_iter()]
    minor_event_interactions = [e.node[n] for e in minor_events
                                for n in e.nodes_iter()]
    all_interactions = (event_interactions + minor_event_interactions
                        + noisy_interactions)

    # add interaction id
    for i, intr in enumerate(all_interactions):
        intr['message_id'] = i
        intr['topics'] = intr['topics'].tolist()

    # relabel the nodes
    relabeled_events = []
    for e in events:
        mapping = {n: e.node[n]['message_id'] for n in e.nodes_iter()}
        relabeled_events.append(nx.relabel_nodes(e, mapping))

    # assign edge weights on the event graphs (assign_edge_weights is
    # assumed to mutate each graph in place)
    for e in events:
        e = IU.assign_edge_weights(e, dist_func)

    gen_cand_trees_params = [get_gen_cand_tree_params(e)
                             for e in events]
    return relabeled_events, all_interactions, gen_cand_trees_params
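
A minimal, hypothetical invocation sketch of make_artificial_data, assuming scipy's cosine distance is an acceptable dist_func and that n_noisy_interactions=None falls back to the fractional setting; every numeric value is illustrative only.

from scipy.spatial.distance import cosine

relabeled_events, interactions, tree_params = make_artificial_data(
    # main events
    n_events=10, event_size_mu=40, event_size_sigma=5,
    participant_mu=5, participant_sigma=1,
    # minor events
    n_minor_events=20, minor_event_size_mu=5, minor_event_size_sigma=1,
    minor_event_participant_mu=3, minor_event_participant_sigma=1,
    # shared
    n_total_participants=50,
    min_time=0, max_time=100000,
    event_duration_mu=5000, event_duration_sigma=1000,
    n_topics=10, topic_scaling_factor=0.5, topic_noise=0.1,
    n_noisy_interactions=None, n_noisy_interactions_fraction=0.1,
    alpha=1.0, tau=0.8,
    forward_proba=0.3, reply_proba=0.5, create_new_proba=0.2,
    dist_func=cosine)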
Example #2
def run(gen_tree_func,
        msg_ids_path,
        root_sampling_method='random',
        interaction_path=os.path.join(CURDIR, 'data/enron.json'),
        lda_model_path=os.path.join(CURDIR, 'models/model-4-50.lda'),
        corpus_dict_path=os.path.join(CURDIR, 'models/dictionary.pkl'),
        meta_graph_pkl_path_prefix=os.path.join(CURDIR, 'data/enron'),
        meta_graph_pkl_suffix='',
        cand_tree_number=None,  # higher priority than percentage
        cand_tree_percent=0.1,
        result_pkl_path_prefix=os.path.join(CURDIR, 'tmp/results'),
        result_suffix='',
        all_paths_pkl_prefix='',
        all_paths_pkl_suffix='',
        true_events_path='',
        meta_graph_kws={
            'dist_func': cosine,
            'preprune_secs': timedelta(weeks=4),
            'distance_weights': {'topics': 0.2,
                                 'bow': 0.8},
            # 'timestamp_converter': lambda s: s
        },
        gen_tree_kws={
            'timespan': timedelta(weeks=4),
            'U': 0.5,
            'dijkstra': False
        },
        convert_time=True,
        roots=None,
        calculate_graph=False,
        given_topics=False,
        print_summary=False,
        should_binarize_dag=False):
    if isinstance(gen_tree_kws['timespan'], timedelta):
        timespan = gen_tree_kws['timespan'].total_seconds()
    else:
        timespan = gen_tree_kws['timespan']
    U = gen_tree_kws['U']
        
    # interactions come from a JSON file (whole document or one object per line) or from a pickle
    if interaction_path.endswith(".json"):
        try:
            interactions = json.load(open(interaction_path))
        except ValueError:
            interactions = load_json_by_line(interaction_path)
    elif interaction_path.endswith(".pkl"):
        interactions = pickle.load(open(interaction_path))
    else:
        raise ValueError("invalid path extension: {}".format(interaction_path))


    if not given_topics:
        logger.info('loading lda from {}'.format(lda_model_path))
        lda_model = gensim.models.wrappers.LdaMallet.load(
            os.path.join(CURDIR, lda_model_path)
        )
        dictionary = gensim.corpora.dictionary.Dictionary.load(
            os.path.join(CURDIR, corpus_dict_path)
        )
    else:
        lda_model = None
        dictionary = None

    # the meta-graph pickle filename encodes the construction parameters
    meta_graph_pkl_path = "{}--{}{}.pkl".format(
        meta_graph_pkl_path_prefix,
        experiment_signature(**meta_graph_kws),
        meta_graph_pkl_suffix
    )
    logger.info('meta_graph_pkl_path: {}'.format(meta_graph_pkl_path))

    if calculate_graph or not os.path.exists(meta_graph_pkl_path):
        # either recalculation was requested or the pickle is missing,
        # so the meta graph has to be built from scratch
        logger.info('calculating meta_graph...')
        meta_graph_kws_copied = copy.deepcopy(meta_graph_kws)
        with open(msg_ids_path) as f:
            msg_ids = [l.strip() for l in f]

        if isinstance(meta_graph_kws_copied['preprune_secs'], timedelta):
            meta_graph_kws_copied['preprune_secs'] = \
                meta_graph_kws_copied['preprune_secs'].total_seconds()
        g = IU.get_topic_meta_graph(
            interactions,
            msg_ids=msg_ids,
            lda_model=lda_model,
            dictionary=dictionary,
            undirected=False,  # deprecated
            given_topics=given_topics,
            decompose_interactions=False,
            convert_time=convert_time,
            **meta_graph_kws_copied
        )

        logger.info('pickling...')
        nx.write_gpickle(
            IU.compactize_meta_graph(g, map_nodes=False),
            meta_graph_pkl_path
        )
    else:
        logger.info('loading pickle...')
        g = nx.read_gpickle(meta_graph_pkl_path)
        
    if print_summary:
        logger.debug(get_summary(g))

    assert g.number_of_nodes() > 0, 'empty graph!'

    # decide how many candidate trees to grow and how to sample their roots
    if not roots:
        cand_tree_number, cand_tree_percent = get_number_and_percentage(
            g.number_of_nodes(),
            cand_tree_number,
            cand_tree_percent
        )
        if root_sampling_method == 'random':
            root_sampler = RandomSampler(g, timespan)
        elif root_sampling_method == 'upperbound':
            root_sampler = UBSampler(g, U, timespan)
        else:
            logger.info('init AdaptiveSampler...')
            root_sampler = AdaptiveSampler(g, U, timespan)
    else:
        logger.info('Roots given')
        cand_tree_number = len(roots)
        root_sampler = DeterministicSampler(g, roots, timespan)
    
    logger.info('#roots: {}'.format(cand_tree_number))
    logger.info('#cand_tree_percent: {}'.format(
        cand_tree_number / float(g.number_of_nodes()))
    )

    # grow one candidate tree per sampled root
    trees = []
    dags = []
    for i in xrange(cand_tree_number):
        logger.info("sampling root...")
        try:
            root, dag = root_sampler.take()
        except IndexError:
            logger.warn('not enough roots to take, terminating')
            break
        dags.append(dag)

        start = datetime.now()
        tree = calc_tree(i, root, dag, U,
                         gen_tree_func,
                         gen_tree_kws,
                         print_summary,
                         should_binarize_dag=should_binarize_dag)
        tree.graph['calculation_time'] = (datetime.now() - start).total_seconds()
        
        trees.append(tree)

        logger.info("updating sampler states...")
        root_sampler.update(root, tree)

    # output file names encode the experiment parameters so runs do not overwrite each other
    def make_detailed_path(prefix, suffix):
        return "{}--{}----{}----{}{}.pkl".format(
            prefix,
            experiment_signature(**gen_tree_kws),
            experiment_signature(**meta_graph_kws),
            experiment_signature(
                cand_tree_percent=cand_tree_percent,
                root_sampling=root_sampling_method
            ),
            suffix
        )
    result_pkl_path = make_detailed_path(result_pkl_path_prefix,
                                         result_suffix)

    logger.info('result_pkl_path: {}'.format(result_pkl_path))
    pickle.dump(trees,
                open(result_pkl_path, 'wb'),
                protocol=pickle.HIGHEST_PROTOCOL)
    if False:
        # for debugging purposes
        pickle.dump(dags,
                    open(result_pkl_path + '.dag', 'wb'),
                    protocol=pickle.HIGHEST_PROTOCOL)
    
    all_paths_pkl_path = make_detailed_path(all_paths_pkl_prefix,
                                            all_paths_pkl_suffix)
    logger.info('Dumping the paths info to {}'.format(all_paths_pkl_path))
    paths_dict = {'interactions': interaction_path,
                  'meta_graph': meta_graph_pkl_path,
                  'result': result_pkl_path,
                  'true_events': true_events_path,
                  'self': all_paths_pkl_path
    }
    pickle.dump(
        paths_dict,
        open(all_paths_pkl_path, 'wb')
    )
    return paths_dict
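
A hypothetical driver sketch for run(); lp_maximization is a placeholder for whatever tree-generation routine the project actually provides, and the message-id path and sampling settings are illustrative only.

if __name__ == '__main__':
    # hypothetical entry point; gen_tree_func and msg_ids_path are placeholders
    paths = run(
        lp_maximization,            # assumed tree-generation routine
        'data/msg_ids.txt',         # one message id per line
        root_sampling_method='adaptive',
        cand_tree_percent=0.05,
        calculate_graph=False)
    print(paths['result'])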