Exemplo n.º 1
0
    def test_identify_hubs_and_experts(self):
        """Check network.identify_hubs_and_experts against the "hits" fixtures.

        Inputs (log data, nicks, nick_same_list) and expected results are
        loaded from ``self.test_data_dir + "hits/..."``.  The function is run
        with stdout redirected to an in-memory buffer; the returned hubs,
        keyword overlap and authorities are compared for equality and the
        returned graph is compared with the stored graph for isomorphism.
        """
        log_data = util.load_from_disk(self.test_data_dir + "hits/log_data")
        nicks = util.load_from_disk(self.test_data_dir + "hits/nicks")
        nick_same_list = util.load_from_disk(self.test_data_dir +
                                             "hits/nick_same_list")
        expected_top_hub = util.load_from_disk(self.test_data_dir +
                                               "hits/top_hub")
        expected_top_keyword_overlap = util.load_from_disk(
            self.test_data_dir + "hits/top_keyword_overlap")
        expected_top_auth = util.load_from_disk(self.test_data_dir +
                                                "hits/top_auth")
        message_graph = util.load_from_disk(self.test_data_dir +
                                            "hits/message_graph")

        # Redirect stdout to a buffer for the duration of the call.  The
        # stream that was active is remembered and put back in ``finally`` so
        # that an exception cannot leave stdout pointing at the buffer, and so
        # that a stream installed by the test runner is restored instead of
        # being replaced by sys.__stdout__.
        original_stdout = sys.stdout
        capturedOutput = StringIO.StringIO()
        sys.stdout = capturedOutput
        try:
            message_num_graph, top_hub, top_keyword_overlap, top_auth = network.identify_hubs_and_experts(
                log_data, nicks, nick_same_list)
        finally:
            sys.stdout = original_stdout
            capturedOutput.close()

        self.assertEqual(top_hub, expected_top_hub)
        self.assertEqual(top_keyword_overlap, expected_top_keyword_overlap)
        self.assertEqual(top_auth, expected_top_auth)
        self.assertTrue(nx.is_isomorphic(message_graph, message_num_graph))
Exemplo n.º 2
0
def keywords_hits_overlap(log_directory, output_directory, channel_name):
    # Correlational: overlap for keyword digest and HITS
    for month in xrange(1, 13):
        log_data_m1 = reader.linux_input(log_directory, channel_name,
                                         "2013-" + str(month) + "-1",
                                         "2013-" + str(month) + "-31")
        nicks_m1, nick_same_list_m1 = nickTracker.nick_tracker(log_data_m1)
        message_graph_m1, top_hubs_m1, top_keyword_overlap_m1, top_auth_m1 = network.identify_hubs_and_experts(
            log_data_m1, nicks_m1, nick_same_list_m1)
        saver.draw_nx_graph(message_graph_m1, output_directory,
                            "expert-month-" + str(month))

        log_data_m2 = reader.linux_input(log_directory, channel_name,
                                         "2013-" + str(month + 1) + "-1",
                                         "2013-" + str(month + 1) + "-31")
        nicks_m2, nick_same_list_m2 = nickTracker.nick_tracker(log_data_m1)
        message_graph_m2, top_hubs_m2, top_keyword_overlap_with_score_m2, top_auth_m2 = network.identify_hubs_and_experts(
            log_data_m2, nicks_m2, nick_same_list_m2)

        print "Top 10 HUBS for Month [HITS]", month, ":", top_hubs_m1
        print "Top 10 HUBS for Month [HITS]", month + 1, ":", top_hubs_m2
        print "Number of common HUBS (from 10) between above 2 months:", len(
            list(set(top_hubs_m1).intersection(top_hubs_m2)))

        print "Top 10 Experts by keywords for Months", month, ":", top_keyword_overlap_m1
        print "Top 10 Experts by keywords for Months", month + 1, ":", top_keyword_overlap_with_score_m2
        print "Number of common Experts by keywords (from 10) between above 2 months:", len(
            list(
                set(top_keyword_overlap_m1).intersection(
                    top_keyword_overlap_with_score_m2)))

        print "Top 10 AUTH for Month [HITS]", month, ":", top_auth_m1
        print "Top 10 AUTH for Month [HITS]", month + 1, ":", top_auth_m2
        print "Number of common AUTH (from 10) between above 2 months:", len(
            list(set(top_auth_m1).intersection(top_auth_m2)))

        print "Number of users common btw HUBS from HITS and Experts by Keywords (from 10) for month", month, ":", len(
            list(set(top_keyword_overlap_m1).intersection(top_hubs_m1)))
        print "Number of users common btw AUTH from HITS and Experts by Keywords (from 10) for month", month, ":", len(
            list(set(top_keyword_overlap_m1).intersection(top_auth_m1)))
        print "Number of users common btw HUBS from HITS and AUTH from HITS (from 10) for month", month, ":", len(
            list(set(top_hubs_m1).intersection(top_auth_m1)))
        print "Number of users common btw HUBS, HITS and KEYWORDS", month, ":", len(
            set(list(set(top_keyword_overlap_m1).intersection(
                top_hubs_m1))).intersection(top_auth_m1))
Exemplo n.º 3
0
    def test_identify_hubs_and_experts(self, log_data, nicks, nick_same_list):
        """Compare identify_hubs_and_experts output with the stored expectations.

        ``update_expected_output_directory(log_data)`` is called first
        (presumably it points the module-level ``expected_output_directory``
        at the fixtures for this log data -- confirm).  The expected hubs,
        keyword overlap, authorities and message graph are then loaded from
        that directory and compared with the values returned by
        ``network.identify_hubs_and_experts``; the graph is compared by
        isomorphism, the rest by equality.
        """
        update_expected_output_directory(log_data)
        actual_graph, actual_hub, actual_overlap, actual_auth = (
            network.identify_hubs_and_experts(log_data, nicks,
                                              nick_same_list))

        # Load the stored artefacts in a fixed order, keyed by file name.
        fixture_names = ("top_hub", "top_keyword_overlap", "top_auth",
                         "message_graph")
        expected = {}
        for fixture_name in fixture_names:
            expected[fixture_name] = util.load_from_disk(
                expected_output_directory + fixture_name)

        assert actual_hub == expected["top_hub"]
        assert actual_overlap == expected["top_keyword_overlap"]
        assert actual_auth == expected["top_auth"]
        assert nx.is_isomorphic(actual_graph, expected["message_graph"])
Exemplo n.º 4
0
    def test_identify_hubs_and_experts(self, mock_keywords, mock_msg_graph):
        """Check network.identify_hubs_and_experts with its helpers mocked.

        Inputs, expected results and the values handed to the mocks are all
        loaded from ``self.test_data_dir + "hits/..."``.  The function is run
        with stdout redirected to an in-memory buffer; hubs, keyword overlap
        and authorities are compared for equality and the returned graph is
        compared with the stored graph for isomorphism.

        Args:
            mock_keywords: mock whose return value is set to the five stored
                keyword structures.
            mock_msg_graph: mock whose return value is set to the stored
                message graph.
            (Both are presumably injected by patch decorators above this
            method, which are not visible here.)
        """
        log_data = util.load_from_disk(self.test_data_dir + "hits/log_data")
        nicks = util.load_from_disk(self.test_data_dir + "hits/nicks")
        nick_same_list = util.load_from_disk(self.test_data_dir +
                                             "hits/nick_same_list")
        expected_top_hub = util.load_from_disk(self.test_data_dir +
                                               "hits/top_hub")
        expected_top_keyword_overlap = util.load_from_disk(
            self.test_data_dir + "hits/top_keyword_overlap")
        expected_top_auth = util.load_from_disk(self.test_data_dir +
                                                "hits/top_auth")
        message_graph = util.load_from_disk(self.test_data_dir +
                                            "hits/message_graph")
        keyword_dict_list = util.load_from_disk(self.test_data_dir +
                                                "hits/keyword_dict_list")
        user_keyword_freq_dict = util.load_from_disk(
            self.test_data_dir + "hits/user_keyword_freq_dict")
        user_words_dict_list = util.load_from_disk(self.test_data_dir +
                                                   "hits/user_words_dict_list")
        nicks_for_stop_words = util.load_from_disk(self.test_data_dir +
                                                   "hits/nicks_for_stop_words")
        keywords_for_channels = util.load_from_disk(
            self.test_data_dir + "hits/keywords_for_channels")

        # setup mock
        mock_msg_graph.return_value = message_graph
        mock_keywords.return_value = keyword_dict_list, user_keyword_freq_dict, user_words_dict_list, nicks_for_stop_words, keywords_for_channels

        # Redirect stdout to a buffer for the duration of the call.  The
        # stream that was active is remembered and put back in ``finally`` so
        # that an exception cannot leave stdout pointing at the buffer, and so
        # that a stream installed by the test runner is restored instead of
        # being replaced by sys.__stdout__.
        original_stdout = sys.stdout
        capturedOutput = StringIO.StringIO()
        sys.stdout = capturedOutput
        try:
            message_num_graph, top_hub, top_keyword_overlap, top_auth = network.identify_hubs_and_experts(
                log_data, nicks, nick_same_list)
        finally:
            sys.stdout = original_stdout
            capturedOutput.close()

        self.assertEqual(top_hub, expected_top_hub)
        self.assertEqual(top_keyword_overlap, expected_top_keyword_overlap)
        self.assertEqual(top_auth, expected_top_auth)
        self.assertTrue(nx.is_isomorphic(message_graph, message_num_graph))
Exemplo n.º 5
0
    def test_identify_hubs_and_experts(self, mock_keywords, mock_msg_graph):
        """Check network.identify_hubs_and_experts with its helpers mocked.

        Expected results and the values handed to the mocks are loaded from
        ``self.current_directory + "/data/..."``; the inputs come from
        ``self.log_data``, ``self.nicks`` and ``self.nick_same_list``.  The
        function is run with stdout redirected to an in-memory buffer; hubs,
        keyword overlap and authorities are compared for equality and the
        returned graph is compared with the stored graph for isomorphism.

        Args:
            mock_keywords: mock whose return value is set to the five stored
                keyword structures.
            mock_msg_graph: mock whose return value is set to the stored
                message graph.
            (Both are presumably injected by patch decorators above this
            method, which are not visible here.)
        """
        top_hub_ = util.load_from_disk(self.current_directory +
                                       "/data/top_hub")
        top_keyword_overlap_ = util.load_from_disk(self.current_directory +
                                                   "/data/top_keyword_overlap")
        top_auth_ = util.load_from_disk(self.current_directory +
                                        "/data/top_auth")
        message_graph = util.load_from_disk(self.current_directory +
                                            "/data/test_hits_message_graph")
        keyword_dict_list = util.load_from_disk(self.current_directory +
                                                "/data/keyword_dict_list")
        user_keyword_freq_dict = util.load_from_disk(
            self.current_directory + "/data/user_keyword_freq_dict")
        user_words_dict_list = util.load_from_disk(
            self.current_directory + "/data/user_words_dict_list")
        nicks_for_stop_words = util.load_from_disk(
            self.current_directory + "/data/nicks_for_stop_words")
        keywords_for_channels = util.load_from_disk(
            self.current_directory + "/data/keywords_for_channels")
        # The "/data/keywords_return" fixture was loaded here before but never
        # used by the test, so the load has been dropped.

        # setup mock
        mock_msg_graph.return_value = message_graph
        mock_keywords.return_value = keyword_dict_list, user_keyword_freq_dict, user_words_dict_list, nicks_for_stop_words, keywords_for_channels

        # Redirect stdout to a buffer for the duration of the call.  The
        # stream that was active is remembered and put back in ``finally`` so
        # that an exception cannot leave stdout pointing at the buffer, and so
        # that a stream installed by the test runner is restored instead of
        # being replaced by sys.__stdout__.
        original_stdout = sys.stdout
        capturedOutput = StringIO.StringIO()
        sys.stdout = capturedOutput
        try:
            message_num_graph, top_hub, top_keyword_overlap, top_auth = network.identify_hubs_and_experts(
                self.log_data, self.nicks, self.nick_same_list)
        finally:
            sys.stdout = original_stdout
            capturedOutput.close()

        self.assertEqual(top_hub, top_hub_)
        self.assertEqual(top_keyword_overlap, top_keyword_overlap_)
        self.assertEqual(top_auth, top_auth_)
        self.assertTrue(nx.is_isomorphic(message_graph, message_num_graph))
Exemplo n.º 6
0
    def test_identify_hubs_and_experts(self):
        """Check network.identify_hubs_and_experts against stored results.

        Expected results are loaded from ``self.out_dir``.  Nicks are derived
        from ``self.log_data`` with ``nickTracker.nick_tracker`` and the
        function under test is then run, both with stdout redirected to an
        in-memory buffer.  Hubs, keyword overlap and authorities are compared
        for equality and the returned graph is compared with the stored graph
        for isomorphism.
        """
        expected_top_hub = util.load_from_disk(self.out_dir + "top_hub")
        expected_top_keyword_overlap = util.load_from_disk(
            self.out_dir + "top_keyword_overlap")
        expected_top_auth = util.load_from_disk(self.out_dir + "top_auth")
        expected_message_graph = util.load_from_disk(self.out_dir +
                                                     "message_num_graph")

        # Redirect stdout to a buffer for the duration of the calls.  The
        # stream that was active is remembered and put back in ``finally`` so
        # that an exception cannot leave stdout pointing at the buffer, and so
        # that a stream installed by the test runner is restored instead of
        # being replaced by sys.__stdout__.
        original_stdout = sys.stdout
        capturedOutput = StringIO.StringIO()
        sys.stdout = capturedOutput
        try:
            nicks, nick_same_list = nickTracker.nick_tracker(self.log_data)
            message_num_graph, top_hub, top_keyword_overlap, top_auth = network.identify_hubs_and_experts(
                self.log_data, nicks, nick_same_list)
        finally:
            sys.stdout = original_stdout
            capturedOutput.close()

        self.assertEqual(top_hub, expected_top_hub)
        self.assertEqual(top_keyword_overlap, expected_top_keyword_overlap)
        self.assertEqual(top_auth, expected_top_auth)
        self.assertTrue(
            nx.is_isomorphic(expected_message_graph, message_num_graph))
Exemplo n.º 7
0
    conv_ref_time_curve_fit_parameters = vis.exponential_curve_fit_and_plot_x_shifted(conv_ref_time, output_directory, "conv_ref_time_cutoff" + str(cutoff))
    saver.save_csv( [["a","b","c", "MSE"], [conv_len_curve_fit_parameters]], output_directory,"conv_len_curve_fit_parameters-cutoff-" + str(cutoff))
    saver.save_csv( [["a","b","c", "MSE"], [resp_time_curve_fit_parameters]], output_directory,"resp_time_curve_fit_parameters-cutoff-" + str(cutoff))
    saver.save_csv( [["a","b","c", "MSE"], [conv_ref_time_curve_fit_parameters]], output_directory,"conv_ref_time_curve_fit_parameters-cutoff-"+str(cutoff))

# Restore the percentile cutoff that the preceding section overrode.
config.CUTOFF_PERCENTILE = default_cutoff  # revert back to default

user.keywords_clusters(log_data, nicks, nick_same_list, output_directory, "keywords")
# NOTE(review): the return value of this call is discarded here.
network.degree_analysis_on_graph(message_number_graph)

# Run the hubs/experts analysis once per message-number threshold.  The
# threshold is a module-level config value, so it is overridden for each run
# and put back afterwards.
threshold = config.THRESHOLD_MESSAGE_NUMBER_GRAPH  # store original default config
cutoffs = [0, 10, 20]

for cutoff in cutoffs:
    config.THRESHOLD_MESSAGE_NUMBER_GRAPH = cutoff
    # Only the returned graph is used; it is drawn as "hits-cutoff-<cutoff>".
    msg_graph_experts, top_hub, top_keyword_overlap, top_auth = network.identify_hubs_and_experts(log_data, nicks, nick_same_list)
    saver.draw_nx_graph (msg_graph_experts, output_directory, "hits-cutoff-"+str(cutoff))

config.THRESHOLD_MESSAGE_NUMBER_GRAPH = threshold  # revert to default config

# ============== OUTPUT ================
# Save and draw the message-number graph under the same base name.
saver.save_net_nx_graph (message_number_graph, output_directory, "message_number_graph")
saver.draw_nx_graph(message_number_graph, output_directory, "message_number_graph")

saver.save_csv([["response_time_cutoff"], [rt_cutoff_time]], output_directory, "rt_cutoff")
# One summary row: node count of the graph and its total edge weight as an int.
# The month label "Jan-2013" is hard-coded.
saver.save_csv([["month", "users", "directed_messages"], ["Jan-2013", len(message_number_graph), int(message_number_graph.size('weight'))]], output_directory, "users_messages")

# One CSV per degree type, named after the degree type.
for dtype in degree_type:
    saver.save_csv(degree_anal_message_number[dtype]["formatted_for_csv"], output_directory, dtype)   

# The bin length from the config is part of the output file name.
saver.save_csv(bin_matrix, output_directory, "MessageNumber_binsize_"+str(config.BIN_LENGTH_MINS)) 
Exemplo n.º 8
0
def keywords_hits_overlap(log_directory, output_directory, channel_name,
                          start_date, end_date):
    """
        The function iterates through the months in the given date range and produces the authorities, top keywords and
        top hubs for the current month and the next month. It also produces the overlap of authorities, top keywords and
        top hubs between the current and the next month.

    Args:
        log_directory(str): path to the location of Logs
        output_directory(str):  path to the location where the results are to be stored
        channel_name(list): channels for which the analysis is to be done
        start_date(datetime): starting date for the logs to be analysed. This has to be the beginning of the month.
        end_date(datetime): ending date for which the logs are to be analysed. This has to be the end of the month.

    Returns:
       null

    """
    start_date = start_date.strptime('%Y-%m-%d')
    end_date = end_date.strptime('%Y-%m-%d')
    for dt in rrule(MONTHLY, dtstart=start_date, until=end_date):
        last_day_of_the_month1 = dt + relativedelta(
            months=1) - datetime.timedelta(days=1)
        log_data_m1 = reader.linux_input(
            log_directory, channel_name, dt.strftime("%Y-%m-%d"),
            last_day_of_the_month1.strftime("%Y-%m-%d"))
        nicks_m1, nick_same_list_m1 = nickTracker.nick_tracker(log_data_m1)
        message_graph_m1, top_hubs_m1, top_keyword_overlap_m1, top_auth_m1 = network.identify_hubs_and_experts(
            log_data_m1, nicks_m1, nick_same_list_m1)
        saver.draw_nx_graph(message_graph_m1, output_directory,
                            "expert-month-" + str(dt.month))

        next_month_dt = dt + relativedelta(months=1)
        last_day_of_the_month2 = next_month_dt + relativedelta(
            months=1) - datetime.timedelta(days=1)
        log_data_m2 = reader.linux_input(
            log_directory, channel_name, next_month_dt.strftime("%Y-%m-%d"),
            last_day_of_the_month2.strftime("%Y-%m-%d"))
        nicks_m2, nick_same_list_m2 = nickTracker.nick_tracker(log_data_m2)
        message_graph_m2, top_hubs_m2, top_keyword_overlap_with_score_m2, top_auth_m2 = network.identify_hubs_and_experts(
            log_data_m2, nicks_m2, nick_same_list_m2)

        print "Top 10 HUBS for Month [HITS]", dt.month, ":", top_hubs_m1
        print "Top 10 HUBS for Month [HITS]", next_month_dt.month, ":", top_hubs_m2
        print "Number of common HUBS (from 10) between above 2 months:", len(
            list(set(top_hubs_m1).intersection(top_hubs_m2)))

        print "Top 10 Experts by keywords for Months", dt.month, ":", top_keyword_overlap_m1
        print "Top 10 Experts by keywords for Months", next_month_dt.month, ":", top_keyword_overlap_with_score_m2
        print "Number of common Experts by keywords (from 10) between above 2 months:", len(
            list(
                set(top_keyword_overlap_m1).intersection(
                    top_keyword_overlap_with_score_m2)))

        print "Top 10 AUTH for Month [HITS]", dt.month, ":", top_auth_m1
        print "Top 10 AUTH for Month [HITS]", next_month_dt.month, ":", top_auth_m2
        print "Number of common AUTH (from 10) between above 2 months:", len(
            list(set(top_auth_m1).intersection(top_auth_m2)))

        print "Number of users common btw HUBS from HITS and Experts by Keywords (from 10) for month", dt.month, ":", len(
            list(set(top_keyword_overlap_m1).intersection(top_hubs_m1)))
        print "Number of users common btw AUTH from HITS and Experts by Keywords (from 10) for month", dt.month, ":", len(
            list(set(top_keyword_overlap_m1).intersection(top_auth_m1)))
        print "Number of users common btw HUBS from HITS and AUTH from HITS (from 10) for month", dt.month, ":", len(
            list(set(top_hubs_m1).intersection(top_auth_m1)))
        print "Number of users common btw HUBS, HITS and KEYWORDS", dt.month, ":", len(
            set(list(set(top_keyword_overlap_m1).intersection(
                top_hubs_m1))).intersection(top_auth_m1))