예제 #1
0
def social_context_dataset_statistics():
    """
    Print corpus statistics for the social context (PHEME dataset structure
    based) corpus, one event dataset at a time.

    The corpus root is resolved relative to this file
    (``../data/social_context/aug-rnr-annotated-threads-retweets``). Every
    immediate sub-directory of that root is treated as one event dataset and
    is passed, together with the root, to
    ``labelled_event_dataset_statistics``.

    :return: None; everything is reported on stdout
    """
    from data_loader import load_abs_path

    corpus_root = load_abs_path(
        os.path.join(os.path.dirname(__file__), '..', "data",
                     "social_context", "aug-rnr-annotated-threads-retweets"))

    print("check social context corpus [%s] ... " % corpus_root)

    # Only the first entry of the walk is of interest: its directory names
    # are the event datasets. If the walk yields nothing, fall back to an
    # empty list of events.
    _, events_dataset_dirs, _ = next(os.walk(corpus_root), (None, [], []))

    print("total [%s] events dataset" % len(events_dataset_dirs))
    print(events_dataset_dirs)

    # report on each event corpus in turn, separated by a ruler line
    for event_name in events_dataset_dirs:
        labelled_event_dataset_statistics(corpus_root, event_name)
        print(" ========================================== ")

    print("complete.")
예제 #2
0
    def test_context_sequence_encoding(self):
        """
        Check the type and shape of the context encoding of single source
        tweets.

        A ``RumorTweetsClassifer`` is built around an ELMo token embedder
        loaded from the local CredBank weight file (every other component is
        left as ``None``, CPU only). For known tweet ids the result of
        ``self.tweet_context_encoding_by_tweet_id`` must be a
        ``torch.Tensor`` whose shape is
        ``(expected_rows, EXPECTED_CONTEXT_INPUT_SIZE)``; the last case
        repeats the check with ``disable_nf=True``.
        """
        elmo_credbank_model_path = load_abs_path(
            os.path.join(
                os.path.dirname(__file__), '..', "resource", "embedding",
                "elmo_model",
                "elmo_credbank_2x4096_512_2048cnn_2xhighway_weights_10052019.hdf5"
            ))

        elmo_embedder = ElmoTokenEmbedder(
            options_file=
            "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json",
            weight_file=elmo_credbank_model_path,
            do_layer_norm=False,
            dropout=0.5)
        word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

        EXPECTED_CONTEXT_INPUT_SIZE = 60

        rumor_classifier = RumorTweetsClassifer(
            word_embeddings,
            None,
            None,
            None,
            classifier_feedforward=None,
            cxt_content_encoder=None,
            cxt_metadata_encoder=None,
            social_context_self_attention_encoder=None,
            cuda_device=-1)

        def check_encoding(tweet_id, expected_rows, **encoding_options):
            # Encode one source tweet and validate the returned tensor. The
            # failure message is derived from the expected shape so it can
            # never drift away from the value actually asserted.
            encoded = self.tweet_context_encoding_by_tweet_id(
                rumor_classifier, tweet_id, **encoding_options)
            print(type(encoded))
            print(encoded.shape)
            assert isinstance(encoded, torch.Tensor), (
                "expected a torch.Tensor but got %s" % type(encoded))
            expected_shape = (expected_rows, EXPECTED_CONTEXT_INPUT_SIZE)
            assert encoded.shape == expected_shape, (
                "expected shape is [%s, %s] but got %s" %
                (expected_rows, EXPECTED_CONTEXT_INPUT_SIZE,
                 list(encoded.shape)))

        check_encoding("500327120770301952", 97)

        # original note on this tweet id: "with three replies"
        check_encoding("552806117328568321", 94)

        print("social context encoding without numerical feature .")
        check_encoding("552806117328568321", 94, disable_nf=True)
예제 #3
0
def test_elmo_with_attention():
    """
    Smoke-test feeding an ELMo sentence embedding into
    ``HierarchicalAttentionNet``.

    A 22-token sentence is embedded with ``word_embedding_elmo`` using the
    locally stored ELMo weights/options (presumably a layer-averaged
    embedding, judging by the printed label -- TODO confirm), and the result
    must have shape ``(22, 1024)``. The embedding is then passed through the
    attention layer, whose output must have shape ``[1, 1024]``.
    """
    import os
    from data_loader import load_abs_path
    from embedding_layer import word_embedding_elmo

    # both ELMo artefacts live in the same resource folder
    elmo_model_dir = os.path.join(os.path.dirname(__file__), '..',
                                  "resource", "embedding", "elmo_model")
    weights_path = load_abs_path(
        os.path.join(
            elmo_model_dir,
            "elmo_credbank_2x4096_512_2048cnn_2xhighway_weights_10052019.hdf5"
        ))
    options_path = load_abs_path(
        os.path.join(elmo_model_dir,
                     "elmo_2x4096_512_2048cnn_2xhighway_options.json"))

    tokens = [
        "9/11", "sandy", "hook", "movie", "shooting", "boston", "bomb",
        "threats", "from", "n.", "korea", "and", "several", "other",
        "tragedies", "were", "all", "under", "the", "age", "of", "18"
    ]
    num_tokens = len(tokens)  # 22
    embedding_dim = 1024

    embedder = ElmoEmbedder(options_file=options_path,
                            weight_file=weights_path)
    sentence_embedding = word_embedding_elmo(tokens, embedder)

    print("content avg ELMo embedding shape : ", sentence_embedding.shape)
    assert sentence_embedding.shape == (num_tokens, embedding_dim)

    attention_layer = HierarchicalAttentionNet(embedding_dim,
                                               step_dim=num_tokens)
    max_len = 200
    embedding_tensor = torch.as_tensor(sentence_embedding)

    attention_weights = attention_layer.forward(embedding_tensor, max_len)
    print("context attention weights shape: ", attention_weights.shape)
    assert attention_weights.shape == torch.Size([1, embedding_dim])
    print(attention_weights)
예제 #4
0
    def test_context_feature_encoder(self):
        """
        Run batch context feature encoding on a small sample of two source
        tweets and print the resulting tensor.

        Per the original note, the purpose is to make sure that the context
        of a source tweet is sorted in temporal order; this method only
        prints the encoding and makes no assertion itself.
        """
        weight_file_name = "elmo_credbank_2x4096_512_2048cnn_2xhighway_weights_10052019.hdf5"
        elmo_credbank_model_path = load_abs_path(
            os.path.join(os.path.dirname(__file__), '..', "resource",
                         "embedding", "elmo_model", weight_file_name))

        elmo_options_url = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
        token_embedders = {
            "tokens":
            ElmoTokenEmbedder(options_file=elmo_options_url,
                              weight_file=elmo_credbank_model_path,
                              do_layer_norm=False,
                              dropout=0.5)
        }
        word_embeddings = BasicTextFieldEmbedder(token_embedders)

        # only the text field embedder is supplied; the remaining positional
        # components are left unset
        rumor_classifier = RumorTweetsClassifer(word_embeddings, None, None,
                                                None, None)

        sample_tweet_ids = ['500294803402137600', '500327120770301952']
        propagation_embeddings_tensor = (
            rumor_classifier.batch_compute_context_feature_encoding(
                sample_tweet_ids))
        print("propagation_embeddings_tensor: ", propagation_embeddings_tensor)
예제 #5
0
            "1) AttentionWithContext (default). However, we got [%s]" %
            attention_option)

    print(
        "training RumourDNN model on development dataset [%s] and [%s] with gpu [%s]"
        % (train_set_path, heldout_set_path, no_gpu))

    import allennlp_rumor_classifier
    import data_loader
    from allennlp_rumor_classifier import config_gpu_use

    config_gpu_use(no_gpu)

    allennlp_rumor_classifier.elmo_credbank_model_path = load_abs_path(
        os.path.join(
            os.path.dirname(__file__), '..', "resource", "embedding",
            "elmo_model",
            "elmo_credbank_2x4096_512_2048cnn_2xhighway_weights_10052019.hdf5")
    )

    data_loader.social_context_data_dir = os.path.join(
        os.path.dirname(__file__), '..', "data", "social_context",
        "aug-rnr-annotated-threads-retweets")

    print("Fine-tuned ELMo model is set to [%s]" %
          allennlp_rumor_classifier.elmo_credbank_model_path)
    print("social context corpus for all events directory is set to [%s]" %
          data_loader.social_context_data_dir)

    # Reasonable minibatch sizes are usually: 32, 64, 128, 256, 512, 1024 (powers of 2 are a common convention)
    # Usually, you can choose a batch size that is as large as your GPU memory allows
    #   (matrix-multiplication and the size of fully-connected layers are usually the bottleneck)
예제 #6
0
def statistics_rumour_dnn_dataset(file_name):
    """
    Compute and print reply/retweet statistics of the social context for a
    given training data set file.

    The first column of every row loaded from ``file_name`` is taken as a
    source tweet id. For each id the number of replies ("reactions") and
    retweets is obtained via ``count_social_context`` against the social
    context corpus located at
    ``../data/social_context/aug-rnr-annotated-threads-retweets`` (relative
    to this file).

    :param file_name: path of the data set CSV file
    :return: dict with the keys ``total_/min_/max_/avg_/std_/median_``
        suffixed by ``reactions`` and ``retweets``; all values are 0 for an
        empty data set, and the std values are 0 when there is only a single
        row (a sample standard deviation needs at least two data points)
    """
    print("statistics of [%s]" % file_name)

    df_file = load_matrix_from_csv(file_name, 0, 1, header=0)

    all_tweet_ids = [dataset_row[0] for dataset_row in df_file]
    print("all_tweet_ids size: ", len(all_tweet_ids))

    from data_loader import load_tweets_context_dataset_dir
    from data_loader import load_abs_path

    social_context_data_dir = load_abs_path(
        os.path.join(os.path.dirname(__file__), '..', "data",
                     "social_context", "aug-rnr-annotated-threads-retweets"))
    context_tweets_dataset_dir_dict = load_tweets_context_dataset_dir(
        social_context_data_dir)

    all_replies_list = []
    all_retweets_list = []
    for tweet_id in all_tweet_ids:
        total_replies, total_retweets = count_social_context(
            str(tweet_id), context_tweets_dataset_dir_dict)
        all_replies_list.append(total_replies)
        all_retweets_list.append(total_retweets)

    print("total_replies_list: ", all_replies_list)
    print("total_retweets_list: ", all_retweets_list)

    # The median is reported next to the mean because it is not skewed so
    # much by extremely large or small values.
    (total_reactions, min_reactions, max_reactions, avg_reactions,
     std_reactions, median_reactions) = _summarise_counts(all_replies_list)
    (total_retweets, min_retweets, max_retweets, avg_retweets, std_retweets,
     median_retweets) = _summarise_counts(all_retweets_list)

    print(
        "total reactions: [%s], min reaction: [%s], max reaction: [%s], avg reaction: [%s], std reactions: [%s], median reactions: [%s]"
        % (total_reactions, min_reactions, max_reactions, avg_reactions,
           std_reactions, median_reactions))
    print(
        "total retweets: [%s], min retweets: [%s], max retweets: [%s], avg retweets: [%s], std retweets: [%s], median retweets: [%s]"
        % (total_retweets, min_retweets, max_retweets, avg_retweets,
           std_retweets, median_retweets))

    print("total tweets without reaction: [%s]" % (all_replies_list.count(0)))
    print("total tweets without retweets: [%s]" % (all_retweets_list.count(0)))

    results = {
        "total_reactions": total_reactions,
        "min_reactions": min_reactions,
        "max_reactions": max_reactions,
        "avg_reactions": avg_reactions,
        "std_reactions": std_reactions,
        "median_reactions": median_reactions,
        "total_retweets": total_retweets,
        "min_retweets": min_retweets,
        "max_retweets": max_retweets,
        "avg_retweets": avg_retweets,
        "std_retweets": std_retweets,
        "median_retweets": median_retweets,
    }

    print("statistics: ")
    print(results)

    # Previously the summary was only printed; returning it as well lets
    # callers consume the numbers (callers ignoring the result are unaffected).
    return results


def _summarise_counts(counts):
    """
    Summarise a list of per-tweet counts.

    :param counts: list of numbers (may be empty)
    :return: tuple ``(total, minimum, maximum, mean rounded to one decimal,
        sample standard deviation, median)``; all zeros for an empty list
    """
    if not counts:
        return 0, 0, 0, 0, 0, 0

    total = sum(counts)
    # statistics.stdev raises StatisticsError for fewer than two data points,
    # so a single observation is reported with a spread of 0 instead.
    spread = statistics.stdev(counts) if len(counts) > 1 else 0
    return (total, min(counts), max(counts), round(total / len(counts), 1),
            spread, statistics.median(counts))
예제 #7
0
def test_elmo_output_with_self_attention():
    """
    Smoke-test ``StructuredSelfAttention`` on a layer-averaged ELMo sentence
    embedding.

    The sentence is embedded with ``ElmoEmbedder.embed_sentence``, the result
    is averaged over its first axis as float32, wrapped into a one-element
    batch, and run through the self-attention model twice: once with
    ``if_concat=True`` and once with ``if_concat=False``. Shapes (and the
    first attention matrix) are printed; nothing is asserted.
    """
    import os
    from data_loader import load_abs_path
    import numpy as np

    # both ELMo artefacts live in the same resource folder
    elmo_model_dir = os.path.join(os.path.dirname(__file__), '..',
                                  "resource", "embedding", "elmo_model")
    weights_path = load_abs_path(
        os.path.join(
            elmo_model_dir,
            "elmo_credbank_2x4096_512_2048cnn_2xhighway_weights_10052019.hdf5"
        ))
    options_path = load_abs_path(
        os.path.join(elmo_model_dir,
                     "elmo_2x4096_512_2048cnn_2xhighway_options.json"))

    tokens = [
        "i", "really", "enjoy", "Ashley", "and", "Ami", "salon", "she", "do",
        "a", "great", "job", "be", "friendly", "and", "professional", "I",
        "usually", "get", "my", "hair", "do", "when", "i", "go", "to", "to",
        "MI", "because", "of", "the", "quality", "of", "the", "highlight",
        "and", "the", "price", "be", "very", "affordable", "the", "highlight",
        "fantastic", "thank", "Ashley", "i", "highly", "recommend", "you",
        "and", "ill", "be", "back"
    ]

    embedder = ElmoEmbedder(options_file=options_path,
                            weight_file=weights_path)
    layer_vectors = embedder.embed_sentence(tokens)
    layer_averaged = np.mean(layer_vectors, axis=0, dtype='float32')

    print("test with self-attentive model: ")
    batch_of_one = torch.stack([torch.as_tensor(layer_averaged)])
    sequence_first = batch_of_one.permute(1, 0, 2)
    # Swapping the first two axes back restores the batch-first layout noted
    # in the original: ELMo output.size() = (batch_size, num_seq, 2*hidden_size)
    self_attention_elmo_input = sequence_first.permute(1, 0, 2)

    print(
        "self_attention_elmo_input shape (batch_size, num_seq, 2*hidden_size) : ",
        self_attention_elmo_input.shape)

    self_attention_model = StructuredSelfAttention(1024)

    # first pass: concatenated variant of the attended embeddings
    concatenated_context_embeddings, attn_weight_matrix = (
        self_attention_model.forward(self_attention_elmo_input,
                                     if_concat=True))
    print(
        "self attention weights (annotation A) of ELMo embedding shape (batch_size, r, num_seq): ",
        attn_weight_matrix.shape)
    print("attn_weight_matrix: ", attn_weight_matrix)
    print(
        " concatenate the hidden_matrix  (b4 feeding into FC and softmax) shape:",
        concatenated_context_embeddings.shape)

    # second pass: averaged variant; its attention matrix is not used
    avg_context_embeddings, _ = self_attention_model.forward(
        self_attention_elmo_input, if_concat=False)
    print(
        " averaged the hidden_matrix  (b4 feeding into FC and softmax) shape:",
        avg_context_embeddings.shape)