Example #1
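These snippets are excerpts from a larger preprocessing module, so the imports are not repeated in each example. Roughly, they assume something like the following (the exact pad_sequences import depends on the Keras/TensorFlow version, and the remaining helpers are project-local, not library code):

import os
from copy import deepcopy

import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences  # or keras.preprocessing.sequence

import help_prep_functions
# project-local helpers used below (defined elsewhere in the repository):
# read_fullPHEME, load_dataset, load_data, read_Twitter15_16,
# extract_thread_features_incl_response, transform_feature_dict, convert_label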
def prep_pipeline(dataset='RumEv',
                  seq=True,
                  feature_set=['avgw2v'],
                  fs_name="text",
                  timeline=False):
    #%%

    path = 'saved_data_' + dataset + fs_name

    folds = read_fullPHEME()

    help_prep_functions.loadW2vModel()
    for fold in folds.keys():
        print(fold)
        feature_fold = []
        labels = []
        ids = []
        for conversation in folds[fold]:
            thread_feature_dict = extract_thread_features_incl_response(
                conversation)
            # Optionally, the feature dict could be cached here and read back
            # instead of re-parsing the raw data.
            thread_features_array = transform_feature_dict(
                thread_feature_dict,
                conversation,
                sequential=seq,
                feature_set=feature_set,
                timeline=timeline)

            # print(thread_features_array.shape)
            if seq:
                feature_fold.extend(thread_features_array)
                for i in range(len(thread_features_array)):
                    labels.append(convert_label(conversation['veracity']))
                    ids.append(conversation['id'])
            else:
                feature_fold.append(thread_features_array)
                labels.append(convert_label(conversation['veracity']))
                ids.append(conversation['id'])

        feature_fold = pad_sequences(feature_fold,
                                     maxlen=None,
                                     dtype='float32',
                                     padding='post',
                                     truncating='post',
                                     value=0.)
        labels = np.asarray(labels)

        # categorical_labels = to_categorical(labels, num_classes=None)

        path_fold = os.path.join(path, fold)
        if not os.path.exists(path_fold):
            os.makedirs(path_fold)
        np.save(os.path.join(path_fold, 'train_array'), feature_fold)
        np.save(os.path.join(path_fold, 'labels'), labels)
        np.save(os.path.join(path_fold, 'ids'), ids)
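A hedged usage note: with seq=True every branch of a thread becomes its own row, so the veracity label and conversation id are repeated once per branch; with seq=False the whole thread is a single row. Hypothetical calls mirroring the defaults above:

prep_pipeline(dataset='RumEv', seq=True, feature_set=['avgw2v'], fs_name="text")   # one row per branch
prep_pipeline(dataset='RumEv', seq=False, feature_set=['avgw2v'], fs_name="text")  # one row per thread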
Example #2
#%%
# initialise the result dict once, outside the fold loop, so every fold is kept
preprocessed_expanded_folds = {}
for fold in list(newfolds.keys()):
    print(fold)
    preprocessed_expanded_folds[fold] = {}
    for tree in list(newfolds[fold].keys()):
        preprocessed_expanded_folds[fold][tree] = {}
        subtree_dict = newfolds[fold][tree]
        for subtree_num in list(subtree_dict.keys()):
            preprocessed_expanded_folds[fold][tree][subtree_num] = {}

            subtree = subtree_dict[subtree_num]

            thread_feature_dict = extract_thread_features_incl_response(subtree)

            thread_features_array, thread_stance_labels, branches = transform_feature_dict(
                thread_feature_dict, subtree, feature_set=feature_set)

            preprocessed_expanded_folds[fold][tree][subtree_num]['features'] = deepcopy(thread_features_array)
            preprocessed_expanded_folds[fold][tree][subtree_num]['tweet_ids'] = branches
            preprocessed_expanded_folds[fold][tree][subtree_num]['fold_stance_labels'] = thread_stance_labels
            preprocessed_expanded_folds[fold][tree][subtree_num]['veracity_labels'] = []
            preprocessed_expanded_folds[fold][tree][subtree_num]['ids'] = []
            # one veracity label and one id per branch of the subtree
            for i in range(len(thread_features_array)):
                preprocessed_expanded_folds[fold][tree][subtree_num]['veracity_labels'].append(convert_label(subtree['veracity']))
                preprocessed_expanded_folds[fold][tree][subtree_num]['ids'].append(subtree['id'])
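For orientation, a small sketch of walking the nested structure this loop builds; newfolds and feature_set are assumed to be defined earlier in the module, and the keys are exactly the ones assigned above:

for fold, trees in preprocessed_expanded_folds.items():
    for tree_id, subtrees in trees.items():
        for subtree_num, entry in subtrees.items():
            # 'features' holds one array per branch; the veracity label and
            # conversation id are repeated once per branch
            assert len(entry['features']) == len(entry['veracity_labels'])
            print(fold, tree_id, subtree_num, len(entry['features']))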
    
Example #3
def prep_pipeline(dataset='RumEval2019', feature_set=['avgw2v']):

    path = 'saved_data' + dataset
    folds = load_dataset()
    reddit = load_data()

    folds['train'].extend(reddit['train'])
    folds['dev'].extend(reddit['dev'])
    folds['test'].extend(reddit['test'])

    help_prep_functions.loadW2vModel()

    #%%
    for fold in folds.keys():

        print(fold)
        feature_fold = []
        tweet_ids = []
        fold_stance_labels = []
        labels = []
        ids = []
        for conversation in folds[fold]:

            thread_feature_dict = extract_thread_features_incl_response(
                conversation)

            thread_features_array, thread_stance_labels, branches = transform_feature_dict(
                thread_feature_dict, conversation, feature_set=feature_set)

            fold_stance_labels.extend(thread_stance_labels)
            tweet_ids.extend(branches)
            feature_fold.extend(thread_features_array)
            for i in range(len(thread_features_array)):
                labels.append(convert_label(conversation['veracity']))
                ids.append(conversation['id'])

        #%%
        if feature_fold:

            feature_fold = pad_sequences(feature_fold,
                                         maxlen=None,
                                         dtype='float32',
                                         padding='post',
                                         truncating='post',
                                         value=0.)

            fold_stance_labels = pad_sequences(fold_stance_labels,
                                               maxlen=None,
                                               dtype='float32',
                                               padding='post',
                                               truncating='post',
                                               value=0.)

            labels = np.asarray(labels)
            path_fold = os.path.join(path, fold)
            if not os.path.exists(path_fold):
                os.makedirs(path_fold)

            np.save(os.path.join(path_fold, 'train_array'), feature_fold)
            np.save(os.path.join(path_fold, 'labels'), labels)
            np.save(os.path.join(path_fold, 'fold_stance_labels'),
                    fold_stance_labels)
            np.save(os.path.join(path_fold, 'ids'), ids)
            np.save(os.path.join(path_fold, 'tweet_ids'), tweet_ids)
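A minimal loading sketch for the files this function saves; the folder name follows path = 'saved_data' + dataset above. The padded arrays load normally, but tweet_ids holds branches of differing lengths, so NumPy typically stores it as an object array and np.load then needs allow_pickle=True:

import os
import numpy as np

path_fold = os.path.join('saved_dataRumEval2019', 'train')
train_array = np.load(os.path.join(path_fold, 'train_array.npy'))
labels = np.load(os.path.join(path_fold, 'labels.npy'))
fold_stance_labels = np.load(os.path.join(path_fold, 'fold_stance_labels.npy'))
tweet_ids = np.load(os.path.join(path_fold, 'tweet_ids.npy'), allow_pickle=True)  # ragged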
Example #4
def prep_pipeline(dataset='RumEval2019', feature_set=['avgw2v']):

    # path = 'saved_data'+dataset
    folds = load_dataset()
    reddit = load_data()

    folds['train'].extend(reddit['train'])
    folds['dev'].extend(reddit['dev'])
    folds['test'].extend(reddit['test'])

    # help_prep_functions.loadW2vModel()

    #%%
    for fold in folds.keys():

        print(fold)
        feature_fold = []
        tweet_ids = []
        fold_stance_labels = []
        labels = []
        ids = []
        for conversation in folds[fold]:

            thread_feature_dict = extract_thread_features_incl_response(
                conversation)

            # thread_features_array, thread_stance_labels, branches = \
            #   transform_feature_dict(
            #                        thread_feature_dict, conversation,
            #                        feature_set=feature_set)

            thread_text, thread_stance_labels, branches = \
                transform_feature_dict(thread_feature_dict, conversation,
                                       feature_set=feature_set)

            fold_stance_labels.extend(thread_stance_labels)
            tweet_ids.extend(branches)
            feature_fold.extend(thread_text)
            for i in range(len(thread_text)):
                # labels.append(convert_label(conversation['veracity']))
                labels.append(conversation['veracity'])
                ids.append(conversation['id'])

        # result: feature_fold (nested thread_text),
        #         fold_stance_labels (nested),
        #         labels (veracity),
        #         tweet_ids (nested)

        datafile = os.path.join(DATA_PATH, fold + '.txt')
        idfile = os.path.join(DATA_PATH, fold + '.id.txt')
        assert len(feature_fold) == len(fold_stance_labels)
        assert len(feature_fold) == len(labels)
        # open both output files with context managers so they are flushed
        # and closed even if writing fails
        with open(datafile, 'w') as fdata, open(idfile, 'w') as fid:
            for twids, thread, stance, veracity in zip(tweet_ids, feature_fold,
                                                       fold_stance_labels, labels):
                orig_tweet = ' '.join(thread[0])
                fdata.write('{} ||| {} ||| {}\n'.format(orig_tweet, stance[0],
                                                        veracity))
                fid.write('{}\n'.format(twids[0]))
                for twid, thr, sdqc in zip(twids[1:], thread[1:], stance[1:]):
                    tweet = ' '.join(thr)
                    fdata.write('{} ||| {}\n'.format(tweet, sdqc))
                    fid.write('{}\n'.format(twid))
                fdata.write('\n')
                fid.write('\n')
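A small reader sketch for the text format written above: threads are separated by blank lines, a thread's first line carries 'tweet ||| stance ||| veracity', and each later line 'tweet ||| stance'. read_threads is a hypothetical helper, not part of the original code:

def read_threads(datafile):
    threads, current = [], []
    with open(datafile) as f:
        for line in f:
            line = line.rstrip('\n')
            if not line:            # blank line closes the current thread
                if current:
                    threads.append(current)
                current = []
            else:
                current.append(line.split(' ||| '))
    if current:                     # file may not end with a blank line
        threads.append(current)
    return threads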
Example #5
def prep_pipeline(dataset='15',
                  seq=True,
                  feature_set=['avgw2v'],
                  timeline=False):
    #%%

    path = 'saved_data_' + dataset

    #    if not os.path.exists(path):
    #        os.makedirs(path)
    #    folds = {}
    if dataset == '15':
        folds_test = read_Twitter15_16(dataset="15", set='test')
        folds_train = read_Twitter15_16(dataset="15", set='train')

    else:
        folds_test = read_Twitter15_16(dataset="16", set='test')
        folds_train = read_Twitter15_16(dataset="16", set='train')

    help_prep_functions.loadW2vModel()

    for fold in folds_test.keys():

        print(fold)

        feature_fold = []
        labels = []
        ids = []
        for conversation in folds_test[fold]:
            #print (conversation['id'])

            thread_feature_dict = extract_thread_features_incl_response(
                conversation)
            # Optionally, the feature dict could be cached here and read back
            # instead of re-parsing the raw data.
            thread_features_array = transform_feature_dict(
                thread_feature_dict,
                conversation,
                sequential=seq,
                feature_set=feature_set,
                timeline=timeline)
            # print(thread_features_array.shape)
            if seq:
                feature_fold.extend(thread_features_array)
                for i in range(len(thread_features_array)):
                    labels.append(convert_label(conversation['label']))
                    ids.append(conversation['id'])
            else:
                feature_fold.append(thread_features_array)
                labels.append(convert_label(conversation['label']))
                ids.append(conversation['id'])

        feature_fold = pad_sequences(feature_fold,
                                     maxlen=None,
                                     dtype='float32',
                                     padding='post',
                                     truncating='post',
                                     value=0.)
        labels = np.asarray(labels)

        # categorical_labels = to_categorical(labels, num_classes=None)

        path_fold = os.path.join(path, str(fold), 'test')
        if not os.path.exists(path_fold):
            os.makedirs(path_fold)
        # print(feature_fold.shape)
        np.save(os.path.join(path_fold, 'train_array'), feature_fold)
        np.save(os.path.join(path_fold, 'labels'), labels)
        np.save(os.path.join(path_fold, 'ids'), ids)

    for fold in folds_train.keys():

        print(fold)

        feature_fold = []
        labels = []
        ids = []
        for conversation in folds_train[fold]:

            thread_feature_dict = extract_thread_features_incl_response(
                conversation)
            # Optionally, the feature dict could be cached here and read back
            # instead of re-parsing the raw data.
            thread_features_array = transform_feature_dict(
                thread_feature_dict,
                conversation,
                sequential=seq,
                feature_set=feature_set,
                timeline=timeline)
            # print(thread_features_array.shape)
            if seq:
                feature_fold.extend(thread_features_array)
                for i in range(len(thread_features_array)):
                    labels.append(convert_label(conversation['label']))
                    ids.append(conversation['id'])
            else:
                feature_fold.append(thread_features_array)
                labels.append(convert_label(conversation['label']))
                ids.append(conversation['id'])

        feature_fold = pad_sequences(feature_fold,
                                     maxlen=None,
                                     dtype='float32',
                                     padding='post',
                                     truncating='post',
                                     value=0.)
        labels = np.asarray(labels)

        # categorical_labels = to_categorical(labels, num_classes=None)

        path_fold = os.path.join(path, str(fold), 'train')
        if not os.path.exists(path_fold):
            os.makedirs(path_fold)


        # print(feature_fold.shape)
        np.save(os.path.join(path_fold, 'train_array'), feature_fold)
        np.save(os.path.join(path_fold, 'labels'), labels)
        np.save(os.path.join(path_fold, 'ids'), ids)
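Running this variant produces a saved_data_15/<fold>/test and saved_data_15/<fold>/train layout, each leaf holding the three .npy files. A hypothetical driver covering both Twitter datasets:

if __name__ == '__main__':
    for ds in ('15', '16'):
        prep_pipeline(dataset=ds, seq=True, feature_set=['avgw2v'])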
Example #6
def prep_pipeline(dataset="RumEval2019", feature_set=["avgw2v"]):
    use_reddit_data = True
    path = "saved_data" + dataset
    folds = load_dataset()

    folds["test"] = load_test_data_twitter()["test"]
    if use_reddit_data:
        reddit = load_data()
        folds['train'].extend(reddit['train'])
        folds['dev'].extend(reddit['dev'])
        reddit_test_data = load_test_data_reddit()['test']
        folds["test"].extend(reddit_test_data)

    help_prep_functions.loadW2vModel()

    ###

    #%%
    for fold in list(reversed(list(folds.keys()))):

        print(fold)
        feature_fold = []
        tweet_ids = []
        fold_stance_labels = []
        labels = []
        ids = []
        for conversation in folds[fold]:

            thread_feature_dict = extract_thread_features_incl_response(
                conversation)

            if fold == "test":
                # Test-set conversations have no veracity yet, so assign
                # the gold labels here.
                conversation['veracity'] = TEST_DATA_LABELS["subtaskbenglish"][
                    conversation['id']]
                conversation['source']['label'] = TEST_DATA_LABELS[
                    "subtaskaenglish"][conversation['id']]
                for reply in conversation['replies']:
                    reply['label'] = TEST_DATA_LABELS["subtaskaenglish"][
                        reply['id_str']]

            (
                thread_features_array,
                thread_stance_labels,
                branches,
            ) = transform_feature_dict(thread_feature_dict,
                                       conversation,
                                       feature_set=feature_set)

            fold_stance_labels.extend(thread_stance_labels)
            tweet_ids.extend(branches)
            feature_fold.extend(thread_features_array)
            for i in range(len(thread_features_array)):
                labels.append(convert_label(conversation["veracity"]))
                ids.append(conversation["id"])

        #%%
        if feature_fold:

            feature_fold = pad_sequences(
                feature_fold,
                maxlen=None,
                dtype="float32",
                padding="post",
                truncating="post",
                value=0.0,
            )

            fold_stance_labels = pad_sequences(
                fold_stance_labels,
                maxlen=None,
                dtype="float32",
                padding="post",
                truncating="post",
                value=0.0,
            )

            labels = np.asarray(labels)
            path_fold = os.path.join(path, fold)
            if not os.path.exists(path_fold):
                os.makedirs(path_fold)

            np.save(os.path.join(path_fold, "train_array"), feature_fold)
            np.save(os.path.join(path_fold, "labels"), labels)
            np.save(os.path.join(path_fold, "fold_stance_labels"),
                    fold_stance_labels)
            np.save(os.path.join(path_fold, "ids"), ids)
            np.save(os.path.join(path_fold, "tweet_ids"), tweet_ids)