def prep_pipeline(dataset='RumEv', seq=True, feature_set=None, fs_name="text", timeline=False):
    """Extract per-conversation features from the full PHEME data and save
    one set of arrays (`train_array`, `labels`, `ids`) per fold under
    ``saved_data_<dataset><fs_name>/<fold>/``.

    Parameters
    ----------
    dataset : str
        Only used to name the output directory; the loader is always
        ``read_fullPHEME()`` regardless of this value.
    seq : bool
        If True, each branch of a thread becomes one training row (label/id
        replicated per branch); if False, one row per conversation.
    feature_set : list[str] | None
        Feature names passed to ``transform_feature_dict``; defaults to
        ``['avgw2v']``. (``None`` sentinel avoids a shared mutable default.)
    fs_name : str
        Suffix appended to the output directory name.
    timeline : bool
        Forwarded to ``transform_feature_dict``.
    """
    if feature_set is None:
        feature_set = ['avgw2v']
    path = 'saved_data_' + dataset + fs_name
    folds = read_fullPHEME()
    help_prep_functions.loadW2vModel()
    for fold in folds.keys():
        print(fold)
        feature_fold = []
        labels = []
        ids = []
        for conversation in folds[fold]:
            thread_feature_dict = extract_thread_features_incl_response(
                conversation)
            thread_features_array = transform_feature_dict(
                thread_feature_dict, conversation,
                sequential=seq, feature_set=feature_set,
                timeline=timeline)
            if seq:
                # One row per branch: replicate the thread-level veracity
                # label and conversation id for every branch produced.
                feature_fold.extend(thread_features_array)
                for _ in range(len(thread_features_array)):
                    labels.append(convert_label(conversation['veracity']))
                    ids.append(conversation['id'])
            else:
                feature_fold.append(thread_features_array)
                labels.append(convert_label(conversation['veracity']))
                ids.append(conversation['id'])
        feature_fold = pad_sequences(feature_fold, maxlen=None,
                                     dtype='float32', padding='post',
                                     truncating='post', value=0.)
        labels = np.asarray(labels)
        path_fold = os.path.join(path, fold)
        # exist_ok avoids the check-then-create race of the old
        # os.path.exists() guard.
        os.makedirs(path_fold, exist_ok=True)
        np.save(os.path.join(path_fold, 'train_array'), feature_fold)
        np.save(os.path.join(path_fold, 'labels'), labels)
        np.save(os.path.join(path_fold, 'ids'), ids)
# NOTE(review): fragment — the `else:` below has no matching `if`/`for`
# visible in this excerpt; its controlling statement lives outside this view.
# Code kept token-identical; comments only.
else:
    print(label)

# Script-level driver: load folds for the chosen dataset, expand them into
# partial (subtree) folds, then walk the nested fold -> tree -> subtree
# structure, building a mirror dict of (so far empty) result slots.
dataset = 'fullPHEME'
feature_set = ['avgw2v']
path = 'saved_data' + dataset
folds = {}
if dataset == 'RumEv':
    folds = read_RumEv()
else:
    folds = read_fullPHEME()
newfolds = add_partial_folds(folds)
help_prep_functions.loadW2vModel()
#%%
for fold in list(newfolds.keys()):
    print(fold)
    # NOTE(review): re-created per fold, so only the current fold's entries
    # survive each iteration — presumably intentional; verify against the
    # missing continuation below.
    preprocessed_expanded_folds = {}
    preprocessed_expanded_folds[fold] = {}
    for tree in list(newfolds[fold].keys()):
        preprocessed_expanded_folds[fold][tree] = {}
        subtree_dict = newfolds[fold][tree]
        for subtree_num in list(subtree_dict.keys()):
            preprocessed_expanded_folds[fold][tree][subtree_num] = {}
            subtree = subtree_dict[subtree_num]
            # NOTE(review): loop body appears truncated here — the
            # processing of `subtree` continues outside this excerpt.
def prep_pipeline(dataset='RumEval2019', feature_set=None):
    """Extract features for RumEval2019 (Twitter folds merged with the
    Reddit data) and save per-fold arrays under ``saved_data<dataset>/``.

    Saves, for each fold: ``train_array`` (padded branch features),
    ``labels`` (veracity labels, one per branch), ``fold_stance_labels``
    (padded per-tweet stance labels), ``ids`` (conversation ids, one per
    branch) and ``tweet_ids`` (branch tweet-id sequences).

    Parameters
    ----------
    dataset : str
        Suffix for the output directory name.
    feature_set : list[str] | None
        Feature names for ``transform_feature_dict``; defaults to
        ``['avgw2v']`` (``None`` sentinel avoids a mutable default).
    """
    if feature_set is None:
        feature_set = ['avgw2v']
    path = 'saved_data' + dataset
    folds = load_dataset()
    # Merge the Reddit conversations into every split.
    reddit = load_data()
    for split in ('train', 'dev', 'test'):
        folds[split].extend(reddit[split])
    help_prep_functions.loadW2vModel()
    #%%
    for fold in folds.keys():
        print(fold)
        feature_fold = []
        tweet_ids = []
        fold_stance_labels = []
        labels = []
        ids = []
        for conversation in folds[fold]:
            thread_feature_dict = extract_thread_features_incl_response(
                conversation)
            thread_features_array, thread_stance_labels, branches = transform_feature_dict(
                thread_feature_dict, conversation, feature_set=feature_set)
            fold_stance_labels.extend(thread_stance_labels)
            tweet_ids.extend(branches)
            feature_fold.extend(thread_features_array)
            # One row per branch: replicate the thread-level label and id.
            for _ in range(len(thread_features_array)):
                labels.append(convert_label(conversation['veracity']))
                ids.append(conversation['id'])
        # Skip empty folds — pad_sequences would fail on an empty list.
        if feature_fold:
            feature_fold = pad_sequences(feature_fold, maxlen=None,
                                         dtype='float32', padding='post',
                                         truncating='post', value=0.)
            fold_stance_labels = pad_sequences(fold_stance_labels,
                                               maxlen=None, dtype='float32',
                                               padding='post',
                                               truncating='post', value=0.)
            labels = np.asarray(labels)
            path_fold = os.path.join(path, fold)
            os.makedirs(path_fold, exist_ok=True)
            np.save(os.path.join(path_fold, 'train_array'), feature_fold)
            np.save(os.path.join(path_fold, 'labels'), labels)
            np.save(os.path.join(path_fold, 'fold_stance_labels'),
                    fold_stance_labels)
            np.save(os.path.join(path_fold, 'ids'), ids)
            np.save(os.path.join(path_fold, 'tweet_ids'), tweet_ids)
def prep_pipeline(dataset='15', seq=True, feature_set=None, timeline=False):
    """Extract features for Twitter15/Twitter16 and save per-fold arrays
    under ``saved_data_<dataset>/<fold>/{test,train}/``.

    Parameters
    ----------
    dataset : str
        '15' selects Twitter15; any other value selects Twitter16
        (matching the original if/else). Also names the output directory.
    seq : bool
        If True, one row per branch (label/id replicated per branch);
        if False, one row per conversation.
    feature_set : list[str] | None
        Feature names; defaults to ``['avgw2v']`` (``None`` sentinel avoids
        a mutable default).
    timeline : bool
        Forwarded to ``transform_feature_dict``.
    """
    if feature_set is None:
        feature_set = ['avgw2v']
    path = 'saved_data_' + dataset
    # Preserve the original dispatch: anything that isn't '15' loads '16'.
    ds = '15' if dataset == '15' else '16'
    folds_test = read_Twitter15_16(dataset=ds, set='test')
    folds_train = read_Twitter15_16(dataset=ds, set='train')
    help_prep_functions.loadW2vModel()
    # The test and train loops were identical; factored into one helper.
    _save_folds(folds_test, path, 'test', seq, feature_set, timeline)
    _save_folds(folds_train, path, 'train', seq, feature_set, timeline)


def _save_folds(folds, path, subdir, seq, feature_set, timeline):
    """Extract, pad and save features for every fold in *folds*,
    writing arrays to ``<path>/<fold>/<subdir>/``."""
    for fold in folds.keys():
        print(fold)
        feature_fold = []
        labels = []
        ids = []
        for conversation in folds[fold]:
            thread_feature_dict = extract_thread_features_incl_response(
                conversation)
            thread_features_array = transform_feature_dict(
                thread_feature_dict, conversation,
                sequential=seq, feature_set=feature_set,
                timeline=timeline)
            if seq:
                # One row per branch: replicate the thread label and id.
                feature_fold.extend(thread_features_array)
                for _ in range(len(thread_features_array)):
                    labels.append(convert_label(conversation['label']))
                    ids.append(conversation['id'])
            else:
                feature_fold.append(thread_features_array)
                labels.append(convert_label(conversation['label']))
                ids.append(conversation['id'])
        feature_fold = pad_sequences(feature_fold, maxlen=None,
                                     dtype='float32', padding='post',
                                     truncating='post', value=0.)
        labels = np.asarray(labels)
        path_fold = os.path.join(path, str(fold), subdir)
        os.makedirs(path_fold, exist_ok=True)
        np.save(os.path.join(path_fold, 'train_array'), feature_fold)
        np.save(os.path.join(path_fold, 'labels'), labels)
        np.save(os.path.join(path_fold, 'ids'), ids)
def prep_pipeline(dataset="RumEval2019", feature_set=None, use_reddit_data=True):
    """Extract features for RumEval2019 including the held-out test data,
    attach gold labels to test conversations from ``TEST_DATA_LABELS``, and
    save per-fold arrays under ``saved_data<dataset>/``.

    Parameters
    ----------
    dataset : str
        Suffix for the output directory name.
    feature_set : list[str] | None
        Feature names; defaults to ``["avgw2v"]`` (``None`` sentinel avoids
        a mutable default).
    use_reddit_data : bool
        Merge the Reddit train/dev/test conversations into the Twitter
        folds (was a hard-coded ``True`` local; now parameterized,
        defaulting to the original behavior).
    """
    if feature_set is None:
        feature_set = ["avgw2v"]
    path = "saved_data" + dataset
    folds = load_dataset()
    folds["test"] = load_test_data_twitter()["test"]
    if use_reddit_data:
        # load_data() is only called when actually needed (the old
        # unconditional call at the top fed nothing but dead code).
        reddit = load_data()
        folds["train"].extend(reddit["train"])
        folds["dev"].extend(reddit["dev"])
        folds["test"].extend(load_test_data_reddit()["test"])
    help_prep_functions.loadW2vModel()
    #%%
    # Reversed so the test fold is processed first, as in the original.
    for fold in reversed(list(folds.keys())):
        print(fold)
        feature_fold = []
        tweet_ids = []
        fold_stance_labels = []
        labels = []
        ids = []
        for conversation in folds[fold]:
            thread_feature_dict = extract_thread_features_incl_response(
                conversation)
            if fold == "test":
                # Test conversations ship without gold labels; attach
                # veracity and per-tweet stance from the answer key.
                conversation['veracity'] = TEST_DATA_LABELS["subtaskbenglish"][
                    conversation['id']]
                conversation['source']['label'] = TEST_DATA_LABELS[
                    "subtaskaenglish"][conversation['id']]
                for reply in conversation['replies']:
                    reply['label'] = TEST_DATA_LABELS["subtaskaenglish"][
                        reply['id_str']]
            (
                thread_features_array,
                thread_stance_labels,
                branches,
            ) = transform_feature_dict(thread_feature_dict, conversation,
                                       feature_set=feature_set)
            fold_stance_labels.extend(thread_stance_labels)
            tweet_ids.extend(branches)
            feature_fold.extend(thread_features_array)
            # One row per branch: replicate the thread label and id.
            for _ in range(len(thread_features_array)):
                labels.append(convert_label(conversation["veracity"]))
                ids.append(conversation["id"])
        # Skip empty folds — pad_sequences would fail on an empty list.
        if feature_fold:
            feature_fold = pad_sequences(
                feature_fold,
                maxlen=None,
                dtype="float32",
                padding="post",
                truncating="post",
                value=0.0,
            )
            fold_stance_labels = pad_sequences(
                fold_stance_labels,
                maxlen=None,
                dtype="float32",
                padding="post",
                truncating="post",
                value=0.0,
            )
            labels = np.asarray(labels)
            path_fold = os.path.join(path, fold)
            os.makedirs(path_fold, exist_ok=True)
            np.save(os.path.join(path_fold, "train_array"), feature_fold)
            np.save(os.path.join(path_fold, "labels"), labels)
            np.save(os.path.join(path_fold, "fold_stance_labels"),
                    fold_stance_labels)
            np.save(os.path.join(path_fold, "ids"), ids)
            np.save(os.path.join(path_fold, "tweet_ids"), tweet_ids)