def read_jsonl_folder(json_folder):
    """
    Read instances.jsonl and truth.jsonl from json_folder, merge them,
    clean the merged dataframe, and save it as CSV and pickle files.
    json_folder: path to the folder that contains the two .jsonl files
    Return the path of the saved pickle file
    """
    inst_columns = ['id', "targetTitle", "targetParagraphs"]  # , 'postMedia', 'postText']
    truth_columns = ["id", "truthClass"]  # , "truthMode", "truthJudgments"]
    path_inst_file = json_folder + "/instances.jsonl"
    path_truth_file = json_folder + "/truth.jsonl"
    merged_df = prepare_json_data(path_inst_file, path_truth_file,
                                  inst_columns, truth_columns)
    # Strip list brackets and quotes left over from the raw JSON titles
    merged_df["targetTitle"] = merged_df["targetTitle"].progress_map(
        lambda x: str(x).strip("[").strip(']').strip("\'").strip('\"'))
    #merged_df['postText'] = merged_df['postText'].progress_map(lambda x: ' '.join(map(str, x)))
    #merged_df['postMedia'] = merged_df['postMedia'].progress_map(lambda x: 0 if x == "[]" else 1)
    merged_df['targetParagraphs'] = merged_df['targetParagraphs'].progress_map(
        lambda x: ' '.join(map(str, x)))
    #merged_df["truthScale"] = merged_df["truthMode"].progress_map(lambda x: "non" if x == 0.0 else ("slightly" if 0.3 < x < 0.6 else ("considerable" if 0.6 < x < 1 else "heavy")))
    merged_df["truthClass"] = merged_df["truthClass"].progress_map(
        lambda x: "CB" if x == "clickbait" else "Non")
    # Drop boilerplate titles scraped from navigation menus
    drop_df = merged_df[~merged_df.targetTitle.str.contains("Sections Shows Live Yahoo!")]
    final_df = drop_df[~drop_df.targetTitle.str.contains("Top stories Top stories")]
    write_csv_file(final_df, json_folder)
    pk_file = save_pk_file(final_df, json_folder)
    #split_json_data(final_df, save_to)
    print(final_df[:3])
    return pk_file
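# Usage sketch (illustrative only): read_jsonl_folder expects a folder that
# holds instances.jsonl and truth.jsonl; the folder name below is an
# assumption, not part of the repository.
#
#   pk_path = read_jsonl_folder("Data/clickbait17")
#   df = load_pk_file(pk_path)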
def read_gz_folder(gz_folder):
    """
    Unpack the .gz files in gz_folder, merge their contents into one dataframe,
    and save it as CSV and pickle files.
    gz_folder: path to folder containing .gz files
    Return the path of the saved pickle file
    """
    df_list = []
    for read_file in tqdm(glob.glob(os.path.join(gz_folder, '*.gz'))):
        file_name = read_file.replace(".gz", ".txt")
        gz_to_txt(read_file, file_name)
        df = read_txt(file_name)
        df_list.append(df)
    merged_df = pd.concat(df_list)
    write_csv_file(merged_df, gz_folder)
    pk_file = save_pk_file(merged_df, gz_folder)
    print(merged_df[:5])
    return pk_file
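# Usage sketch (illustrative only; the folder name is an assumption):
#
#   pk_path = read_gz_folder("Data/chakraborty")
#   df = load_pk_file(pk_path)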
def headline_features(in_folder, w2v_model):
    """
    Extract stylometric and word-embedding features from headlines
    in_folder: path to folder containing the .pk data files
    w2v_model: word embedding model
    """
    sty_vecs = []
    w2v_vecs = []
    labels = []
    for file_name in glob.glob(os.path.join(in_folder, '*.pk')):
        print(f"\nReading {file_name}\n")
        df = load_pk_file(file_name)
        sty_vec = extract_sty_feat(df)
        sty_vecs.append(sty_vec)
        w2v_vec = create_w2v(w2v_model, df["targetTitle"])
        w2v_vecs.append(w2v_vec)
        labels.append(df["truthClass"])
    print("Concatenating feature vectors")
    X_sty = np.concatenate(sty_vecs, axis=0)
    print(f"Stylometric: {X_sty.shape}")
    X_w2v = np.concatenate(w2v_vecs, axis=0)
    print(f"Word2vec: {X_w2v.shape}")
    X_cmb = np.concatenate((X_w2v, X_sty), axis=1)
    print(f"Combined: {X_cmb.shape}")
    y = np.asarray(pd.concat(labels))
    print("Saving feature vectors")
    print("Stylometric")
    sty_file = "Vector/sty"
    save_pk_file((X_sty, y), sty_file)
    print("Word2vec")
    w2v_file = "Vector/w2v"
    save_pk_file((X_w2v, y), w2v_file)
    print("Combined")
    cmb_file = "Vector/cmb"
    save_pk_file((X_cmb, y), cmb_file)
    print("Done")
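# Usage sketch (illustrative only): the model path and the use of gensim's
# KeyedVectors are assumptions; any model accepted by create_w2v works.
#
#   from gensim.models import KeyedVectors
#   w2v = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)
#   headline_features("Train", w2v)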
def preprocess(in_folder, out_folder):
    """
    Preprocess the data files in in_folder and save the processed data to out_folder
    in_folder: path to folder containing the data files
    out_folder: path to folder in which the processed data files are saved
    """
    processed_folder = create_folder_path(out_folder)
    for file_name in glob.glob(os.path.join(in_folder, '*.pk')):
        print(f"Reading {file_name}")
        load_data = load_pk_file(file_name)
        headlines = load_data["targetTitle"]
        processed_headlines = preprocess_text(headlines)
        processed_data = pd.concat([headlines, processed_headlines], axis=1)
        processed_data["truthClass"] = load_data["truthClass"]
        if len(load_data.columns) > 2:
            # The corpus also contains article bodies: extract content features
            processed_data["targetParagraphs"] = load_data["targetParagraphs"]
            docs = processed_data["targetParagraphs"].progress_map(
                lambda text: [] if text == [] else Text_process(text))
            processed_data["cont_sent"] = docs.progress_map(
                lambda doc: [] if doc == [] else Text_process.tokenised_sentencier(doc))
            processed_data["cont_num_sent"] = processed_data["cont_sent"].progress_map(len)
            processed_data["cont_avr_sent_len"] = processed_data["cont_sent"].progress_map(
                lambda x: 0 if len(x) == 0 else round(sum(len(i) for i in x) / len(x)))
            processed_data["cont_token"] = docs.progress_map(
                lambda doc: [] if doc == [] else Text_process.tokenizer(doc))
            processed_data["cont_num_token"] = processed_data["cont_token"].progress_map(len)
            processed_data["cont_avr_token_len"] = processed_data["cont_token"].progress_map(
                lambda x: 0 if len(x) == 0 else round(sum(len(i) for i in x) / len(x)))
            processed_data["cont_arg"] = docs.progress_map(
                lambda doc: Text_process.get_arg(doc))
            processed_data["cont_root"] = docs.progress_map(
                lambda doc: Text_process.get_root(doc))
            processed_data["cont_ent"] = docs.progress_map(
                lambda doc: Text_process.get_ent(doc))
            processed_data["cont_ent_label"] = docs.progress_map(
                lambda doc: Text_process.get_ent_label(doc))
            processed_data["cont_senti_score"] = docs.progress_map(
                lambda doc: Text_process.senti_score(doc))
            # Similarity between the headline tokens and each content sentence
            sim_scores = []
            for i, row in processed_data[["token", "cont_sent"]].iterrows():
                if row["cont_sent"] == []:
                    score = "NA"
                    sim_scores.append(score)
                else:
                    sim_score = similarity_calculator(row["token"], row["cont_sent"])
                    sim_scores.append(sim_score)
                    #processed_data.at[i, 'sim_score'] = sim_score
            processed_data["sim_score"] = sim_scores
            processed_data["avr_sim_score"] = processed_data["sim_score"].progress_map(
                lambda score: "NA" if score == "NA" else float(np.sum(score) / len(score)))
            processed_data["sim_pct"] = processed_data["sim_score"].progress_map(
                lambda score: "NA" if score == "NA" else round(
                    np.count_nonzero(score) / len(score) * 100))
            processed_data = processed_data.drop(columns=['sim_score'])
        save_file_name = processed_folder + '/' + os.path.basename(file_name).replace(".pk", "")
        #write_csv_file(processed_data, save_file_name)
        save_pk_file(processed_data, save_file_name)
        print(save_file_name)
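# Usage sketch (illustrative only; "Data" is an assumed input folder,
# "Processed_data" matches the folder read by the sampling script below):
#
#   preprocess("Data", "Processed_data")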
import glob
import os

import pandas as pd
from sklearn.utils import shuffle

from helpers import load_pk_file, save_pk_file, create_folder_path

if __name__ == "__main__":
    # Balance the corpora: keep every clickbait sample and randomly draw the
    # same number of non-clickbait samples from the Potthast (clickbait17)
    # corpus; downsample the Chakraborty corpus to 5000 samples per class.
    folder = create_folder_path("Train")
    Potthast_corpus = []
    Chakraborty_corpus = []
    for file_name in glob.glob(os.path.join("Processed_data", '*.pk')):
        df = load_pk_file(file_name)
        cb = df.loc[df['truthClass'] == "CB"]
        non = df.loc[df['truthClass'] == "Non"]
        if "clickbait17" in file_name:
            Potthast_corpus.append(cb)
            Potthast_corpus.append(non.sample(n=len(cb)))
        else:
            Chakraborty_corpus.append(cb.sample(n=5000))
            Chakraborty_corpus.append(non.sample(n=5000))
    save_Potthast = pd.concat(Potthast_corpus)
    save_Chakraborty = pd.concat(Chakraborty_corpus)
    Potthast_data = shuffle(save_Potthast, random_state=5)
    Chakraborty_data = shuffle(save_Chakraborty, random_state=5)
    save_pk_file(Potthast_data, folder + "/Potthast_data")
    save_pk_file(Chakraborty_data, folder + "/Chakraborty_data")
def content_features(in_folder, w2v_model, d2v_model):
    """
    Extract features from article contents
    in_folder: path to folder containing the .pk data files
    w2v_model: word embedding model
    d2v_model: document embedding model
    """
    sty_vecs = []
    w2v_vecs = []
    d2v_vecs = []
    labels = []
    for file_name in glob.glob(os.path.join(in_folder, '*.pk')):
        print(f"\nReading {file_name}\n")
        df = load_pk_file(file_name)
        if "targetParagraphs" in df.columns:
            # Only keep rows that actually have article content
            filtered_df = df[df["targetParagraphs"].apply(lambda x: len(x) > 0)]
            sty_vec = extract_sty_feat(filtered_df)
            sty_vecs.append(sty_vec)
            w2v_vec = create_w2v(w2v_model, filtered_df["targetTitle"])
            w2v_vecs.append(w2v_vec)
            labels.append(filtered_df["truthClass"])
            # Average the document vectors of the content sentences
            sent_vec = filtered_df["cont_sent"].progress_map(
                lambda x: np.asarray([create_d2v(d2v_model, i) for i in x]))
            avr_sent_vec = sent_vec.progress_map(lambda x: np.mean(x, axis=0))
            d2v_vec = np.asarray(list(avr_sent_vec))
            # Hand-crafted content features, vectorised with DictVectorizer
            features = []
            for i, row in filtered_df.iterrows():
                feat = dict()
                feat["cont_num_token"] = row["cont_num_token"]
                feat["cont_avr_token_len"] = row["cont_avr_token_len"]
                feat["cont_senti_score"] = row["cont_senti_score"]
                feat["avr_sim_score"] = row["avr_sim_score"]
                feat["sim_pct"] = row["sim_pct"]
                features.append(feat)
            dict_vtrz = DictVectorizer(sparse=False)
            dict_vect = dict_vtrz.fit_transform(features)
            d2v_vecs.append(np.concatenate((d2v_vec, dict_vect), axis=1))
    print("Concatenating feature vectors")
    X_sty = np.concatenate(sty_vecs, axis=0)
    print(f"Stylometry: {X_sty.shape}")
    X_w2v = np.concatenate(w2v_vecs, axis=0)
    print(f"Word2vec: {X_w2v.shape}")
    X_d2v = np.concatenate(d2v_vecs, axis=0)
    print(f"Doc2vec: {X_d2v.shape}")
    X_cmb = np.concatenate((X_w2v, X_sty, X_d2v), axis=1)
    print(f"Combined: {X_cmb.shape}")
    y = list(pd.concat(labels))
    print("Saving feature vectors")
    print("Doc2vec")
    cmb_file = "Vector/d2v"
    save_pk_file((X_cmb, y), cmb_file)
    print("Done")
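# Usage sketch (illustrative only): the gensim loading calls, model paths,
# and the "Train" folder are assumptions; any models accepted by create_w2v
# and create_d2v work.
#
#   from gensim.models import KeyedVectors, Doc2Vec
#   w2v = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)
#   d2v = Doc2Vec.load("doc2vec.model")
#   content_features("Train", w2v, d2v)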