# build the vectorizers: a CountVectorizer provides the analyzer used during
# preprocessing, a TF-IDF vectorizer builds the actual feature space
cv = tp.build_vectorizer("cv", min_df, max_df, n_features)
analyzer = cv.build_analyzer()
vectorizer = tp.build_vectorizer("tfidf")

# preprocess text data
if not use_preprocessed:
    positive_samples, negative_samples = tp.read_samples("data/philosophy.csv",
                                                         "data/no_philosophy.csv")
    # draw negative_ratio random negatives (with replacement) per positive sample
    num = negative_samples.count()[0] - 1
    rand_indexes = [randint(0, num) for _ in range(len(positive_samples) * negative_ratio)]
    negative_samples = negative_samples.iloc[rand_indexes]
    print("Positive samples:", positive_samples.count()[0])
    print("Negative samples:", negative_samples.count()[0])

    # lemmatize and clean subject, title and abstract of the positive class
    phil_subj = pd.DataFrame(tp.lemmatize_data(positive_samples[['subject']], "subject"), columns=['subject'])
    phil_subj = tp.preprocess_subjects(phil_subj)
    phil_titles = pd.DataFrame(tp.lemmatize_data(positive_samples[['title']], "title"), columns=['title'])
    phil_titles = tp.preprocess_dataframe(phil_titles, analyzer)
    phil_abs = pd.DataFrame(tp.lemmatize_data(positive_samples[['abstract']], "abstract"), columns=['abstract'])
    phil_abs = tp.preprocess_dataframe(phil_abs, analyzer)

    # same preprocessing for the negative class
    nphil_subj = pd.DataFrame(tp.lemmatize_data(negative_samples[['subject']], "subject"), columns=['subject'])
    nphil_subj = tp.preprocess_subjects(nphil_subj)
    nphil_titles = pd.DataFrame(tp.lemmatize_data(negative_samples[['title']], "title"), columns=['title'])
    nphil_titles = tp.preprocess_dataframe(nphil_titles, analyzer)
    nphil_abs = pd.DataFrame(tp.lemmatize_data(negative_samples[['abstract']], "abstract"), columns=['abstract'])
    nphil_abs = tp.preprocess_dataframe(nphil_abs, analyzer)

    # join subject, title and abstract on the row index
    phil_text = phil_subj.merge(phil_titles, left_index=True, right_index=True) \
                         .merge(phil_abs, left_index=True, right_index=True)
    nphil_text = nphil_subj.merge(nphil_titles, left_index=True, right_index=True) \
                           .merge(nphil_abs, left_index=True, right_index=True)
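# tp.build_vectorizer is defined in the local text_processing module, which is
# not shown here. A minimal sketch of what it plausibly wraps, assuming it
# returns scikit-learn vectorizers (parameter defaults are guesses, not the
# project's actual implementation):
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def build_vectorizer(kind, min_df=1, max_df=1.0, n_features=None):
    """Return a CountVectorizer ("cv") or a TfidfVectorizer ("tfidf")."""
    if kind == "cv":
        return CountVectorizer(min_df=min_df, max_df=max_df, max_features=n_features)
    return TfidfVectorizer(min_df=min_df, max_df=max_df, max_features=n_features)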
if use_preprocessed:
    # classify the already-preprocessed UK datasets, with and without abstract
    data_abs = pd.read_csv("data/UK_abs_id.csv")
    data_no_abs = pd.read_csv("data/UK_no_abs_id.csv")
    print("Number of samples:", data_abs.count()[0], data_no_abs.count()[0])
    for data, label in zip([data_abs, data_no_abs], ["abs", "no_abs"]):
        TDmatrix = vectorizer.transform(feed_preprocessed_data(data))
        res = clf.predict(TDmatrix)
        probs = clf.predict_proba(TDmatrix)
        res_df = add_columns_to_df(data, res, probs)
        res_df.to_csv("results/classification_" + label + ".csv", index=None)
        print(label, "saved")
else:
    # abstracts: read the raw UK dataset and preprocess it from scratch
    data = dsu.read_dataset_UK_id(True)
    data_subj = pd.DataFrame(tp.lemmatize_data(data[['argomento']], "argomento"), columns=['argomento'])
    data_subj = tp.preprocess_subjects(data_subj)
    data_titles = pd.DataFrame(tp.lemmatize_data(data[['titolo']], "titolo"), columns=['titolo'])
    data_titles = tp.preprocess_dataframe(data_titles, analyzer)
    data_abs = pd.DataFrame(tp.lemmatize_data(data[['abs']], "abs"), columns=['abs'])
    data_abs = tp.preprocess_dataframe(data_abs, analyzer)
    data_text = data_subj.merge(data_titles, left_index=True, right_index=True) \
                         .merge(data_abs, left_index=True, right_index=True)
    print("samples:", data_text.count()[0])
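# feed_preprocessed_data and add_columns_to_df are helpers defined elsewhere in
# the project. Plausible sketches, under the assumption that the preprocessed
# CSVs carry a "preprocessed_data" column and that predictions are appended as
# extra columns (the output column names here are illustrative):

def feed_preprocessed_data(df):
    # hand the fitted vectorizer one raw string per document
    return df["preprocessed_data"].astype(str).tolist()

def add_columns_to_df(df, res, probs):
    # attach the predicted label and its class probability to a copy of the input
    out = df.copy()
    out["predicted"] = res
    out["probability"] = probs.max(axis=1)
    return out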
# read and preprocess text data
positive_samples_train = pd.read_csv("data/new_philosophy_train.csv")
negative_samples_train = pd.read_csv("data/nophilosophy_train.csv")
test_samples = pd.read_csv(
    "data/test_set_1000.tsv", delimiter="\t",
    names=['title', 'creator', 'university', 'publisher', 'year',
           'abstract', 'type', 'subject', 'id', 'philosophy'])
print("Positive samples_train:", positive_samples_train.count()[0])
print("Negative samples_train:", negative_samples_train.count()[0])
print("Test samples:", test_samples.count()[0])

# titles only: lemmatize and clean the train and test titles
phil_titles = pd.DataFrame(tp.lemmatize_data(positive_samples_train[['title']], "title"), columns=['title'])
phil_titles = tp.preprocess_dataframe(phil_titles, analyzer)
nphil_titles = pd.DataFrame(tp.lemmatize_data(negative_samples_train[['title']], "title"), columns=['title'])
nphil_titles = tp.preprocess_dataframe(nphil_titles, analyzer)
phil_text = phil_titles
nphil_text = nphil_titles
test_titles = pd.DataFrame(tp.lemmatize_data(test_samples[['title']], "title"), columns=['title'])
test_titles = tp.preprocess_dataframe(test_titles, analyzer)
test_text = test_titles

# transform text data into vector space
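# A minimal sketch of this step, assuming a vectorizer built with
# tp.build_vectorizer as above and a scikit-learn-style classifier trained on
# the result (the variable names below are illustrative):
train_text = pd.concat([phil_text, nphil_text], ignore_index=True)
train_labels = [1] * len(phil_text) + [0] * len(nphil_text)
X_train = vectorizer.fit_transform(train_text['title'])
X_test = vectorizer.transform(test_text['title'])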
vectorizer = tp.build_vectorizer("cv", n_features=200000)
analyzer = vectorizer.build_analyzer()

# read and preprocess text data
test_df = pd.read_csv(
    "data/test_set_1000.tsv", delimiter="\t",
    names=['title', 'creator', 'university', 'publisher', 'year',
           'abstract', 'type', 'subject', 'id', 'philosophy'])
# mark missing values with a sentinel, then drop rows without an abstract
test_df = test_df.fillna("nannn")
test_df = test_df[test_df['abstract'] != "nannn"]
print("test size:", test_df.count()[0])
test_df = test_df.fillna("")

test_titles = pd.DataFrame(tp.lemmatize_data(test_df[['title']], "title"), columns=['title'])
test_titles = tp.preprocess_dataframe(test_titles, analyzer)
test_abs = pd.DataFrame(tp.lemmatize_data(test_df[['abstract']], "abstract"), columns=['abstract'])
test_abs = tp.preprocess_dataframe(test_abs, analyzer)
test_text = test_titles.merge(test_abs, left_index=True, right_index=True)

# concatenate title and abstract into a single document per row
preprocessed_data = []
for index, row in test_text.iterrows():
    preprocessed_data.append(row['title'] + " " + row['abstract'])
print("preprocessed data size:", len(preprocessed_data))
test_df.loc[:, "preprocessed_data"] = preprocessed_data

orcid_abs = pd.read_csv("preprocessed_data/orcid_abs_preprocessed.csv")[[
import joblib
import pandas as pd

import text_processing as tp
import dataset_utils as dsu

# reuse the vectorizer fitted without abstracts
vectorizer = joblib.load("models/vectorizer_noAbs.pkl")
analyzer = vectorizer.build_analyzer()

data = dsu.read_dataset_UK_ethos(True)
print("data with abs size:", data.count())

data_titles = pd.DataFrame(tp.lemmatize_data(data[['titolo']], "titolo"), columns=['titolo'])
data_titles = tp.preprocess_dataframe(data_titles, analyzer)
data_text = data_titles

# keep the preprocessed title as a single text field per record
preprocessed_data = []
for index, row in data_text.iterrows():
    preprocessed_data.append(row['titolo'])
print("preprocessed data size:", len(preprocessed_data))
data.loc[:, "preprocessed_data"] = preprocessed_data
data.to_csv("preprocessed_data/ethos_abs_preprocessed.csv", index=None,
            columns=["id", "titolo", "autore", "univ", "publisher", "anno",
                     "abs", "tipo", "argomento", "preprocessed_data"])

data = dsu.read_dataset_UK_ethos(False)
print("data with no abs size:", data.count())
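# Downstream, the persisted "preprocessed_data" column is presumably fed back
# through the loaded vectorizer and a classifier saved alongside it. A hedged
# sketch; the classifier filename is an assumption, not a file from the project:
saved = pd.read_csv("preprocessed_data/ethos_abs_preprocessed.csv")
clf = joblib.load("models/classifier_noAbs.pkl")  # hypothetical path
X = vectorizer.transform(saved["preprocessed_data"].astype(str))
predictions = clf.predict(X)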