# --- Example #1 ---
# Build the vectorizers: a CountVectorizer whose analyzer is reused for all
# text preprocessing, and a TF-IDF vectorizer for the final feature space.
cv = tp.build_vectorizer("cv", min_df, max_df, n_features)
analyzer = cv.build_analyzer()
vectorizer = tp.build_vectorizer("tfidf")


def _lemmatize_column(frame, column):
    """Lemmatize one text column of *frame* and return it as a single-column DataFrame."""
    return pd.DataFrame(tp.lemmatize_data(frame[[column]], column),
                        columns=[column])


# preprocess text data
if not use_preprocessed:
    positive_samples, negative_samples = tp.read_samples(
        "data/philosophy.csv", "data/no_philosophy.csv")

    # Down-sample negatives to `negative_ratio` times the positive count.
    # len(df) is the true row count; the former count()[0] returned the
    # non-null count of the FIRST COLUMN only, which under-counts whenever
    # that column has missing values.
    # NOTE(review): randint draws WITH replacement, so duplicate negative
    # rows are possible; switch to negative_samples.sample(n=..., replace=False)
    # if distinct rows are required.
    max_index = len(negative_samples) - 1
    rand_indexes = [randint(0, max_index)
                    for _ in range(len(positive_samples) * negative_ratio)]
    negative_samples = negative_samples.iloc[rand_indexes]

    print("Positive samples:", len(positive_samples))
    print("Negative samples:", len(negative_samples))

    # Lemmatize + preprocess each text field of the positive samples.
    phil_subj = tp.preprocess_subjects(
        _lemmatize_column(positive_samples, "subject"))
    phil_titles = tp.preprocess_dataframe(
        _lemmatize_column(positive_samples, "title"), analyzer)
    phil_abs = tp.preprocess_dataframe(
        _lemmatize_column(positive_samples, "abstract"), analyzer)

    # Same pipeline for the negative samples.
    nphil_subj = tp.preprocess_subjects(
        _lemmatize_column(negative_samples, "subject"))
    nphil_titles = tp.preprocess_dataframe(
        _lemmatize_column(negative_samples, "title"), analyzer)
    nphil_abs = tp.preprocess_dataframe(
        _lemmatize_column(negative_samples, "abstract"), analyzer)

    # Row-aligned join of subject/title/abstract into one DataFrame per class.
    phil_text = phil_subj.merge(
        phil_titles, left_index=True, right_index=True).merge(
            phil_abs, left_index=True, right_index=True)
    nphil_text = nphil_subj.merge(
        nphil_titles, left_index=True, right_index=True).merge(
            nphil_abs, left_index=True, right_index=True)
# --- Example #2 ---
if use_preprocessed:
    # Already-preprocessed text: just vectorize, classify, and save results.
    data_abs = pd.read_csv("data/UK_abs_id.csv")
    data_no_abs = pd.read_csv("data/UK_no_abs_id.csv")
    # len(df) is the true row count; count()[0] only counted non-null
    # entries of the first column.
    print("Number of samples:", len(data_abs), len(data_no_abs))

    for data, label in zip([data_abs, data_no_abs], ["abs", "no_abs"]):
        TDmatrix = vectorizer.transform(feed_preprocessed_data(data))
        res = clf.predict(TDmatrix)          # predicted class labels
        probs = clf.predict_proba(TDmatrix)  # per-class probabilities
        res_df = add_columns_to_df(data, res, probs)
        res_df.to_csv("results/classification_" + label + ".csv", index=None)
        print(label, "salvato")
else:
    # abstracts: lemmatize + preprocess the subject, title, abstract columns
    data = dsu.read_dataset_UK_id(True)
    data_subj = pd.DataFrame(tp.lemmatize_data(data[['argomento']],
                                               "argomento"),
                             columns=['argomento'])
    data_subj = tp.preprocess_subjects(data_subj)
    data_titles = pd.DataFrame(tp.lemmatize_data(data[['titolo']], "titolo"),
                               columns=['titolo'])
    data_titles = tp.preprocess_dataframe(data_titles, analyzer)
    data_abs = pd.DataFrame(tp.lemmatize_data(data[['abs']], "abs"),
                            columns=['abs'])
    data_abs = tp.preprocess_dataframe(data_abs, analyzer)

    # Row-aligned join of the three preprocessed text columns.
    data_text = data_subj.merge(
        data_titles, left_index=True, right_index=True).merge(
            data_abs, left_index=True, right_index=True)
    print("samples:", len(data_text))
# --- Example #3 ---
# read and preprocess text data
positive_samples_train = pd.read_csv("data/new_philosophy_train.csv")
# BUG FIX: this line previously re-assigned positive_samples_train,
# clobbering the positive set and leaving negative_samples_train (used
# below) undefined -> NameError at runtime.
negative_samples_train = pd.read_csv("data/nophilosophy_train.csv")
test_samples = pd.read_csv("data/test_set_1000.tsv",
                           delimiter="\t",
                           names=[
                               'title', 'creator', 'university', 'publisher',
                               'year', 'abstract', 'type', 'subject', 'id',
                               'philosophy'
                           ])

# len(df) is the true row count; count()[0] only counts non-nulls in the
# first column.
print("Positive samples_train:", len(positive_samples_train))
print("Negative samples_train:", len(negative_samples_train))
print("Test samples:", len(test_samples))

# Titles only: lemmatize, then run the vectorizer's analyzer over each title.
phil_titles = pd.DataFrame(tp.lemmatize_data(positive_samples_train[['title']],
                                             "title"),
                           columns=['title'])
phil_titles = tp.preprocess_dataframe(phil_titles, analyzer)
nphil_titles = pd.DataFrame(tp.lemmatize_data(
    negative_samples_train[['title']], "title"),
                            columns=['title'])
nphil_titles = tp.preprocess_dataframe(nphil_titles, analyzer)
phil_text = phil_titles
nphil_text = nphil_titles

test_titles = pd.DataFrame(tp.lemmatize_data(test_samples[['title']], "title"),
                           columns=['title'])
test_titles = tp.preprocess_dataframe(test_titles, analyzer)
test_text = test_titles

# transform text data into vector space
# --- Example #4 ---
# CountVectorizer with a large vocabulary; its analyzer drives preprocessing.
vectorizer = tp.build_vectorizer("cv", n_features=200000)
analyzer = vectorizer.build_analyzer()

# read and preprocess text data
test_df = pd.read_csv("data/test_set_1000.tsv",
                      delimiter="\t",
                      names=[
                          'title', 'creator', 'university', 'publisher',
                          'year', 'abstract', 'type', 'subject', 'id',
                          'philosophy'
                      ])
# Mark missing values with a sentinel, then drop rows without an abstract.
test_df = test_df.fillna("nannn")
test_df = test_df[test_df['abstract'] != "nannn"]
# len(df) is the true row count; count()[0] only counts non-nulls in the
# first column.  The second fillna("") is kept for safety, although no NaN
# can remain after the sentinel pass above.
print("test size:", len(test_df))
test_df = test_df.fillna("")

test_titles = pd.DataFrame(tp.lemmatize_data(test_df[['title']], "title"),
                           columns=['title'])
test_titles = tp.preprocess_dataframe(test_titles, analyzer)
test_abs = pd.DataFrame(tp.lemmatize_data(test_df[['abstract']], "abstract"),
                        columns=['abstract'])
test_abs = tp.preprocess_dataframe(test_abs, analyzer)
test_text = test_titles.merge(test_abs, left_index=True, right_index=True)

# Vectorized "title abstract" concatenation, replacing the former
# iterrows() append loop (identical output, no per-row Python overhead).
preprocessed_data = (test_text['title'] + " " + test_text['abstract']).tolist()
print("preprocessed data size:", len(preprocessed_data))

# Assigned positionally: test_text has a fresh RangeIndex matching the row
# order of the filtered test_df even though test_df's index is non-contiguous.
test_df.loc[:, "preprocessed_data"] = preprocessed_data
orcid_abs = pd.read_csv("preprocessed_data/orcid_abs_preprocessed.csv")[[
# NOTE: `from sklearn.externals import joblib` was removed in scikit-learn
# 0.23; the standalone joblib package is the documented replacement.
import joblib
import pandas as pd

import dataset_utils as dsu
import text_processing as tp

# Reuse the persisted vectorizer so preprocessing matches training exactly.
vectorizer = joblib.load("models/vectorizer_noAbs.pkl")
analyzer = vectorizer.build_analyzer()

# Dataset WITH abstracts: preprocess titles and persist alongside metadata.
data = dsu.read_dataset_UK_ethos(True)
print("data with abs size:", data.count())
data_titles = pd.DataFrame(tp.lemmatize_data(data[['titolo']], "titolo"),
                           columns=['titolo'])
data_titles = tp.preprocess_dataframe(data_titles, analyzer)
data_text = data_titles

# For this dataset the "preprocessed" text is simply the raw title column;
# .tolist() replaces the former iterrows() append loop (identical output).
preprocessed_data = data['titolo'].tolist()
print("preprocessed data size:", len(preprocessed_data))

data.loc[:, "preprocessed_data"] = preprocessed_data

data.to_csv("preprocessed_data/ethos_abs_preprocessed.csv",
            index=None,
            columns=[
                "id", "titolo", "autore", "univ", "publisher", "anno", "abs",
                "tipo", "argomento", "preprocessed_data"
            ])

# Dataset WITHOUT abstracts (its processing presumably continues past this
# chunk of the file).
data = dsu.read_dataset_UK_ethos(False)
print("data with no abs size:", data.count())