Exemplo n.º 1
0
            yield df2.iloc[index-size1]['preprocessed_data']
        else:
            #print_sample(df1.iloc[index]['preprocessed_data'], True)
            yield df1.iloc[index]['preprocessed_data']


# Build the "tfidf" vectorizer; its analyzer is passed to the preprocessing
# helpers below.
cv = tp.build_vectorizer("tfidf", min_df, max_df, n_features)
analyzer = cv.build_analyzer()
vectorizer = cv

# read and preprocess text data
test_df = pd.read_csv(
    "data/test_set_1000.tsv",
    delimiter="\t",
    names=['title', 'creator', 'university', 'publisher', 'year',
           'abstract', 'type', 'subject', 'id', 'philosophy'],
)
# Non-null count of the first column. `.iloc[0]` makes the lookup explicitly
# positional: plain `[0]` on the label-indexed Series returned by count() is
# deprecated in recent pandas.
print("test size:", test_df.count().iloc[0])
test_df = test_df.fillna("")

# Lemmatize and preprocess the title and abstract columns separately, then
# put them side by side, aligned on the row index.
test_titles = pd.DataFrame(tp.lemmatize_data(test_df[['title']], "title"), columns=['title'])
test_titles = tp.preprocess_dataframe(test_titles, analyzer)
test_abs = pd.DataFrame(tp.lemmatize_data(test_df[['abstract']], "abstract"), columns=['abstract'])
test_abs = tp.preprocess_dataframe(test_abs, analyzer)
test_text = test_titles.merge(test_abs, left_index=True, right_index=True)

# One "<title> <abstract>" string per row, in row order.
preprocessed_data = [
    title + " " + abstract
    for title, abstract in zip(test_text['title'], test_text['abstract'])
]
print("preprocessed data size:", len(preprocessed_data))

# The list is assigned positionally, so it must have one entry per test_df row.
test_df.loc[:, "preprocessed_data"] = preprocessed_data

# Load the already-preprocessed corpora, keeping only the text column.
orcid_abs = pd.read_csv("preprocessed_data/orcid_abs_preprocessed.csv")[['preprocessed_data']]
doiboost_abs = pd.read_csv("preprocessed_data/doiboost_abs_preprocessed.csv")[['preprocessed_data']]
data_abs = pd.concat([orcid_abs, doiboost_abs])
orcid_no_abs = pd.read_csv("preprocessed_data/orcid_no_abs_preprocessed.csv")[['preprocessed_data']]
Exemplo n.º 2
0
        "preprocessed_data/ethos_no_abs_preprocessed.csv")
    # Non-null count of each frame's first column. `.iloc[0]` is explicitly
    # positional; plain `[0]` on the label-indexed Series returned by count()
    # is deprecated in recent pandas.
    print("Number of samples:", data_abs.count().iloc[0],
          data_no_abs.count().iloc[0])

    # For each dataset: vectorize, predict labels and probabilities, then save
    # the frame returned by add_columns_to_df under a per-dataset file name.
    for data, label in zip([data_abs, data_no_abs], ["abs", "no_abs"]):
        TDmatrix = vectorizer.transform(feed_preprocessed_data(data))
        res = clf.predict(TDmatrix)
        probs = clf.predict_proba(TDmatrix)
        res_df = add_columns_to_df(data, res, probs)
        # `index` is documented as a bool; False omits the row index column.
        res_df.to_csv("results/classification_" + label + ".csv", index=False)
        print(label, "saved")
else:
    #abstracts
    data = dsu.read_dataset_UK_ethos(True)
    # Only the 'titolo' column is lemmatized and preprocessed here; it is the
    # sole text fed to the vectorizer in this branch.
    data_titles = pd.DataFrame(tp.lemmatize_data(data[['titolo']], "titolo"),
                               columns=['titolo'])
    data_titles = tp.preprocess_dataframe(data_titles, analyzer)
    data_text = data_titles
    # Non-null count of the first column. `.iloc[0]` is explicitly positional;
    # plain `[0]` on the label-indexed Series returned by count() is
    # deprecated in recent pandas.
    print("samples:", data_text.count().iloc[0])

    # Vectorize, predict labels and probabilities, then save the frame
    # returned by add_columns_to_df.
    TDmatrix = vectorizer.transform(feed_data(data_text, True))
    res = clf.predict(TDmatrix)
    probs = clf.predict_proba(TDmatrix)
    res_df = add_columns_to_df(data, res, probs)
    # `index` is documented as a bool; False omits the row index column.
    res_df.to_csv("results/classification_abs.csv", index=False)
    print("abs saved")

    #no abstracts
    data_no_abs = dsu.read_dataset_UK_id(False)
    # Lemmatize the titles of the dataset loaded just above. The previous code
    # read them from `data` (the abstracts dataset), so this section
    # reprocessed the wrong rows.
    data_titles = pd.DataFrame(tp.lemmatize_data(data_no_abs[['titolo']], "titolo"),
                               columns=['titolo'])
    data_titles = tp.preprocess_dataframe(data_titles, analyzer)
Exemplo n.º 3
0
vectorizer = tp.build_vectorizer("tfidf")


def _lemmatized_column(samples, column):
    """Return a one-column DataFrame holding the lemmatized *column* of *samples*."""
    return pd.DataFrame(tp.lemmatize_data(samples[[column]], column), columns=[column])


# preprocess text data
if not use_preprocessed:
    positive_samples, negative_samples = tp.read_samples("data/philosophy.csv", "data/no_philosophy.csv")

    # Draw len(positive_samples) * negative_ratio negative rows at random.
    # randint is inclusive on both ends, hence the "- 1" on the upper bound.
    # `.iloc[0]` is explicitly positional; plain `[0]` on the label-indexed
    # Series returned by count() is deprecated in recent pandas.
    # NOTE(review): positions are drawn independently, so the same negative
    # row can be picked more than once - confirm that is acceptable.
    num = negative_samples.count().iloc[0] - 1
    rand_indexes = [randint(0, num) for _ in range(len(positive_samples) * negative_ratio)]
    negative_samples = negative_samples.iloc[rand_indexes]

    print("Positive samples:", positive_samples.count().iloc[0])
    print("Negative samples:", negative_samples.count().iloc[0])

    # Positive class: subject, title and abstract are lemmatized separately;
    # subjects go through their own preprocessing routine.
    phil_subj = tp.preprocess_subjects(_lemmatized_column(positive_samples, "subject"))
    phil_titles = tp.preprocess_dataframe(_lemmatized_column(positive_samples, "title"), analyzer)
    phil_abs = tp.preprocess_dataframe(_lemmatized_column(positive_samples, "abstract"), analyzer)

    # Negative class: same steps on the sampled negative rows.
    nphil_subj = tp.preprocess_subjects(_lemmatized_column(negative_samples, "subject"))
    nphil_titles = tp.preprocess_dataframe(_lemmatized_column(negative_samples, "title"), analyzer)
    nphil_abs = tp.preprocess_dataframe(_lemmatized_column(negative_samples, "abstract"), analyzer)

    # Put the three text columns side by side, aligned on the row index, and
    # save both frames so later runs can take the use_preprocessed branch.
    phil_text = (
        phil_subj
        .merge(phil_titles, left_index=True, right_index=True)
        .merge(phil_abs, left_index=True, right_index=True)
    )
    nphil_text = (
        nphil_subj
        .merge(nphil_titles, left_index=True, right_index=True)
        .merge(nphil_abs, left_index=True, right_index=True)
    )
    tp.save_lemmatized_data(phil_text, nphil_text, "preprocessed_data/phil_text.csv", "preprocessed_data/nphil_text.csv")
else:
    # Read the two frames from the same paths the other branch writes to.
    phil_text, nphil_text = tp.read_lemmatized_data("preprocessed_data/phil_text.csv", "preprocessed_data/nphil_text.csv")
Exemplo n.º 4
0
data = tp.remove_missing_abstract(pd.read_csv("no_philosophy.csv"))
#data = tp.select_missing_abstract(pd.read_csv("no_philosophy.csv"))

# Load the fitted vectorizer and classifier from disk.
# NOTE(review): joblib.load unpickles arbitrary objects - only load files
# from a trusted source.
vectorizer = joblib.load("vectorizer.pkl")
analyzer = vectorizer.build_analyzer()
#clf = joblib.load("randomforestCLF.pkl")
clf = joblib.load("LinearSVC_CLF.pkl")

# Repeatedly pick a random row, rebuild its text and classify it.
while True:
    # randint is inclusive on both ends, so the upper bound must be the last
    # valid position. The previous bound, min(data.count()), could yield a
    # position one past the end and was based on non-null counts rather than
    # the number of rows.
    index = randint(0, len(data) - 1)
    row = data.iloc[[index]]  # double brackets keep a one-row DataFrame

    abstract_df = pd.DataFrame(tp.lemmatize_data(row, "abstract", False),
                               columns=['abstract'])
    abstract = tp.preprocess_dataframe(abstract_df,
                                       analyzer).iloc[0]["abstract"]

    title_df = pd.DataFrame(tp.lemmatize_data(row, "title", False),
                            columns=['title'])
    title = tp.preprocess_dataframe(title_df, analyzer).iloc[0]["title"]

    subject_df = pd.DataFrame(tp.lemmatize_data(row, "subject", False),
                              columns=['subject'])
    # Subjects only have punctuation stripped; they are not run through
    # preprocess_dataframe like title and abstract.
    subject = re.sub(r'[\.\,\(\)\[\]\;\']', '', subject_df.iloc[0]["subject"])

    # Join subject, title and abstract into the single text that is classified.
    text = ' '.join([subject, title, abstract])
    vec = vectorizer.transform([text])
    res = clf.predict(vec)
    #probs = clf.predict_proba(vec)
    if res[0] == 1:
        print("\n")
Exemplo n.º 5
0
# Load the test set; column names are given explicitly via `names`.
test_samples = pd.read_csv("data/test_set_1000.tsv",
                           delimiter="\t",
                           names=[
                               'title', 'creator', 'university', 'publisher',
                               'year', 'abstract', 'type', 'subject', 'id',
                               'philosophy'
                           ])

# Non-null count of each frame's first column. `.iloc[0]` is explicitly
# positional; plain `[0]` on the label-indexed Series returned by count() is
# deprecated in recent pandas.
print("Positive samples_train:", positive_samples_train.count().iloc[0])
print("Negative samples_train:", negative_samples_train.count().iloc[0])
print("Test samples:", test_samples.count().iloc[0])

# Only titles are used as text in this script: lemmatize and preprocess them
# for the positive and negative training sets.
phil_titles = pd.DataFrame(tp.lemmatize_data(positive_samples_train[['title']],
                                             "title"),
                           columns=['title'])
phil_titles = tp.preprocess_dataframe(phil_titles, analyzer)
nphil_titles = pd.DataFrame(tp.lemmatize_data(
    negative_samples_train[['title']], "title"),
                            columns=['title'])
nphil_titles = tp.preprocess_dataframe(nphil_titles, analyzer)
phil_text = phil_titles
nphil_text = nphil_titles

# Same title-only preprocessing for the test set.
test_titles = pd.DataFrame(tp.lemmatize_data(test_samples[['title']], "title"),
                           columns=['title'])
test_titles = tp.preprocess_dataframe(test_titles, analyzer)
test_text = test_titles

# transform text data into vector space
# feed_data() is called without arguments and is defined outside this
# excerpt. NOTE(review): it presumably reads the *_text frames built above -
# confirm.
vectorizer.fit(feed_data())
joblib.dump(vectorizer, "models/vectorizer.pkl")