Example #1
import preprocessing  # project module providing clean()
from sklearn.feature_extraction.text import CountVectorizer


def bag(data, num_dimensions):
    """
    Transforms a list of strings into a bag-of-words feature matrix.
    :param data: a list of strings
    :param num_dimensions: maximum vocabulary size (passed to max_features)
    :return: (feature matrix as a numpy array, list of vocabulary terms)
    """

    data = [preprocessing.clean(row) for row in data]

    # Initialize the "CountVectorizer" object, which is scikit-learn's
    # bag of words tool.
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words='english',
                                 max_features=num_dimensions)

    # fit_transform() does two things: first, it fits the model and
    # learns the vocabulary; second, it transforms our training data
    # into feature vectors. The input to fit_transform should be a
    # list of strings.
    train_data_features = vectorizer.fit_transform(data)

    # Numpy arrays are easy to work with, so convert the result to an
    # array
    train_data_features = train_data_features.toarray()

    return train_data_features, vectorizer.get_feature_names()
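
A minimal usage sketch of the same CountVectorizer configuration, skipping the project-specific preprocessing.clean step (note that in scikit-learn >= 1.0, get_feature_names() has become get_feature_names_out()):

from sklearn.feature_extraction.text import CountVectorizer

docs = ["The cat sat on the mat.", "The dog chased the cat."]
vectorizer = CountVectorizer(analyzer="word", stop_words="english",
                             max_features=1000)
features = vectorizer.fit_transform(docs).toarray()
print(features.shape)                      # (2, 5)
print(vectorizer.get_feature_names_out())  # ['cat' 'chased' 'dog' 'mat' 'sat']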
Example #2
    def get_test_data(self, input_path):
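        """Load the test text and labels; return cleaned raw text for
        fastText ("FT") trees, otherwise the vectorizer-transformed text."""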
        test_text = load_text(input_path)
        test_label = load_labels(input_path)
        if self.tree_type == "FT":
            test_text2 = np.array([clean(t) for t in test_text])
            return test_text2, test_label

        else:
            test_text = self.vectorizer.transform(test_text)
            return test_text, test_label
Example #3
import sklearn.feature_extraction as ml_feature_extract  # assumed import alias
import preprocessing as preproc  # assumed alias for the project's preprocessing module


def return_tfidf(text_data):
    """ Runs the scikit-learn TF-IDF vectorizer to compute the term/document
        frequency table; text_data should be a pandas Series of raw text. """

    aTFIDF_model = ml_feature_extract.text.TfidfVectorizer(analyzer='word', ngram_range=(2, 3))
    text_data = text_data.apply(lambda x: x.lower())
    text_data = text_data.apply(lambda x: preproc.clean(x))
    aTFIDF_model.fit(text_data)
    text_data_tfidf = aTFIDF_model.transform(text_data)
    words = aTFIDF_model.get_feature_names()
    print(words)

    return text_data_tfidf, words
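
For reference, a self-contained sketch of the same TfidfVectorizer configuration (bigrams and trigrams) on a small pandas Series, without the project-specific cleaning step:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

texts = pd.Series(["The quick brown fox", "The lazy brown dog"])
model = TfidfVectorizer(analyzer='word', ngram_range=(2, 3))
tfidf = model.fit_transform(texts.str.lower())
print(tfidf.shape)                    # (2, number of distinct bi/trigrams)
print(model.get_feature_names_out())  # e.g. ['brown dog' 'brown fox' ...]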
Example #4
    def extract_vectors(self, stories, cap):
        """
        Extracts truncated semantic relevance vectors (top n most relevant words in order of relevance).
        :param stories:
        :param cap:
        :return:
        """

        stories = [preprocessing.clean(row, True, True).split(' ') for row in stories]
        sorted_vecs = []

        for story in stories:
            relevance = [0] * len(story)
            for i in range(len(story)):
                for j in range(len(story)):
                    if i != j:
                        relevance[i] += self.compute(story[i], story[j])
            relevance = [round(x, 2) for x in relevance]
            vec = sorted(set(zip(relevance, story)), reverse=True)
            sorted_vecs.append(vec[:cap])

        return sorted_vecs
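
The quadratic loop above scores each word by its summed pairwise relevance to every other word in the story. A toy illustration of the same ranking pattern, with a hypothetical character-overlap score standing in for self.compute:

def overlap(a, b):
    # Hypothetical stand-in for self.compute: Jaccard overlap of characters.
    return len(set(a) & set(b)) / len(set(a) | set(b))

story = "the cat sat on the mat".split(' ')
relevance = [round(sum(overlap(story[i], story[j])
                       for j in range(len(story)) if j != i), 2)
             for i in range(len(story))]
print(sorted(set(zip(relevance, story)), reverse=True)[:3])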
Example #5
# Main should just call into the other modules

import support_vector_machine as svm
import knn
import preprocessing
import CSV_creator as csv_maker
import matplotlib.pyplot as plt

print('starting...')

file_path = csv_maker.read()

data_frame = preprocessing.read_file(file_path)
data_frame = preprocessing.clean(data_frame)

score_averages = svm.get_plot_feature_scores(data_frame)

plt.plot(score_averages)
plt.ylabel("score averages")
plt.xlabel("number of features")

plt.show()

score_averages = knn.get_plot_feature_scores(data_frame)

plt.plot(score_averages)
plt.ylabel("score averages")
plt.xlabel("number of features")

plt.show()
Example #6
    def get_train_data(self, node):
        """
        Get the data for this node from the text list or the vectorizer.
        :param node: the tree node whose training data is requested
        :return: a path to a fastText-formatted temp file, or a
                 (feature matrix, labels) tuple
        """
        if self.tree_type == "FT":
            node_txt = self.text[node.input]
            node_labels = self.labels[node.input]
            with open("./temp.txt", "w") as f:
                for i, sentence in enumerate(node_txt):
                    ls = node_labels[i].split('/')
                    if node.level < len(ls) - 1:
                        f.write("__label__" + ls[node.level + 1] + " " + clean(sentence) + "\n")
                    else:
                        f.write("__label__" + "* " + sentence + "\n")
            return "./temp.txt"
        else:
            node_labels = self.labels[node.input]
            next_level_labels = []
            for label in node_labels:
                ls = label.split('/')
                if node.level < len(ls) - 1:
                    next_level_labels.append(ls[node.level + 1])
                else:
                    next_level_labels.append('*')
            return self.matrix[node.input], next_level_labels
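
The "FT" branch writes one line per sentence in fastText's supervised training format, labelling each sentence with the next level of its '/'-separated label path (or '*' when the path is exhausted). A standalone illustration of that formatting, with hypothetical labels:

labels = ["root/science/physics", "root/sports", "root"]
sentences = ["atoms and quarks", "the final score", "miscellaneous"]
level = 0  # generating training data for the children of "root"
for label, sentence in zip(labels, sentences):
    ls = label.split('/')
    child = ls[level + 1] if level < len(ls) - 1 else '*'
    print("__label__" + child + " " + sentence)
# __label__science atoms and quarks
# __label__sports the final score
# __label__* miscellaneous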
Example #7
def preprocessed(documents):
    for document in documents:
        document = clean(document)
        document = extract_words(document)
        yield document
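
Because preprocessed() is a generator, documents are cleaned lazily, one at a time, as the caller iterates. A usage sketch with hypothetical stand-ins for the project's clean() and extract_words():

def clean(document):
    return document.lower().strip()   # stand-in for the project's clean()

def extract_words(document):
    return document.split(' ')        # stand-in for extract_words()

for words in preprocessed(["Hello World", "Lazy Streams"]):
    print(words)  # ['hello', 'world'], then ['lazy', 'streams']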
Example #8
import joblib
import numpy as np
import pandas as pd

import preprocessing as pp  # assumed alias; the feature helpers below live here


def model(q, ra, sa, mtype):
    # Build a one-row DataFrame from the question, reference answer and
    # student answer, engineer the same features used at training time,
    # then load the saved model and return its prediction.
    data = pd.DataFrame([(q, ra, sa)],
                        columns=['question', 'ref_answer', 'stu_answer'])

    q_basic = ['q_word_count', 'q_char_count', 'q_avg_word']
    a_basic = [
        'r_word_count', 'r_char_count', 'r_avg_word', 's_word_count',
        's_char_count', 's_avg_word'
    ]
    q_pos_basic = ['q_nouns', 'q_adjectives', 'q_verbs']
    q_pos_adv = [
        'q_nouns_vs_length', 'q_adjectives_vs_length', 'q_verbs_vs_length',
        'q_nouns_vs_words', 'q_adjectives_vs_words', 'q_verbs_vs_words'
    ]
    a_pos_basic = [
        'r_nouns',
        'r_adjectives',
        'r_verbs',
        's_nouns',
        's_adjectives',
        's_verbs',
    ]
    a_pos_adv = [
        'r_nouns_vs_length', 'r_adjectives_vs_length', 'r_verbs_vs_length',
        'r_nouns_vs_words', 'r_adjectives_vs_words', 'r_verbs_vs_words',
        's_nouns_vs_length', 's_adjectives_vs_length', 's_verbs_vs_length',
        's_nouns_vs_words', 's_adjectives_vs_words', 's_verbs_vs_words'
    ]
    similarity = ['Jaccard', 'bm25']
    rouge1 = ['r1_f', 'r1_p', 'r1_r']
    rouge2 = ['r2_f', 'r2_p', 'r2_r']
    rougel = ['rlcs_f', 'rlcs_p', 'rlcs_r']
    new_pos1 = [
        's_verbs_vs_r_verbs', 's_nouns_vs_r_nouns',
        's_adjectives_vs_r_adjectives', 's_word_count_vs_r_word_count',
        's_nouns_vs_words_vs_r_nouns_vs_words',
        's_verbs_vs_words_vs_r_verbs_vs_words',
        's_adjectives_vs_words_vs_r_adjectives_vs_words'
    ]
    new_pos2 = [
        'rs_word_diff', 'rs_noun_vs_words_diff', 'rs_verb_vs_words_diff',
        'rs_adjectives_vs_words_diff'
    ]
    ibm_feat = ['precision', 'recall', 'F1_score']
    q_tags = [
        'how_flag', 'what_flag', 'why_flag', 'who_flag', 'which_flag',
        'when_flag', 'where_flag', 'whom_flag'
    ]

    features = (q_basic + a_basic + q_pos_basic + q_pos_adv + a_pos_basic +
                a_pos_adv + similarity + rouge1 + rouge2 + rougel +
                new_pos1 + ibm_feat + q_tags)

    columns = ['question', 'ref_answer', 'stu_answer']

    temp = pp.get_basic_features(data, columns)

    cleaning_tasks = ['lemma', 'num']
    temp = pp.clean(temp, cleaning_tasks, columns)

    temp = pp.get_basic_POS(temp, columns)
    temp = pp.get_advanced_POS(temp, columns)

    sim_columns = ['ref_answer', 'stu_answer']
    temp = pp.get_Jaccard(temp, sim_columns)

    temp['bm25'] = 0

    scores = pp.get_Rogue(temp, sim_columns)
    r1 = pd.DataFrame(scores)['rouge-1'].apply(pd.Series)
    r2 = pd.DataFrame(scores)['rouge-2'].apply(pd.Series)
    r3 = pd.DataFrame(scores)['rouge-l'].apply(pd.Series)
    r = pd.concat(
        [r1, r2, r3],
        axis=1,
    )
    r.columns = [
        'r1_f', 'r1_p', 'r1_r', 'r2_f', 'r2_p', 'r2_r', 'rlcs_f', 'rlcs_p',
        'rlcs_r'
    ]
    temp = pd.concat([temp, r], axis=1)

    temp = pp.get_new_POS1(temp)
    temp = pp.get_new_POS2(temp)

    temp['precision'] = 0
    temp['recall'] = 0
    temp['F1_score'] = 0

    temp = pp.get_question_tags(temp)

    temp.drop(['question', 'ref_answer', 'stu_answer'], axis=1, inplace=True)
    temp = temp[features]

    inp_feat = np.array(temp)
    if mtype == "classifier":
        loaded_model = joblib.load("classifier.sav")
        pred = loaded_model.predict(inp_feat).ravel()[0]
        if pred >= 0.5:
            return "Correct"
        else:
            return "Incorrect"
    else:
        loaded_model = joblib.load("regressor.sav")
        pred = loaded_model.predict(inp_feat).ravel()[0]
        return str(pred)
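
The final step assumes classifier.sav and regressor.sav were serialized earlier with joblib. A minimal sketch of that save/load round trip, with a toy model standing in for the real one:

import joblib
import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.random.rand(20, 5)
y = np.array([0, 1] * 10)
clf = LogisticRegression().fit(X, y)
joblib.dump(clf, "classifier.sav")            # what the training script would do

loaded_model = joblib.load("classifier.sav")  # what model() does at inference
print(loaded_model.predict(X[:1]).ravel()[0])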
Example #9
def predict(review: Review, model=Depends(load_model)):
    text_clean = preprocessing.clean(review.text)
    text_tfidf = vectorizer.transform([text_clean])
    sentiment = model.predict(text_tfidf)
    review.sentiment = Sentiment(sentiment.item()).name
    return review
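
A self-contained sketch of the FastAPI wiring this handler appears to assume; the Review model, Sentiment enum, model filename, and route are reconstructed for illustration, not taken from the source:

from enum import Enum
from typing import Optional

import joblib
from fastapi import Depends, FastAPI
from pydantic import BaseModel

app = FastAPI()

class Sentiment(Enum):
    negative = 0
    positive = 1

class Review(BaseModel):
    text: str
    sentiment: Optional[str] = None

def load_model():
    return joblib.load("sentiment_model.sav")  # hypothetical filename

@app.post("/predict")
def predict(review: Review, model=Depends(load_model)):
    ...  # handler body as in the example above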
Example #10
"""Create an orthography profile for grapheme tokenization."""

from collections import OrderedDict

from segments import Profile

from filenames import GRAPHEME_PROFILE
from preprocessing import clean
from utils import read

# Read in all EvaLatin training data into a single pyconll CoNLL structure
conll = read()

# Collect all the word forms
text = ""
for sentence in conll:
    for token in sentence:
        text += clean(token.form) + " "

# Create orthography profile
profile = Profile.from_text(text)
profile.column_labels.remove("frequency")
profile.graphemes.pop(" ")
for key in ["ch", "qu", "th", "rh", "ph", "gn"]:
    profile.graphemes[key] = OrderedDict([("mapping", key[0].upper())])
    profile.graphemes.move_to_end(key, last=False)
with open(GRAPHEME_PROFILE, "w") as file:
    file.write(str(profile))
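
Once written, the profile can drive grapheme tokenization via segments' Tokenizer, assuming it accepts the profile path; a short sketch reusing GRAPHEME_PROFILE from above:

from segments import Tokenizer

from filenames import GRAPHEME_PROFILE

tokenizer = Tokenizer(profile=GRAPHEME_PROFILE)
print(tokenizer("quoque"))  # digraphs such as "qu" stay whole, e.g. "qu o qu e"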
Example #11
import pandas as pd
import vector as v
import preprocessing as p
import cluster2 as c
import classifier as r
a = pd.read_csv("Z:/TermPaper/twitter_cred-master/data.csv")
print("cleaning....")
doc, id1 = p.clean(a)
print("vectorizing....")
dvec, global_vector = v.vectorize(doc)
print("clustering....")
g, t = c.cluster(dvec, global_vector, id1)
cnt = 0
x = []
print(len(t))
print("credibility calculating")
r.classifier(g)
Example #12
    def test_clean_text_fun(self):
        self.assertEqual(clean("</a>This :) is :( a test :-)!"), 'this is a test :) :( :)')
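
The assertion suggests clean() strips markup, lowercases, removes stray punctuation, and normalizes emoticon variants. A runnable harness around this test, assuming the project's clean is importable from preprocessing:

import unittest

from preprocessing import clean  # module under test

class TestClean(unittest.TestCase):
    def test_clean_text_fun(self):
        self.assertEqual(clean("</a>This :) is :( a test :-)!"),
                         'this is a test :) :( :)')

if __name__ == "__main__":
    unittest.main()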
Example #13
# -*- coding: utf-8 -*-
"""
Created on Sat Jul 02 11:23:43 2016

@author: Sandip Baishnab
"""

#importing modules
import pandas as pd
from preprocessing import clean 
from feature_extract import feature_class
from classifier import classification

#reading data
training_data=pd.read_csv("C:/Sandip_Debjani/Sandip/Git/program/data/Sem_Eval/train.txt",header=0,sep='\t')
test_data=pd.read_csv("C:/Sandip_Debjani/Sandip/Git/program/data/Sem_Eval/test.txt",header=0,sep='\t')

#creating objects for cleaning, feature generation and classification
cl=clean()
ft=feature_class()
clf=classification()

#preprocess
preprocessed_train=cl.preprocess(training_data['tweet'])
preprocessed_test=cl.preprocess(test_data['tweet'])
features_train,features_test=ft.feature_function(preprocessed_train,preprocessed_test)
result=clf.model_svm(features_train,list(training_data['polarity']),features_test)
print(result)
Example #14

def run():
    time = dt.datetime.now()
    print("Fold {} start {:%H:%M:%S %d-%m-%Y}".format(n_fold, time))
    results.append(clf.classification(train, test, train_lengths,
                                      test_lengths))
    print("Fold {} end {:%H:%M:%S %d-%m-%Y}".format(n_fold, dt.datetime.now()))


read_input()
orders, data, test_data = [], [], []  # data = list of tuples (sim_name, has_damage, sim)
load_dataset()

sims, sims_labels = preprocessing.clean(data, min(orders))
if conf.separated_test:
    test_sims, _ = preprocessing.clean(test_data, min(orders))
del orders, data, test_data

results = []
n_fold = 1
if conf.separated_test:
    train, train_lengths = numpy.concatenate(
        sims, axis=0), [len(sim) for sim in sims]
    test, test_lengths = numpy.concatenate(
        test_sims, axis=0), [len(sim) for sim in test_sims]
    train[:, :-1], test[:, :-1] = preprocessing.normalization(
        train[:, :-1], test[:, :-1])
    run()
else:
Example #15
def checkPost(post):
    cleanedPost = clean(post)
    return classification(cleanedPost)