def bag(data, num_dimensions):
    """
    Transforms a list of strings into a Bag of Words.
    :param data: a list of strings
    :param num_dimensions: maximum vocabulary size kept by the vectorizer
    :return: (feature matrix as a numpy array, list of feature names)
    """
    data = [preprocessing.clean(row) for row in data]

    # Initialize the "CountVectorizer" object, which is scikit-learn's
    # bag of words tool.
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words='english',
                                 max_features=num_dimensions)

    # fit_transform() does two things: first, it fits the model and learns
    # the vocabulary; second, it transforms our training data into feature
    # vectors. The input to fit_transform should be a list of strings.
    train_data_features = vectorizer.fit_transform(data)

    # Numpy arrays are easy to work with, so convert the result to an array.
    train_data_features = train_data_features.toarray()
    return train_data_features, vectorizer.get_feature_names()
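# A minimal usage sketch for bag() above; it assumes the project's
# preprocessing.clean() is importable and that CountVectorizer comes from
# sklearn.feature_extraction.text. The toy corpus and the 50-feature cap
# are illustrative only.
toy_corpus = ["The cat sat on the mat.", "Dogs and cats make good pets."]
features, vocabulary = bag(toy_corpus, num_dimensions=50)
print(features.shape)  # (2, n_features): one bag-of-words row per document
print(vocabulary)      # the word behind each column, in column order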
def get_test_data(self, input_path):
    """Load the test text and labels: clean the raw text for the "FT" tree
    type, otherwise transform it with the fitted vectorizer."""
    test_text = load_text(input_path)
    test_label = load_labels(input_path)
    if self.tree_type == "FT":
        test_text2 = np.array([clean(t) for t in test_text])
        return test_text2, test_label
    else:
        test_text = self.vectorizer.transform(test_text)
        return test_text, test_label
def return_tfidf(text_data):
    """
    Runs the scikit-learn TF-IDF vectorizer to compute the table of word
    n-grams vs. frequency. The input should be a pandas Series (or similar)
    of text attributes.
    """
    aTFIDF_model = ml_feature_extract.text.TfidfVectorizer(analyzer='word',
                                                           ngram_range=(2, 3))
    text_data = text_data.apply(lambda x: x.lower())
    text_data = text_data.apply(lambda x: preproc.clean(x))
    aTFIDF_model.fit(text_data)
    text_data_tfidf = aTFIDF_model.transform(text_data)
    words = aTFIDF_model.get_feature_names()
    print(words)
    return text_data_tfidf, words
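# A minimal usage sketch for return_tfidf() above; it assumes pandas is
# imported as pd, ml_feature_extract is sklearn.feature_extraction, and
# preproc.clean() is the project's text cleaner. The toy Series is
# illustrative only.
sample = pd.Series(["The cat sat on the mat", "The dog chased the cat"])
tfidf_matrix, ngrams = return_tfidf(sample)
print(tfidf_matrix.shape)  # (2, number of 2- and 3-gram features)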
def extract_vectors(self, stories, cap):
    """
    Extracts truncated semantic relevance vectors (the top n most relevant
    words, in order of relevance).
    :param stories: a list of story strings
    :param cap: maximum number of (relevance, word) pairs to keep per story
    :return: a list with one truncated, relevance-sorted vector per story
    """
    stories = [preprocessing.clean(row, True, True).split(' ') for row in stories]
    sorted_vecs = []
    for story in stories:
        # Score each word by its summed pairwise relevance to every other word.
        relevance = [0] * len(story)
        for i in range(len(story)):
            for j in range(len(story)):
                if i != j:
                    relevance[i] += self.compute(story[i], story[j])
        relevance = [round(x, 2) for x in relevance]
        # Sort (relevance, word) pairs in descending order and keep the top `cap`.
        vec = sorted(set(zip(relevance, story)), reverse=True)
        sorted_vecs.append(vec[:cap])
    return sorted_vecs
# Main should just call the other files.
import support_vector_machine as svm
import knn as knn
import preprocessing as preprocessing
import CSV_creator as csv_maker
import matplotlib.pyplot as plt

print('starting...')
file_path = csv_maker.read()
data_frame = preprocessing.read_file(file_path)
data_frame = preprocessing.clean(data_frame)

# Plot score averages against the number of features for the SVM...
score_averages = svm.get_plot_feature_scores(data_frame)
plt.plot(score_averages)
plt.ylabel("score averages")
plt.xlabel("number of features")
plt.show()

# ...and for k-nearest neighbours.
score_averages = knn.get_plot_feature_scores(data_frame)
plt.plot(score_averages)
plt.ylabel("score averages")
plt.xlabel("number of features")
plt.show()
def get_train_data(self, node):
    """
    Get the data for this node from the text list or the vectorizer.
    :param node: the tree node whose training data is needed
    :return: a path to a temp file in fastText format for "FT" trees,
             otherwise (feature matrix rows, next-level labels)
    """
    if self.tree_type == "FT":
        node_txt = self.text[node.input]
        node_labels = self.labels[node.input]
        # Write the node's sentences as "__label__<label> <text>" lines.
        with open("./temp.txt", "w") as f:
            for i, sentence in enumerate(node_txt):
                ls = node_labels[i].split('/')
                if node.level < len(ls) - 1:
                    f.write("__label__" + ls[node.level + 1] + " " + clean(sentence) + "\n")
                else:
                    f.write("__label__" + "* " + sentence + "\n")
        return "./temp.txt"
    else:
        node_labels = self.labels[node.input]
        next_level_labels = []
        for label in node_labels:
            ls = label.split('/')
            if node.level < len(ls) - 1:
                next_level_labels.append(ls[node.level + 1])
            else:
                next_level_labels.append('*')
        return self.matrix[node.input], next_level_labels
def preprocessed(documents):
    for document in documents:
        document = clean(document)
        document = extract_words(document)
        yield document
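# A minimal usage sketch for the preprocessed() generator above; it assumes
# clean() and extract_words() come from the project's preprocessing helpers,
# and the two raw documents are illustrative only.
raw_docs = ["First raw document!", "Second raw document?"]
for words in preprocessed(raw_docs):
    print(words)  # one cleaned, tokenized document per iteration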
def model(q, ra, sa, mtype):
    #
    # add preprocessing calls here
    #
    #
    # load the saved model and return prediction
    #
    pred = ""
    data = pd.DataFrame([(q, ra, sa)],
                        columns=['question', 'ref_answer', 'stu_answer'])

    q_basic = ['q_word_count', 'q_char_count', 'q_avg_word']
    a_basic = [
        'r_word_count', 'r_char_count', 'r_avg_word',
        's_word_count', 's_char_count', 's_avg_word'
    ]
    q_pos_basic = ['q_nouns', 'q_adjectives', 'q_verbs']
    q_pos_adv = [
        'q_nouns_vs_length', 'q_adjectives_vs_length', 'q_verbs_vs_length',
        'q_nouns_vs_words', 'q_adjectives_vs_words', 'q_verbs_vs_words'
    ]
    a_pos_basic = [
        'r_nouns', 'r_adjectives', 'r_verbs',
        's_nouns', 's_adjectives', 's_verbs',
    ]
    a_pos_adv = [
        'r_nouns_vs_length', 'r_adjectives_vs_length', 'r_verbs_vs_length',
        'r_nouns_vs_words', 'r_adjectives_vs_words', 'r_verbs_vs_words',
        's_nouns_vs_length', 's_adjectives_vs_length', 's_verbs_vs_length',
        's_nouns_vs_words', 's_adjectives_vs_words', 's_verbs_vs_words'
    ]
    similarity = ['Jaccard', 'bm25']
    rouge1 = ['r1_f', 'r1_p', 'r1_r']
    rouge2 = ['r2_f', 'r2_p', 'r2_r']
    rougel = ['rlcs_f', 'rlcs_p', 'rlcs_r']
    new_pos1 = [
        's_verbs_vs_r_verbs', 's_nouns_vs_r_nouns',
        's_adjectives_vs_r_adjectives', 's_word_count_vs_r_word_count',
        's_nouns_vs_words_vs_r_nouns_vs_words',
        's_verbs_vs_words_vs_r_verbs_vs_words',
        's_adjectives_vs_words_vs_r_adjectives_vs_words'
    ]
    new_pos2 = [
        'rs_word_diff', 'rs_noun_vs_words_diff', 'rs_verb_vs_words_diff',
        'rs_adjectives_vs_words_diff'
    ]
    ibm_feat = ['precision', 'recall', 'F1_score']
    q_tags = [
        'how_flag', 'what_flag', 'why_flag', 'who_flag', 'which_flag',
        'when_flag', 'where_flag', 'whom_flag'
    ]
    features = (q_basic + a_basic + q_pos_basic + q_pos_adv + a_pos_basic +
                a_pos_adv + similarity + rouge1 + rouge2 + rougel + new_pos1 +
                ibm_feat + q_tags)

    columns = ['question', 'ref_answer', 'stu_answer']
    temp = pp.get_basic_features(data, columns)
    cleaning_tasks = ['lemma', 'num']
    temp = pp.clean(temp, cleaning_tasks, columns)
    temp = pp.get_basic_POS(temp, columns)
    temp = pp.get_advanced_POS(temp, columns)

    sim_columns = ['ref_answer', 'stu_answer']
    temp = pp.get_Jaccard(temp, sim_columns)
    temp['bm25'] = 0

    scores = pp.get_Rogue(temp, sim_columns)
    r1 = pd.DataFrame(scores)['rouge-1'].apply(pd.Series)
    r2 = pd.DataFrame(scores)['rouge-2'].apply(pd.Series)
    r3 = pd.DataFrame(scores)['rouge-l'].apply(pd.Series)
    r = pd.concat([r1, r2, r3], axis=1)
    r.columns = [
        'r1_f', 'r1_p', 'r1_r', 'r2_f', 'r2_p', 'r2_r',
        'rlcs_f', 'rlcs_p', 'rlcs_r'
    ]
    temp = pd.concat([temp, r], axis=1)

    temp = pp.get_new_POS1(temp)
    temp = pp.get_new_POS2(temp)
    temp['precision'] = 0
    temp['recall'] = 0
    temp['F1_score'] = 0
    temp = pp.get_question_tags(temp)
    temp.drop(['question', 'ref_answer', 'stu_answer'], axis=1, inplace=True)

    temp = temp[features]
    inp_feat = np.array(temp)

    if mtype == "classifier":
        loaded_model = joblib.load("classifier.sav")
        pred = loaded_model.predict(inp_feat).ravel()[0]
        if pred >= 0.5:
            return "Correct"
        else:
            return "Incorrect"
    else:
        loaded_model = joblib.load("regressor.sav")
        pred = loaded_model.predict(inp_feat).ravel()[0]
        return str(pred)
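# A minimal usage sketch for model() above; it assumes the pp preprocessing
# module and the saved classifier.sav / regressor.sav files are available,
# and the question and answers are illustrative only.
question = "What does photosynthesis produce?"
reference = "Photosynthesis produces glucose and oxygen."
student = "It makes sugar and oxygen."
print(model(question, reference, student, "classifier"))  # "Correct" or "Incorrect"
print(model(question, reference, student, "regressor"))   # a numeric score as a string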
def predict(review: Review, model=Depends(load_model)):
    # `vectorizer` and `prediction_model` are module-level objects loaded at startup.
    text_clean = preprocessing.clean(review.text)
    text_tfidf = vectorizer.transform([text_clean])
    sentiment = prediction_model.predict(text_tfidf)
    review.sentiment = Sentiment(sentiment.item()).name
    return review
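# A minimal usage sketch for the predict endpoint above; the FastAPI app
# object, the "/predict" route, and the Review payload fields are assumptions
# not shown in the snippet.
from fastapi.testclient import TestClient

client = TestClient(app)
response = client.post("/predict", json={"text": "Great movie, loved it!"})
print(response.json())  # the review echoed back with its predicted sentiment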
"""Create an orthography profile for grapheme tokenization.""" from collections import OrderedDict from segments import Profile from filenames import GRAPHEME_PROFILE from preprocessing import clean from utils import read # Read in all EvaLatin training data into a single pyconll CoNLL structure conll = read() # Collect all the word forms text = "" for sentence in conll: for token in sentence: text += clean(token.form) + " " # Create orthography profile profile = Profile.from_text(text) profile.column_labels.remove("frequency") profile.graphemes.pop(" ") for key in ["ch", "qu", "th", "rh", "ph", "gn"]: profile.graphemes[key] = OrderedDict([("mapping", key[0].upper())]) profile.graphemes.move_to_end(key, last=False) with open(GRAPHEME_PROFILE, "w") as file: file.write(str(profile))
import pandas as pd

import vector as v
import preprocessing as p
import cluster2 as c
import classifier as r

a = pd.read_csv("Z:/TermPaper/twitter_cred-master/data.csv")

print("cleaning....")
doc, id1 = p.clean(a)

print("vectorizing....")
dvec, global_vector = v.vectorize(doc)

print("clustering....")
g, t = c.cluster(dvec, global_vector, id1)

cnt = 0
x = []
print(len(t))

print("credibility calculating")
r.classifier(g)
def test_clean_text_fun(self):
    self.assertEqual(clean("</a>This :) is :( a test :-)!"),
                     'this is a test :) :( :)')
# -*- coding: utf-8 -*-
"""
Created on Sat Jul 02 11:23:43 2016

@author: Sandip Baishnab
"""
# importing modules
import pandas as pd

from preprocessing import clean
from feature_extract import feature_class
from classifier import classification

# reading data
training_data = pd.read_csv("C:/Sandip_Debjani/Sandip/Git/program/data/Sem_Eval/train.txt",
                            header=0, sep='\t')
test_data = pd.read_csv("C:/Sandip_Debjani/Sandip/Git/program/data/Sem_Eval/test.txt",
                        header=0, sep='\t')

# creating objects for cleaning, feature generation and classification
cl = clean()
ft = feature_class()
clf = classification()

# preprocess
preprocessed_train = cl.preprocess(training_data['tweet'])
preprocessed_test = cl.preprocess(test_data['tweet'])

features_train, features_test = ft.feature_function(preprocessed_train, preprocessed_test)
result = clf.model_svm(features_train, list(training_data['polarity']), features_test)
print(result)
def run():
    time = dt.datetime.now()
    print("Fold {} start {:%H:%M:%S %d-%m-%Y}".format(n_fold, time))
    results.append(clf.classification(train, test, train_lengths, test_lengths))
    print("Fold {} end {:%H:%M:%S %d-%m-%Y}".format(n_fold, dt.datetime.now()))


read_input()
orders, data, test_data = [], [], []  # data = list of tuples (sim_name, has_damage, sim)
load_dataset()
sims, sims_labels = preprocessing.clean(data, min(orders))
if conf.separated_test:
    test_sims, _ = preprocessing.clean(test_data, min(orders))
del orders, data, test_data

results = []
n_fold = 1
if conf.separated_test:
    train, train_lengths = numpy.concatenate(sims, axis=0), [len(sim) for sim in sims]
    test, test_lengths = numpy.concatenate(test_sims, axis=0), [len(sim) for sim in test_sims]
    train[:, :-1], test[:, :-1] = preprocessing.normalization(train[:, :-1], test[:, :-1])
    run()
else:
def checkPost(post):
    cleanedPost = clean(post)
    return classification(cleanedPost)