def genfeature(self, ls_x):
    '''
    a. Shallow features
       1. number of words in the sentence (normalized)
       2. average number of characters in the words
       3. percentage of stop words
       4. minimum, maximum and average inverse document frequency
    :param ls_x: sentences X without label
    :return: list of shallow feature vectors, one per sentence
    '''
    vectorizer = TfidfVectorizer(stop_words='english', smooth_idf=True,
                                 sublinear_tf=False, use_idf=True)
    tfidf = vectorizer.fit_transform(ls_x)
    array = tfidf.toarray()
    X = []
    append = X.append
    maxtoken = 0
    for idx, l in enumerate(ls_x):
        ws = l.split()
        maxtoken = max(len(ws), maxtoken)
        # share of stop words; default to 0.0 so the value is defined even if the lookup fails
        stops = 0.0
        try:
            stops = round(sum(1 for w in ws if w in self.tweetmanager.stop) / (len(ws) + 1e-10), 2)
        except AttributeError:
            pass
        append([len(ws), self.avgch(ws), stops,
                min(array[idx]), max(array[idx]),
                sum(array[idx]) / len(array[idx])])
    # normalize the raw word count by the longest sentence seen
    return [[round(x[0] * 1.0 / maxtoken, 2)] + x[1:] for x in X]
def cosine_similarity(city1_content, city2_content):
    """Computes tf-idf (term frequency-inverse document frequency) vectors and then
    calculates the cosine similarity between the two Wikipedia pages."""
    vectorizer = TfidfVectorizer(tokenizer=generate_stemmed_tokens, stop_words='english')
    tfidf = vectorizer.fit_transform([city1_content, city2_content])
    # rows are L2-normalized, so the dot product of the two rows is their cosine similarity
    return ((tfidf * tfidf.T).A)[0, 1]
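# A minimal usage sketch of cosine_similarity above. The two strings are stand-ins for
# real Wikipedia page texts, and generate_stemmed_tokens is assumed to be defined
# elsewhere in this module (it is referenced by the function).
seattle = "Seattle is a seaport city on the West Coast of the United States."
portland = "Portland is a port city in the Pacific Northwest region of the United States."
print(cosine_similarity(seattle, portland))  # value in [0, 1]; higher means more similar pages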
def getData():
    train_data = load_files('dataset/train')
    test_data = load_files("dataset/test")
    count_Vec = TfidfVectorizer(min_df=1, decode_error="replace")
    doc_train = count_Vec.fit_transform(train_data.data)
    doc_test = count_Vec.transform(test_data.data)
    return doc_train.toarray(), train_data.target, doc_test.toarray(), test_data.target
def trainClassifier(classifier, X, y): vectorizer = TfidfVectorizer(analyzer='char', use_idf=True, sublinear_tf=True, stop_words='english', ngram_range=(1,3), lowercase=True) # vectorizer = TfidfVectorizer() X = vectorizer.fit_transform(X).toarray() if classifier == "SVC": clf = LinearSVC() # parameters = {'kernel':['linear', 'rbf'], 'C':[0.1, 1, 10]} # clf = grid_search.GridSearchCV(clf, parameters) clf.fit(X, y) # print clf.best_params_ clf_vect = [clf, vectorizer] f = open('svm.pkl', 'wb') pickle.dump(clf_vect, f) return clf, vectorizer elif classifier == "RF": clf = RandomForestClassifier() clf.fit(X, y) return clf, vectorizer elif classifier == "MNB": clf = MultinomialNB() clf.fit(X, y) return clf, vectorizer elif classifier == "LDA": clf = LDA() clf.fit(X, y) return clf, vectorizer elif classifier == "KNN": clf = KNeighborsClassifier() clf.fit(X, y) return clf, vectorizer
def tfidf_vectorize(corpus):
    ## This object does all the job. For more information about
    ## the semantics of the arguments, please read the documentation at
    ## http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
    ## TfidfVectorizer takes care of the stop-words.
    corpus_specific_stopwords.extend(ENGLISH_STOP_WORDS)
    vectorizer = TfidfVectorizer(max_df=0.5,
                                 token_pattern='[A-Za-z]{3,}',  # we restrict to words with 3 or more characters
                                 min_df=50,  # words occurring less than this number of times are discarded
                                 stop_words=corpus_specific_stopwords,  # We use a standard set of stop words for the English language plus some words we already identified
                                 use_idf=True  # Use tf/idf, i.e., term frequency divided by the term's document frequency
                                 )
    ## We store the documents and the text of the articles in different lists.
    texts = []
    documents = []
    ## Iterating the corpus
    for doc in corpus:
        texts.append(doc.text)
        ## We store the title and the category as a pair.
        ## The category is the topic of the article. We will use
        ## it as ground truth when calculating purity and entropy.
        documents.append(doc)
    ## This call constructs the document-term matrix. It returns a sparse matrix of
    ## type http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.sparse.csr_matrix.html
    doc_term_matrix = vectorizer.fit_transform(texts)
    ## We return the ground truth clustering, the document-term matrix and the vectorizer object
    return documents, doc_term_matrix, vectorizer
def get_data(self, abstract=False): data = self.mongo.get_all(order_by='id_doc') data = [doc for doc in data] if abstract: only_text = self.get_data_with_abstract(data) else: only_text = [doc['text'] for doc in data] only_labels = [doc['label'] for doc in data] tfidf_vectorizer = TfidfVectorizer(max_df=0.5, max_features=200000, min_df=2, stop_words='english', strip_accents='unicode', use_idf=True, ngram_range=(1, 1), norm='l2', tokenizer=TextUtils.tokenize_and_stem) tfidf_matrix = tfidf_vectorizer.fit_transform(only_text) print 'After tfidf vectorizer: found %s documents and %s terms' \ % (tfidf_matrix.shape[0], tfidf_matrix.shape[1]) dict_out = {} for l in sorted(set(only_labels)): dict_out[l] = { 'docs': [], 'fscore': '' } for doc in data: dict_out[doc['label']]['docs'].append(doc['id_doc']) return tfidf_matrix, dict_out
def preprocess(article_file, label_file, k):
    features = pickle.load(open(article_file))
    features = np.array(features)

    # transform non-numerical labels (as long as they are hashable and comparable) to numerical labels
    labels = pickle.load(open(label_file))
    le = preprocessing.LabelEncoder()
    le.fit(labels)
    labels = le.transform(labels)
    # print le.inverse_transform([0])

    ### text vectorization -- go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, min_df=1,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features)

    # selector: SelectPercentile
    selector = SelectPercentile(f_classif, percentile=k)
    selector.fit(features_train_transformed, labels)

    # selector: chi2
    # selector = SelectPercentile(score_func=chi2)
    # selector.fit(features_train_transformed, labels)

    features_train_transformed = selector.transform(features_train_transformed).toarray()

    return features_train_transformed, labels, vectorizer, selector, le, features
def tfidf(synopses):
    tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                       min_df=0.2, stop_words='english',
                                       use_idf=True, tokenizer=tokenize_and_stem,
                                       ngram_range=(1, 3))
    tfidf_matrix = tfidf_vectorizer.fit_transform(synopses)
    terms = tfidf_vectorizer.get_feature_names()
    print("terms:", terms)
    print(tfidf_matrix.shape)
    return terms, tfidf_matrix  # return the tf-idf matrix
def compute_tf_idf_vectorizer(data_path="/Users/HyNguyen/Documents/Research/Data/stories",
                              save_path="exsum/tf_idf_vectorizer_200_05.pickle",
                              min_df=200, max_df=0.5):
    """
    Detail: fit a tf-idf vectorizer on the story collection and pickle it
    Params:
        data_path: data directory
        save_path: where the fitted vectorizer is saved; suffix 200_05 means min_df=200, max_df=0.5 (fraction of documents)
        min_df: lower bound on document frequency
        max_df: upper bound on document frequency
    """
    dataset = loadData(data_path)
    documents = []
    for counter, sample in enumerate(dataset):
        filename, contents, highlights = sample
        content_str = ""
        for content in contents:
            if content[-1] != ".":
                content += "."
            content_str += " " + content
        documents.append(content_str)
    tf_idf_vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df,
                                        stop_words=stopwords.words('english'))
    tf_idf_vectorizer.fit(documents)
    with open(save_path, mode="wb") as f:
        pickle.dump(tf_idf_vectorizer, f)
    # the fitted vocabulary lives in the vocabulary_ attribute (note the trailing underscore)
    print("Tf-idf Vectorizer: length of vocabulary: ", len(tf_idf_vectorizer.vocabulary_))
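# A small sketch of re-using the vectorizer pickled by compute_tf_idf_vectorizer above;
# it assumes the pickle at the default save_path has already been written.
import pickle

with open("exsum/tf_idf_vectorizer_200_05.pickle", "rb") as f:
    vec = pickle.load(f)
X = vec.transform(["A new document to score with the previously fitted idf weights."])
print(X.shape, len(vec.vocabulary_))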
def t_test_accuracy(topic_id, n_runs, estimator_params_votes_per_doc_tuples): """ Test if accuracy for estimators with given parameters is significantly better than that of the first estimator in the tuple """ texts, vote_lists, truths = texts_vote_lists_truths_by_topic_id[topic_id] vectorizer = TfidfVectorizer() text_similarity = cosine_similarity(vectorizer.fit_transform(texts)) accuracy_arrays = [] for estimator, args, votes_per_doc in estimator_params_votes_per_doc_tuples: stop_idx = votes_per_doc * len(texts) # Now get n_runs accuracies and put then into numpy arrays accuracies = Parallel(n_jobs=4)( delayed(get_accuracy_sequence)(estimator, stop_idx, texts, vote_lists, truths, text_similarity, idx, True, *args) for idx in xrange(n_runs) ) accuracy_arrays.append( np.array( filter(lambda x: x is not None, accuracies) ) ) # Baseline result_row = [] result_row.append( "%0.2f" % np.mean(accuracy_arrays[0]) ) # T-tests for accuracy_array in accuracy_arrays[1:]: _, pval = ttest_ind(accuracy_array, accuracy_arrays[0], equal_var=False) significance_indicator = lambda p: "*" if p < 0.01 else " " is_better = "$" if np.mean(accuracy_array) > np.mean(accuracy_arrays[0]) else " " result_row.append( "%0.2f %s %s" % (np.mean(accuracy_array), significance_indicator(pval), is_better)) return "|".join(result_row)
def MMR(docs, count):
    # Setup
    select_lst = [docs.pop(0)]
    candidates = []
    tfidf_vectorizer = TfidfVectorizer()
    relevance_weight = 0.9
    # Keep recalculating scores until `count` sentences have been selected
    # (or until no candidates remain)
    while len(select_lst) < count and docs:
        select_sen = []
        for i in select_lst:
            select_sen.append(i.sentence)
        for candidate in docs:
            old_score = candidate.rating
            stemmed_sen = stemming([candidate])
            stemmed_lst = stemming(select_lst)
            tfidf_matrix = tfidf_vectorizer.fit_transform(stemmed_lst)
            target = tfidf_vectorizer.transform(stemmed_sen)
            similarities = cosine_similarity(target, tfidf_matrix).flatten()
            similarities.sort()
            similarity = similarities[-1]
            new_score = old_score * relevance_weight - similarity * (1 - relevance_weight)
            candidate.rating = new_score
        docs = sorted(docs, key=attrgetter("rating"), reverse=True)
        select_lst.append(docs.pop(0))
    return select_lst
def get_features(vocab): vectorizer_head = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2') X_train_head = vectorizer_head.fit_transform(headlines) vectorizer_body = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2') X_train_body = vectorizer_body.fit_transform(bodies) # calculates n most important topics of the bodies. Each topic contains all words but ordered by importance. The # more important topic words a body contains of a certain topic, the higher its value for this topic lda_body = LatentDirichletAllocation(n_topics=n_topics, learning_method='online', random_state=0, n_jobs=3) print("latent_dirichlet_allocation_cos: fit and transform body") t0 = time() lda_body_matrix = lda_body.fit_transform(X_train_body) print("done in %0.3fs." % (time() - t0)) print("latent_dirichlet_allocation_cos: transform head") # use the lda trained for body topcis on the headlines => if the headlines and bodies share topics # their vectors should be similar lda_head_matrix = lda_body.transform(X_train_head) #print_top_words(lda_body, vectorizer_body.get_feature_names(), 100) print('latent_dirichlet_allocation_cos: calculating cosine distance between head and body') # calculate cosine distance between the body and head X = [] for i in range(len(lda_head_matrix)): X_head_vector = np.array(lda_head_matrix[i]).reshape((1, -1)) #1d array is deprecated X_body_vector = np.array(lda_body_matrix[i]).reshape((1, -1)) cos_dist = cosine_distances(X_head_vector, X_body_vector).flatten() X.append(cos_dist.tolist()) return X
def kmeans(content_list):
    tfidf_vectorizer = TfidfVectorizer(tokenizer=jieba_tokenize,
                                       lowercase=False)
    '''
    tokenizer: the tokenizer function to use
    lowercase: lowercase all text before tokenizing; since this is Chinese text
               processing, it is best left False
    '''
    tfidf_matrix = tfidf_vectorizer.fit_transform(content_list)

    num_clusters = 20
    km_cluster = KMeans(n_clusters=num_clusters, max_iter=300, n_init=8,
                        init='k-means++', n_jobs=8)
    '''
    n_clusters: the value of K
    max_iter: maximum number of iterations for a single initialization
    n_init: number of times the initial centroids are re-chosen
    init: the algorithm used to choose the initial centroids
    n_jobs: number of processes; -1 means saturating all CPUs by default.
            Note that a single initialization always runs in one process;
            parallelism only applies across different initializations.
            E.g. with n_init=10 and n_jobs=40 on a server whose 20 CPUs could
            host 40 processes, only 10 processes are actually started.
    '''
    # return the cluster index assigned to each text
    result = km_cluster.fit_predict(tfidf_matrix)

    print "Predicting result: ", result
    return result
def read_examples(filename, sparm):
    """Parses an input file into an example sequence."""
    # This reads example files of the type read by SVM^multiclass.
    examples = []
    text = []
    count = 0
    # Open the file and read each example.
    for line in file(filename):
        # Get rid of comments (find returns -1 when there is no '#').
        if line.find('#') != -1:
            line = line[:line.find('#')]
        target, tokens = line.split('::')[0], line.split('::')[1:]
        # If the line is empty, who cares?
        if not tokens:
            continue
        # Get the target.
        text.append(target)
        # Get the features.
        tokens = [t.split(':') for t in tokens]
        features = [(0, 1)] + [(int(k), float(v)) for k, v in tokens]
        # Add the example to the list
        examples.append((svmapi.Sparse(features), count))
        count += 1
    # Print out some very useful statistics.
    vectorizer = TfidfVectorizer(stop_words='english')
    global tf_idf_transformed_matrix
    tf_idf_transformed_matrix = vectorizer.fit_transform(text)
    print len(examples), 'examples read'
    return examples
def createTDIDF():
    ## Bag of words
    with open("./data/movies.csv") as f:
        train_set1 = [line.lower().rstrip() for line in f]
    with open("./data/dvd.csv") as f:
        train_set2 = [line.lower().rstrip() for line in f]
    train_set = sorted(list(set(train_set1 + train_set2)))
    # Create dictionary to find movie
    dictTrain = dict()
    for i, movie in enumerate(train_set):
        dictTrain[movie] = i
    # Find weights
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix_train = tfidf_vectorizer.fit_transform(train_set)

    ## Character tri-grams (lenGram = 3, despite the *Bigrams variable names)
    lenGram = 3
    train_setBigrams = []
    for mov in train_set:
        temp = [mov[i:i + lenGram] for i in range(len(mov) - 1)]
        temp = [elem for elem in temp if len(elem) == lenGram]
        train_setBigrams.append(' '.join(temp))
    train_setBigrams = sorted(list(set(train_setBigrams)))
    dictTrainBigrams = dict()
    for i, movie in enumerate(train_setBigrams):
        dictTrainBigrams[movie] = i
    tfidf_vectorizerBigrams = TfidfVectorizer()
    tfidf_matrix_trainBigrams = tfidf_vectorizerBigrams.fit_transform(train_setBigrams)
    return [tfidf_matrix_train, dictTrain, tfidf_matrix_trainBigrams, dictTrainBigrams, lenGram]
class MedicalKeywordTfIdf(BaseEstimator, TransformerMixin):
    MEDICAL_KEYWORDS = ["Medical_Keyword_" + str(i) for i in range(1, 49)]

    def __init__(self):
        self._vec = TfidfVectorizer(max_df=0.95, min_df=2)

    def get_feature_names(self):
        return [x + "_TFIDF" for x in self._vec.get_feature_names()]

    def get_data_array(self, df):
        # turn the binary keyword columns into a pseudo-document of the active keyword names
        return df[self.MEDICAL_KEYWORDS] \
            .apply(lambda x: " ".join(x[x == 1].index), axis=1).values

    def fit(self, df, y=None):
        data_arr = self.get_data_array(df)
        self._vec.fit(data_arr)
        return self

    def transform(self, df):
        data_arr = self.get_data_array(df)
        return self._vec.transform(data_arr).toarray()
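# A minimal usage sketch for MedicalKeywordTfIdf above. The toy frame and the rows that
# are set to 1 are hypothetical; the real data carries the binary Medical_Keyword_1..48
# columns, and get_feature_names follows the older scikit-learn API used by the class.
import pandas as pd

cols = ["Medical_Keyword_" + str(i) for i in range(1, 49)]
toy = pd.DataFrame(0, index=range(4), columns=cols)
toy.loc[0, "Medical_Keyword_3"] = 1
toy.loc[1, ["Medical_Keyword_3", "Medical_Keyword_7"]] = 1

transformer = MedicalKeywordTfIdf().fit(toy)
features = transformer.transform(toy)  # dense array, one row per record
print(features.shape, transformer.get_feature_names())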
def preprocess(word_data, targets): print("\n### PREPROCESSING DATA ###") # vectorize print("-- Vectorization") vectorizer = TfidfVectorizer(sublinear_tf=True) # , stop_words='english' data_transformed = vectorizer.fit_transform(word_data) # feature selection print("-- Feature Selection") selector = SelectPercentile(percentile=5) data_selected = selector.fit_transform(data_transformed, targets) if data_selected.shape[1] == 0: data_selected = data_transformed else: print("Top {} features were selected".format(data_selected.shape[1])) # print top features nr_features = 30 i = selector.scores_.argsort()[::-1][:nr_features] top_features = np.column_stack((np.asarray(vectorizer.get_feature_names())[i], selector.scores_[i], selector.pvalues_[i])) print("\nTop %i Features:" % nr_features) print(pd.DataFrame(top_features, columns=["token", "score", "p-val"]), "\n") features_train, features_test, labels_train, labels_test = \ train_test_split(data_selected, targets, test_size=0.2, stratify=targets) return features_train, features_test, labels_train, labels_test
def test_tfidfvectorizer_invalid_idf_attr():
    vect = TfidfVectorizer(use_idf=True)
    vect.fit(JUNK_FOOD_DOCS)
    copy = TfidfVectorizer(vocabulary=vect.vocabulary_, use_idf=True)
    expected_idf_len = len(vect.idf_)
    invalid_idf = [1.0] * (expected_idf_len + 1)
    assert_raises(ValueError, setattr, copy, 'idf_', invalid_idf)
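# Complementary sketch of the valid path the test above implies: an idf_ vector whose
# length matches the vocabulary is accepted, so the copy can transform documents without
# re-fitting. This assumes a scikit-learn version that exposes the idf_ setter.
def _idf_copy_sketch():
    vect = TfidfVectorizer(use_idf=True)
    vect.fit(JUNK_FOOD_DOCS)
    copy = TfidfVectorizer(vocabulary=vect.vocabulary_, use_idf=True)
    copy.idf_ = vect.idf_  # matching length, so no ValueError is raised
    X = copy.transform(JUNK_FOOD_DOCS)
    assert X.shape[1] == len(vect.idf_)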
def readFile(filename): global vectorizer train_data = pd.read_csv(filename, header=0, delimiter='\t', quoting=3) train_size = train_data.shape[0] clean_train = [] for i in xrange(0,train_size): clean_train.append(filter(train_data['review'][i])) #if i%1000 ==0: # print '%d reviews processed...' %i #vectorizer = CountVectorizer(analyzer = "word", tokenizer = None, preprocessor = None, stop_words = None, max_features = 5000) if vectorizer==None: vectorizer = TfidfVectorizer(sublinear_tf=True,max_df=0.5, max_features = 50000) train_data_feature = vectorizer.fit_transform(clean_train) else: vec = TfidfVectorizer(vocabulary=vectorizer.vocabulary_) train_data_feature = vec.fit_transform(clean_train) print train_data_feature.shape if 'test' in filename: return train_data['id'], train_data_feature else: return train_data['id'], train_data_feature, train_data['sentiment']
def get_peronalpreference_vectors(vocab, user_pref_values):
    vectorizer = TfidfVectorizer(vocabulary=vocab, lowercase=False)
    vectors = vectorizer.fit_transform(user_pref_values).toarray()
    words = vectorizer.get_feature_names()
    # idf = vectorizer.idf_
    # print dict(zip(vectorizer.get_feature_names(), idf))
    return words, vectors
def main(): if os.path.exists(args.out_svd_result_matrix): print("Loading SVD matrix from file") X = np.load(args.out_svd_result_matrix) print("Loading corpus") _, file_index = LoadCorpus(args.training_dir) else: print("Loading corpus") corpus, file_index = LoadCorpus(args.training_dir) print("Building TF-IDF") tf_idf = TfidfVectorizer(input="content", lowercase=False) X = tf_idf.fit_transform(corpus) del corpus print("Running LSA") svd = TruncatedSVD(args.dimentionality) normalizer = Normalizer(copy=False) lsa = make_pipeline(svd, normalizer) X = lsa.fit_transform(X) print("Saving SVD results") np.save(args.out_svd_result_matrix, X) if ( os.path.exists(args.out_inv_idx) and os.path.exists(args.out_unique_kmeans_labels) and os.path.exists(args.out_idx) ): print("Loading labels") unique_labels = np.load(args.out_unique_kmeans_labels) inv_idx = np.load(args.out_inv_idx) idx = np.load(args.out_idx) unique_X = X[idx] else: print("Unique matrix") b = np.ascontiguousarray(X).view(np.dtype((np.void, X.dtype.itemsize * X.shape[1]))) _, idx, inv_idx = np.unique(b, return_index=True, return_inverse=True) print("Saving inv_idx") np.save(args.out_inv_idx, inv_idx) print("Saving idx") np.save(args.out_idx, idx) unique_X = X[idx] print("Running K-Means") unique_labels, _ = KMeans(unique_X) print("Save unique K-Means labels") np.save(args.out_unique_kmeans_labels, unique_labels) print("Re-label non-unique") labels = unique_labels[inv_idx] for l in range(unique_labels.max() + 1): out_filename = args.out_unique_distance_matrix_prefix + str(l) + ".npy" if os.path.exists(out_filename): continue print("Calculating distance matrix for label:", l) D = CalcDistances(unique_labels, l, unique_X) print("Saving to distance matrix to file") np.save(out_filename, D) if not os.path.exists(args.out_corpus_index): print("Calculating corpus index") corpus_index = GetCorpusIndex(file_index, labels, unique_labels, inv_idx) print("Saving corpus index") json.dump(corpus_index, open(args.out_corpus_index, "w"))
class RecommenderNB: min_score = None stop_words = ["a","a's","able","about","above","according","accordingly","across","actually","after","afterwards","again","against","ain't","all","allow","allows","almost","alone","along","already","also","although","always","am","among","amongst","an","and","another","any","anybody","anyhow","anyone","anything","anyway","anyways","anywhere","apart","appear","appreciate","appropriate","are","aren't","around","as","aside","ask","asking","associated","at","available","away","awfully","b","be","became","because","become","becomes","becoming","been","before","beforehand","behind","being","believe","below","beside","besides","best","better","between","beyond","both","brief","but","by","c","c'mon","c's","came","can","can't","cannot","cant","cause","causes","certain","certainly","changes","clearly","co","com","come","comes","concerning","consequently","consider","considering","contain","containing","contains","corresponding","could","couldn't","course","currently","d","definitely","described","despite","did","didn't","different","do","does","doesn't","doing","don't","done","down","downwards","during","e","each","edu","eg","eight","either","else","elsewhere","enough","entirely","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","exactly","example","except","f","far","few","fifth","first","five","followed","following","follows","for","former","formerly","forth","four","from","further","furthermore","g","get","gets","getting","given","gives","go","goes","going","gone","got","gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't","have","haven't","having","he","he's","hello","help","hence","her","here","here's","hereafter","hereby","herein","hereupon","hers","herself","hi","him","himself","his","hither","hopefully","how","howbeit","however","i","i'd","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch","inc","indeed","indicate","indicated","indicates","inner","insofar","instead","into","inward","is","isn't","it","it'd","it'll","it's","its","itself","j","just","k","keep","keeps","kept","know","knows","known","l","last","lately","later","latter","latterly","least","less","lest","let","let's","like","liked","likely","little","look","looking","looks","ltd","m","mainly","many","may","maybe","me","mean","meanwhile","merely","might","more","moreover","most","mostly","much","must","my","myself","n","name","namely","nd","near","nearly","necessary","need","needs","neither","never","nevertheless","new","next","nine","no","nobody","non","none","noone","nor","normally","not","nothing","novel","now","nowhere","o","obviously","of","off","often","oh","ok","okay","old","on","once","one","ones","only","onto","or","other","others","otherwise","ought","our","ours","ourselves","out","outside","over","overall","own","p","particular","particularly","per","perhaps","placed","please","plus","possible","presumably","probably","provides","q","que","quite","qv","r","rather","rd","re","really","reasonably","regarding","regardless","regards","relatively","respectively","right","s","said","same","saw","say","saying","says","second","secondly","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sensible","sent","serious","seriously","seven","several","shall","she","should","shouldn't","since","six","so","some","somebody","somehow","someone","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specified","specify","specifying","still","sub","such","sup","sure","t","t's","tak
e","taken","tell","tends","th","than","thank","thanks","thanx","that","that's","thats","the","their","theirs","them","themselves","then","thence","there","there's","thereafter","thereby","therefore","therein","theres","thereupon","these","they","they'd","they'll","they're","they've","think","third","this","thorough","thoroughly","those","though","three","through","throughout","thru","thus","to","together","too","took","toward","towards","tried","tries","truly","try","trying","twice","two","u","un","under","unfortunately","unless","unlikely","until","unto","up","upon","us","use","used","useful","uses","using","usually","uucp","v","value","various","very","via","viz","vs","w","want","wants","was","wasn't","way","we","we'd","we'll","we're","we've","welcome","well","went","were","weren't","what","what's","whatever","when","whence","whenever","where","where's","whereafter","whereas","whereby","wherein","whereupon","wherever","whether","which","while","whither","who","who's","whoever","whole","whom","whose","why","will","willing","wish","with","within","without","won't","wonder","would","would","wouldn't","x","y","yes","yet","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves"] def __init__(self, num_hashtags=40): RecommenderNB.min_score = float(1/(float(num_hashtags)-1.0)) self.tl = TweetLib() print "Generating classifier ... " documents = self.tl.get_hashtag_documents(num_hashtags) corpus = [b for a, b in documents] self.hashtags = [a for a,b in documents] all_classes = range(len(documents)) self.vectorizer = TfidfVectorizer(stop_words='english') self.xtrain = self.vectorizer.fit_transform(corpus) self.ytrain = all_classes self.parameters = {'alpha': 0.01} self.clf = MultinomialNB(**self.parameters).partial_fit(self.xtrain, self.ytrain, self.ytrain) print "Classifier has been generated..." def recommend(self, tweet): tweet = " ".join([w.lower() for w in tweet.split() if not w.lower() in RecommenderNB.stop_words]) xtest = self.vectorizer.transform([tweet]) pred = self.clf.predict_proba(xtest)[0] sorted_pred = sorted(enumerate(pred), key=lambda x:x[1]) max_score = max([b for a,b in sorted_pred]) if max_score < RecommenderNB.min_score: return None else: return list(reversed([self.hashtags[i[0]] for i in sorted_pred]))
def gen_data(self, fname): """ :fname : input file, every line means a single data :rtype : List[List[float]]: data matrix """ lines = [ self.langConvHandler.convert(line.strip().lower()) for line in codecs.open(fname, "rb","utf-8") if len(line) > 6] # lines = list(set(lines)) # remove duplicates logging.info("number of data %d " % len(lines)) cut_lines = [" ".join(jieba.cut(line)) for line in lines] # transform to tfidfVec tfidfVec = TfidfVectorizer(max_features = 3000) tfidf_data = tfidfVec.fit_transform(cut_lines) tfidf_data = tfidf_data.toarray() # save origin text with open("./output/origin_lines.txt", "wb") as fw: json.dump(lines, fw) # save vectorize data np.save("./output/tfidf.corpus.npy", tfidf_data) self.lines = lines self.tfidf_data = tfidf_data
def get_top_terms(self, stops=STOPS):
    # vectorize using 1- to 3-grams
    vectorizer = TfidfVectorizer(stop_words=stops, ngram_range=(1, 3))
    tfidf = vectorizer.fit_transform(self.docs)
    # enumerate feature names, i.e. the actual words
    self.feature_names = vectorizer.get_feature_names()
    # convert to dense array
    dense = tfidf.todense()
    # container for top terms per doc
    self.features = []
    for doc in dense:
        doc = doc.tolist()[0]
        # creates a list of tuples, (term_id, score)
        phrase_scores = [pair for pair in zip(range(0, len(doc)), doc) if pair[1] > 0]
        # feature_ids = sorted(phrase_scores, key=lambda t: t[1] * -1)
        doc_features = []
        for f_ in phrase_scores:
            fname = self.feature_names[f_[0]]
            fscore = f_[1]
            doc_features.append((fscore, fname))
        top_terms = sorted(doc_features, reverse=True)  # [:n_terms]
        # top_terms = ",".join([x[1] for x in top_terms])
        self.features.append(top_terms)
def tfidf_ize(train, test, node_info): vectorizer = TfidfVectorizer(ngram_range=(1,1)) vectorizer.fit(node_info.abstract.as_matrix()) for table in [train, test]: table_tfidf_abstract_1 = vectorizer.transform(table.abstract_1.fillna('')) table_tfidf_abstract_2 = vectorizer.transform(table.abstract_2.fillna('')) table_tfidf_title_1 = vectorizer.transform(table.title_1.fillna('')) table_tfidf_title_2 = vectorizer.transform(table.title_2.fillna('')) #table['temp27'] = table_tfidf_abstract_1.multiply(table_tfidf_abstract_2).sum(1) table.loc[:, 'temp22'] = table_tfidf_abstract_1.minimum(table_tfidf_abstract_2).sum(1) # Intersection kernel table.loc[:, 'temp23'] = table_tfidf_title_1.minimum(table_tfidf_title_2).sum(1) table.loc[:, 'temp24'] = table_tfidf_abstract_1.minimum(table_tfidf_title_2).sum(1) \ + table_tfidf_abstract_2.minimum(table_tfidf_title_1).sum(1) vectorizer = TfidfVectorizer(ngram_range=(2,2)) vectorizer.fit(node_info.abstract.as_matrix()) for table in [train, test]: table_tfidf_abstract_1 = vectorizer.transform(table.abstract_1.fillna('')) table_tfidf_abstract_2 = vectorizer.transform(table.abstract_2.fillna('')) table_tfidf_title_1 = vectorizer.transform(table.title_1.fillna('')) table_tfidf_title_2 = vectorizer.transform(table.title_2.fillna('')) #table['temp27'] = table_tfidf_abstract_1.multiply(table_tfidf_abstract_2).sum(1) table.loc[:, 'temp27'] = table_tfidf_abstract_1.minimum(table_tfidf_abstract_2).sum(1) # Intersection kernel table.loc[:, 'temp28'] = table_tfidf_title_1.minimum(table_tfidf_title_2).sum(1) table.loc[:, 'temp29'] = table_tfidf_abstract_1.minimum(table_tfidf_title_2).sum(1) \ + table_tfidf_abstract_2.minimum(table_tfidf_title_1).sum(1) return train, test
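# Small illustration (a sketch) of the "intersection kernel" used in tfidf_ize above:
# the element-wise minimum of two tf-idf rows, summed over the vocabulary. The toy
# sentences are placeholders, not the real abstracts or titles.
from sklearn.feature_extraction.text import TfidfVectorizer

_vec = TfidfVectorizer(ngram_range=(1, 1))
_tfidf = _vec.fit_transform(["graph based citation prediction",
                             "citation prediction with graph kernels"])
intersection = _tfidf[0].minimum(_tfidf[1]).sum()  # larger = more shared weighted terms
print(intersection)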
def classify(clf, chapter_contents_train, y_train, chapter_contents_test,k=20): # convert the training data text to features using TF-IDF vectorization vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,stop_words='english') X_train = vectorizer.fit_transform(chapter_contents_train) # X_train_array = X_train.toarray() # print "tfidf vector length: ", len(X_train_array) #dbg # print "X_train_array[0] length: ", len(X_train_array[0]) #dbg # use only the best k features according to chi-sq selection ch2 = SelectKBest(chi2, k=k) X_train = ch2.fit_transform(X_train, y_train) # determine the actual features used after best-k selection feature_names = np.asarray(vectorizer.get_feature_names()) chisq_mask = ch2.get_support() features_masks = zip(feature_names,chisq_mask) selected_features = [z[0] for z in features_masks if z[1]] # train the classifier clf.fit(X_train, y_train) # convert the test data text into features using the same vectorizer as for training X_test = vectorizer.transform(chapter_contents_test) X_test = ch2.transform(X_test) # obtain binary class predictions for the test set preds = clf.predict(X_test) return preds, selected_features, clf
def train_classifier(download=True, parameters=None, ngram_range=(1, 1)): """Train the intent classifier.""" if download: download_wiki() path = os.path.join(l.TOPDIR, 'train.json') training_set = json.load(open(path)) path = os.path.join(l.TOPDIR, 'wiki.json') wiki_set = json.load(open(path)) target_names = list(set([i['unit'] for i in training_set + wiki_set])) train_data, train_target = [], [] for example in training_set + wiki_set: train_data.append(clean_text(example['text'])) train_target.append(target_names.index(example['unit'])) tfidf_model = TfidfVectorizer(sublinear_tf=True, ngram_range=ngram_range, stop_words='english') matrix = tfidf_model.fit_transform(train_data) if parameters is None: parameters = {'loss': 'log', 'penalty': 'l2', 'n_iter': 50, 'alpha': 0.00001, 'fit_intercept': True} clf = SGDClassifier(**parameters).fit(matrix, train_target) obj = {'tfidf_model': tfidf_model, 'clf': clf, 'target_names': target_names} path = os.path.join(l.TOPDIR, 'clf.pickle') pickle.dump(obj, open(path, 'w'))
def fit(self, docs, clean=False): ''' pipeline: clean, tokenize, tfidf, nmf, kmeans ''' if clean: print 'cleaning raw docs ......' clean_docs = self.clean(docs) else: clean_docs = docs print 'running tfidf ......' if 'tokenizer' not in self.kw_tfidf: self.tfidf = TfidfVectorizer(tokenizer=self.tokenize, **self.kw_tfidf) else: self.tfidf = TfidfVectorizer(**self.kw_tfidf) X = self.tfidf.fit_transform(clean_docs) print 'running NMF ......' self.nmf = NMF(**self.kw_nmf) H = self.nmf.fit_transform(X) W = self.nmf.components_ print 'fetching top 50 words for each topic ......' self.top_n_words(50, W) return X, H, W
def tfidf_covariance(texts, savepath):
    if not savepath.endswith("/"):
        savepath = savepath + "/"
    if os.path.exists(savepath + "__linkage_average.npy"):
        Z = np.load(savepath + "__linkage_average.npy")
    else:
        if not os.path.exists(savepath):
            os.makedirs(savepath)
        from sklearn.feature_extraction.text import TfidfVectorizer
        vectorizer = TfidfVectorizer(input='content', strip_accents='ascii',
                                     analyzer='word', max_features=5000)
        y = vectorizer.fit_transform(" ".join(text) for text in texts)
        Z = linkage(y.todense(), method='average', metric='euclidean')
        np.save(savepath + "__linkage_average.npy", Z)
    if os.path.exists(savepath + "__covariance__.npy"):
        Cov = np.load(savepath + "__covariance__.npy")
        observables = HierarchicalObservation(Cov)
    else:
        root, nodes = to_tree(Z, rd=True)
        assign_parents(root)
        adj_mat = get_adjacency_matrix(nodes)
        deg_mat = get_degree_matrix(nodes)
        sigma = 5
        laplacian = np.diag(deg_mat) - adj_mat + 1.0 / (sigma ** 2) * np.eye(len(deg_mat))
        Cov = np.linalg.inv(laplacian)[:len(texts), :len(texts)]
        np.save(savepath + "__covariance__.npy", Cov)
        observables = HierarchicalObservation(Cov)
    return observables
def simple_tfidf_alldocs(): qs = Posts.objects.all() docs,post_index_map = vectorize_docs(n_samples=n_samples,log_batch_size=log_batch_size, qs=qs) #Get the doc bodies tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features = n_features_init,ngram_range=(1,n_gram),max_df=0.8) tfidf_matrix_raw = tfidf_vectorizer.fit_transform(docs) #docs x n-gram-features tfidf_matrix_scaled = scale(tfidf_matrix_raw, with_mean = False) #Can't use sparse matrices unless with_mean=False return tfidf_matrix_scaled, post_index_map
labels_for_removal=[k for i,k in enumerate(ngramMat.columns) if "cwjobscouk" in k] + \ [k for i,k in enumerate(ngramMat.columns) if "such" in k] + \ [k for i,k in enumerate(ngramMat.columns) if "please" in k] + \ [k for i,k in enumerate(ngramMat.columns) if "job" in k] + \ [k for i,k in enumerate(ngramMat.columns) if "london" == k] + \ [k for i,k in enumerate(ngramMat.columns) if "be" == k] + \ [k for i,k in enumerate(ngramMat.columns) if "is" == k] + \ [k for i,k in enumerate(ngramMat.columns) if "are" == k] + \ [k for i,k in enumerate(ngramMat.columns) if "more" == k] ngramMat.drop(labels_for_removal, inplace=True, axis=1) #%% add R and C++. python filters out single-character words and punctuation. num_docs = ngramMat.shape[0] ngram_vectorizer = TfidfVectorizer(analyzer='char',ngram_range=(1, 3), min_df=1,sublinear_tf=True,lowercase=False) tf = ngram_vectorizer.fit_transform(job_docs) fnames = ngram_vectorizer.get_feature_names() dense = tf.todense() Cpp = [i for i,k in enumerate(fnames) if "C++" == k] new=np.reshape(np.array(dense[:,Cpp]),num_docs) ngramMat['C++'] = pd.Series(new,index=ngramMat.index) R = [i for i,k in enumerate(fnames) if " R" == k] new=np.reshape(np.array(dense[:,R]),num_docs) ngramMat['R'] = pd.Series(new,index=ngramMat.index) #%% remove duplicate docs DM_docs = cosine_distances(ngramMat)
continue count+=1 if count==185: break # post=col[5]+" "+col[6]+" "+col[7]+" "+col[8]+" "+col[9]+" "+col[10]+" "+col[11]+" "+col[12]+" "+col[13]+" "+col[14] # trainData.append(post) for i in range(10): trainData.append(col[i+5]) if col[3]=='Female': t=str(col[4])+"0" else: t=str(col[4])+"1" trainTarget.append(int(t)) # Creating input feature vector using TfidfVectorizer vectorizer=TfidfVectorizer(use_idf=True, token_pattern='[^ \n,".\':()ঃ‘?’।“”!;a-zA-Z0-9#০১২৩৪৫৬৭৮৯*&_><+=%$-`~|^·]+') #০১২৩৪৫৬৭৮৯ trainData=vectorizer.fit_transform(trainData) features=vectorizer.get_feature_names() # Initializing the Support Vector Machine model model = svm.SVC(kernel='linear', C=1, gamma=1) # Analyzing with 5 iteration for i in range(5): x_train, x_test, y_train, y_test = train_test_split(trainData, trainTarget, test_size=0.3) # Fitting Support Vector Machine model with trainData and trainTarget model.fit(x_train, y_train) predicted2 = model.predict(x_test) count2 = 0 for i in range(len(predicted2)): if (predicted2[i]-y_test[i])==0:
#Word Cloud (not working) wordcloud = WordCloud(max_font_size=40).generate(text) plt.figure() plt.imshow(wordcloud) plt.axis("off") plt.show() #topic modeling #use may be final list as it is encoded #Vectorize the text and #Make pairwise document distance based on TF-IDF #check unique words from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer vectorizer = TfidfVectorizer(stop_words='english', min_df=2) dtm = vectorizer.fit_transform(final) #print(dtm.shape) vocab = vectorizer.get_feature_names( ) # list of unique vocab, we will use this later print(len(vocab), '# of unique words') #print vocab[-10:] #print vocab[:10] #NMF Decomposition using term-document matrix from sklearn import decomposition #print 'num of documents, num of unique words' #print dtm.shape num_topics = 5
def create_bow_model(wdir, corpusdir, outfile, **kwargs): """ Creates a bow model (a matrix of token counts) from a collection of full text files. Arguments: wdir (str): path to the working directory corpusdir (str): relative path to the input directory (the collection of text files) outfile (str): relative path to the output file (the bow matrix) optional: mfw (int): how many of the most frequent terms to use, if this is 0, all the terms are used mode (str): should the counts be normalized? options: "count" (default), "tf-idf" vocab_file (bool): if True, the vocabulary of the corpus is stored as a list in a text file stopword_file (str): relative path to a file containing a list of stop words """ print("creating bow model...") mfw = kwargs.get("mfw", 0) mode = kwargs.get("mode", "count") vocab_file = kwargs.get("vocab_file", False) stopword_file = kwargs.get("stopword_file") if stopword_file: stopwords = pd.read_csv(join(wdir, stopword_file), header=None) stopwords = list(stopwords.iloc[:, 0]) if mode == "tf-idf": if mfw == 0: if stopword_file: vectorizer = TfidfVectorizer(input='filename', stop_words=stopwords) else: vectorizer = TfidfVectorizer(input='filename') else: if stopword_file: vectorizer = TfidfVectorizer(input='filename', max_features=mfw, stop_words=stopwords) else: vectorizer = TfidfVectorizer(input='filename', max_features=mfw) else: if mfw == 0: if stopword_file: vectorizer = CountVectorizer(input='filename', stop_words=stopwords) else: vectorizer = CountVectorizer(input='filename') else: if stopword_file: vectorizer = CountVectorizer(input='filename', max_features=mfw, stop_words=stopwords) else: vectorizer = CountVectorizer(input='filename', max_features=mfw) # possible parameters and attributes for the CountVectorizer: # lowercase by default # stop_words: for a list of stop words # token_pattern: regex denoting what constitutes a token # ngram_range: tuple (min_n,max_n) # analyzer: word, char, char_wb # max_df: default 1.0, float in range 0.1.-1.0 or integer (absolute counts), ignore terms that have a document frequency higher than this # min_df: default 1, float or integer (absolute counts), ignore terms that have a document frequency lower than this, "cut-off" # max_features: only top max features ordered by term frequency across the corpus # vocabulary # attributes: # vocabulary_: a mapping of terms to feature indices # stop_words_: terms that were ignored because of max_features, max_df or min_df # possible parameters and attributes for the TfidfVectorizer: # see also above # use_idf: Enable inverse-document-frequency reweighting. Default: true # smooth_idf: Smooth idf weights by adding one to document frequencies, as if an extra document was seen containing every term in the collection exactly once. Prevents zero divisions. Default: true # sublinear_tf: Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf). 
# idf_: The inverse document frequency (IDF) vector filenames = sorted(glob.glob(join(wdir, corpusdir, "*.txt"))) # bow: sparse representation bow = vectorizer.fit_transform(filenames) bow = bow.toarray() #print(bow.size) #print(bow.shape) vocab = vectorizer.get_feature_names() if (vocab_file == True): vocab_fr = pd.DataFrame(data=vocab) vocab_fr.to_csv(join(wdir, "vocab.txt"), encoding="UTF-8", header=False, index=False) print("created vocabulary file...") #print(vocab[:100]) #exit() #print(vocab[:100]) # save to file idnos = [re.split(r"\.", re.split(r"/", f)[-1])[0] for f in filenames] bow_frame = pd.DataFrame(columns=vocab, index=idnos, data=bow) bow_frame.to_csv(join(wdir, outfile), sep=",", encoding="utf-8") print("Done! Number of documents and vocabulary: ", bow.shape) print("Number of tokens: ", bow.sum())
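# Hypothetical call of create_bow_model above (a sketch only: the working directory,
# corpus folder, output file and stop word list are placeholders, not taken from the
# original project; the keyword arguments follow the function's docstring).
create_bow_model("/home/user/project",   # wdir
                 "corpus",               # corpusdir containing the *.txt files
                 "bow_tfidf.csv",        # outfile for the document-term matrix
                 mfw=5000,               # keep the 5000 most frequent terms
                 mode="tf-idf",          # tf-idf weighting instead of raw counts
                 vocab_file=True,        # also write vocab.txt
                 stopword_file="stopwords.txt")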
    ]
    # remove stop words
    contents = " ".join([txt for txt in contents if txt not in stopwords])
    print(contents)
    return contents

def main():
    path = '258'
    txt_processing(path)

if __name__ == '__main__':
    main()

tfidf = TfidfVectorizer()
x_train, x_test, y_train, y_test = train_test_split(contents, class_list, test_size=0.2)
# # save the arrays
# np.save('conttnes.npy', contents)
# np.save('class_list.npy', class_list)
X_train = tfidf.fit_transform(x_train)
X_test = tfidf.transform(x_test)
# naive Bayes model
mulp = MultinomialNB()
mulp_NB = mulp.fit(X_train, y_train)
import pandas as pd, numpy as np from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer from sklearn import svm column = "word_seg" train = pd.read_csv('../../data/raw_data/train_set.csv') test = pd.read_csv('../../data/raw_data/test_set.csv') test_id = test["id"].copy() vec = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1) trn_term_doc = vec.fit_transform(train[column]) test_term_doc = vec.transform(test[column]) fid0 = open('baseline.csv', 'w') y = (train["class"] - 1).astype(int) lin_clf = svm.LinearSVC() lin_clf.fit(trn_term_doc, y) preds = lin_clf.predict(test_term_doc) i = 0 fid0.write("id,class" + "\n") for item in preds: fid0.write(str(i) + "," + str(item + 1) + "\n") i = i + 1 fid0.close()
### append the text to word_data word_data.append(parsed_email_stripped_common_words) ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris if name == 'sara': from_data.append(0) elif name == 'chris': from_data.append(1) email.close() print "emails processed" from_sara.close() from_chris.close() pickle.dump( word_data, open("your_word_data.pkl", "w") ) pickle.dump( from_data, open("your_email_authors.pkl", "w") ) print word_data[152] ### in Part 4, do TfIdf vectorization here from sklearn.feature_extraction.text import TfidfVectorizer vectorizer = TfidfVectorizer(stop_words = "english") vectorizer.fit_transform(word_data) vocab_list = vectorizer.get_feature_names() print "num different words:", len(vocab_list) print vocab_list[34597]
Democrats pounced on President Trump’s criticism of Robart, with Democratic senators flatly saying the President’s comments will factor into the confirmation hearings for Supreme Court nominee Neil Gorsuch. “Attack on federal judge from POTUS is beneath the dignity of that office. That attitude can lead America to calamity,” Washington Gov. Jay Inslee tweeted Saturday. Attack on federal judge from POTUS is beneath the dignity of that office. That attitude can lead America to calamity. — Governor Jay Inslee (@GovInslee) February 4, 2017 “The President’s attack on Judge James Robart, a Bush appointee who passed with 99 votes, shows a disdain for an independent judiciary that doesn’t always bend to his wishes and a continued lack of respect for the Constitution, making it more important that the Supreme Court serve as an independent check on the administration,” Senate Minority Leader Chuck Schumer said in a statement. “With each action testing the Constitution, and each personal attack on a judge, President Trump raises the bar even higher for Judge Gorsuch’s nomination to serve on the Supreme Court. His ability to be an independent check will be front and center throughout the confirmation process.” Vermont. Sen. Patrick Leahy, the ranking member of the Judiciary Committee, said President Trump’s “hostility toward the rule of law is not just embarrassing, it is dangerous.” “We need a nominee for the Supreme Court willing to demonstrate he or she will not cower to an overreaching executive. This makes it even more important that Judge Gorsuch, and every other judge this president may nominate, demonstrates the ability to be an independent check and balance on an administration that shamefully and harmfully seems to reject the very concept.” Robart’s order on Friday was a significant setback to President Trump’s ban and set up the nation for a second straight weekend of confusion about the policy’s legality. The White House said Friday the Department of Justice will challenge the decision. In a statement, White House press secretary Sean Spicer initially called Robart’s order “outrageous” before quickly issuing another statement that dropped that word. Robart has presided in the US District Court for the Western District of Washington state since 2004. He assumed senior status in 2016. """ documents = [news1, news2] tfidf = TfidfVectorizer().fit_transform(documents) pairwise_sim = tfidf * tfidf.T print(pairwise_sim.A)
documents = full_text.split()

# check number of documents (note: split() breaks on whitespace, so each "document" is a single token)
print(len(documents))
print('\n')

# preprocess documents (remove special characters and lowercase everything)
for i in range(len(documents)):
    documents[i] = " ".join(documents[i].split())
    # str.replace does not interpret regular expressions; re.sub would be needed to drop bracketed text
    documents[i] = documents[i].replace(r"\[.*\]", "")
    documents[i] = re.sub(r'([^\s\w]|_)+', '', documents[i])
    documents[i] = documents[i].lower()

# use tfidf to create word vectors
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

# try different k values
k = 7
model = KMeans(n_clusters=k, init='k-means++', max_iter=1000, n_init=1)
model.fit(X)

# print top terms per cluster (code snippet taken from python documentation online)
print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(k):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :50]:
        print(' %s' % terms[ind]),
df['clean_documents'] = df['clean_documents'].fillna('').apply(lambda x: ' '.join([w for w in x.split() if len(w)>2])) df['clean_documents'] = df['clean_documents'].fillna('').apply(lambda x: x.lower()) # tokenization tokenized_doc = df['clean_documents'].fillna('').apply(lambda x: x.split()) # de-tokenization detokenized_doc = [] for i in range(len(df)): t = ' '.join(tokenized_doc[i]) detokenized_doc.append(t) df['clean_documents'] = detokenized_doc # TF-IDF vector vectorizer = TfidfVectorizer(stop_words='english', smooth_idf=True) X = vectorizer.fit_transform(df['clean_documents']) # SVD represent documents and terms in vectors svd_model = TruncatedSVD(n_components=2, algorithm='randomized', n_iter=100, random_state=122) lsa = svd_model.fit_transform(X) # Documents - Topic vector pd.options.display.float_format = '{:,.16f}'.format topic_encoded_df = pd.DataFrame(lsa, columns = ["topic_1", "topic_2"]) topic_encoded_df["documents"] = df['clean_documents'] topic_encoded_df.describe() # display(topic_encoded_df[["documents", "topic_1", "topic_2"]])
#
# https://www.cnblogs.com/pinard
#
# Permission given to modify the code as long as you keep this declaration at the top
#
# Text mining preprocessing with TF-IDF: https://www.cnblogs.com/pinard/p/6693230.html

# In[2]:

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    "I come to China to travel",
    "This is a car polupar in China",
    "I love tea and Apple ",
    "The work is to write some papers in science"
]

vectorizer = CountVectorizer()
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
print(tfidf)

# In[4]:

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf2 = TfidfVectorizer()
re = tfidf2.fit_transform(corpus)
print(re)
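# The two cells above should produce the same matrix: TfidfVectorizer is equivalent to
# CountVectorizer followed by TfidfTransformer when both use their default settings.
import numpy as np
assert np.allclose(tfidf.toarray(), re.toarray())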
print("%d categories" % len(categories)) print() # split a training set and a test set y_train = data_train.target y_test = data_test.target print( "Extracting features from the training dataset using a sparse vectorizer" ) t0 = time() vectorizer = TfidfVectorizer(encoding=ENCODING, use_idf=True, norm='l2', binary=False, sublinear_tf=True, min_df=0.001, max_df=1.0, ngram_range=(1, 2), analyzer='word', stop_words=None) # the output of the fit_transform (x_train) is a sparse csc matrix. X_train = vectorizer.fit_transform(data_train.data) duration = time() - t0 print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration)) print("n_samples: %d, n_features: %d" % X_train.shape) print() print( "Extracting features from the test dataset using the same vectorizer")
def tf_idf_classify(
    input_df,
    feature_column,
    label_column,
    test_size=.2,
    return_fscore=True,
    show_info=True,
):
    """
    :param input_df: dataframe holding the text and the labels
    :param feature_column: name of the text column
    :param label_column: name of the label column
    :param test_size: fraction of rows held out for validation
    :param return_fscore: if True, return the overall and per-label f-scores
    :param show_info: if True, print scores and plot a confusion matrix
    :return:
    """
    df = input_df.copy()
    # nltk.download('stopwords')
    # nltk.download('wordnet')

    # stopword removal and lemmatization
    stopwords = nltk.corpus.stopwords.words('english')
    lemmatizer = WordNetLemmatizer()

    msk = np.random.rand(len(df)) < 1 - test_size
    train_df = df[msk]
    val_df = df[~msk]
    train_X = []
    test_X = []
    train_y = train_df[label_column].tolist()
    test_y = val_df[label_column].tolist()
    labels = list(df[label_column])
    labels = [str(l) for l in labels]

    # text pre-processing for the training split
    for text in tqdm(train_df[feature_column]):
        review = re.sub('[^a-zA-Z]', ' ', text)
        review = review.lower()
        review = review.split()
        review = [
            lemmatizer.lemmatize(word) for word in review
            if not word in set(stopwords)
        ]
        review = ' '.join(review)
        train_X.append(review)

    # text pre-processing for the validation split
    for text in tqdm(val_df[feature_column]):
        review = re.sub('[^a-zA-Z]', ' ', text)
        review = review.lower()
        review = review.split()
        review = [
            lemmatizer.lemmatize(word) for word in review
            if not word in set(stopwords)
        ]
        review = ' '.join(review)
        test_X.append(review)

    # tf-idf: fit on the training data and transform it in one step
    tf_idf = TfidfVectorizer()
    X_train_tf = tf_idf.fit_transform(train_X)
    # transforming test data into a tf-idf matrix with the same vocabulary
    X_test_tf = tf_idf.transform(test_X)

    # naive bayes classifier
    naive_bayes_classifier = MultinomialNB()
    naive_bayes_classifier.fit(X_train_tf, train_y)

    # predicted y
    y_pred = naive_bayes_classifier.predict(X_test_tf)

    f, cf = single_label_f_score(y_gold=test_y, y_pred=y_pred)
    if show_info:
        print('f-score:', f)
        print('label wise f-score', cf)
        conf_mat = confusion_matrix(test_y, y_pred)
        fig, ax = plt.subplots(figsize=(4, 4))
        labels = list(set(labels))
        sns.heatmap(conf_mat, annot=True, cmap="Blues", fmt='d',
                    xticklabels=labels, yticklabels=labels)
        plt.ylabel('Actual')
        plt.xlabel('Predicted')
        plt.title("TFIDF CONFUSION MATRIX", size=16)
    if return_fscore:
        return f, cf
class Reader: dir = os.getcwd() # Gets the current working directory words_of_tweets = [ ] # Saves all the tweet cleared from stop-words, stemmed and tokenized called_once = False # Indicates if the GloVe model has been trained (read) or not onehot_encoder = CountVectorizer() scaler = MinMaxScaler(feature_range=(0, 1)) tester = MinMaxScaler(feature_range=(0, 1)) def dummy_fun(self, doc): return doc vectorizer = TfidfVectorizer(lowercase=False, analyzer='word', tokenizer=dummy_fun, preprocessor=dummy_fun) # min_df : float in range [0.0, 1.0] or int, default=1 # When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. # This value is also called cut-off in the literature. If float, the parameter represents a proportion of documents, # integer absolute counts. This parameter is ignored if vocabulary is not None. vectorizer1 = TfidfVectorizer(analyzer=lambda x: x, min_df=7) # sg: CBOW if 0, skip-gram if 1 # ‘min_count’ is for neglecting infrequent words. # negative (int) – If > 0, negative sampling will be used, the int for negative specifies how many “noise words” should be drawn (usually between 5-20). If set to 0, no negative sampling is used. # window: number of words accounted for each context( if the window size is 3, 3 word in the left neighorhood and 3 word in the right neighborhood are considered) model = Word2Vec() # dm: DBOW if 0, distributed-memory if 1 # window: number of words accounted for each context( if the window size is 3, 3 word in the left neighorhood and 3 word in the right neighborhood are considered) modeldoc = Doc2Vec() # GloVe model glove_model = {} # Feature Selection # Univariate_Selection test = SelectKBest(score_func=chi2, k=100) # Feature Extraction with RFE# Feature Extraction with Recursive Feature Elimination rfe = RFE(model, 100) # Feature Extraction with PCA pca = PCA(n_components=100) # Feature Extraction with TruncatedSVD svd = TruncatedSVD(n_components=100) # Feature Importance with Extra Trees Classifier sfm = RandomForestClassifier() models = SelectFromModel(sfm) train_A = None train_A_emoji = None train_A_emoji_hash = None train_B = None train_B_emoji = None train_B_emoji_hash = None input_A = None input_A_emoji = None input_B = None input_B_emoji = None ############################################################################################################################################################## # Pre-processing and convert the input using one hot encoding, TF-IDF and other encoders ############################################################################################################################################################## def tokenize(self, text): # Tokenize tweets words = word_tokenize(text) # remove punctuation from each word table = str.maketrans('', '', string.punctuation) words = [w.translate(table) for w in words] # remove all tokens that are not alphabetic words = [word for word in words if word.isalpha()] # Delete Stop-Words whitelist = ["n't", "not"] # Keep the words "n't" and "not" stop_words = set(stopwords.words('english')) words = [w for w in words if w not in stop_words or w in whitelist] stopwords_wordcloud = set(STOPWORDS) words = [ w for w in words if w not in stopwords_wordcloud or w in whitelist ] return words # Print the counts of the top 85 most used words and print a graph with the words of the data set def wordcloud(self): stopwords_wordcloud = set(STOPWORDS) # Print the counts of the top 85 most used words in tweets vectorizer = 
CountVectorizer(analyzer='word', tokenizer=self.tokenize, lowercase=True, stop_words=stopwords_wordcloud, max_features=85) corpus_words = vectorizer.fit_transform(self.train_A['tweet']) corpus_words = corpus_words.toarray() vocab = vectorizer.get_feature_names() # Sum up the counts of each vocabulary word dist = np.sum(corpus_words, axis=0) # For each, print the vocabulary word and the number of times it # appears in the data set for tag, count in zip(vocab, dist): print(count, ' ', tag) # Print a scheme with most used words that are not stopwords wordcloud = WordCloud(background_color="black", stopwords=stopwords_wordcloud, random_state=500, relative_scaling=1.0, colormap='summer').generate(" ".join( [i for i in self.train_A['tweet']])) plt.figure(facecolor='k') plt.imshow(wordcloud) plt.axis("off") plt.title("Most used words in tweets") plt.show() ############################################################################################################################################################## # Pre-processing of the tweets def pre_processing(self): # Feature Extraction data = Feature_Extraction.TwitterData_ExtraFeatures() data.build_features(self.train_A) self.extra_features = data.processed_data # Clearing training dataset and Integer Encoding self.train_A['tweet'] = self.train_A['tweet'].str.replace( 'http\S+|www.\S+', '', case=False) # Delete URLs self.train_A['tweet'] = self.train_A['tweet'].str.replace( r'@\S+', '', case=False) # Delete Usernames self.train_A['tweet'] = self.train_A['tweet'].str.replace( r'#', ' ', case=False ) # Replace hashtags with space to deal with the case where the tweet appears to be one word but is consisted by more seperated from hashtags # print('Average number of words per sentence: ', np.mean([len(s.split(" ")) for s in self.train_A.tweet])) for i in range(0, len(self.train_A)): # Tokenize tweets words = word_tokenize(self.train_A.iloc[i][2]) # remove punctuation from each word table = str.maketrans('', '', string.punctuation) words = [w.translate(table) for w in words] # remove all tokens that are not alphabetic words = [word for word in words if word.isalpha()] # stemming of words porter = PorterStemmer() words = [porter.stem(word) for word in words] # Delete Stop-Words whitelist = ["n't", "not", 'nor', "nt" ] # Keep the words "n't" and "not", 'nor' and "nt" stop_words = set(stopwords.words('english')) words = [w for w in words if w not in stop_words or w in whitelist] # Keep the tokenized tweets self.words_of_tweets.append(words) # self.wordcloud() # Print number of 85 most used words and a scheme with most used words that are not stopwords ############################################################################################################################################### ############################################################################################################################################### # Select the proper encoding and Feature Selection # x_enc: training data set or test data set # train_test: whether x_enc is training set or test set # y: the irony labels of either the training set or the test set # dataset_index: the indexes of train set or test set # extra_features: Added features from feature extraction # feature_selection: number that indicates what feature selection algorithm will be used # encoding: number that indicates what encoding algorithm will be used # print_file: the file name that the print will be written def get_enc(self, x_enc, train_test, y, dataset_index, extra_features, 
feature_selection, encoding, print_file): # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # Encodings encoded_tweets = [] # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # TF-IDF if encoding == 1: encoded_tweets = self.tf_idf(x_enc, train_test).toarray( ) # Used to convert sparse matrix (produced from TF-IDF) to dense matrix (needed for concatenate) # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # One hot encoding if encoding == 2: encoded_tweets = self.one_hot_enc(x_enc, train_test) # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # Bi-grams if encoding == 3: encoded_tweets = self.bigrams_enc(x_enc, train_test) # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # Word2Vec if encoding == 4: encoded_tweets = self.Word2Vec_enc(x_enc, train_test) # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # Doc2Vec if encoding == 5: encoded_tweets = self.Doc2Vec_enc(x_enc, train_test) # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # GloVe if encoding == 6: encoded_tweets = self.GloVe_enc(x_enc, train_test) # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # 
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # Feature Selection # Format the features from Feature Extraction extra_features = zip( *extra_features ) # * in used to unzip the list, result is transposed rows with columns. Rows changed to number of tweets and columns changed to number of features extra_features = list(extra_features) extra_features = np.array(extra_features) extra_features = extra_features[dataset_index] print("features chosen shape: ", extra_features.shape) with open(print_file, "a") as myfile: # Write above print into output file myfile.write("features chosen shape: " + str(extra_features.shape) + '\n') # Normalize each of the columns of the added features form Feature Selection with open(print_file, "a") as myfile: # Write above print into output file myfile.write("features before normalization: " + str(extra_features) + '\n') if train_test == 1: # Train set # train the normalization self.scaler = MinMaxScaler(feature_range=(0, 1)) self.scaler = self.scaler.fit(extra_features) # normalize the train dataset extra_features = self.scaler.transform(extra_features) if train_test == 0: # Test set # normalize the test dataset extra_features = self.scaler.transform(extra_features) with open(print_file, "a") as myfile: # Write above print into output file myfile.write("features after normalization: " + str(extra_features) + '\n') # Adding features to encoded_tweets print("encoded_tweets before tweets shape: ", encoded_tweets.shape) print("before tweets extra_features shape: ", extra_features.shape) with open(print_file, "a") as myfile: # Write above print into output file myfile.write("encoded_tweets before tweets shape: " + str(encoded_tweets.shape) + '\n' + "before tweets extra_features shape: " + str(extra_features.shape) + '\n' + "before encoded_tweets: " + str(encoded_tweets) + '\n') encoded_tweets = numpy.concatenate((encoded_tweets, extra_features), axis=1) encoded_tweets = np.array(encoded_tweets) print("final encoded_tweets shape: ", encoded_tweets.shape) with open(print_file, "a") as myfile: # Write above print into output file myfile.write("final encoded_tweets shape: " + str(encoded_tweets.shape) + '\n' + "final encoded_tweets: " + str(encoded_tweets) + '\n') # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # Univariate Selection # One-hot-encoding, TF-IDF, Bigrams if feature_selection == 7: encoded_tweets = self.Univariate_Selection(encoded_tweets, y, train_test) # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # Recursive Feature Elimination # One-hot-encoding, TF-IDF, Bigrams if feature_selection == 8: encoded_tweets = self.Recursive_Feature_Elimination( encoded_tweets, y, train_test) # 
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # Principal Component Analysis # One-hot-encoding, TF-IDF, Bigrams if feature_selection == 9: encoded_tweets = self.Principal_Component_Analysis( encoded_tweets, train_test) # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # Truncated SVD (alternative of PCA for TF-IDF) # One-hot-encoding, TF-IDF, Bigrams if feature_selection == 10: encoded_tweets = self.TruncatedSVD(encoded_tweets, train_test) # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # Feature Importance # One-hot-encoding, TF-IDF, Bigrams if feature_selection == 11: encoded_tweets = self.Feature_Importance(encoded_tweets, y, train_test) # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- print("Final encoded_tweets, after feature selection, shape: ", encoded_tweets.shape) with open(print_file, "a") as myfile: # Write above print into output file myfile.write( "Final encoded_tweets, after feature selection, shape: " + str(encoded_tweets.shape) + '\n') return encoded_tweets ############################################################################################################################################### ############################################################################################################################################### # Create a dictionary for one hot encoding and encode with one hot encoding def one_hot_enc(self, x_enc, train_test): encoded_tweets = [] x_enc = list(x_enc) if train_test == 1: # Train set self.onehot_encoder = CountVectorizer(analyzer='word', tokenizer=self.dummy_fun, lowercase=False, binary=True) xenc = [] for x in x_enc: xenc.append(x) encoded_tweets = self.onehot_encoder.fit_transform(xenc) encoded_tweets = encoded_tweets.toarray() vocab = self.onehot_encoder.get_feature_names() print(np.array(vocab).shape) for i in range(0, len(encoded_tweets[0])): if encoded_tweets[0][i] == 1: print("i: ", i, " ", encoded_tweets[0][i], ' = ', vocab[i]) if train_test == 0: # Test set xenc = [] for x in x_enc: xenc.append(x) encoded_tweets = self.onehot_encoder.transform(xenc) encoded_tweets = encoded_tweets.toarray() vocab = self.onehot_encoder.get_feature_names() for i in range(0, len(encoded_tweets[0])): if encoded_tweets[0][i] == 1: print("i: ", i, " ", encoded_tweets[0][i], ' = ', vocab[i]) return encoded_tweets ############################################################################################################################################### ############################################################################################################################################### # TF-IDF def tf_idf(self, x_enc, train_test): encoded_tweets = [] if (train_test == 1): # train self.vectorizer = TfidfVectorizer(lowercase=False, analyzer='word', tokenizer=self.dummy_fun, preprocessor=self.dummy_fun) encoded_tweets = self.vectorizer.fit_transform(x_enc) if (train_test == 0): # test encoded_tweets = self.vectorizer.transform(x_enc) return 
encoded_tweets ############################################################################################################################################### ############################################################################################################################################### def bigrams_enc(self, x_enc, train_test): bigrams = [] # Bi-grams of all tweets # Use the pre-processing done above for y in range(0, len(x_enc)): bigrams.append(list(ngrams(x_enc[y], 2))) encoded_tweets = [] if train_test == 1: # Train set self.onehot_encoder = CountVectorizer(analyzer='word', tokenizer=self.dummy_fun, lowercase=False, binary=True) xenc = [] for x in bigrams: xenc.append(x) encoded_tweets = self.onehot_encoder.fit_transform(xenc) encoded_tweets = encoded_tweets.toarray() vocab = self.onehot_encoder.get_feature_names() for i in range(0, len(encoded_tweets[0])): if encoded_tweets[0][i] == 1: print("i: ", i, " ", encoded_tweets[0][i], ' = ', vocab[i]) if train_test == 0: # Test set xenc = [] for x in bigrams: xenc.append(x) encoded_tweets = self.onehot_encoder.transform(xenc) encoded_tweets = encoded_tweets.toarray() vocab = self.onehot_encoder.get_feature_names() for i in range(0, len(encoded_tweets[0])): if encoded_tweets[0][i] == 1: print("i: ", i, " ", encoded_tweets[0][i], ' = ', vocab[i]) return encoded_tweets ############################################################################################################################################### ############################################################################################################################################### def Word2Vec_enc(self, x_enc, train_test): encoded_tweets = self.labelizeTweets(x_enc, 'TRAIN') vector_size = 100 if train_test == 1: # Train set # sg: CBOW if 0, skip-gram if 1 # 'min_count' is for neglecting infrequent words. # negative (int) - If > 0, negative sampling will be used; the int for negative specifies how many "noise words" should be drawn (usually between 5-20). If set to 0, no negative sampling is used. # window: number of words accounted for in each context (if the window size is 3, the 3 words in the left neighborhood and the 3 words in the right neighborhood are considered) self.model = Word2Vec(size=vector_size, min_count=0, sg=1) self.model.build_vocab([x.words for x in encoded_tweets]) self.model.train([x.words for x in encoded_tweets], total_examples=len(encoded_tweets), epochs=10) self.vectorizer1 = TfidfVectorizer(analyzer=lambda x: x, min_df=7) self.vectorizer1.fit_transform([x.words for x in encoded_tweets]) if train_test == 0: # Test set self.vectorizer1.transform([x.words for x in encoded_tweets]) tfidf = dict( zip(self.vectorizer1.get_feature_names(), self.vectorizer1.idf_)) train_vecs_w2v = np.concatenate([ self.buildWordVector(self.model, tweet, vector_size, tfidf) for tweet in map(lambda x: x.words, encoded_tweets) ]) encoded_tweets = scale(train_vecs_w2v) print(encoded_tweets) return encoded_tweets # Computes the tf-idf weighted mean of the word vectors of a tweet (also serves as the transform step at test time) def buildWordVector(self, model, tweet, size, tfidf): vec = np.zeros(size).reshape((1, size)) count = 0. for word in tweet: try: vec += model[word].reshape((1, size)) * tfidf[word] count += 1. except KeyError: # handling the case where the token is not # in the corpus. useful for testing.
continue if count != 0: vec /= count return vec def labelizeTweets(self, tweets, label_type): LabeledSentence = gensim.models.doc2vec.LabeledSentence labelized = [] for i, v in enumerate(tweets): label = '%s_%s' % (label_type, i) labelized.append(LabeledSentence(v, [label])) return labelized ############################################################################################################################################### ############################################################################################################################################### def Doc2Vec_enc(self, x_enc, train_test): encoded_tweets = self.labelizeTweets(x_enc, 'TRAIN') vector_size = 100 if train_test == 1: # Train set # dm: DBOW if 0, distributed-memory if 1 # window: number of words accounted for in each context (if the window size is 3, the 3 words in the left neighborhood and the 3 words in the right neighborhood are considered) self.modeldoc = Doc2Vec(vector_size=vector_size, min_count=0, dm=0) self.modeldoc.build_vocab([x for x in encoded_tweets]) self.modeldoc.train(utils.shuffle([x for x in encoded_tweets]), total_examples=len(encoded_tweets), epochs=10) # Get the vectors created for each tweet encoded_tweets = np.zeros((len(x_enc), vector_size)) for i in range(0, len(x_enc)): prefix_train_pos = 'TRAIN_' + str(i) encoded_tweets[i] = self.modeldoc.docvecs[prefix_train_pos] if train_test == 0: # Test set encoded_tweets = np.zeros((len(x_enc), vector_size)) for i in range(0, len(x_enc)): encoded_tweets[i] = self.modeldoc.infer_vector(x_enc[i]) return encoded_tweets ############################################################################################################################################### ############################################################################################################################################### def GloVe_enc(self, x_enc, train_test): encoded_tweets = self.labelizeTweets( x_enc, 'TRAIN' ) # Label the tweets for gensim, in the same way as in Word2Vec_enc if train_test == 1: # Train set if not self.called_once: # Used to ensure that loading the GloVe model is done just once self.called_once = True gloveFile = self.dir + '\\GloVe_train\\glove.twitter.27B\\glove.twitter.27B.200d.txt' print("Loading Glove Model") f = open(gloveFile, 'r', encoding="utf8") self.glove_model = {} for line in f: splitLine = line.split() word = splitLine[0] embedding = np.array([float(val) for val in splitLine[1:]]) self.glove_model[word] = embedding self.vectorizer1 = TfidfVectorizer(analyzer=lambda x: x, min_df=7) self.vectorizer1.fit_transform([x.words for x in encoded_tweets]) if train_test == 0: # Test set self.vectorizer1.transform([x.words for x in encoded_tweets]) tfidf = dict( zip(self.vectorizer1.get_feature_names(), self.vectorizer1.idf_)) vector_size = 200 # The vector dimensionality is stated in the name of the GloVe txt file train_vecs_w2v = np.concatenate([ self.buildWordVector(self.glove_model, tweet, vector_size, tfidf) for tweet in map(lambda x: x.words, encoded_tweets) ]) encoded_tweets = scale(train_vecs_w2v) return encoded_tweets ############################################################################################################################################### ############################################################################################################################################### # Feature Selection
############################################################################################################################################### ############################################################################################################################################### def Univariate_Selection(self, x, y, train_test): # Feature Extraction with Univariate Statistical Tests (Chi-squared for classification) features = [] if train_test == 1: # Train set # feature extraction self.test = SelectKBest(score_func=chi2, k=100) features = self.test.fit_transform(x, y) # summarize scores numpy.set_printoptions( precision=3) # Format print to show only 3 decimals of floats if train_test == 0: # Test set features = self.test.transform(x) # summarize scores numpy.set_printoptions( precision=3) # Format print to show only 3 decimals of floats return features def Recursive_Feature_Elimination(self, x, y, train_test): # Feature Extraction with RFE features = [] if train_test == 1: # Train set # feature extraction model = RandomForestClassifier(n_estimators=250, max_features=7, max_depth=30, min_samples_split=2, random_state=0, n_jobs=-1) self.rfe = RFE(model, 100) features = self.rfe.fit_transform(x, y) if train_test == 0: # Test set features = self.rfe.transform(x) return features def Principal_Component_Analysis(self, x, train_test): # Feature Extraction with PCA features = [] if train_test == 1: # Train set # feature extraction self.pca = PCA(n_components=100) features = self.pca.fit_transform(x) if train_test == 0: # Test set features = self.pca.transform(x) return features def TruncatedSVD(self, x, train_test): # Feature Extraction with TruncatedSVD features = [] if train_test == 1: # Train set # feature extraction self.svd = TruncatedSVD(n_components=100) features = self.svd.fit_transform(x) if train_test == 0: # Test set features = self.svd.transform(x) return features def Feature_Importance(self, x, y, train_test): # Feature Importance with Extra Trees Classifier features = [] if train_test == 1: # Train set # feature extraction # Create a random forest classifier with the following Parameters self.sfm = RandomForestClassifier(n_estimators=250, max_features=7, max_depth=30) self.sfm.fit(x, y) # Select features which have higher contribution in the final prediction self.models = SelectFromModel(self.sfm, threshold="9*mean") self.models.fit(x, y) features = self.models.transform(x) if train_test == 0: # Test set features = self.models.transform(x) return features ############################################################################################################################################### ############################################################################################################################################### ############################################################################################################################################################## # Read the training files for task (with emojis) # train_A ############################################################################################################################################################## def readTrain(self): # Read the training file for task A with emojis train_file_A = self.dir + '\\dataset\\train\\SemEval2018-T3-train-taskA_emoji.txt' data_fields = ['id', 'label', 'tweet'] # Define the names of the columns self.train_A = pd.read_csv( train_file_A, sep='\t', header=None, names=data_fields, quoting=3 ) # quoting=3 tells Python to ignore doubled quotes, 
header=None defines that the first line of the file does not contain the column names # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # Pre-processing self.pre_processing() # --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- ############################################################################################################################################################## # Check if the dataset is imbalanced ############################################################################################################################################################## def checkImbalance(self): # Checking if file A with emojis is imbalanced counter0 = 0 counter1 = 0 counter_all = 0 for i in range(0, len(self.train_A)): counter_all += 1 if (self.train_A.iloc[i][1] == 1): counter1 += 1 else: counter0 += 1 print( 'File A with emojis -> Percentage of tweets classified as 0: ' + str((counter0 / counter_all) * 100)) print( 'File A with emojis -> Percentage of tweets classified as 1: ' + str((counter1 / counter_all) * 100) + '\n ----------------------------------------')
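# A minimal standalone sketch of the same imbalance check, assuming a pandas
# DataFrame like train_A above with the irony label in its 'label' column
# (check_imbalance is a hypothetical helper, not part of the original class):
import pandas as pd

def check_imbalance(train_A):
    # value_counts(normalize=True) yields the fraction of rows per label value
    ratios = train_A['label'].value_counts(normalize=True) * 100
    for label, pct in ratios.items():
        print('Percentage of tweets classified as {}: {:.2f}'.format(label, pct))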
# [0] [1] [2] [3] [4] [5] print "**** ", "['ID', 'Year', 'Title', 'Authors', 'Journal name (O)', 'Abstract']" for i in range(5): print " ", node_info[i] IDs = [element[0] for element in node_info] # this holds a vertical list of only the IDs # compute TFIDF vector of each paper corpus = [element[5] for element in node_info] # this holds a vertical list of the Abstracts # vectorizer initializes the TfidfVectorizer() & we can pass more parameters. see webpage in top # stop_words="english": remove common 'english' words # min_df=0: # ngram_range=(1,3): generate 2 and 3 word phrases along with the single words from the corpus # analyzer='word': vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words="english") # each row is a node in the order of node_info # fit_transform(): Learn vocabulary and idf, return term-document matrix. features_TFIDF_Abstract = vectorizer.fit_transform(corpus) # print type(features_TFIDF) | will print <class 'scipy.sparse.csr.csr_matrix'> # https://docs.scipy.org/doc/scipy-0.19.0/reference/generated/scipy.sparse.csr_matrix.html # compute TFIDF vector of each title corpusTitle = [element[2] for element in node_info] # each row is a node in the order of node_info features_TFIDF_Title = vectorizer.fit_transform(corpusTitle) # compute TFIDF vector of each author corpusAuthor = [element[3] for element in node_info] # each row is a node in the order of node_info
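# Hedged sketch (assumes the features_TFIDF_Abstract matrix built above): the
# TF-IDF rows of two papers can be compared with cosine similarity, e.g. as an
# edge feature when predicting citation links. The function name and indices are
# illustrative only.
from sklearn.metrics.pairwise import cosine_similarity

def abstract_cosine(i, j):
    # rows i and j of the sparse matrix are the TF-IDF vectors of papers i and j
    return cosine_similarity(features_TFIDF_Abstract[i], features_TFIDF_Abstract[j])[0, 0]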
def ngrams(text, n): return zip(*[normalize(text).split()[i:] for i in range(n)]) nltk.download('punkt') # if necessary... stemmer = nltk.stem.porter.PorterStemmer() remove_punctuation_map = dict((ord(char), None) for char in string.punctuation) def stem_tokens(tokens): return [stemmer.stem(item) for item in tokens] '''remove punctuation, lowercase, stem''' def normalize(text): return stem_tokens(nltk.word_tokenize(MLStripper.strip_tags(text).lower().translate(remove_punctuation_map))) vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english') def cosine_sim(text1, text2): tfidf = vectorizer.fit_transform([text1, text2]) return ((tfidf * tfidf.T).A)[0,1] print(cosine_sim('a little bird', 'a little bird')) print(cosine_sim('a little bird', 'a little bird chirps')) print(cosine_sim('a little bird', 'a big dog barks')) # print(strip_tags("""<p>Deep clone an {@code Object} using serialization.</p> # # <p>This is many times slower than writing clone methods by hand # on all objects in your object graph. However, for complex object # graphs, or for those that don't support deep cloning this can
def update_figure(n_clicks, episode_slider, word_input, speaker_input): fullmerge = {} data_subset = data[data['episode'] >= episode_slider[0]][data['episode'] <= episode_slider[1]] #Merge all episodes into one long string peeps = list(filter(None,speaker_input.upper().replace(" ","").split(","))) for peep in peeps: fullmerge.update({peep:' '.join(data_subset[data_subset.Speaker == peep]['cleaned'])}) finaldata = pd.DataFrame.from_dict(fullmerge, orient = 'index').reset_index().rename(columns = {'index':'speaker',0:'text'}) totalwords = pd.DataFrame() for peep in peeps: totalwords = totalwords.append(pd.Series([peep, ' ', finaldata[finaldata['speaker'] == peep].text.str.count(' ').iloc[0]]), ignore_index = True) totalwords = totalwords.rename(index=str, columns={0: "speaker", 1: "word",2: "total"}) words = list(filter(None,word_input.lower().split(","))) #Calculate frequency of words in finaldata df = pd.DataFrame() for peep in peeps: for word in words: df = df.append(pd.Series([peep, word, finaldata[finaldata['speaker'] == peep].text.str.count(word).iloc[0]]), ignore_index = True) df = df.rename(index=str, columns={0: "speaker", 1: "word",2: "amount"}) #Calculate rate per 1000 words df = pd.merge(df, totalwords[['speaker','total']], on='speaker') df['Number of times said per 1000 words'] = (df['amount']/df['total'])*1000 #Sort data by rate and then speaker df = df.sort_values(by=['Number of times said per 1000 words','speaker'], ascending = [True,False]) #Graph if len(words) != 1: if len(peeps) != 1: #Apply the tfidf function, and find the most "distinguishing" word among the given words and speakers tfidf = TfidfVectorizer(stop_words='english', vocabulary = words) tfs = tfidf.fit_transform(finaldata['text']) matrix = pd.DataFrame(tfs.todense(), index = peeps, columns = tfidf.get_feature_names()).transpose() matrix['word'] = matrix.index matrix = pd.melt(matrix, id_vars = 'word') matrix = matrix.rename(index=str, columns={'value': "tfidf",'variable': "speaker"}) distWord = matrix.loc[matrix['tfidf'].idxmax()]['word'] distSpeaker = matrix.loc[matrix['tfidf'].idxmax()]['speaker'] tfidfSent = ("Most distinguishing: '" + distWord + "' by " + distSpeaker + ".*") fig = ff.create_facet_grid( df, x='Number of times said per 1000 words', y='word', facet_col='speaker', color_name='speaker', trace_type='bar', orientation = 'h', scales = 'free', width = 1200 ) for i in range(len(peeps)+1): if i == 0: fig.layout.xaxis.update({'range': [df['Number of times said per 1000 words'].min(), (df['Number of times said per 1000 words'].max()+(.15 * df['Number of times said per 1000 words'].max()))]}) else: exec('fig.layout.xaxis' + str(i)+".update({'range': [df['Number of times said per 1000 words'].min(), (df['Number of times said per 1000 words'].max()+(.15 * df['Number of times said per 1000 words'].max()))]})") fig.layout.xaxis.title = tfidfSent fig.layout.update(plot_bgcolor='rgba(230,230,230,90)') elif len(peeps) == 1: fig = ff.create_facet_grid( df, x='word', y='Number of times said per 1000 words', color_name='word', trace_type='bar', scales = 'free', width = 1200 ) fig.layout.update(plot_bgcolor='rgba(230,230,230,90)') elif len(words) == 1: fig = ff.create_facet_grid( df, x='speaker', y='Number of times said per 1000 words', color_name='speaker', trace_type='bar', scales = 'free', width = 1200 ) fig.layout.update(plot_bgcolor='rgba(230,230,230,90)') return { 'data': fig }
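# Standalone sketch of the "most distinguishing word" idea used in the callback
# above (toy speakers, lines and words; all names here are assumptions): restrict
# TfidfVectorizer to a fixed vocabulary and pick the speaker/word cell with the
# highest tf-idf weight.
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

speakers = ['JERRY', 'GEORGE']
texts = ['the parking garage the parking garage', 'the sea was angry that day my friends']
words = ['parking', 'sea']

tfidf = TfidfVectorizer(stop_words='english', vocabulary=words)
tfs = tfidf.fit_transform(texts)
matrix = pd.DataFrame(tfs.todense(), index=speakers, columns=tfidf.get_feature_names())
# stack() gives one value per (speaker, word) pair; idxmax() picks the largest cell
speaker, word = matrix.stack().idxmax()
print("Most distinguishing: '" + word + "' by " + speaker)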
##label vector labels=dat.label labels.head() ####Create training and test data sets x_train,x_test,y_train,y_test=train_test_split(dat['text'].apply(lambda x: np.str_(x)), labels, test_size=0.2, random_state=7) ##Count Vectorizer count_vectorizer = CountVectorizer(stop_words='english') ##Fit and train count vectorizer count_train = count_vectorizer.fit_transform(x_train) count_test = count_vectorizer.transform(x_test) ##TFIDF Vectorizer tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7) ####Fit to tfidf data and transform test and training sets to normalized tfidf vector tfidf_train=tfidf_vectorizer.fit_transform(x_train) tfidf_test=tfidf_vectorizer.transform(x_test) ####Hashing Vectorizer hash_vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False) ###Fit hashing vectorizer to data and transform both test and training sets hash_train = hash_vectorizer.fit_transform(x_train) hash_test = hash_vectorizer.transform(x_test) ###Feature names #print(tfidf_vectorizer.get_feature_names()[-10:])
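##One possible continuation (a sketch only, not necessarily the model the original
##script trains): fit a simple linear classifier on the TF-IDF features from above
##and score it on the held-out split.
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score

clf = PassiveAggressiveClassifier(max_iter=50)
clf.fit(tfidf_train, y_train)
pred = clf.predict(tfidf_test)
print('TF-IDF + PassiveAggressive accuracy:', round(accuracy_score(y_test, pred), 3))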
query = "permanently get rid jock itch" # print(questions) simi = -1 matched_ques = [] for strr1 in questions: xx1 = lemmatized_string(strr1) xx2 = lemmatized_string(query) simi1 = jaccard_similarity(xx1, xx2) if (simi1 > 0.20): matched_ques.append((simi1, strr1)) # if(simi1>simi): # simi=simi1 # matched_ques=strr1 tfidf_vectorizer = TfidfVectorizer() query1 = (" ").join(x for x in lemmatized_string(query)) documents = [] documents.append(query1) for i in questions: i1 = (" ").join(x for x in lemmatized_string(i)) documents.append(i1) documents = tuple(documents) tfidf_matrix = tfidf_vectorizer.fit_transform(documents) cos_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix) xxx = [] xxx.append((query, 1.0)) for x in np.where(cos_sim[0] >= 0.4)[0]:
print(string) #%% #Count Vector count_vect = CountVectorizer() count = count_vect.fit_transform(string) tf_feature_names = count_vect.get_feature_names() print(count) #%% string = map(' '.join, sw) tfidf_vectorizer = TfidfVectorizer() tfidf = tfidf_vectorizer.fit_transform(string) tfidf_feature_names = tfidf_vectorizer.get_feature_names() #%% no_topics = 29 # Coba algoritma NMF nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf) # Coba algoritma LDA lda = LatentDirichletAllocation(n_topics=no_topics,
print(df) # %% [markdown] # Create the feature extraction object # %% codecell vocab_size = 1000 embedding_dim = 64 # Set up some sklearn objects that are going to be in the pipeline # SMOTE for class balancing via oversampling the minority class smt = SMOTE(random_state=12) # TF-IDF Vectorizer: https://www.quora.com/How-does-TfidfVectorizer-work-in-laymans-terms?share=1 # Define the importance of words in the corpus depending on frequency AND "uniqueness" tfidf = TfidfVectorizer(sublinear_tf=True, max_features=vocab_size, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 1), stop_words='english') # %% [markdown] # Separate the data in train validate and test # %% codecell train, validate, test = np.split(df.sample(frac=1), [int(.6 * len(df)), int(.8 * len(df))]) features = tfidf.fit_transform(train.text).toarray() # Binary classification result labels = train.is_asshole X_train, y_train = smt.fit_resample(features, labels) # TF-IDF on test set X_validate = tfidf.transform(validate.text).toarray()
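# %% [markdown]
# Sketch of a possible continuation (assumed, not from the original notebook):
# transform the test split with the already-fitted vectorizer and fit a classifier
# on the SMOTE-balanced training features from above. LogisticRegression is an
# illustrative choice only.
# %% codecell
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

X_test = tfidf.transform(test.text).toarray()
y_validate = validate.is_asshole
y_test = test.is_asshole

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)  # X_train/y_train were oversampled with SMOTE above
print(classification_report(y_validate, clf.predict(X_validate)))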
processed_feature = re.sub(r'\s+[a-zA-z]\s+', ' ', processed_feature) processed_feature = re.sub(r'\^[a-zA-z]\s+', ' ', processed_feature) processed_feature = re.sub(r'\s+', ' ', processed_feature, flags=re.I) processed_feature = re.sub(r'^b\s+', '', processed_feature) processed_feature = processed_feature.lower() processed_features.append(processed_feature) return processed_features processed_features = process(features) # print(processed_features[:10]) # Tfidf vectorizer vectorizer = TfidfVectorizer(max_features=2500, min_df=2, max_df=0.8, stop_words=stopwords.words('english')) processed_features = vectorizer.fit_transform(processed_features).toarray() # split into testing and training sets X_train, X_test, y_train, y_test = train_test_split(processed_features, labels, test_size=0.2, random_state=0) # Naive Bayes' classifier model = GaussianNB() model.fit(X_train, y_train) # Test predictions
(usage[2]*resource.getpagesize())/1000000.0 ) u_start = using('start') data = raw_input("folder containing the folders. Don't add / in the end.\n") close_folder = raw_input("folder to dump output files. Don't add / in the end.\n") categories = [f for f in listdir(data) if not isfile(join(data, f))] #print categories #categories = ["business" ,"sport", 'entertainment', 'tech', 'politics'] dataset = sklearn.datasets.load_files( data + '/', description=None, categories=categories , load_content=True, shuffle=True, encoding = 'utf-8',decode_error='ignore', random_state=179863) true_k = len(categories) #no of groups u_dataRead = using() vectorizer = TfidfVectorizer(max_df=.5, max_features=310 ,min_df=10 , stop_words='english',use_idf=True) dat = vectorizer.fit_transform(dataset.data) dat = dat.toarray() u_Vect = using() factor = int(len(dataset.data) / 5) X = dat[:factor] Y = dat[factor:] km = KMeans(n_clusters=true_k) km.fit(X) cur = Y clusters = defaultdict(list) vishal = defaultdict(list)
return result.strip() ### set X_text and y ### X_text = [] y = [] for intent, intent_data in BOT_CONFIG['intents'].items(): for example in intent_data['examples']: X_text.append(example) y.append(intent) ### Веторизация ### vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3, 3)) X = vectorizer.fit_transform(X_text) vectorizer.get_feature_names() ### Классификация ### clf = LinearSVC(random_state=0) clf.fit(X, y) def get_failure_phrase(): failure_phrases = BOT_CONFIG['failure_phrases'] return random.choice(failure_phrases) def classify_intent(replica):
def tfidf_dimReduced(corpus, log_id="<<id>>", number_features=10000): """ Vectorizes the corpus using the TF-IDF metric and then reduces the dimension of the features to number_features. Vectorizer results are normalized. Since LSA/SVD results are not normalized, we have to redo the normalization. Parameters ---------------- corpus (array): array of text documents. log_id [optional] (string): id to display in the logs. Usually represents the fold. number_features [optional](int): number of features to keep after SVD. Returns ---------------- lsa (object): the learned model. X (object): the matrix of learned features representing the corpus. """ wnl = nltk.stem.PorterStemmer() logging.info(log_id + "Running Stemming...") t0 = time() corporea = [] for doc in corpus: new_doc = [] # removing autism and asd words. doc = doc.lower().replace("autism", "").replace("asd", "") tokens = nltk.word_tokenize(doc) for token in tokens: new_doc.append(wnl.stem(token)) corporea.append(' '.join(new_doc)) logging.info(log_id + "Stemming done in %fs" % (time() - t0)) logging.info(log_id + "Running tfidf...") t0 = time() vectorizer = TfidfVectorizer( max_df=0.5, #max_features=20000, min_df=1, stop_words='english', use_idf=True) logging.info(log_id + "tf-idf done in %fs" % (time() - t0)) logging.info(log_id + "Running SVD Dim Reduction...") t0 = time() svd = TruncatedSVD(number_features) normalizer = Normalizer(copy=False) lsa = make_pipeline(vectorizer, svd, normalizer) X = lsa.fit_transform(corporea) explained_variance = svd.explained_variance_ratio_.sum() logging.info( log_id + "SVD explained variance: {}%".format(int(explained_variance * 100))) logging.info(log_id + "SVD done in %fs" % (time() - t0)) return lsa, X
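# Hypothetical usage of tfidf_dimReduced (toy corpus and parameters only; requires
# the nltk 'punkt' tokenizer models): cluster the documents in the reduced LSA space.
from sklearn.cluster import KMeans

toy_corpus = ["apples and oranges grow on trees", "apples are red and sweet fruit", "bananas are yellow tropical fruit"]
lsa, X = tfidf_dimReduced(toy_corpus, log_id="[demo] ", number_features=2)
km = KMeans(n_clusters=2, random_state=0).fit(X)  # cluster the documents in LSA space
print(km.labels_)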
class LSI(GenericModel): def __init__(self, **kwargs): self._svd_matrix = None self._query_vector = None self.vectorizer = None self.svd_model = None super().__init__() self.similarity_measure = None self.set_basic_params(**kwargs) self.set_vectorizer(**kwargs) self.set_svd_model(**kwargs) def set_name(self, name): super().set_name(name) def set_model_gen_name(self, gen_name): super().set_model_gen_name(gen_name) def set_basic_params(self, **kwargs): self.set_name('LSI' if LSI_Model_Hyperp.NAME.value not in kwargs.keys() else kwargs[LSI_Model_Hyperp.NAME.value]) self.set_similarity_measure(SimilarityMeasure.COSINE) self.set_model_gen_name('lsi') def set_similarity_measure(self, sim_measure): self.similarity_measure = sim_measure def set_vectorizer(self, **kwargs): self.vectorizer = TfidfVectorizer( stop_words='english', use_idf=True, smooth_idf=True ) if LSI_Model_Hyperp.VECTORIZER.value not in kwargs.keys( ) else kwargs[LSI_Model_Hyperp.VECTORIZER.value] vec_params = { key.split('__')[2]: kwargs[key] for key, val in kwargs.items() if '__vectorizer__' in key } self.vectorizer.set_params(**vec_params) def set_svd_model(self, **kwargs): self.svd_model = TruncatedSVD( n_components=100, algorithm='randomized', n_iter=10, random_state=42 ) if LSI_Model_Hyperp.SVD_MODEL.value not in kwargs.keys() else kwargs[ LSI_Model_Hyperp.SVD_MODEL.value] svd_model_params = { key.split('__')[2]: kwargs[key] for key, val in kwargs.items() if '__svd_model__' in key } self.svd_model.set_params(**svd_model_params) def recover_links(self, corpus, query, test_cases_names, bug_reports_names): starttime = time.time() if self.similarity_measure == SimilarityMeasure.COSINE: self._recover_links_cosine(corpus, query, test_cases_names, bug_reports_names) elif self.similarity_measure == SimilarityMeasure.JACCARD_INDEX: self._recover_links_jaccard(corpus, query, test_cases_names, bug_reports_names) elif self.similarity_measure == SimilarityMeasure.EDIT_DISTANCE: self._recover_links_edit(corpus, query, test_cases_names, bug_reports_names) self._record_docs_feats(corpus, query, test_cases_names, bug_reports_names) endtime = time.time() print( f' ..Total processing time: {round(endtime-starttime, 2)} seconds', ) def _record_docs_feats(self, corpus, query, test_cases_names, bug_reports_names): self.mrw_tcs = self._recover_mrw_list(test_cases_names, corpus) self.mrw_brs = self._recover_mrw_list(bug_reports_names, query) self.dl_tcs = self._recover_dl_list(test_cases_names, corpus) self.dl_brs = self._recover_dl_list(bug_reports_names, query) index = list(test_cases_names) + list(bug_reports_names) self.docs_feats_df = pd.DataFrame(index=index, columns=['mrw', 'dl']) for tc_name, mrw in self.mrw_tcs: self.docs_feats_df.at[tc_name, 'mrw'] = mrw for tc_name, dl in self.dl_tcs: self.docs_feats_df.at[tc_name, 'dl'] = dl for br_name, mrw in self.mrw_brs: self.docs_feats_df.at[br_name, 'mrw'] = mrw for br_name, dl in self.dl_brs: self.docs_feats_df.at[br_name, 'dl'] = dl def _recover_dl_list(self, artf_names, artf_descs): tokenizer = WordNetBased_LemmaTokenizer() dl_list = [] for artf_name, artf_desc in zip(artf_names, artf_descs): dl_list.append((artf_name, len(tokenizer.__call__(artf_desc)))) return dl_list def _recover_mrw_list(self, artf_names, artf_descs): N_REL_WORDS = 6 mrw_list = [] # list of tuples (artf_name, mrw_list={}) for artf_name, artf_desc in zip(artf_names, artf_descs): X = self.vectorizer.transform([artf_desc]) df1 = pd.DataFrame(X.T.toarray()) df1['token'] = self.vectorizer.get_feature_names() df1.sort_values(by=0, 
ascending=False, inplace=True) mrw = list(df1.iloc[0:N_REL_WORDS, 1].values) mrw_list.append((artf_name, mrw)) return mrw_list def _recover_links_cosine(self, corpus, query, test_cases_names, bug_reports_names): svd_transformer = Pipeline([('vec', self.vectorizer), ('svd', self.svd_model)]) self._svd_matrix = svd_transformer.fit_transform(corpus) self._query_vector = svd_transformer.transform(query) self._sim_matrix = pairwise.cosine_similarity(X=self._svd_matrix, Y=self._query_vector) #self._sim_matrix = super().normalize_sim_matrix(self._sim_matrix) self._sim_matrix = pd.DataFrame(data=self._sim_matrix, index=test_cases_names, columns=bug_reports_names) def _recover_links_jaccard(self, corpus, query, test_cases_names, bug_reports_names): tokenizer = self.vectorizer.tokenizer corpus_tokens = [tokenizer.__call__(doc) for doc in corpus] query_tokens = [tokenizer.__call__(doc) for doc in query] self._sim_matrix = pd.DataFrame(index=test_cases_names, columns=bug_reports_names, data=np.zeros( shape=(len(test_cases_names), len(bug_reports_names)), dtype='int8')) for br_id, doc_query_tset in zip(bug_reports_names, query_tokens): for tc_id, doc_corpus_tset in zip(test_cases_names, corpus_tokens): self._sim_matrix.at[tc_id, br_id] = nltk.jaccard_distance( set(doc_corpus_tset), set(doc_query_tset)) def _recover_links_edit(self, corpus, query, test_cases_names, bug_reports_names): self._sim_matrix = pd.DataFrame(index=test_cases_names, columns=bug_reports_names, data=np.zeros( shape=(len(test_cases_names), len(bug_reports_names)), dtype='int8')) for br_id, doc_query in zip(bug_reports_names, query): for tc_id, doc_corpus in zip(test_cases_names, corpus): self._sim_matrix.at[tc_id, br_id] = nltk.edit_distance( doc_corpus, doc_query) normalizer = Normalizer(copy=False).fit(self._sim_matrix.values) self._sim_matrix = pd.DataFrame(data=normalizer.transform( self._sim_matrix.values), index=test_cases_names, columns=bug_reports_names) def model_setup(self): return { "Setup": [{ "Name": self.get_name() }, { "Similarity Measure": self.get_similarity_measure() }, { "SVD Model": self.svd_model.get_params() }, { "Vectorizer": self.vectorizer.get_params() }, { "Vectorizer Type": type(self.vectorizer) }] } def get_query_vector(self): return self._query_vector def get_svd_matrix(self): return self._svd_matrix def get_vectorizer_type(self): return type(self.vectorizer) def get_tokenizer_type(self): return type(self.vectorizer.tokenizer) def get_name(self): return super().get_name() def get_model_gen_name(self): return super().get_model_gen_name() def get_similarity_measure(self): return self.similarity_measure def get_sim_matrix(self): return super().get_sim_matrix() def save_sim_matrix(self): super().save_sim_matrix()
def buildTfIdf(self): self.tfIdf = TfidfVectorizer(max_features=1000, min_df=3, max_df=0.7) self.vectors = self.tfIdf.fit_transform(self.documentsDf['content'])
# In[5]: # sentence pair #for c in range(len(corpus)): # corpus[c] = pre_process(corpus[c]) # corpus[c] = lemmatize_sentence(corpus[c]) # print(corpus[c]) # In[6]: # creating vocabulary using uni-gram and bi-gram tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2)) tfidf_vectorizer.fit(corpus) # In[38]: # Importing the two csv files as dataframes (the original and the modified one) df1 = pandas.read_csv('LongestDf.csv') df2 = pandas.read_csv('Frames_caption.csv') # In[39]:
class TextProcessor(object): def __init__(self, repo): self.repo = repo def loadAllDocumets(self): documents = self.repo.search('', False) documents = [(self.preProcess(d.description), 'Advertiser-friendly' if d.isAdvertizerFriendly == '\x01' else 'Not suitable for ads') for d in documents] self.documentsDf = pd.DataFrame(documents, columns = ['content', 'label']) def preProcess(self, text): #normalize case result = text.lower() #remove html tags result = re.sub('<.*?>', '', result) #remove links result = re.sub('(www|http)\S+', '', result) #remove words with numbers in them result = re.sub(r'\w*\d\w*', '', result) #remove punctuation and special characters from words #(but leave apostrophre as it will be easier to remove stopwords) whitelist = set("abcdefghijklmnopqrstuvwxyz '") result = ''.join(filter(whitelist.__contains__, result)) words = result.split() #remove stop-words stop_words = stopwords.words('english') words = [w for w in words if not w in stop_words] #stem words porter = PorterStemmer() words = [porter.stem(w) for w in words] return ' '.join(words) def analyze(self, textToAnalyze): self.loadAllDocumets() self.encoder = LabelEncoder() self.documentsDf['label'] = self.encoder.fit_transform(self.documentsDf['label']) self.buildTfIdf() preProcessed = self.preProcess(textToAnalyze) prediction = self.predict(preProcessed).tolist()[0] scores = self.get_scores(self.tfIdf, self.vectors) index = prediction.index(max(prediction)) label = self.encoder.inverse_transform([index])[0] return ProcessingResult (preProcessed, scores, label, max(prediction)) def buildTfIdf(self): self.tfIdf = TfidfVectorizer(max_features=1000, min_df=3, max_df=0.7) self.vectors = self.tfIdf.fit_transform(self.documentsDf['content']) def predict(self, testString): naive = MultinomialNB() naive.fit(self.vectors, self.documentsDf['label']) testDocumentDf = pd.DataFrame([(testString)], columns = ['content']) testData = self.tfIdf.transform(testDocumentDf['content']) prediction = naive.predict_proba(testData) return prediction def get_scores(self, vectorizer, vectors): scores = zip(vectorizer.get_feature_names(), np.asarray(vectors.sum(axis=0)).ravel()) sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True) formatted_scores = ["{}: {}".format(item[0], item[1]) for item in sorted_scores[:200]] return formatted_scores def getMetrics(self): self.loadAllDocumets() self.encoder = LabelEncoder() self.documentsDf['label'] = self.encoder.fit_transform(self.documentsDf['label']) Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(self.documentsDf['content'],self.documentsDf['label'],test_size=0.3) Tfidf_vect = TfidfVectorizer(max_features=5000) Tfidf_vect.fit(self.documentsDf['content']) Train_X_Tfidf = Tfidf_vect.transform(Train_X) Test_X_Tfidf = Tfidf_vect.transform(Test_X) # fit the training dataset on the NB classifier Naive = MultinomialNB() Naive.fit(Train_X_Tfidf,Train_Y) # predict the labels on validation dataset predictions_NB = Naive.predict(Test_X_Tfidf) # Use accuracy_score function to get the accuracy acuracy = accuracy_score(predictions_NB, Test_Y) recall = recall_score(predictions_NB, Test_Y) roc_auc = roc_auc_score(predictions_NB, Test_Y) precision = average_precision_score(predictions_NB, Test_Y) f1 = f1_score(predictions_NB, Test_Y) return Metrics(acuracy, recall, roc_auc, precision, f1) def buildRocCurve(self): self.loadAllDocumets() self.encoder = LabelEncoder() self.documentsDf['label'] = self.encoder.fit_transform(self.documentsDf['label']) Train_X, Test_X, Train_Y, Test_Y = 
model_selection.train_test_split(self.documentsDf['content'],self.documentsDf['label'],test_size=0.3) Tfidf_vect = TfidfVectorizer(max_features=5000) Tfidf_vect.fit(self.documentsDf['content']) Train_X_Tfidf = Tfidf_vect.transform(Train_X) Test_X_Tfidf = Tfidf_vect.transform(Test_X) # fit the training dataset on the NB classifier Naive = MultinomialNB() Naive.fit(Train_X_Tfidf,Train_Y) # predict the labels on validation dataset predictions_NB = Naive.predict(Test_X_Tfidf) # Compute fpr, tpr, thresholds and roc auc fpr, tpr, thresholds = roc_curve(predictions_NB, Test_Y) roc_auc = roc_auc_score(predictions_NB, Test_Y) # Plot ROC curve plt.plot(fpr, tpr, label='ROC curve (area = %0.3f)' % roc_auc) plt.plot([0, 1], [0, 1], 'k--') # random predictions curve plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.0]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('Receiver Operating Characteristic') plt.legend(loc="lower right") #plt.show() labels = [0,1] cm = confusion_matrix(predictions_NB, Test_Y, labels) print(cm) fig = plt.figure() ax = fig.add_subplot(111) cax = ax.matshow(cm) for i in range(len(cm)): for j in range(len(cm[0])): c = cm[j,i] ax.text(i, j, str(c), va='center', ha='center', bbox=dict(boxstyle='round', facecolor='white', edgecolor='0.3')) plt.title('Confusion matrix of the classifier') fig.colorbar(cax) ax.set_xticklabels([''] + ['Not suitable', 'Ad-friendly']) ax.set_yticklabels([''] + ['Not suitable', 'Ad-friendly']) plt.xlabel('Predicted') plt.ylabel('True') plt.show() def buildCompressionChart(self): documents = self.repo.search('', False) processedDocuments = [self.preProcess(d.description) for d in documents] compressionResults = [None] * len(documents) for i in range(len(documents)): words1 = len(re.findall(r'\w+', documents[i].description)) words2 = len(re.findall(r'\w+', processedDocuments[i])) compressionResults[i] = CompressionResult(words1, len(documents[i].description), words2, len(processedDocuments[i])) plt.hist([r.wordsCompressionRate for r in compressionResults], bins=25, histtype='bar', ec='black') plt.title('Compression Rate (Words)') plt.xlabel('Compression Rate, %') plt.ylabel('Number of Documents') fig = plt.figure() plt.hist([r.charsCompressionRate for r in compressionResults], bins=25, histtype='bar', ec='black') plt.title('Compression Rate (Characters)') plt.xlabel('Compression Rate, %') plt.ylabel('Number of Documents') plt.show()
# Library from preprocess_data import read_predata from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity, linear_kernel # count = CountVectorizer(analyzer='word', stop_words='english') score = TfidfVectorizer(analyzer='word', stop_words='english') # movies_matrix = count.fit_transform(read_predata()['list_bag']) movies_matrix = score.fit_transform(read_predata()['list_bag']) # cosine_sim = cosine_similarity(movies_matrix) cosine_sim = linear_kernel(movies_matrix, movies_matrix) def get_title_from_index(index): return read_predata()[read_predata()['Unnamed: 0'] == index]["movie"].values[0] def get_index_from_title(title): return read_predata()[read_predata()["movie"] == title]["Unnamed: 0"].values[0] def recommendations(movie_user_likes): movie_index = get_index_from_title(movie_user_likes) similar_movies = list(enumerate(cosine_sim[movie_index])) sorted_similar_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)[0:6]
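# A possible continuation (sketch, assuming the helpers defined above): map the
# sorted (index, similarity) pairs back to movie titles, skipping position 0,
# which is the queried movie itself. print_recommendations is an illustrative name.
def print_recommendations(movie_user_likes):
    movie_index = get_index_from_title(movie_user_likes)
    similar_movies = sorted(enumerate(cosine_sim[movie_index]), key=lambda x: x[1], reverse=True)[1:6]
    for idx, score in similar_movies:
        print(get_title_from_index(idx), round(score, 3))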