def __init__(self, **kwargs):
    self.vectorizer = text.TfidfVectorizer(**kwargs)
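# A minimal sketch (not from the original source) of how a wrapper class built
# around this __init__ might delegate to the underlying vectorizer; the class
# name and method names here are assumptions for illustration only.
from sklearn.feature_extraction import text

class TfidfWrapper:
    def __init__(self, **kwargs):
        self.vectorizer = text.TfidfVectorizer(**kwargs)

    def fit(self, docs):
        self.vectorizer.fit(docs)
        return self

    def transform(self, docs):
        return self.vectorizer.transform(docs)

# Usage sketch:
# wrapper = TfidfWrapper(stop_words="english").fit(["a small corpus", "of two docs"])
# X = wrapper.transform(["a new doc"])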
# In[30]:
skip_gram = SGHS('/DATA/1_DataCache/FinCorpus/skip_gram.model')

# In[33]:
skip_gram.key_words(['保险'])

# In[34]:
from sklearn.feature_extraction import text

# In[37]:
l = text.TfidfVectorizer()

# In[42]:
skip_gram.model.wv.most_similar('保险')

# In[27]:
skip_gram = SGHS(skip_gram)

# In[ ]:
skip_gram.key_words(ji)

# In[6]:
def detect_topic(instances, labels, sentence, ndim=5, n_gram_range=(1, 1), n_max_features=None):
    highlight_word = ""
    svd_model = TruncatedSVD(n_components=ndim, algorithm='randomized',
                             n_iter=10, random_state=42)
    preprocessor = TokenHandler.TrTokenHandler(stopword=True, more_stopwords=None,
                                               stemming=True, remove_numbers=True,
                                               deasciify=False, remove_punkt=True)
    tfidf_vectorizer = txtfeatext.TfidfVectorizer(tokenizer=preprocessor,
                                                  ngram_range=n_gram_range,
                                                  max_features=n_max_features)
    svd_transformer = skpipeline.Pipeline([('vectorizer', tfidf_vectorizer),
                                           # ('normalizer', skprep.Normalizer()),
                                           ('scaler', skprep.StandardScaler(with_mean=False)),
                                           ('svd', svd_model)])
    docmatrix = svd_transformer.fit_transform(instances)

    input_ = preprocessor(sentence)
    if len(input_) < 1 or len("".join(input_)) < 1:
        return highlight_word

    inputmatrix = svd_transformer.transform(input_)
    termmatrix = svd_model.components_.T
    print(termmatrix.shape)
    print(inputmatrix.shape)
    print(docmatrix.shape)

    # Closest docs
    # @TODO different similarity metrics
    docsim, docindices = list_utils.matrix_similarity(inputmatrix, docmatrix, top_N=10)
    for i, w in enumerate(input_):
        print(w)
        sim_docs = [labels[j] for j in docindices[i]]
        print("most similar docs: ", ", ".join(sim_docs))
        sim_vals = docsim[i]
        print(sim_vals)
        print()

    # Closest terms -> the input word which has the largest similarity value
    termsim, termindices = list_utils.matrix_similarity(inputmatrix, termmatrix, top_N=10)
    allterms = tfidf_vectorizer.get_feature_names()
    for i, w in enumerate(input_):
        print(w)
        sim_terms = [allterms[j] for j in termindices[i]]
        print("most similar terms: ", ", ".join(sim_terms))
        sim_vals = termsim[i]
        print(sim_vals)
        print(sum(sim_vals))

    # The heaviest term
    similarity_threshold = 0.0  # @TODO should be inferred from the data_matrix
    total_termsim_per_instance = np.sum(termsim, axis=1)
    max_sim = total_termsim_per_instance.max()
    max_index = total_termsim_per_instance.argmax()
    # print("max -> ", input_[max_index], " : ", max_sim)
    if max_sim <= similarity_threshold:
        return highlight_word
    highlight_word = input_[max_index]
    return highlight_word
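# A hedged usage sketch for detect_topic: the corpus, labels, and sentence
# below are invented placeholders, and TokenHandler / list_utils are external
# helpers assumed to be importable in the original project.
# instances = ["birinci belge metni", "ikinci belge metni"]
# labels = ["doc-1", "doc-2"]
# word = detect_topic(instances, labels, "ornek cumle", ndim=2)
# print("highlight:", word)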
    test_size=0.25, random_state=53)
print([(len(data), type(data)) for data in [X_train, X_test, y_train, y_test]])

#==============================================================================
# LA: Linear SVC (l2 regularized, l2-loss, dual optimization)
# Preprocessing knobs: min_df(0.02, 0.033, 0.04, 0.05), max_df(0.95)
# ngram_range: (1, 1), (1, 2)
# use_idf: True(tf-idf), False(tf)
# binary: False, True(use_idf=False, norm=None)
# LA knobs: C: {0.0001, 0.01, 0.1, 1.0, 10.0, 100.0, 500.0, 700.0, 1000.0}
# loss: l1, l2
# CV: stratified 10-fold
# stratified shuffle split, n_iter = 10, test_size = 0.25
#==============================================================================
tfidf = text.TfidfVectorizer()
tfidf.set_params(analyzer="word", max_df=0.95, ngram_range=(1, 1), use_idf=True)

svc = svm.LinearSVC()
svc.set_params(verbose=1, loss="l2", random_state=53)

pip_svm = pipes.Pipeline([("tfidf", tfidf), ("lalg", svc)])

parameter_grid = [{
    "tfidf__min_df": [7, 0.02, 0.033, 0.04, 0.05],
    # "tfidf__ngram_range": [(1, 1), (1, 2)],
    # "tfidf__use_idf": [True, False],
    # "tfidf__binary": [False, True],
    "lalg__C": [0.0001, 0.01, 0.1, 1.0, 10.0, 100.0, 500.0, 700.0, 1000.0],
    # "lalg__loss": ["l2", "l1"]
}]
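# A hedged sketch (not in the original) of how parameter_grid could be searched;
# the scoring choice is an assumption, and the stratified 10-fold CV follows the
# comment block above.
# from sklearn import model_selection
# searcher = model_selection.GridSearchCV(pip_svm, parameter_grid,
#                                         cv=model_selection.StratifiedKFold(n_splits=10),
#                                         scoring="accuracy", n_jobs=-1)
# searcher.fit(X_train, y_train)
# print(searcher.best_params_, searcher.best_score_)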
if args.ngrams > 0:
    ngram_range = (1, args.ngrams)
else:
    ngram_range = None

# Verify that the hyperparameter values are valid.
assert n_estimators > 0
assert min_child_samples > 1
assert type(ngram_range) is tuple and len(ngram_range) == 2
assert ngram_range[0] > 0 and ngram_range[0] <= ngram_range[1]

# Define the pipeline that featurizes the text columns.
featurization = [
    (column,
     make_pipeline(ItemSelector(column),
                   text.TfidfVectorizer(ngram_range=ngram_range)))
    for column in feature_columns
]
features = FeatureUnion(featurization)

# Define the estimator that learns how to classify duplicate-original question pairs.
estimator = lgb.LGBMClassifier(n_estimators=n_estimators,
                               min_child_samples=min_child_samples,
                               verbose=args.verbose)

# Define the model pipeline as feeding the features into the estimator.
model = Pipeline([('features', features), ('model', estimator)])

# Fit the model.
print('Training...')
model.fit(train_X, train_y, model__sample_weight=sample_weight)
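# ItemSelector is referenced above but not defined in this snippet; a minimal
# sketch of a compatible transformer (an assumption, not the original class)
# would pick one column out of a DataFrame for the downstream vectorizer.
from sklearn.base import BaseEstimator, TransformerMixin

class ItemSelectorSketch(BaseEstimator, TransformerMixin):
    """Select a single column from a pandas DataFrame."""
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]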
job_keyword = data["api_data"]["job_keywords"][0]
job_keyword = job_keyword.replace(' ', '_')
tags.append(job_keyword)
# break

import sklearn.preprocessing as preprocessing
import sklearn.feature_extraction.text as text

encoder = preprocessing.LabelEncoder()
y = encoder.fit_transform(tags)  # all the tags that will be given labels
encoder.classes_

# Note: the corpus must go to fit_transform, not the constructor (the first
# positional argument of TfidfVectorizer is `input`, not the data).
tfidf = text.TfidfVectorizer(stop_words="english", max_features=5000)
tfidf_matrix = tfidf.fit_transform(corpus)
tfidf_matrix.shape

import sklearn.model_selection as model_selection

x_train, x_test, y_train, y_test = model_selection.train_test_split(
    tfidf_matrix, y, test_size=0.20, random_state=42)

from keras.layers import Conv2D, MaxPool2D, Flatten
from keras.layers import Dense, Activation
from keras.models import Sequential
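# A hedged sketch (an assumption, not the original model) of a small dense
# network on the TF-IDF features; the layer sizes and epoch count are
# illustrative only.
# n_classes = len(encoder.classes_)
# model = Sequential([
#     Dense(256, activation='relu', input_shape=(x_train.shape[1],)),
#     Dense(n_classes, activation='softmax'),
# ])
# model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
#               metrics=['accuracy'])
# model.fit(x_train.toarray(), y_train, epochs=5, validation_split=0.1)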
# Get Kanye's Tweets
kanyeTweets = getTweets('kanyewest')
# Get Elon's Tweets
elonTweets = getTweets('elonmusk')

# ******************* Begin Setup for Logistic Regression Model *******************
kanyes = ' '.join(kanyeTweets)  # For the Tfidf - long string of every Kanye tweet joined by spaces
elons = ' '.join(elonTweets)    # For the Tfidf - long string of every Elon tweet joined by spaces
total = [kanyes, elons]

vector = sk.TfidfVectorizer()  # Create a Tfidf Vectorizer object - used to compare the importance of each word tweeted by Elon and Kanye
vector.fit(total)

# For the regression model, pair each tweet with its label (0 = Kanye, 1 = Elon).
# Note: the original used [0] * (len(kanyeTweets) - 1), which silently dropped
# the last tweet of each list; the label list must match the tweet list length.
kanyeTweets = list(zip(kanyeTweets, [0] * len(kanyeTweets)))
elonTweets = list(zip(elonTweets, [1] * len(elonTweets)))

df = pandas.DataFrame(kanyeTweets + elonTweets,
                      columns=['Text', 'Target'])  # DataFrame containing each tweet and its 0/1 label

# Make a dataframe specific to Kanye and Elon, will be used to send specific tweets from each public figure to the model
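# A hedged sketch (not from the original) of the logistic regression step the
# setup above points toward; the split parameters are assumptions.
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import train_test_split
# X = vector.transform(df['Text'])
# X_train, X_test, y_train, y_test = train_test_split(X, df['Target'], test_size=0.2)
# clf = LogisticRegression().fit(X_train, y_train)
# print("accuracy:", clf.score(X_test, y_test))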
def main(config):
    """
    Function to run the training process

    :param config: SimpleNamespace config object
    """
    train_df = pd.read_csv(config.train_filepath, usecols=config.data["columns"])
    train_df.rename(columns=config.data["names"], inplace=True)
    test_df = pd.read_csv(config.test_filepath, usecols=config.data["columns"])
    test_df.rename(columns=config.data["names"], inplace=True)

    print(f"Train labels distribution:\n{train_df.loc[:, 'labels'].value_counts()}")
    print(f"Test labels distribution:\n{test_df.loc[:, 'labels'].value_counts()}\n")

    # Create validation folds
    train_df["fold"] = -1
    skf = model_selection.StratifiedKFold(n_splits=config.num_folds)
    for fold, (train_index, val_index) in enumerate(
            skf.split(train_df, train_df.loc[:, "labels"])):
        train_df.loc[val_index, "fold"] = fold

    # Iterate folds and train models
    model_list = []
    folds = np.unique(train_df.loc[:, "fold"].values)
    print("\n~~~~Running training and cross validation~~~~\n")
    for fold in folds:
        start_time = time.perf_counter()
        train_fold = train_df.loc[train_df.loc[:, "fold"] != fold, :].reset_index(drop=True)
        val_fold = train_df.loc[train_df.loc[:, "fold"] == fold, :].reset_index(drop=True)

        text_encoder = text.TfidfVectorizer(tokenizer=word_tokenize)
        clf = naive_bayes.MultinomialNB()
        model = pipeline.Pipeline([('text_enc', text_encoder), ('clf', clf)])
        model.fit(train_fold.loc[:, "feature"].values,
                  train_fold.loc[:, "labels"].values)

        # Evaluate ROC AUC
        val_probs = model.predict_proba(val_fold.loc[:, "feature"].values)
        roc_auc_score = metrics.roc_auc_score(val_fold.loc[:, "labels"].values,
                                              val_probs[:, 1])

        # Calculate accuracy
        val_tags = model.predict(val_fold.loc[:, "feature"].values)
        accuracy_score = metrics.accuracy_score(val_fold.loc[:, "labels"].values,
                                                val_tags)

        # Append to the list to evaluate on test
        model_list.append(model)
        print(f"fold: {fold}, roc auc score: {roc_auc_score:.2f}, "
              f"accuracy_score: {accuracy_score:.2f}, "
              f"fold time: {(time.perf_counter() - start_time):.2f} seconds")

    # Run evaluation on the held-out test set
    print("\n~~~~Running evaluation on held out test data~~~~\n")
    running_score, running_accuracy = 0, 0
    for fold, model in enumerate(model_list):
        test_text = test_df.loc[:, "feature"].values
        test_class = test_df.loc[:, "labels"].values
        label_probs = model.predict_proba(test_text)[:, 1]
        label_tags = model.predict(test_text)

        # Calculate accuracy
        accuracy_score = metrics.accuracy_score(test_class, label_tags)
        running_accuracy += accuracy_score
        roc_auc_score = metrics.roc_auc_score(test_class, label_probs)
        running_score += roc_auc_score
        print(f"FOLD: {fold}, auc_roc_score on held out test: {roc_auc_score}, "
              f"accuracy: {accuracy_score}")

    print(f"Average auc_roc_score across all folds on test data is: "
          f"{running_score / len(model_list)}")
    print(f"Average accuracy across all folds on test data is: "
          f"{running_accuracy / len(model_list)}")
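# A hedged sketch of what the SimpleNamespace config consumed by main() might
# look like; the field names follow the function body above, but the values
# are placeholders, not the original settings.
# from types import SimpleNamespace
# config = SimpleNamespace(
#     train_filepath="train.csv",
#     test_filepath="test.csv",
#     data={"columns": ["text", "label"],
#           "names": {"text": "feature", "label": "labels"}},
#     num_folds=5,
# )
# main(config)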
    lambda x: re.sub("[a-zA-Z0-9():\-_ \'\.\/]", "", x))
print(orig_train2.shape)
print(orig_test2.shape)

# TFIDF on those specific chars
tfidf = text.TfidfVectorizer(
    input='content', encoding='utf-8', decode_error='strict',
    strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None,
    analyzer='char',
    # stop_words=[chr(x) for x in range(97, 123)] + [chr(x) for x in range(65, 91)] + ['_', '.', ':'],
    token_pattern='(?u)\\b\\w\\w+\\b', ngram_range=(1, 1),
    max_df=1.0, min_df=1, max_features=None, vocabulary=None,
    binary=True, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)
orig_train2 = tfidf.fit_transform(orig_train2)

# Simple naive bayes on the text features
model = naive_bayes.BernoulliNB()
model.fit(orig_train2, orig_target)
if process_all:
    load_grammar = True
else:
    load_grammar = False

if load_grammar:
    grammar_processed_data = joblib.load('grammar-processed.pkl')
else:
    grammar_processed_data = grammar_processor.fit_transform(loaded_data, train.target)
processed_test_data = grammar_processed_data

if process_all:
    joblib.dump(grammar_processed_data, 'grammar-processed.pkl')

if False:
    text_processor = text.TfidfVectorizer(stop_words='english')
    processed_train_data = text_processor.fit_transform(train.data, train.target)
    # Cheating, but what we did with eval_auto_classifier.py
    processed_test_data = processed_train_data
    # Correct
    # processed_test_data = text_processor.fit_transform(train.test)

test = train

def eval_model(name, model, data):
    print '=' * 20
    print name, 'training'
    model.fit(data, train.target, sample_weight=sample_weights)
    print name, 'trained'
business = pd.read_json('business.json', lines=True)
business.drop(['address', 'city', 'state', 'postal_code', 'latitude', 'longitude',
               'review_count', 'is_open', 'attributes', 'categories', 'hours'],
              axis=1, inplace=True)

df_review_agg = df.groupby('business_id')['text'].sum()
df_ready_for_sklearn = pd.DataFrame({'business_id': df_review_agg.index,
                                     'all_reviews': df_review_agg.values})
df3 = pd.merge(df_ready_for_sklearn, business, on="business_id")
df3.drop(['business_id', 'name'], axis=1, inplace=True)
# df3 just contains reviews and the stars associated with each business

# model stuff
vectorizer = sk_text.CountVectorizer(min_df=1)
corpus = df3['all_reviews']
vectorizer = sk_text.TfidfVectorizer(max_features=5000, min_df=5)
matrix = vectorizer.fit_transform(corpus)

# x = tfidf_data
x = matrix.toarray()
y = df3.iloc[:, 1].values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.25,
                                                    random_state=42)

# The first hidden layer has to have an input matching the dimension of a row in x
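# A hedged sketch (an assumption, not the original network) of the kind of
# model the comment above implies; layer sizes and epochs are placeholders.
# from keras.models import Sequential
# from keras.layers import Dense
# model = Sequential([
#     Dense(64, activation='relu', input_dim=x_train.shape[1]),
#     Dense(1),  # predict the star rating as a regression target
# ])
# model.compile(optimizer='adam', loss='mse')
# model.fit(x_train, y_train, epochs=10, validation_data=(x_test, y_test))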
def _get_vectorizers(self, cols_params):
    return [(col, text.TfidfVectorizer(**params))
            for col, params in cols_params]
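# A hedged usage sketch for _get_vectorizers; the column names and parameter
# dicts below are invented for illustration.
# cols_params = [("title", {"ngram_range": (1, 2)}),
#                ("body", {"stop_words": "english", "max_features": 10000})]
# vectorizers = self._get_vectorizers(cols_params)
# # Each (column, vectorizer) pair can then feed a FeatureUnion or ColumnTransformer.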
def generate_tfidf(text_corpora):
    vectorizer = tf.TfidfVectorizer(lowercase=False)
    vectorizer.fit(text_corpora)
    vector = vectorizer.transform(text_corpora)
    return vector
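# A hedged usage sketch for generate_tfidf; the alias `tf` is assumed to be
# sklearn.feature_extraction.text, as the call above implies.
# import sklearn.feature_extraction.text as tf
# corpus = ["the quick brown Fox", "jumped over the lazy Dog"]
# vectors = generate_tfidf(corpus)
# print(vectors.shape)  # one row per document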
processedText = processedText.str.lower()

### Stop word removal ###
stop_words = nltk.corpus.stopwords.words('english')
processedText = processedText.apply(lambda x: ' '.join(
    term for term in x.split() if term not in set(stop_words)))

### STEMMING ###
porter = nltk.PorterStemmer()
processedText = processedText.apply(
    lambda x: ' '.join(porter.stem(term) for term in x.split()))

## Feature Extraction: TF-IDF Vectorizer
vectorizer = text.TfidfVectorizer(ngram_range=(1, 1))
X_ngrams = vectorizer.fit_transform(processedText)

X_train, X_test, y_train, y_test = train_test_split(X_ngrams, encodedLabels,
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=encodedLabels)

### CLASSIFICATION ###
# 1. Support Vector Machine
clf = svm.LinearSVC(loss='hinge')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
score_svm = metrics.f1_score(y_test, y_pred)
import os
import pandas as pd
import re

base_dir = "E:\\Kaggle\\ted-talks"
os.chdir(base_dir)
transcripts = pd.read_csv("transcripts.csv")
transcripts['title'] = transcripts['url'].map(lambda x: x.split("/")[-1])

## Extract key words using tfidf
pattern = re.compile(r'\d+')
transcripts['transcript'] = transcripts['transcript'].map(
    lambda x: re.sub(pattern, "", x))

from sklearn.feature_extraction import text

Text = transcripts['transcript'].tolist()
# Note: the corpus goes to fit_transform; TfidfVectorizer's `input` parameter
# only selects between 'content', 'filename', and 'file'.
tfidf = text.TfidfVectorizer(stop_words="english")
matrix = tfidf.fit_transform(Text)
matrix.shape

def get_imp_terms(x):
    x = x.todense()
    x = x.tolist()[0]
    x = pd.Series(x, index=tfidf.get_feature_names())
    x = x.sort_values(ascending=False)
    return x.head(4).index.tolist()

transcripts['imp_terms'] = [get_imp_terms(x) for x in matrix]
transcripts['imp_terms_tfidf'] = transcripts['imp_terms'].map(
    lambda x: ",".join(x))
print("Reduced vocabulary to %i words" % (len(vocabulary.vocab_reduced)), flush=True) vocabulary.write_voc_reduced(out_vocab_text) with open(out_vocab_pickle, 'wb') as f: print("Pickling reduced vocabulary to", out_vocab_pickle, flush=True) pickle.dump(vocabulary.vocab_reduced, f) vocnow = vocabulary.get_vocab_reduced_dict() vectorizer = fe.TfidfVectorizer(input='content', lowercase=False, preprocessor=preprocessor, token_pattern="", tokenizer=tokenizer, stop_words=None, vocabulary=vocabulary.get_vocab_reduced_dict(), norm='l1') print("Created tf-idf vectorizer\nFitting to the training data", flush=True) tfidf_train = vectorizer.fit_transform( [texts[idnow] for idnow in df_train[idcolname].values]) print("Created tf-idf training vectors\nCreating test vectors", flush=True) tfidf_test = vectorizer.transform( [texts[idnow] for idnow in df_test[idcolname].values], copy=True) print("Created tf-idf test vectors", flush=True)
def Get_Data(self, batch_idx, mscoco, split, caption_path):
    '''
    Show an example of how to read the dataset
    '''
    V_input = np.zeros((self.batch_size, 3, 64, 64))
    V_target = np.zeros((self.batch_size, 3, 32, 32))
    V_target_64by64 = np.zeros((self.batch_size, 3, 64, 64))
    V_Y_Carre = np.zeros((24, self.batch_size, 3, 32, 32))
    V_X_Carre = np.zeros((24, self.batch_size, 3, 32, 32))
    V_caption_dict = []

    # Theano helper that reorders an image from (H, W, C) to (C, H, W)
    x = T.tensor3()
    f = theano.function([x], outputs=x.dimshuffle(2, 0, 1))

    data_path = os.path.join(mscoco, split)
    caption_path = os.path.join(mscoco, caption_path)
    with open(caption_path, 'rb') as fd:
        caption_dict = pickle.load(fd)

    imgs = glob.glob(data_path + "/*.jpg")
    batch_imgs = imgs[batch_idx * self.batch_size:(batch_idx + 1) * self.batch_size]

    j = 0
    for i, img_path in enumerate(batch_imgs):
        img = Image.open(img_path)
        img_array = np.array(img)
        Input = np.array(img)
        center = (int(np.floor(img_array.shape[0] / 2.)),
                  int(np.floor(img_array.shape[1] / 2.)))
        cap_id = os.path.basename(img_path)[:-4]

        # Get input/target from the images: the true 64x64 image, the 64x64
        # image with the center masked out, and the true 32x32 center.
        if len(img_array.shape) == 3:
            V_target_64by64[j, :, :, :] = f(img_array)
            Input[center[0] - 16:center[0] + 16, center[1] - 16:center[1] + 16, :] = 0
            V_input[j, :, :, :] = f(Input)

            # Eight overlapping 32x32 crops around the center (four quadrant
            # crops, then four crops shifted by 8 pixels), taken from both the
            # true image (Y) and the masked image (X).
            crop_offsets = [(-32, -32), (-32, 0), (0, -32), (0, 0),
                            (-24, -24), (-24, -8), (-8, -24), (-8, -8)]
            for k, (dr, dc) in enumerate(crop_offsets):
                r, c = center[0] + dr, center[1] + dc
                V_Y_Carre[k, j, :, :, :] = f(img_array[r:r + 32, c:c + 32, :])
                V_X_Carre[k, j, :, :, :] = f(Input[r:r + 32, c:c + 32, :])

            # True 32x32 center of the image
            target = img_array[center[0] - 16:center[0] + 16,
                               center[1] - 16:center[1] + 16, :]
            V_target[j, :, :, :] = f(target)
            V_caption_dict.append(caption_dict[cap_id])
            j = j + 1

    # Trim the batch arrays if some (grayscale) images were skipped
    V_Temp_I = V_input
    if V_Temp_I.shape[0] != j:
        V_input = V_input[:j, :, :, :]
        V_target = V_target[:j, :, :, :]
        V_target_64by64 = V_target_64by64[:j, :, :, :]
        V_Y_Carre = V_Y_Carre[:, :j, :, :, :]
        V_X_Carre = V_X_Carre[:, :j, :, :, :]
        V_Temp_I = V_Temp_I[:j, :, :, :]
    Index_Image = j

    # Identify images with similar descriptions.
    # 100 features for each of the four 3x32x32 blocks plus 100 for the masked
    # 3x64x64 block.
    Feature_Kernel = np.zeros((Index_Image, 5 * self.size_Feature))

    # Merge all the descriptions of each image into one document to form a corpus
    Corpus_Train = np.array([''.join(Doc) for Doc in V_caption_dict])

    # Count and normalize the corpus.
    # Note: the original called TfidfVectorizer with ngram_range=(2, 1), which
    # is invalid (the lower bound must not exceed the upper bound); (1, 2) is
    # assumed here.
    # Vectorizer = text.TfidfVectorizer(min_df=1, ngram_range=(1, 1), stop_words='english', strip_accents='unicode', norm='l2')
    # Vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words=stop_words)
    Vectorizer = text.TfidfVectorizer(min_df=1, ngram_range=(1, 2), stop_words=stop_words)
    # Vectorizer = text.HashingVectorizer(stop_words=stop_words, non_negative=True, n_features=2 ** 8)
    Count_Train_Vector = Vectorizer.fit_transform(Corpus_Train)

    # Project to 5 dimensions (LSA); random_state=2000 is a fixed seed
    LS_Model = TruncatedSVD(n_components=5, random_state=2000)
    Vect_Proj = LS_Model.fit_transform(Count_Train_Vector, y=None)
    Temp1 = pd.DataFrame(Vect_Proj, columns=['dim1', 'dim2', 'dim3', 'dim4', 'dim5'])
    Temp2 = pd.DataFrame(Vect_Proj[:, 0], columns=['Cos_Sim'])
    Temp1.index = range(0, Index_Image)
    Temp2.index = range(0, Index_Image)

    # Feature creation
    for ind in range(0, Index_Image):
        Temp = Temp1.iloc[ind:ind + 1]
        temp = pd.DataFrame(cosine_similarity(Temp.values, Temp1.values))
        Temp2['Cos_Sim'] = temp.transpose()
        Temp2 = Temp2.sort_values(by=['Cos_Sim'])

        # V_X_Carre has shape 24 x Index_Image x 3 x 32 x 32: slots 0-7 hold
        # this image's own crops; slots 8-15 and 16-23 receive the crops of the
        # two images with the most similar captions.
        V_X_Carre[8:15, ind, :, :, :] = V_X_Carre[0:7, Temp2.index[-2], :, :, :]
        V_Y_Carre[8:15, ind, :, :, :] = V_Y_Carre[0:7, Temp2.index[-2], :, :, :]
        V_X_Carre[16:23, ind, :, :, :] = V_X_Carre[0:7, Temp2.index[-3], :, :, :]
        V_Y_Carre[16:23, ind, :, :, :] = V_Y_Carre[0:7, Temp2.index[-3], :, :, :]

        # A few features for the 64x64 image with the zeroed center
        Nbre = 0
        params = {'bandwidth': np.logspace(-1, 1, 20)}
        grid = GridSearchCV(KernelDensity(), params)
        temp0 = np.reshape(V_Temp_I[ind, :, :, :], (64, 192))  # 3*64*64
        temp0 = temp0 / (255 / 2)
        temp0 = temp0 - 1
        n_components = 5
        pca = PCA(n_components=n_components, svd_solver='full')
        data = pca.fit_transform(temp0)
        grid.fit(data)
        # Use the best estimator to compute the kernel density estimate
        kde = grid.best_estimator_
        new_data = kde.sample(self.size_Feature // n_components, random_state=2000)
        # Rescale the samples to [0, 255].
        # Note: the original divided by (min - max), which flips the sign;
        # (max - min) is assumed here.
        temp00 = new_data.min()
        temp01 = new_data.max()
        new_data = (new_data - temp00) / (temp01 - temp00)
        new_data = new_data * 255
        new_data = np.uint8(new_data)
        new_data = new_data.astype('float32')
        Feature_Kernel[ind, Nbre:Nbre + self.size_Feature] = new_data.reshape(1, -1)
        Nbre = Nbre + self.size_Feature

        # Project each 3x32x32 quadrant to a lower dimension and use grid-search
        # cross-validation to optimize the KDE bandwidth.
        # Note: the original iterated `for K in (0, 4)`, i.e. only K=0 and K=4;
        # range(0, 4) (the four quadrants) is assumed here.
        for K in range(0, 4):
            params = {'bandwidth': np.logspace(-1, 1, 20)}
            grid = GridSearchCV(KernelDensity(), params)
            # One quarter of the 3x32x32 crops of the most similar image
            temp0 = np.reshape(V_X_Carre[K, Temp2.index[-1], :, :, :], (32, 96))
            temp0 = temp0 / (255 / 2)
            temp0 = temp0 - 1
            pca = PCA(n_components=5, svd_solver='full')
            data = pca.fit_transform(temp0)
            grid.fit(data)
            # Use the best estimator to compute the kernel density estimate,
            # then sample new points from it (seed 2000 simulates the distribution)
            kde = grid.best_estimator_
            new_data = kde.sample(self.size_Feature // 5, random_state=2000)
            temp00 = new_data.min()
            temp01 = new_data.max()
            new_data = (new_data - temp00) / (temp01 - temp00)
            new_data = new_data * 255
            new_data = np.uint8(new_data)
            Feature_Kernel[ind, Nbre:Nbre + self.size_Feature] = new_data.reshape(1, -1)
            Nbre = Nbre + self.size_Feature

    return V_input, V_target, V_target_64by64, V_caption_dict, Index_Image, V_X_Carre, V_Y_Carre, Feature_Kernel
def makePredictions(train, test_melt, Windows, look_back=49):
    r = 1.61803398875
    # Windows = np.round(r**np.arange(1,9) * 7)
    # Windows = [11, 18, 30, 48, 78, 126, 203, 329]
    # Windows = [7, 13, 20, 33, 53, 86, 139, 225]

    n = train.shape[1] - 1  # 550
    Visits = np.zeros(train.shape[0])
    for i, row in train.iterrows():
        M = []
        start = row[1:].nonzero()[0]
        if len(start) == 0:
            continue
        if n - start[0] < Windows[0]:
            Visits[i] = row.iloc[start[0] + 1:].median()
            continue
        for W in Windows:
            if W > n - start[0]:
                break
            M.append(row.iloc[-W:].median())
        Visits[i] = np.median(M)
    Visits[np.where(Visits < 1)] = 0.
    train['Predicted'] = Visits
    # print(train.head())

    # test1 = pd.read_csv("../input/key_2.csv")
    # test1['Page'] = test1.Page.apply(lambda x: x[:-11])
    test1 = test_melt.merge(train[['Page', 'Predicted']], on='Page', how='left')
    # print('MODEL 1 SMAPE: ', smape(test1['Visits'], test1['Predicted']))

    # Add model 2: determine the language from the URL
    train['origine'] = train['Page'].apply(
        lambda x: re.split(".wikipedia.org", x)[0][-2:])
    '''
    This is what you get with a value_counts on train.origine
    en    24108
    ja    20431
    de    18547
    fr    17802
    zh    17229
    ru    15022
    es    14069
    ts    13556
    er     4299
    '''
    # We have English, Japanese, German, French, Chinese (Taiwanese?), Russian, Spanish.
    # 'ts' and 'er' are undetermined; in the next lines, I try to replace them by
    # learning from special chars.
    # Note: this step wasn't tuned, and can't be perfect because other languages
    # appear in those Pages (such as Portuguese, for example).

    # Let's make a train, target, and test to predict the language of ts and er pages
    orig_train = train.loc[~train.origine.isin(['ts', 'er']), 'Page']
    orig_target = train.loc[~train.origine.isin(['ts', 'er']), 'origine']
    orig_test = train.loc[train.origine.isin(['ts', 'er']), 'Page']

    # Keep only interesting chars
    orig_train2 = orig_train.apply(lambda x: x.split(".wikipedia")[0][:-3]).apply(
        lambda x: re.sub("[a-zA-Z0-9():\-_ \'\.\/]", "", x))
    orig_test2 = orig_test.apply(lambda x: x.split(".wikipedia")[0][:-3]).apply(
        lambda x: re.sub("[a-zA-Z0-9():\-_ \'\.\/]", "", x))

    # Run TFIDF on those specific chars
    tser_model = True
    try:
        tfidf = text.TfidfVectorizer(
            input='content', encoding='utf-8', decode_error='strict',
            strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None,
            analyzer='char',
            # stop_words=[chr(x) for x in range(97, 123)] + [chr(x) for x in range(65, 91)] + ['_', '.', ':'],
            token_pattern='(?u)\\b\\w\\w+\\b', ngram_range=(1, 1),
            max_df=1.0, min_df=1, max_features=None, vocabulary=None,
            binary=True, norm='l2', use_idf=True, smooth_idf=True,
            sublinear_tf=False)
        orig_train2 = tfidf.fit_transform(orig_train2)

        # Apply a simple naive bayes on the text features
        model = naive_bayes.BernoulliNB()
        model.fit(orig_train2, orig_target)
        result = model.predict(tfidf.transform(orig_test2))
        result = pd.DataFrame(result, index=orig_test)
        result.columns = ['origine']
    except:
        tser_model = False

    # result will be used later to replace the 'ts' and 'er' values.
    # We need to remove train.origine so that the train can be flattened with melt.
    del train['origine']
    del train['Predicted']

    # Let's flatten the train as clustifier did and initialize a "ferie" (holiday)
    # column instead of a weekend column.
    # look_back values 50, 51, 54, and 60 were also tried
    train = pd.melt(train[list(train.columns[-look_back:]) + ['Page']],
                    id_vars='Page', var_name='date', value_name='Visits')
    train['date'] = train['date'].astype('datetime64[ns]')
    train['ferie'] = ((train.date.dt.dayofweek) >= 5).astype(float)
    train['origine'] = train['Page'].apply(
        lambda x: re.split(".wikipedia.org", x)[0][-2:])

    # Let's join with result to replace 'ts' and 'er'
    if tser_model:
        join = train.loc[train.origine.isin(["ts", "er"]), ['Page']]
        join['origine'] = 0  # init
        join.index = join["Page"]
        join.origine = result
        train.loc[train.origine.isin(["ts", "er"]), ['origine']] = join.origine.values  # replace

    # Official non-working days by country (manual search with Google).
    # I made a lot of shortcuts, considering that only the US and UK used English,
    # only Spain Spanish, only France French, etc.
    train_us = ['2015-07-04', '2015-11-26', '2015-12-25'] + \
        ['2016-07-04', '2016-11-24', '2016-12-26']
    test_us = []
    train_uk = ['2015-12-25', '2015-12-28'] + \
        ['2016-01-01', '2016-03-28', '2016-05-02', '2016-05-30', '2016-12-26',
         '2016-12-27']
    test_uk = ['2017-01-01']
    train_de = ['2015-10-03', '2015-12-25', '2015-12-26'] + \
        ['2016-01-01', '2016-03-25', '2016-03-26', '2016-03-27', '2016-01-01',
         '2016-05-05', '2016-05-15', '2016-05-16', '2016-10-03', '2016-12-25',
         '2016-12-26']
    test_de = ['2017-01-01']
    train_fr = ['2015-07-14', '2015-08-15', '2015-11-01', '2015-11-11', '2015-12-25'] + \
        ['2016-01-01', '2016-03-28', '2016-05-01', '2016-05-05', '2016-05-08',
         '2016-05-16', '2016-07-14', '2016-08-15', '2016-11-01', '2016-11-11',
         '2016-12-25']
    test_fr = ['2017-01-01']
    train_ru = ['2015-11-04'] + \
        ['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04', '2016-01-05',
         '2016-01-06', '2016-01-07', '2016-02-23', '2016-03-08', '2016-05-01',
         '2016-05-09', '2016-06-12', '2016-11-04']
    test_ru = ['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04',
               '2017-01-05', '2017-01-06', '2017-01-07', '2017-02-23']
    train_es = ['2015-08-15', '2015-10-12', '2015-11-01', '2015-12-06',
                '2015-12-08', '2015-12-25'] + \
        ['2016-01-01', '2016-01-06', '2016-03-25', '2016-05-01', '2016-08-15',
         '2016-10-12', '2016-11-01', '2016-12-06', '2016-12-08', '2016-12-25']
    test_es = ['2017-01-01', '2017-01-06']
    train_ja = ['2015-07-20', '2015-09-21', '2015-10-12', '2015-11-03',
                '2015-11-23', '2015-12-23'] + \
        ['2016-01-01', '2016-01-11', '2016-02-11', '2016-03-20', '2016-04-29',
         '2016-05-03', '2016-05-04', '2016-05-05', '2016-07-18', '2016-08-11',
         '2016-09-22', '2016-10-10', '2016-11-03', '2016-11-23', '2016-12-23']
    test_ja = ['2017-01-01', '2017-01-09', '2017-02-11']
    train_zh = ['2015-09-27', '2015-10-01', '2015-10-02', '2015-10-03',
                '2015-10-04', '2015-10-05', '2015-10-06', '2015-10-07'] + \
        ['2016-01-01', '2016-01-02', '2016-01-03', '2016-02-08', '2016-02-09',
         '2016-02-10', '2016-02-11', '2016-02-12', '2016-04-04', '2016-05-01',
         '2016-05-02', '2016-06-09', '2016-06-10', '2016-09-15', '2016-09-16',
         '2016-10-03', '2016-10-04', '2016-10-05', '2016-10-06', '2016-10-07']
    test_zh = ['2017-01-02', '2017-02-27', '2017-02-28', '2017-03-01']
    # In China, some Saturdays and Sundays are worked
    train_o_zh = ['2015-10-10', '2016-02-06', '2016-02-14', '2016-06-12',
                  '2016-09-18', '2016-10-08', '2016-10-09']
    test_o_zh = ['2017-01-22', '2017-02-04']

    # Let's replace values in the 'ferie' column
    train.loc[(train.origine == 'en') & (train.date.isin(train_us + train_uk)), 'ferie'] = 1
    train.loc[(train.origine == 'de') & (train.date.isin(train_de)), 'ferie'] = 1
    train.loc[(train.origine == 'fr') & (train.date.isin(train_fr)), 'ferie'] = 1
    train.loc[(train.origine == 'ru') & (train.date.isin(train_ru)), 'ferie'] = 1
    train.loc[(train.origine == 'es') & (train.date.isin(train_es)), 'ferie'] = 1
    train.loc[(train.origine == 'ja') & (train.date.isin(train_ja)), 'ferie'] = 1
    train.loc[(train.origine == 'zh') & (train.date.isin(train_zh)), 'ferie'] = 1
    train.loc[(train.origine == 'zh') & (train.date.isin(train_o_zh)), 'ferie'] = 0

    # Same with test
    # test = pd.read_csv("../input/key_2.csv")
    test = test_melt
    del test['Visits']
    # test['date'] = test.Page.apply(lambda a: a[-10:])
    # test['Page'] = test.Page.apply(lambda a: a[:-11])
    test['date'] = test['date'].astype('datetime64[ns]')
    test['ferie'] = ((test.date.dt.dayofweek) >= 5).astype(float)
    test['origine'] = test['Page'].apply(
        lambda x: re.split(".wikipedia.org", x)[0][-2:])

    # Join with result
    if tser_model:
        join = test.loc[test.origine.isin(["ts", "er"]), ['Page']]
        join['origine'] = 0
        join.index = join["Page"]
        join.origine = result
        test.loc[test.origine.isin(["ts", "er"]), ['origine']] = join.origine.values

    test.loc[(test.origine == 'en') & (test.date.isin(test_us + test_uk)), 'ferie'] = 1
    test.loc[(test.origine == 'de') & (test.date.isin(test_de)), 'ferie'] = 1
    test.loc[(test.origine == 'fr') & (test.date.isin(test_fr)), 'ferie'] = 1
    test.loc[(test.origine == 'ru') & (test.date.isin(test_ru)), 'ferie'] = 1
    test.loc[(test.origine == 'es') & (test.date.isin(test_es)), 'ferie'] = 1
    test.loc[(test.origine == 'ja') & (test.date.isin(test_ja)), 'ferie'] = 1
    test.loc[(test.origine == 'zh') & (test.date.isin(test_zh)), 'ferie'] = 1
    test.loc[(test.origine == 'zh') & (test.date.isin(test_o_zh)), 'ferie'] = 0

    train_page_per_dow = train.groupby(['Page', 'ferie']).median().reset_index()
    test = test.merge(train_page_per_dow, on=['Page', 'ferie'], how='left')
    test['Pred2'] = test['Visits']
    test.loc[test.Pred2.isnull(), 'Pred2'] = 0
    test['PredC'] = ((test['Pred2'] * 10).astype('int') / 10 + test1['Predicted']) / 2
    test['Visits'] = test1['Visits']
    test['Pred1'] = test1['Predicted']
    # print("MODEL 2 SMAPE: ", smape(test['Visits'], test['Pred2']))
    combinedSmape = smape(test['Visits'], test['PredC'])
    print("Combined SMAPE: ", combinedSmape)
    print("look_back:", look_back)
    print("------------------------------------")
    # test[['Id','Visits']].to_csv('sub.csv', index=False)
    return combinedSmape
    feats += [(strict_words[i], strict_words[i + 1])
              for i in xrange(len(strict_words) - 1)]
    ## synonyms of strict words
    # for sw in strict_words:
    #     if sw in POS_KEY_WORDS:
    #         feats += synonyms(w)
    for w in words:
        try:
            pos = corpus.wordnet.synsets(w)[0].pos
            if pos in ('n', 'a', 'v'):
                feats += [(w, pos)]
        except:
            pass
    return feats

## building features on training data
# Note: `charset` is the pre-0.14 sklearn name of this parameter; newer
# releases call it `encoding`.
tfidf_vectorizer = text.TfidfVectorizer(charset='latin-1', lowercase=False,
                                        sublinear_tf=True,
                                        tokenizer=my_tokenizer,
                                        # vocabulary=CHAT_WORDS,
                                        max_df=1.0)  # , norm='l1')

print 'extracting tfidf from training set...'
t0 = time()
train_X = tfidf_vectorizer.fit_transform(train_X)
print 'done in %0.2fs' % (time() - t0)
print 'shape of training data', train_X.shape

# <codecell>

## add extra features to tfidf
'@HasPositive' in tfidf_vectorizer.get_feature_names()
# print filter(lambda f: f[0]=='you', tfidf_vectorizer.get_feature_names())

# <codecell>
def select_vectorizer(vectorizer_type, req_ngram_range=[1, 2]):
    """
    Select the desired vectorizer for either text or tweet

    @ text_tfidf_std
    @ text_tfidf_custom
    @ text_count_std
    @ tweet_tfidf_std
    @ tweet_tfidf_custom
    """
    # SPECIFY VECTORIZER ALGORITHM
    # ---------------------------------#
    ngram_lengths = req_ngram_range

    if vectorizer_type == "text_tfidf_std":
        # Standard TFIDF Vectorizer (Text)
        vectorizer = text.TfidfVectorizer(input='filename', analyzer='word',
                                          ngram_range=(ngram_lengths),
                                          stop_words='english', min_df=2)
        return vectorizer
    elif vectorizer_type == "text_tfidf_custom":
        # TFIDF Vectorizer with NLTK Tokenizer (Text)
        vectorizer = text.TfidfVectorizer(input='filename', analyzer='word',
                                          ngram_range=(ngram_lengths),
                                          stop_words='english', min_df=2,
                                          tokenizer=tokenize_nltk)
        print("User specified custom stopwords: {} ...".format(
            str(custom_stopwords)[1:-1]))
        return vectorizer
    elif vectorizer_type == "text_count_std":
        vectorizer = text.CountVectorizer(input='filename', analyzer='word',
                                          ngram_range=(ngram_lengths),
                                          stop_words='english', min_df=2)
        return vectorizer
    elif vectorizer_type == "tweet_tfidf_std":
        # Standard TFIDF Vectorizer (Content)
        vectorizer = text.TfidfVectorizer(input='content', analyzer='word',
                                          ngram_range=(ngram_lengths),
                                          stop_words='english', min_df=2)
        return vectorizer
    elif vectorizer_type == "tweet_tfidf_custom":
        # TFIDF Vectorizer with NLTK Tokenizer (Content)
        vectorizer = text.TfidfVectorizer(input='content', analyzer='word',
                                          ngram_range=(ngram_lengths),
                                          stop_words='english', min_df=2,
                                          tokenizer=tokenize_nltk)
        print("User specified custom stopwords: {} ...".format(
            str(custom_stopwords)[1:-1]))
        return vectorizer
    else:
        print("error in vectorizer specification...")
        pass
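# A hedged usage sketch for select_vectorizer; the ngram range is the
# function's own default, and the tweet strings are invented placeholders.
# vec = select_vectorizer("tweet_tfidf_std", req_ngram_range=(1, 2))
# X = vec.fit_transform(["first tweet text", "second tweet text"])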
import numpy as np
import pandas as pd
import shutil
import os
import csv
import sklearn.feature_extraction.text as sk_text
from sklearn.model_selection import train_test_split
from sklearn import metrics

path = './data/'
filename_write = os.path.join(path, "class_vectorized_review.csv")
data = pd.read_csv("./data/class_example.csv", encoding="utf-8")

vectorizer = sk_text.TfidfVectorizer(stop_words='english', max_features=1000,
                                     min_df=1)
matrix = vectorizer.fit_transform(data['all_reviews'])  # turns all reviews into a tf-idf vector
tfidf_data = matrix.toarray().tolist()  # vector converted into a list for the data frame

data.drop('all_reviews', axis=1, inplace=True)
# data.drop('Unnamed: 0', axis=1, inplace=True)
# data.drop('Unnamed: 0.1', axis=1, inplace=True)
data.insert(2, 'all_reviews', tfidf_data)  # inserts the tfidf vector into data

text_data = pd.DataFrame(data['all_reviews'].values.tolist(),
                         columns=vectorizer.get_feature_names())
data.drop('all_reviews', axis=1, inplace=True)
data = pd.concat([data, text_data], axis=1)
data.to_csv(filename_write, index=False)
def result():
    # get select list
    sid = request.form['sid']
    sid = sid.split()
    sid = [int(i) for i in sid]
    a = session.get('abstract', None)
    t = session.get('title', None)

    # get the selected abstracts
    sa = []
    for index, item in enumerate(sid):
        sa.append(a[item - 1])

    # remove the selected items from the abstract and title lists
    sid.sort(reverse=True)
    for index, item in enumerate(sid):
        a.pop(item - 1)
        t.pop(item - 1)

    # make the non-selected articles a data frame for calculation
    ta_df = pd.DataFrame({'Title': t, 'Abstract': a})

    # combine the two lists into one
    s = " "
    sa = s.join(sa)  # make all selected abstracts into one
    ca = a
    ca.append(sa)  # append the selected abstracts

    # Stopwords
    my_additional_stop_words = ["author", "and", "of", "the", "research", "\n"]
    stop_words = text.ENGLISH_STOP_WORDS.union(my_additional_stop_words)

    # STEM
    ca = [[stem(word) for word in sentence.split(" ")] for sentence in ca]
    y = len(ca)
    for i in range(0, y):
        ca[i] = " ".join(ca[i])

    # Build the tf-idf matrix.
    # Note: the original passed input=ca to the constructor; the corpus belongs
    # in fit_transform (input only selects 'content'/'filename'/'file').
    tfidf = text.TfidfVectorizer(stop_words=stop_words, analyzer='word',
                                 lowercase=True)
    matrix = tfidf.fit_transform(ca)

    # calculate similarity scores
    sim_unigram = cosine_similarity(matrix)

    # function to rank all articles by their similarity score
    def get_similar_papers(x):
        return ta_df.loc[np.argsort(-x)]

    recomPaper = get_similar_papers(sim_unigram[-1])
    sim_unigram[-1][::-1].sort()  # descending
    # sim_unigram[-1].sort()  # ascending
    recomPaper["Similar_score"] = sim_unigram[-1]

    # get the 50 papers with the highest similarity
    recomPaper50 = recomPaper[:51]
    recomPaper50 = recomPaper50.dropna().reset_index(drop=True)
    tl = recomPaper50['Title'].tolist()
    al = recomPaper50['Abstract'].tolist()
    sl = recomPaper50['Similar_score'].tolist()
    listc = [[x, y, z] for x, y, z in zip(tl, al, sl)]

    # output file
    # recomPaper50.to_csv("recom50papers.csv")
    return render_template('result.html', sid=sid, a=a, t=t,
                           result=recomPaper50, lc=listc)
talks = df.text.tolist()

# We are not going to need the identifiers for this run, so I'm leaving them commented out.
# =-=-=-=-=-=-=-=-=-=-=
# Create citations to identify individual texts
# =-=-=-=-=-=-=-=-=-=-=
# authors = df.author.tolist()
# dates = df.date.tolist()
# years = [re.sub('[A-Za-z ]', '', item) for item in dates]
# authordate = [author+" "+year for author, year in zip(authors, years)]

import sklearn.feature_extraction.text as sktext
from sklearn.decomposition import NMF
import numpy as np

# Import stoplist
stopwords = re.split('\s+', open('../data/stopwords_2.txt', 'r').read().lower())

# TFIDF parameters
max_percent = 0.85
min_percent = 0.01  # One percent = 20 talks (so not enough to warrant a topic?)

# Create TFIDF matrix
vectorizer = sktext.TfidfVectorizer(lowercase=True, stop_words=stopwords,
                                    max_df=max_percent, min_df=min_percent)
td_matrix = vectorizer.fit_transform(talks)
print(td_matrix.shape)
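# A hedged sketch (not from the original) of the NMF step this setup points
# toward; n_topics and the number of top words shown are placeholders.
# n_topics = 20
# nmf = NMF(n_components=n_topics, random_state=1).fit(td_matrix)
# terms = vectorizer.get_feature_names()
# for topic_idx, topic in enumerate(nmf.components_):
#     top = [terms[i] for i in topic.argsort()[:-11:-1]]
#     print("Topic %d: %s" % (topic_idx, ", ".join(top)))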
with open(fil, 'r') as csv_file:
    csv_reader = csv.reader(csv_file)
    for line in csv_reader:
        temp_str = ''.join(line)
        temp_doc.append(temp_str)
    doc_string = ''.join(temp_doc)
    corpus.append(doc_string)
    temp_doc.clear()

print(len(corpus))

"""tf-idf representation of the businesses"""
vectorizer = sk_text.TfidfVectorizer(stop_words='english', min_df=10)
X = vectorizer.fit_transform(corpus)
print(X.toarray())
print(vectorizer.get_feature_names())

"""# 1. k-means """
kmeans = sk_cluster.KMeans(init='k-means++', n_clusters=3, n_init=10)
kmeans.fit_predict(X)
centroids = kmeans.cluster_centers_
kmeans_labels = kmeans.labels_
error = kmeans.inertia_
print("The total error of the clustering is: ", error)
print('\nCluster labels')
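# A hedged sketch (not from the original) showing one way to inspect the
# clusters: print the highest-weight TF-IDF terms of each centroid.
# terms = vectorizer.get_feature_names()
# for c, centroid in enumerate(centroids):
#     top = centroid.argsort()[::-1][:10]
#     print("cluster %d:" % c, ", ".join(terms[i] for i in top))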
import sklearn.feature_extraction.text as text

# Note: the corpus is passed to fit/transform, not to the constructor (the
# first positional argument of these vectorizers is `input`, not the data).
cv = text.CountVectorizer()
cv.fit(doc)
cv.get_feature_names()
cv.transform(doc)
cv.transform(doc).toarray()
pd.DataFrame(data=cv.transform(doc).toarray(), columns=cv.get_feature_names())

tfid = text.TfidfVectorizer()
tfid.fit(doc)
tfid.transform(doc).toarray()
pd.DataFrame(data=tfid.transform(doc).toarray(), columns=tfid.get_feature_names())

data.columns
y = data['Rank']
x = data['Raw_joke']

tfid = text.TfidfVectorizer()
tfid.fit(x.tolist())
dataset = datasets.fetch_20newsgroups(shuffle=True, random_state=1,
                                      remove=('headers', 'footers', 'quotes'))
data_samples = dataset.data[:n_samples]
print("done in %0.3fs." % (time() - t0))

preprocessor = tokenhandler._ENSimpleTokenHandler(stem=apply_stemming,
                                                  stopword=remove_stopwords)

tf_vectorizer = txtfeatext.CountVectorizer(
    tokenizer=preprocessor, ngram_range=ngram_range, max_features=n_features
)  # @TODO encoding?? (default utf8 but may depend on the user per application needs)
tf_matrix = tf_vectorizer.fit_transform(data_samples)

tfidf_vectorizer = txtfeatext.TfidfVectorizer(tokenizer=preprocessor,
                                              ngram_range=ngram_range,
                                              max_features=n_features)
tfidf_matrix = tfidf_vectorizer.fit_transform(data_samples)

# apply NMF
print("Applying NMF on tf*idf weighted terms, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
nmf = decomposer.NMF(n_components=n_topics, random_state=1, alpha=.1,
                     l1_ratio=.5).fit(tfidf_matrix)

print("Applying LDA on tf weighted terms, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
# The original snippet is truncated mid-call here; the remaining constructor
# arguments and the fit on the tf counts are assumed to mirror the NMF step.
lda = decomposer.LatentDirichletAllocation(n_topics=n_topics,
                                           random_state=1).fit(tf_matrix)
@static_var('test_num', 1)
def print_ans(*args, sep=' '):
    with open('{}.txt'.format(print_ans.test_num), 'w') as fout:
        fout.write(sep.join(list(map(str, args))))
    print_ans.test_num += 1

newsgroups = datasets.fetch_20newsgroups(
    subset='all',
    categories=['alt.atheism', 'sci.space']
)

pipe = pipeline.Pipeline([
    ('tfidf', sktext.TfidfVectorizer()),
    ('svc', svm.SVC(kernel='linear', random_state=241)),
])
parameters = {'svc__C': [math.pow(10, i) for i in range(-5, 6)]}

kfold = cross_validation.KFold(len(newsgroups.data), n_folds=5,
                               shuffle=True, random_state=241)
clf = grid_search.GridSearchCV(pipe, parameters, scoring='roc_auc',
                               cv=kfold, n_jobs=8)
clf.fit(newsgroups.data, newsgroups.target)
print(clf.best_params_)

estimator = clf.best_estimator_
estimator.fit(newsgroups.data, newsgroups.target)
words = estimator.named_steps['tfidf'].get_feature_names()
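# A hedged sketch (not from the original) of a typical next step: rank the
# TF-IDF features by the absolute weight the linear SVC assigned them and
# record the top ten with print_ans.
# import numpy as np
# coefs = np.abs(estimator.named_steps['svc'].coef_.toarray()[0])
# top10 = sorted(words[i] for i in coefs.argsort()[-10:])
# print_ans(*top10, sep=',')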
        ycolor.append(1)
        # for p in paras:
        #     bad_results.append(q2v.filter_sentence(q['document'], is_stop_words=True))
        #     questions.append(p)
    elif q['success_outcome'] == 0:
        # paras = q['document'].split('\n')
        bad_results.append(
            q2v.filter_sentence(q['document'], is_stop_words=True))
        ycolor.append(0)
        # for p in paras:
        #     bad_results.append(q2v.filter_sentence(p, is_stop_words=True))
        #     questions.append(p)

# print(filtered_questions[:20])
cv = ft.TfidfVectorizer()
# tf_good_mat = cv.fit_transform(good_results).toarray()
tf_bad_mat = cv.fit_transform(bad_results).toarray()
words = cv.get_feature_names()
print("len(words):", len(words))
# for t in tfmat[:30]:
#     print(list(t))

""" dimension reduction """
pca = PCA(n_components=2)
pca.fit(tf_bad_mat)
X = pca.transform(tf_bad_mat)

""" Kmeans """
y_pred = KMeans(n_clusters=10, random_state=9).fit_predict(X)

""" show points with colors and labels """
my = My_show(X, ycolor, ycolor, "Paragraph tf-idf features")
from sklearn.feature_extraction import text
import os

# %%
dir = os.path.dirname(__file__)
file = os.path.join(dir, "jones_t_mails.csv")
file

# %%
df = pd.read_csv(file)
df

# %%
notblank = df["content"].apply(lambda x: len(str(x)) > 3)
df = df[notblank]

# %%
X = df["content"]

# %%
TfIdfVectorizer = text.TfidfVectorizer()
TfIdfVectorizer.fit(X)

# %%
idfdf = pd.DataFrame({
    "names": TfIdfVectorizer.get_feature_names(),
    "idf": TfIdfVectorizer.idf_
})
idfdf.sort_values(by=["idf"], ascending=False)

# %%
    # str.replace returns a new string, so the result must be assigned back
    parsed_emails = parsed_emails.replace("shackleton ", "")
    parsed_emails = parsed_emails.replace("chris ", "")
    parsed_emails = parsed_emails.replace("germani ", "")

    ### append the text to word_data
    word_data.append(parsed_emails)

    ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris
    if name == "sara":
        from_data.append(0)
    if name == "chris":
        from_data.append(1)

    email.close()

print "emails processed"
from_sara.close()
from_chris.close()

pickle.dump(word_data, open("your_word_data.pkl", "w"))
pickle.dump(from_data, open("your_email_authors.pkl", "w"))

# print(word_data[152])

### in Part 4, do TfIdf vectorization here
from sklearn.feature_extraction import text

# Note: the original passed word_data[0] to the constructor, where it is taken
# as the `input` parameter; the corpus belongs in fit/fit_transform.
vec = text.TfidfVectorizer()
vec.fit(word_data)
print(vec)