def reduce_to_k_dim(M, dim):
    svd = TruncatedSVD(n_components=dim, n_iter=30)
    reduced_model = svd.fit_transform(M)
    return reduced_model
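# A minimal usage sketch for reduce_to_k_dim (illustrative, not from the
# original source): the random matrix and the 50-dimension target below are
# assumptions used only to show the expected input/output shapes.
import numpy as np
from sklearn.decomposition import TruncatedSVD

M_demo = np.random.rand(100, 500)               # 100 samples, 500 features
M_demo_reduced = reduce_to_k_dim(M_demo, dim=50)
print(M_demo_reduced.shape)                     # (100, 50)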
# Pairwise cosine similarities between search term, product title and
# product description vectors.
dst_srch_ttl1 = np.zeros(srch_vec.shape[0])
for i in range(srch_vec.shape[0]):
    d1 = srch_vec[i, :]
    d2 = pdt_ttl_vec[i, :]
    dst_srch_ttl1[i] = cosine_similarity(d1, d2)

dst_srch_desc1 = np.zeros(srch_vec.shape[0])
for i in range(srch_vec.shape[0]):
    d1 = srch_vec[i, :]
    d2 = pdt_desc_vec[i, :]
    dst_srch_desc1[i] = cosine_similarity(d1, d2)

dst_ttl_desc1 = np.zeros(srch_vec.shape[0])
for i in range(srch_vec.shape[0]):
    d1 = pdt_ttl_vec[i, :]
    d2 = pdt_desc_vec[i, :]
    dst_ttl_desc1[i] = cosine_similarity(d1, d2)

# Reduce each matrix to 30 latent dimensions and wrap the results as DataFrames.
svd = TruncatedSVD(n_components=30, random_state=2016)
srch_vec = svd.fit_transform(srch_vec)
pdt_ttl_vec = svd.fit_transform(pdt_ttl_vec)
pdt_desc_vec = svd.fit_transform(pdt_desc_vec)

srch_vec = pd.DataFrame(
    srch_vec,
    columns=['srch_vec_' + str(i) for i in range(srch_vec.shape[1])])
pdt_ttl_vec = pd.DataFrame(
    pdt_ttl_vec,
    columns=['ttl_vec_' + str(i) for i in range(pdt_ttl_vec.shape[1])])
pdt_desc_vec = pd.DataFrame(
    pdt_desc_vec,
    columns=['desc_vec_' + str(i) for i in range(pdt_desc_vec.shape[1])])

id = list(df_all['id'])
if __name__ == "__main__":
    print 'loading x_tr...'
    t0 = time.time()
    x_tr = load_csr_matrix_from_npz('../data/processed/tf_idf_transformation/train/matrix.npz')
    print 'loading finished, time = {0}'.format(time.time() - t0)

    print 'loading y_tr...'
    t0 = time.time()
    y_tr = numpy.loadtxt('../data/processed/tf_idf_transformation/train/labels.csv', dtype='int')
    print 'loading finished, time = {0}'.format(time.time() - t0)

    print 'running TruncatedSVD...'
    t0 = time.time()
    from sklearn.decomposition import TruncatedSVD
    svd = TruncatedSVD(n_components=100)
    x_tr_new = svd.fit_transform(x_tr, y_tr)
    print 'running TruncatedSVD finished, x_new.shape = {0}, time = {1}'.format(x_tr_new.shape, time.time() - t0)

    # delete x_tr
    del x_tr

    print 'fitting model...'
    t0 = time.time()
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.linear_model import LogisticRegression
    clf = OneVsRestClassifier(LogisticRegression())
    clf.fit(x_tr_new, y_tr)
    print 'fitting finished, time = {0}'.format(time.time() - t0)
    # delete x_tr_new, y_tr
def svd(*args, **kwargs):
    return TruncatedSVD(*args, **kwargs)
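# A minimal usage sketch for the svd() factory above, assuming it is simply a
# thin alias that forwards its arguments to sklearn's TruncatedSVD; the toy
# matrix and parameter values are illustrative only.
import numpy as np
from sklearn.decomposition import TruncatedSVD

X_demo = np.random.rand(20, 8)
reducer = svd(n_components=3, random_state=0)    # kwargs go straight to TruncatedSVD
X_demo_reduced = reducer.fit_transform(X_demo)
print(X_demo_reduced.shape)                      # (20, 3)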
le = preprocessing.LabelEncoder()
le.fit(df["Category"])
Y_train = le.transform(df["Category"])

X_train1 = df['Content']
X_train2 = []
for i in range(len(X_train1)):
    # Repeat the title 10 times so it is weighted more heavily than the body.
    X_train2.append(10 * df['Title'][i] + df['Content'][i])
X_train = np.array(X_train2)

# read test file
df_test = pd.read_csv("test_set.csv", sep="\t")

vectorizer = CountVectorizer(stop_words='english')
transformer = TfidfTransformer()
svd = TruncatedSVD(n_components=200, random_state=42)
pipeline_test = Pipeline([
    ('vect', vectorizer),
    ('tfidf', transformer),
    ('svd', svd),
])

# My method --- Voting Classifier
clf1 = BernoulliNB(fit_prior=False)
clf2 = KNeighborsClassifier(weights='distance', n_jobs=-1)
clf3 = RandomForestClassifier(n_estimators=500, n_jobs=-1)
clf = VotingClassifier(estimators=[('bnb', clf1), ('knn', clf2), ('rf', clf3)],
                       voting='hard')
pipeline = Pipeline([
    ('vect', vectorizer),
    ('tfidf', transformer),
    ('svd', svd),
    ('clf', clf)
def do_lsa(X, target_dim):
    svd = TruncatedSVD(target_dim, random_state=42)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    return lsa.fit_transform(X)
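# A minimal usage sketch for do_lsa, assuming X is a TF-IDF matrix; the toy
# corpus, vectorizer, and target_dim=2 below are assumptions for illustration.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline

docs = ["the cat sat on the mat", "dogs and cats", "the dog barked", "mats for cats"]
X_tfidf = TfidfVectorizer().fit_transform(docs)
X_lsa = do_lsa(X_tfidf, target_dim=2)    # rows become unit-length 2-D LSA vectors
print(X_lsa.shape)                       # (4, 2)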
def post(self):
    # Get the THEME labels
    abs_filename = ett_h.generate_dynamic_path(
        [base_folder_location, LabelType.THEME.value, label_file_name])
    labels = (ett_h.load_data_common_separated(abs_filename, ','))
    # Get the label data from input_data
    raw_label = TrainThemeUpload.input_data[ColumnName.LABEL.value]
    data = ett_t.transform_data_to_dataframe_basic(
        TrainThemeUpload.input_data, colnames)
    # Get the OneHotEncoded labels
    label_df = ett_t.one_hot_encoding(raw_label)  # 17 labels dataframe
    # Rename the OneHotEncoded labels
    label_df.columns = labels
    # Get the number of labels
    num_of_labels = len(labels)

    # Data preprocessing
    nan_cleaned_data = ett_c.clean_dataframe_by_regex(
        data, RegexFilter.NON_ALPHA_NUMERIC.value)  # Remove all non-alphanumeric characters
    d_cleaned_data = ett_c.clean_dataframe_by_regex(
        nan_cleaned_data, RegexFilter.DIGITS_ONLY.value)  # Remove all digits
    l_cleaned_data = ett_c.remove_non_iso_words(
        d_cleaned_data, Language.ENGLISH.value)  # Remove non-English text
    rew_cleaned_data = ett_c.remove_language_stopwords(
        l_cleaned_data, Language.ENGLISH.name)  # Remove English stop words
    l_transformed_data = ett_t.lowercase(rew_cleaned_data)  # Transform text to lowercase
    le_transformed_data = ett_t.stemming_mp(
        l_transformed_data)  # Transform text to core words, i.e. playing > play
    data = le_transformed_data  # The newly transformed data

    # Split the data into 0.8 training datasets and 0.2 testing datasets
    X_train, X_test, y_train, y_test = train_test_split(
        data, label_df, test_size=0.2, random_state=42)

    endpoint_output = {}
    for i in range(num_of_labels):
        model_id = str(i)
        single_label = y_train.iloc[:, i]
        label = labels[i]
        print("label", label)
        pipeline = imbPipeline([
            (ModelType.TFIDF.value, TfidfVectorizer()),            # Data vectorization
            (ModelType.OVERSAMPLE.value, SMOTE(random_state=42)),  # Data balancing
            (ModelType.SVD.value, TruncatedSVD()),                 # Feature selection
            (ModelType.NOR.value, preprocessing.MinMaxScaler()),   # Data normalization
            (ModelType.CLF.value, OneVsRestClassifier(SVC())),     # Classification
        ])
        # list_c = [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1]
        list_c = [1]
        # list_n = [100, 150, 200, 250, 300, 350, 400, 450, 500, 550]
        list_n = [100]  # Remember to add[2,\]2]
        best_score = 0
        epsilon = .005
        dictionary = {}
        for para_c in list_c:
            for para_n in list_n:
                parameters = {
                    ModelType.TFIDF.value: [
                        TfidfVectorizer(max_features=800,
                                        ngram_range=(1, 4),
                                        norm='l2',
                                        encoding='latin-1',
                                        stop_words='english',
                                        analyzer='word')
                    ],
                    ModelType.SVD.value: [
                        TruncatedSVD(n_components=para_n, n_iter=7, random_state=42)
                    ],
                    ModelType.CLF.value: [
                        OneVsRestClassifier(
                            SVC(kernel='linear', probability=True, C=para_c))
                    ]
                }
                gs_clf = GridSearchCV(pipeline, parameters, cv=5,
                                      error_score='raise', scoring='f1')
                gs_clf = gs_clf.fit(X_train, single_label)
                current_score = gs_clf.best_score_
                dictionary[current_score] = parameters

        for current_score in dictionary.keys():
            if current_score - epsilon > best_score:
                best_score = current_score
                model_dict = dictionary[best_score]

        label_model_list = {}
        label_model_list['score'] = best_score
        folder_time = time.strftime("_%Y%m%d_%H%M")
        # Create Directory in the AWS S3 Bucket
        os.mkdir("/Users/yihanbao/Desktop/unisdr-training/theme/" + label +
                 "/" + label + folder_time)
        # Navigate to AWS model saving folder
        model_folder = os.path.join(
            os.path.dirname(os.path.dirname(os.path.dirname(
                os.path.dirname(os.path.realpath(__file__))))),
            ett_h.generate_dynamic_path(
                [LabelType.THEME.value, label, label + folder_time]))
        """
        # Connect to AWS
        conn = boto.s3.connect_to_region(" ",
                                         aws_access_key_id = 'AWS-Access-Key',
                                         aws_secret_access_key = 'AWS-Secrete-Key',
                                         calling_format = boto.s3.connection.OrdinaryCallingFormat())
        bucket = conn.get_bucket("oict-psdg-unisdr-train-models-v1")
        # AWS Key
        aws_path = ett_h.generate_dynamic_path([LabelType.THEME.value, label, timestamp+label])
        """
        # Here we fit the training datasets to the models with the best score
        # Vectorization
        vector = model_dict[ModelType.TFIDF.value][0].fit(X_train, single_label)
        ett_h.save_model(
            vector,
            ett_h.generate_dynamic_path(
                [model_folder, label + folder_time + vector_model_name]))
        vectorized_df = vector.transform(X_train)
        label_model_list[URLName.VECURL.value] = ett_h.generate_dynamic_path(
            [model_folder, label + folder_time + vector_model_name])
        """
        key_name = timestamp+label+model_name
        full_key_name = os.path.join(path, key_name)
        pickle_byte_obj = pickle.dump(vector)
        s3_resource = resource('s3')
        s3_resource.Object(bucket,full_key_name).put(Body=pickle_byte_obj)
        """
        # Balancing
        sm = SMOTE(random_state=42)
        X_res, y_res = sm.fit_resample(vectorized_df, single_label)
        # Feature selection
        svd = model_dict[ModelType.SVD.value][0].fit(X_res, y_res)
        ett_h.save_model(
            svd,
            ett_h.generate_dynamic_path(
                [model_folder, label + folder_time + dim_reductor_model_name]))
        dim_reductor_df = svd.transform(X_res)
        label_model_list[URLName.DIMURL.value] = ett_h.generate_dynamic_path(
            [model_folder, label + folder_time + dim_reductor_model_name])
        """
        key_name = timestamp+label+dim_reductor_model_name
        full_key_name = os.path.join(path, key_name)
        pickle_byte_obj = pickle.dump(svd)
        s3_resource = resource('s3')
        s3_resource.Object(bucket,full_key_name).put(Body=pickle_byte_obj)
        """
        # Normalizing
        min_max_scaler = preprocessing.MinMaxScaler()
        nor_model = min_max_scaler.fit(dim_reductor_df, y_res)
        ett_h.save_model(
            nor_model,
            ett_h.generate_dynamic_path(
                [model_folder, label + folder_time + normalizar_model_name]))
        scaled_df = nor_model.transform(dim_reductor_df)
        label_model_list[URLName.NORURL.value] = ett_h.generate_dynamic_path(
            [model_folder, label + folder_time + normalizar_model_name])
        """
        key_name = timestamp+label+normalizar_model_name
        full_key_name = os.path.join(path, key_name)
        pickle_byte_obj = pickle.dump(nor_model)
        s3_resource = resource('s3')
        s3_resource.Object(bucket,full_key_name).put(Body=pickle_byte_obj)
        """
        # Classifier
        clf = model_dict[ModelType.CLF.value][0].fit(scaled_df, y_res)
        clf.fit(scaled_df, y_res)
        ett_h.save_model(
            clf,
            ett_h.generate_dynamic_path(
                [model_folder, label + folder_time + model_name]))
        label_model_list[URLName.MODURL.value] = ett_h.generate_dynamic_path(
            [model_folder, label + folder_time + model_name])
        """
        key_name = timestamp+label+model_name
        full_key_name = os.path.join(path, key_name)
        pickle_byte_obj = pickle.dump(scaled_df)
        s3_resource = resource('s3')
        s3_resource.Object(bucket,full_key_name).put(Body=pickle_byte_obj)
        """
        endpoint_output[model_id] = [label_model_list]

    output = json.dumps(endpoint_output)
    return output
print '=============='
print metrics.confusion_matrix(labels, km.labels_)
print '=============='
print '-----------------------------------------------------'

#==============================================================================
#=========================Reduce Dimensionality (SVD)=========================
print '##############################################################'
for i in range(0, 5):
    print 'Performing truncatedSVD...'
    svd = TruncatedSVD(n_components=165, n_iter=13, random_state=42)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X_reduced = lsa.fit_transform(X)
    k_means(X_reduced, labels, 'truncatedSVD')

#==============================================================================
#=========================Reduce Dimensionality (PCA)=========================
print '##############################################################'
for i in range(0, 5):
    print 'Performing PCA...'
    'nntp', '00041032', '000062david42', '000050', '00041555', '0004244402',
    'mcimail', '00043819', 'prb', '0004246', '0004422', '00044513', '00044939',
    'access', 'digex', 'host', 'would', 'writes', 'posting', 'dseg'])

# In[5]:

vectorizer = TfidfVectorizer(stop_words=stopset, use_idf=True, ngram_range=(1, 3))
X = vectorizer.fit_transform(corpus)

# In[6]:

# decompose into X=UST^T
lsa = TruncatedSVD(n_components=25, n_iter=100)
lsa.fit(X)

# In[7]:

terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_):
    termsInComp = zip(terms, comp)
    sortedTerms = sorted(termsInComp, key=lambda x: x[1], reverse=True)[:10]
    print("Concept %d:" % i)
    for term in sortedTerms:
        print(term[0])
    print(" ")
def compute_pc(X, npc=1):
    svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
    svd.fit(X)
    return svd.components_
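# A minimal usage sketch for compute_pc: the returned principal component(s)
# are typically projected out of the embedding matrix, exactly as done in the
# `mat -= mat.dot(pc.T) * pc` snippet further below. The random embeddings
# here are an assumption for illustration only.
import numpy as np
from sklearn.decomposition import TruncatedSVD

X_emb = np.random.rand(50, 10)              # 50 embeddings of dimension 10
pc = compute_pc(X_emb, npc=1)               # shape (1, 10)
X_emb_no_pc = X_emb - X_emb.dot(pc.T) * pc  # remove the leading common component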
Discard the least important
"""
# discard that many elements from the end of TF IDF and dictionary
if discard > 0:
    TF = TF[:, :-discard]
    IDF = IDF[:-discard]
    dictionary = dictionary[:-discard]

'''
Create the TF-IDF MATRIX from the training data to be used with svd and gmms
'''
TFIDF = tfidf(TF, IDF)

'''
Singular Value Decomposition
'''
svd = TruncatedSVD(n_components=svd_components)
TFIDFsvd = svd.fit_transform(TFIDF)

'''
extract data for each class separately
GMMs will be trained separately on each class's TFIDF samples
'''
TFIDF_class = []
for class_num in range(1, 16):
    TFIDF_class.append(samples_from_class(TFIDFsvd, class_num, labels))

'''
GMM training
We train #classes = 15 GMMs to estimate the distribution of the features
Each row of the TFIDFsummed is a feature vector on which we train a GMM
'''
GMMS = []
for class_num in range(1, 16):
print "----------------------"

# Vectorization object
vectorizer = TfidfVectorizer(strip_accents=None, preprocessor=None,)

n_grams = [(x, y) for x in xrange(1, 2, 1) for y in xrange(2, 4, 1) if x < y]

classifiers = [
    ['KNN', KNeighborsClassifier(n_jobs=1, )],
    ['SGD', SGDClassifier()],
    ['DECISION TREE', DecisionTreeClassifier()],
]

parameters_list = [
    ['KNN', {
        'dec': TruncatedSVD(),
        'vect__tokenizer': [None, stemming_tokenizer],
        'vect__ngram_range': n_grams,
        'vect__analyzer': ['word', 'char'],
        'vect__max_df': np.arange(.8, 1., .1),
        'vect__min_df': np.arange(0., .2, .1),
        'vect__binary': [True, False],
        'vect__lowercase': [True, False],
        'vect__sublinear_tf': [True, False],
        'vect__stop_words': [None, stopwords.words("english")],
        'dec__n_components': xrange(10, 15, 2),
        'nbc__n_neighbors': xrange(3, 6, 1),
        'nbc__weights': ['distance', 'uniform'],
    }],
    ['DECISION TREE', {
def generateUserFeature(self, W):
    svd = TruncatedSVD(n_components=5)
    result = svd.fit(W).transform(W)
    return result
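# A minimal usage sketch for generateUserFeature, assuming W is a user-item
# (or user-user) matrix with more than 5 columns. The owning class is not
# shown in this snippet, so the method is called in an unbound style purely
# for illustration, with a random matrix standing in for real data.
import numpy as np
from sklearn.decomposition import TruncatedSVD

W_demo = np.random.rand(30, 12)                     # e.g. 30 users, 12 items
user_features = generateUserFeature(None, W_demo)   # shape (30, 5)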
import sys

import numpy as np
from sklearn.decomposition import TruncatedSVD

from googlengram import util
from vecanalysis.representations.explicit import Explicit

INPUT_DIR = "/dfs/scratch0/google_ngrams/5grams_ppmi_lsmooth_fixed/"
OUTPUT_DIR = "/dfs/scratch0/google_ngrams/vecs-svd/"
INPUT_PATH = INPUT_DIR + '{year}.bin'
OUTPUT_PATH = OUTPUT_DIR + '{year}-300vecs'

if __name__ == '__main__':
    year = sys.argv[1]
    print "Loading embeddings for year", year
    words = util.load_pickle("/dfs/scratch0/google_ngrams/info/interestingwords.pkl")
    base_embed = Explicit.load(INPUT_PATH.format(year=year), restricted_context=words)

    print "SVD for year", year
    pca = TruncatedSVD(n_components=300)
    new_mat = pca.fit_transform(base_embed.m)

    print "Saving year", year
    np.save(OUTPUT_PATH.format(year=year) + ".npy", new_mat)
    vocab_outfp = open(OUTPUT_PATH.format(year=year) + ".vocab", "w")
    words = [word.encode('utf-8') for word in base_embed.iw]
    vocab_outfp.write(" ".join(words))
def train_with_bag_of_words(X_train, y_train, scorer, classifier='SVC', search=True):
    """
    Pass the data through a pipeline and return a trained model.

    Args:
        X_train: Train data
        y_train: Labels for the train data (transformed by LabelEncoder)
        search : Whether to search for the best hyperparameters
    """
    estimators = {
        'SVC': SVC(
            C=5.1,
            kernel='linear',
            decision_function_shape='ovr',
            # class_weight='balanced'  # better without 'balanced'
        ),
        'LogisticRegression': LogisticRegression(C=5.1, ),
        'GradientBoostingClassifier': GradientBoostingClassifier(learning_rate=0.3),
    }

    if classifier != 'VotingClassifier':
        clf = estimators.get(classifier)
    else:
        estimators['SVC'].probability = True
        clf = VotingClassifier(estimators=[(k, v) for k, v in estimators.items()],
                               voting='soft')
    print(clf)

    pipeline = Pipeline(
        [
            ('col_transf',
             ColumnTransformer(
                 [
                     ('scaler', StandardScaler(), [
                         'budget', 'client.feedback', 'client.reviews_count',
                         'client.jobs_posted', 'client.past_hires'
                     ]),
                     ('title_vec',
                      Pipeline([
                          ('preprocessor', SpacyPreprocessor()),
                          ('tfidf', TfidfVectorizer(tokenizer=identity,
                                                    preprocessor=None,
                                                    lowercase=False,
                                                    use_idf=True,
                                                    ngram_range=(2, 2))),
                          ('svd', TruncatedSVD(n_components=150)),
                      ]), 'title'),
                     ('snippet_vec',
                      Pipeline([
                          ('preprocessor', SpacyPreprocessor()),
                          ('tfidf', TfidfVectorizer(
                              tokenizer=identity,
                              preprocessor=None,
                              lowercase=False,
                              use_idf=True,
                              sublinear_tf=False,  # not good results when True
                              ngram_range=(1, 2))),
                          ('svd', TruncatedSVD(n_components=100)),
                      ]), 'snippet'),
                     ('cat', ce.CatBoostEncoder(),
                      ["job_type", 'category2', 'client.country']),
                 ],
                 remainder='drop')),
            # ('oversampling', ADASYN(random_state=42)),
            ('classifier', clf),
        ],
        verbose=True)

    if search:
        log_space = gen_parameters_from_log_space(low_value=5, high_value=8, n_samples=10)
        lin_space = np.arange(2, 8, 2, dtype=np.int)
        if classifier == 'SVC':
            grid = {
                # 'union__title_vec__tfidf__ngram_range': [(1, 2), (2, 2)],
                # 'union__snippet_vec__tfidf__ngram_range': [(1, 2), (2, 2)],
                # 'union__snippet_vec__svd__n_components': np.arange(50, 301, 50),
                # 'union__title_vec__svd__n_components': np.arange(100, 301, 50),
                'classifier__C': log_space,
            }
        elif classifier == 'LogisticRegression':
            grid = {
                'classifier__C': gen_parameters_from_log_space(0.1, 10, 10),
            }
        elif classifier == 'GradientBoostingClassifier':
            grid = {
                'classifier__learning_rate': gen_parameters_from_log_space(0.01, 1, 10),
            }
        elif classifier == 'VotingClassifier':
            grid = {
                'classifier__lr__C': gen_parameters_from_log_space(0.1, 10, 10),
                'classifier__C': gen_parameters_from_log_space(5, 8, 10),
                'classifier__learning_rate': gen_parameters_from_log_space(0.01, 1, 10),
            }

        # With scoring="ovo", computes the average AUC of all possible pairwise
        # combinations of classes. Insensitive to class imbalance when
        # average='macro'.
        # Also see: https://stackoverflow.com/a/62471736/1253729
        searcher = GridSearchCV(
            estimator=pipeline,
            param_grid=grid,
            n_jobs=4,
            return_train_score=True,
            refit=True,
            verbose=True,
            cv=StratifiedKFold(n_splits=3),
            scoring=scorer,
        )
        model = searcher.fit(X_train, y_train.values.ravel())
        print(f"Best found parameters: {searcher.best_params_}")
    else:
        model = pipeline.fit(X_train, y_train.values.ravel())

    return model
# UNSUPERVISED MODEL
from model import *
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans

# Sparse SVD on tf-idf to reduce features to 50
print("start dimensionality reduction")
data = get_vectorized_tweets('training_vecs.npy').toarray()
svd_model = TruncatedSVD(n_components=50)
data_svd = svd_model.fit_transform(data)

print("start TSNE")
tsne_model = TSNE(n_components=2)
data_tsne = tsne_model.fit_transform(data_svd)
np.save('tsne_training_data.npy', data_tsne)

data_tsne = sample(np.asarray(get_vectorized_tweets('tsne_training_data.npy')), 500)
print(data_tsne.shape)
cluster_labels = KMeans(n_clusters=5).fit(data_tsne).labels_

import matplotlib.pyplot as plt

print("scatter:")
plt.scatter(data_tsne[:, 0], data_tsne[:, 1], c=cluster_labels)
plt.show()

# UNSUPERVISED MODEL ONLY TOXIC SPEECH
# select only toxic speech
df_data = pd.read_csv("twitter-sentiment-analysis-hatred-speech/train.csv",
                      names=('id', 'label', 'tweet'), header=None)
labels = df_data.to_numpy().T[1]
data_tsne = np.asarray(get_vectorized_tweets('tsne_training_data.npy'))
def svd_vector(data):
    svd = TruncatedSVD(n_components=1)
    # Keep the feature columns from position 6 onward and transpose so the
    # decomposition runs over them.
    vector = svd.fit_transform(data.iloc[:, 6:].transpose())
    return [item for sublist in vector for item in sublist]
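# A minimal usage sketch for svd_vector, assuming `data` is a DataFrame whose
# numeric feature columns start at position 6; the random frame below is an
# assumption used only to show the flattened output length.
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD

df_demo = pd.DataFrame(np.random.rand(8, 10),
                       columns=["col{}".format(i) for i in range(10)])
flat = svd_vector(df_demo)   # one value per column from position 6 onward
print(len(flat))             # 4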
        rows.append({'text': text, 'class': classification})
        index.append(filename)

    data_frame = DataFrame(rows, index=index)
    return data_frame


data = DataFrame({'text': [], 'class': []})
for path, classification in SOURCES:
    data = data.append(build_data_frame(path, classification))
data = data.reindex(np.random.permutation(data.index))

## now split files into training data and labels. probably tuple (filename, r/d)
classifier = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svd', TruncatedSVD(algorithm='randomized', n_components=300)),
    ('clf', XGBClassifier())
])

# classifier.fit(data['text'].values, data['class'].values)

k_fold = KFold(n_splits=8)
scores = []
confusion = np.array([[0, 0], [0, 0]])
for train_indices, test_indices in k_fold.split(data):
    train_text = data.iloc[train_indices]['text'].values
    train_y = data.iloc[train_indices]['class'].values

    test_text = data.iloc[test_indices]['text'].values
    test_y = data.iloc[test_indices]['class'].values
def train(n_components, demean, n_samples):
    print("Loading data...")
    movie_titles, ratings, rating_indices, n_users, n_items = get_netflix_data(n_samples=n_samples)
    print("number of users with ratings: {}".format(len(np.unique(rating_indices[:, 0]))))
    print("number of movies with ratings: {}".format(len(np.unique(rating_indices[:, 1]))))

    n_splits = 5
    kf = KFold(n_splits=n_splits, shuffle=True)
    kf.get_n_splits(rating_indices)

    if not n_components:
        components = [5, 10, 15, 20, 30, 50]
        components_loss_path = np.zeros((len(components), n_splits))
        print("Finding optimal number of components...")
        for n, n_components in enumerate(components):
            print("n_components: {}".format(n_components))
            for k, (train_index, test_index) in enumerate(kf.split(rating_indices)):
                mean = None
                print("Fold {}".format(k))
                test_indices = rating_indices[test_index]
                test_indices = test_indices[:, 0], test_indices[:, 1], test_indices[:, 2]
                if demean:
                    print("De-mean training data...")
                    train_indices = rating_indices[train_index]
                    mean = np.mean(train_indices[:, 2])
                    train_indices = train_indices[:, 0], train_indices[:, 1], train_indices[:, 2] - mean
                    data_train = scipy.sparse.csr_matrix(
                        (train_indices[2], (train_indices[0], train_indices[1])),
                        shape=(n_users, n_items))
                else:
                    user_test_indices, item_test_indices = test_indices[0], test_indices[1]
                    data_train = scipy.sparse.lil_matrix(ratings)
                    data_train[user_test_indices, item_test_indices] = 0
                    # convert the edited copy (with test entries zeroed) back to CSR
                    data_train = scipy.sparse.csr_matrix(data_train)
                print("Finished de-meaning.")
                start = time.time()
                print("Fitting...")
                svd = TruncatedSVD(n_components=n_components)
                P = svd.fit_transform(data_train)
                Q = svd.components_
                acc, loss = evaluate(P, Q, test_indices, mean=mean)
                print("Elapsed time: {:.1f}s".format(time.time() - start))
                print("loss: {:.4f} - acc: {:.4f}".format(loss, acc))
                components_loss_path[n, k] = loss
        mean_loss = np.mean(components_loss_path, axis=1)
        best_k = components[np.argmin(mean_loss)]
        best_loss = np.amin(mean_loss)
        print("best k: {}, best loss: {:.4f}".format(best_k, best_loss))
    else:
        print("Performing cross validation...")
        mean_acc = 0.0
        mean_loss = 0.0
        for k, (train_index, test_index) in enumerate(kf.split(rating_indices)):
            mean = None
            print("Fold {}".format(k))
            test_indices = rating_indices[test_index]
            test_indices = test_indices[:, 0], test_indices[:, 1], test_indices[:, 2]
            if demean:
                print("De-mean training data...")
                train_indices = rating_indices[train_index]
                mean = np.mean(train_indices[:, 2])
                train_indices = train_indices[:, 0], train_indices[:, 1], train_indices[:, 2] - mean
                data_train = scipy.sparse.csr_matrix(
                    (train_indices[2], (train_indices[0], train_indices[1])),
                    shape=(n_users, n_items))
                print("Finished de-meaning.")
            else:
                user_test_indices, item_test_indices = test_indices[0], test_indices[1]
                data_train = scipy.sparse.lil_matrix(ratings)
                data_train[user_test_indices, item_test_indices] = 0
                # convert the edited copy (with test entries zeroed) back to CSR
                data_train = scipy.sparse.csr_matrix(data_train)
            start = time.time()
            print("fitting...")
            svd = TruncatedSVD(n_components=n_components)
            P = svd.fit_transform(data_train)
            Q = svd.components_
            acc, loss = evaluate(P, Q, test_indices, mean=mean)
            print("Elapsed time: {:.4f}".format(time.time() - start))
            print("loss: {:.4f} - acc: {:.4f}".format(loss, acc))
            mean_acc = (mean_acc * k + acc) / (k + 1)
            mean_loss = (mean_loss * k + loss) / (k + 1)
        print("mean loss: {:.4f} - mean acc: {:.4f}".format(mean_loss, mean_acc))
# Tokenize each document into words
# Gets rid of stop words, and uses the stemmed version of each word
# Ignores words appearing in fewer than 5 (or 2 if min_df = 2) documents
vectorizer = CountVectorizer(min_df=5, stop_words=stop_words, tokenizer=LemmaTokenizer())
X_train_counts = vectorizer.fit_transform(eight_train.data)
X_test_counts = vectorizer.transform(eight_test.data)

# TFIDF
# We set smooth_idf = False so we use the equation idf(d, t) = log [ n / df(d, t) ] + 1
tfidf_transformer = TfidfTransformer(smooth_idf=False)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

# 'arpack' for the ARPACK wrapper in SciPy (scipy.sparse.linalg.svds)
svd = TruncatedSVD(n_components=50, algorithm='arpack')
X_train_lsi = svd.fit_transform(X_train_tfidf)
X_test_lsi = svd.transform(X_test_tfidf)

# separate into two groups (Computer Tech & Recreation)
train_target_group = [int(x / 4) for x in eight_train.target]
test_actual = [int(x / 4) for x in eight_test.target]

# Logistic Regression Classifier
log_reg = LogisticRegression()
log_reg.fit(X_train_lsi, train_target_group)
predicted = log_reg.predict(X_test_lsi)
predicted_probs = log_reg.predict_proba(X_test_lsi)
fpr, tpr, _ = roc_curve(test_actual, predicted_probs[:, 1])
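# A hedged follow-up sketch (assumed, not part of the original script): the
# ROC points computed above are commonly summarized with AUC and plotted.
from sklearn.metrics import auc
import matplotlib.pyplot as plt

roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, label='LSI + LogisticRegression (AUC = %0.3f)' % roc_auc)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()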
# Transposing the matrix
X = ratings_matrix.T
X.head()
# X = ratings_matrix
# X.head()
X.shape
X1 = X

# Decomposing the Matrix
SVD = TruncatedSVD(n_components=10)
decomposed_matrix = SVD.fit_transform(X)
decomposed_matrix.shape

# Correlation Matrix
correlation_matrix = np.corrcoef(decomposed_matrix)
correlation_matrix.shape

X.index[75]
# Index of product ID purchased by customer
i = "B00000K135"
product_names = list(X.index)
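# A hedged continuation sketch (not from the original snippet): given the
# correlation matrix over the SVD-decomposed product rows, the products most
# correlated with product i can be looked up like this, assuming i appears in
# X.index.
product_id_index = product_names.index(i)
correlation_with_product = correlation_matrix[product_id_index]
recommendations = [product_names[k]
                   for k in np.argsort(-correlation_with_product)[:10]
                   if product_names[k] != i]
print(recommendations)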
def main(argv):
    choose_mindf = argv[1]
    try:
        path = argv[2]
    except:
        path = None

    categories1 = [
        'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
        'comp.sys.mac.hardware', 'rec.autos', 'rec.motorcycles',
        'rec.sport.baseball', 'rec.sport.hockey'
    ]
    categories2 = [
        'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'misc.forsale',
        'soc.religion.christian'
    ]
    cat_all = [
        'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'misc.forsale',
        'soc.religion.christian', 'alt.atheism', 'comp.graphics',
        'comp.os.ms-windows.misc', 'comp.windows.x', 'rec.autos',
        'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey',
        'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space',
        'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc',
        'talk.religion.misc'
    ]
    dclass = Data(categories1, cat_all, categories2, path)
    stop_words = text.ENGLISH_STOP_WORDS

    print('-----Part A-----')
    # plot_histogram(dclass)

    print('-----Part B-----')
    vectorizer2 = CountVectorizer(min_df=2, stop_words=stop_words, max_df=0.8)
    tfidf_transformer2 = TfidfTransformer()
    vectorizer5 = CountVectorizer(min_df=5, stop_words=stop_words, max_df=0.8)
    tfidf_transformer5 = TfidfTransformer()
    tfidf2 = preprocess(dclass, dclass.training_data1, vectorizer2,
                        tfidf_transformer2, train=True)
    tfidf5 = preprocess(dclass, dclass.training_data1, vectorizer5,
                        tfidf_transformer5, train=True)  # default min_df=5
    print('# of terms with min_df = 2:', tfidf2[0, :].toarray().shape[1],
          '\n# of terms with min_df = 5:', tfidf5[0, :].toarray().shape[1])
    d_tfidf = {'2': tfidf2, '5': tfidf5}
    d_vectorizer = {'2': vectorizer2, '5': vectorizer5}
    d_transformer = {'2': tfidf_transformer2, '5': tfidf_transformer5}

    print('-----Part C-----')
    vectorizerc_2 = CountVectorizer(min_df=2, stop_words=stop_words, max_df=0.8)
    tfidf_transformerc_2 = TfidfTransformer()
    tfidf_c_2 = preprocess(dclass, dclass.training_data2, vectorizerc_2,
                           tfidf_transformerc_2, train=True, ICF=True)  # use TF-ICF
    find_10most(dclass, tfidf_c_2)
    vectorizerc_5 = CountVectorizer(min_df=5, stop_words=stop_words, max_df=0.8)
    tfidf_transformerc_5 = TfidfTransformer()
    tfidf_c_5 = preprocess(dclass, dclass.training_data2, vectorizerc_5,
                           tfidf_transformerc_5, train=True, ICF=True)  # use TF-ICF
    find_10most(dclass, tfidf_c_5)

    print('-----Part D-----')
    # SVD and NMF based on the selected TF-IDF result
    svd = TruncatedSVD(n_components=50, n_iter=7, random_state=42)
    D_LSI = svd.fit_transform(d_tfidf[choose_mindf])
    model = NMF(n_components=50, init='random', random_state=0)
    D_NMF = model.fit_transform(d_tfidf[choose_mindf])
    print('LSI.shape:', D_LSI.shape, '\nNMF.shape:', D_NMF.shape)

    print('-----Part E-----')
    # SVM
    tfidftest = preprocess(dclass, dclass.testing_data1, d_vectorizer[choose_mindf],
                           d_transformer[choose_mindf], train=False)  # testing data
    D_LSI_test = svd.transform(tfidftest)
    D_NMF_test = model.transform(tfidftest)
    print('for D_LSI:')
    part_e(dclass, D_LSI, D_LSI_test)
    print('for D_NMF:')
    part_e(dclass, D_NMF, D_NMF_test)

    print('-----Part F-----')
    print('for D_LSI:')
    part_f(dclass, D_LSI, D_LSI_test)
    print('for D_NMF:')
    part_f(dclass, D_NMF, D_NMF_test)

    print('-----Part G-----')
    part_g(dclass, D_NMF, D_NMF_test, dclass.training_target1)

    print('-----Part H-----')
    part_h(dclass, D_LSI, D_LSI_test)
    part_h(dclass, D_NMF, D_NMF_test)

    print('-----Part I-----')
    part_i(dclass, D_LSI, D_LSI_test)
    part_i(dclass, D_NMF, D_NMF_test)

    print('-----Part J-----')
    tfidf2_j = preprocess(dclass, dclass.training_dataj, vectorizer2,
                          tfidf_transformer2, train=True)
    D_LSI_j = svd.fit_transform(tfidf2_j)
    D_NMF_j = model.fit_transform(tfidf2_j)
    tfidftest_j = preprocess(dclass, dclass.testing_dataj, vectorizer2,
                             tfidf_transformer2, train=False)  # testing data
    D_LSI_test_j = svd.transform(tfidftest_j)
    D_NMF_test_j = model.transform(tfidftest_j)
    print('----------------Naive Bayes in J-----------------')
    part_g(dclass, D_NMF_j, D_NMF_test_j, dclass.training_targetj, True)
    print('----------------SVM in J with LSI data-----------')
    part_j_SVM(dclass, D_LSI_j, D_LSI_test_j)
    print('----------------SVM in J with NMF data-----------')
    part_j_SVM(dclass, D_NMF_j, D_NMF_test_j)
def test_pipeline_column_transformer(self):
    iris = datasets.load_iris()
    X = iris.data[:, :3]
    y = iris.target
    X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
    X_train["vcat"] = X_train["vA"].apply(lambda x: "cat1" if x > 0.5 else "cat2")
    X_train["vcat2"] = X_train["vB"].apply(lambda x: "cat3" if x > 0.5 else "cat4")
    y_train = y % 2
    numeric_features = [0, 1, 2]  # ["vA", "vB", "vC"]
    categorical_features = [3, 4]  # ["vcat", "vcat2"]

    classifier = LogisticRegression(
        C=0.01,
        class_weight=dict(zip([False, True], [0.2, 0.8])),
        n_jobs=1,
        max_iter=10,
        solver="lbfgs",
        tol=1e-3,
    )

    numeric_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])
    categorical_transformer = Pipeline(steps=[
        (
            "onehot",
            OneHotEncoder(sparse=True, handle_unknown="ignore"),
        ),
        (
            "tsvd",
            TruncatedSVD(n_components=1, algorithm="arpack", tol=1e-4),
        ),
    ])
    preprocessor = ColumnTransformer(transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ])
    model = Pipeline(steps=[("precprocessor", preprocessor),
                            ("classifier", classifier)])
    model.fit(X_train, y_train)

    initial_type = [
        ("numfeat", FloatTensorType([None, 3])),
        ("strfeat", StringTensorType([None, 2])),
    ]
    X_train = X_train[:11]
    model_onnx = convert_sklearn(model, initial_types=initial_type)
    dump_data_and_model(
        X_train,
        model,
        model_onnx,
        basename="SklearnPipelineColumnTransformerPipeliner",
        allow_failure="StrictVersion(onnx.__version__)"
        " < StrictVersion('1.3') or "
        "StrictVersion(onnxruntime.__version__)"
        " <= StrictVersion('0.4.0')",
    )

    if __name__ == "__main__":
        from onnx.tools.net_drawer import GetPydotGraph, GetOpNodeProducer
        pydot_graph = GetPydotGraph(
            model_onnx.graph,
            name=model_onnx.graph.name,
            rankdir="TP",
            node_producer=GetOpNodeProducer("docstring"),
        )
        pydot_graph.write_dot("graph.dot")
        import os
        os.system("dot -O -G=300 -Tpng graph.dot")
for size in tqdm(size_list):
    model = KeyedVectors.load("./trained_model/fasttext_gensim_" + str(size) + ".model")
    words_np = []
    words_label = []
    for word in list_words:
        words_np.append(model[word])
        words_label.append(word)

    word_vector_reduced = {}
    for index, vec in enumerate(words_np):
        word_vector_reduced[words_label[index]] = vec

    list_cosin_similarity = []
    for x, y in zip(data["Word1"], data["Word2"]):
        list_cosin_similarity.append(round(cosin_similarity(x, y, word_vector_reduced), 2))

    data['Relation_number'] = new_col
    data["FastText_" + str(size)] = list_cosin_similarity

    if size == 200:
        for new_size in size_list[:-1]:
            svd = TruncatedSVD(n_components=new_size, n_iter=30)
            svd.fit(words_np)
            reduced = svd.transform(words_np)

            word_vector_reduced = {}
            for index, vec in enumerate(reduced):
                word_vector_reduced[words_label[index]] = vec

            list_cosin_similarity = []
            for x, y in zip(data["Word1"], data["Word2"]):
                list_cosin_similarity.append(round(cosin_similarity(x, y, word_vector_reduced), 2))

            data["FastText_SVD_" + str(new_size)] = list_cosin_similarity

# Write the results to a CSV file
tmp_name = os.path.basename(path_visim).split('.')[0] + '_result.csv'
data.to_csv(os.path.join("./result", tmp_name), sep="\t")
# print titles
for sentence in test_data['Content']:
    temp_title = ''
    for j in range(10):
        temp_title = titles2[i] + ' ' + temp_title
    sentences2.append(temp_title + PorterStemmer().stem_sentence(sentence))
    i = i + 1

# Vectorizing - LSI - Classifier
X_train = np.array(sentences)
X_test = np.array(sentences2)
clf = Pipeline([('tfidf', TfidfVectorizer(stop_words=stopw)),
                ('svd', TruncatedSVD(n_components=1000)),
                ('clf', svm.SVC(C=10, gamma=0.0001, kernel='linear', class_weight='balanced')),
                ])
clf.fit(X_train, y)
predicted = clf.predict(X_test)

# Print Results
categories = le.inverse_transform(predicted)
i = 0
CsvData2 = [['Id', 'Category']]
for t in test_data['Id']:
    CsvData2.append([t, categories[i]])
    i = i + 1
if len(fns) > 1:
    print('Multiple merged embeddings in working directory.')
    sys.exit()
else:
    m = fns[0]

print('Reading raw.')
sys.stdout.flush()
df = pd.read_csv(m, index_col=0, header=None)
if df.index.names[0] == 0:
    print('Renaming index column to SampleID.')
    df.index.names = ['SampleID']
    df.to_csv(m, compression='gzip')
mat = df.to_numpy().T
sampids = df.index
del df

print('Performing svd.')
sys.stdout.flush()
svd = TruncatedSVD(n_components=1, n_iter=7, random_state=0)
svd.fit(mat)
pc = svd.components_
# Project the leading singular vector out of every sample vector.
mat -= mat.dot(pc.T) * pc

print('Saving nonraw.')
sys.stdout.flush()
df = pd.DataFrame(mat.T, index=sampids)
df.index.names = ['SampleID']
df.to_csv(m.replace('_raw', ''), compression='gzip')
transformer_list=[

    # Pipeline for pulling features from the post's title line
    ('title', Pipeline([
        ('selector', ItemSelector(key='title')),
        ('tfidf', TfidfVectorizer(min_df=50, stop_words='english')),
    ])),

    # Pipeline for standard bag-of-words model for abstract
    ('abstract_bow', Pipeline([
        ('selector', ItemSelector(key='abstract')),
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('best', TruncatedSVD(n_components=50)),
    ])),

    # Pipeline for pulling ad hoc features from post's abstract
    ('abstract_stats', Pipeline([
        ('selector', ItemSelector(key='abstract')),
        ('stats', TextStats()),        # returns a list of dicts
        ('vect', DictVectorizer()),    # list of dicts -> feature matrix
    ])),

],

# weight components in FeatureUnion
transformer_weights={
def main():
    with open("./huck_finn.txt", mode="r") as file:
        huck_finn = file.read()
    with open("./s_holmes.txt", mode="r") as file:
        s_holmes = file.read()

    print(len(s_holmes))
    print(len(huck_finn))

    # extract_names_raw(s_holmes, "./s_holmes_names.txt")
    # extract_names_raw(huck_finn, "./huck_finn_names.txt")
    #
    # After the names have been checked, load them and order by count of words
    # cleaned_s_holmes_names = []
    # with open("s_holmes_names.txt", "r") as file:
    #     for line in file:
    #         cleaned_s_holmes_names.append(line[:-1])
    # cleaned_s_holmes_names.sort(key=lambda x: -len(x.split(" ")))
    # for name in cleaned_s_holmes_names:
    #     s_holmes = s_holmes.replace(name, "xxnamexx")
    #
    # cleaned_huck_finn_names = []
    # with open("huck_finn_names.txt", "r") as file:
    #     for line in file:
    #         cleaned_huck_finn_names.append(line[:-1])
    # for name in cleaned_huck_finn_names:
    #     huck_finn = huck_finn.replace(name, "xxnamexx")
    #
    # print("_______")
    # print(len(s_holmes))
    # print(len(huck_finn))

    # s_holmes_sent = sent_tokenize(s_holmes)
    # print(s_holmes_sent[0])
    # for i in range(10):
    #     print(nltk.pos_tag(word_tokenize(s_holmes_sent[i].lower())))
    # print("____________________")
    # s_holmes_words = [i for i in word_tokenize(s_holmes.lower()) if i not in STOP]
    # print(nltk.pos_tag(s_holmes_words[:50]))
    # s_holmes_sizes = [len(i) for i in s_holmes_words]
    # print(s_holmes_sizes[:50])

    s_holmes_tokens = np.array(word_tokenize(s_holmes.lower()))
    s_holmes_lens = [len(w) for w in s_holmes_tokens]
    s_holmes_indexes = cumsum(s_holmes_lens) // S_HOLMES_LEN
    # print(s_holmes_tokens[:50])
    print(s_holmes_indexes)
    # print(sum(s_holmes_indexes == 0))
    # print(sum(s_holmes_indexes == 1))
    # print(sum(s_holmes_indexes == 100))
    # print(sum(s_holmes_indexes == 220))
    # print(sum(s_holmes_indexes == 221))

    corpus = []
    for ind in unique(s_holmes_indexes):
        corpus.append(" ".join(s_holmes_tokens[s_holmes_indexes == ind]))
    # print(corpus)
    # print(len(corpus))

    huck_finn_tokens = np.array(word_tokenize(huck_finn.lower()))
    huck_finn_lens = [len(w) for w in huck_finn_tokens]
    huck_finn_indexes = cumsum(huck_finn_lens) // HUCK_FINN_LEN
    print(huck_finn_indexes)
    # print(sum(huck_finn_indexes == 0))
    # print(sum(huck_finn_indexes == 1))
    # print(sum(huck_finn_indexes == 100))
    # print(sum(huck_finn_indexes == 227))
    # print(sum(huck_finn_indexes == 228))

    for ind in unique(huck_finn_indexes):
        corpus.append(" ".join(huck_finn_tokens[huck_finn_indexes == ind]))
    print(len(corpus))

    # # tf-idf
    # vectorizer = TfidfVectorizer(min_df=2, stop_words=stopwords.words("english"))
    # X = vectorizer.fit_transform(corpus)
    # print(X.shape)
    ## print(X[1,:])

    # ltf-real_entropy
    vectorizer = CountVectorizer(min_df=2, stop_words=stopwords.words("english"))
    X = vectorizer.fit_transform(corpus)
    print("X.shape: ", X.shape)
    X = X.A
    global_frequencies = X.sum(axis=1)
    print(global_frequencies)
    print(X[0, :] / global_frequencies[0])
    for i in range(X.shape[0]):
        p = X[i, :] / global_frequencies[i]
        real_entropy = -np.sum(p[p != 0] * np.log2(p[p != 0]))
        X[i, :] = np.log2(X[i, :] + 1) * real_entropy

    svd = TruncatedSVD(n_components=15, n_iter=7)
    svd.fit(X)
    print(svd.singular_values_)
    print(svd.transform(X).shape)
    # Xk = normalize(svd.transform(X), axis=1, norm="l2")
    Xk = svd.transform(X)
    # print("*______*")
    # print(X.A)
    # print(Xk[0, :])
    # print("*______*")

    S = Xk @ Xk.T
    print(S.shape)
    # print(type(S))
    np.savetxt("S_text.csv", S, delimiter=",")
    plt.imshow(S, cmap='hot', interpolation='nearest')
    plt.show()
]
dataset = fetch_20newsgroups(subset='all', categories=categories,
                             shuffle=True, random_state=42)
labels_true = dataset.target
true_k = np.unique(labels_true).shape[0]

# t0 = time()
vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000, min_df=2,
                             stop_words='english', use_idf=True)
X = vectorizer.fit_transform(dataset.data)

svd = TruncatedSVD(2)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X = lsa.fit_transform(X)
# explained_variance = svd.explained_variance_ratio_.sum()

Wardhierarchial = AgglomerativeClustering(affinity='euclidean',
                                          compute_full_tree='auto',
                                          connectivity=None,
                                          linkage='ward',
                                          memory=None,
                                          n_clusters=2,
                                          pooling_func='deprecated').fit(X)
labels = Wardhierarchial.labels_
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
path = "E:/MyPython/机器学习——达观杯/feature/feature_file/"

"""=====================================================================================================================
0 Load the TF-IDF features
"""
print("0 Loading the TF-IDF features")
tfidf_path = path + 'data_w_tfidf.pkl'
f_tfidf = open(tfidf_path, 'rb')
x_train, y_train, x_test = pickle.load(f_tfidf)
f_tfidf.close()

"""=====================================================================================================================
1 Dimensionality reduction: LSA
"""
print("1 Dimensionality reduction: LSA")
lsa = TruncatedSVD(n_components=200)
x_train = lsa.fit_transform(x_train)
x_test = lsa.transform(x_test)

"""=====================================================================================================================
2 Save the LSA features to disk
"""
print("2 Saving the LSA features to disk")
data = (x_train, y_train, x_test)
f_data = open(path + 'data_w_tfidf(lsa).pkl', 'wb')
pickle.dump(data, f_data)
f_data.close()

t_end = time.time()
print("LSA features finished, total time: {} min".format((t_end - t_start) / 60))
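# A hedged usage sketch (assumed, not part of the original script): the saved
# LSA features can later be reloaded the same way they were written above.
import pickle

with open(path + 'data_w_tfidf(lsa).pkl', 'rb') as f_data:
    x_train, y_train, x_test = pickle.load(f_data)
print(x_train.shape, x_test.shape)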