Example #1
def reduce_to_k_dim(M, dim):
    # Reduce matrix M to `dim` dimensions with truncated SVD
    svd = TruncatedSVD(n_components=dim, n_iter=30)
    reduced_model = svd.fit_transform(M)
    return reduced_model
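
A minimal usage sketch (assuming the function above and its TruncatedSVD import live in the same module; the toy corpus is purely illustrative):

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ["the cat sat on the mat",
          "the dog sat on the log",
          "cats and dogs and mats"]
M = TfidfVectorizer().fit_transform(corpus)   # sparse (3, n_terms) matrix
M_2d = reduce_to_k_dim(M, dim=2)              # dense (3, 2) array
print(M_2d.shape)
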
Example #2
    d2 = pdt_ttl_vec[i, :]
    dst_srch_ttl1[i] = cosine_similarity(d1, d2)

dst_srch_desc1 = np.zeros(srch_vec.shape[0])
for i in range(srch_vec.shape[0]):
    d1 = srch_vec[i, :]
    d2 = pdt_desc_vec[i, :]
    dst_srch_desc1[i] = cosine_similarity(d1, d2)

dst_ttl_desc1 = np.zeros(srch_vec.shape[0])
for i in range(srch_vec.shape[0]):
    d1 = pdt_ttl_vec[i, :]
    d2 = pdt_desc_vec[i, :]
    dst_ttl_desc1[i] = cosine_similarity(d1, d2)

svd = TruncatedSVD(n_components=30, random_state=2016)

srch_vec = svd.fit_transform(srch_vec)
pdt_ttl_vec = svd.fit_transform(pdt_ttl_vec)
pdt_desc_vec = svd.fit_transform(pdt_desc_vec)

srch_vec = pd.DataFrame(
    srch_vec, columns=['srch_vec_' + str(i) for i in range(srch_vec.shape[1])])
pdt_ttl_vec = pd.DataFrame(
    pdt_ttl_vec,
    columns=['ttl_vec_' + str(i) for i in range(pdt_ttl_vec.shape[1])])
pdt_desc_vec = pd.DataFrame(
    pdt_desc_vec,
    columns=['desc_vec_' + str(i) for i in range(pdt_desc_vec.shape[1])])

id = list(df_all['id'])
Example #3
File: test2.py Project: winaboy/CSE515P
if __name__ == "__main__":
    print 'loading x_tr...'
    t0 = time.time()
    x_tr = load_csr_matrix_from_npz('../data/processed/tf_idf_transformation/train/matrix.npz')
    print 'loading finished, time = {0}'.format(time.time()-t0)

    print 'loading y_tr...'
    t0 = time.time()
    y_tr = numpy.loadtxt('../data/processed/tf_idf_transformation/train/labels.csv', dtype='int')
    print 'loading finished, time = {0}'.format(time.time()-t0)
 
    print 'running TruncatedSVD...'
    t0 = time.time()
    from sklearn.decomposition import TruncatedSVD
    svd = TruncatedSVD(n_components=100)
    x_tr_new = svd.fit_transform(x_tr, y_tr)
    print 'running TruncatedSVD finished, x_new.shape = {0}, time = {1}'.format(x_tr_new.shape, time.time()-t0)
  
    #delete x_tr
    del x_tr

    print 'fitting model...'
    t0 = time.time()
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.linear_model import LogisticRegression
    clf = OneVsRestClassifier(LogisticRegression())
    clf.fit(x_tr_new, y_tr)
    print 'fitting finished, time = {0}'.format(time.time()-t0)

    #delete x_tr_new, y_tr
Example #4
def svd(*args, **kwargs):
    return TruncatedSVD(*args, **kwargs)
le = preprocessing.LabelEncoder()
le.fit(df["Category"])
Y_train=le.transform(df["Category"])
X_train1=df['Content'] 
X_train2=[]
for i in range(len(X_train1)):
	X_train2.append(10*df['Title'][i]+df['Content'][i])

X_train=np.array(X_train2)

#read test file
df_test=pd.read_csv("test_set.csv",sep="\t")

vectorizer=CountVectorizer(stop_words='english')
transformer=TfidfTransformer()
svd=TruncatedSVD(n_components=200, random_state=42) 
pipeline_test = Pipeline([
    ('vect', vectorizer),
    ('tfidf', transformer),
    ('svd',svd),
])
#My method---Voting Classifier
clf1 = BernoulliNB(fit_prior=False)
clf2 = KNeighborsClassifier(weights='distance',n_jobs=-1)
clf3 = RandomForestClassifier(n_estimators=500,n_jobs=-1)
clf = VotingClassifier(estimators=[('bnb',clf1),('knn',clf2),('rf',clf3)], voting='hard')
pipeline = Pipeline([
    ('vect', vectorizer),
    ('tfidf', transformer),
    ('svd',svd),
    ('clf', clf)
])

def do_lsa(X, target_dim):
    svd = TruncatedSVD(target_dim, random_state=42)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    return lsa.fit_transform(X)
Example #7
    def post(self):

        # Get the THEME labels
        abs_filename = ett_h.generate_dynamic_path(
            [base_folder_location, LabelType.THEME.value, label_file_name])
        labels = (ett_h.load_data_common_separated(abs_filename, ','))
        # Get the label data from input_data
        raw_label = TrainThemeUpload.input_data[ColumnName.LABEL.value]
        data = ett_t.transform_data_to_dataframe_basic(
            TrainThemeUpload.input_data, colnames)
        # Get the OneHotEncoded labels
        label_df = ett_t.one_hot_encoding(raw_label)  #17 labels dataframe
        # Rename the OneHotEncoded labels
        label_df.columns = labels
        # Get the number of labels
        num_of_labels = len(labels)
        # Data preprocessing
        nan_cleaned_data = ett_c.clean_dataframe_by_regex(
            data, RegexFilter.NON_ALPHA_NUMERIC.value
        )  # Removed all non alphanumeric characters
        d_cleaned_data = ett_c.clean_dataframe_by_regex(
            nan_cleaned_data,
            RegexFilter.DIGITS_ONLY.value)  # Removed all digits
        l_cleaned_data = ett_c.remove_non_iso_words(
            d_cleaned_data, Language.ENGLISH.value)  # Remove non-English text
        rew_cleaned_data = ett_c.remove_language_stopwords(
            l_cleaned_data, Language.ENGLISH.name)  # Remove English stop words
        l_transformed_data = ett_t.lowercase(
            rew_cleaned_data)  # Transform text to lowercase
        le_transformed_data = ett_t.stemming_mp(
            l_transformed_data
        )  # Transform text to core words i.e. playing > play
        data = le_transformed_data  # Return the newly transformed data

        # Split the data into 0.8 training datasets and 0.2 testing datasets
        X_train, X_test, y_train, y_test = train_test_split(data,
                                                            label_df,
                                                            test_size=0.2,
                                                            random_state=42)
        endpoint_output = {}
        for i in range(num_of_labels):
            model_id = str(i)
            single_label = y_train.iloc[:, i]
            label = labels[i]
            print("label", label)
            pipeline = imbPipeline([
                (ModelType.TFIDF.value,
                 TfidfVectorizer()),  # Data vectorization
                (ModelType.OVERSAMPLE.value,
                 SMOTE(random_state=42)),  # Data balancing
                (ModelType.SVD.value, TruncatedSVD()),  # Feature selection
                (ModelType.NOR.value,
                 preprocessing.MinMaxScaler()),  # Data normalization
                (ModelType.CLF.value, OneVsRestClassifier(SVC()))
            ])  # Classification

            #list_c = [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1]
            list_c = [1]

            #list_n = [100, 150, 200, 250, 300, 350, 400, 450, 500, 550])
            list_n = [100]
            # Remember to add[2,\]2]
            best_score = 0
            epsilon = .005
            dictionary = {}

            for para_c in list_c:
                for para_n in list_n:
                    parameters = {
                        ModelType.TFIDF.value: [
                            TfidfVectorizer(max_features=800,
                                            ngram_range=(1, 4),
                                            norm='l2',
                                            encoding='latin-1',
                                            stop_words='english',
                                            analyzer='word')
                        ],
                        ModelType.SVD.value: [
                            TruncatedSVD(n_components=para_n,
                                         n_iter=7,
                                         random_state=42)
                        ],
                        ModelType.CLF.value: [
                            OneVsRestClassifier(
                                SVC(kernel='linear',
                                    probability=True,
                                    C=para_c))
                        ]
                    }
                    gs_clf = GridSearchCV(pipeline,
                                          parameters,
                                          cv=5,
                                          error_score='raise',
                                          scoring='f1')
                    gs_clf = gs_clf.fit(X_train, single_label)
                    current_score = gs_clf.best_score_
                    dictionary[current_score] = parameters

            for current_score in dictionary.keys():
                if current_score - epsilon > best_score:
                    best_score = current_score

            model_dict = dictionary[best_score]

            label_model_list = {}
            label_model_list['score'] = best_score

            folder_time = time.strftime("_%Y%m%d_%H%M")
            # Create Directory in the AWS S3 Bucket
            os.mkdir("/Users/yihanbao/Desktop/unisdr-training/theme/" + label +
                     "/" + label + folder_time)
            # Navigate to AWS model saving folder
            model_folder = os.path.join(
                os.path.dirname(
                    os.path.dirname(
                        os.path.dirname(
                            os.path.dirname(os.path.realpath(__file__))))),
                ett_h.generate_dynamic_path(
                    [LabelType.THEME.value, label, label + folder_time]))
            """
            # Connect to AWS
            conn = boto.s3.connect_to_region(" ",aws_access_key_id = 'AWS-Access-Key', aws_secret_access_key = 'AWS-Secrete-Key',
                                 calling_format = boto.s3.connection.OrdinaryCallingFormat())

            bucket = conn.get_bucket("oict-psdg-unisdr-train-models-v1")
        
            # AWS Key 
            aws_path = ett_h.generate_dynamic_path([LabelType.THEME.value, label, timestamp+label])
            """
            # Fit the training data with the best-scoring parameters
            # Vectorization
            vector = model_dict[ModelType.TFIDF.value][0].fit(
                X_train, single_label)
            ett_h.save_model(
                vector,
                ett_h.generate_dynamic_path(
                    [model_folder, label + folder_time + vector_model_name]))
            vectorized_df = vector.transform(X_train)
            label_model_list[
                URLName.VECURL.value] = ett_h.generate_dynamic_path(
                    [model_folder, label + folder_time + vector_model_name])
            """
            key_name = timestamp+label+model_name
            full_key_name = os.path.join(path, key_name)
            
            pickle_byte_obj = pickle.dump(vector) 
            s3_resource = resource('s3')
            s3_resource.Object(bucket,full_key_name).put(Body=pickle_byte_obj)
            """
            # Balancing
            sm = SMOTE(random_state=42)
            X_res, y_res = sm.fit_resample(vectorized_df, single_label)

            # Feature selection
            svd = model_dict[ModelType.SVD.value][0].fit(X_res, y_res)
            ett_h.save_model(
                svd,
                ett_h.generate_dynamic_path([
                    model_folder, label + folder_time + dim_reductor_model_name
                ]))
            dim_reductor_df = svd.transform(X_res)
            label_model_list[
                URLName.DIMURL.value] = ett_h.generate_dynamic_path([
                    model_folder, label + folder_time + dim_reductor_model_name
                ])
            """
            key_name = timestamp+label+dim_reductor_model_name
            full_key_name = os.path.join(path, key_name)
            
            pickle_byte_obj = pickle.dump(svd) 
            s3_resource = resource('s3')
            s3_resource.Object(bucket,full_key_name).put(Body=pickle_byte_obj)
            """

            # Normalizing
            min_max_scaler = preprocessing.MinMaxScaler()
            nor_model = min_max_scaler.fit(dim_reductor_df, y_res)
            ett_h.save_model(
                nor_model,
                ett_h.generate_dynamic_path([
                    model_folder, label + folder_time + normalizar_model_name
                ]))
            scaled_df = nor_model.transform(dim_reductor_df)
            label_model_list[
                URLName.NORURL.value] = ett_h.generate_dynamic_path([
                    model_folder, label + folder_time + normalizar_model_name
                ])
            """
            key_name = timestamp+label+normalizar_model_name
            full_key_name = os.path.join(path, key_name)
            
            pickle_byte_obj = pickle.dump(nor_model) 
            s3_resource = resource('s3')
            s3_resource.Object(bucket,full_key_name).put(Body=pickle_byte_obj)
            """

            # Classifier
            clf = model_dict[ModelType.CLF.value][0].fit(scaled_df, y_res)
            ett_h.save_model(
                clf,
                ett_h.generate_dynamic_path(
                    [model_folder, label + folder_time + model_name]))
            label_model_list[
                URLName.MODURL.value] = ett_h.generate_dynamic_path(
                    [model_folder, label + folder_time + model_name])
            """
            key_name = timestamp+label+model_name
            full_key_name = os.path.join(path, key_name)
            
            pickle_byte_obj = pickle.dump(scaled_df) 
            s3_resource = resource('s3')
            s3_resource.Object(bucket,full_key_name).put(Body=pickle_byte_obj)
            """
            endpoint_output[model_id] = [label_model_list]
        output = json.dumps(endpoint_output)
        return output
Example #8
File: Q3.py Project: ttommytang/EE219
    print '=============='
    print metrics.confusion_matrix(labels, km.labels_)
    print '=============='
    print '-----------------------------------------------------'
    #==============================================================================






#=========================Reduce Dimensionality (SVD)==========================
print '##############################################################'
for i in range(0,5):
    print 'Performing truncatedSVD...'
    svd = TruncatedSVD(n_components=165, n_iter=13, random_state=42)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    
    X_reduced = lsa.fit_transform(X)
    
    k_means(X_reduced, labels, 'truncatedSVD')
#==============================================================================



#=========================Reduce Dimensionality (PCA)==========================
print '##############################################################'
for i in range(0,5):
    print 'Performing PCA...'
    
Example #9
                'nntp', '00041032', '000062david42', '000050', '00041555', '0004244402', 'mcimail', '00043819',
                'prb', '0004246', '0004422', '00044513', '00044939','access', 'digex', 'host', 'would', 'writes',
                'posting', 'dseg'])


# In[5]:

vectorizer = TfidfVectorizer(stop_words=stopset,
                                use_idf=True, ngram_range = (1, 3))
X = vectorizer.fit_transform(corpus)


# In[6]:

# decompose X into U * S * V^T (LSA)
lsa = TruncatedSVD(n_components=25, n_iter=100)
lsa.fit(X)


# In[7]:

terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_):
    termsInComp = zip(terms, comp)
    sortedTerms = sorted(termsInComp, key=lambda x: x[1], reverse=True)[:10]
    print("Concept %d:" % i )
    for term in sortedTerms:
        print(term[0])
    print(" ")

Example #10
def compute_pc(X, npc=1):
    svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
    svd.fit(X)
    return svd.components_
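
For context, the components returned here are typically used to remove their projection from an embedding matrix (the same `mat -= mat.dot(pc.T) * pc` pattern appears in a later snippet in this collection); a minimal sketch, where `X` is an illustrative dense array and `compute_pc` above is in scope:

import numpy as np
from sklearn.decomposition import TruncatedSVD  # needed by compute_pc above

X = np.random.rand(100, 50)            # illustrative dense embedding matrix
pc = compute_pc(X, npc=1)              # shape (1, 50)
X_no_pc = X - X.dot(pc.T).dot(pc)      # remove the projection on the top component
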
"""
Discard the least important terms
"""
# discard that many elements from the end of TF IDF and dictionary
if discard > 0:
    TF = TF[:, :-discard]
    IDF = IDF[:-discard]
    dictionary = dictionary[:-discard]
'''
Create the TF-IDF MATRIX from the training data
to be used with svd and gmms
'''
TFIDF = tfidf(TF, IDF)
'''
Singular Value Decomposition
'''
svd = TruncatedSVD(n_components=svd_components)
TFIDFsvd = svd.fit_transform(TFIDF)
'''
extract data for each class separately
GMMs will be trained separately on each classes TFIDF samples
'''
TFIDF_class = []
for class_num in range(1, 16):
    TFIDF_class.append(samples_from_class(TFIDFsvd, class_num, labels))
'''
GMM training
We train #classes = 15 GMMS to estimate the distribution of the features 
Each row of the TFIDFsummed is a feature vector on which we train a GMM
'''
GMMS = []
for class_num in range(1, 16):
    ...  # per-class GMM fitting (body missing from this snippet)
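
The loop body is missing from the source snippet; a hedged sketch of per-class GMM fitting with scikit-learn's GaussianMixture (the mixture size and covariance type are assumptions, not the original author's settings):

from sklearn.mixture import GaussianMixture

GMMS = []
for class_num in range(1, 16):
    gmm = GaussianMixture(n_components=4, covariance_type='diag', random_state=0)
    gmm.fit(TFIDF_class[class_num - 1])   # samples of this class in SVD space
    GMMS.append(gmm)
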
print "----------------------"

# Vectorization object
vectorizer = TfidfVectorizer(strip_accents=None, preprocessor=None,)

n_grams = [(x, y) for x in xrange(1, 2, 1) for y in xrange(2, 4, 1) if x < y]

classifiers = [
    ['KNN', KNeighborsClassifier(n_jobs=1, )],
    ['SGD', SGDClassifier()],
    ['DECISION TREE', DecisionTreeClassifier()],
]

parameters_list = [
    ['KNN', {
        'dec': TruncatedSVD(),
        'vect__tokenizer': [None, stemming_tokenizer],
        'vect__ngram_range': n_grams,
        'vect__analyzer': ['word', 'char'],
        'vect__max_df': np.arange(.8, 1., .1),
        'vect__min_df': np.arange(0., .2, .1),
        'vect__binary': [True, False],
        'vect__lowercase': [True, False],
        'vect__sublinear_tf': [True, False],
        'vect__stop_words': [None, stopwords.words("english")],

        'dec__n_components': xrange(10, 15, 2),
        'nbc__n_neighbors': xrange(3, 6, 1),
        'nbc__weights': ['distance', 'uniform'],
    }],
    ['DECISION TREE', {
Example #13
def generateUserFeature(self, W):
    svd = TruncatedSVD(n_components=5)
    result = svd.fit(W).transform(W)
    return result
Example #14
import sys

import numpy as np
from sklearn.decomposition import TruncatedSVD

from googlengram import util
from vecanalysis.representations.explicit import Explicit

INPUT_DIR = "/dfs/scratch0/google_ngrams/5grams_ppmi_lsmooth_fixed/"
OUTPUT_DIR = "/dfs/scratch0/google_ngrams/vecs-svd/"
INPUT_PATH = INPUT_DIR + '{year}.bin'
OUTPUT_PATH = OUTPUT_DIR + '{year}-300vecs'

if __name__ == '__main__':
    year = sys.argv[1]
    print "Loading embeddings for year", year
    words = util.load_pickle(
        "/dfs/scratch0/google_ngrams/info/interestingwords.pkl")
    base_embed = Explicit.load(INPUT_PATH.format(year=year),
                               restricted_context=words)
    print "SVD for year", year
    pca = TruncatedSVD(n_components=300)
    new_mat = pca.fit_transform(base_embed.m)
    print "Saving year", year
    np.save(OUTPUT_PATH.format(year=year) + ".npy", new_mat)
    vocab_outfp = open(OUTPUT_PATH.format(year=year) + ".vocab", "w")
    words = [word.encode('utf-8') for word in base_embed.iw]
    vocab_outfp.write(" ".join(words))
Example #15
def train_with_bag_of_words(X_train,
                            y_train,
                            scorer,
                            classifier='SVC',
                            search=True):
    """
    Pass the data through a pipeline and return a trained model.

    Args:
        X_train: Train data
        y_train: Labels for the train data (transformed by LabelEncoder)
        scorer: Scoring callable or string passed to GridSearchCV
        classifier: One of 'SVC', 'LogisticRegression',
            'GradientBoostingClassifier' or 'VotingClassifier'
        search: Whether to search for the best hyperparameters
    """

    estimators = {
        'SVC':
        SVC(
            C=5.1,
            kernel='linear',
            decision_function_shape='ovr',
            #class_weight            = 'balanced' # better without 'balanced'
        ),
        'LogisticRegression':
        LogisticRegression(C=5.1, ),
        'GradientBoostingClassifier':
        GradientBoostingClassifier(learning_rate=0.3),
    }

    if classifier != 'VotingClassifier':
        clf = estimators.get(classifier)
    else:
        estimators['SVC'].probability = True
        clf = VotingClassifier(estimators=[(k, v)
                                           for k, v in estimators.items()],
                               voting='soft')

    print(clf)

    pipeline = Pipeline(
        [
            (
                'col_transf',
                ColumnTransformer(
                    [
                        ('scaler', StandardScaler(), [
                            'budget', 'client.feedback',
                            'client.reviews_count', 'client.jobs_posted',
                            'client.past_hires'
                        ]),
                        ('title_vec',
                         Pipeline([
                             ('preprocessor', SpacyPreprocessor()),
                             ('tfidf',
                              TfidfVectorizer(tokenizer=identity,
                                              preprocessor=None,
                                              lowercase=False,
                                              use_idf=True,
                                              ngram_range=(2, 2))),
                             ('svd', TruncatedSVD(n_components=150)),
                         ]), 'title'),
                        (
                            'snippet_vec',
                            Pipeline([
                                ('preprocessor', SpacyPreprocessor()),
                                (
                                    'tfidf',
                                    TfidfVectorizer(
                                        tokenizer=identity,
                                        preprocessor=None,
                                        lowercase=False,
                                        use_idf=True,
                                        sublinear_tf=
                                        False,  # not good results when True
                                        ngram_range=(1, 2))),
                                ('svd', TruncatedSVD(n_components=100)),
                            ]),
                            'snippet'),
                        ('cat', ce.CatBoostEncoder(),
                         ["job_type", 'category2', 'client.country']),
                    ],
                    remainder='drop')),

            #('oversampling', ADASYN(random_state=42)),
            ('classifier', clf),
        ],
        verbose=True)

    if search:

        log_space = gen_parameters_from_log_space(low_value=5,
                                                  high_value=8,
                                                  n_samples=10)

        lin_space = np.arange(2, 8, 2, dtype=int)  # np.int was removed in NumPy 1.24

        if classifier == 'SVC':
            grid = {
                # 'union__title_vec__tfidf__ngram_range'   : [(1,2), (2,2)],
                # 'union__snippet_vec__tfidf__ngram_range' : [(1,2), (2,2)],
                # 'union__snippet_vec__svd__n_components'  : np.arange(50, 301, 50),
                # 'union__title_vec__svd__n_components'    : np.arange(100, 301, 50),
                'classifier__C': log_space,
            }

        elif classifier == 'LogisticRegression':
            grid = {
                'classifier__C': gen_parameters_from_log_space(0.1, 10, 10),
            }

        elif classifier == 'GradientBoostingClassifier':
            grid = {
                'classifier__learning_rate':
                gen_parameters_from_log_space(0.01, 1, 10),
            }

        elif classifier == 'VotingClassifier':
            # parameter names must be prefixed with the estimator keys used above
            grid = {
                'classifier__LogisticRegression__C':
                gen_parameters_from_log_space(0.1, 10, 10),
                'classifier__SVC__C':
                gen_parameters_from_log_space(5, 8, 10),
                'classifier__GradientBoostingClassifier__learning_rate':
                gen_parameters_from_log_space(0.01, 1, 10),
            }

        # With scoring="ovo", computes the average AUC of all possible pairwise
        # combinations of classes. Insensitive to class imbalance when
        # average='macro'.
        # Also see: https://stackoverflow.com/a/62471736/1253729

        searcher = GridSearchCV(
            estimator=pipeline,
            param_grid=grid,
            n_jobs=4,
            return_train_score=True,
            refit=True,
            verbose=True,
            cv=StratifiedKFold(n_splits=3),
            scoring=scorer,
        )

        model = searcher.fit(X_train, y_train.values.ravel())
        print(f"Best found parameters: {searcher.best_params_}")

    else:
        model = pipeline.fit(X_train, y_train.values.ravel())

    return model
#UNSUPERVISED MODEL
from model import *
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans


# Sparse SVD on tf-idf to reduce features to 50
print("start dimensionality reduction")
data = get_vectorized_tweets('training_vecs.npy').toarray()
svd_model = TruncatedSVD(n_components=50)
data_svd = svd_model.fit_transform(data)
print("start TSNE")
tsne_model = TSNE(n_components = 2)
data_tsne = tsne_model.fit_transform(data_svd)
np.save('tsne_training_data.npy', data_tsne)
data_tsne = sample(np.asarray(get_vectorized_tweets('tsne_training_data.npy')), 500)
print(data_tsne.shape)
cluster_labels = KMeans(n_clusters = 5).fit(data_tsne).labels_

import matplotlib.pyplot as plt
print("scatter:")
plt.scatter(data_tsne[:,0], data_tsne[:,1], c = cluster_labels)
plt.show()

#UNSUPERVISED MODEL ONLY TOXIC SPEECH
#select only toxic speech
df_data = pd.read_csv("twitter-sentiment-analysis-hatred-speech/train.csv",names=('id','label','tweet'),header=None)
labels = df_data.to_numpy().T[1]
data_tsne = np.asarray(get_vectorized_tweets('tsne_training_data.npy'))
Example #17
def svd_vector(data):
    svd = TruncatedSVD(n_components=1)
    # .ix was removed from pandas; use positional indexing instead
    vector = svd.fit_transform(data.iloc[:, 6:].transpose())
    return [item for sublist in vector for item in sublist]
Example #18
        rows.append({'text': text, 'class': classification})
        index.append(filename)

    data_frame = DataFrame(rows, index=index)
    return data_frame

data = DataFrame({'text': [], 'class': []})
for path, classification in SOURCES:
    data = data.append(build_data_frame(path, classification))

data = data.reindex(np.random.permutation(data.index))
## now split files into training data and labels. probably tuple (filename, r/d)

classifier = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svd', TruncatedSVD(algorithm='randomized', n_components=300)),
    ('clf', XGBClassifier())
])

#classifier.fit(data['text'].values, data['class'].values)

k_fold = KFold(n_splits=8)
scores = []
confusion = np.array([[0, 0], [0, 0]])
for train_indices, test_indices in k_fold.split(data):
    train_text = data.iloc[train_indices]['text'].values
    train_y = data.iloc[train_indices]['class'].values

    test_text = data.iloc[test_indices]['text'].values
    test_y = data.iloc[test_indices]['class'].values
Example #19
def train(n_components, demean, n_samples):
    print("Loading data...")
    movie_titles, ratings, rating_indices, n_users, n_items = get_netflix_data(
        n_samples=n_samples)
    print("number of users with ratings: {}".format(
        len(np.unique(rating_indices[:, 0]))))
    print("number of movies with ratings: {}".format(
        len(np.unique(rating_indices[:, 1]))))
    n_splits = 5
    kf = KFold(n_splits=n_splits, shuffle=True)
    kf.get_n_splits(rating_indices)

    if not n_components:
        components = [5, 10, 15, 20, 30, 50]
        components_loss_path = np.zeros((len(components), n_splits))
        print("Finding optimal number of components...")
        for n, n_components in enumerate(components):
            print("n_components: {}".format(n_components))
            for k, (train_index,
                    test_index) in enumerate(kf.split(rating_indices)):
                mean = None
                print("Fold {}".format(k))
                test_indices = rating_indices[test_index]
                test_indices = (test_indices[:, 0], test_indices[:, 1],
                                test_indices[:, 2])
                if demean:
                    print("De-mean training data...")
                    train_indices = rating_indices[train_index]
                    mean = np.mean(train_indices[:, 2])
                    train_indices = (train_indices[:, 0], train_indices[:, 1],
                                     train_indices[:, 2] - mean)
                    data_train = scipy.sparse.csr_matrix(
                        (train_indices[2],
                         (train_indices[0], train_indices[1])),
                        shape=(n_users, n_items))
                else:
                    user_test_indices = test_indices[0]
                    item_test_indices = test_indices[1]
                    data_train = scipy.sparse.lil_matrix(ratings)
                    data_train[user_test_indices, item_test_indices] = 0
                    data_train = scipy.sparse.csr_matrix(ratings)
                print("Finished de-meaning.")
                start = time.time()
                print("Fitting...")
                svd = TruncatedSVD(n_components=n_components)
                P = svd.fit_transform(data_train)
                Q = svd.components_
                acc, loss = evaluate(P, Q, test_indices, mean=mean)
                print("Elapsed time: {:.1f}s".format(time.time() - start))
                print("loss: {:.4f} - acc: {:.4f}".format(loss, acc))
                components_loss_path[n, k] = loss
        mean_loss = np.mean(components_loss_path, axis=1)
        best_k = components[np.argmin(mean_loss)]
        best_loss = np.amin(mean_loss)
        print("best k: {}, best loss: {:.4f}".format(best_k, best_loss))
    else:
        print("Performing cross validation...")
        mean_acc = 0.0
        mean_loss = 0.0
        for k, (train_index,
                test_index) in enumerate(kf.split(rating_indices)):
            mean = None
            print("Fold {}".format(k))
            test_indices = rating_indices[test_index]
            test_indices = (test_indices[:, 0], test_indices[:, 1],
                            test_indices[:, 2])
            if demean:
                print("De-mean training data...")
                train_indices = rating_indices[train_index]
                mean = np.mean(train_indices[:, 2])
                train_indices = (train_indices[:, 0], train_indices[:, 1],
                                 train_indices[:, 2] - mean)
                data_train = scipy.sparse.csr_matrix(
                    (train_indices[2], (train_indices[0], train_indices[1])),
                    shape=(n_users, n_items))
                print("Finished de-meaning.")
            else:
                user_test_indices = test_indices[0]
                item_test_indices = test_indices[1]
                data_train = scipy.sparse.lil_matrix(ratings)
                data_train[user_test_indices, item_test_indices] = 0
                data_train = scipy.sparse.csr_matrix(ratings)
            start = time.time()
            print("fitting...")
            svd = TruncatedSVD(n_components=n_components)
            P = svd.fit_transform(data_train)
            Q = svd.components_
            acc, loss = evaluate(P, Q, test_indices, mean=mean)
            print("Elapsed time: {:.4f}".format(time.time() - start))
            print("loss: {:.4f} - acc: {:.4f}".format(loss, acc))
            mean_acc = (mean_acc * k + acc) / (k + 1)
            mean_loss = (mean_loss * k + loss) / (k + 1)
        print("mean loss: {:.4f} - mean acc: {:.4f}".format(
            mean_loss, mean_acc))
Example #20
# Tokenize each document into words
# Gets rid of stop words and keeps the lemmatized form of each word
# Ignores words appearing in fewer than 5 (or 2 if min_df=2) documents
vectorizer = CountVectorizer(min_df=5, stop_words= stop_words, tokenizer=LemmaTokenizer() )
X_train_counts = vectorizer.fit_transform(eight_train.data)
X_test_counts = vectorizer.transform(eight_test.data)

# TFIDF
# We set smooth_idf = false so we use the equation idf(d, t) = log [ n / df(d, t) ] + 1
tfidf_transformer = TfidfTransformer(smooth_idf=False)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
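
As a quick sanity check of that idf formula against scikit-learn's output (a standalone sketch with made-up counts, not part of the original pipeline):

import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer

counts = np.array([[3, 0, 1],
                   [2, 1, 0],
                   [3, 0, 2],
                   [4, 1, 0]])
checker = TfidfTransformer(smooth_idf=False).fit(counts)
n = counts.shape[0]                 # 4 documents
df = (counts > 0).sum(axis=0)       # document frequency per term
print(checker.idf_)                 # idf learned by scikit-learn
print(np.log(n / df) + 1)           # idf(d, t) = log[n / df(d, t)] + 1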

# 'arpack' for the ARPACK wrapper in SciPy (scipy.sparse.linalg.svds)
svd = TruncatedSVD(n_components=50, algorithm='arpack')
X_train_lsi = svd.fit_transform(X_train_tfidf)
X_test_lsi = svd.transform(X_test_tfidf)

# separate into two groups(Computer Tech & Recreation)
train_target_group = [ int(x / 4) for x in eight_train.target]
test_actual= [ int(x / 4) for x in eight_test.target]

# Logistic Regression classifier
log_reg = LogisticRegression()
log_reg.fit(X_train_lsi, train_target_group)

predicted = log_reg.predict(X_test_lsi)
predicted_probs = log_reg.predict_proba(X_test_lsi)

fpr, tpr, _ = roc_curve(test_actual, predicted_probs[:,1])
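
The example stops after computing the ROC points; a short follow-up sketch (not in the original) to get the AUC and plot the curve:

import matplotlib.pyplot as plt
from sklearn.metrics import auc

roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, label='Logistic regression (AUC = %.3f)' % roc_auc)
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()
plt.show()
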
Example #21
# Transposing the matrix

X = ratings_matrix.T
X.head()

# X = ratings_matrix
# X.head()
X.shape


X1 = X

#Decomposing the Matrix

SVD = TruncatedSVD(n_components=10)
decomposed_matrix = SVD.fit_transform(X)
decomposed_matrix.shape

#Correlation Matrix

correlation_matrix = np.corrcoef(decomposed_matrix)
correlation_matrix.shape

X.index[75]

# Index of product ID purchased by customer

i = "B00000K135"

product_names = list(X.index)
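
The snippet is cut off here; the usual continuation (a hedged sketch, not the original code) looks up that product's row in the correlation matrix and recommends the most-correlated items:

product_idx = product_names.index(i)                 # row of "B00000K135"
corr_with_product = correlation_matrix[product_idx]
# product IDs most correlated with it, excluding the product itself
recommend = [p for p in X.index[corr_with_product > 0.90] if p != i]
print(recommend[:10])
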
Example #22
def main(argv):
    choose_mindf = argv[1]
    try:
        path = argv[2]
    except:
        path = None
    categories1 = [
        'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
        'comp.sys.mac.hardware', 'rec.autos', 'rec.motorcycles',
        'rec.sport.baseball', 'rec.sport.hockey'
    ]
    categories2 = [
        'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'misc.forsale',
        'soc.religion.christian'
    ]
    cat_all = [
        'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'misc.forsale',
        'soc.religion.christian', 'alt.atheism', 'comp.graphics',
        'comp.os.ms-windows.misc', 'comp.windows.x', 'rec.autos',
        'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey',
        'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space',
        'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc',
        'talk.religion.misc'
    ]
    dclass = Data(categories1, cat_all, categories2, path)
    stop_words = text.ENGLISH_STOP_WORDS

    print('-----Part A-----')
    #plot_histogram(dclass)
    print('-----Part B-----')

    vectorizer2 = CountVectorizer(min_df=2, stop_words=stop_words, max_df=0.8)
    tfidf_transformer2 = TfidfTransformer()

    vectorizer5 = CountVectorizer(min_df=5, stop_words=stop_words, max_df=0.8)
    tfidf_transformer5 = TfidfTransformer()
    tfidf2 = preprocess(dclass,
                        dclass.training_data1,
                        vectorizer2,
                        tfidf_transformer2,
                        train=True)
    tfidf5 = preprocess(dclass,
                        dclass.training_data1,
                        vectorizer5,
                        tfidf_transformer5,
                        train=True)  #default min_df=5

    print('# of terms with min_df = 2:', tfidf2[0, :].toarray().shape[1],
          '\n# of terms with min_df = 5:', tfidf5[0, :].toarray().shape[1])

    d_tfidf = {'2': tfidf2, '5': tfidf5}
    d_vectorizer = {'2': vectorizer2, '5': vectorizer5}
    d_transformer = {'2': tfidf_transformer2, '5': tfidf_transformer5}

    print('-----Part C-----')

    vectorizerc_2 = CountVectorizer(min_df=2,
                                    stop_words=stop_words,
                                    max_df=0.8)
    tfidf_transformerc_2 = TfidfTransformer()

    tfidf_c_2 = preprocess(dclass,
                           dclass.training_data2,
                           vectorizerc_2,
                           tfidf_transformerc_2,
                           train=True,
                           ICF=True)  # min_df=2, use TF-ICF
    find_10most(dclass, tfidf_c_2)

    vectorizerc_5 = CountVectorizer(min_df=5,
                                    stop_words=stop_words,
                                    max_df=0.8)
    tfidf_transformerc_5 = TfidfTransformer()

    tfidf_c_5 = preprocess(dclass,
                           dclass.training_data2,
                           vectorizerc_5,
                           tfidf_transformerc_5,
                           train=True,
                           ICF=True)  #default min_df=5, use TF-ICF
    find_10most(dclass, tfidf_c_5)

    print('-----Part D-----')  # SVD and NMF based on the chosen TF-IDF result
    svd = TruncatedSVD(n_components=50, n_iter=7, random_state=42)
    D_LSI = svd.fit_transform(d_tfidf[choose_mindf])
    model = NMF(n_components=50, init='random', random_state=0)
    D_NMF = model.fit_transform(d_tfidf[choose_mindf])
    print('LSI.shape:', D_LSI.shape, '\nNMF.shape:', D_NMF.shape)

    print('-----Part E-----')  #SVM
    tfidftest = preprocess(dclass,
                           dclass.testing_data1,
                           d_vectorizer[choose_mindf],
                           d_transformer[choose_mindf],
                           train=False)  #testing data
    D_LSI_test = svd.transform(tfidftest)
    D_NMF_test = model.transform(tfidftest)
    print('for D_LSI:')
    part_e(dclass, D_LSI, D_LSI_test)
    print('for D_NMF:')
    part_e(dclass, D_NMF, D_NMF_test)

    print('-----Part F-----')
    print('for D_LSI:')
    part_f(dclass, D_LSI, D_LSI_test)
    print('for D_NMF:')
    part_f(dclass, D_NMF, D_NMF_test)

    print('-----Part G-----')
    part_g(dclass, D_NMF, D_NMF_test, dclass.training_target1)

    print('-----Part H-----')
    part_h(dclass, D_LSI, D_LSI_test)
    part_h(dclass, D_NMF, D_NMF_test)

    print('-----Part I-----')
    part_i(dclass, D_LSI, D_LSI_test)
    part_i(dclass, D_NMF, D_NMF_test)

    print('-----Part J-----')

    tfidf2_j = preprocess(dclass,
                          dclass.training_dataj,
                          vectorizer2,
                          tfidf_transformer2,
                          train=True)
    D_LSI_j = svd.fit_transform(tfidf2_j)
    D_NMF_j = model.fit_transform(tfidf2_j)

    tfidftest_j = preprocess(dclass,
                             dclass.testing_dataj,
                             vectorizer2,
                             tfidf_transformer2,
                             train=False)  #testing data
    D_LSI_test_j = svd.transform(tfidftest_j)
    D_NMF_test_j = model.transform(tfidftest_j)

    print('----------------Naive Bayes in J-----------------')
    part_g(dclass, D_NMF_j, D_NMF_test_j, dclass.training_targetj, True)

    print('----------------SVM in J with LSI data-----------')
    part_j_SVM(dclass, D_LSI_j, D_LSI_test_j)

    print('----------------SVM in J with NMF data-----------')
    part_j_SVM(dclass, D_NMF_j, D_NMF_test_j)
Example #23
    def test_pipeline_column_transformer(self):

        iris = datasets.load_iris()
        X = iris.data[:, :3]
        y = iris.target
        X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"])
        X_train["vcat"] = X_train["vA"].apply(lambda x: "cat1"
                                              if x > 0.5 else "cat2")
        X_train["vcat2"] = X_train["vB"].apply(lambda x: "cat3"
                                               if x > 0.5 else "cat4")
        y_train = y % 2
        numeric_features = [0, 1, 2]  # ["vA", "vB", "vC"]
        categorical_features = [3, 4]  # ["vcat", "vcat2"]

        classifier = LogisticRegression(
            C=0.01,
            class_weight=dict(zip([False, True], [0.2, 0.8])),
            n_jobs=1,
            max_iter=10,
            solver="lbfgs",
            tol=1e-3,
        )

        numeric_transformer = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ])

        categorical_transformer = Pipeline(steps=[
            (
                "onehot",
                OneHotEncoder(sparse=True, handle_unknown="ignore"),
            ),
            (
                "tsvd",
                TruncatedSVD(n_components=1, algorithm="arpack", tol=1e-4),
            ),
        ])

        preprocessor = ColumnTransformer(transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ])

        model = Pipeline(steps=[("precprocessor",
                                 preprocessor), ("classifier", classifier)])

        model.fit(X_train, y_train)
        initial_type = [
            ("numfeat", FloatTensorType([None, 3])),
            ("strfeat", StringTensorType([None, 2])),
        ]

        X_train = X_train[:11]
        model_onnx = convert_sklearn(model, initial_types=initial_type)

        dump_data_and_model(
            X_train,
            model,
            model_onnx,
            basename="SklearnPipelineColumnTransformerPipeliner",
            allow_failure="StrictVersion(onnx.__version__)"
            " < StrictVersion('1.3') or "
            "StrictVersion(onnxruntime.__version__)"
            " <= StrictVersion('0.4.0')",
        )

        if __name__ == "__main__":
            from onnx.tools.net_drawer import GetPydotGraph, GetOpNodeProducer

            pydot_graph = GetPydotGraph(
                model_onnx.graph,
                name=model_onnx.graph.name,
                rankdir="TP",
                node_producer=GetOpNodeProducer("docstring"),
            )
            pydot_graph.write_dot("graph.dot")

            import os

            os.system("dot -O -G=300 -Tpng graph.dot")
    for size in tqdm(size_list):
        model = KeyedVectors.load("./trained_model/fasttext_gensim_" + str(size) + ".model")
        words_np = []
        words_label = []
        for word in list_words:
            words_np.append(model[word])
            words_label.append(word)
        word_vector_reduced = {}
        for index, vec in enumerate(words_np):
            word_vector_reduced[words_label[index]] = vec
        list_cosin_similarity = []
        for x, y in zip(data["Word1"], data["Word2"]):
            list_cosin_similarity.append(round(cosin_similarity(x, y, word_vector_reduced), 2))
        data['Relation_number'] = new_col
        data["FastText_" + str(size)] = list_cosin_similarity
        if size == 200:
            for new_size in size_list[:-1]:
                svd = TruncatedSVD(n_components=new_size, n_iter=30)
                svd.fit(words_np)
                reduced = svd.transform(words_np)
                word_vector_reduced = {}
                for index, vec in enumerate(reduced):
                    word_vector_reduced[words_label[index]] = vec
                list_cosin_similarity = []
                for x, y in zip(data["Word1"], data["Word2"]):
                    list_cosin_similarity.append(round(cosin_similarity(x, y, word_vector_reduced), 2))
                data["FastText_SVD_" + str(new_size)] = list_cosin_similarity
    # Write the results to a CSV file
    tmp_name = os.path.basename(path_visim).split('.')[0] + '_result.csv'
    data.to_csv(os.path.join("./result", tmp_name), sep="\t")
Example #25
# print titles

for sentence in test_data['Content']:
    temp_title = ''

    for j in range(10):
        temp_title = titles2[i] + ' ' + temp_title

    sentences2.append(temp_title + PorterStemmer().stem_sentence(sentence))
    i = i + 1

#Vectorizing-LSI-Classifier
X_train = np.array(sentences)
X_test = np.array(sentences2)
clf = Pipeline([('tfidf', TfidfVectorizer(stop_words=stopw)),\
                ('svd' , TruncatedSVD(n_components=1000) ),\
                ('clf', svm.SVC(C=10, gamma = 0.0001, kernel= 'linear', class_weight='balanced')),
               ])

clf.fit(X_train, y)
predicted = clf.predict(X_test)

#Print Results
categories = le.inverse_transform(predicted)

i = 0
CsvData2 = [['Id', 'Category']]

for t in test_data['Id']:
    CsvData2.append([t, categories[i]])
    i = i + 1
if len(fns) > 1:
    print('Multiple merged embeddings in working directory.')
    sys.exit()
else:
    m = fns[0]

print('Reading raw.')
sys.stdout.flush()
df = pd.read_csv(m, index_col=0, header=None)
if df.index.names[0] == 0:
    print('Renaming index column to SampleID.')
    df.index.names = ['SampleID']
    df.to_csv(m, compression='gzip')

mat = df.to_numpy().T
sampids = df.index
del df

print('Performing svd.')
sys.stdout.flush()
svd = TruncatedSVD(n_components=1, n_iter=7, random_state=0)
svd.fit(mat)
pc = svd.components_
mat -= mat.dot(pc.T) * pc

print('Saving nonraw.')
sys.stdout.flush()
df = pd.DataFrame(mat.T, index=sampids)
df.index.names = ['SampleID']
df.to_csv(m.replace('_raw', ''), compression='gzip')
Example #27
                transformer_list=[

                    # Pipeline for pulling features from the post's title line
                    ('title',
                     Pipeline([
                         ('selector', ItemSelector(key='title')),
                         ('tfidf',
                          TfidfVectorizer(min_df=50, stop_words='english')),
                     ])),

                    # Pipeline for standard bag-of-words model for abstract
                    ('abstract_bow',
                     Pipeline([
                         ('selector', ItemSelector(key='abstract')),
                         ('tfidf', TfidfVectorizer(stop_words='english')),
                         ('best', TruncatedSVD(n_components=50)),
                     ])),

                    # Pipeline for pulling ad hoc features from post's abstract
                    (
                        'abstract_stats',
                        Pipeline([
                            ('selector', ItemSelector(key='abstract')),
                            ('stats', TextStats()),  # returns a list of dicts
                            ('vect', DictVectorizer()
                             ),  # list of dicts -> feature matrix
                        ])),
                ],

                # weight components in FeatureUnion
                transformer_weights={
Example #28
def main():
    with open("./huck_finn.txt", mode="r") as file:
        huck_finn = file.read()

    with open("./s_holmes.txt", mode="r") as file:
        s_holmes = file.read()

    print(len(s_holmes))
    print(len(huck_finn))

    #    extract_names_raw(s_holmes, "./s_holmes_names.txt")
    #    extract_names_raw(huck_finn, "./huck_finn_names.txt")

    #    # After the names have been checked, load them and order by count of words
    #    cleaned_s_holmes_names = []
    #    with open("s_holmes_names.txt", "r") as file:
    #        for line in file:
    #            cleaned_s_holmes_names.append(line[:-1])
    #    cleaned_s_holmes_names.sort(key = lambda x: -len(x.split(" ")))
    #    for name in cleaned_s_holmes_names:
    #        s_holmes = s_holmes.replace(name, "xxnamexx")
    #
    #    cleaned_huck_finn_names = []
    #    with open("huck_finn_names.txt", "r") as file:
    #        for line in file:
    #            cleaned_huck_finn_names.append(line[:-1])
    #    for name in cleaned_huck_finn_names:
    #        huck_finn = huck_finn.replace(name, "xxnamexx")
    #
    #    print("_______")
    #    print(len(s_holmes))
    #    print(len(huck_finn))

    #    s_holmes_sent = sent_tokenize(s_holmes)
    #    print(s_holmes_sent[0])
    #    for i in range(10):
    #        print(nltk.pos_tag(word_tokenize(s_holmes_sent[i].lower())))
    #        print("____________________")

    #    s_holmes_words = [i for i in word_tokenize(s_holmes.lower()) if i not in STOP]
    #    print(nltk.pos_tag(s_holmes_words[:50]))
    #    s_holmes_sizes = [len(i) for i in s_holmes_words]
    #    print(s_holmes_sizes[:50])

    s_holmes_tokens = np.array(word_tokenize(s_holmes.lower()))
    s_holmes_lens = [len(w) for w in s_holmes_tokens]
    s_holmes_indexes = cumsum(s_holmes_lens) // S_HOLMES_LEN
    #    print(s_holmes_tokens[:50])
    print(s_holmes_indexes)
    #    print(sum(s_holmes_indexes == 0))
    #    print(sum(s_holmes_indexes == 1))
    #    print(sum(s_holmes_indexes == 100))
    #    print(sum(s_holmes_indexes == 220))
    #    print(sum(s_holmes_indexes == 221))

    corpus = []
    for ind in unique(s_holmes_indexes):
        corpus.append(" ".join(s_holmes_tokens[s_holmes_indexes == ind]))
#    print(corpus)
#    print(len(corpus))

    huck_finn_tokens = np.array(word_tokenize(huck_finn.lower()))
    huck_finn_lens = [len(w) for w in huck_finn_tokens]
    huck_finn_indexes = cumsum(huck_finn_lens) // HUCK_FINN_LEN
    print(huck_finn_indexes)
    #    print(sum(huck_finn_indexes == 0))
    #    print(sum(huck_finn_indexes == 1))
    #    print(sum(huck_finn_indexes == 100))
    #    print(sum(huck_finn_indexes == 227))
    #    print(sum(huck_finn_indexes == 228))

    for ind in unique(huck_finn_indexes):
        corpus.append(" ".join(huck_finn_tokens[huck_finn_indexes == ind]))

    print(len(corpus))

    #    # tf-idf
    #    vectorizer = TfidfVectorizer(min_df=2, stop_words=stopwords.words("english"))
    #    X = vectorizer.fit_transform(corpus)
    #    print(X.shape)
    ##    print(X[1,:])

    # ltf-real_entropy
    vectorizer = CountVectorizer(min_df=2,
                                 stop_words=stopwords.words("english"))
    X = vectorizer.fit_transform(corpus)
    print("X.shape: ", X.shape)
    X = X.A
    global_frequencies = X.sum(axis=1)
    print(global_frequencies)
    print(X[0, :] / global_frequencies[0])

    for i in range(X.shape[0]):
        p = X[i, :] / global_frequencies[i]
        real_entropy = -np.sum(p[p != 0] * np.log2(p[p != 0]))
        X[i, :] = np.log2(X[i, :] + 1) * real_entropy

    svd = TruncatedSVD(n_components=15, n_iter=7)
    svd.fit(X)
    print(svd.singular_values_)
    print(svd.transform(X).shape)

    # Xk = normalize(svd.transform(X), axis=1, norm="l2")
    Xk = svd.transform(X)
    #    print("*______*")
    #    print(X.A)
    #    print(Xk[0, :])
    #    print("*______*")

    S = Xk @ Xk.T
    print(S.shape)

    #    print(type(S))
    np.savetxt("S_text.csv", S, delimiter=",")

    plt.imshow(S, cmap='hot', interpolation='nearest')
    plt.show()
Example #29
]

dataset = fetch_20newsgroups(subset='all',
                             categories=categories,
                             shuffle=True,
                             random_state=42)
labels_true = dataset.target
true_k = np.unique(labels_true).shape[0]
# t0 = time()
vectorizer = TfidfVectorizer(max_df=0.5,
                             max_features=10000,
                             min_df=2,
                             stop_words='english',
                             use_idf=True)
X = vectorizer.fit_transform(dataset.data)
svd = TruncatedSVD(2)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X = lsa.fit_transform(X)
# explained_variance = svd.explained_variance_ratio_.sum()

Wardhierarchial = AgglomerativeClustering(affinity='euclidean',
                                          compute_full_tree='auto',
                                          connectivity=None,
                                          linkage='ward',
                                          memory=None,
                                          n_clusters=2,
                                          pooling_func='deprecated').fit(X)
labels = Wardhierarchial.labels_
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
Example #30
path = "E:/MyPython/机器学习——达观杯/feature/feature_file/"
"""=====================================================================================================================
0 Load the TF-IDF features
"""
print("0 Load the TF-IDF features")
tfidf_path = path +'data_w_tfidf.pkl'
f_tfidf = open(tfidf_path, 'rb')
x_train, y_train, x_test = pickle.load(f_tfidf)
f_tfidf.close()

"""=====================================================================================================================
1 Dimensionality reduction: LSA
"""
print("1 Dimensionality reduction: LSA")
lsa = TruncatedSVD(n_components=200)
x_train = lsa.fit_transform(x_train)
x_test = lsa.transform(x_test)

"""=====================================================================================================================
2 Save the LSA features locally
"""
print("2 Save the LSA features locally")
data = (x_train, y_train, x_test)
f_data = open(path + 'data_w_tfidf(lsa).pkl', 'wb')
pickle.dump(data, f_data)
f_data.close()

t_end = time.time()
print("lsa特征完成,共耗时:{}min".format((t_end-t_start)/60))