def spot_check(self, X_train, Y_train, n_splits=20):
    """Cross-validate a fixed set of one-vs-rest classifiers.

    Every candidate is wrapped in ``OneVsRestClassifier`` and scored with
    ``cross_validate`` on a shuffled ``KFold`` (``random_state=1``) using two
    scorers: Hamming loss and accuracy.

    Args:
        X_train: Training features handed to ``cross_validate``.
        Y_train: Training targets handed to ``cross_validate``.
        n_splits: Number of folds used by ``KFold``.

    Returns:
        A ``(names, acc, loss)`` tuple of parallel lists: the candidate
        names, the per-fold ``test_Accuracy`` results and the per-fold
        ``test_hamming`` results.
    """
    # One (name, estimator) pair per candidate. The earlier single
    # ``append`` call nested every other candidate inside the first tuple,
    # so the loop below only ever saw one entry.
    models = [
        ('SVC', OneVsRestClassifier(SVC(probability=True))),
        ('RM', OneVsRestClassifier(
            RandomForestClassifier(criterion='entropy'))),
        ('GB', OneVsRestClassifier(GradientBoostingClassifier())),
        ('MLP', OneVsRestClassifier(MLPClassifier())),
        ('LR', OneVsRestClassifier(LogisticRegression())),
    ]
    scoring = {
        'hamming': make_scorer(hamming_loss),
        'Accuracy': make_scorer(accuracy_score),
    }

    names = []
    acc = []
    loss = []
    for name, model in models:
        # A fresh, identically seeded splitter per model keeps the folds
        # the same across candidates.
        kfold = KFold(n_splits=n_splits, random_state=1, shuffle=True)
        cv_results = cross_validate(model, X_train, Y_train, cv=kfold,
                                    scoring=scoring,
                                    return_train_score=True)
        acc.append(cv_results['test_Accuracy'])
        loss.append(cv_results['test_hamming'])
        names.append(name)
    return names, acc, loss
def load_models(projectname):
    """Load the saved models of one project from disk.

    For every index ``i`` in ``range(MODEL_NUMBER)`` the architecture is read
    from ``<projectname>-<i>.json`` with ``model_from_json`` and the weights
    are loaded from ``<projectname>-<i>.h5``; both live in the hard-coded
    models directory.

    Args:
        projectname: Prefix of the saved model files.

    Returns:
        List of the loaded models, ordered by index.
    """
    model_dir = 'D:/TSE/python/missplaceclass/models/'
    models = []
    for i in range(MODEL_NUMBER):
        prefix = model_dir + projectname + '-' + str(i)
        # Close the architecture file deterministically instead of leaking
        # the handle returned by a bare open().
        with open(prefix + '.json') as json_file:
            model = model_from_json(json_file.read())
        model.load_weights(prefix + '.h5')
        models.append(model)
    return models
def build_model(tokenizer):
    """Build ``MODEL_NUMBER`` compiled two-branch models.

    A word2vec model is loaded from disk and used to fill an embedding
    matrix with one row per entry of ``tokenizer.word_index`` (row 0 stays
    zero). Each model has a "left" branch (frozen embedding followed by three
    ``Conv1D`` layers) and a "right" branch (three ``Conv1D`` layers over an
    input of shape ``(8, 1)``); the flattened branches are concatenated and
    fed through ``Dense(128, tanh)`` and ``Dense(1, sigmoid)``.

    Args:
        tokenizer: Object exposing ``word_index``, a mapping from word to
            integer row index.

    Returns:
        List of ``MODEL_NUMBER`` models compiled with binary cross-entropy,
        the ``'Adadelta'`` optimizer and the accuracy metric.
    """
    embedding_model = word2vec.Word2Vec.load(
        'D:/TSE/python/missplaceclass/embedding_model/new_model_1.bin')
    word_index = tokenizer.word_index
    nb_words = len(word_index)
    embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
    for word, row in word_index.items():
        if word == 'false':
            # Debug trace kept from the original implementation.
            print(word)
        # Looking up an out-of-vocabulary word raises KeyError rather than
        # returning None, so the old ``is not None`` guard never skipped
        # anything. Unknown words keep their all-zero row.
        try:
            embedding_vector = embedding_model.wv[word]
        except KeyError:
            continue
        embedding_matrix[row] = embedding_vector

    models = []
    for _ in range(MODEL_NUMBER):
        embedding_layer = Embedding(nb_words + 1,
                                    EMBEDDING_DIM,
                                    input_length=MAX_SEQUENCE_LENGTH,
                                    weights=[embedding_matrix],
                                    trainable=False)

        model_left = Sequential()
        model_left.add(embedding_layer)
        model_left.add(Conv1D(128, 1, padding="same", activation='tanh'))
        model_left.add(Conv1D(128, 1, activation='tanh'))
        model_left.add(Conv1D(128, 1, activation='tanh'))
        model_left.add(Flatten())

        model_right = Sequential()
        model_right.add(
            Conv1D(128, 1, input_shape=(8, 1), padding="same",
                   activation='tanh'))
        model_right.add(Conv1D(128, 1, activation='tanh'))
        model_right.add(Conv1D(128, 1, activation='tanh'))
        model_right.add(Flatten())

        output = merge.Concatenate()([model_left.output, model_right.output])
        output = Dense(128, activation='tanh')(output)
        output = Dense(1, activation='sigmoid')(output)

        input_left = model_left.input
        input_right = model_right.input
        model = Model([input_left, input_right], output)
        model.compile(loss='binary_crossentropy',
                      optimizer='Adadelta',
                      metrics=['accuracy'])
        models.append(model)
    return models
def topic_model_coherence_generator(corpus, texts, dictionary,
                                    start_topic_count=2, end_topic_count=10,
                                    step=1, cpus=1):
    """Fit one Mallet LDA model per topic count and score its coherence.

    Topic counts run from ``start_topic_count`` to ``end_topic_count``
    inclusive, advancing by ``step``; progress is shown through ``tqdm``.

    Args:
        corpus: Corpus passed to both the LDA wrapper and the coherence model.
        texts: Texts passed to the coherence model.
        dictionary: Used as ``id2word`` for LDA and as the coherence
            dictionary.
        start_topic_count: First topic count to try.
        end_topic_count: Last topic count to try (inclusive).
        step: Increment between topic counts.
        cpus: Value forwarded as ``workers`` to the Mallet wrapper.

    Returns:
        ``(models, coherence_scores)`` — two parallel lists, one entry per
        topic count tried.
    """
    fitted_models = []
    scores = []
    candidate_counts = range(start_topic_count, end_topic_count + 1, step)
    for n_topics in tqdm(candidate_counts):
        lda = gensim.models.wrappers.LdaMallet(
            mallet_path=MALLET_PATH,
            corpus=corpus,
            num_topics=n_topics,
            id2word=dictionary,
            iterations=500,
            workers=cpus,
        )
        coherence_model = gensim.models.CoherenceModel(
            model=lda,
            corpus=corpus,
            texts=texts,
            dictionary=dictionary,
            coherence='c_v',
        )
        scores.append(coherence_model.get_coherence())
        fitted_models.append(lda)
    return fitted_models, scores
def find_best_topic_num(dataset_name, lim_low, lim_high):
    """Train LDA models over a range of topic counts and save the best one.

    One ``LdaMulticore`` model is trained for every topic count from
    ``lim_low`` to ``lim_high`` inclusive. The coherence of each model is
    obtained from ``get_coherence_score``; the scores are plotted through
    ``draw_plot`` and the model with the highest score is saved to
    ``../models/tm_<dataset_name>.model``.

    Args:
        dataset_name: Name passed to ``load_topic_data`` and ``draw_plot``
            and embedded in the output file name.
        lim_low: Smallest topic count to try.
        lim_high: Largest topic count to try (inclusive).
    """
    sentences, dic, corpus = load_topic_data(dataset_name)

    fitted = []
    scores = []
    for topic_count in range(lim_low, lim_high + 1):
        candidate = gensim.models.ldamulticore.LdaMulticore(
            corpus=corpus,
            id2word=dic,
            num_topics=topic_count,
            random_state=100,
            chunksize=100,
            passes=10,
            per_word_topics=True,
        )
        fitted.append(candidate)
        scores.append(get_coherence_score(candidate, sentences, dic))

    best_score = max(scores)
    # index() returns the first occurrence, i.e. the smallest topic count
    # among ties.
    best_index = scores.index(best_score)
    topic_counts = list(range(lim_low, lim_low + len(scores)))
    draw_plot(dataset_name, topic_counts, scores, best_score,
              best_index + lim_low)
    fitted[best_index].save("../models/tm_" + dataset_name + ".model")
def build_model(num_topics):
    """Train ten LDA models on the files in ``resources/`` and pickle them.

    Every file in ``resources/`` is read as UTF-8; files whose content cannot
    be read are skipped. The documents go through ``process``, a dictionary
    is built and pruned of rare and overly common tokens, and ten
    ``LdaModel`` instances are trained on the resulting bag-of-words corpus.
    The list of models is pickled together with the dictionary into
    ``topic_models_top.pkl``.

    Args:
        num_topics: Number of topics for each ``LdaModel``.
    """
    print('Building model', flush=True)

    docs = []
    for filename in os.listdir("resources/"):
        with open("resources/" + filename, encoding='utf8') as doc:
            try:
                txt = doc.read()
            except (UnicodeDecodeError, OSError):
                # Best effort: skip files that are not readable UTF-8 text,
                # without also swallowing KeyboardInterrupt or programming
                # errors as a bare ``except`` would.
                continue
        docs.append(txt)
    docs = process(docs)

    dictionary = Dictionary(docs)
    # Filter out words that occur too frequently or too rarely.
    max_freq = 0.5
    min_wordcount = 2
    dictionary.filter_extremes(no_below=min_wordcount, no_above=max_freq)

    corpus = [dictionary.doc2bow(doc) for doc in docs]

    models = [LdaModel(corpus, num_topics=num_topics) for _ in range(10)]

    with open("topic_models_top.pkl", "wb") as mfile:
        print("Writing topic_models_top.pkl", flush=True)
        pickle.dump((models, dictionary), mfile)
def build_models(data, data_labels, unique_labels, N=30):
    """Train one LDA model per label on the documents carrying that label.

    For each entry of ``unique_labels`` the documents whose label matches are
    tokenized with ``word_tokenize``, a gensim dictionary is built from them,
    and an ``LdaModel`` is trained on their bag-of-words representation.

    Args:
        data: Indexable collection of raw documents.
        data_labels: Indexable collection of labels, aligned with ``data``.
        unique_labels: Indexable collection of the labels to build models
            for.
        N: Number of topics for every model.

    Returns:
        ``(models, vocabularies)`` — parallel lists with one LDA model and
        one dictionary per label, in ``unique_labels`` order.
    """
    models = []
    vocabularies = []
    for label_pos in range(len(unique_labels)):
        label = unique_labels[label_pos]
        # Positions of the documents belonging to the current label.
        doc_positions = [
            pos for pos in range(len(data)) if data_labels[pos] == label
        ]
        tokenized_docs = [word_tokenize(data[pos]) for pos in doc_positions]

        vocabulary = corpora.Dictionary(tokenized_docs)
        bag_of_words = [vocabulary.doc2bow(doc) for doc in tokenized_docs]
        lda_model = gensim.models.ldamodel.LdaModel(
            bag_of_words,
            num_topics=N,
            id2word=vocabulary,
            passes=25,
            alpha='auto',
            minimum_probability=0.01,
            random_state=30,
        )

        models.append(lda_model)
        vocabularies.append(vocabulary)
    return models, vocabularies
def iterate_topics(topic_range=range(10, 50, 5), number_of_records=None):
    """Fit and plot an LDA model for each topic count in ``topic_range``.

    The corpus is loaded from ``../texts/gists/``; its titles and the
    converted corpus are pickled into ``../outputs/``. Each fitted model is
    passed to ``plot_distances`` on its own, and finally the first model is
    plotted against the last one.

    Args:
        topic_range: Indexable sequence of topic counts to try.
        number_of_records: Forwarded to ``load_corpus``.

    Returns:
        List of the fitted models, in ``topic_range`` order.
    """
    corpus_list, titles = load_corpus('../texts/gists/',
                                      number_of_records=number_of_records)
    with open('../outputs/iter_titles.pkl', 'wb') as titles_file:
        pickle.dump(titles, titles_file)

    vectorizer = fit_vectorizer(corpus_list)
    corpus, id2word = convert_corpus(corpus_list, vectorizer)
    with open('../outputs/iter_corpus.pkl', 'wb') as corpus_file:
        pickle.dump(corpus, corpus_file)

    fitted = []
    for topic_count in topic_range:
        model = fit_lda(topic_count, corpus, id2word, 100,
                        multicore=2, save=True)
        plot_distances(model, title=f'Differences {topic_count} Topics')
        fitted.append(model)

    # Compare the two ends of the sweep with each other.
    first_model = fitted[0]
    comparison_title = (
        f'Comparing {topic_range[0]} to {topic_range[-1]} Topics')
    last_model = fitted[-1]
    plot_distances(first_model, title=comparison_title,
                   other_model=last_model)
    return fitted
def model_validation(self, X_train, Y_train, X_test, Y_test,
                     categories=None, subtitle='annot4'):
    """Fit, persist and evaluate the SVC, random-forest and ensemble models.

    Each classifier is fitted on the training data, pickled to
    ``self.results + <name> + "pickle_model.pkl"``, and evaluated on the test
    data: ROC and precision-recall curves, confusion matrices, and a line of
    metrics appended to
    ``self.results + '_' + <name> + '_' + subtitle + "output.txt"``.

    Args:
        X_train: Training features.
        Y_train: Training targets.
        X_test: Test features.
        Y_test: Test targets; must provide ``to_numpy()``.
        categories: Category names handed to the curve plots. When omitted
            or empty, ``self.categories`` is used.
        subtitle: Suffix used in plot titles and in the metrics file name.

    Returns:
        List of the fitted models, in the order SVC, RM, ens.
    """
    if categories is None or len(categories) < 1:
        categories = self.categories

    models = []
    models.append(('SVC', OneVsRestClassifier(SVC(
        probability=True, C=8, class_weight='balanced', degree=1,
        gamma='scale', kernel='rbf', break_ties=False, cache_size=200,
        coef0=0.0, decision_function_shape='ovr', max_iter=-1,
        random_state=None, shrinking=True, tol=0.001, verbose=False))))
    # NOTE(review): `min_impurity_split` and `max_features='auto'` appear to
    # have been removed in newer scikit-learn releases — confirm against the
    # pinned version.
    models.append(('RM', OneVsRestClassifier(RandomForestClassifier(
        bootstrap=True, ccp_alpha=0.0, class_weight=None,
        criterion='entropy', max_depth=None, max_features='auto',
        max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0,
        min_impurity_split=None, min_samples_leaf=1, min_samples_split=2,
        min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
        oob_score=False, random_state=None, verbose=0,
        warm_start=False))))
    # The ensemble reuses the two estimator objects created above.
    models.append(('ens', OneVsRestClassifier(estimator=EnsembleClassifier(
        classifiers=[["SVC", models[0][1]], ["RM", models[1][1]]]))))

    trained_models = []
    for name, estimator in models:
        model = estimator.fit(X_train, Y_train)
        trained_models.append(model)

        pkl_filename = name + "pickle_model.pkl"
        with open(self.results + pkl_filename, 'wb') as file:
            print("Saved classifier: " + name)
            pickle.dump(model, file)

        prediction_prob = model.predict_proba(X_test)
        Y_pred = model.predict(X_test)
        test_array = Y_test.to_numpy()

        title = 'valid' + '_' + str(name) + '_' + subtitle
        self.roc_curve_plot(test_array, prediction_prob, title, categories)
        self.pr_curve_plot(test_array, prediction_prob, title, categories)
        self.multi_cm(Y_test, Y_pred, title)

        # No trailing comma here: it previously turned the accuracy into a
        # one-element tuple, which was then written to the report as such.
        acc = accuracy_score(Y_test, Y_pred)
        loss = hamming_loss(Y_test, Y_pred)
        acc_c, recall, prec = self.evaluate_ml_metrics(Y_test, Y_pred)

        report_path = (self.results + '_' + str(name) + '_' + subtitle
                       + "output.txt")
        # Append through a context manager so the report file is closed.
        with open(report_path, "a") as report:
            print(acc, loss, acc_c, recall, prec, file=report)
    return trained_models
import time

import gensim.models

from python_code.model.my_tokenize.tokenizer import cut

# Word2Vec model files, loaded once at import time from ../bin/.
model_names = ['model_78w.bin', 'whole_content_1_100_80w.bin']
models = []
for model_name in model_names:
    t = time.time()
    models.append(gensim.models.Word2Vec.load('../bin/' + model_name))
    print('load model : ' + model_name + ' spend ' + str(time.time() - t) +
          ' seconds')


def similarity_test(arg1='台灣', arg2='中國'):
    """Print the similarity of two words under every loaded model.

    Args:
        arg1: First word.
        arg2: Second word.
    """
    print('similarity test ' + arg1 + " compare with " + arg2)
    for model in models:
        # Query through the keyed vectors: the shortcut methods on the
        # Word2Vec object itself were removed in gensim 4, while ``wv``
        # is available on older releases as well.
        print(model.wv.similarity(arg1, arg2))


def doesnt_match_test():
    """Print, for each sample word list, the odd one out per loaded model."""
    data_set = [["早餐", "午餐", "晚餐", "宵夜", "車禍"],
                ["國文", "英文", "數學", "物理", "電腦"],
                ["爸爸", "媽媽", "書包"]]
    for data in data_set:
        print(data)
        for model in models:
            print(model.wv.doesnt_match(data))
# Base name of the serialized artifacts, tagged with the raw data size.
data_file = 'processed-blog-posts-noun%s' % str(len(raw_data))

# Tokenized train / test posts, obtained through get_serialized_entity.
tokenized_posts_train = get_serialized_entity(
    'train-posts-%s', lambda: process_blog_posts(train_set['text']))
tokenized_posts_test = get_serialized_entity(
    'test-posts-%s', lambda: process_blog_posts(test_set['text']))

# One LDA model per topic count, evaluated by log perplexity on the test set.
topic_counts = [10, 20, 50]
log_perplexities, models, dictionaries = [], [], []
for num_topic in topic_counts:
    model_file = '%s-lda-model-topics-%s' % (data_file, str(num_topic))
    (corpus, dictionary, model) = get_serialized_entity(
        model_file, lambda: train_LDA(tokenized_posts_train, num_topic))
    test_corpus = [dictionary.doc2bow(text) for text in tokenized_posts_test]
    log_perplexities.append(model.log_perplexity(test_corpus))
    models.append(model)
    dictionaries.append(dictionary)

# Sample of test texts used to compare NER tags with LDA tags.
# NOTE(review): the slice 1:200 skips the first row and yields at most 199
# texts — confirm whether 0:200 was intended.
sample_texts = test_set['text'][1:200].copy()

selected_model_idx = 1
selected_model = models[selected_model_idx]
selected_dictionary = dictionaries[selected_model_idx]

# Tag every sample text once. An earlier ``sample_texts.apply(...)`` pass
# computed the same NER tags and was immediately overwritten by this loop,
# so it has been dropped.
ner_tags, lda_tags = [], []
for text in sample_texts:
    ner_tags.append(", ".join(get_ents(text)))
    lda_tags.append(", ".join(
        get_LDA_tags(selected_model, selected_dictionary, text)))
# Choose the number of topics by comparing c_v coherence scores.
coherenceScores = []
models = []

# Try topic counts 2 through 9 (the upper bound of range() is exclusive).
for num_topics in range(2, 10):
    # Build the LDA model for this topic count.
    ldaModel = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        random_state=0,
        update_every=1,
        chunksize=100,
        passes=10,
        alpha='auto',
        per_word_topics=True,
    )
    models.append(ldaModel)

    # Score the model and keep the coherence rounded to three decimals.
    coherencemodel = CoherenceModel(
        model=ldaModel,
        texts=tokens_lst,
        dictionary=id2word,
        coherence='c_v',
    )
    coherenceScores.append(round(coherencemodel.get_coherence(), 3))

print(coherenceScores)

# In[10]:

# Plot the coherence scores against the number of topics.
fig = plt.figure(figsize=(13, 8))
x = range(2, 10)
plt.plot(x, coherenceScores, color="steelblue")
def find_best_model_cv(n_topic_range, texts, id2word, corpus, threshold=None,
                       random_state=42, plot=True, verbose=False):
    """
    Searches for the best model in a given range by C_v coherence value

    Parameters:
        - `n_topic_range` a `range` of values for the `num_topics` parameter
          of a gensim LDA model to try (its `start` and `step` attributes
          are used)
        - `texts` a list of documents broken into words
        - `id2word` a dictionary containing word encodings
        - `corpus` the result of mapping each word in `texts` to its value
          in `id2word`
        - `threshold` a float that specifies a coherence value that if
          exceeded will cause the function to return early
        - `random_state` a random state for use in a gensim LDA model
        - `plot` a boolean specifying whether or not to plot coherence
          values against each `num_topics` value
        - `verbose` a boolean specifying whether or not to print updates

    Returns: a tuple containing the best model, the list of all models
    attempted, and a list of all coherence values obtained, respectively.
    On an early return the "best" model is the one that exceeded
    `threshold`.

    Raises: `ValueError` if `n_topic_range` is empty.
    """
    models = []
    coherence_vals = []

    # Width of the range, used for the progress percentage. Computed once,
    # and only when it is needed and the range is non-empty.
    span = 0
    if verbose and len(n_topic_range) > 0:
        span = max(n_topic_range) - n_topic_range.start

    for n_topics in n_topic_range:
        # Print percentage progress
        if verbose:
            # A single-value range has zero span; report 0% instead of
            # dividing by zero.
            if span:
                progress = 100 * (n_topics - n_topic_range.start) / span
            else:
                progress = 0.0
            print(str(round(progress, 1)) + "% done")

        lda_model = LdaModel(corpus=corpus,
                             id2word=id2word,
                             num_topics=n_topics,
                             random_state=random_state,
                             update_every=1,
                             chunksize=100,
                             passes=10,
                             alpha='auto',
                             per_word_topics=True)

        co_model = CoherenceModel(lda_model,
                                  texts=texts,
                                  dictionary=id2word,
                                  coherence="c_v")
        coherence = co_model.get_coherence()

        models.append(lda_model)
        coherence_vals.append(coherence)

        if threshold is not None and coherence > threshold:
            if verbose:
                print('Returning early with a coherence value of ' +
                      str(coherence))
            if plot:
                # The portion of the range that was actually iterated through
                actual_range = range(n_topic_range.start,
                                     n_topics + n_topic_range.step,
                                     n_topic_range.step)
                plt.plot(actual_range, coherence_vals, 'b')
                plt.show()
            return lda_model, models, coherence_vals

    if not models:
        # Nothing was trained; fail with a clear message rather than an
        # argmax error on an empty list.
        raise ValueError("n_topic_range is empty; no models were trained")

    if plot:
        plt.plot(n_topic_range, coherence_vals, 'b')
        plt.show()

    return models[np.argmax(coherence_vals)], models, coherence_vals