def __init__(self, train_corpus_fname=None, tokenized_train_corpus_fname=None,
             test_corpus_fname=None, tokenized_test_corpus_fname=None,
             model_name="bert", model_save_path=None, vocab_fname=None,
             eval_every=1000, batch_size=32, num_epochs=10,
             dropout_keep_prob_rate=0.9, model_ckpt_path=None, sp_model_path=None):
    # configurations
    tf.logging.set_verbosity(tf.logging.INFO)
    self.model_name = model_name
    self.eval_every = eval_every
    self.model_ckpt_path = model_ckpt_path
    self.model_save_path = model_save_path
    self.batch_size = batch_size
    self.num_epochs = num_epochs
    self.dropout_keep_prob_rate = dropout_keep_prob_rate
    self.best_valid_score = 0.0
    if not os.path.exists(model_save_path):
        os.mkdir(model_save_path)
    # define tokenizer
    if self.model_name == "bert":
        self.tokenizer = FullTokenizer(vocab_file=vocab_fname, do_lower_case=False)
    elif self.model_name == "xlnet":
        sp = spm.SentencePieceProcessor()
        sp.Load(sp_model_path)
        self.tokenizer = sp
    else:
        self.tokenizer = get_tokenizer("mecab")
    # load or tokenize corpus
    self.train_data, self.train_data_size = self.load_or_tokenize_corpus(train_corpus_fname, tokenized_train_corpus_fname)
    self.test_data, self.test_data_size = self.load_or_tokenize_corpus(test_corpus_fname, tokenized_test_corpus_fname)
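# Illustration only (a hypothetical helper, not part of the original class): the three
# tokenizer branches above expose different tokenization APIs downstream, and a sketch of
# how a raw sentence would be tokenized in each case looks like this.
def tokenize(self, sentence):
    if self.model_name == "bert":
        return self.tokenizer.tokenize(sentence)        # WordPiece via FullTokenizer
    elif self.model_name == "xlnet":
        return self.tokenizer.EncodeAsPieces(sentence)  # SentencePiece subword pieces
    else:
        return self.tokenizer.morphs(sentence)          # KoNLPy Mecab morphemes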
def latent_dirichlet_allocation(corpus_fname, output_fname, tokenizer_name="mecab"):
    make_save_path(output_fname)
    documents, tokenized_corpus = [], []
    tokenizer = get_tokenizer(tokenizer_name)
    with open(corpus_fname, 'r', encoding='utf-8') as f:
        for document in f:
            tokens = list(set(tokenizer.morphs(document.strip())))
            documents.append(document)
            tokenized_corpus.append(tokens)
    dictionary = corpora.Dictionary(tokenized_corpus)
    corpus = [dictionary.doc2bow(text) for text in tokenized_corpus]
    LDA = ldamulticore.LdaMulticore(corpus, id2word=dictionary,
                                    num_topics=30,
                                    minimum_probability=0.0,
                                    workers=4)
    # Keep a document only when some topic's probability exceeds 0.5.
    # Since the topic probabilities sum to 1, that topic is necessarily the
    # highest-probability topic for the document.
    all_topics = LDA.get_document_topics(corpus, minimum_probability=0.5, per_word_topics=False)
    with open(output_fname + ".results", 'w') as f:
        for doc_idx, topic in enumerate(all_topics):
            if len(topic) == 1:
                topic_id, prob = topic[0]
                f.writelines(documents[doc_idx].strip() + "\u241E" + ' '.join(tokenized_corpus[doc_idx]) + "\u241E" + str(topic_id) + "\u241E" + str(prob) + "\n")
    LDA.save(output_fname + ".model")
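# Usage sketch (not from the original source): the LDA routine above reads one raw document
# per line and writes "<output_fname>.results" and "<output_fname>.model". The paths below
# are hypothetical examples.
latent_dirichlet_allocation(corpus_fname="data/processed/corpus.txt",
                            output_fname="data/lda",
                            tokenizer_name="mecab")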
def latent_semantic_analysis(corpus_fname, output_fname, tokenizer_name="mecab"):
    make_save_path(output_fname)
    tokenizer = get_tokenizer(tokenizer_name)
    titles, raw_corpus, noun_corpus = [], [], []
    with open(corpus_fname, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                title, document = line.strip().split("\u241E")
                titles.append(title)
                raw_corpus.append(document)
                nouns = tokenizer.nouns(document)
                noun_corpus.append(' '.join(nouns))
            except:
                # skip malformed lines
                continue
    # construct tf-idf matrix
    vectorizer = TfidfVectorizer(min_df=1,
                                 ngram_range=(1, 1),
                                 lowercase=True,
                                 tokenizer=lambda x: x.split())
    input_matrix = vectorizer.fit_transform(noun_corpus)
    # compute truncated SVD
    svd = TruncatedSVD(n_components=100)
    vecs = svd.fit_transform(input_matrix)
    with open(output_fname, 'w') as f:
        for doc_idx, vec in enumerate(vecs):
            str_vec = [str(el) for el in vec]
            f.writelines(titles[doc_idx] + "\u241E" + raw_corpus[doc_idx] + '\u241E' + ' '.join(str_vec) + "\n")
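# Usage sketch (not from the original source): the LSA routine above expects each corpus line
# to be "title\u241Edocument" (U+241E is the record separator used throughout these scripts)
# and writes one "title\u241Edocument\u241E100-dim vector" line per document. Paths are
# hypothetical examples.
latent_semantic_analysis(corpus_fname="data/processed/wiki.txt",
                         output_fname="data/lsa-tfidf.vecs",
                         tokenizer_name="mecab")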
def __init__(self,
             tune_model_fname="/notebooks/embedding/data/sentence-embeddings/elmo/tune-ckpt",
             pretrain_model_fname="/notebooks/embedding/data/sentence-embeddings/elmo/pretrain-ckpt/elmo.model",
             options_fname="/notebooks/embedding/data/sentence-embeddings/elmo/pretrain-ckpt/options.json",
             vocab_fname="/notebooks/embedding/data/sentence-embeddings/elmo/pretrain-ckpt/elmo-vocab.txt",
             max_characters_per_token=30, dimension=256, num_labels=2, use_notebook=False):
    # configurations
    super().__init__("elmo", dimension, use_notebook)
    self.tokenizer = get_tokenizer("mecab")
    self.batcher = Batcher(lm_vocab_file=vocab_fname, max_token_length=max_characters_per_token)
    self.ids_placeholder, self.elmo_embeddings, self.probs = make_elmo_graph(
        options_fname, pretrain_model_fname, max_characters_per_token, num_labels, tune=False)
    # restore model
    saver = tf.train.Saver(tf.global_variables())
    self.sess = tf.Session()
    checkpoint_path = tf.train.latest_checkpoint(tune_model_fname)
    saver.restore(self.sess, checkpoint_path)
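# A hedged sketch of how an inference helper for the class above might look; the method name
# "predict" is an assumption, but it only uses objects created in __init__. Batcher.batch_sentences
# (from the bilm package) converts a batch of tokenized sentences into the character-id tensor
# expected by ids_placeholder.
def predict(self, sentence):
    tokens = self.tokenizer.morphs(sentence)
    char_ids = self.batcher.batch_sentences([tokens])
    probs = self.sess.run(self.probs, feed_dict={self.ids_placeholder: char_ids})
    return probs[0]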
def __init__(self, train_corpus_fname=None, tokenized_train_corpus_fname=None,
             test_corpus_fname=None, tokenized_test_corpus_fname=None,
             model_name='bert', model_save_path=None, vocab_fname=None,
             eval_every=1000, batch_size=32, num_epochs=10,
             dropout_keep_prob_rate=0.9, model_ckpt_path=None):
    self.model_name = model_name
    self.eval_every = eval_every
    self.model_ckpt_path = model_ckpt_path
    self.model_save_path = model_save_path
    self.batch_size = batch_size
    self.num_epochs = num_epochs
    self.dropout_keep_prob_rate = dropout_keep_prob_rate
    self.best_valid_score = 0.0
    # define tokenizer
    if self.model_name == 'bert':
        self.tokenizer = FullTokenizer(vocab_file=vocab_fname, do_lower_case=False)
    else:
        self.tokenizer = get_tokenizer('mecab')
    # load or tokenize corpus
    self.train_data, self.train_data_size = self.load_or_tokenize_corpus(train_corpus_fname, tokenized_train_corpus_fname)
    self.test_data, self.test_data_size = self.load_or_tokenize_corpus(test_corpus_fname, tokenized_test_corpus_fname)
def __init__(self, train_fname, embedding_fname, model_fname, embedding_corpus_fname,
             embedding_method="fasttext", is_weighted=True, average=False, dim=100, tokenizer_name="mecab"):
    # configurations
    make_save_path(model_fname)
    self.dim = dim
    self.average = average
    if is_weighted:
        model_full_fname = model_fname + "-weighted"
    else:
        model_full_fname = model_fname + "-original"
    self.tokenizer = get_tokenizer(tokenizer_name)
    if is_weighted:
        # ready for weighted embeddings
        self.embeddings = self.load_or_construct_weighted_embedding(embedding_fname, embedding_method, embedding_corpus_fname)
        print("loading weighted embeddings, complete!")
    else:
        # ready for original embeddings
        words, vectors = self.load_word_embeddings(embedding_fname, embedding_method)
        self.embeddings = defaultdict(list)
        for word, vector in zip(words, vectors):
            self.embeddings[word] = vector
        print("loading original embeddings, complete!")
    if not os.path.exists(model_full_fname):
        print("train Continuous Bag of Words model")
        self.model = self.train_model(train_fname, model_full_fname)
    else:
        print("load Continuous Bag of Words model")
        self.model = self.load_model(model_full_fname)
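# Illustration only (a hypothetical helper, not the author's train_model): with self.embeddings
# mapping each word to a vector, a sentence can be embedded by summing or averaging its token
# vectors, which is the idea behind the weighted/original "Continuous Bag of Words" options above.
import numpy as np

def embed_sentence(self, sentence):
    tokens = self.tokenizer.morphs(sentence)
    vectors = [self.embeddings[token] for token in tokens if token in self.embeddings]
    if not vectors:
        return np.zeros(self.dim)
    return np.mean(vectors, axis=0) if self.average else np.sum(vectors, axis=0)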
def __init__(self, vecs_txt_fname, vecs_bin_fname=None, method="word2vec", dim=100, tokenizer_name="mecab"):
    self.tokenizer = get_tokenizer(tokenizer_name)
    self.tokenizer_name = tokenizer_name
    self.dim = dim
    self.method = method
    self.dictionary, self.words, self.vecs = self.load_vectors(vecs_txt_fname, method)
    if "fasttext" in method:
        self.model = load_ft_model(vecs_bin_fname)
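# Usage sketch (hypothetical class name and file paths): this __init__ belongs to a word-embedding
# evaluator; for a FastText model the binary file is also passed so that out-of-vocabulary words
# can still be embedded from subword vectors.
model = WordEmbeddingEvaluator(vecs_txt_fname="data/word-embeddings/fasttext/fasttext.vec",
                               vecs_bin_fname="data/word-embeddings/fasttext/fasttext.bin",
                               method="fasttext", dim=100, tokenizer_name="mecab")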
model = load_model(model_path, custom_objects={'InteractingLayer': interaction})
# model.summary()

EMBEDDING_DIM = 200
MAX_SEQUENCE_LENGTH = 200
MAX_JACCARD_LENGTH = 30
INC_BATCH_SIZE = 80000
BASE_DIR = ''
# W2V_MODEL_DIR = '/Users/knight/Desktop/GodClassDetection/embedding_model/new_model6.bin'
W2V_MODEL_DIR = '/Users/knight/Desktop/GodClassDetection/embedding_model/new_model6_nltk.bin'
TRAIN_SET_DIR = '/Users/knight/Desktop/GodClassDetection/trainset'  # change this to your own path
FULL_MN_DIR = TRAIN_SET_DIR

tokenizer = preprocess.get_tokenizer(FULL_MN_DIR)
all_word_index = tokenizer.word_index
embedding_matrix = preprocess.get_embedding_matrix(all_word_index, W2V_MODEL_DIR, dim=EMBEDDING_DIM)

acc_list = []
loss_list = []
print("loading fine-tuning data")
x_train, y_train = preprocess.get_xy_train(TRAIN_SET_DIR + '/finetune', tokenizer=tokenizer,
                                           mn_maxlen=MAX_SEQUENCE_LENGTH,
                                           embedding_matrix=embedding_matrix)
print('Fine tune model.')
def __init__(self, fname, tokenizer_name="mecab"):
    self.fname = fname
    self.tokenizer = get_tokenizer(tokenizer_name)
   insert a Masking layer with mask_value=0. before the LSTM layer.
3. Merge layer: keras.layers.Concatenate(axis=-1) takes a list of tensors of the same shape
   and returns their concatenation along the given axis (axis=-1 concatenates column-wise,
   axis=0 row-wise).
'''

# parameter settings
EMBEDDING_DIM = 200
MAX_SEQUENCE_LENGTH = 50
MAX_JACCARD_LENGTH = 30
INC_BATCH_SIZE = 80000
BASE_DIR = ''
# W2V_MODEL_DIR = '/Users/knight/Desktop/GodClassDetection-master-mao-new/embedding_model/new_model6.bin'
W2V_MODEL_DIR = '/Users/knight/Desktop/GodClassDetection/embedding_model/new_model6_nltk.bin'
TRAIN_SET_DIR = '/Users/knight/Desktop/GodClassDetection/trainset'  # change this to your own path

tokenizer = preprocess.get_tokenizer(TRAIN_SET_DIR)
all_word_index = tokenizer.word_index
embedding_matrix = preprocess.get_embedding_matrix(all_word_index, W2V_MODEL_DIR, dim=EMBEDDING_DIM)

epochs = 50
acc_list = []
loss_list = []
# x_train, y_train = preprocess.get_xy_train(TRAIN_SET_DIR + '/finetune', tokenizer=tokenizer, mn_maxlen=MAX_SEQUENCE_LENGTH, embedding_matrix=embedding_matrix)
x_train, y_train = preprocess.get_xy_train(TRAIN_SET_DIR + '/data', tokenizer=tokenizer,
                                           mn_maxlen=MAX_SEQUENCE_LENGTH,
                                           embedding_matrix=embedding_matrix)
def __init__(self, model_path="data/lda", tokenizer_name="mecab"):
    # model_path is the prefix used when the LDA model was saved;
    # ".results" and ".model" are appended below.
    self.tokenizer = get_tokenizer(tokenizer_name)
    self.all_topics = self.load_results(model_path + ".results")
    self.model = LdaModel.load(model_path + ".model")
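# A minimal sketch (assumed, not the original implementation) of what load_results might do:
# parse the ".results" file written by latent_dirichlet_allocation above, where each line is
# "document\u241Etokenized document\u241Etopic_id\u241Eprobability".
def load_results(self, results_fname):
    all_topics = []
    with open(results_fname, 'r', encoding='utf-8') as f:
        for line in f:
            document, tokenized_document, topic_id, prob = line.strip().split("\u241E")
            all_topics.append((document, tokenized_document.split(), int(topic_id), float(prob)))
    return all_topics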
import time
import numpy as np

# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

total_y_pre = []
total_y_test = []
EMBEDDING_DIM = 200
MAX_SEQUENCE_LENGTH = 50
MAX_JACCARD_LENGTH = 30
INC_BATCH_SIZE = 80000
W2V_MODEL_DIR = 'D:/TSE/python/largeclass/new_model.bin'
TRAIN_SET_DIR = 'D:/TSE/largeclass/data'
tokenizer = preprocess.get_tokenizer()
all_word_index = tokenizer.word_index
embedding_matrix = preprocess.get_embedding_matrix(all_word_index, W2V_MODEL_DIR, dim=EMBEDDING_DIM)
MODEL_NUMBER = 5
SUBSETSIZE = 0.8


def eval(y_pre, y_test):
    tp, tn, fp, fn = 0, 0, 0, 0
    for i in range(len(y_pre)):
        total_y_pre.append(y_pre[i])
        total_y_test.append(y_test[i])
        if y_pre[i] >= 0.5:
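# The eval function above is truncated here. A generic, self-contained sketch of the metrics it
# appears to build toward, using the standard confusion-matrix definitions with a 0.5 threshold
# on the predicted probability (function name and signature are assumptions):
def compute_metrics(y_pre, y_test, threshold=0.5):
    tp = sum(1 for p, t in zip(y_pre, y_test) if p >= threshold and t == 1)
    fp = sum(1 for p, t in zip(y_pre, y_test) if p >= threshold and t == 0)
    fn = sum(1 for p, t in zip(y_pre, y_test) if p < threshold and t == 1)
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return precision, recall, f1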