def extract_ngrams(corpus):
    """
    input: whole dataset
    output: two dictionaries, key: tweet_id, value: 1-dimensional binary
            feature vector (stored as a Python list)
    Done: Out of Vocabulary (OOV) words
    """
    if not os.path.exists('baseline/unigram_vocab.txt') or not os.path.exists(
            'baseline/bigram_vocab.txt'):
        construct_vocabulary(corpus)

    # key: word, value: index
    unigram_vocab = read_vocabulary('baseline/unigram_vocab.txt')
    bigram_vocab = read_vocabulary('baseline/bigram_vocab.txt')

    unigram_dict = {}
    bigram_dict = {}
    for data in corpus:
        tokens = data.tweet_words()
        lower_tokens = [t.lower() for t in tokens]
        _id = data.tweet_id
        # +1 for OOV
        unigram_dict[_id] = np.zeros(len(unigram_vocab) + 1).tolist()
        bigram_dict[_id] = np.zeros(len(bigram_vocab) + 1).tolist()
        for idx, ele in enumerate(lower_tokens):
            # unigram
            unigram_dict[_id][unigram_vocab.get(ele, len(unigram_vocab))] = 1.
            if idx == len(lower_tokens) - 1:
                continue
            # bigram
            bigram_dict[_id][bigram_vocab.get((ele, lower_tokens[idx + 1]),
                                              len(bigram_vocab))] = 1.
    return unigram_dict, bigram_dict
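# Illustrative sketch (not part of the original pipeline): the
# vocab.get(token, len(vocab)) pattern used above routes any out-of-vocabulary
# token into the extra feature slot reserved at index len(vocab).
# The toy vocabulary below is purely hypothetical.
def _demo_oov_bucketing():
    vocab = {"good": 0, "morning": 1}      # toy unigram vocabulary
    features = np.zeros(len(vocab) + 1)    # +1 slot reserved for OOV tokens
    for token in ["good", "night"]:        # "night" is out of vocabulary
        features[vocab.get(token, len(vocab))] = 1.
    return features                        # array([1., 0., 1.])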
def predict():
    sep_word = thulac.thulac(seg_only=True)
    model = Seq2Seq(batch_size=1, forward_only=True)
    model_path = './models/0612/'
    vocab_en, _ = utils.read_vocabulary(config.TRAIN_ENC_VOCABULARY)
    _, vocab_de = utils.read_vocabulary(config.TRAIN_DEC_VOCABULARY)
    with tf.Session() as sess:
        # Restore the previous training checkpoint
        ckpt = tf.train.get_checkpoint_state(model_path)
        if ckpt is not None:
            print('found model: ', ckpt.model_checkpoint_path)
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print("model not found")

        while True:
            input_string = raw_input('me > ')
            # Exit
            if input_string == 'quit':
                exit()
            personal_ans = utils.check_pre_ques(input_string.decode('utf-8'))
            if personal_ans is not None:
                print('AI > ' + personal_ans)
                continue

            input_string_vec = []
            aseq = sep_word.cut(input_string, text=True)
            for words in aseq.split(' '):
                input_string_vec.append(vocab_en.get(words, config.UNK_ID))

            bucket_id = min([
                b for b in range(len(config.BUCKETS))
                if config.BUCKETS[b][0] > len(input_string_vec)
            ])
            encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                {bucket_id: [(input_string_vec, [])]}, bucket_id)
            _, _, output_logits = model.step(sess, encoder_inputs,
                                             decoder_inputs, target_weights,
                                             bucket_id, True)
            outputs = [
                int(np.argmax(logit, axis=1)) for logit in output_logits
            ]
            if config.EOS_ID in outputs:
                outputs = outputs[:outputs.index(config.EOS_ID)]
            response = "".join(
                [tf.compat.as_str(vocab_de[output]) for output in outputs])
            print('AI > ' + response)
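# Small sketch (toy bucket sizes, not the real config.BUCKETS) of the bucket
# selection used in predict(): pick the smallest bucket whose encoder length
# is strictly larger than the encoded input length.
def _demo_bucket_selection():
    buckets = [(5, 10), (10, 15), (20, 25)]
    input_len = 7
    bucket_id = min(
        b for b in range(len(buckets)) if buckets[b][0] > input_len)
    return bucket_id   # -> 1, i.e. the (10, 15) bucket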
def train():
    # Get the vocabulary
    index_to_word, _ = utils.read_vocabulary()

    # Create the model
    if resume_training:
        model = ChatbotModel(weights_file=weights_file)
    else:
        model = ChatbotModel(
            embedding_matrix=utils.read_embedding_matrix(index_to_word))

    # Load the data
    q, a = utils.read_training_sequences()
    print("Total training sequences:", q.shape[0])
    print("Example context-answer pair")
    print(utils.seq_to_text(q[0], index_to_word))
    print(utils.seq_to_text(a[0], index_to_word))

    q_val = q[:N_VAL, :]
    a_val = a[:N_VAL, :]
    q = q[N_VAL:, :]
    a = a[N_VAL:, :]

    # q and a already exclude the validation rows at this point
    n_train = len(q)
    step = round(n_train / NUM_SUBSETS)

    # Prepare validation data
    Q_val, A_val, Y_val = prepare_fit_data(q_val, a_val)

    # Train
    for m in range(EPOCHS):
        print("\nStarting epoch", m + 1, "\n")
        # Loop over training subsets so it fits in RAM
        for n in range(0, n_train, step):
            print("Training epoch: %d. Data slice: %d - %d" %
                  (m + 1, n, n + step))
            Q, A, Y = prepare_fit_data(q[n:n + step], a[n:n + step])
            model.fit([Q, A], Y, batch_size=BATCH_SIZE, epochs=1)
            # Make sure memory is cleared
            del Q
            del A
            del Y
            gc.collect()

        print("Evaluating on validation set...")
        loss, acc = model.evaluate([Q_val, A_val], Y_val, verbose=0)
        print("Validation accuracy: %f, loss = %f" % (acc, loss))
        model.save_weights(weights_file, overwrite=True)
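# Illustrative sketch of the subset loop in train(): the training pairs are
# processed in NUM_SUBSETS slices so that only one slice has to be vectorized
# in RAM at a time. Toy numbers below; the real sizes come from the config.
def _demo_subset_slices(n_train=10, num_subsets=3):
    step = round(n_train / num_subsets)
    return [(n, min(n + step, n_train)) for n in range(0, n_train, step)]
    # n_train=10, num_subsets=3 -> [(0, 3), (3, 6), (6, 9), (9, 10)]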
def __init__(self):
    self.logger = logging.getLogger('trainlogger')
    self.logger.setLevel(logging.INFO)
    formatter = logging.Formatter(
        fmt='%(levelname)s\t%(asctime)s\t%(message)s',
        datefmt='%Y-%m-%dT%H:%M:%S')
    handler = logging.FileHandler('./logs/predict.log', 'a')
    handler.setFormatter(formatter)
    self.logger.addHandler(handler)

    self.model = Seq2Seq(batch_size=1, forward_only=True)
    model_path = './models/0612/'
    self.vocab_en, _ = utils.read_vocabulary(config.TRAIN_ENC_VOCABULARY)
    _, self.vocab_de = utils.read_vocabulary(config.TRAIN_DEC_VOCABULARY)

    self.sess = tf.Session()
    ckpt = tf.train.get_checkpoint_state(model_path)
    if ckpt is not None:
        self.logger.info('found model: ' + ckpt.model_checkpoint_path)
        self.model.saver.restore(self.sess, ckpt.model_checkpoint_path)
    else:
        self.logger.error("model not found")
        raise RuntimeError("no trained model found in " + model_path)

    self.sep_word = thulac.thulac(seg_only=True)
def testRNN(vocabulary_file, training_dir):
    print("Reading vocabulary " + vocabulary_file + "...")
    words, dictionary = read_vocabulary(vocabulary_file, MAX_VOCAB_SIZE)

    print("Reading sentences and training RNN...")
    start = timer()
    rnn = RNNExtended(len(words), HIDDEN_LAYER_SIZE)
    num_words = 0
    for i in range(NUM_ITER):
        sentences = tokenize_files(dictionary, training_dir)
        for sentence in itertools.islice(sentences, MAX_SENTENCES):
            # Todo, create context window for each sentence?
            rnn.train(sentence)
            num_words += len(sentence)

        print("Iteration " + str(i + 1) + "/" + str(NUM_ITER) +
              " finished (" + str(num_words) + " words)")
        num_words = 0

    print("- Took %.2f sec" % (timer() - start))
def testSkipGram(vocabulary_file, training_dir):
    last_sentence = None
    print("Reading vocabulary " + vocabulary_file + "...")
    words, dictionary = read_vocabulary(vocabulary_file, MAX_VOCAB_SIZE)

    print("Reading sentences and training SkipGram...")
    start = timer()
    skip_gram = SkipGram(len(words), WINDOW_SIZE, HIDDEN_LAYER_SIZE)
    num_words = 0
    for i in range(NUM_ITER):
        sentences = tokenize_files(dictionary, training_dir)
        for sentence in itertools.islice(sentences, MAX_SENTENCES):
            last_sentence = sentence
            skip_gram.train(sentence)
            num_words += len(sentence)

        ll = skip_gram.train(last_sentence, compute_ll=True)
        print("Iteration " + str(i + 1) + "/" + str(NUM_ITER) +
              " finished (" + str(num_words) + " words)")
        print("Log-likelihood: " + str(ll))
        num_words = 0

    print("- Took %.2f sec" % (timer() - start))
def predict_babelnet(input_path: str, output_path: str, resources_path: str) -> None:
    """
    DO NOT MODIFY THE SIGNATURE!
    This is the skeleton of the prediction function.
    The predict function will build your model, load the weights from the checkpoint and write a new file (output_path)
    with your predictions in the "<id> <BABELSynset>" format (e.g. "d000.s000.t000 bn:01234567n").

    The resources folder should contain everything you need to make the predictions. It is the "resources" folder in your submission.

    N.B. DO NOT HARD CODE PATHS IN HERE. Use resources_path instead, otherwise we will not be able to run the code.
    If you don't know what HARD CODING means see: https://en.wikipedia.org/wiki/Hard_coding

    :param input_path: the path of the input file to predict in the same format as Raganato's framework (XML files you downloaded).
    :param output_path: the path of the output file (where you save your predictions)
    :param resources_path: the path of the resources folder containing your model and stuff you might need.
    :return: None
    """
    print("Predicting Babelnet...")

    out_vocab = utils.read_vocabulary(
        os.path.join(resources_path, config.OUT_VOCAB_BN))
    bn2wn = utils.read_map(os.path.join(resources_path,
                                        config.BABELNET2WORDNET),
                           reverse=False)
    dense_layer = 0
    is_bn = True

    _prediction(
        input_path,
        output_path,
        resources_path,
        out_vocab,
        dense_layer,
        is_bn,
        bn2domain=bn2wn,
    )
def main(input_file, vocabulary_file):
    """Automatically check and correct the spelling of a file."""
    vocabulary = utils.read_vocabulary(vocabulary_file)
    logging.info("Read %i words.", len(vocabulary))
    text = utils.read_text(input_file)
    check(text, vocabulary)
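# Hypothetical sketch of what utils.read_vocabulary could look like for this
# spell checker, assuming the vocabulary file simply lists one word per line;
# the real helper in utils may differ.
def _read_vocabulary_sketch(vocabulary_file):
    with open(vocabulary_file, encoding="utf-8") as f:
        return {line.strip() for line in f if line.strip()}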
def dependency(corpus, num_cluster=NUM_CLUSTER, min_freq=MIN_FREQ):
    """
    input: whole dataset
    output: two dictionaries, key: tweet_id, value: 1-dimensional binary numpy array
    """
    sorted_corpus = sorted(corpus, key=lambda tweet: tweet.tweet_id)

    depend_fn = 'baseline/dependency_A.txt.predict'
    if not os.path.exists(depend_fn):
        print('dependency parser file does not exist, run the repo first')
        sys.exit(1)

    # reading vocab
    cluster_fn = 'baseline/brown_cluster_{}.txt'.format(num_cluster)
    if not os.path.exists(cluster_fn):
        print('brown cluster file does not exist, run the repo first')
        sys.exit(1)
    if not os.path.exists('baseline/unigram_vocab.txt'):
        construct_vocabulary(corpus)
    cluster_vocab = read_brown_cluster(cluster_fn, min_freq)
    unigram_vocab = read_vocabulary('baseline/unigram_vocab.txt')

    # dicts to store the features
    word_dict = {}
    cluster_dict = {}

    # NOTE: this requires the tweets to be sorted from 1 to n when they are
    # written to the txt file
    idx = 1
    with open(depend_fn, 'r') as inf:
        word_tmp = np.zeros((len(unigram_vocab) + 1, len(unigram_vocab) + 1))
        cluster_tmp = np.zeros(
            (len(cluster_vocab) + 1, len(cluster_vocab) + 1))
        valid_arc = {}
        tweet_word_dict = {}
        tweet_tokens = []
        for line in inf:
            if line.strip():
                word_idx, word, _, _, tag, _, arc_idx, _ = line.split('\t')
                tweet_word_dict[word_idx] = word
                tweet_tokens.append(word)
                try:
                    int_arc_idx = int(arc_idx)
                except ValueError:
                    int_arc_idx = -1
                if int_arc_idx > 0:
                    valid_arc[word_idx] = arc_idx
            else:
                # tweets are separated by an empty line
                # There might be some exceptions due to space
                # lower_tweet_words = [t.lower() for t in sorted_corpus[idx-1].tweet_words()]
                # assert tweet_tokens == lower_tweet_words, \
                #     ' '.join(tweet_tokens) + '\n' + ' '.join(lower_tweet_words)

                # when an empty line is encountered, summarize and store the
                # last chunk, then initialize for the next chunk

                # summary
                for k, v in valid_arc.items():
                    dim1_word, dim2_word = tweet_word_dict[k], tweet_word_dict[v]
                    word_tmp[
                        unigram_vocab.get(dim1_word, len(unigram_vocab)),
                        unigram_vocab.get(dim2_word, len(unigram_vocab))] = 1.
                    cluster_tmp[
                        cluster_vocab.get(dim1_word, len(cluster_vocab)),
                        cluster_vocab.get(dim2_word, len(cluster_vocab))] = 1.

                # flatten
                # TODO: sparse representation needed
                word_dict[idx] = word_tmp.flatten()
                cluster_dict[idx] = cluster_tmp.flatten()

                # init for next chunk
                # plus 1 for OOV
                valid_arc = {}
                tweet_word_dict = {}
                tweet_tokens = []
                word_tmp = np.zeros(
                    (len(unigram_vocab) + 1, len(unigram_vocab) + 1))
                cluster_tmp = np.zeros(
                    (len(cluster_vocab) + 1, len(cluster_vocab) + 1))

                idx += 1
                if idx % int(len(corpus) * 0.1) == 0:
                    print(idx / int(len(corpus) * 0.1))

    return word_dict, cluster_dict
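# Small illustrative sketch (not part of the original code) of how a cell of
# the (V + 1) x (V + 1) dependency-arc matrix above ends up in the flattened
# feature vector: with row-major flattening, entry (i, j) lands at index
# i * (V + 1) + j.
def _demo_flatten_index(V=3, i=2, j=1):
    m = np.zeros((V + 1, V + 1))
    m[i, j] = 1.
    flat = m.flatten()
    assert flat[i * (V + 1) + j] == 1.
    return flat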
def _prediction(
    input_path: str,
    output_path: str,
    resources_path: str,
    out_vocab: Dict,
    i_dense: int,
    is_bn: bool,
    bn2domain: Dict = None,
) -> None:
    """
    This method is used to handle the prediction of a task.

    :param input_path: the path of the input file to predict in the same format as Raganato's framework (XML files you downloaded).
    :param output_path: the path of the output file (where you save your predictions)
    :param resources_path: the path of the resources folder containing your model and stuff you might need.
    :param out_vocab: sense inventory
    :param i_dense: index of the fully connected layer to read predictions from:
                    0 -> Babelnet
                    1 -> Wordnet domains
                    2 -> Lexicographer
    :param is_bn: if True, predicts Babelnet
    :param bn2domain: a map from Babelnet to domain
    :return: None
    """
    config_tf = tf.ConfigProto()
    config_tf.gpu_options.allow_growth = True
    tf.keras.backend.set_session(tf.Session(config=config_tf))

    test_set = parser.parser_test_set(input_path)

    vocab = utils.read_vocabulary(os.path.join(resources_path, config.VOCAB))
    wn2bn = utils.read_map(os.path.join(resources_path,
                                        config.BABELNET2WORDNET),
                           reverse=True)
    out_vocab_bn = utils.read_vocabulary(
        os.path.join(resources_path, config.OUT_VOCAB_BN))
    out_vocab_wnd = utils.read_vocabulary(
        os.path.join(resources_path, config.OUT_VOCAB_WND))
    out_vocab_lex = utils.read_vocabulary(
        os.path.join(resources_path, config.OUT_VOCAB_LEX))
    out_vocab_pos = utils.read_vocabulary(
        os.path.join(resources_path, config.POS_VOCAB))

    pre_trained = gensim.models.KeyedVectors.load_word2vec_format(
        os.path.join(resources_path, config.SQUEEZED_EMB), binary=True)

    print("Downloading ELMo...")
    model = models.build_model(
        vocab_size=len(vocab),
        out_size_bn=len(out_vocab_bn),
        out_size_wnd=len(out_vocab_wnd),
        out_size_lex=len(out_vocab_lex),
        out_size_pos=len(out_vocab_pos),
        word2vec=pre_trained,
        is_elmo=config.IS_ELMO,
        attention=config.ATTENTION,
        is_sense_emb=config.SENSE_EMB,
    )

    reversed_vocab = utils.reverse_vocab(out_vocab_bn)
    model.load_weights(str(os.path.join(resources_path, config.MODEL_WEIGHTS)))

    with open(str(output_path), mode="w") as file:
        for row in tqdm(test_set):
            if not config.IS_ELMO:
                tmp = preprocesser.text2id([row[0]], vocab)
            else:
                tmp = np.array([row[0]])
            tmp_row = list(row[2])

            inp = tmp
            if config.SENSE_EMB:
                sens_emb = np.ones((1, len(inp[0].split())), dtype=int)
                inp_pos = np.array([row[1]])
                inp = [inp, sens_emb, inp_pos]

            predictions = model.predict(inp, verbose=0)[i_dense]

            for senses in tmp_row:
                sense_position = utils.senses_position_from_vocab(
                    senses["lemma"], out_vocab_bn, bn2domain)
                synsets = [reversed_vocab[x] for x in sense_position]

                if not is_bn:
                    synsets = [
                        bn2domain.get(syn.split("_")[-1]) for syn in synsets
                    ]
                    sense_position = [out_vocab[syn] for syn in synsets]

                to_compute = np.array([
                    predictions[0][senses["position"]][sen_pos]
                    for sen_pos in sense_position
                ])

                if len(to_compute) != 0:
                    file.write(senses["id"] + " " +
                               synsets[to_compute.argmax()].split("_")[-1] +
                               "\n")
                else:
                    file.write(senses["id"] + " " +
                               utils.most_frequent_sense(
                                   senses["lemma"],
                                   senses["pos"],
                                   wn2bn,
                                   bn2domain=bn2domain,
                                   is_bn=is_bn,
                               ) + "\n")
import numpy as np
import nltk

import config as cfg
import utils
import textprocessor
from model import ChatbotModel

# Set a random seed for reproducibility
np.random.seed(1337)

index_to_word, word_to_index = utils.read_vocabulary()

# Init our keras model and load the weights from file
#weights_file = "model_weights_low-training-acc33.h5"
#weights_file = "model_weights_halfway-training-acc62.h5"
weights_file = "model_weights_overfit-training-acc86.h5"
model = ChatbotModel(weights_file=weights_file)
print("Model loaded.")


def create_sequence(query):
    """
    Create a sequence from a given raw string.
    The returned sequence can be fed directly into the bot.
    """
    # Use NLTK to get word tokens
    tokenized = nltk.word_tokenize(query)

    # Replace out-of-vocabulary words with the UNKNOWN token
    tokenized = [
        w if w in word_to_index else cfg.TOKEN_UNKNOWN for w in tokenized
    ]

    # Map the words to their respective indices