# Module-level imports assumed by the functions below (inferred from usage);
# SEED and CHECK_STEP are module constants whose values are not shown here.
import os
import random
import sys

import pandas as pd
import tensorflow as tf
from sklearn.metrics import roc_auc_score
from tensorflow.python.platform import gfile

import dataset


def evaluate():
    """Score the train/validation splits of SAD.csv and yield the ROC-AUC of each."""
    vocab_map, _ = dataset.read_map('corpus/mapping')
    sess = tf.Session()
    Model = create_model(sess, 'test')
    Model.batch_size = 1

    df = pd.read_csv('corpus/SAD.csv', header=None)
    df = df.dropna()

    # Shuffle with a fixed seed, then split 90/10 into train/validation.
    idx = list(df.index)
    random.seed(SEED)
    random.shuffle(idx)
    df = df.loc[idx]  # .ix is deprecated; .loc keeps the label-based reindexing
    cut_by = int(0.9 * df.shape[0])
    train_df = df.iloc[:cut_by]
    val_df = df.iloc[cut_by:]

    for split_df in [train_df, val_df]:
        sentences = split_df[3]  # column 3 holds the sentence text
        answers = split_df[1]    # column 1 holds the binary label
        scores = []
        for i, sentence in enumerate(sentences):
            if i % 1000 == 0:
                print(i)
            token_ids = dataset.convert_to_token(sentence, vocab_map)
            encoder_input, encoder_length, _ = Model.get_batch(
                [(0, token_ids)], shuffle=False)
            score = Model.step(sess, encoder_input, encoder_length)
            scores.append(score)
        scores = [s[0][0] for s in scores]
        yield roc_auc_score(answers, scores)
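# Usage sketch (assumption: evaluate() is a generator that yields the
# train-split AUC first and the validation-split AUC second):
#
#   train_auc, val_auc = evaluate()
#   print('train AUC: %s, valid AUC: %s' % (train_auc, val_auc))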
def evaluate(cut_mode):
    """Interactive scorer: read sentences from stdin, segment them, print the model score."""
    if cut_mode == "word":
        import jieba_fast as jieba
        jieba.load_userdict("dict_fasttext.txt")
    vocab_map, _ = dataset.read_map('corpus/mapping')
    sess = tf.Session()
    Model = create_model(sess, 'test')
    Model.batch_size = 1

    while True:
        sys.stdout.write('>')
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        if not sentence:
            break  # EOF
        sentence = sentence_cutter(sentence, cut_mode)
        print('sentence: ', sentence)
        token_ids = dataset.convert_to_token(sentence, vocab_map)
        print('token_ids: ', token_ids)
        encoder_input, encoder_length, _ = Model.get_batch([(0, token_ids)])
        print('encoder_input: ', encoder_input, encoder_input.shape)
        print('encoder_length: ', encoder_length)
        score = Model.step(sess, encoder_input, encoder_length)
        print('Score: ', score[0][0])
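# sentence_cutter is defined elsewhere in the repo; a minimal sketch of what it
# might look like, assuming 'word' mode uses jieba segmentation and any other
# mode splits into individual characters (both assumptions, hence the _sketch
# suffix rather than the real helper's name):
def sentence_cutter_sketch(sentence, cut_mode):
    sentence = sentence.strip()
    if cut_mode == 'word':
        import jieba_fast as jieba
        return ' '.join(jieba.cut(sentence))
    return ' '.join(list(sentence))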
def build_model(self):
    """Single-layer GRU encoder with a sigmoid regression head (TF1-style graph)."""
    cell = tf.contrib.rnn.GRUCell(self.unit_size)

    # Token embedding lookup for the padded input batch.
    params = tf.get_variable('embedding', [self.vocab_size, self.unit_size])
    self.encoder_input = tf.placeholder(tf.int32, [None, self.max_length])
    embedding = tf.nn.embedding_lookup(params, self.encoder_input)

    # Run the GRU and keep only the final hidden state, shape [batch, unit_size].
    self.seq_length = tf.placeholder(tf.int32, [None])
    _, hidden_state = tf.nn.dynamic_rnn(
        cell, embedding, sequence_length=self.seq_length, dtype=tf.float32)

    # Linear head squashed to (0, 1): the sentiment score.
    w = tf.get_variable('w', [self.unit_size, 1])
    b = tf.get_variable('b', [1])
    output = tf.matmul(hidden_state, w) + b
    self.logit = tf.nn.sigmoid(output)

    if self.mode != 'test':
        # Train/valid mode: mean-squared error against the 0/1 target.
        self.target = tf.placeholder(tf.float32, [None, 1])
        self.loss = tf.reduce_mean(tf.square(self.target - self.logit))
        self.opt = tf.train.AdamOptimizer().minimize(self.loss)
    else:
        self.vocab_map, _ = dataset.read_map(
            'sentiment_analysis/corpus/mapping')
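# create_model is called throughout but not shown; a minimal sketch, assuming a
# Model class wrapping build_model plus the usual TF1 restore-or-initialize
# checkpoint logic (the class name, constructor signature, and checkpoint
# directory are all assumptions):
def create_model_sketch(sess, mode):
    model = Model(mode=mode)  # hypothetical constructor; builds the graph
    ckpt = tf.train.get_checkpoint_state('saved_model/')
    if ckpt and ckpt.model_checkpoint_path:
        model.saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        sess.run(tf.global_variables_initializer())
    return model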
def train():
    # Build the vocabulary mapping and the tokenized corpus once.
    if gfile.Exists('corpus/mapping') and gfile.Exists('corpus/SAD.csv.token'):
        print('Files have already been formed!')
    else:
        dataset.form_vocab_mapping(50000)
        vocab_map, _ = dataset.read_map('corpus/mapping')
        dataset.file_to_token('corpus/SAD.csv', vocab_map)

    # Shuffle with a fixed seed and split 90/10 into train/validation.
    d = dataset.read_data('corpus/SAD.csv.token')
    random.seed(SEED)
    random.shuffle(d)
    cut = int(0.9 * len(d))
    train_set = d[:cut]
    valid_set = d[cut:]  # was d[int(-0.1 * len(d)):], which could overlap the train split

    sess = tf.Session()
    Model = create_model(sess, 'train')

    step = 0
    loss = 0
    while True:
        step += 1
        encoder_input, encoder_length, target = Model.get_batch(train_set)
        loss_train = Model.step(sess, encoder_input, encoder_length, target)
        loss += loss_train / CHECK_STEP

        if step % CHECK_STEP == 0:
            # Average the validation loss over 100 random batches.
            Model.mode = 'valid'
            temp_loss = 0
            for _ in range(100):
                encoder_input, encoder_length, target = Model.get_batch(
                    valid_set)
                loss_valid = Model.step(sess, encoder_input, encoder_length,
                                        target)
                temp_loss += loss_valid / 100.
            Model.mode = 'train'

            print("Train Loss: %s" % loss)
            print("Valid Loss: %s" % temp_loss)

            checkpoint_path = os.path.join('saved_model/', 'dis.ckpt')
            Model.saver.save(sess, checkpoint_path, global_step=step)
            print("Model Saved!")
            loss = 0
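# A hypothetical entry point tying the pieces together (the original's CLI, if
# it has one, is not shown; the argv convention here is an assumption):
#
#   if __name__ == '__main__':
#       if sys.argv[1] == 'train':
#           train()
#       else:
#           evaluate('word')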
def top_layer(self, outputs):
    """Sigmoid regression head, factored out of build_model."""
    w = tf.get_variable('w', [self.unit_size, 1])
    b = tf.get_variable('b', [1])
    output = tf.matmul(outputs, w) + b
    self.logit = tf.nn.sigmoid(output)

    if self.mode != 'test':
        self.target = tf.placeholder(tf.float32, [None, 1])
        self.loss = tf.reduce_mean(tf.square(self.target - self.logit))
        self.opt = tf.train.AdamOptimizer().minimize(self.loss)
    else:
        self.vocab_map, _ = dataset.read_map('./corpus/mapping')
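# Note: tf.get_variable raises if 'w' or 'b' is created twice in the same
# scope, so if top_layer can run more than once in one graph, wrap the call in
# a reusing variable scope (a sketch; the scope name is an assumption):
#
#   with tf.variable_scope('top', reuse=tf.AUTO_REUSE):
#       self.top_layer(hidden_state)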
def evaluate():
    """Interactive scorer: read raw sentences from stdin and print the model score."""
    vocab_map, _ = dataset.read_map('corpus/mapping')
    sess = tf.Session()
    Model = create_model(sess, 'test')
    Model.batch_size = 1

    while True:
        sys.stdout.write('>')
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        if not sentence:
            break  # EOF
        token_ids = dataset.convert_to_token(sentence, vocab_map)
        encoder_input, encoder_length, _ = Model.get_batch([(0, token_ids)])
        score = Model.step(sess, encoder_input, encoder_length)
        print('Score: ' + str(score[0][0]))
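# Model.get_batch is not shown; a minimal sketch of the padding it appears to
# do, assuming PAD id 0, (label, token_ids) pairs, and that shuffle=True
# samples batch_size examples while shuffle=False keeps the input as-is (all
# assumptions inferred from the call sites above):
import numpy as np

def get_batch_sketch(data, batch_size, max_length, shuffle=True):
    batch = random.sample(data, batch_size) if shuffle else list(data)
    encoder_input = np.zeros((len(batch), max_length), dtype=np.int32)
    encoder_length = np.array(
        [min(len(ids), max_length) for _, ids in batch], dtype=np.int32)
    target = np.array([[label] for label, _ in batch], dtype=np.float32)
    for row, (_, ids) in enumerate(batch):
        # Truncate to max_length and leave the tail zero-padded.
        encoder_input[row, :encoder_length[row]] = ids[:max_length]
    return encoder_input, encoder_length, target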