Example No. 1
    def predict_words(self, word1, word2):
        inpH = InputHelper()

        w1, w2, dim = inpH.getWords(word1, word2, vocab_filepath, 30)
        graph = tf.get_default_graph()

        with graph.as_default():
            input_x1 = graph.get_operation_by_name("input_x1").outputs[0]
            input_x2 = graph.get_operation_by_name("input_x2").outputs[0]

            dropout_keep_prob = graph.get_operation_by_name(
                "dropout_keep_prob").outputs[0]
            sim = graph.get_operation_by_name("accuracy/temp_sim").outputs[0]

            r1 = self.session.run([sim], {
                input_x1: w1,
                input_x2: w2,
                dropout_keep_prob: 1.0
            })
            r2 = self.session.run([sim], {
                input_x1: w2,
                input_x2: w1,
                dropout_keep_prob: 1.0
            })

            r = max(r1, r2)

        return r[0][0]
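
A minimal usage sketch of the method above (hedged: the enclosing class, its constructor, and the checkpoint layout are not shown in this snippet and are assumed here). The similarity op is evaluated in both input orders and the larger score is kept, so the prediction is symmetric in the word pair.

# Hypothetical wrapper instance; `self.session` is assumed to already hold a
# restored checkpoint and `vocab_filepath` to point at the training-time vocabulary.
predictor = WordSimilarityPredictor()  # hypothetical class name
score = predictor.predict_words('apple', 'orange')
print('similarity:', score)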
Example No. 2
# Maximum sentence length (how many words it contains)
# MAX_DOCUMENT_LENGTH = 12
# MAX_DOCUMENT_LENGTH = 8
# MAX_DOCUMENT_LENGTH = 20(7th-June)
MAX_DOCUMENT_LENGTH = 40
# Dev set percentage
DEV_PERCENT = 10

# Misc Parameters
ALLOW_SOFT_PLACEMENT = True
LOG_DEVICE_PLACEMENT = False

print('Training started......................')
start_time = datetime.datetime.now()

inpH = InputHelper()
# Convert the raw training files into tokenized training files
# inpH.train_file_preprocess(TRAINING_FILES_RAW, TRAINING_FILES_FORMAT)
# sys.exit(0)

train_set, dev_set, vocab_processor, sum_no_of_batches = inpH.getDataSets(
    TRAINING_FILES_RAW, MAX_DOCUMENT_LENGTH, DEV_PERCENT, BATCH_SIZE)

# dev_batches = inpH.batch_iter(list(zip(dev_set[0], dev_set[1], dev_set[2])), BATCH_SIZE, 1)
# for index,dev_batch in enumerate(dev_batches):
#     print(index, dev_batch)
# sys.exit(0)

# for index, value in enumerate(dev_set[2]):
#     print(index, dev_set[0][index], dev_set[1][index], dev_set[2][index])
# sys.exit(0)
Example No. 3
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

if FLAGS.training_files==None:
    print "Input Files List is empty. use --training_files argument."
    exit()
 


inpH = InputHelper()
train_set, dev_set, vocab_processor,sum_no_of_batches = inpH.getDataSets(FLAGS.training_files, FLAGS.training_labeled_files, 
                                                        FLAGS.dev_files, FLAGS.dev_labeled_files, max_document_length, 10, FLAGS.batch_size)
embedding_matrix = inpH.getEmbeddings(FLAGS.embedding_file,FLAGS.embedding_dim)
entity_embedding_matrix = inpH.getEntityEmbeddings(FLAGS.entity_embedding_file,FLAGS.entity_embedding_dim)
entity_vocab_size = len(entity_embedding_matrix)
if mode == 'random':
    entity_embedding_matrix = np.asarray(None)

# Training
# ==================================================
print("starting graph def")
with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
      allow_soft_placement=FLAGS.allow_soft_placement,
      log_device_placement=FLAGS.log_device_placement)
Example No. 4
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

if FLAGS.training_files == None:
    print("Input Files List is empty. use --training_files argument.")
    exit()

training_files = './data/questions' # Your train data and label
label_path = './data/labels'
inpH = InputHelper()
train_set, dev_set, sum_no_of_batches = inpH.getDataSets(training_files, label_path, 10, FLAGS.batch_size)

# Take the top five labels from scores (get label using probs)
def get_label_using_probs(scores, top_number=5):
    index_list = np.argsort(scores)[-top_number:]
    index_list = index_list[::-1]
    return index_list
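
# A quick sanity check of the helper above (a sketch; assumes `scores` is a 1-D
# NumPy array of per-label probabilities). np.argsort sorts ascending, so the last
# `top_number` indices are the largest scores, reversed to give a descending top-k list.
# Example: get_label_using_probs(np.array([0.1, 0.7, 0.05, 0.9, 0.3, 0.6]), top_number=3)
# returns array([3, 1, 5]).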

# Compute the F1 score
def f1_eval(predict_label_and_marked_label_list):
    """
    :param predict_label_and_marked_label_list: a list of (predicted_labels, marked_labels) tuples, e.g.
    [ ([1, 2, 3, 4, 5], [4, 5, 6, 7]),
      ([3, 2, 1, 4, 7], [5, 7, 3])
     ]
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

if FLAGS.training_files == None:
    print("Input Files List is empty. use --training_files argument.")
    exit()

training_paths = FLAGS.training_files.split(",")

multi_train_size = len(training_paths)
max_document_length = FLAGS.max_document_words

inpH = InputHelper()
train_set, dev_set, vocab_processor, sum_no_of_batches = inpH.getDataSets(
    training_paths, max_document_length, FLAGS.filter_h_pad, 10,
    FLAGS.batch_size)
inpH.loadW2V(FLAGS.word2vec, FLAGS.word2vec_format)
# Training
# ==================================================
print("starting graph def")
with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    print("started session")
    with sess.as_default():
        cnn = TextCNN(sequence_length=max_document_length,
Example No. 6
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS.flag_values_dict()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

if FLAGS.training_files==None:
    print("Input Files List is empty. use --training_files argument.")
    exit()


max_document_length=400
inpH = InputHelper()
train_set, dev_set, vocab_processor,sum_no_of_batches = inpH.getDataSets(FLAGS.training_files,max_document_length, 10,
                                                                         FLAGS.batch_size, FLAGS.is_char_based)
trainableEmbeddings=False
if FLAGS.is_char_based==True:
    FLAGS.word2vec_model = False
else:
    if FLAGS.word2vec_model==None:
        trainableEmbeddings=True
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
          "You are using word embedding based semantic similarity but "
          "word2vec model path is empty. It is Recommended to use  --word2vec_model  argument. "
          "Otherwise now the code is automatically trying to learn embedding values (may not help in accuracy)"
          "\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n")
    else:
        inpH.loadW2V(FLAGS.word2vec_model, FLAGS.word2vec_format)
# Training parameters
batch_size = 64
num_epochs = 300
evaluate_every = 1000
checkpoint_every = 1000

# Misc Parameters
allow_soft_placement = True
log_device_placement = False
trainableEmbeddings = False

training_files = "train_snli.txt"

max_document_length = 15
inpH = InputHelper()
train_set, dev_set, vocab_processor, sum_no_of_batches = inpH.getDataSets(
    training_files, 10, max_document_length, batch_size, is_char_based)
trainableEmbeddings = False
if is_char_based == True:
    word2vec_model = False

inpH.loadW2V(word2vec_model, word2vec_format)
vocab_size = len(vocab_processor.vocabulary_)
vocab_size


def _calculate_fan_in_and_fan_out(tensor):
    if tensor.ndimension() < 2:
        raise ValueError(
            "fan in and fan out can not be computed for tensor of size ",
Example No. 8
tf.flags.DEFINE_boolean("log_device_placement", False,
                        "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

if FLAGS.eval_filepath == None or FLAGS.vocab_filepath == None or FLAGS.model == None:
    print("Eval or Vocab filepaths are empty.")
    exit()

# load data and map id-transform based on training time vocabulary
inpH = InputHelper()
x1_test, x2_test, ent_x1_test, ent_x2_test, y_test, x1_temp, x2_temp, add_fea_test = inpH.getTestDataSet(
    FLAGS.eval_filepath, FLAGS.eval_labeled_filepath, FLAGS.vocab_filepath,
    max_document_length)
#embedding_matrix = inpH.getEmbeddings(FLAGS.embedding_file,FLAGS.embedding_dim)
#entity_embedding_matrix = inpH.getEntityEmbeddings(FLAGS.entity_embedding_file,FLAGS.hidden_units)

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = FLAGS.model
print(checkpoint_file)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")


FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

if FLAGS.eval_filepath==None or FLAGS.vocab_filepath==None or FLAGS.model==None :
    print("Eval or Vocab filepaths are empty.")
    exit()

# load data and map id-transform based on training time vocabulary
inpH = InputHelper()
x1_test,x2_test,y_test = inpH.getTestDataSet(FLAGS.eval_filepath, FLAGS.vocab_filepath, 30)

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = FLAGS.model
print(checkpoint_file)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
      allow_soft_placement=FLAGS.allow_soft_placement,
      log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
print (EVAL_FILE)
print (OUTPUT_FILE)

# Eval Parameters
BATCH_SIZE = 64  # Batch size
VOCAB_FILE = './vocab/vocab'  # Vocabulary used during training
MODEL = './models/model-4000'  # Trained model checkpoint to load
ALLOW_SOFT_PLACEMENT = True
LOG_DEVICE_PLACEMENT = False

# Maximum sentence length (how many words it contains)
MAX_DOCUMENT_LENGTH = 40

# load data and map id-transform based on training time vocabulary
inpH = InputHelper()
x1_test, x2_test = inpH.getTestDataSet(EVAL_FILE, VOCAB_FILE, MAX_DOCUMENT_LENGTH)

# for index, _ in enumerate(x1_test):
#     print(index, x1_test[index], x2_test[index])

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = MODEL
print(checkpoint_file)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=ALLOW_SOFT_PLACEMENT,
Example No. 11
    def getSentence_Embedding(self, x1, x2, max_document_length):
        checkpoint_dir = os.path.abspath(
            os.path.join(self.bilstm_dir, "checkpoints"))
        print(checkpoint_dir)
        ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
        #print 'ckpt:',ckpt
        checkpoint_file = ckpt.model_checkpoint_path

        vocab_file = os.path.join(checkpoint_dir, "vocab")

        inpH = InputHelper()
        vocab_processor = MyVocabularyProcessor(max_document_length,
                                                min_frequency=0)
        vocab_processor = vocab_processor.restore(vocab_file)

        tmp = []
        (x1_index, x2_index, mask_x1, mask_x2,
         tmp) = inpH.get_data(vocab_processor, x1, x2, tmp,
                              max_document_length)

        idfModel = loadIDFModel(self.idfModel_file)

        # Extract word:id mapping from the object.
        vocab_dict = vocab_processor.vocabulary_._mapping
        vocab_id_w = dict((y, x) for x, y in vocab_dict.items())

        print("\nGenerating Sentence Embedding Result...\n")
        graph = tf.Graph()

        with graph.as_default():
            sess = tf.Session()
            with sess.as_default():
                # Load the saved meta graph and restore variables
                saver = tf.train.import_meta_graph(
                    "{}.meta".format(checkpoint_file))
                sess.run(tf.initialize_all_variables())
                saver.restore(sess, checkpoint_file)

                # Get the placeholders from the graph by name
                # the output is a list with only one element
                input_x1 = graph.get_operation_by_name("input_x1").outputs[0]
                input_x2 = graph.get_operation_by_name("input_x2").outputs[0]

                sentence_representation1 = graph.get_operation_by_name(
                    "sentence_embedding/Representation1").outputs[0]
                sentence_representation2 = graph.get_operation_by_name(
                    "sentence_embedding/Representation2").outputs[0]
                print "Sentence vector shape after sentence modeling"

                r1, r2 = sess.run(
                    [sentence_representation1, sentence_representation2], {
                        input_x1: x1_index,
                        input_x2: x2_index
                    })

                # Applied Attention_mechanism
                representation1 = self.getAttention_M(r1, mask_x1, x1,
                                                      x1_index, vocab_id_w,
                                                      idfModel)
                representation2 = self.getAttention_M(r2, mask_x2, x2,
                                                      x2_index, vocab_id_w,
                                                      idfModel)

        return representation1, representation2
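
A hedged usage sketch for getSentence_Embedding (the enclosing class name, its constructor arguments, and the paths below are assumptions; the snippet does not show them):

# Hypothetical wrapper; bilstm_dir is expected to contain a checkpoints/ folder with a
# saved model plus its "vocab" file, and idfModel_file a saved IDF model, matching how
# they are used above.
encoder = SentenceEmbedder(bilstm_dir='runs/1548973755',
                           idfModel_file='idf.model')  # hypothetical names and paths
rep1, rep2 = encoder.getSentence_Embedding(['how do i reset my password'],
                                           ['how can i change my password'],
                                           max_document_length=30)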
Example No. 12
batch_size = FLAGS.batch_size
num_epochs = FLAGS.num_epochs

print("\nParameters:")
for attr, value in sorted(FLAGS.flag_values_dict().items()):
    print("{}={}".format(attr.upper(), value))
print("")

if FLAGS.database==None:
    print("Input Files List is empty. use -database argument.")
    exit()

max_document_length=15
#max_document_length=sys.maxint # attempt to read all words in a document
inpH = InputHelper()
#train_set, dev_set, vocab_processor,sum_no_of_batches = inpH.getDataSets(FLAGS.database,max_document_length, 10,
#                                                                         FLAGS.batch_size, FLAGS.is_char_based)

num_docs = inpH.get_num_docs(FLAGS.training_folder)

db = lite.connect(FLAGS.database)
cursor = db.cursor()
emb_map, vocab_processor = inpH.getEmbeddingsMap(cursor, max_document_length, num_docs)
train_count, dev_count = inpH.get_counts(FLAGS.training_folder)[0:2]
total_count = train_count + dev_count

sum_no_of_batches = int(math.ceil(float(train_count) / batch_size))
dev_no_of_batches = int(math.ceil(float(dev_count) / batch_size))
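# Worked example of the batch arithmetic above (illustrative counts, not taken from the
# data): with train_count = 1000 and batch_size = 64, int(math.ceil(1000 / 64.0)) == 16,
# so the final, smaller batch is kept rather than dropped.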

train_set = inpH.my_train_batch(emb_map, train_count, FLAGS.batch_size, num_epochs)
# Maximum sentence length (how many words it contains)
# MAX_DOCUMENT_LENGTH = 12
# MAX_DOCUMENT_LENGTH = 8
# MAX_DOCUMENT_LENGTH = 20(7th-June)
MAX_DOCUMENT_LENGTH = 40
# Dev set percentage
DEV_PERCENT = 10

# Misc Parameters
ALLOW_SOFT_PLACEMENT = True
LOG_DEVICE_PLACEMENT = False

print('Training started......................')
start_time = datetime.datetime.now()

inpH = InputHelper()
# Convert the raw training files into tokenized training files
# inpH.train_file_preprocess(TRAINING_FILES_RAW, TRAINING_FILES_FORMAT)
# sys.exit(0)


train_set, dev_set, vocab_processor, sum_no_of_batches = inpH.getDataSets(TRAINING_FILES_RAW, MAX_DOCUMENT_LENGTH,
                                                                          DEV_PERCENT,
                                                                          BATCH_SIZE)

# dev_batches = inpH.batch_iter(list(zip(dev_set[0], dev_set[1], dev_set[2])), BATCH_SIZE, 1)
# for index,dev_batch in enumerate(dev_batches):
#     print(index, dev_batch)
# sys.exit(0)

# for index, value in enumerate(dev_set[2]):
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

if FLAGS.training_files==None:
    print("Input Files List is empty. use --training_files argument.")
    exit()


max_document_length=15
inpH = InputHelper()
train_set, dev_set, vocab_processor,sum_no_of_batches = inpH.getDataSets(FLAGS.training_files,max_document_length, 10,
                                                                         FLAGS.batch_size, FLAGS.is_char_based)
trainableEmbeddings=False
if FLAGS.is_char_based==True:
    FLAGS.word2vec_model = False
else:
    if FLAGS.word2vec_model==None:
        trainableEmbeddings=True
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
          "You are using word embedding based semantic similarity but "
          "word2vec model path is empty. It is Recommended to use  --word2vec_model  argument. "
          "Otherwise now the code is automatically trying to learn embedding values (may not help in accuracy)"
          "\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n")
    else:
        inpH.loadW2V(FLAGS.word2vec_model, FLAGS.word2vec_format)
Example No. 15
tf.flags.DEFINE_string("database", '../plag.db', "Database path (default: ../plag.db)")
tf.flags.DEFINE_boolean("auto_chunk", True, "Automatically set chunk_size (default: True")
tf.flags.DEFINE_string("folder", "ds", "Folder in which datasets will be created. (default: ds")
tf.flags.DEFINE_boolean("intra_only", True, "If true, combine sentences of same document only. If false, combines sentences between all documents of a same author. (default: True")

FLAGS = tf.flags.FLAGS

batch_size = FLAGS.batch_size
percent_dev = FLAGS.percent_dev
percent_test = FLAGS.percent_test
database = FLAGS.database
num_docs = FLAGS.num_docs

print("\nParameters:")
for attr, value in sorted(FLAGS.flag_values_dict().items()):
    print("{}={}".format(attr.upper(), value))
print("")

inpH = InputHelper()

start_time = time.time()

db = lite.connect(database)
cursor = db.cursor()
total_count = inpH.my_get_counts(cursor, FLAGS.intra_only, num_docs)

train_count, dev_count, test_count = inpH.build_datasets(cursor, total_count, batch_size, percent_dev, percent_test, FLAGS.auto_chunk, FLAGS.folder, FLAGS.intra_only, num_docs)

end_time = time.time()
print('Time elapsed on dataset creation for {} documents: {} seconds.'.format('all' if num_docs < 0 else num_docs, round(end_time - start_time, 2)))
Example No. 16
# word2vec model trained by ourselves
WORD2VEC_MODEL_SELF = './word2vec_model.bin'

# word2vec model (use an already-trained Chinese model)
# WORD2VEC_MODEL = '../word2vecmodel/news_12g_baidubaike_20g_novel_90g_embedding_64.bin'
WORD2VEC_MODEL = WORD2VEC_MODEL_SELF
# The model format is bin
WORD2VEC_FORMAT = 'bin'

# Convolution filter sizes
filter_size = [1, 2, SENTENCE_LENGTH]

# Fully-connected layer dropout
FULL_CONNECT_LAYER_DROPOUT = 0.8

inpH = InputHelper()
# Train our own word2vec model
# inpH.gen_word2vec(TRAINING_FILES_RAW, WORD2VEC_MODEL_SELF, EMBEDDING_DIM)
# exit(0)

train_set, dev_set, vocab_processor, sum_no_of_batches = inpH.getDataSets(
    TRAINING_FILES_RAW, SENTENCE_LENGTH, DEV_PERCENT, BATCH_SIZE)

# print(type(vocab_processor.vocabulary_._mapping))
# for index, k in enumerate(vocab_processor.vocabulary_._mapping):
#     print('vocab-{}, {}:{}'.format(index, k,vocab_processor.vocabulary_._mapping[k]))
#     print('======:{}'.format(vocab_processor.vocabulary_.reverse(vocab_processor.vocabulary_._mapping[k])))

# origin_sentence='为啥我花呗叫话费都交不了'
# print(origin_sentence)
# sentence_list=list(vocab_processor.transform(np.asarray([origin_sentence])))
Example No. 17

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

if FLAGS.eval_filepath==None or FLAGS.model==None :
    print("Eval or Vocab filepaths are empty.")
    exit()

w2v, model_dict, index_to_word = load_word_2vec_model.get_model_embeddings()
# load data and map id-transform based on training time vocabulary
inpH = InputHelper()
x1_test,x2_test,ids = inpH.getTestDataSet(FLAGS.eval_filepath, 30, model_dict)

wr = open('submissions_train.csv', 'w')
print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = FLAGS.model
print(checkpoint_file)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
      allow_soft_placement=FLAGS.allow_soft_placement,
      log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
Example No. 18
print("\nParameters:")
for attr, value in sorted(FLAGS.flag_values_dict().items()):
    print("{}={}".format(attr.upper(), value))
print("")


# returns a list of tuples, each containing the id for each document found in the database.
def get_document_ids(cursor):
    sql = 'select id from article'
    cursor.execute(sql)
    return cursor.fetchall()


########## main ##########
inpH = InputHelper()
db = lite.connect(FLAGS.database)
cursor = db.cursor()

doc_ids = get_document_ids(cursor)
doc_count = len(doc_ids)

if os.path.exists(FLAGS.output_dir):
    shutil.rmtree(FLAGS.output_dir, ignore_errors=True)
os.mkdir(FLAGS.output_dir)

i = 0
for doc_id in doc_ids:
    sql = 'select count(*) from sentence where fk_article_id = ?'
    cursor.execute(sql, doc_id)
    tuple_count = cursor.fetchall()[0][0]
Example No. 19
tf.flags.DEFINE_boolean("log_device_placement", False,
                        "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

if FLAGS.eval_filepath == None or FLAGS.vocab_filepath == None or FLAGS.model == None:
    print("Eval or Vocab filepaths are empty.")
    exit()

# load data and map id-transform based on training time vocabulary
inpH = InputHelper()
x_test, y_test = inpH.getTestDataSet(FLAGS.eval_filepath, FLAGS.vocab_filepath,
                                     600, 5)

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = FLAGS.model
print(checkpoint_file)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
Example No. 20
BATCH_SIZE = 64
# Validation set file
EVAL_FILEPATH = 'validation.txt0'
# Vocabulary (generated during training)
VOCAB_FILEPATH = 'runs/1528462228/checkpoints/vocab'
# Model file
MODEL = 'runs/1528462228/checkpoints/model-10000'

# Maximum sentence length (how many words it contains)
MAX_DOCUMENT_LENGTH = 30

# Misc Parameters
ALLOW_SOFT_PLACEMENT = True
LOG_DEVICE_PLACEMENT = False

inpH = InputHelper()

x1_test, x2_test, y_test = inpH.getTestDataSet(EVAL_FILEPATH, VOCAB_FILEPATH, MAX_DOCUMENT_LENGTH)

# for index ,value in enumerate(x1_test):
#     print (index, x1_test[index], x2_test[index], y_test[index])
# sys.exit(0)

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = MODEL
print(checkpoint_file)
graph = tf.Graph()
with graph.as_default():
Example No. 21
class Trainer(NLPApp):
    def __init__(self, FLAGs):
        self.FLAGS = FLAGs
        self.inpH = InputHelper()
        self.session_conf = tf.ConfigProto(
            allow_soft_placement=self.FLAGS.allow_soft_placement,
            log_device_placement=self.FLAGS.log_device_placement)

    def __load_word2vec(self):

        trainableEmbeddings = False
        if self.FLAGS.is_char_based == True:
            self.FLAGS.word2vec_model = False
        else:
            if self.FLAGS.word2vec_model == None:
                trainableEmbeddings = True
                print(
                    "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
                    "You are using word-embedding-based semantic similarity but "
                    "the word2vec model path is empty. It is recommended to use the --word2vec_model argument. "
                    "Otherwise the code will automatically try to learn embedding values itself (may not help accuracy)"
                    "\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n")
            else:
                self.inpH.loadW2V(self.FLAGS.word2vec_model,
                                  self.FLAGS.word2vec_format)
        return trainableEmbeddings

    def __build_storage_path(self):
        '''
		Ex.
		out_dir:                        runs/1548973755
		checkpoint_dir_abs:             runs/1548973755/checkpoints
		checkpoint_model_abs:           runs/1548973755/checkpoints/model
		checkpoint_saved_model_abs:     runs/1548973755/checkpoints/model-XXX
		vocab_path:                     runs/1548973755/checkpoints/vocab
		:return:
		'''
        checkpoint_dir_abs = os.path.abspath(self.FLAGS.checkpoint_dir)

        # run/1412312455/checkpoints
        if self.FLAGS.checkpoint_dir and os.path.exists(checkpoint_dir_abs):
            # run/1412312455/
            print(
                "Checkpoint dir:{} exists, loading vocab and weights from it".
                format(self.FLAGS.checkpoint_dir))
            out_dir = os.path.join(checkpoint_dir_abs, os.pardir)
        else:
            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs", timestamp))
            checkpoint_dir_abs = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            os.makedirs(checkpoint_dir_abs)

        checkpoint_model_abs = os.path.join(checkpoint_dir_abs, "model")

        print("Writing to {}\n".format(out_dir))
        checkpoint_saved_model_abs = os.path.join(checkpoint_dir_abs,
                                                  self.FLAGS.model)
        vocab_path = os.path.join(checkpoint_dir_abs, "vocab")
        return out_dir, checkpoint_dir_abs, checkpoint_model_abs, checkpoint_saved_model_abs, vocab_path

    def run(self):
        '''
		Main logic of the app
		:return:
		'''
        # define all the path
        out_dir, checkpoint_dir_abs, checkpoint_model_abs, checkpoint_saved_model_abs, vocab_path = self.__build_storage_path(
        )

        # splitting test and val data
        train_set, dev_set, vocab_processor, sum_no_of_batches = self.inpH.getDataSets(
            self.FLAGS.training_files, self.FLAGS.max_document_length, 10,
            self.FLAGS.batch_size, self.FLAGS.is_char_based, vocab_path)

        # structure the model either from build or reload
        if self.FLAGS.model and os.path.exists(
                "{}.meta".format(checkpoint_saved_model_abs)):
            print("loading trained model from check point:{}".format(
                checkpoint_saved_model_abs))
            saver, sess, input_tensors, result_tensors, metric_ops = self.__launch_from_load(
                checkpoint_saved_model_abs, out_dir)
        else:
            trainableEmbeddings = self.__load_word2vec()
            initW = self.__init_embedding_matrix(vocab_processor)
            saver, sess, input_tensors, result_tensors, metric_ops = self.__launch_from_build(
                vocab_processor, trainableEmbeddings, out_dir,
                checkpoint_dir_abs, initW)

        # train batches
        self.__run_batches(sess, sum_no_of_batches, train_set, dev_set, saver,
                           input_tensors, result_tensors, metric_ops,
                           checkpoint_model_abs)

        # don't forget to close the session
        sess.close()

    def __launch_from_load(self, model_path, out_dir):

        graph = tf.Graph()

        # this `with` block is necessary even if you pass the graph when constructing the session
        with graph.as_default():
            saver = tf.train.import_meta_graph("{}.meta".format(model_path))

        sess = tf.Session(graph=graph, config=self.session_conf)
        with sess.as_default():

            saver.restore(sess, model_path)

            # Get the placeholders from the graph by name
            input_x1 = graph.get_operation_by_name("input_x1").outputs[0]
            input_x2 = graph.get_operation_by_name("input_x2").outputs[0]
            input_y = graph.get_operation_by_name("input_y").outputs[0]

            dropout_keep_prob = graph.get_operation_by_name(
                "dropout_keep_prob").outputs[0]

            global_step = graph.get_operation_by_name("global_step").outputs[0]
            loss = graph.get_operation_by_name("loss/loss_fun").outputs[0]
            accuracy = graph.get_operation_by_name(
                "accuracy/accuracy").outputs[0]
            distance = graph.get_operation_by_name(
                "output/distance").outputs[0]
            temp_sim = graph.get_operation_by_name(
                "accuracy/temp_sim").outputs[0]

            # Tensors we want to evaluate
            tr_op_set = graph.get_operation_by_name("tr_op_set").outputs[0]
            train_summary_op = graph.get_operation_by_name(
                "train_summary_op").outputs[0]
            dev_summary_op = graph.get_operation_by_name(
                "dev_summary_op").outputs[0]

            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, graph)
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, graph)

            input_tensors = InputTensors(input_x1, input_x2, input_y,
                                         dropout_keep_prob)
            result_tensors = ResultTensors(global_step, loss, accuracy,
                                           distance, temp_sim)
            metric_ops = MetricOps(tr_op_set, train_summary_op, dev_summary_op,
                                   train_summary_writer, dev_summary_writer)

        return saver, sess, input_tensors, result_tensors, metric_ops

    def __launch_from_build(self, vocab_processor, trainableEmbeddings,
                            out_dir, checkpoint_dir_abs, initW):
        # ==================================================
        print("starting graph def")
        graph = tf.Graph()

        with graph.as_default():
            # the session uses the current default graph, which inside this block is `graph`
            sess = tf.Session(graph=graph, config=self.session_conf)
            print("started session")
            with sess.as_default():
                if self.FLAGS.is_char_based:
                    siameseModel = SiameseLSTM(
                        sequence_length=self.FLAGS.max_document_length,
                        vocab_size=len(vocab_processor.vocabulary_),
                        embedding_size=self.FLAGS.embedding_dim,
                        hidden_units=self.FLAGS.hidden_units,
                        l2_reg_lambda=self.FLAGS.l2_reg_lambda,
                        batch_size=self.FLAGS.batch_size)
                else:
                    siameseModel = SiameseLSTMw2v(
                        sequence_length=self.FLAGS.max_document_length,
                        vocab_size=len(vocab_processor.vocabulary_),
                        embedding_size=self.FLAGS.embedding_dim,
                        hidden_units=self.FLAGS.hidden_units,
                        l2_reg_lambda=self.FLAGS.l2_reg_lambda,
                        batch_size=self.FLAGS.batch_size,
                        trainableEmbeddings=trainableEmbeddings)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(1e-3)
            print("initialized siameseModel object")

            grads_and_vars = optimizer.compute_gradients(siameseModel.loss)
            tr_op_set = optimizer.apply_gradients(grads_and_vars,
                                                  global_step=global_step,
                                                  name='tr_op_set')
            print("defined training_ops")
            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram(
                        "{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar(
                        "{}/grad/sparsity".format(v.name),
                        tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)
            print("defined gradient summaries")

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", siameseModel.loss)
            acc_summary = tf.summary.scalar("accuracy", siameseModel.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge(
                [loss_summary, acc_summary, grad_summaries_merged])
            train_summary_op = tf.identity(train_summary_op,
                                           'train_summary_op')
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_op = tf.identity(dev_summary_op, 'dev_summary_op')
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                       sess.graph)

            saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)
            sess.run(tf.global_variables_initializer())
            if initW is not None:
                sess.run(siameseModel.W.assign(initW))

            graphpb_txt = str(graph.as_graph_def())
            with open(os.path.join(checkpoint_dir_abs, "graphpb.txt"),
                      'w') as f:
                f.write(graphpb_txt)

        input_tensors = InputTensors(siameseModel.input_x1,
                                     siameseModel.input_x2,
                                     siameseModel.input_y,
                                     siameseModel.dropout_keep_prob)
        result_tensors = ResultTensors(global_step, siameseModel.loss,
                                       siameseModel.accuracy,
                                       siameseModel.distance,
                                       siameseModel.temp_sim)
        metric_ops = MetricOps(tr_op_set, train_summary_op, dev_summary_op,
                               train_summary_writer, dev_summary_writer)
        return saver, sess, input_tensors, result_tensors, metric_ops

    def __run_batches(self, sess, sum_no_of_batches, train_set, dev_set, saver,
                      input_tensors, result_tensors, metric_ops,
                      checkpoint_prefix):

        # Generate batches: a sequence of [question1_tokenized, question2_tokenized, label]
        batches = self.inpH.batch_iter(
            list(zip(train_set[0], train_set[1], train_set[2])),
            self.FLAGS.batch_size, self.FLAGS.num_epochs)

        max_validation_acc = 0.0
        for nn in range(sum_no_of_batches * self.FLAGS.num_epochs):
            batch = next(batches)
            if len(batch) < 1:
                continue
            x1_batch, x2_batch, y_batch = zip(*batch)
            if len(y_batch) < 1:
                continue
            self.__train_step(sess, input_tensors, result_tensors, metric_ops,
                              x1_batch, x2_batch, y_batch)
            current_step = tf.train.global_step(sess,
                                                result_tensors.global_step)
            sum_acc = 0.0
            if current_step % self.FLAGS.evaluate_every == 0:
                print("\nEvaluation:")
                dev_batches = self.inpH.batch_iter(
                    list(zip(dev_set[0], dev_set[1], dev_set[2])),
                    self.FLAGS.batch_size, 1)
                for db in dev_batches:
                    if len(db) < 1:
                        continue
                    x1_dev_b, x2_dev_b, y_dev_b = zip(*db)
                    if len(y_dev_b) < 1:
                        continue

                    acc = self.__dev_step(sess, input_tensors, result_tensors,
                                          metric_ops, x1_dev_b, x2_dev_b,
                                          y_dev_b)
                    sum_acc = sum_acc + acc
                print("")

            # If the current model's accuracy on the validation data has improved, print the metrics and save the model
            if current_step % self.FLAGS.checkpoint_every == 0:
                if sum_acc >= max_validation_acc:
                    max_validation_acc = sum_acc
                    saver.save(sess,
                               checkpoint_prefix,
                               global_step=current_step)
                    tf.train.write_graph(sess.graph.as_graph_def(),
                                         checkpoint_prefix,
                                         "graph" + str(nn) + ".pb",
                                         as_text=True)
                    print(
                        "Saved model {} with sum_accuracy={} checkpoint to {}\n"
                        .format(nn, max_validation_acc, checkpoint_prefix))

    def __train_step(self, sess, input_tensors, result_tensors, metric_ops,
                     x1_batch, x2_batch, y_batch):
        """
		A single training step
		"""
        # Why swap the order of the input sentence pair from time to time? The loss function should be independent of the input order anyway.
        if random() > 0.5:
            x1_batch, x2_batch = x2_batch, x1_batch

        feed_dict = {
            input_tensors.input_x1: x1_batch,
            input_tensors.input_x2: x2_batch,
            input_tensors.input_y: y_batch,
            input_tensors.dropout_keep_prob: 1.0,
        }
        _, step, loss, accuracy, dist, sim, summaries = sess.run([
            metric_ops.tr_op_set, result_tensors.global_step,
            result_tensors.loss, result_tensors.accuracy,
            result_tensors.distance, result_tensors.temp_sim,
            metric_ops.train_summary_op
        ], feed_dict)

        time_str = datetime.datetime.now().isoformat()
        print("TRAIN {}: step {}, loss {:g}, acc {:g}".format(
            time_str, step, loss, accuracy))
        metric_ops.train_summary_writer.add_summary(summaries, step)

    def __dev_step(self, sess, input_tensors, result_tensors, metric_ops,
                   x1_batch, x2_batch, y_batch):
        """
		A single training step
		"""
        if random() > 0.5:
            x1_batch, x2_batch = x2_batch, x1_batch

        feed_dict = {
            input_tensors.input_x1: x1_batch,
            input_tensors.input_x2: x2_batch,
            input_tensors.input_y: y_batch,
            input_tensors.dropout_keep_prob: 1.0,
        }
        step, loss, accuracy, sim, summaries = sess.run([
            result_tensors.global_step, result_tensors.loss,
            result_tensors.accuracy, result_tensors.temp_sim,
            metric_ops.dev_summary_op
        ], feed_dict)

        time_str = datetime.datetime.now().isoformat()
        print("DEV {}: step {}, loss {:g}, acc {:g}".format(
            time_str, step, loss, accuracy))
        metric_ops.dev_summary_writer.add_summary(summaries, step)
        # print(y_batch, sim)
        return accuracy

    def __init_embedding_matrix(self, vocab_processor):

        if self.FLAGS.word2vec_model:
            # initial embedding matrix with random uniform
            initW = np.random.uniform(
                -0.25, 0.25,
                (len(vocab_processor.vocabulary_), self.FLAGS.embedding_dim))
            # initW = np.zeros(shape=(len(vocab_processor.vocabulary_), FLAGS.embedding_dim))
            # load any vectors from the word2vec
            print("initializing initW with pre-trained word2vec embeddings")
            for w in vocab_processor.vocabulary_._mapping:
                arr = []
                # Strip all non-alphanumeric characters from the word
                s = re.sub('[^0-9a-zA-Z]+', '', w)
                if w in self.inpH.pre_emb:
                    arr = self.inpH.pre_emb[w]
                elif w.lower() in self.inpH.pre_emb:
                    arr = self.inpH.pre_emb[w.lower()]
                elif s in self.inpH.pre_emb:
                    arr = self.inpH.pre_emb[s]
                elif s.isdigit():
                    arr = self.inpH.pre_emb["zero"]

                if len(arr) > 0:
                    # Sometimes the word's vector starts with an offset; taking the last embedding_dim numbers solves the problem.
                    if len(arr) > self.FLAGS.embedding_dim:
                        arr = arr[-self.FLAGS.embedding_dim:]
                    idx = vocab_processor.vocabulary_.get(w)
                    initW[idx] = np.asarray(arr).astype(np.float32)

                # If arr is [], the word from the data does not exist in the trained word2vec model, so the initial random weights are used for training

            print("Done assigning intiW. len=" + str(len(initW)))
            # initW 会作为新的embedding matrix在内存中运行, 把inpH中的PreEmb哈希表删除释放缓存!
            self.inpH.deletePreEmb()
            gc.collect()
            return initW
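
The lookup above falls back from the exact token to its lowercased form, then to its alphanumeric-only form, and finally maps pure digits to the vector for "zero"; anything still missing keeps its random initialization. A minimal standalone sketch of that fallback order (hypothetical helper name; pre_emb stands in for the loaded word2vec dictionary):

import re

def lookup_embedding(word, pre_emb):
    # Strip all non-alphanumeric characters, as in the loop above
    s = re.sub('[^0-9a-zA-Z]+', '', word)
    for key in (word, word.lower(), s):
        if key in pre_emb:
            return pre_emb[key]
    if s.isdigit():
        return pre_emb.get('zero')
    return None  # caller keeps the random initialization for this word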
Example No. 22
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")


FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

if FLAGS.eval_filepath==None or FLAGS.vocab_filepath==None or FLAGS.model==None :
    print("Eval or Vocab filepaths are empty.")
    exit()

# load data and map id-transform based on training time vocabulary
inpH = InputHelper()
x1_test,x2_test,y_test = inpH.getTestDataSet1(FLAGS.eval_filepath, FLAGS.vocab_filepath, 30)

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = FLAGS.model
print(checkpoint_file)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
      allow_soft_placement=FLAGS.allow_soft_placement,
      log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
Example No. 23
 def __init__(self, FLAGs):
     self.FLAGS = FLAGs
     self.inpH = InputHelper()
     self.session_conf = tf.ConfigProto(
         allow_soft_placement=self.FLAGS.allow_soft_placement,
         log_device_placement=self.FLAGS.log_device_placement)
Example No. 24
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

if FLAGS.training_files==None:
    print "Input Files List is empty. use --training_files argument."
    exit()
 
max_document_length=30
inpH = InputHelper()
train_set, dev_set, vocab_processor,sum_no_of_batches = inpH.getDataSets(FLAGS.training_files,max_document_length, 10, FLAGS.batch_size)

# Training
# ==================================================
print("starting graph def")
with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
      allow_soft_placement=FLAGS.allow_soft_placement,
      log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    print("started session")
    with sess.as_default():
        siameseModel = SiameseLSTM(
            sequence_length=max_document_length,
            vocab_size=len(vocab_processor.vocabulary_),
Example No. 25
def infer(batch_size_infer, x1_infer, x2_infer):
    # Eval Parameters
    tf.flags.DEFINE_integer("batch_size", batch_size_infer,
                            "Batch Size (default: 64)")
    tf.flags.DEFINE_string("checkpoint_dir", "",
                           "Checkpoint directory from training run")
    tf.flags.DEFINE_string("eval_filepath", "validation_short.txt0",
                           "Evaluate on this data (Default: None)")
    tf.flags.DEFINE_string("vocab_filepath",
                           "runs/1543141697/checkpoints/vocab",
                           "Load training time vocabulary (Default: None)"
                           )  # set the vocab filepath here
    tf.flags.DEFINE_string("model", "runs/1543141697/checkpoints/model-2000",
                           "Load trained model checkpoint (Default: None)"
                           )  # set the model filepath here

    # Misc Parameters
    tf.flags.DEFINE_boolean("allow_soft_placement", True,
                            "Allow device soft device placement")
    tf.flags.DEFINE_boolean("log_device_placement", False,
                            "Log placement of ops on devices")

    FLAGS = tf.flags.FLAGS
    FLAGS._parse_flags()
    print("\nParameters:")
    for attr, value in sorted(FLAGS.__flags.items()):
        print("{}={}".format(attr.upper(), value))
    print("")

    if FLAGS.eval_filepath == None or FLAGS.vocab_filepath == None or FLAGS.model == None:
        print("Eval or Vocab filepaths are empty.")
        exit()

    all_predictions = []
    for x1, x2 in zip(x1_infer, x2_infer):

        # load data and map id-transform based on training time vocabulary
        inpH = InputHelper()
        x1_test, x2_test = inpH.getTestDataSet_infer(x1, x2,
                                                     FLAGS.vocab_filepath, 30)

        print("\nEvaluating...\n")

        # Evaluation
        # ==================================================
        checkpoint_file = FLAGS.model
        print(checkpoint_file)
        graph = tf.Graph()
        with graph.as_default():
            session_conf = tf.ConfigProto(
                allow_soft_placement=FLAGS.allow_soft_placement,
                log_device_placement=FLAGS.log_device_placement)
            sess = tf.Session(config=session_conf)
            with sess.as_default():
                # Load the saved meta graph and restore variables
                saver = tf.train.import_meta_graph(
                    "{}.meta".format(checkpoint_file))
                sess.run(tf.initialize_all_variables())
                saver.restore(sess, checkpoint_file)

                # Get the placeholders from the graph by name
                input_x1 = graph.get_operation_by_name("input_x1").outputs[0]
                input_x2 = graph.get_operation_by_name("input_x2").outputs[0]
                input_y = graph.get_operation_by_name("input_y").outputs[0]

                dropout_keep_prob = graph.get_operation_by_name(
                    "dropout_keep_prob").outputs[0]
                # Tensors we want to evaluate
                predictions = graph.get_operation_by_name(
                    "output/distance").outputs[0]

                accuracy = graph.get_operation_by_name(
                    "accuracy/accuracy").outputs[0]

                sim = graph.get_operation_by_name(
                    "accuracy/temp_sim").outputs[0]

                #emb = graph.get_operation_by_name("embedding/W").outputs[0]
                #embedded_chars = tf.nn.embedding_lookup(emb,input_x)
                # Generate batches for one epoch
                batches = inpH.batch_iter(list(zip(x1_test, x2_test)),
                                          2 * FLAGS.batch_size,
                                          1,
                                          shuffle=False)
                # Collect the predictions here
                all_d = []
                for db in batches:
                    x1_dev_b, x2_dev_b = zip(*db)
                    #                 print('db ', db)
                    #                 print('********************')
                    #                 print('x1_dev_b')
                    #                 print(x1_dev_b)
                    #                 print('********************')
                    batch_predictions = sess.run(
                        [predictions], {
                            input_x1: x1_dev_b,
                            input_x2: x2_dev_b,
                            dropout_keep_prob: 1.0
                        })
        all_predictions.append(list(batch_predictions))
    return all_predictions
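
A hedged usage sketch for infer() (hypothetical sentence pairs; the vocabulary and model paths come from the flag defaults above). Because infer() registers its tf.flags on every call, a second call in the same process would raise a duplicate-flag error, so it is effectively one-shot per process:

predictions = infer(batch_size_infer=64,
                    x1_infer=['how do i reset my password'],
                    x2_infer=['how can i change my password'])
print(predictions)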