def config_model_prediction(self, model, feature_ph_dict, params=None):
  # we use the context a bit, so let's be brief
  c = model.context
  # the text will be encoded by a standard module from tf.hub
  model.text_encoder = hub.Module(FLAGS.module_handle)
  logging.info('encoding text with %s', FLAGS.module_handle)
  # one input is the question text, which is encoded by a specified
  # text_encoder module
  question_name = get_text_module_input_name()
  question_ph = feature_ph_dict[question_name]
  model.question_encoding = model.text_encoder({question_name: question_ph})

  # We will want to be able to map the encoded text to a set of entities in a
  # given type. This function returns an NQL expression, over the specified
  # type, which is formed by running the text encoding through a learned
  # linear map to get the number of dimensions right, and then applying a
  # softmax.
  def linear_text_remapper(type_name):
    num_input_dims = FLAGS.num_text_dims
    num_output_dims = c.get_max_id(type_name)
    initializer = tf.glorot_uniform_initializer()(
        [num_input_dims, num_output_dims])
    weight_matrix = tf.Variable(initializer)
    remapped_text = tf.matmul(model.question_encoding, weight_matrix)
    return c.as_nql(remapped_text, type_name)

  # The seeds, i.e. entities in the question, are the other input to the
  # model. By convention inputs are passed in in TensorFlow format, so we'll
  # wrap them as NQL.
  model.seeds = c.as_nql(feature_ph_dict['seeds'], 'entity_t')
  model.rels = [linear_text_remapper('rel_g') for h in range(FLAGS.num_hops)]
  model.dirs = [
      linear_text_remapper('direction_t') for h in range(FLAGS.num_hops)
  ]

  # Finally we define the NQL part of the model:
  # start with seeds and build a model that follows exactly num_hops hops.
  model.raw_y = [model.seeds]
  for h in range(FLAGS.num_hops):
    prev_raw_y = model.raw_y[-1]
    cur_raw_y = \
        prev_raw_y.follow(model.rels[h], +1).if_any(
            model.dirs[h] & c.one('forward', 'direction_t')) \
        | prev_raw_y.follow(model.rels[h], -1).if_any(
            model.dirs[h] & c.one('backward', 'direction_t'))
    # mask out seed entities
    if h == 1 and FLAGS.mask_seeds:
      filtered_cur_raw_y = tf.where(
          tf.equal(model.seeds.tf, 0), cur_raw_y.tf,
          tf.fill(tf.shape(cur_raw_y.tf), 0.0))
      cur_raw_y = c.as_nql(filtered_cur_raw_y, 'entity_t')
    model.raw_y.append(cur_raw_y)
  model.predicted_y = nql.nonneg_softmax(model.raw_y[-1].tf)

  # Record the predictions: in addition to the answer we'll return the
  # predicted relation and direction.
  model.predictions = dict(
      [('rel%d' % h, model.rels[h]) for h in range(FLAGS.num_hops)] +
      [('dir%d' % h, model.dirs[h]) for h in range(FLAGS.num_hops)] +
      [('answer', model.raw_y[-1])])
parser.add_argument("--dropout_rate",
                    type=float,
                    default=0.5,
                    help="dropout rate (test: 0.0, train: 0.2)")
parser.add_argument("--nclass", type=int)
parser.add_argument("--model", help="inception, resnet")
parser.add_argument("--gpu_config",
                    default='0',
                    help="0: gpu0, 1: gpu1, -1: both")
a = parser.parse_args()
for k, v in a._get_kwargs():
    print(k, "=", v)

import tensorflow_hub as hub

if a.model == "inception":
    model_size = 299
    module = hub.Module(
        "https://tfhub.dev/google/imagenet/inception_v3/feature_vector/1",
        trainable=False)
elif a.model == "resnet":
    model_size = 224
    module = hub.Module(
        "https://tfhub.dev/google/imagenet/resnet_v2_50/feature_vector/1",
        trainable=False)

# config (note: the default is a string so the comparisons below work)
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
if a.gpu_config == '0':
    config = tf.ConfigProto(
        gpu_options=tf.GPUOptions(allow_growth=True, visible_device_list='0'))
elif a.gpu_config == '1':
    config = tf.ConfigProto(
        gpu_options=tf.GPUOptions(allow_growth=True, visible_device_list='1'))
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd

module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3"
embed = hub.Module(module_url)
tf.logging.set_verbosity(tf.logging.ERROR)


def text_features(corpus):
    """Convert documents to text vectors."""
    with tf.Session() as session:
        session.run(
            [tf.global_variables_initializer(), tf.tables_initializer()])
        message_embeddings = session.run(embed(corpus))
    return pd.DataFrame(message_embeddings)
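# A hedged usage sketch of text_features (the toy document list below is an
# assumption added for illustration; USE-large v3 yields 512-dim vectors):
if __name__ == '__main__':
    docs = ["The cat sat on the mat.", "Universal Sentence Encoder demo."]
    df = text_features(docs)
    print(df.shape)  # (2, 512)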
# In[ ]:

#data1 = data.iloc[1950:2000,]
sentence = (data1['CUST_TEXT'].astype('str')).values.tolist()

# loading spaCy model
import en_core_web_sm
nlp = en_core_web_sm.load()

# loading ELMo model from TensorFlow Hub
url = "https://tfhub.dev/google/elmo/2"
embed = hub.Module(url)

# creating word embeddings using the ELMo model
embeddings = embed(sentence, signature="default", as_dict=True)["default"]


# In[ ]:

%%time
# creating a session to store output for graph creation
# (%%time must be the first line of the notebook cell)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
def call(self, inputs, training=True):
    """Given inputs, return the logits.

    :param inputs: tuple of (inputs_seq, masks, length)
    :param training: whether the model is in training mode
    :return: logits
    """
    inputs_seq, masks, length = inputs
    length = tf.squeeze(length)

    # Given inputs, generate the embedding used as input to the next layer.
    with tf.variable_scope(name_or_scope='input_embedding_scope',
                           reuse=tf.AUTO_REUSE) as in_em_scope:
        # use ELMo as the input embedding
        if self.params.get('elmo'):
            elmo = hub.Module("https://tfhub.dev/google/elmo/2",
                              trainable=False)
            # change inputs to a list of words to fit into the ELMo module
            for i in range(len(inputs_seq)):
                inputs_seq[i] = [self.dic[v] for v in inputs_seq[i]]
            # Size of input_embedding: batch_size * max_length * 1024 (default)
            input_embedding = elmo(inputs={
                'tokens': inputs_seq,
                'sequence_len': length
            },
                                   signature='tokens',
                                   as_dict=True)['elmo']

        # use BERT as the input embedding
        if self.params.get('bert'):
            # TODO: embed BERT model here.
            pass

        # Use GloVe/word2vec embedding as the input
        if self.params.get('word_embedding'):
            assert self.embedding is not None
            input_embedding = tf.nn.embedding_lookup(
                self.embedding, inputs_seq, name='input_embedding')

        # Use char embedding as the supplementary embedding
        if self.params.get('char_embedding'):
            # TODO: embed char embedding here; need to think about how to
            # store the instance.
            pass

        mask_embedding = tf.nn.embedding_lookup(self._mask_embedding,
                                                masks,
                                                name='mask_embedding')
        # concat input and mask embedding
        input_embedding = tf.concat([input_embedding, mask_embedding],
                                    axis=-1)

    with tf.variable_scope('lstm_part', reuse=tf.AUTO_REUSE) as lstm_part:
        lstm_output = input_embedding
        for i in range(self.params.get('layer_num')):
            lstm_output = self.add_lstm_layer(inputs=lstm_output,
                                              length=length,
                                              layer_name=i)
        if self.params.get('if_residual'):
            lstm_output = input_embedding + tf.layers.dense(
                inputs=lstm_output,
                units=self.params.get('word_dimension') +
                self.params.get('mask_dim'))

    # CRF layer
    with tf.variable_scope('crf_layer',
                           reuse=tf.AUTO_REUSE) as crf_layer_layer:
        crf_input = tf.layers.dense(
            lstm_output,
            units=2,
            bias_initializer=tf.glorot_uniform_initializer())
        crf_layer_ = crf_layer(inputs=crf_input,
                               sequence_lengths=length,
                               transition_prob=self.transition)
        # The size should be batch_size * seq_len
        crf_output = crf_layer_.crf_output_prob()[:, :, -1]
        # expand crf_output's shape to batch_size * 1 * seq_len for batch
        # matrix multiplication
        crf_output = tf.expand_dims(crf_output, axis=1)
        # einsum also works here; the output shape is
        # batch_size * embedding_dim:
        # sentiment_vector = tf.squeeze(
        #     tf.einsum('aij,ajk->aik', crf_output, lstm_output))
        # sentiment_vector = tf.squeeze(tf.matmul(crf_output, lstm_output))
        sentiment_vector = tf.matmul(crf_output, lstm_output)

    # logits layer
    with tf.variable_scope('logits', reuse=tf.AUTO_REUSE) as logits_layer:
        logits = tf.layers.dense(
            inputs=sentiment_vector,
            units=self.params.get('n_classes'),
            activation='softmax',
            bias_initializer=tf.glorot_uniform_initializer())
    return logits
    train_df = load_dataset(
        os.path.join(os.path.dirname(dataset), "aclImdb", "train"))
    test_df = load_dataset(
        os.path.join(os.path.dirname(dataset), "aclImdb", "test"))
    return train_df, test_df


# Reduce logging output.
tf.logging.set_verbosity(tf.logging.ERROR)

train_df, test_df = download_and_load_datasets()

# %%
# Now instantiate the ELMo model
elmo_model = hub.Module("https://tfhub.dev/google/elmo/1", trainable=True)
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())

# %%
# Build our model.
# We create a function to integrate the TensorFlow Hub module with a Keras
# model. This requires explicitly casting the tensor to a string, because of
# a Keras quirk.
def ElmoEmbedding(x):
    return elmo_model(tf.squeeze(tf.cast(x, tf.string)),
                      signature="default",
                      as_dict=True)["default"]
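# A hedged sketch of wiring ElmoEmbedding into a Keras model through a Lambda
# layer (the 256-unit dense head and binary output are assumptions added for
# illustration; ELMo's "default" signature yields 1024-dim sentence vectors):
from keras.layers import Input, Lambda, Dense
from keras.models import Model

input_text = Input(shape=(1,), dtype="string")
embedding = Lambda(ElmoEmbedding, output_shape=(1024,))(input_text)
dense = Dense(256, activation='relu')(embedding)
pred = Dense(1, activation='sigmoid')(dense)
model = Model(inputs=[input_text], outputs=pred)
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])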
print('Outlet Classifier')
classes_All = np.asarray([1 for i in range(len(n_s_b))] +
                         [2 for i in range(len(n_s_p))] +
                         [3 for i in range(len(n_s_a))] +
                         [4 for i in range(len(n_s_n))])

# Bias classifier:
print('Bias Classifier')
classes_Bias = np.asarray([1 for i in range(len(n_s_b))] +
                          [1 for i in range(len(n_s_p))] +
                          [2 for i in range(len(n_s_a))] +
                          [2 for i in range(len(n_s_n))])

# Load the encoder:
print('Load Encoder')
g = tf.Graph()
with g.as_default():
    text_input = tf.placeholder(dtype=tf.string, shape=[None])
    embed = hub.Module(
        "https://tfhub.dev/google/universal-sentence-encoder-large/3")
    embedded_text = embed(text_input)
    init_op = tf.group(
        [tf.global_variables_initializer(), tf.tables_initializer()])
g.finalize()

# Initialize session:
print('Initialize session')
session = tf.Session(graph=g)
session.run(init_op)


# Function to compute all embeddings for each sentence.
# Be patient, this takes a little while:
def similarity_matrix(merge_list):
    def __init__(self):
        print('Loading Universal Sentence Encoder...')
        self.embed = hub.Module(
            "https://tfhub.dev/google/universal-sentence-encoder-large/3")
        print('Loaded!')
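    # A hedged usage sketch: this helper method is an assumption added for
    # illustration (it is not part of the original class) and assumes
    # tensorflow is imported as tf at module level.
    def encode(self, sentences):
        """Run the hub module in a fresh session and return the vectors."""
        with tf.Session() as session:
            session.run(
                [tf.global_variables_initializer(), tf.tables_initializer()])
            return session.run(self.embed(sentences))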
plot_pred_mean(pred_means, pred_weights, pred_std, ymax, ymin, y_train)
mean_diff, med_diff, std_diff, mean_sigma, med_sigma, std_sigma = per_stats(
    pred_means, pred_weights, pred_std, ymax, ymin, y_train)

#plot_pred_peak(pred_means, pred_weights, pred_std, ymax, ymin, y_train)
#plot_pred_weight(pred_means, pred_weights, pred_std, ymax, ymin, y_train)
#contamp, contamw, pp, pw = contamination(pred_means, pred_weights, pred_std, ymax, ymin, y_train)
#bin_contam, bin_pp, bin_tot = binning(pred_means, pred_weights, pred_std, ymax, ymin, y_train, params, cut=200, tbins=10, gbins=10)

# load saved network
neural_network_t = hub.Module(save_mod)

# ###### testing (commented out)
#test_weights, test_means, test_std = testing(X_test, y_test)
#plot_pdfs(test_means, test_weights, test_std, train=False)
#plot_pred_mean(test_means, test_weights, test_std, ymax, ymin, y_test)
#test_mean_diff, test_med_diff, test_std_diff, test_mean_sigma, test_med_sigma, test_std_sigma = per_stats(test_means, test_weights, test_std, ymax, ymin, y_test)


def load_data(filein='lamost_rc_wise_gaia_PS1_2mass.fits', y_exist=True):
    filts = [
        'Jmag', 'Hmag', 'Kmag', 'phot_g_mean_mag', 'phot_bp_mean_mag',
def get_predictions_and_loss(self, inputs):
  tokens, lm_emb, text_len, is_training, gold_starts, gold_ends = inputs
  self.dropout = self.get_dropout(self.config["dropout_rate"], is_training)
  self.lexical_dropout = self.get_dropout(
      self.config["lexical_dropout_rate"], is_training)
  self.lstm_dropout = self.get_dropout(self.config["lstm_dropout_rate"],
                                       is_training)

  num_sentences = tf.shape(tokens)[0]
  max_sentence_length = tf.shape(tokens)[1]

  if not self.lm_file:
    elmo_module = hub.Module("https://tfhub.dev/google/elmo/2")
    lm_embeddings = elmo_module(inputs={
        "tokens": tokens,
        "sequence_len": text_len
    },
                                signature="tokens",
                                as_dict=True)
    word_emb = lm_embeddings[
        "word_emb"]  # [num_sentences, max_sentence_length, 512]
    lm_emb = tf.stack([
        tf.concat([word_emb, word_emb], -1),
        lm_embeddings["lstm_outputs1"], lm_embeddings["lstm_outputs2"]
    ], -1)  # [num_sentences, max_sentence_length, 1024, 3]
  lm_emb_size = util.shape(lm_emb, 2)
  lm_num_layers = util.shape(lm_emb, 3)
  with tf.variable_scope("lm_aggregation"):
    self.lm_weights = tf.nn.softmax(
        tf.get_variable("lm_scores", [lm_num_layers],
                        initializer=tf.constant_initializer(0.0)))
    self.lm_scaling = tf.get_variable(
        "lm_scaling", [], initializer=tf.constant_initializer(1.0))
  flattened_lm_emb = tf.reshape(
      lm_emb,
      [num_sentences * max_sentence_length * lm_emb_size, lm_num_layers])
  flattened_aggregated_lm_emb = tf.matmul(
      flattened_lm_emb,
      tf.expand_dims(self.lm_weights,
                     1))  # [num_sentences * max_sentence_length * emb, 1]
  aggregated_lm_emb = tf.reshape(
      flattened_aggregated_lm_emb,
      [num_sentences, max_sentence_length, lm_emb_size])
  aggregated_lm_emb *= self.lm_scaling

  context_emb = aggregated_lm_emb
  context_emb = tf.nn.dropout(
      context_emb,
      self.lexical_dropout)  # [num_sentences, max_sentence_length, emb]

  text_len_mask = tf.sequence_mask(
      text_len,
      maxlen=max_sentence_length)  # [num_sentences, max_sentence_length]

  num_words = tf.reduce_sum(text_len)
  sentence_indices = tf.tile(
      tf.expand_dims(tf.range(num_sentences), 1),
      [1, max_sentence_length])  # [num_sentences, max_sentence_length]
  flattened_sentence_indices = self.flatten_emb_by_sentence(
      sentence_indices, text_len_mask)  # [num_words]

  candidate_starts = tf.tile(
      tf.expand_dims(tf.range(num_words), 1),
      [1, max_sentence_length])  # [num_words, max_sentence_length]
  candidate_ends = candidate_starts + tf.expand_dims(
      tf.range(max_sentence_length), 0)  # [num_words, max_sentence_length]
  candidate_start_sentence_indices = tf.gather(
      flattened_sentence_indices,
      candidate_starts)  # [num_words, max_sentence_length]
  candidate_end_sentence_indices = tf.gather(
      flattened_sentence_indices,
      tf.minimum(candidate_ends,
                 num_words - 1))  # [num_words, max_sentence_length]
  candidate_mask = tf.logical_and(
      candidate_ends < num_words,
      tf.equal(candidate_start_sentence_indices,
               candidate_end_sentence_indices)
  )  # [num_words, max_sentence_length]
  flattened_candidate_mask = tf.reshape(
      candidate_mask, [-1])  # [num_words * max_sentence_length]
  candidate_starts = tf.boolean_mask(
      tf.reshape(candidate_starts, [-1]),
      flattened_candidate_mask)  # [num_candidates]
  candidate_ends = tf.boolean_mask(
      tf.reshape(candidate_ends, [-1]),
      flattened_candidate_mask)  # [num_candidates]

  candidate_labels = self.get_candidate_labels(
      candidate_starts, candidate_ends, gold_starts,
      gold_ends)  # [num_candidates]

  candidate_scores_mask = tf.logical_and(
      tf.expand_dims(text_len_mask, [1]),
      tf.expand_dims(text_len_mask, [2])
  )  # [num_sentences, max_sentence_length, max_sentence_length]
  sentence_ends_leq_starts = tf.tile(
      tf.expand_dims(
          tf.logical_not(
              tf.sequence_mask(tf.range(max_sentence_length),
                               max_sentence_length)), 0),
      [num_sentences, 1, 1]
  )  # [num_sentences, max_sentence_length, max_sentence_length]
  candidate_scores_mask = tf.logical_and(candidate_scores_mask,
                                         sentence_ends_leq_starts)
  flattened_candidate_scores_mask = tf.reshape(
      candidate_scores_mask,
      [-1])  # [num_sentences * max_sentence_length * max_sentence_length]

  context_outputs = self.lstm_contextualize(
      context_emb, text_len, text_len_mask, self.lstm_dropout,
      False)  # [num_sentences, max_sentence_length, emb]

  with tf.variable_scope("candidate_starts_ffnn"):
    candidate_starts_emb = util.projection(
        context_outputs,
        self.config["ffnn_size"])  # [num_sentences, max_sentence_length, emb]
  with tf.variable_scope("candidate_ends_ffnn"):
    candidate_ends_emb = util.projection(
        context_outputs,
        self.config["ffnn_size"])  # [num_sentences, max_sentence_length, emb]

  candidate_mention_scores = util.bilinear_classifier(
      candidate_starts_emb, candidate_ends_emb, self.dropout
  )  # [num_sentences, max_sentence_length, max_sentence_length]
  candidate_mention_scores = tf.boolean_mask(
      tf.reshape(candidate_mention_scores, [-1]),
      flattened_candidate_scores_mask)

  loss = self.sigmoid_loss(candidate_mention_scores, candidate_labels)
  top_span_starts, top_span_ends = self.get_top_mentions(
      num_words, candidate_starts, candidate_ends, candidate_mention_scores)

  return [top_span_starts, top_span_ends], loss
def __init__(self, config):
  self.config = config
  self.context_embeddings = util.EmbeddingDictionary(
      config["context_embeddings"])
  self.head_embeddings = util.EmbeddingDictionary(
      config["head_embeddings"], maybe_cache=self.context_embeddings)
  self.char_embedding_size = config["char_embedding_size"]
  self.char_dict = util.load_char_dict(config["char_vocab_path"])

  self.lm_file = None
  self.lm_hub = None
  self.lm_layers = 0  # TODO: Remove these.
  self.lm_size = 0
  if config["lm_path"]:
    if "tfhub" in config["lm_path"]:
      print("Using tensorflow hub:", config["lm_path"])
      self.lm_hub = hub.Module(config["lm_path"], trainable=False)
    else:
      self.lm_file = h5py.File(self.config["lm_path"], "r")
    self.lm_layers = self.config["lm_layers"]
    self.lm_size = self.config["lm_size"]

  self.adjunct_roles, self.core_roles = split_srl_labels(
      config["srl_labels"], config["include_c_v"])
  self.srl_labels_inv = [""] + self.adjunct_roles + self.core_roles
  self.srl_labels = {l: i for i, l in enumerate(self.srl_labels_inv)}

  # IO stuff.
  # Need to make sure they are in the same order as input_names + label_names.
  self.input_props = [
      (tf.string, [None]),  # String tokens.
      (tf.float32, [None, self.context_embeddings.size]),  # Context embeddings.
      (tf.float32, [None, self.head_embeddings.size]),  # Head embeddings.
      (tf.float32, [None, self.lm_size, self.lm_layers]),  # LM embeddings.
      (tf.int32, [None, None]),  # Character indices.
      (tf.int32, []),  # Text length.
      (tf.int32, []),  # Document ID.
      (tf.bool, []),  # Is training.
      (tf.int32, [None]),  # Gold predicate ids (for input).
      (tf.int32, []),  # Num gold predicates (for input).
      (tf.int32, [None]),  # Predicate ids (length=num_srl_relations).
      (tf.int32, [None]),  # Argument starts.
      (tf.int32, [None]),  # Argument ends.
      (tf.int32, [None]),  # SRL labels.
      (tf.int32, [])  # Number of SRL relations.
  ]
  self.input_names = _input_names
  self.label_names = _label_names
  self.predict_names = _predict_names

  self.batch_size = self.config["batch_size"]
  dtypes, shapes = zip(*self.input_props)
  if self.batch_size > 0 and self.config["max_tokens_per_batch"] < 0:
    # Use a fixed batch size if the number of words per batch is not
    # limited (-1).
    self.queue_input_tensors = [
        tf.placeholder(dtype, shape) for dtype, shape in self.input_props
    ]
    queue = tf.PaddingFIFOQueue(capacity=self.batch_size * 2,
                                dtypes=dtypes,
                                shapes=shapes)
    self.enqueue_op = queue.enqueue(self.queue_input_tensors)
    self.input_tensors = queue.dequeue_many(self.batch_size)
  else:
    # Use a dynamic batch size.
    new_shapes = [[None] + shape for shape in shapes]
    self.queue_input_tensors = [
        tf.placeholder(dtype, shape)
        for dtype, shape in zip(dtypes, new_shapes)
    ]
    queue = tf.PaddingFIFOQueue(capacity=2, dtypes=dtypes, shapes=new_shapes)
    self.enqueue_op = queue.enqueue(self.queue_input_tensors)
    self.input_tensors = queue.dequeue()

  num_features = len(self.input_names)
  self.input_dict = dict(
      zip(self.input_names, self.input_tensors[:num_features]))
  self.labels_dict = dict(
      zip(self.label_names, self.input_tensors[num_features:]))
    'complib': 'blosc:zstd',
    'complevel': 6
}
VERBOSE = 0

# Enable gc
gc.enable()

## Initialize tensorflow
# ImageNet pre-trained feature-vector modules
mobilenet_v1 = "https://tfhub.dev/google/imagenet/mobilenet_v1_050_224/quantops/feature_vector/1"  # dim 512
mobilenet_v2 = "https://tfhub.dev/google/imagenet/mobilenet_v2_100_224/feature_vector/1"  # dim 1280
inception_resnet_v2 = "https://tfhub.dev/google/imagenet/inception_resnet_v2/feature_vector/1"  # dim 1536
hubModule = mobilenet_v1

tf.logging.set_verbosity(tf.logging.WARN)
module = hub.Module(hubModule)

# Specify path variables
dataRootPath = '../../../data/avito-demand-prediction/images'
imagePath = f'{dataRootPath}/{sourceImgDir}'
featurePath = f'{dataRootPath}/{sourceImgDir}'

# Check feature path
if not os.path.exists(featurePath):
    os.makedirs(featurePath)

# Get joblist
fileList = np.array_split(np.array(sorted(listdir(imagePath)), dtype=str),
                          nInstances)[instanceID - 1]
fileList = np.array([f'{imagePath}/{img}' for img in fileList])
print(
import sys
import tensorflow as tf
import tensorflow_hub as hub
import psycopg2
import psycopg2.extras
import json
from functools import reduce

with tf.Graph().as_default():
    with tf.Session() as sess:
        embed = hub.Module('/home/d/nl/sentence-embed')
        sess.run(tf.global_variables_initializer())
        sess.run(tf.tables_initializer())

        def run():
            with psycopg2.connect(
                    'dbname=nl user=nl password=logbase') as conn:
                J = json.loads("".join(sys.stdin.readlines()))
                embedding = sess.run(embed(J['body']))
                cur = conn.cursor(
                    cursor_factory=psycopg2.extras.RealDictCursor)
                # Build one multi-row VALUES clause; each row is
                # (index, embedding vector, document id). Note the trailing
                # comma added per row is stripped with [:-1], and the id must
                # be appended as a one-element tuple.
                cur.execute(
                    'INSERT INTO sentence_embed (i, embed, sentence) VALUES %s'
                    % ('(%s,%s,%s),' * embedding.shape[0])[:-1],
                    reduce(lambda acc, a: acc + a + (J['id'],),
                           enumerate(embedding.tolist()), tuple()))
                conn.commit()
                # cur.execute('SELECT sentence, body FROM test.rsentence LIMIT 10;')
                # sentences = cur.fetchall()

        if __name__ == '__main__':
            run()
import tensorflow_hub as hub

BERT_URL = 'https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1'
module = hub.Module(BERT_URL)
print('Download complete')
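# A hedged follow-up sketch: the Google BERT TF-Hub modules expose a
# "tokenization_info" signature reporting the vocab file and casing, which
# must be resolved in a session (this usage is an assumption added for
# illustration, not part of the original snippet):
import tensorflow as tf

tokenization_info = module(signature='tokenization_info', as_dict=True)
with tf.Session() as sess:
    sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
    vocab_file, do_lower_case = sess.run(
        [tokenization_info['vocab_file'],
         tokenization_info['do_lower_case']])
print(vocab_file, do_lower_case)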
def main(_):
    # -------------------- configuration ------------------------- #
    tf.logging.set_verbosity(tf.logging.INFO)
    task_name = FLAGS.task_name.lower()
    model_name = "elmo"
    processors = {
        "sst-2": extract.Sst2Processor,
        "cola": extract.ColaProcessor,
    }
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()

    # ------------------- preprocess dataset -------------------- #
    label_list = processor.get_labels()
    num_labels = len(label_list)
    if FLAGS.task_name == 'sst-2':
        sentences, labels = _load_shard_sst(FLAGS.data_dir)
    if FLAGS.task_name == 'cola':
        sentences, labels = _load_shard_cola(FLAGS.data_dir)
    sentences_input = np.array(sentences)

    tf.logging.info("***** Running evaluation *****")
    tf.logging.info("  Num examples = %d", len(sentences))

    # ----------------------- build model --------------------- #
    # sess1
    elmo = hub.Module(spec='../elmo/tf_module', trainable=False)
    Elmo_model = load_model(FLAGS.load_path,
                            custom_objects={'elmo': elmo, "tf": tf})
    print(Elmo_model.summary())
    dense_layer_model = Model(inputs=Elmo_model.input,
                              outputs=Elmo_model.get_layer('dense_2').output)
    output_logits = dense_layer_model.predict(sentences_input)
    embedding_model = Model(inputs=Elmo_model.input,
                            outputs=Elmo_model.get_layer('lambda_1').output)
    # embeddings = embedding_model.predict(sentences_input)

    count = 0
    print('Making explanations...')
    # for (i, example) in enumerate(eval_examples[:1]):
    # ==========================================================================
    res = []
    res.append({
        "lr": FLAGS.lr,
        "g_sample_num": g_sample_num,
        "m_cnt": m_cnt,
        "epoch_num": FLAGS.epoch_num,
        "maximize": FLAGS.maximize_shap
    })
    if resume:
        with open(FLAGS.resume_path, "r") as f:
            res = json.load(f)
        count = len(res) - 1
        start = res[-1]["id"] + 1
    else:
        start = 0
        count = 0
    for i, sentence in enumerate(sentences[start:]):
        id = i + start
        dic = {}
        # sentence = eval_examples[0]  # the sentence to analyze
        # tokens_a = tokenizer.tokenize(sentence.text_a)
        label = int(labels[id])
        sentence = sentences[id]
        splitted = sentence.split()
        logit = output_logits[id]
        # cannot set batch size to 1? why?
        embedding = embedding_model.predict(np.array([sentence, ""]))[0]
        time_step = embedding.shape[0]

        # ========== predictor model =======================================
        idx = 2
        layer_input = Input(shape=(time_step, 1024))
        # print(layer_input)
        x = layer_input
        for l, layer in enumerate(Elmo_model.layers[idx:-1]):
            x = layer(x)
            print(l, x.shape)
        predictor_model = Model(layer_input, x)
        # _ = predictor_model.predict(np.random.randn(10, time_step, 1024))
        with predictor_model.input.graph.as_default():
            predictor_model.sess = tf.Session(
                graph=predictor_model.input.graph)
            predictor_model.sess.run(tf.global_variables_initializer())
        print(predictor_model.summary())
        # ==================================================================

        dic["id"] = id
        dic["tokens"] = splitted
        a_len = len(splitted)
        if a_len < min_len or a_len > max_len:
            continue
        count += 1
        print(count)
        print(id, splitted)
        seg_len = random.choice(seg_len_range)
        seg = [0, 0, a_len]
        seg[0] = random.choice(range(a_len - seg_len))
        seg[1] = seg[0] + seg_len
        dic["seg"] = seg
        # opt_res = manage_a_sentence(seg, embedding, label, predictor_model)
        # print(res)
        FLAGS.maximize_shap = True
        opt_res_1 = manage_a_sentence(seg, embedding, label, predictor_model)
        FLAGS.maximize_shap = False
        opt_res_2 = manage_a_sentence(seg, embedding, label, predictor_model)
        opt_res = []
        for j in range(len(opt_res_1)):
            item = {
                "p_max": opt_res_1[j]["p"],
                "p_min": opt_res_2[j]["p"],
                "loss": -1 * opt_res_1[j]["loss"] - opt_res_2[j]["loss"]
            }
            opt_res.append(item)
        dic["opt_res"] = opt_res
        min_gt_score, max_gt_score, min_gt_part, max_gt_part = get_min_max_shap(
            seg, embedding, label, predictor_model)
        gt_score = max_gt_score - min_gt_score
        dic["gt_score"] = gt_score
        difference = []
        for k in range(FLAGS.epoch_num // l_step):
            opt_score = 0
            for j in range(k * l_step, (k + 1) * l_step):
                if FLAGS.maximize_shap:
                    opt_score += -1 * opt_res[j]["loss"]
                else:
                    opt_score += opt_res[j]["loss"]
            opt_score /= l_step
            # step_dict = {"gt_score": gt_score, "diff": abs(gt_score - opt_score)}
            difference.append(abs(gt_score - opt_score))
        dic["difference"] = difference
        res.append(dic)
        print("gt_score:", gt_score)
        with open('difference_%s_elmo.json' % FLAGS.task_name, 'w') as f:
            json.dump(res, f)
    print(sentences)
def gen_emb_model_fn(features, labels, mode, params):
    import tensorflow_hub as hub
    from utils import bert_utils

    num_block_records = params['n_blocks']
    block_records_path = params['block_records_path']
    reader_module_path = params['reader_module_path']
    embedder_path = params['embedder_module_path']
    max_seq_len = 512

    blocks_dataset = tf.data.TFRecordDataset(block_records_path,
                                             buffer_size=512 * 1024 * 1024)
    blocks_dataset = blocks_dataset.batch(num_block_records,
                                          drop_remainder=True)
    blocks = tf.compat.v1.get_local_variable(
        "blocks",
        initializer=tf.data.experimental.get_single_element(blocks_dataset))
    retrieved_blocks = tf.gather(blocks, features['block_ids'])

    tokenizer, vocab_lookup_table = bert_utils.get_tf_tokenizer(
        reader_module_path)
    cls_token_id = tf.cast(vocab_lookup_table.lookup(tf.constant("[CLS]")),
                           tf.int32)
    sep_token_id = tf.cast(vocab_lookup_table.lookup(tf.constant("[SEP]")),
                           tf.int32)

    title_tok_id_seq = tf.constant(
        [[6522, 9138, 15759, 102] for _ in range(12)], tf.int32)

    block_tok_id_seqs0 = tokenizer.tokenize(retrieved_blocks)
    block_tok_id_seqs1 = tf.cast(
        block_tok_id_seqs0.merge_dims(1, 2).to_tensor(), tf.int32)
    batch_size = tf.shape(block_tok_id_seqs1)[0]
    cls_tok_ids = tf.ones([batch_size, 1], tf.int32) * cls_token_id
    block_tok_id_seqs = tf.concat(
        (cls_tok_ids, title_tok_id_seq, block_tok_id_seqs1), axis=1)
    block_tok_id_seqs = block_tok_id_seqs[:, :max_seq_len - 1]
    block_tok_id_seqs = pad_sep_to_tensor(block_tok_id_seqs, sep_token_id)

    input_mask = 1 - tf.cast(tf.equal(block_tok_id_seqs, tf.constant(0)),
                             tf.int32)

    retriever_module = hub.Module(
        embedder_path,
        tags={"train"} if mode == tf.estimator.ModeKeys.TRAIN else {},
        trainable=False)

    segment_ids = np.zeros((12, 288), dtype=np.int32)
    for i in range(12):
        segment_ids[i, 5:] = 1
    segment_ids = tf.constant(segment_ids)
    # segment_ids = tf.zeros_like(block_tok_id_seqs)
    # print(retriever_module.get_signature_names())
    # exit()

    # [1, projection_size]
    block_emb = retriever_module(
        inputs=dict(
            input_ids=block_tok_id_seqs,
            # input_mask=tf.ones_like(query_token_id_seqs),
            input_mask=input_mask,
            segment_ids=segment_ids),
        signature="projected")

    predictions = block_emb
    loss = tf.constant(1.0)

    logging_hook = tf.estimator.LoggingTensorHook(
        {
            'block_ids': features['block_ids'],
            'id_seqs': block_tok_id_seqs,
            'id_seqs_shape': tf.shape(block_tok_id_seqs),
            'id_seqs1': block_tok_id_seqs1[:, :5]
        },
        every_n_iter=1)

    return tf.estimator.EstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=None,
        predictions=predictions,
        prediction_hooks=[logging_hook],
        # training_hooks=[train_logging_hook],
        # evaluation_hooks=[logging_hook],
        # eval_metric_ops=eval_metric_ops
    )
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
from scipy.stats import truncnorm
import random
import base64
from io import BytesIO
import PIL.Image

module_path = 'https://tfhub.dev/deepmind/biggan-256/2'
rand_seed = 123
truncation = 0.5

tf.reset_default_graph()
print('Loading BigGAN module from:', module_path)
module = hub.Module(module_path)
inputs = {
    k: tf.placeholder(v.dtype, v.get_shape().as_list(), k)
    for k, v in module.get_input_info_dict().items()
}
output = module(inputs)

input_z = inputs['z']
input_y = inputs['y']
input_trunc = inputs['truncation']

random_state = np.random.RandomState(rand_seed)
dim_z = input_z.shape.as_list()[1]
vocab_size = input_y.shape.as_list()[1]

initializer = tf.global_variables_initializer()
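# A hedged sampling sketch following the standard BigGAN demo pattern (the
# helper below and the choice of class index are assumptions added for
# illustration, not part of the original snippet):
def truncated_z_sample(batch_size, truncation=1.0, seed=None):
    """Sample z from a truncated normal, scaled by the truncation value."""
    state = None if seed is None else np.random.RandomState(seed)
    values = truncnorm.rvs(-2, 2, size=(batch_size, dim_z),
                           random_state=state)
    return truncation * values

with tf.Session() as sess:
    sess.run(initializer)
    z = truncated_z_sample(1, truncation, rand_seed)
    y = np.zeros((1, vocab_size), dtype=np.float32)
    y[0, 207] = 1.0  # one-hot class label (207 = golden retriever in ImageNet)
    samples = sess.run(output, feed_dict={
        input_z: z, input_y: y, input_trunc: truncation})
    print(samples.shape)  # (1, 256, 256, 3), pixel values in [-1, 1]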
import numpy as np
import tensorflow as tf
import tensornets as nets
import tensorflow_hub as hub

inputs = tf.placeholder(tf.float32, [None, 224, 224, 3])
model = nets.MobileNet140v2(inputs)

model_name = 'mobilenet_v2_140_224'
url = 'https://tfhub.dev/google/imagenet'
tfhub = hub.Module("%s/%s/classification/1" % (url, model_name))
features = tfhub(inputs, signature="image_classification", as_dict=True)
model_tfhub = tf.nn.softmax(features['default'])

img = nets.utils.load_img('cat.png', target_size=256, crop_size=224)

with tf.Session() as sess:
    # Retrieve values
    sess.run(tf.global_variables_initializer())
    weights = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                scope='module/MobilenetV2')
    values = sess.run(weights)
    for i in range(-2, 0):
        values[i] = np.delete(np.squeeze(values[i]), 0, axis=-1)

    # Adjust the order of the values to cover TF < 1.4.0
    names = [w.name for w in model.get_weights()]
    for i in range(len(names) - 1):
        if 'gamma:0' in names[i] and 'beta:0' in names[i + 1]:
            names[i], names[i + 1] = names[i + 1], names[i]
def create_model(
    is_predicting,
    input_ids,
    input_mask,
    segment_ids,
    labels,
    num_labels,
    bert_tfhub_module_handle=None,
    bert_config=None,
    use_one_hot_embeddings=True,
):
    """Creates a classification model."""
    if bert_config:
        model = BertModel(
            config=bert_config,
            is_training=not is_predicting,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )
        output_layer = model.get_pooled_output()
    else:
        bert_module = hub.Module(bert_tfhub_module_handle, trainable=True)
        bert_inputs = dict(
            input_ids=input_ids,
            input_mask=input_mask,
            segment_ids=segment_ids
        )
        bert_outputs = bert_module(inputs=bert_inputs,
                                   signature="tokens",
                                   as_dict=True)
        output_layer = bert_outputs["pooled_output"]

    hidden_size = output_layer.shape[-1].value

    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02),
    )
    output_bias = tf.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer()
    )

    with tf.variable_scope("loss"):
        if not is_predicting:
            # I.e., 0.1 dropout
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        one_hot_labels = tf.one_hot(labels, depth=num_labels,
                                    dtype=tf.float32)

        predicted_labels = tf.squeeze(
            tf.argmax(log_probs, axis=-1, output_type=tf.int32)
        )
        if is_predicting:
            return (predicted_labels, log_probs)

        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)
        return (loss, predicted_labels, log_probs)
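# A hedged sketch of calling create_model with placeholder inputs (the
# sequence length, label count, and module handle below are assumptions
# added for illustration):
max_seq_length, num_labels = 128, 2
input_ids = tf.placeholder(tf.int32, [None, max_seq_length])
input_mask = tf.placeholder(tf.int32, [None, max_seq_length])
segment_ids = tf.placeholder(tf.int32, [None, max_seq_length])
labels = tf.placeholder(tf.int32, [None])

loss, predicted_labels, log_probs = create_model(
    is_predicting=False,
    input_ids=input_ids,
    input_mask=input_mask,
    segment_ids=segment_ids,
    labels=labels,
    num_labels=num_labels,
    bert_tfhub_module_handle=
    "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1")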
def main(arguments):
    ''' Main logic: parse args for tests to run and which models to evaluate '''
    log.basicConfig(format='%(asctime)s: %(message)s',
                    datefmt='%m/%d %I:%M:%S %p',
                    level=log.INFO)

    args = handle_arguments(arguments)
    if args.seed >= 0:
        log.info('Seeding random number generators with {}'.format(args.seed))
        random.seed(args.seed)
        np.random.seed(args.seed)
    maybe_make_dir(args.exp_dir)
    if args.log_file:
        log.getLogger().addHandler(log.FileHandler(args.log_file))
    log.info("Parsed args: \n%s", args)

    all_tests = sorted(
        [
            entry[:-len(TEST_EXT)]
            for entry in os.listdir(args.data_dir)
            if not entry.startswith('.') and entry.endswith(TEST_EXT)
        ],
        key=test_sort_key)
    log.debug('Tests found:')
    for test in all_tests:
        log.debug('\t{}'.format(test))

    tests = split_comma_and_check(args.tests, all_tests, "test") \
        if args.tests is not None else all_tests
    log.info('Tests selected:')
    for test in tests:
        log.info('\t{}'.format(test))

    models = split_comma_and_check(args.models, MODEL_NAMES, "model") \
        if args.models is not None else MODEL_NAMES
    log.info('Models selected:')
    for model in models:
        log.info('\t{}'.format(model))

    results = []
    for model_name in models:
        # Different models have different interfaces for things, but
        # generally want to:
        # - if saved vectors aren't there:
        #     - load the model
        #     - load the test data
        #     - encode the vectors
        #     - dump the files into some storage
        # - else load the saved vectors
        log.info('Running tests for model {}'.format(model_name))

        if model_name == ModelName.BOW.value:
            model_options = ''
            if args.glove_path is None:
                raise Exception('glove_path must be specified for {} model'.format(model_name))
        elif model_name == ModelName.INFERSENT.value:
            if args.glove_path is None:
                raise Exception('glove_path must be specified for {} model'.format(model_name))
            if args.infersent_dir is None:
                raise Exception('infersent_dir must be specified for {} model'.format(model_name))
            model_options = ''
        elif model_name == ModelName.GENSEN.value:
            if args.glove_h5_path is None:
                raise Exception('glove_h5_path must be specified for {} model'.format(model_name))
            if args.gensen_dir is None:
                raise Exception('gensen_dir must be specified for {} model'.format(model_name))
            gensen_version_list = split_comma_and_check(
                args.gensen_version, GENSEN_VERSIONS, "gensen_prefix")
            if len(gensen_version_list) > 2:
                raise ValueError('gensen_version can only have one or two elements')
            model_options = 'version=' + args.gensen_version
        elif model_name == ModelName.GUSE.value:
            model_options = ''
        elif model_name == ModelName.COVE.value:
            if args.cove_encs is None:
                raise Exception('cove_encs must be specified for {} model'.format(model_name))
            model_options = ''
        elif model_name == ModelName.ELMO.value:
            model_options = 'time_combine={};layer_combine={}'.format(
                args.time_combine_method, args.layer_combine_method)
        elif model_name == ModelName.BERT.value:
            model_options = 'version=' + args.bert_version
        elif model_name == ModelName.OPENAI.value:
            if args.openai_encs is None:
                raise Exception('openai_encs must be specified for {} model'.format(model_name))
            model_options = ''
        else:
            raise ValueError("Model %s not found!" % model_name)

        model = None

        for test in tests:
            log.info('Running test {} for model {}'.format(test, model_name))
            enc_file = os.path.join(
                args.exp_dir,
                "%s.%s.h5" % ("%s;%s" % (model_name, model_options)
                              if model_options else model_name, test))
            if not args.ignore_cached_encs and os.path.isfile(enc_file):
                log.info("Loading encodings from %s", enc_file)
                encs = load_encodings(enc_file)
                encs_targ1 = encs['targ1']
                encs_targ2 = encs['targ2']
                encs_attr1 = encs['attr1']
                encs_attr2 = encs['attr2']
            else:
                # load the test data
                encs = load_json(
                    os.path.join(args.data_dir, "%s%s" % (test, TEST_EXT)))

                # load the model and do model-specific encoding procedure
                log.info('Computing sentence encodings')
                if model_name == ModelName.BOW.value:
                    encs_targ1 = bow.encode(encs["targ1"]["examples"], args.glove_path)
                    encs_targ2 = bow.encode(encs["targ2"]["examples"], args.glove_path)
                    encs_attr1 = bow.encode(encs["attr1"]["examples"], args.glove_path)
                    encs_attr2 = bow.encode(encs["attr2"]["examples"], args.glove_path)
                elif model_name == ModelName.INFERSENT.value:
                    if model is None:
                        model = infersent.load_infersent(args.infersent_dir,
                                                         args.glove_path,
                                                         train_data='all',
                                                         use_cpu=args.use_cpu)
                    model.build_vocab(
                        [
                            example
                            for k in ('targ1', 'targ2', 'attr1', 'attr2')
                            for example in encs[k]['examples']
                        ],
                        tokenize=True)
                    log.info("Encoding sentences for test %s with model %s...",
                             test, model_name)
                    encs_targ1 = infersent.encode(model, encs["targ1"]["examples"])
                    encs_targ2 = infersent.encode(model, encs["targ2"]["examples"])
                    encs_attr1 = infersent.encode(model, encs["attr1"]["examples"])
                    encs_attr2 = infersent.encode(model, encs["attr2"]["examples"])
                elif model_name == ModelName.GENSEN.value:
                    if model is None:
                        gensen_1 = gensen.GenSenSingle(
                            model_folder=args.gensen_dir,
                            filename_prefix=gensen_version_list[0],
                            pretrained_emb=args.glove_h5_path,
                            cuda=not args.use_cpu)
                        model = gensen_1
                        if len(gensen_version_list) == 2:
                            gensen_2 = gensen.GenSenSingle(
                                model_folder=args.gensen_dir,
                                filename_prefix=gensen_version_list[1],
                                pretrained_emb=args.glove_h5_path,
                                cuda=not args.use_cpu)
                            model = gensen.GenSen(gensen_1, gensen_2)
                    vocab = gensen.build_vocab([
                        s
                        for set_name in ('targ1', 'targ2', 'attr1', 'attr2')
                        for s in encs[set_name]["examples"]
                    ])
                    model.vocab_expansion(vocab)
                    encs_targ1 = gensen.encode(model, encs["targ1"]["examples"])
                    encs_targ2 = gensen.encode(model, encs["targ2"]["examples"])
                    encs_attr1 = gensen.encode(model, encs["attr1"]["examples"])
                    encs_attr2 = gensen.encode(model, encs["attr2"]["examples"])
                elif model_name == ModelName.GUSE.value:
                    model = hub.Module(
                        "https://tfhub.dev/google/universal-sentence-encoder/2")
                    if args.use_cpu:
                        kwargs = dict(device_count={'GPU': 0})
                    else:
                        kwargs = dict()
                    config = tf.ConfigProto(**kwargs)
                    # allocate at most 50% of GPU memory
                    config.gpu_options.per_process_gpu_memory_fraction = 0.5
                    # allocate dynamically
                    config.gpu_options.allow_growth = True
                    with tf.Session(config=config) as session:
                        session.run([tf.global_variables_initializer(),
                                     tf.tables_initializer()])

                        def guse_encode(sents):
                            encs_node = model(sents)
                            encs = session.run(encs_node)
                            encs_d = {
                                sents[j]: enc
                                for j, enc in enumerate(np.array(encs).tolist())
                            }
                            return encs_d

                        encs_targ1 = guse_encode(encs["targ1"]["examples"])
                        encs_targ2 = guse_encode(encs["targ2"]["examples"])
                        encs_attr1 = guse_encode(encs["attr1"]["examples"])
                        encs_attr2 = guse_encode(encs["attr2"]["examples"])
                elif model_name == ModelName.COVE.value:
                    load_encs_from = os.path.join(args.cove_encs, "%s.encs" % test)
                    encs = load_jiant_encodings(load_encs_from, n_header=1)
                elif model_name == ModelName.ELMO.value:
                    kwargs = dict(
                        time_combine_method=args.time_combine_method,
                        layer_combine_method=args.layer_combine_method)
                    encs_targ1 = elmo.encode(encs["targ1"]["examples"], **kwargs)
                    encs_targ2 = elmo.encode(encs["targ2"]["examples"], **kwargs)
                    encs_attr1 = elmo.encode(encs["attr1"]["examples"], **kwargs)
                    encs_attr2 = elmo.encode(encs["attr2"]["examples"], **kwargs)
                elif model_name == ModelName.BERT.value:
                    model, tokenizer = bert.load_model(args.bert_version)
                    encs_targ1 = bert.encode(model, tokenizer, encs["targ1"]["examples"])
                    encs_targ2 = bert.encode(model, tokenizer, encs["targ2"]["examples"])
                    encs_attr1 = bert.encode(model, tokenizer, encs["attr1"]["examples"])
                    encs_attr2 = bert.encode(model, tokenizer, encs["attr2"]["examples"])
                elif model_name == ModelName.OPENAI.value:
                    load_encs_from = os.path.join(args.openai_encs, "%s.encs" % test)
                    #encs = load_jiant_encodings(load_encs_from, n_header=1, is_openai=True)
                    encs = load_encodings(load_encs_from)
                    encs_targ1 = encs["targ1"]["encs"]
                    encs_targ2 = encs["targ2"]["encs"]
                    encs_attr1 = encs["attr1"]["encs"]
                    encs_attr2 = encs["attr2"]["encs"]
                else:
                    raise ValueError("Model %s not found!" % model_name)

                encs["targ1"]["encs"] = encs_targ1
                encs["targ2"]["encs"] = encs_targ2
                encs["attr1"]["encs"] = encs_attr1
                encs["attr2"]["encs"] = encs_attr2

                log.info("\tDone!")
                if not args.dont_cache_encs:
                    log.info("Saving encodings to %s", enc_file)
                    save_encodings(encs, enc_file)

            enc = [e for e in encs["targ1"]['encs'].values()][0]
            d_rep = enc.size if isinstance(enc, np.ndarray) else len(enc)

            # run the test on the encodings
            log.info("Running SEAT...")
            log.info("Representation dimension: {}".format(d_rep))
            esize, pval = weat.run_test(encs,
                                        n_samples=args.n_samples,
                                        parametric=args.parametric)
            results.append(dict(
                model=model_name,
                options=model_options,
                test=test,
                p_value=pval,
                effect_size=esize,
                num_targ1=len(encs['targ1']['encs']),
                num_targ2=len(encs['targ2']['encs']),
                num_attr1=len(encs['attr1']['encs']),
                num_attr2=len(encs['attr2']['encs'])))

        log.info("Model: %s", model_name)
        log.info('Options: {}'.format(model_options))

    for r in results:
        log.info("\tTest {test}:\tp-val: {p_value:.9f}\tesize: {effect_size:.2f}".format(**r))

    if args.results_path is not None:
        log.info('Writing results to {}'.format(args.results_path))
        with open(args.results_path, 'w') as f:
            writer = DictWriter(f, fieldnames=results[0].keys(), delimiter='\t')
            writer.writeheader()
            for r in results:
                writer.writerow(r)
        resize_image(srcfile, destfile, new_width, new_height)
    return destfolder


def get_resized_db_image_paths(
        destfolder='../data/train_images_model_resize/%s' % (class_folder)):
    return sorted(
        list(glob.iglob(os.path.join(destfolder, '*.[Jj][Pp][Gg]'))))


resize_images_folder('../data/train_images_model/%s' % (class_folder))
db_images = get_resized_db_image_paths()

tf.reset_default_graph()
tf.logging.set_verbosity(tf.logging.FATAL)

m = hub.Module('https://tfhub.dev/google/delf/1')

# The module operates on a single image at a time, so define a placeholder
# to feed an arbitrary image in.
image_placeholder = tf.placeholder(tf.float32,
                                   shape=(None, None, 3),
                                   name='input_image')

module_inputs = {
    'image': image_placeholder,
    'score_threshold': 100.0,
    'image_scales': [0.25, 0.3536, 0.5, 0.7071, 1.0, 1.4142, 2.0],
    'max_feature_num': 1000,
}

module_outputs = m(module_inputs, as_dict=True)
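# A hedged usage sketch: run DELF on a single resized image and collect its
# local descriptors (the JPEG decoding helper and session wiring are
# assumptions added for illustration):
def image_input_fn(image_path):
    """Read one JPEG from disk and scale it to float32 in [0, 1]."""
    raw = tf.read_file(image_path)
    return tf.image.convert_image_dtype(
        tf.image.decode_jpeg(raw, channels=3), tf.float32)

with tf.Session() as sess:
    sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
    image = sess.run(image_input_fn(db_images[0]))
    locations, descriptors = sess.run(
        [module_outputs['locations'], module_outputs['descriptors']],
        feed_dict={image_placeholder: image})
    print(locations.shape, descriptors.shape)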
def create_id3_embedding(videos):
  """Embeds the given videos using the Inflated 3D Convolution network.

  Downloads the graph of the I3D from tf.hub and adds it to the graph on
  the first call.

  Args:
    videos: <float32>[batch_size, num_frames, height=224, width=224, depth=3].
      Expected range is [-1, 1].

  Returns:
    embedding: <float32>[batch_size, embedding_size]. embedding_size depends
      on the model used.

  Raises:
    ValueError: when a provided embedding_layer is not supported.
  """
  batch_size = 16
  module_spec = "https://tfhub.dev/deepmind/i3d-kinetics-400/1"

  # Making sure that we import the graph separately for
  # each different input video tensor.
  module_name = "fvd_kinetics-400_id3_module_" + six.ensure_str(
      videos.name).replace(":", "_")

  assert_ops = [
      tf.Assert(
          tf.reduce_max(videos) <= 1.001,
          ["max value in frame is > 1", videos]),
      tf.Assert(
          tf.reduce_min(videos) >= -1.001,
          ["min value in frame is < -1", videos]),
      tf.assert_equal(
          tf.shape(videos)[0],
          batch_size, ["invalid frame batch size: ",
                       tf.shape(videos)],
          summarize=6),
  ]
  with tf.control_dependencies(assert_ops):
    videos = tf.identity(videos)

  module_scope = "%s_apply_default/" % module_name

  # To check whether the module has already been loaded into the graph, we
  # look for a given tensor name. If this tensor name exists, we assume the
  # function has been called before and the graph was imported. Otherwise we
  # import it.
  # Note: in theory, the tensor could exist, but have the wrong shape.
  # This will happen if create_id3_embedding is called with a
  # frames_placeholder of the wrong size/batch size, because even though that
  # will throw a tf.Assert at graph-execution time, it will insert the tensor
  # (with the wrong shape) into the graph. This is why we need the following
  # assert.
  video_batch_size = int(videos.shape[0])
  assert video_batch_size in [batch_size, -1, None], "Invalid batch size"
  tensor_name = module_scope + "RGB/inception_i3d/Mean:0"
  if not _is_in_graph(tensor_name):
    i3d_model = hub.Module(module_spec, name=module_name)
    i3d_model(videos)

  # gets the kinetics-i3d-400-logits layer
  tensor_name = module_scope + "RGB/inception_i3d/Mean:0"
  tensor = tf.get_default_graph().get_tensor_by_name(tensor_name)
  return tensor
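# A hedged usage sketch (the placeholder shape matches the batch size of 16
# that create_id3_embedding asserts; the frame count and random frames are
# assumptions added for illustration):
import numpy as np

videos_ph = tf.placeholder(tf.float32, shape=[16, 15, 224, 224, 3],
                           name="videos")
embedding = create_id3_embedding(videos_ph)
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  sess.run(tf.tables_initializer())
  frames = np.random.uniform(-1, 1, size=(16, 15, 224, 224, 3))
  print(sess.run(embedding, feed_dict={videos_ph: frames}).shape)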
def __init__(self, args):
    self.args = args
    self.num_labels = len(args.labels)

    # Read and preprocess the data.
    self.data_train = self._read_data(
        os.path.join(args.dataset_path, 'answers_train.txt'))
    self.data_dev = self._read_data(
        os.path.join(args.dataset_path, 'answers_dev.txt'))
    self.data_test = self._read_data(
        os.path.join(args.dataset_path, 'answers_test.txt'))
    for data in [self.data_train, self.data_dev, self.data_test]:
        self.labels_2_one_hot(data)
    print('Number of training data:', len(self.data_train))
    print('Number of data for evaluation:', len(self.data_dev))
    print('Number of data for testing:', len(self.data_test))
    print('data_train[0:3] =')
    for i in range(3):
        print(self.data_train[i])
    print('data_dev[0:3] =')
    for i in range(3):
        print(self.data_dev[i])
    print('data_test[0:3] =')
    for i in range(3):
        print(self.data_test[i])

    with tf.name_scope('labeled_text'):
        self.label_input = tf.placeholder(tf.int8,
                                          [None, self.num_labels],
                                          name='labels')
        self.text_input = tf.placeholder(tf.string, [None], name='texts')
        self.elmo = hub.Module("https://tfhub.dev/google/elmo/3")
        self.embeddings = self.elmo(self.text_input,
                                    signature="default",
                                    as_dict=True)["default"]

    with tf.name_scope('ELMo_Classifier'):
        self.h_size = int(self.embeddings.shape[-1])  # embedding dimension
        self.W = tf.Variable(tf.truncated_normal(
            [self.h_size, self.num_labels]), name='Weights')
        self.B = tf.Variable(tf.truncated_normal([self.num_labels]),
                             name='Bias')
        self.Z = tf.matmul(self.embeddings, self.W) + self.B
        # Use L2 regularization to prevent overfitting.
        tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES,
                             tf.contrib.layers.l2_regularizer(
                                 self.args.lamb)(self.W))
        self.prob = tf.nn.softmax(self.Z)
        self.pred_label = tf.argmax(self.prob, 1)
        self.true_label = tf.argmax(self.label_input, 1)
        # tf.get_collection returns a list of tensors, so sum it with
        # tf.add_n before adding it to the cross-entropy term.
        self.loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=self.Z, labels=self.label_input)) + tf.add_n(
                    tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
        self.op = tf.train.AdamOptimizer(
            learning_rate=self.args.learning_rate).minimize(self.loss)
def embed_lines(args, unencoded_lines, output_dict,
                unencoded_lines_resps=None):
    """Embed a collection of lines to an output dictionary."""
    # Import the Universal Sentence Encoder's TF Hub module
    module = hub.Module(args.module_path, trainable=False)
    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as session:
        # initialize the variables
        session.run(
            [tf.global_variables_initializer(), tf.tables_initializer()])

        if args.use_sentence_piece:
            # spm_path now contains a path to the SentencePiece
            # model stored inside the TF-Hub module
            spm_path = session.run(module(signature="spm_path"))
            sp = spm.SentencePieceProcessor()
            sp.Load(spm_path)

            # build an input placeholder
            with tf.device('/gpu:0'):
                input_placeholder = tf.sparse_placeholder(
                    tf.int64, shape=[None, None])
                embeddings = module(inputs=dict(
                    values=input_placeholder.values,
                    indices=input_placeholder.indices,
                    dense_shape=input_placeholder.dense_shape))

        # size of chunk is how many lines will be encoded
        # with each pass of the model
        size_of_chunk = 256

        # ensure that every line has a response
        assert len(unencoded_lines) == len(unencoded_lines_resps)

        all_id_chunks = get_id_chunks(
            range(len(unencoded_lines)), size_of_chunk)
        max_iter = len(unencoded_lines) // size_of_chunk
        for id_chunk in tqdm(all_id_chunks, total=max_iter):
            # get the chunk of lines and matching responses by list of ids
            chunk_unencoded_lines = [unencoded_lines[x] for x in id_chunk]
            chunck_unenc_resp = [unencoded_lines_resps[x] for x in id_chunk]

            if args.use_sentence_piece:
                # process unencoded lines to values and IDs in sparse format
                values, indices, dense_shape = process_to_IDs_in_sparse_format(
                    sp=sp, sentences=chunk_unencoded_lines)

                # run the session
                with tf.device('/gpu:0'):
                    chunk_line_embds = session.run(
                        embeddings,
                        feed_dict={
                            input_placeholder.values: values,
                            input_placeholder.indices: indices,
                            input_placeholder.dense_shape: dense_shape
                        })
            else:
                with tf.device('/gpu:0'):
                    chunk_line_embds = session.run(
                        module(chunk_unencoded_lines))

            # hash the object into the full output dataframe
            for i, line_embedding in enumerate(
                    np.array(chunk_line_embds).tolist()):
                if args.verbose:
                    tf.logging.info(
                        "Line: {}".format(chunk_unencoded_lines[i]))
                    tf.logging.info(
                        "Embedding size: {}".format(len(line_embedding)))
                    snippet = ", ".join((str(x) for x in line_embedding[:3]))
                    tf.logging.info("Embedding: [{}, ...]\n".format(snippet))

                # Encode a hash for the string
                hash_object = hashlib.md5(
                    chunk_unencoded_lines[i].encode('utf-8'))

                # Add a row to the dataframe
                output_dict[hash_object.hexdigest()] = {
                    'line': chunk_unencoded_lines[i],
                    'line_embedding': line_embedding,
                    'response': chunck_unenc_resp[i]
                }
    return output_dict
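# A hedged usage sketch (the args namespace and the toy lines below are
# assumptions added for illustration; args.module_path should point at a
# USE-lite module when use_sentence_piece is set, since only those modules
# expose the "spm_path" signature):
import argparse

args = argparse.Namespace(
    module_path='https://tfhub.dev/google/universal-sentence-encoder-lite/2',
    use_sentence_piece=True,
    verbose=False)
lines = ['How are you?', 'What time is it?']
responses = ['I am fine.', 'It is noon.']
output = embed_lines(args, lines, {}, responses)
print(len(output))  # one entry per unique line, keyed by md5 hash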
    'BATCH_SIZE': 64,
    'TOP_K': 5,  # How many top classes should be predicted
    'INFER_PATH': './Data/ImgsResize',
    'LABEL_PATH': './Data/classes.txt'
}

dataset = Dataset(params)
print("======> dataset.image_data: ", dataset.img_data)

#module = hub.Module('https://tfhub.dev/google/imagenet/inception_v3/classification/1')
#logits = module(dict(images=dataset.img_data))
#print(logits)
#softmax = tf.nn.softmax(logits)
#top_predictions = tf.nn.top_k(softmax, top_k, name='top_predictions')

module = hub.Module(
    'https://tfhub.dev/google/imagenet/resnet_v1_50/classification/1')
# Trainable would be True if we were going to fine-tune the model:
#module = hub.Module('https://tfhub.dev/google/imagenet/resnet_v1_50/classification/1',
#                    trainable=True)

print("\n========> output info dict:")
[
    print('{0}: {1}'.format(k, v)) for k, v in sorted(
        module.get_output_info_dict(signature='image_classification').items())
]

module_features = module(dict(images=dataset.img_data),
                         signature="image_classification",
                         as_dict=True)
#features = module_features["default"]
features = module_features["resnet_v1_50/block2"]
print("\n==========> features: ", features)
import os
import time

import numpy as np
import tensorflow as tf
import tensorflow_hub as hub

import keras
from keras import backend as k
from keras.models import Model
from keras.layers import Input, Dense, Activation
from keras.optimizers import Adam
from keras.metrics import categorical_crossentropy
from keras.callbacks import EarlyStopping, History

elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

# Audio recording parameters
STREAMING_LIMIT = 290000
SAMPLE_RATE = 16000
CHUNK_SIZE = int(SAMPLE_RATE / 10)  # 100ms

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = \
    "/Users/chandan/GoogleServiceAccountKey/serviceACKey.json"


def get_current_time():
    return int(round(time.time() * 1000))


def duration_to_secs(duration):
ytrain = ytrain.reshape(-1)
ytest = ytest.reshape(-1)


def _preprocess(x):
    x = preprocess_image(x, 224, 224, is_training=False, color_distort=False)
    return x


batch_size = 100
x = tf.placeholder(shape=(batch_size, 32, 32, 3), dtype=tf.float32)
x_preproc = tf.map_fn(_preprocess, x)
print(x_preproc.get_shape().as_list())

hub_path = 'gs://simclr-checkpoints/simclrv2/pretrained/r50_2x_sk1/hub/'
module = hub.Module(hub_path, trainable=False)
features = module(inputs=x_preproc, signature='default')
print(features.get_shape().as_list())

sess = tf.Session()
sess.run(tf.global_variables_initializer())
print("model loaded!")

features_train = []
for i in range(len(xtrain) // batch_size):
    x_batch = xtrain[i * batch_size:(i + 1) * batch_size]
    f = sess.run(features, feed_dict={x: x_batch})
    features_train.append(f)
features_train = np.concatenate(features_train, axis=0)
print(features_train.shape)
    all_scores = []
    for i in class_indexes:
        all_scores.append([met_labels[i], logits[i]])
    all_scores.sort(key=lambda tup: tup[1])
    all_scores = all_scores[::-1]
    return all_scores


met_labels = np.genfromtxt("imetv1_labelmap.csv",
                           delimiter=',',
                           dtype='str',
                           usecols=[1],
                           skip_header=True)

module = hub.Module(
    "https://tfhub.dev/metmuseum/vision/classifier/imet_attributes_V1/1")
print("Expected input image size: H: {} px x W: {} px".format(
    hub.get_expected_image_size(module)[0],
    hub.get_expected_image_size(module)[1]))

# Here we get additional information about the module from TensorFlow Hub.
# Inputs
print(module.get_input_info_dict())
# Outputs
print(module.get_output_info_dict())

# Input images
input_image = plt.imread("DT11140.jpg")
input_image = input_image.astype(np.float32)[np.newaxis, ...] / 255.
input_image = tf.image.resize(input_image, (299, 299))
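# A hedged usage sketch (the session wiring and default-signature call below
# are assumptions added for illustration; they run the module on the resized
# image and print the top-scoring labels):
logits_op = module(input_image)
with tf.Session() as sess:
    sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
    logits = sess.run(logits_op)[0]
top5 = np.argsort(logits)[::-1][:5]
for i in top5:
    print(met_labels[i], logits[i])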
def load_tf_weights_in_bert_generation(model,
                                       tf_hub_path,
                                       model_class,
                                       is_encoder_named_decoder=False,
                                       is_encoder=False):
    try:
        import numpy as np
        import tensorflow.compat.v1 as tf
        import tensorflow_hub as hub
        import tensorflow_text  # noqa: F401

        tf.disable_eager_execution()
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise

    tf_model = hub.Module(tf_hub_path)
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        init.run()
        all_variables = tf_model.variable_map
        keep_track_variables = all_variables.copy()
        for key in list(all_variables.keys()):
            if "global" in key:
                logger.info(f"Skipping {key}...")
                continue
            if not is_encoder:
                model_pointer = getattr(model, model_class)
            else:
                model_pointer = model
            is_embedding = False
            logger.info(f"Trying to match {key}...")
            # remove start_string = "module/bert/"
            sub_layers = key.split("/")[2:]
            if is_encoder_named_decoder and sub_layers[0] == "encoder":
                logger.info(f"Skipping encoder layer {key} for decoder")
                continue
            if is_encoder and sub_layers[0] == "decoder":
                logger.info(f"Skipping decoder layer {key} for encoder")
                continue
            for i, sub_layer in enumerate(sub_layers):
                if sub_layer == "embeddings":
                    is_embedding = True
                elif sub_layer == "LayerNorm":
                    is_embedding = False
                if "layer" in sub_layer:
                    model_pointer = model_pointer.layer[int(
                        sub_layer.split("_")[-1])]
                elif sub_layer in ["kernel", "gamma"]:
                    model_pointer = model_pointer.weight
                elif sub_layer == "beta":
                    model_pointer = model_pointer.bias
                elif sub_layer == "encdec":
                    model_pointer = model_pointer.crossattention.self
                elif sub_layer == "encdec_output":
                    model_pointer = model_pointer.crossattention.output
                elif is_encoder_named_decoder and sub_layer == "decoder":
                    model_pointer = model_pointer.encoder
                else:
                    if sub_layer == "attention" and "encdec" in sub_layers[
                            i + 1]:
                        continue
                    try:
                        model_pointer = getattr(model_pointer, sub_layer)
                    except AttributeError:
                        logger.info(
                            f"Skipping to initialize {key} at {sub_layer}...")
                        raise AttributeError

            array = np.asarray(sess.run(all_variables[key]))
            if not is_embedding:
                logger.info(
                    "Transposing numpy weight of shape {} for {}".format(
                        array.shape, key))
                array = np.transpose(array)
            else:
                model_pointer = model_pointer.weight

            try:
                assert (
                    model_pointer.shape == array.shape
                ), f"Pointer shape {model_pointer.shape} and array shape {array.shape} mismatched"
            except AssertionError as e:
                e.args += (model_pointer.shape, array.shape)
                raise
            logger.info(f"Initialize PyTorch weight {key}")

            model_pointer.data = torch.from_numpy(array.astype(np.float32))
            keep_track_variables.pop(key, None)

        logger.info("Weights not copied to PyTorch model: {}".format(
            ", ".join(keep_track_variables.keys())))
        return model
# Check how many iterations we will do.
max_iter = len(file_list) // batch_size + 1
'''
Make the graph that basically only holds the module.
Note in the graph the module works on a placeholder.
Since we do not know how many images we will process at a time, we set the
first parameter in the shape to be None.
This placeholder will later on be filled with images.
module_256 is the module from TensorFlow Hub.
This will spit out a 256-feature vector called the features.
'''
tf.reset_default_graph()
module = hub.Module(module_url)
images = tf.placeholder(shape=[None, 224, 224, 3],
                        dtype=tf.float32,
                        name='input')
features = module(images)
init_op = tf.global_variables_initializer()

times = []
with tf.Session() as sess:
    sess.run(init_op)
    # Finalize graph so that we do not accidentally extend it.
    sess.graph.finalize()
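    # A hedged continuation sketch: feed the file list through the module in
    # batches (the load_batch helper is hypothetical, added for illustration;
    # it should return a [batch, 224, 224, 3] float32 array):
    import time
    for it in range(max_iter):
        batch_files = file_list[it * batch_size:(it + 1) * batch_size]
        if len(batch_files) == 0:
            break
        start = time.time()
        batch_features = sess.run(features,
                                  feed_dict={images: load_batch(batch_files)})
        times.append(time.time() - start)
    print('mean time per batch: %.3fs' % (sum(times) / len(times)))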