Example #1
    def config_model_prediction(self, model, feature_ph_dict, params=None):
        # we use the context a bit, so let's be brief
        c = model.context

        # the text will be encoded by a standard module from tf.hub
        model.text_encoder = hub.Module(FLAGS.module_handle)
        logging.info('encoding text with %s', FLAGS.module_handle)

        # one input is the question text, which is encoded by a specified
        # text_encoder module
        question_name = get_text_module_input_name()
        question_ph = feature_ph_dict[question_name]
        model.question_encoding = model.text_encoder(
            {question_name: question_ph})

        # we will want to be able to map the encoded text to a set of entities in a
        # given type. This function returns an NQL expression, over the specified
        # type, which is formed by running the text encoding through a learned
        # linear map to get the number of dimensions right, and then applying a
        # softmax
        def linear_text_remapper(type_name):
            num_input_dims = FLAGS.num_text_dims
            num_output_dims = c.get_max_id(type_name)
            initializer = tf.glorot_uniform_initializer()(
                [num_input_dims, num_output_dims])
            weight_matrix = tf.Variable(initializer)
            remapped_text = tf.matmul(model.question_encoding, weight_matrix)
            return c.as_nql(remapped_text, type_name)

        # The seeds, i.e. the entities mentioned in the question, are the
        # other input to the model. By convention inputs are passed in
        # TensorFlow format, so we wrap them as NQL.
        model.seeds = c.as_nql(feature_ph_dict['seeds'], 'entity_t')
        model.rels = [
            linear_text_remapper('rel_g') for h in range(FLAGS.num_hops)
        ]
        model.dirs = [
            linear_text_remapper('direction_t') for h in range(FLAGS.num_hops)
        ]

        # finally we define the NQL part of the model
        # start with seeds and build a model that follows exactly num_hops hops
        model.raw_y = [model.seeds]
        for h in range(FLAGS.num_hops):
            prev_raw_y = model.raw_y[-1]
            cur_raw_y = \
                prev_raw_y.follow(model.rels[h], +1).if_any(
                    model.dirs[h] & c.one('forward', 'direction_t')) \
                | prev_raw_y.follow(model.rels[h], -1).if_any(
                    model.dirs[h] & c.one('backward', 'direction_t'))
            # mask out seed entities
            if h == 1 and FLAGS.mask_seeds:
                filtered_cur_raw_y = tf.where(
                    tf.equal(model.seeds.tf, 0), cur_raw_y.tf,
                    tf.fill(tf.shape(cur_raw_y.tf), 0.0))
                cur_raw_y = filtered_cur_raw_y

            cur_raw_y = c.as_nql(cur_raw_y, 'entity_t')
            model.raw_y.append(cur_raw_y)

        model.predicted_y = nql.nonneg_softmax(model.raw_y[-1].tf)
        # record the predictions: in addition to the answer we'll return the
        # predicted relation and direction
        model.predictions = dict([('rel%d' % h, model.rels[h])
                                  for h in range(FLAGS.num_hops)] +
                                 [('dir%d' % h, model.dirs[h])
                                  for h in range(FLAGS.num_hops)] +
                                 [('answer', model.raw_y[-1])])
Example #2
import argparse

import tensorflow as tf

parser = argparse.ArgumentParser()
parser.add_argument("--dropout_rate",
                    type=float,
                    default=0.5,
                    help="dropout rate. test: 0.0, train: 0.2")
parser.add_argument("--nclass", type=int)
parser.add_argument("--model", help="inception, resnet")
parser.add_argument("--gpu_config", default='0', help="0:gpu0, 1:gpu1, -1:both")

a = parser.parse_args()
for k, v in sorted(vars(a).items()):
    print(k, "=", v)

import tensorflow_hub as hub
if a.model == "inception":
    model_size = 299
    module = hub.Module(
        "https://tfhub.dev/google/imagenet/inception_v3/feature_vector/1",
        trainable=False)
elif a.model == "resnet":
    model_size = 224
    module = hub.Module(
        "https://tfhub.dev/google/imagenet/resnet_v2_50/feature_vector/1",
        trainable=False)

# GPU config
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
if a.gpu_config == '0':
    config = tf.ConfigProto(
        gpu_options=tf.GPUOptions(allow_growth=True, visible_device_list='0'))
elif a.gpu_config == '1':
    config = tf.ConfigProto(
        gpu_options=tf.GPUOptions(allow_growth=True, visible_device_list='1'))
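
# A minimal usage sketch (not from the original snippet): run the selected
# feature-vector module on a batch of images resized to model_size and
# scaled to [0, 1].
images = tf.placeholder(tf.float32, [None, model_size, model_size, 3])
features = module(images)  # [batch_size, feature_dim]
with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())
    # feature_values = sess.run(features, feed_dict={images: image_batch})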
Example #3
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd

module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3"
embed = hub.Module(module_url)
tf.logging.set_verbosity(tf.logging.ERROR)

def text_features(corpus):
    """Convert documents to text vectors."""

    with tf.Session() as session:
        session.run([tf.global_variables_initializer(), tf.tables_initializer()])
        message_embeddings = session.run(embed(corpus))

    return pd.DataFrame(message_embeddings)
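
# A brief usage sketch (sentences are illustrative); each row of the returned
# DataFrame is one 512-dimensional Universal Sentence Encoder embedding.
df = text_features(["The quick brown fox.", "One embedding per sentence."])
print(df.shape)  # (2, 512)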
Example #4
import tensorflow as tf
import tensorflow_hub as hub

# data1 = data.iloc[1950:2000, ]  # slice of the source DataFrame (not shown)
sentence = data1['CUST_TEXT'].astype('str').values.tolist()

#loading spacy model
import en_core_web_sm
nlp = en_core_web_sm.load()

#loading ELMo model from tensor hub
url = "https://tfhub.dev/google/elmo/2"
embed = hub.Module(url)

#creating word embeddings using ELMo model
embeddings = embed(
    sentence,
    signature="default",
    as_dict=True)["default"]


#creating a session to materialize the ELMo vectors
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    elmo_vectors = sess.run(embeddings)  # shape: [num_sentences, 1024]
Example #5
    def call(self, inputs, training=True):
        """
        Given inputs, return the logits.
        :param features:
        :param training:
        :return:
        """
        inputs_seq, masks, length = inputs
        length = tf.squeeze(length)
        # Given the inputs, generate the embedding that feeds the next layer.
        with tf.variable_scope(name_or_scope='input_embedding_scope',
                               reuse=tf.AUTO_REUSE) as in_em_scope:
            # use elmo as the input embedding
            if self.params.get('elmo'):
                elmo = hub.Module("https://tfhub.dev/google/elmo/2",
                                  trainable=False)
                # convert each sequence to a list of words so the input fits
                # ELMo's 'tokens' signature
                for i in range(len(inputs_seq)):
                    inputs_seq[i] = [self.dic[v] for v in inputs_seq[i]]

                # Size of input_embedding: batch_size * max_length * 1024(default)
                input_embedding = elmo(inputs={
                    'tokens': inputs_seq,
                    'sequence_len': length
                },
                                       signature='tokens',
                                       as_dict=True)['elmo']

            # use Bert as the input embedding
            if self.params.get('bert'):
                # TODO embed bert model here.
                pass

            # Use Glove/word2vec embedding as the input
            if self.params.get('word_embedding'):
                assert self.embedding is not None
                input_embedding = tf.nn.embedding_lookup(
                    self.embedding, inputs_seq, name='input_embedding')
            # Use char embedding as the supplementary embedding
            if self.params.get('char_embedding'):
                # TODO embed char embedding here, need to think about how to store the instance.
                pass

            mask_embedding = tf.nn.embedding_lookup(self._mask_embedding,
                                                    masks,
                                                    name='mask_embedding')

            # concat input and mask embedding
            input_embedding = tf.concat([input_embedding, mask_embedding],
                                        axis=-1)

        with tf.variable_scope('lstm_part', reuse=tf.AUTO_REUSE) as lstm_part:
            lstm_output = input_embedding
            for i in range(self.params.get('layer_num')):
                lstm_output = self.add_lstm_layer(inputs=lstm_output,
                                                  length=length,
                                                  layer_name=i)

            if self.params.get('if_residual'):
                lstm_output = input_embedding + tf.layers.dense(
                    inputs=lstm_output,
                    units=self.params.get('word_dimension') +
                    self.params.get('mask_dim'))

        # CRF layer
        with tf.variable_scope('crf_layer',
                               reuse=tf.AUTO_REUSE) as crf_layer_layer:
            crf_input = tf.layers.dense(
                lstm_output,
                units=2,
                bias_initializer=tf.glorot_uniform_initializer())
            crf_layer_ = crf_layer(inputs=crf_input,
                                   sequence_lengths=length,
                                   transition_prob=self.transition)
            # The size should be batch_size * seq_len
            crf_output = crf_layer_.crf_output_prob()[:, :, -1]

            # expand crf_output's shape to batch_size * 1 * seq_len for batched matrix multiplication
            crf_output = tf.expand_dims(crf_output, axis=1)

            # an equivalent einsum formulation of the matmul below:
            # sentiment_vector = tf.squeeze(
            #     tf.einsum('aij,ajk->aik', crf_output, lstm_output))  # output shape is batch_size * embedding_dim

            # sentiment_vector = tf.squeeze(tf.matmul(crf_output, lstm_output))
            sentiment_vector = tf.matmul(crf_output, lstm_output)

        # logits layer
        with tf.variable_scope('logits', reuse=tf.AUTO_REUSE) as logits_layer:
            logits = tf.layers.dense(
                inputs=sentiment_vector,
                units=self.params.get('n_classes'),
                activation=tf.nn.softmax,  # tf.layers.dense expects a callable
                bias_initializer=tf.glorot_uniform_initializer())

            return logits
Example #6
    train_df = load_dataset(
        os.path.join(os.path.dirname(dataset), "aclImdb", "train"))
    test_df = load_dataset(
        os.path.join(os.path.dirname(dataset), "aclImdb", "test"))

    return train_df, test_df


# Reduce logging output.
tf.logging.set_verbosity(tf.logging.ERROR)

train_df, test_df = download_and_load_datasets()

# %%
# Now instantiate the ELMo model
elmo_model = hub.Module("https://tfhub.dev/google/elmo/1", trainable=True)
sess = tf.Session()
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())

# %%

# Build our model


# We create a function to integrate the tensorflow model with a Keras model
# This requires explicitly casting the tensor to a string, because of a Keras quirk
def ElmoEmbedding(x):
    return elmo_model(tf.squeeze(tf.cast(x, tf.string)),
                      signature="default",
                      as_dict=True)["default"]
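
# A hedged sketch of the Keras integration the comment above describes
# (assumes standalone Keras on the TensorFlow backend; layer sizes are
# illustrative, not from the source):
from keras.layers import Dense, Input, Lambda
from keras.models import Model

input_text = Input(shape=(1,), dtype="string")
embedding = Lambda(ElmoEmbedding, output_shape=(1024,))(input_text)
dense = Dense(256, activation="relu")(embedding)
pred = Dense(1, activation="sigmoid")(dense)
keras_model = Model(inputs=[input_text], outputs=pred)
keras_model.compile(loss="binary_crossentropy", optimizer="adam",
                    metrics=["accuracy"])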
Example #7
print('Outlet Classifier')
classes_All = np.asarray([1 for i in range(len(n_s_b))] +
                         [2 for i in range(len(n_s_p))] +
                         [3 for i in range(len(n_s_a))] +
                         [4 for i in range(len(n_s_n))])
#Bias classifier:
print('Bias Classifier')
classes_Bias = np.asarray([1 for i in range(len(n_s_b))] +
                          [1 for i in range(len(n_s_p))] +
                          [2 for i in range(len(n_s_a))] +
                          [2 for i in range(len(n_s_n))])

# Load the encoder:
print('Load Encoder')
g = tf.Graph()
with g.as_default():
    text_input = tf.placeholder(dtype=tf.string, shape=[None])
    embed = hub.Module(
        "https://tfhub.dev/google/universal-sentence-encoder-large/3")
    embedded_text = embed(text_input)
    init_op = tf.group(
        [tf.global_variables_initializer(),
         tf.tables_initializer()])
g.finalize()

# Initialize session:
print('Initialize session')
session = tf.Session(graph=g)
session.run(init_op)


#Function to compute all embeddings for each sentence:
#Be patient, takes a little while:
def similarity_matrix(merge_list):
Example #8
    def __init__(self):
        print('Loading Universal Sentence Encoder...')
        self.embed = hub.Module(
            "https://tfhub.dev/google/universal-sentence-encoder-large/3")
        print('Loaded!')
Example #9
plot_pred_mean(pred_means,pred_weights,pred_std,ymax,ymin,y_train)

mean_diff, med_diff, std_diff, mean_sigma, med_sigma, std_sigma = per_stats(pred_means,pred_weights,pred_std,ymax,ymin,y_train)


"""
#plot_pred_peak(pred_means,pred_weights,pred_std,ymax,ymin,y_train)
#plot_pred_weight(pred_means,pred_weights,pred_std,ymax,ymin,y_train)

#contamp, contamw, pp, pw = contamination(pred_means,pred_weights,pred_std,ymax,ymin,y_train)

#bin_contam, bin_pp, bin_tot = binning(pred_means,pred_weights,pred_std,ymax,ymin,y_train,params,cut=200,tbins=10,gbins=10)

#load saved network
neural_network_t = hub.Module(save_mod)

######testing
"""
test_weights, test_means, test_std = testing(X_test,y_test)
plot_pdfs(test_means,test_weights,test_std,train=False)

plot_pred_mean(test_means,test_weights,test_std,ymax,ymin,y_test)

test_mean_diff, test_med_diff, test_std_diff, test_mean_sigma, test_med_sigma, test_std_sigma = per_stats(test_means,test_weights,test_std,ymax,ymin,y_test)
"""


def load_data(filein='lamost_rc_wise_gaia_PS1_2mass.fits', y_exist=True):
    filts = [
        'Jmag', 'Hmag', 'Kmag', 'phot_g_mean_mag', 'phot_bp_mean_mag',
Example #10
    def get_predictions_and_loss(self, inputs):
        tokens, lm_emb, text_len, is_training, gold_starts, gold_ends = inputs
        self.dropout = self.get_dropout(self.config["dropout_rate"],
                                        is_training)
        self.lexical_dropout = self.get_dropout(
            self.config["lexical_dropout_rate"], is_training)
        self.lstm_dropout = self.get_dropout(self.config["lstm_dropout_rate"],
                                             is_training)

        num_sentences = tf.shape(tokens)[0]
        max_sentence_length = tf.shape(tokens)[1]

        if not self.lm_file:
            elmo_module = hub.Module("https://tfhub.dev/google/elmo/2")
            lm_embeddings = elmo_module(inputs={
                "tokens": tokens,
                "sequence_len": text_len
            },
                                        signature="tokens",
                                        as_dict=True)
            word_emb = lm_embeddings[
                "word_emb"]  # [num_sentences, max_sentence_length, 512]
            lm_emb = tf.stack([
                tf.concat([word_emb, word_emb], -1),
                lm_embeddings["lstm_outputs1"], lm_embeddings["lstm_outputs2"]
            ], -1)  # [num_sentences, max_sentence_length, 1024, 3]
        lm_emb_size = util.shape(lm_emb, 2)
        lm_num_layers = util.shape(lm_emb, 3)
        with tf.variable_scope("lm_aggregation"):
            self.lm_weights = tf.nn.softmax(
                tf.get_variable("lm_scores", [lm_num_layers],
                                initializer=tf.constant_initializer(0.0)))
            self.lm_scaling = tf.get_variable(
                "lm_scaling", [], initializer=tf.constant_initializer(1.0))
        flattened_lm_emb = tf.reshape(
            lm_emb,
            [num_sentences * max_sentence_length * lm_emb_size, lm_num_layers])
        flattened_aggregated_lm_emb = tf.matmul(
            flattened_lm_emb, tf.expand_dims(
                self.lm_weights,
                1))  # [num_sentences * max_sentence_length * emb, 1]
        aggregated_lm_emb = tf.reshape(
            flattened_aggregated_lm_emb,
            [num_sentences, max_sentence_length, lm_emb_size])
        aggregated_lm_emb *= self.lm_scaling

        context_emb = aggregated_lm_emb

        context_emb = tf.nn.dropout(
            context_emb,
            self.lexical_dropout)  # [num_sentences, max_sentence_length, emb]

        text_len_mask = tf.sequence_mask(
            text_len,
            maxlen=max_sentence_length)  # [num_sentence, max_sentence_length]

        num_words = tf.reduce_sum(text_len)
        sentence_indices = tf.tile(
            tf.expand_dims(tf.range(num_sentences), 1),
            [1, max_sentence_length])  # [num_sentences, max_sentence_length]
        flattened_sentence_indices = self.flatten_emb_by_sentence(
            sentence_indices, text_len_mask)  # [num_words]

        candidate_starts = tf.tile(
            tf.expand_dims(tf.range(num_words), 1),
            [1, max_sentence_length])  # [num_words, max_sentence_length]
        candidate_ends = candidate_starts + tf.expand_dims(
            tf.range(max_sentence_length),
            0)  # [num_words, max_sentence_length]
        candidate_start_sentence_indices = tf.gather(
            flattened_sentence_indices,
            candidate_starts)  # [num_words, max_sentence_length]
        candidate_end_sentence_indices = tf.gather(
            flattened_sentence_indices,
            tf.minimum(candidate_ends,
                       num_words - 1))  # [num_words, max_sentence_length]
        candidate_mask = tf.logical_and(candidate_ends < num_words,
                                        tf.equal(
                                            candidate_start_sentence_indices,
                                            candidate_end_sentence_indices)
                                        )  # [num_words, max_sentence_length]
        flattened_candidate_mask = tf.reshape(
            candidate_mask, [-1])  # [num_words * max_sentence_length]

        candidate_starts = tf.boolean_mask(
            tf.reshape(candidate_starts,
                       [-1]), flattened_candidate_mask)  # [num_candidates]
        candidate_ends = tf.boolean_mask(
            tf.reshape(candidate_ends,
                       [-1]), flattened_candidate_mask)  # [num_candidates]

        candidate_labels = self.get_candidate_labels(
            candidate_starts, candidate_ends, gold_starts,
            gold_ends)  # [num_candidates]

        candidate_scores_mask = tf.logical_and(
            tf.expand_dims(text_len_mask, [1]),
            tf.expand_dims(
                text_len_mask,
                [2]))  # [num_sentences, max_sentence_length, max_sentence_length]
        sentence_ends_leq_starts = tf.tile(
            tf.expand_dims(
                tf.logical_not(
                    tf.sequence_mask(tf.range(max_sentence_length),
                                     max_sentence_length)), 0),
            [num_sentences, 1, 1
             ])  # [num_sentences, max_sentence_length, max_sentence_length]
        candidate_scores_mask = tf.logical_and(candidate_scores_mask,
                                               sentence_ends_leq_starts)

        flattened_candidate_scores_mask = tf.reshape(
            candidate_scores_mask,
            [-1])  # [num_sentences * max_sentence_length * max_sentence_length]

        context_outputs = self.lstm_contextualize(
            context_emb, text_len, text_len_mask, self.lstm_dropout,
            False)  # [num_sentence, max_sentence_length, emb]

        with tf.variable_scope("candidate_starts_ffnn"):
            candidate_starts_emb = util.projection(
                context_outputs, self.config["ffnn_size"]
            )  #[num_sentences, max_sentences_length,emb]
        with tf.variable_scope("candidate_ends_ffnn"):
            candidate_ends_emb = util.projection(
                context_outputs, self.config["ffnn_size"]
            )  #[num_sentences, max_sentences_length, emb]

        candidate_mention_scores = util.bilinear_classifier(
            candidate_starts_emb, candidate_ends_emb, self.dropout
        )  #[num_sentence, max_sentence_length,max_sentence_length]
        candidate_mention_scores = tf.boolean_mask(
            tf.reshape(candidate_mention_scores, [-1]),
            flattened_candidate_scores_mask)

        loss = self.sigmoid_loss(candidate_mention_scores, candidate_labels)
        top_span_starts, top_span_ends = self.get_top_mentions(
            num_words, candidate_starts, candidate_ends,
            candidate_mention_scores)

        return [top_span_starts, top_span_ends], loss
Example #11
  def __init__(self, config):
    self.config = config
    self.context_embeddings = util.EmbeddingDictionary(config["context_embeddings"])
    self.head_embeddings = util.EmbeddingDictionary(config["head_embeddings"],
                                                    maybe_cache=self.context_embeddings)
    self.char_embedding_size = config["char_embedding_size"]
    self.char_dict = util.load_char_dict(config["char_vocab_path"])
      
    self.lm_file = None
    self.lm_hub = None
    self.lm_layers = 0  # TODO: Remove these.
    self.lm_size = 0
    if config["lm_path"]:
      if "tfhub" in config["lm_path"]:
        print("Using tensorflow hub:", config["lm_path"])
        self.lm_hub = hub.Module(config["lm_path"], trainable=False) 
      else:
        self.lm_file = h5py.File(self.config["lm_path"], "r")
      self.lm_layers = self.config["lm_layers"]
      self.lm_size = self.config["lm_size"]

    self.adjunct_roles, self.core_roles = split_srl_labels(
        config["srl_labels"], config["include_c_v"])
    self.srl_labels_inv = [""] + self.adjunct_roles + self.core_roles
    self.srl_labels = {l: i for i, l in enumerate(self.srl_labels_inv)}

    # IO Stuff.
    # Need to make sure they are in the same order as input_names + label_names
    self.input_props = [
        (tf.string, [None]), # String tokens.
        (tf.float32, [None, self.context_embeddings.size]), # Context embeddings.
        (tf.float32, [None, self.head_embeddings.size]), # Head embeddings.
        (tf.float32, [None, self.lm_size, self.lm_layers]), # LM embeddings.
        (tf.int32, [None, None]), # Character indices.
        (tf.int32, []),  # Text length.
        (tf.int32, []),  # Document ID.
        (tf.bool, []),  # Is training.
        (tf.int32, [None]),  # Gold predicate ids (for input).
        (tf.int32, []),  # Num gold predicates (for input).
        (tf.int32, [None]),  # Predicate ids (length=num_srl_relations).
        (tf.int32, [None]),  # Argument starts.
        (tf.int32, [None]),  # Argument ends.
        (tf.int32, [None]),  # SRL labels.
        (tf.int32, [])  # Number of SRL relations.
    ]
    self.input_names = _input_names
    self.label_names = _label_names
    self.predict_names = _predict_names
    self.batch_size = self.config["batch_size"]
    dtypes, shapes = zip(*self.input_props)
    if self.batch_size > 0 and self.config["max_tokens_per_batch"] < 0:
      # Use fixed batch size if number of words per batch is not limited (-1).
      self.queue_input_tensors = [tf.placeholder(dtype, shape) for dtype, shape in self.input_props]
      queue = tf.PaddingFIFOQueue(capacity=self.batch_size * 2, dtypes=dtypes, shapes=shapes)
      self.enqueue_op = queue.enqueue(self.queue_input_tensors)
      self.input_tensors = queue.dequeue_many(self.batch_size)
    else:
      # Use dynamic batch size.
      new_shapes = [[None] + shape for shape in shapes]
      self.queue_input_tensors = [tf.placeholder(dtype, shape) for dtype, shape in zip(dtypes, new_shapes)]
      queue = tf.PaddingFIFOQueue(capacity=2, dtypes=dtypes, shapes=new_shapes)
      self.enqueue_op = queue.enqueue(self.queue_input_tensors)
      self.input_tensors = queue.dequeue()
    num_features = len(self.input_names)
    self.input_dict = dict(zip(self.input_names, self.input_tensors[:num_features]))
    self.labels_dict = dict(zip(self.label_names, self.input_tensors[num_features:]))
Example #12
        'complib': 'blosc:zstd',
        'complevel': 6
    }
    VERBOSE = 0

    # Enable gc
    gc.enable()

    ## Initialize tensorflow
    # ImageNet Pre-trained models
    mobilenet_v1 = "https://tfhub.dev/google/imagenet/mobilenet_v1_050_224/quantops/feature_vector/1"  # dim 512
    mobilenet_v2 = "https://tfhub.dev/google/imagenet/mobilenet_v2_100_224/feature_vector/1"  # dim 1280
    inception_resnet_v2 = "https://tfhub.dev/google/imagenet/inception_resnet_v2/feature_vector/1"  # dim 1536
    hubModule = mobilenet_v1
    tf.logging.set_verbosity(tf.logging.WARN)
    module = hub.Module(hubModule)

    # Specify path variables
    dataRootPath = '../../../data/avito-demand-prediction/images'
    imagePath = f'{dataRootPath}/{sourceImgDir}'
    featurePath = f'{dataRootPath}/{sourceImgDir}'

    # Check feature path
    if not os.path.exists(featurePath):
        os.makedirs(featurePath)

    # Get joblist
    fileList = np.array_split(np.array(sorted(listdir(imagePath)), dtype=str),
                              nInstances)[instanceID - 1]
    fileList = np.array([f'{imagePath}/{img}' for img in fileList])
    print(
Example #13
import sys
import tensorflow as tf
import tensorflow_hub as hub
import psycopg2
import psycopg2.extras
import json
from functools import reduce

with tf.Graph().as_default():
	with tf.Session() as sess:
		embed = hub.Module('/home/d/nl/sentence-embed')
		sess.run(tf.global_variables_initializer())
		sess.run(tf.tables_initializer())
		
		def run():
			with psycopg2.connect('dbname=nl user=nl password=logbase') as conn:
				J = json.loads("".join(sys.stdin.readlines()))
				embedding = sess.run(embed(J['body']))
				
				cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
				# note the trailing comma: (J['id'],) must be a one-element tuple
				row_values = reduce(lambda acc, a: acc + a + (J['id'],), enumerate(embedding.tolist()), tuple())
				cur.execute('INSERT INTO sentence_embed (i, embed, sentence) VALUES %s' % ('(%s,%s,%s),' * embedding.shape[0])[:-1], row_values)
				conn.commit()
				# cur.execute('SELECT sentence, body FROM test.rsentence LIMIT 10;')
				# sentences = cur.fetchall()

		if __name__ == '__main__':
			run()
Example #14
import tensorflow_hub as hub

BERT_URL = 'https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1'
module = hub.Module(BERT_URL)

print('Download complete')
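
# A follow-up sketch (not in the original snippet): BERT TF-Hub modules also
# expose a "tokenization_info" signature that locates the vocab file.
import tensorflow as tf

tokenization_info = module(signature="tokenization_info", as_dict=True)
with tf.Session() as sess:
    sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
    vocab_file, do_lower_case = sess.run(
        [tokenization_info["vocab_file"], tokenization_info["do_lower_case"]])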
Example #15
def main(_):
    # -------------------- configuration ------------------------- #
    tf.logging.set_verbosity(tf.logging.INFO)
    task_name = FLAGS.task_name.lower()
    model_name = "elmo"
    processors = {
        "sst-2": extract.Sst2Processor,
        "cola": extract.ColaProcessor,
    }
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    # ------------------- preprocess dataset -------------------- #
    label_list = processor.get_labels()
    num_labels = len(label_list)

    if FLAGS.task_name == 'sst-2':
        sentences, labels = _load_shard_sst(FLAGS.data_dir)
    if FLAGS.task_name == 'cola':
        sentences, labels = _load_shard_cola(FLAGS.data_dir)
    sentences_input = np.array(sentences)

    tf.logging.info("***** Running evaluation *****")
    tf.logging.info("  Num examples = %d", len(sentences))
    # ----------------------- build model --------------------- #

    # sess1
    elmo = hub.Module(spec='../elmo/tf_module', trainable=False)
    Elmo_model = load_model(FLAGS.load_path, custom_objects={'elmo': elmo, "tf": tf})
    print(Elmo_model.summary())

    dense_layer_model = Model(inputs=Elmo_model.input, outputs=Elmo_model.get_layer('dense_2').output)
    output_logits = dense_layer_model.predict(sentences_input)

    embedding_model = Model(inputs=Elmo_model.input, outputs=Elmo_model.get_layer('lambda_1').output)
    # embeddings = embedding_model.predict(sentences_input)
    count = 0

    print('Making explanations...')
    # for (i, example) in enumerate(eval_examples[:1]):
    # ==============================================================================

    res = []
    res.append({"lr":FLAGS.lr, "g_sample_num":g_sample_num, "m_cnt":m_cnt, "epoch_num": FLAGS.epoch_num, "maximize": FLAGS.maximize_shap})

    if resume:
        with open(FLAGS.resume_path,"r") as f:
            res = json.load(f)
        count = len(res) - 1
        start = res[-1]["id"] + 1
    else:
        start = 0
        count = 0

    for i, sentence in enumerate(sentences[start:]):
        id = i + start
        dic = {}
        # sentence = eval_examples[0]  # the sentence to analyze
        # tokens_a = tokenizer.tokenize(sentence.text_a)
        label = int(labels[id])
        sentence = sentences[id]
        splitted = sentence.split()
        logit = output_logits[id]
        embedding = embedding_model.predict(np.array([sentence, ""]))[0]  # batch padded to 2; batch size 1 did not work here
        time_step = embedding.shape[0]

        # ========== predictor model =======================================
        idx = 2
        layer_input = Input(shape=(time_step, 1024))
        # print(layer_input)
        x = layer_input
        for l, layer in enumerate(Elmo_model.layers[idx:-1]):
            x = layer(x)
            print(l, x.shape)
        predictor_model = Model(layer_input, x)
        # _ = predictor_model.predict(np.random.randn(10, time_step, 1024))
        with predictor_model.input.graph.as_default():
            predictor_model.sess = tf.Session(graph=predictor_model.input.graph)
            predictor_model.sess.run(tf.global_variables_initializer())
        print(predictor_model.summary())
        # ===================================================================


        dic["id"] = id
        dic["tokens"] = splitted

        a_len = len(splitted)
        if a_len < min_len or a_len > max_len:
            continue
        count += 1
        print(count)
        # print(count)

        print(id, splitted)

        seg_len = random.choice(seg_len_range)
        seg = [0, 0, a_len]
        seg[0] = random.choice(range(a_len-seg_len))
        seg[1] = seg[0] + seg_len

        dic["seg"] = seg

        # opt_res = manage_a_sentence(seg, embedding, label, predictor_model)
        # # print(res)

        FLAGS.maximize_shap = True
        opt_res_1 = manage_a_sentence(seg, embedding, label, predictor_model)
        FLAGS.maximize_shap = False
        opt_res_2 = manage_a_sentence(seg, embedding, label, predictor_model)

        opt_res = []
        for i in range(len(opt_res_1)):
            item = {"p_max": opt_res_1[i]["p"],
                    "p_min": opt_res_2[i]["p"],
                    "loss": -1 * opt_res_1[i]["loss"] - opt_res_2[i]["loss"]
                    }
            opt_res.append(item)

        dic["opt_res"] = opt_res
        min_gt_score, max_gt_score, min_gt_part, max_gt_part = get_min_max_shap(seg, embedding, label, predictor_model)
        gt_score = max_gt_score - min_gt_score
        dic["gt_score"] = gt_score

        difference = []
        for i in range(FLAGS.epoch_num//l_step):

            opt_score = 0
            for j in range(i*l_step,(i+1)*l_step):
                if FLAGS.maximize_shap:
                    opt_score += -1* opt_res[j]["loss"]
                else:
                    opt_score += opt_res[j]["loss"]
            opt_score /= l_step
            # step_dict = {"gt_score": gt_score, "diff": abs(gt_score-opt_score)}

            difference.append(abs(gt_score-opt_score))

        dic["difference"] = difference
        res.append(dic)
        print("gt_score:", gt_score)
        with open('difference_%s_elmo.json'%FLAGS.task_name, 'w') as f:
            json.dump(res, f)
    print(sentences)
Example #16
def gen_emb_model_fn(features, labels, mode, params):
    import numpy as np
    import tensorflow_hub as hub
    from utils import bert_utils

    num_block_records = params['n_blocks']
    block_records_path = params['block_records_path']
    reader_module_path = params['reader_module_path']
    embedder_path = params['embedder_module_path']
    max_seq_len = 512

    blocks_dataset = tf.data.TFRecordDataset(block_records_path,
                                             buffer_size=512 * 1024 * 1024)
    blocks_dataset = blocks_dataset.batch(num_block_records,
                                          drop_remainder=True)
    blocks = tf.compat.v1.get_local_variable(
        "blocks",
        initializer=tf.data.experimental.get_single_element(blocks_dataset))
    retrieved_blocks = tf.gather(blocks, features['block_ids'])

    tokenizer, vocab_lookup_table = bert_utils.get_tf_tokenizer(
        reader_module_path)
    cls_token_id = tf.cast(vocab_lookup_table.lookup(tf.constant("[CLS]")),
                           tf.int32)
    sep_token_id = tf.cast(vocab_lookup_table.lookup(tf.constant("[SEP]")),
                           tf.int32)
    title_tok_id_seq = tf.constant([[6522, 9138, 15759, 102]
                                    for _ in range(12)], tf.int32)

    block_tok_id_seqs0 = tokenizer.tokenize(retrieved_blocks)
    block_tok_id_seqs1 = tf.cast(
        block_tok_id_seqs0.merge_dims(1, 2).to_tensor(), tf.int32)
    batch_size = tf.shape(block_tok_id_seqs1)[0]
    cls_tok_ids = tf.ones([batch_size, 1], tf.int32) * cls_token_id
    block_tok_id_seqs = tf.concat(
        (cls_tok_ids, title_tok_id_seq, block_tok_id_seqs1), axis=1)
    block_tok_id_seqs = block_tok_id_seqs[:, :max_seq_len - 1]
    block_tok_id_seqs = pad_sep_to_tensor(block_tok_id_seqs, sep_token_id)
    input_mask = 1 - tf.cast(tf.equal(block_tok_id_seqs, tf.constant(0)),
                             tf.int32)

    retriever_module = hub.Module(
        embedder_path,
        tags={"train"} if mode == tf.estimator.ModeKeys.TRAIN else {},
        trainable=False)

    segment_ids = np.zeros((12, 288), dtype=np.int32)
    for i in range(12):
        segment_ids[i, 5:] = 1
    segment_ids = tf.constant(segment_ids)
    # segment_ids=tf.zeros_like(block_tok_id_seqs)
    # print(retriever_module.get_signature_names())
    # exit()
    # [1, projection_size]
    block_emb = retriever_module(
        inputs=dict(
            input_ids=block_tok_id_seqs,
            # input_mask=tf.ones_like(query_token_id_seqs),
            input_mask=input_mask,
            segment_ids=segment_ids),
        signature="projected")

    predictions = block_emb
    loss = tf.constant(1.0)
    logging_hook = tf.estimator.LoggingTensorHook(
        {
            'block_ids': features['block_ids'],
            'id_seqs': block_tok_id_seqs,
            'id_seqs_shape': tf.shape(block_tok_id_seqs),
            'id_seqs1': block_tok_id_seqs1[:, :5]
        },
        every_n_iter=1)
    return tf.estimator.EstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=None,
        predictions=predictions,
        prediction_hooks=[logging_hook],
        # training_hooks=[train_logging_hook],
        # evaluation_hooks=[logging_hook],
        # eval_metric_ops=eval_metric_ops
    )
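
# A hypothetical wiring sketch (paths and block count are placeholders, not
# from the source): hand the model_fn above to an Estimator.
estimator = tf.estimator.Estimator(
    model_fn=gen_emb_model_fn,
    params={
        'n_blocks': 12,
        'block_records_path': '/path/to/blocks.tfr',
        'reader_module_path': '/path/to/reader_module',
        'embedder_module_path': '/path/to/embedder_module',
    })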
Example #17
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
from scipy.stats import truncnorm
import random
import base64
from io import BytesIO
import PIL.Image

module_path = 'https://tfhub.dev/deepmind/biggan-256/2'
rand_seed = 123
truncation = 0.5

tf.reset_default_graph()
print('Loading BigGAN module from:', module_path)
module = hub.Module(module_path)
inputs = {
    k: tf.placeholder(v.dtype,
                      v.get_shape().as_list(), k)
    for k, v in module.get_input_info_dict().items()
}
output = module(inputs)

input_z = inputs['z']
input_y = inputs['y']
input_trunc = inputs['truncation']
random_state = np.random.RandomState(rand_seed)
dim_z = input_z.shape.as_list()[1]
vocab_size = input_y.shape.as_list()[1]

initializer = tf.global_variables_initializer()
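
# A minimal sampling sketch (the class index is arbitrary): draw a truncated
# z vector, build a one-hot y, and run the generator.
batch_size = 1
z = truncation * truncnorm.rvs(
    -2.0, 2.0, size=(batch_size, dim_z), random_state=random_state)
y = np.zeros((batch_size, vocab_size), dtype=np.float32)
y[:, random_state.randint(vocab_size)] = 1.0
with tf.Session() as sess:
    sess.run(initializer)
    samples = sess.run(output, feed_dict={
        input_z: z, input_y: y, input_trunc: truncation})
# samples: [batch_size, 256, 256, 3] with values in [-1, 1]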
Example #18
import numpy as np
import tensorflow as tf
import tensornets as nets
import tensorflow_hub as hub

inputs = tf.placeholder(tf.float32, [None, 224, 224, 3])
model = nets.MobileNet140v2(inputs)
model_name = 'mobilenet_v2_140_224'

url = 'https://tfhub.dev/google/imagenet'
tfhub = hub.Module("%s/%s/classification/1" % (url, model_name))
features = tfhub(inputs, signature="image_classification", as_dict=True)
model_tfhub = tf.nn.softmax(features['default'])

img = nets.utils.load_img('cat.png', target_size=256, crop_size=224)

with tf.Session() as sess:

    # Retrieve values
    sess.run(tf.global_variables_initializer())
    weights = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                scope='module/MobilenetV2')
    values = sess.run(weights)
    for i in range(-2, 0):
        values[i] = np.delete(np.squeeze(values[i]), 0, axis=-1)

    # Adjust the order of the values to cover TF < 1.4.0
    names = [w.name for w in model.get_weights()]
    for i in range(len(names) - 1):
        if 'gamma:0' in names[i] and 'beta:0' in names[i + 1]:
            names[i], names[i + 1] = names[i + 1], names[i]
Example #19
def create_model(
    is_predicting,
    input_ids,
    input_mask,
    segment_ids,
    labels,
    num_labels,
    bert_tfhub_module_handle=None,
    bert_config=None,
    use_one_hot_embeddings=True,
):
    """Creates a classification model."""

    if bert_config:

        model = BertModel(
            config=bert_config,
            is_training=not is_predicting,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )

        output_layer = model.get_pooled_output()

    else:
        bert_module = hub.Module(bert_tfhub_module_handle, trainable=True)
        bert_inputs = dict(
            input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
        )
        bert_outputs = bert_module(inputs=bert_inputs, signature="tokens", as_dict=True)

        output_layer = bert_outputs["pooled_output"]

    hidden_size = output_layer.shape[-1].value

    output_weights = tf.get_variable(
        "output_weights",
        [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02),
    )

    output_bias = tf.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer()
    )

    with tf.variable_scope("loss"):
        if not is_predicting:
            # I.e., 0.1 dropout
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

        predicted_labels = tf.squeeze(
            tf.argmax(log_probs, axis=-1, output_type=tf.int32)
        )

        if is_predicting:
            return (predicted_labels, log_probs)

        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)
        return (loss, predicted_labels, log_probs)
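
# A hypothetical invocation sketch (handle, sequence length, and label count
# are illustrative, not from the source):
max_seq_length = 128
input_ids = tf.placeholder(tf.int32, [None, max_seq_length])
input_mask = tf.placeholder(tf.int32, [None, max_seq_length])
segment_ids = tf.placeholder(tf.int32, [None, max_seq_length])
labels = tf.placeholder(tf.int32, [None])
loss, predicted_labels, log_probs = create_model(
    is_predicting=False,
    input_ids=input_ids,
    input_mask=input_mask,
    segment_ids=segment_ids,
    labels=labels,
    num_labels=2,
    bert_tfhub_module_handle="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1",
)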
Example #20
def main(arguments):
    ''' Main logic: parse args for tests to run and which models to evaluate '''
    log.basicConfig(format='%(asctime)s: %(message)s', datefmt='%m/%d %I:%M:%S %p', level=log.INFO)

    args = handle_arguments(arguments)
    if args.seed >= 0:
        log.info('Seeding random number generators with {}'.format(args.seed))
        random.seed(args.seed)
        np.random.seed(args.seed)
    maybe_make_dir(args.exp_dir)
    if args.log_file:
        log.getLogger().addHandler(log.FileHandler(args.log_file))
    log.info("Parsed args: \n%s", args)

    all_tests = sorted(
        [
            entry[:-len(TEST_EXT)]
            for entry in os.listdir(args.data_dir)
            if not entry.startswith('.') and entry.endswith(TEST_EXT)
        ],
        key=test_sort_key
    )
    log.debug('Tests found:')
    for test in all_tests:
        log.debug('\t{}'.format(test))

    tests = split_comma_and_check(args.tests, all_tests, "test") if args.tests is not None else all_tests
    log.info('Tests selected:')
    for test in tests:
        log.info('\t{}'.format(test))

    models = split_comma_and_check(args.models, MODEL_NAMES, "model") if args.models is not None else MODEL_NAMES
    log.info('Models selected:')
    for model in models:
        log.info('\t{}'.format(model))


    results = []
    for model_name in models:
        # Different models have different interfaces for things, but generally want to:
        # - if saved vectors aren't there:
        #    - load the model
        #    - load the test data
        #    - encode the vectors
        #    - dump the files into some storage
        # - else load the saved vectors
        log.info('Running tests for model {}'.format(model_name))

        if model_name == ModelName.BOW.value:
            model_options = ''
            if args.glove_path is None:
                raise Exception('glove_path must be specified for {} model'.format(model_name))
        elif model_name == ModelName.INFERSENT.value:
            if args.glove_path is None:
                raise Exception('glove_path must be specified for {} model'.format(model_name))
            if args.infersent_dir is None:
                raise Exception('infersent_dir must be specified for {} model'.format(model_name))
            model_options = ''
        elif model_name == ModelName.GENSEN.value:
            if args.glove_h5_path is None:
                raise Exception('glove_h5_path must be specified for {} model'.format(model_name))
            if args.gensen_dir is None:
                raise Exception('gensen_dir must be specified for {} model'.format(model_name))
            gensen_version_list = split_comma_and_check(args.gensen_version, GENSEN_VERSIONS, "gensen_prefix")
            if len(gensen_version_list) > 2:
                raise ValueError('gensen_version can only have one or two elements')
            model_options = 'version=' + args.gensen_version
        elif model_name == ModelName.GUSE.value:
            model_options = ''
        elif model_name == ModelName.COVE.value:
            if args.cove_encs is None:
                raise Exception('cove_encs must be specified for {} model'.format(model_name))
            model_options = ''
        elif model_name == ModelName.ELMO.value:
            model_options = 'time_combine={};layer_combine={}'.format(
                args.time_combine_method, args.layer_combine_method)
        elif model_name == ModelName.BERT.value:
            model_options = 'version=' + args.bert_version
        elif model_name == ModelName.OPENAI.value:
            if args.openai_encs is None:
                raise Exception('openai_encs must be specified for {} model'.format(model_name))
            model_options = ''
        else:
            raise ValueError("Model %s not found!" % model_name)

        model = None

        for test in tests:
            log.info('Running test {} for model {}'.format(test, model_name))
            enc_file = os.path.join(args.exp_dir, "%s.%s.h5" % (
                "%s;%s" % (model_name, model_options) if model_options else model_name,
                test))
            if not args.ignore_cached_encs and os.path.isfile(enc_file):
                log.info("Loading encodings from %s", enc_file)
                encs = load_encodings(enc_file)
                encs_targ1 = encs['targ1']
                encs_targ2 = encs['targ2']
                encs_attr1 = encs['attr1']
                encs_attr2 = encs['attr2']
            else:
                # load the test data
                encs = load_json(os.path.join(args.data_dir, "%s%s" % (test, TEST_EXT)))

                # load the model and do model-specific encoding procedure
                log.info('Computing sentence encodings')
                if model_name == ModelName.BOW.value:
                    encs_targ1 = bow.encode(encs["targ1"]["examples"], args.glove_path)
                    encs_targ2 = bow.encode(encs["targ2"]["examples"], args.glove_path)
                    encs_attr1 = bow.encode(encs["attr1"]["examples"], args.glove_path)
                    encs_attr2 = bow.encode(encs["attr2"]["examples"], args.glove_path)

                elif model_name == ModelName.INFERSENT.value:
                    if model is None:
                        model = infersent.load_infersent(args.infersent_dir, args.glove_path, train_data='all',
                                                         use_cpu=args.use_cpu)
                    model.build_vocab(
                        [
                            example
                            for k in ('targ1', 'targ2', 'attr1', 'attr2')
                            for example in encs[k]['examples']
                        ],
                        tokenize=True)
                    log.info("Encoding sentences for test %s with model %s...", test, model_name)
                    encs_targ1 = infersent.encode(model, encs["targ1"]["examples"])
                    encs_targ2 = infersent.encode(model, encs["targ2"]["examples"])
                    encs_attr1 = infersent.encode(model, encs["attr1"]["examples"])
                    encs_attr2 = infersent.encode(model, encs["attr2"]["examples"])

                elif model_name == ModelName.GENSEN.value:
                    if model is None:
                        gensen_1 = gensen.GenSenSingle(
                            model_folder=args.gensen_dir,
                            filename_prefix=gensen_version_list[0],
                            pretrained_emb=args.glove_h5_path,
                            cuda=not args.use_cpu)
                        model = gensen_1

                        if len(gensen_version_list) == 2:
                            gensen_2 = gensen.GenSenSingle(
                                model_folder=args.gensen_dir,
                                filename_prefix=gensen_version_list[1],
                                pretrained_emb=args.glove_h5_path,
                                cuda=not args.use_cpu)
                            model = gensen.GenSen(gensen_1, gensen_2)

                    vocab = gensen.build_vocab([
                        s
                        for set_name in ('targ1', 'targ2', 'attr1', 'attr2')
                        for s in encs[set_name]["examples"]
                    ])

                    model.vocab_expansion(vocab)

                    encs_targ1 = gensen.encode(model, encs["targ1"]["examples"])
                    encs_targ2 = gensen.encode(model, encs["targ2"]["examples"])
                    encs_attr1 = gensen.encode(model, encs["attr1"]["examples"])
                    encs_attr2 = gensen.encode(model, encs["attr2"]["examples"])

                elif model_name == ModelName.GUSE.value:
                    model = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")
                    if args.use_cpu:
                        kwargs = dict(device_count={'GPU': 0})
                    else:
                        kwargs = dict()
                    config = tf.ConfigProto(**kwargs)
                    config.gpu_options.per_process_gpu_memory_fraction = 0.5  # cap GPU memory use at 50%
                    config.gpu_options.allow_growth = True  # allocate GPU memory dynamically
                    with tf.Session(config=config) as session:
                        session.run([tf.global_variables_initializer(), tf.tables_initializer()])
                        def guse_encode(sents):
                            encs_node = model(sents)
                            encs = session.run(encs_node)
                            encs_d = {sents[j]: enc for j, enc in enumerate(np.array(encs).tolist())}
                            return encs_d

                        encs_targ1 = guse_encode(encs["targ1"]["examples"])
                        encs_targ2 = guse_encode(encs["targ2"]["examples"])
                        encs_attr1 = guse_encode(encs["attr1"]["examples"])
                        encs_attr2 = guse_encode(encs["attr2"]["examples"])

                elif model_name == ModelName.COVE.value:
                    load_encs_from = os.path.join(args.cove_encs, "%s.encs" % test)
                    encs = load_jiant_encodings(load_encs_from, n_header=1)

                elif model_name == ModelName.ELMO.value:
                    kwargs = dict(time_combine_method=args.time_combine_method,
                                  layer_combine_method=args.layer_combine_method)
                    encs_targ1 = elmo.encode(encs["targ1"]["examples"], **kwargs)
                    encs_targ2 = elmo.encode(encs["targ2"]["examples"], **kwargs)
                    encs_attr1 = elmo.encode(encs["attr1"]["examples"], **kwargs)
                    encs_attr2 = elmo.encode(encs["attr2"]["examples"], **kwargs)

                elif model_name == ModelName.BERT.value:
                    model, tokenizer = bert.load_model(args.bert_version)
                    encs_targ1 = bert.encode(model, tokenizer, encs["targ1"]["examples"])
                    encs_targ2 = bert.encode(model, tokenizer, encs["targ2"]["examples"])
                    encs_attr1 = bert.encode(model, tokenizer, encs["attr1"]["examples"])
                    encs_attr2 = bert.encode(model, tokenizer, encs["attr2"]["examples"])

                elif model_name == ModelName.OPENAI.value:
                    load_encs_from = os.path.join(args.openai_encs, "%s.encs" % test)
                    #encs = load_jiant_encodings(load_encs_from, n_header=1, is_openai=True)
                    encs = load_encodings(load_encs_from)
                    encs_targ1 = encs["targ1"]["encs"]
                    encs_targ2 = encs["targ2"]["encs"]
                    encs_attr1 = encs["attr1"]["encs"]
                    encs_attr2 = encs["attr2"]["encs"]

                else:
                    raise ValueError("Model %s not found!" % model_name)

                encs["targ1"]["encs"] = encs_targ1
                encs["targ2"]["encs"] = encs_targ2
                encs["attr1"]["encs"] = encs_attr1
                encs["attr2"]["encs"] = encs_attr2

                log.info("\tDone!")
                if not args.dont_cache_encs:
                    log.info("Saving encodings to %s", enc_file)
                    save_encodings(encs, enc_file)

            enc = [e for e in encs["targ1"]['encs'].values()][0]
            d_rep = enc.size if isinstance(enc, np.ndarray) else len(enc)

            # run the test on the encodings
            log.info("Running SEAT...")
            log.info("Representation dimension: {}".format(d_rep))
            esize, pval = weat.run_test(encs, n_samples=args.n_samples, parametric=args.parametric)
            results.append(dict(
                model=model_name,
                options=model_options,
                test=test,
                p_value=pval,
                effect_size=esize,
                num_targ1=len(encs['targ1']['encs']),
                num_targ2=len(encs['targ2']['encs']),
                num_attr1=len(encs['attr1']['encs']),
                num_attr2=len(encs['attr2']['encs'])))

        log.info("Model: %s", model_name)
        log.info('Options: {}'.format(model_options))
        for r in results:
            log.info("\tTest {test}:\tp-val: {p_value:.9f}\tesize: {effect_size:.2f}".format(**r))

    if args.results_path is not None:
        log.info('Writing results to {}'.format(args.results_path))
        with open(args.results_path, 'w') as f:
            writer = DictWriter(f, fieldnames=results[0].keys(), delimiter='\t')
            writer.writeheader()
            for r in results:
                writer.writerow(r)
Example #21
            resize_image(srcfile, destfile, new_width, new_height)
        return destfolder

    def get_resized_db_image_paths(
            destfolder='../data/train_images_model_resize/%s' %
        (class_folder)):
        return sorted(
            list(glob.iglob(os.path.join(destfolder, '*.[Jj][Pp][Gg]'))))

    resize_images_folder('../data/train_images_model/%s' % (class_folder))
    db_images = get_resized_db_image_paths()

    tf.reset_default_graph()
    tf.logging.set_verbosity(tf.logging.FATAL)

    m = hub.Module('https://tfhub.dev/google/delf/1')

    # The module operates on a single image at a time, so define a placeholder to feed an arbitrary image in.
    image_placeholder = tf.placeholder(tf.float32,
                                       shape=(None, None, 3),
                                       name='input_image')

    module_inputs = {
        'image': image_placeholder,
        'score_threshold': 100.0,
        'image_scales': [0.25, 0.3536, 0.5, 0.7071, 1.0, 1.4142, 2.0],
        'max_feature_num': 1000,
    }

    module_outputs = m(module_inputs, as_dict=True)
Example #22
def create_id3_embedding(videos):
    """Embeds the given videos using the Inflated 3D Convolution network.

  Downloads the graph of the I3D from tf.hub and adds it to the graph on the
  first call.

  Args:
    videos: <float32>[batch_size, num_frames, height=224, width=224, depth=3].
      Expected range is [-1, 1].

  Returns:
    embedding: <float32>[batch_size, embedding_size]. embedding_size depends
               on the model used.

  Raises:
    ValueError: when a provided embedding_layer is not supported.
  """

    batch_size = 16
    module_spec = "https://tfhub.dev/deepmind/i3d-kinetics-400/1"

    # Making sure that we import the graph separately for
    # each different input video tensor.
    module_name = "fvd_kinetics-400_id3_module_" + six.ensure_str(
        videos.name).replace(":", "_")

    assert_ops = [
        tf.Assert(
            tf.reduce_max(videos) <= 1.001,
            ["max value in frame is > 1", videos]),
        tf.Assert(
            tf.reduce_min(videos) >= -1.001,
            ["min value in frame is < -1", videos]),
        tf.assert_equal(tf.shape(videos)[0],
                        batch_size,
                        ["invalid frame batch size: ",
                         tf.shape(videos)],
                        summarize=6),
    ]
    with tf.control_dependencies(assert_ops):
        videos = tf.identity(videos)

    module_scope = "%s_apply_default/" % module_name

    # To check whether the module has already been loaded into the graph, we look
    # for a given tensor name. If this tensor name exists, we assume the function
    # has been called before and the graph was imported. Otherwise we import it.
    # Note: in theory, the tensor could exist, but have wrong shapes.
    # This will happen if create_id3_embedding is called with a frames_placeholder
    # of wrong size/batch size, because even though that will throw a tf.Assert
    # on graph-execution time, it will insert the tensor (with wrong shape) into
    # the graph. This is why we need the following assert.
    video_batch_size = int(videos.shape[0])
    assert video_batch_size in [batch_size, -1, None], "Invalid batch size"
    tensor_name = module_scope + "RGB/inception_i3d/Mean:0"
    if not _is_in_graph(tensor_name):
        i3d_model = hub.Module(module_spec, name=module_name)
        i3d_model(videos)

    # gets the kinetics-i3d-400-logits layer
    tensor_name = module_scope + "RGB/inception_i3d/Mean:0"
    tensor = tf.get_default_graph().get_tensor_by_name(tensor_name)
    return tensor
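
# A minimal usage sketch (num_frames is illustrative; the batch size must
# match the fixed batch_size of 16 above):
videos_ph = tf.placeholder(tf.float32, [16, 64, 224, 224, 3])
video_embedding = create_id3_embedding(videos_ph)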
Example #23
    def __init__(self, args):
        self.args = args

        self.num_labels = len(args.labels)
        # 读取并预处理数据
        self.data_train = self._read_data(
            os.path.join(args.dataset_path, 'answers_train.txt'))
        self.data_dev = self._read_data(
            os.path.join(args.dataset_path, 'answers_dev.txt'))
        self.data_test = self._read_data(
            os.path.join(args.dataset_path, 'answers_test.txt'))

        for data in [self.data_train, self.data_dev, self.data_test]:
            self.labels_2_one_hot(data)

        print('Number of training data:', len(self.data_train))
        print('Number of data for evaluation:', len(self.data_dev))
        print('Number of data for testing:', len(self.data_test))
        print('data_train[0:3] =')
        for i in range(3):
            print(self.data_train[i])
        print('data_dev[0:3] =')
        for i in range(3):
            print(self.data_dev[i])
        print('data_test[0:3] =')
        for i in range(3):
            print(self.data_test[i])

        with tf.name_scope('labeled_text'):
            self.label_input = tf.placeholder(tf.float32,
                                              [None, self.num_labels],
                                              name='labels')
            self.text_input = tf.placeholder(tf.string, [None], name='texts')
        self.elmo = hub.Module("https://tfhub.dev/google/elmo/3")
        self.embeddings = self.elmo(self.text_input,
                                    signature="default",
                                    as_dict=True)["default"]

        with tf.name_scope('ELMo_Classifier'):
            self.h_size = int(self.embeddings.shape[-1])  # embedding dimension

            self.W = tf.Variable(tf.truncated_normal(
                [self.h_size, self.num_labels]),
                                 name='Weights')
            self.B = tf.Variable(tf.truncated_normal([self.num_labels]),
                                 name='Bias')
            self.Z = tf.matmul(self.embeddings, self.W) + self.B

            tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES,
                                 tf.contrib.layers.l2_regularizer(
                                     self.args.lamb)(self.W))  # L2 regularization to reduce overfitting

            self.prob = tf.nn.softmax(self.Z)
            self.pred_label = tf.argmax(self.prob, 1)
            self.true_label = tf.argmax(self.label_input, 1)
            # tf.get_collection returns a list of tensors, so the L2 terms
            # must be summed with tf.add_n before being added to the loss
            self.loss = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits_v2(
                    logits=self.Z,
                    labels=self.label_input)) + tf.add_n(
                        tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
            self.op = tf.train.AdamOptimizer(
                learning_rate=self.args.learning_rate).minimize(self.loss)
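
# A hedged usage sketch, not part of the original class: run one training
# step.  `ELMoClassifier` is a hypothetical name for the class above, and
# `args`, `texts` (a list of strings), and `one_hot_labels` are hypothetical
# batch inputs.
clf = ELMoClassifier(args)
with tf.Session() as sess:
    # ELMo's vocabulary lookup tables need tables_initializer as well
    sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
    _, loss = sess.run([clf.op, clf.loss],
                       feed_dict={clf.text_input: texts,
                                  clf.label_input: one_hot_labels})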
Example #24
0
File: utils.py  Project: yondu22/jann
def embed_lines(args, unencoded_lines, output_dict,
                unencoded_lines_resps=None):
    """Embed a collection of lines to an output dictionary."""

    # Import the Universal Sentence Encoder's TF Hub module
    module = hub.Module(args.module_path, trainable=False)
    config = tf.ConfigProto(allow_soft_placement=True)

    with tf.Session(config=config) as session:
        # initialize the variables
        session.run(
          [tf.global_variables_initializer(), tf.tables_initializer()])

        if args.use_sentence_piece:
            # spm_path now contains a path to the SentencePiece
            # model stored inside the TF-Hub module
            spm_path = session.run(module(signature="spm_path"))
            sp = spm.SentencePieceProcessor()
            sp.Load(spm_path)

            # build an input placeholder
            with tf.device('/gpu:0'):
                input_placeholder = tf.sparse_placeholder(
                  tf.int64, shape=[None, None])
                embeddings = module(inputs=dict(
                  values=input_placeholder.values,
                  indices=input_placeholder.indices,
                  dense_shape=input_placeholder.dense_shape
                  )
                )

        # size of chunk is how many lines will be encoded
        # with each pass of the model
        size_of_chunk = 256

        # ensure responses were provided and that every line has one
        assert unencoded_lines_resps is not None
        assert len(unencoded_lines) == len(unencoded_lines_resps)
        all_id_chunks = get_id_chunks(
          range(len(unencoded_lines)), size_of_chunk)

        max_iter = len(unencoded_lines) // size_of_chunk
        for id_chunk in tqdm(all_id_chunks, total=max_iter):
            # get the chunk of lines and matching responses by list of ids
            chunk_unencoded_lines = [unencoded_lines[x] for x in id_chunk]
            chunk_unenc_resp = [unencoded_lines_resps[x] for x in id_chunk]

            if args.use_sentence_piece:
                # process unencoded lines to values and IDs in sparse format
                values, indices, dense_shape = process_to_IDs_in_sparse_format(
                  sp=sp, sentences=chunk_unencoded_lines)

                # run the session
                with tf.device('/gpu:0'):
                    chunk_line_embds = session.run(
                        embeddings,
                        feed_dict={
                            input_placeholder.values: values,
                            input_placeholder.indices: indices,
                            input_placeholder.dense_shape: dense_shape
                        }
                    )
            else:
                with tf.device('/gpu:0'):
                    chunk_line_embds = session.run(
                      module(chunk_unencoded_lines))

            # key each line by an MD5 hash of its text in the output dict
            for i, line_embedding in enumerate(
              np.array(chunk_line_embds).tolist()):
                if args.verbose:
                    tf.logging.info(
                        "Line: {}".format(chunk_unencoded_lines[i]))
                    tf.logging.info(
                        "Embedding size: {}".format(len(line_embedding)))
                    snippet = ", ".join((str(x) for x in line_embedding[:3]))
                    tf.logging.info(
                        "Embedding: [{}, ...]\n".format(snippet))

                # Encode a hash for the string
                hash_object = hashlib.md5(
                    chunk_unencoded_lines[i].encode('utf-8'))

                # Add a row to the dataframe
                output_dict[hash_object.hexdigest()] = {
                  'line': chunk_unencoded_lines[i],
                  'line_embedding': line_embedding,
                  'response': chunk_unenc_resp[i]
                }
    return output_dict
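
# Minimal sketches of the two helpers the function above assumes; the
# project's own implementations may differ.
def get_id_chunks(ids, size_of_chunk):
    """Yield consecutive chunks of at most `size_of_chunk` ids."""
    ids = list(ids)
    for start in range(0, len(ids), size_of_chunk):
        yield ids[start:start + size_of_chunk]


def process_to_IDs_in_sparse_format(sp, sentences):
    """Encode sentences with SentencePiece into sparse-tensor components."""
    ids = [sp.EncodeAsIds(s) for s in sentences]
    max_len = max(len(x) for x in ids)
    dense_shape = (len(ids), max_len)
    values = [item for sublist in ids for item in sublist]
    indices = [[row, col] for row in range(len(ids))
               for col in range(len(ids[row]))]
    return values, indices, dense_shape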
Example #25
0
    'BATCH_SIZE': 64,
    'TOP_K': 5,  # How many top classes should be predicted
    'INFER_PATH': './Data/ImgsResize',
    'LABEL_PATH': './Data/classes.txt'
}

dataset = Dataset(params)
print("======> dataset.image_data: ", dataset.img_data)
#module = hub.Module('https://tfhub.dev/google/imagenet/inception_v3/classification/1')
#logits = module(dict(images=dataset.img_data))
#print(logits)

#softmax = tf.nn.softmax(logits)
#top_predictions = tf.nn.top_k(softmax, top_k, name='top_predictions')

module = hub.Module(
    'https://tfhub.dev/google/imagenet/resnet_v1_50/classification/1')
#module = hub.Module('https://tfhub.dev/google/imagenet/resnet_v1_50/classification/1',
#                    trainable=True)   # Trainable is True since we are going to fine-tune the model
print("\n========> output info dict:")
for k, v in sorted(
        module.get_output_info_dict(signature='image_classification').items()):
    print('{0}: {1}'.format(k, v))

module_features = module(dict(images=dataset.img_data),
                         signature="image_classification",
                         as_dict=True)
#features = module_features["default"]
features = module_features["resnet_v1_50/block2"]

print("\n==========> features: ", features)
import keras
from keras import backend as k
from keras.models import Model
from keras.layers import Input, Dense, Activation
from keras.optimizers import Adam
from keras.metrics import categorical_crossentropy
from keras.callbacks import EarlyStopping, History
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub

elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

# Audio recording parameters
STREAMING_LIMIT = 290000
SAMPLE_RATE = 16000
CHUNK_SIZE = int(SAMPLE_RATE / 10)  # 100ms

os.environ[
    'GOOGLE_APPLICATION_CREDENTIALS'] = "/Users/chandan/GoogleServiceAccountKey/serviceACKey.json"


def get_current_time():
    return int(round(time.time() * 1000))


def duration_to_secs(duration):
    # body truncated in the source; a protobuf Duration is conventionally
    # converted as whole seconds plus the fractional nanos part
    return duration.seconds + duration.nanos / 1e9
Example #27
0
ytrain = ytrain.reshape(-1)
ytest = ytest.reshape(-1)


def _preprocess(x):
    x = preprocess_image(x, 224, 224, is_training=False, color_distort=False)
    return x


batch_size = 100
x = tf.placeholder(shape=(batch_size, 32, 32, 3), dtype=tf.float32)
x_preproc = tf.map_fn(_preprocess, x)
print(x_preproc.get_shape().as_list())

hub_path = 'gs://simclr-checkpoints/simclrv2/pretrained/r50_2x_sk1/hub/'
module = hub.Module(hub_path, trainable=False)
features = module(inputs=x_preproc, signature='default')
print(features.get_shape().as_list())

sess = tf.Session()
sess.run(tf.global_variables_initializer())
print("model loaded!")

features_train = []
for i in range(len(xtrain) // batch_size):
    x_batch = xtrain[i * batch_size:(i + 1) * batch_size]
    f = sess.run(features, feed_dict={x: x_batch})
    features_train.append(f)

features_train = np.concatenate(features_train, axis=0)
print(features_train.shape)
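
# A hedged continuation: apply the same batching to the test split,
# mirroring the training-feature loop above.
features_test = []
for i in range(len(xtest) // batch_size):
    x_batch = xtest[i * batch_size:(i + 1) * batch_size]
    features_test.append(sess.run(features, feed_dict={x: x_batch}))
features_test = np.concatenate(features_test, axis=0)
print(features_test.shape)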
Example #28
0
    all_scores = []
    for i in class_indexes:
        all_scores.append([met_labels[i], logits[i]])

    all_scores.sort(key=lambda tup: tup[1], reverse=True)
    return all_scores


met_labels = np.genfromtxt("imetv1_labelmap.csv",
                           delimiter=',',
                           dtype='str',
                           usecols=[1],
                           skip_header=True)

module = hub.Module(
    "https://tfhub.dev/metmuseum/vision/classifier/imet_attributes_V1/1")
print("Eingabedimension vom Bild: H:{} px X W:{} px ".format(
    hub.get_expected_image_size(module)[0],
    hub.get_expected_image_size(module)[1]))

# Here we get additional information about the module from TensorFlow Hub
# Inputs
print(module.get_input_info_dict())
# Outputs
print(module.get_output_info_dict())

# Input image
input_image = plt.imread("DT11140.jpg")
input_image = input_image.astype(np.float32)[np.newaxis, ...] / 255.
input_image = tf.image.resize(input_image, (299, 299))
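
# A hedged sketch of classifying the prepared image (assumptions: the
# module's default signature maps a batch of images to logits, and the
# ranking mirrors the truncated scoring helper at the top of this example).
output = module(input_image)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    logits = sess.run(output)[0]
top5 = sorted(zip(met_labels, logits), key=lambda t: t[1], reverse=True)[:5]
print(top5)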
Example #29
0
import torch  # needed below for torch.from_numpy
from logging import getLogger

logger = getLogger(__name__)


def load_tf_weights_in_bert_generation(model,
                                       tf_hub_path,
                                       model_class,
                                       is_encoder_named_decoder=False,
                                       is_encoder=False):
    try:
        import numpy as np
        import tensorflow.compat.v1 as tf

        import tensorflow_hub as hub
        import tensorflow_text  # noqa: F401

        tf.disable_eager_execution()
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_model = hub.Module(tf_hub_path)
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        init.run()
        all_variables = tf_model.variable_map
        keep_track_variables = all_variables.copy()
        for key in list(all_variables.keys()):
            if "global" in key:
                logger.info(f"Skipping {key}...")
                continue
            if not is_encoder:
                model_pointer = getattr(model, model_class)
            else:
                model_pointer = model
            is_embedding = False
            logger.info(f"Trying to match {key}...")
            # drop the leading scope prefix, e.g. "module/bert/"
            sub_layers = key.split("/")[2:]
            if is_encoder_named_decoder and sub_layers[0] == "encoder":
                logger.info(f"Skipping encoder layer {key} for decoder")
                continue
            if is_encoder and sub_layers[0] == "decoder":
                logger.info(f"Skipping decoder layer {key} for encoder")
                continue
            for i, sub_layer in enumerate(sub_layers):
                if sub_layer == "embeddings":
                    is_embedding = True
                elif sub_layer == "LayerNorm":
                    is_embedding = False
                if "layer" in sub_layer:
                    model_pointer = model_pointer.layer[int(
                        sub_layer.split("_")[-1])]
                elif sub_layer in ["kernel", "gamma"]:
                    model_pointer = model_pointer.weight
                elif sub_layer == "beta":
                    model_pointer = model_pointer.bias
                elif sub_layer == "encdec":
                    model_pointer = model_pointer.crossattention.self
                elif sub_layer == "encdec_output":
                    model_pointer = model_pointer.crossattention.output
                elif is_encoder_named_decoder and sub_layer == "decoder":
                    model_pointer = model_pointer.encoder
                else:
                    if sub_layer == "attention" and "encdec" in sub_layers[i +
                                                                           1]:
                        continue
                    try:
                        model_pointer = getattr(model_pointer, sub_layer)
                    except AttributeError:
                        logger.info(
                            f"Skipping to initialize {key} at {sub_layer}...")
                        raise AttributeError

            array = np.asarray(sess.run(all_variables[key]))
            if not is_embedding:
                logger.info(
                    f"Transposing numpy weight of shape {array.shape} for {key}")
                array = np.transpose(array)
            else:
                model_pointer = model_pointer.weight

            try:
                assert (
                    model_pointer.shape == array.shape
                ), f"Pointer shape {model_pointer.shape} and array shape {array.shape} mismatched"
            except AssertionError as e:
                e.args += (model_pointer.shape, array.shape)
                raise
            logger.info(f"Initialize PyTorch weight {key}")

            model_pointer.data = torch.from_numpy(array.astype(np.float32))
            keep_track_variables.pop(key, None)

        logger.info("Weights not copied to PyTorch model: {}".format(", ".join(
            keep_track_variables.keys())))
        return model
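
# A hedged usage sketch: loading TF-Hub BERT weights into a PyTorch encoder.
# The hub path and flag values are illustrative assumptions, not the only
# valid inputs.
from transformers import BertGenerationConfig, BertGenerationEncoder

model = BertGenerationEncoder(BertGenerationConfig())
model = load_tf_weights_in_bert_generation(
    model,
    tf_hub_path="https://tfhub.dev/google/bertseq2seq/bert24_en_de/1",
    model_class="bert",
    is_encoder=True)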
Example #30
0
# Check how many iterations we will do.
max_iter = len(file_list) // batch_size + 1
'''
Build the graph, which essentially only holds the module.
Note that in the graph the module operates on a placeholder:
  since we do not know how many images we will process at a time,
  we set the first dimension of the shape to None.
  This placeholder will later be fed with images.

Module_256 is the module from TensorFlow Hub;
  it emits a 256-dimensional feature vector called `features`.
'''
tf.reset_default_graph()
module = hub.Module(module_url)
images = tf.placeholder(shape=[None, 224, 224, 3],
                        dtype=tf.float32,
                        name='input')
features = module(images)

init_op = tf.global_variables_initializer()

times = []

with tf.Session() as sess:
    sess.run(init_op)

    # Finalize the graph so that we do not accidentally extend it.
    sess.graph.finalize()
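
    # A hedged continuation of the loop the setup above implies:
    # `load_batch` is a hypothetical helper that reads a slice of
    # `file_list` into a [batch, 224, 224, 3] float array, and the
    # `time` module is assumed to be imported.
    for i in range(max_iter):
        batch_files = file_list[i * batch_size:(i + 1) * batch_size]
        if not batch_files:
            break
        start = time.time()
        batch_features = sess.run(features,
                                  feed_dict={images: load_batch(batch_files)})
        times.append(time.time() - start)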