Exemplo n.º 1
0
def make_embedding(CV, MODEL, DATA, EMBED):
    """Compute node embeddings with a trained GCN and save them for one fold.

    A four-layer ``GraphConv`` stack (512-256-128-64 units) is rebuilt
    without a task head, the first eight weight arrays of the model stored at
    ``MODEL`` are copied into it, and the output of the last (64-unit,
    linear) layer is passed to ``save_embedding``.

    Args:
        CV: Fold identifier; selects the ``FOLD-{CV}`` sub-directory of DATA.
        MODEL: Location of the saved model handed to ``load_model``.
        DATA: Dataset directory as a string (the fold suffix is appended
            with ``+``).
        EMBED: Output directory for the embeddings; created when missing.
    """
    DATA_FOLD = DATA + f"/FOLD-{CV}"
    # exist_ok avoids the check-then-create race of the previous
    # `if not exists: mkdir` and also creates missing parent directories.
    os.makedirs(EMBED, exist_ok=True)

    # Labels are only needed for the task head, which is not rebuilt here.
    graph, features, _ = load_dataset(DATA, DATA_FOLD)
    fltr = GraphConv.preprocess(graph).astype('f4')
    fltr = ops.sp_matrix_to_sp_tensor(fltr)

    X_in = Input((features.shape[1], ))
    fltr_in = Input((features.shape[0], ), sparse=True)

    # Three hidden relu layers, each followed by dropout ...
    hidden = X_in
    for units in (512, 256, 128):
        hidden = GraphConv(units, 'relu', True,
                           kernel_regularizer=l2(5e-4))([hidden, fltr_in])
        hidden = Dropout(0.5)(hidden)
    # ... and a linear layer whose output is the node representation.
    node_repr = GraphConv(64, 'linear', True,
                          kernel_regularizer=l2(5e-4))([hidden, fltr_in])

    loaded_model = load_model(f"{MODEL}")
    model_without_task = Model(inputs=[X_in, fltr_in], outputs=node_repr)
    # First eight arrays of the saved model - presumably kernel + bias of the
    # four GraphConv layers; the remaining ones would belong to the task head.
    model_without_task.set_weights(loaded_model.get_weights()[:8])

    final_node_representations = model_without_task([features, fltr],
                                                    training=False)
    save_embedding(final_node_representations, EMBED, DATA_FOLD, CV)
def get_image():
    """Handle an image upload: crop the face, embed it and store both.

    The image is read from the ``file`` field of the POST request, a face is
    cropped from it with ``get_face``, the crop is fed through the FaceNet
    model (``forward_pass``) and both the cropped image and its embedding
    are saved:

        'uploads' folder: for image files
        'embeddings' folder: for embedding numpy files.

    Returns:
        A rendered template: ``warning.html`` for malformed requests
        (wrong HTTP method, missing field, empty filename, disallowed file
        type), ``upload_result.html`` otherwise.
    """
    if request.method != 'POST':
        return render_template("warning.html", status="POST HTTP method required!")

    if 'file' not in request.files:
        return render_template("warning.html", status="No 'file' field in POST request!")

    upload = request.files['file']
    filename = upload.filename

    if filename == "":
        return render_template("warning.html", status="No selected file!")

    if not (upload and allowed_file(filename=filename, allowed_set=allowed_set)):
        # This case used to fall through and return None, which is not a
        # valid view response.
        return render_template("warning.html", status="File type not allowed!")

    filename = secure_filename(filename=filename)
    # Read image file as numpy array of RGB dimension
    img = imread(name=upload, mode='RGB')
    # Detect and crop a 160 x 160 image containing a human face in the image file
    img = get_face(img=img, pnet=pnet, rnet=rnet, onet=onet, image_size=image_size)

    # No human face was detected
    if img is None:
        return render_template(
            "upload_result.html",
            status="Image upload was unsuccessful! No human face was detected!"
        )

    embedding = forward_pass(
        img=img,
        session=facenet_persistent_session,
        images_placeholder=images_placeholder,
        embeddings=embeddings,
        phase_train_placeholder=phase_train_placeholder,
        image_size=image_size
    )
    # Save cropped face image to 'uploads/' folder
    save_image(img=img, filename=filename, uploads_path=uploads_path)
    # Remove file extension from image filename for numpy file storage being based on image filename
    filename = remove_file_extension(filename=filename)
    # Save embedding to 'embeddings/' folder
    save_embedding(embedding=embedding, filename=filename, embeddings_path=embeddings_path)

    return render_template(
        "upload_result.html",
        status="Image uploaded and embedded successfully!"
    )
Exemplo n.º 3
0
def get_image():
    """Handle an image upload: crop the face, embed it and store both.

    The image is read from the ``file`` field of the POST request, a face is
    cropped from it with ``get_face`` and fed through ``forward_pass``; the
    cropped image and its embedding are then saved with ``save_image`` and
    ``save_embedding``.

    Returns:
        A plain-text message for malformed requests (wrong HTTP method,
        missing field, empty filename, disallowed file type), otherwise the
        rendered ``upload_result.html`` template.
    """
    if request.method != 'POST':
        return "POST HTTP method required!"

    if 'file' not in request.files:
        return "No file part"

    upload = request.files['file']
    filename = upload.filename

    if filename == "":
        return "No selected file"

    if not (upload and allowed_file(filename=filename, allowed_set=allowed_set)):
        # This case used to fall through and return None, which is not a
        # valid view response.
        return "File type not allowed"

    # NOTE(review): the client-supplied filename is passed to save_image /
    # save_embedding without sanitising (no secure_filename here) - confirm
    # those helpers guard against path traversal.

    # Read image file as numpy array of RGB dimension
    img = io.imread(fname=upload, mode='RGB')
    # Detect and crop a 160 x 160 image containing a human face in the image file
    img = get_face(img=img,
                   pnet=pnet,
                   rnet=rnet,
                   onet=onet,
                   image_size=image_size)

    # No human face was detected
    if img is None:
        return render_template(
            "upload_result.html",
            status=
            "Image upload was unsuccessful! No human face was detected."
        )

    embedding = forward_pass(
        img=img,
        session=facenet_persistent_session,
        images_placeholder=images_placeholder,
        embeddings=embeddings,
        phase_train_placeholder=phase_train_placeholder,
        image_size=image_size)
    # Save cropped face image to 'uploads/' folder
    save_image(img=img,
               filename=filename,
               uploads_path=uploads_path)
    # Remove file extension from image filename for numpy file storage being based on image filename
    filename = remove_file_extension(filename=filename)
    # Save embedding to 'embeddings/' folder
    save_embedding(embedding=embedding,
                   filename=filename,
                   embeddings_path=embeddings_path)

    return render_template(
        "upload_result.html",
        status="Image uploaded and embedded successfully!")
Exemplo n.º 4
0
def main(args):
    """
    Characteristic function embedding wrapper.

    Fits a node-level (``FEATHER``) or graph-level (``FEATHER-G``) model,
    depending on ``args.model_type``, and saves the resulting embedding to
    ``args.output``.

    :param args: Arguments object parsed up. ``model_type`` and ``output``
        are always read; ``graph_input`` and ``feature_input`` are read for
        "FEATHER", ``graphs_input`` for "FEATHER-G".
    :raises SystemExit: If ``args.model_type`` is not one of the two
        supported values.
    """
    if args.model_type == "FEATHER":
        print("\nFitting a node embedding.\n")
        graph = load_graph(args.graph_input)
        features = load_features(args.feature_input)
        model = FEATHER()
        model.fit(graph, features)
    elif args.model_type == "FEATHER-G":
        print("\nFitting a graph level embedding.\n")
        graphs = load_graphs(args.graphs_input)
        model = FEATHERG()
        model.fit(graphs)
    else:
        # quit() is an interactive-session helper that exited silently with a
        # success status; still exit, but say why and signal failure.
        raise SystemExit(
            f"Unknown model type {args.model_type!r}; "
            "expected 'FEATHER' or 'FEATHER-G'."
        )
    X = model.get_embedding()
    save_embedding(X, args.output)
Exemplo n.º 5
0
def main(args):
    """Embed the loaded graph with the selected model and store the result.

    Loads the data from ``args.graphdir``, runs the embedding routine chosen
    by ``args.model`` (``'random'``, ``'lsa'``, ``'deepwalk'`` or
    ``'gcn_cv_sc'``; any other value raises ``KeyError``), dumps ``args``
    to the args file and saves labels and embedding, both inside
    ``args.out``.
    """
    print("Loading data...")
    data = load_data(
        args.graphdir,
        supervised=False,
        with_authors=args.use_authors,
        collate_coauthorship=(not args.first_class_authors),
        undirected=True,
    )

    # Dispatch table: model name -> embedding routine
    embedders = {
        'random': embed_random,
        'lsa': embed_lsa,
        'deepwalk': embed_deepwalk,
        'gcn_cv_sc': embed_control_variate
    }
    embed = embedders[args.model]
    labels, embedding = embed(args, data)

    # Record the arguments of this run next to its output
    args_path = os.path.join(args.out, MDS['args_file'])
    with open(args_path, 'w') as handle:
        print(args, file=handle)

    target_path = os.path.join(args.out, MDS['embedding_file'])
    save_embedding(labels, embedding, target_path)
Exemplo n.º 6
0
def _tag_path_edges(embedding_mesh, paths, selected=None):
    '''
    Tag edges of embedding_mesh which are traversed by vertex paths.

    Each item of `paths` is a sequence of vertex indices of embedding_mesh;
    every pair of consecutive vertices must be an edge of the mesh (KeyError
    otherwise). Edges on the i-th path (counting from 1) get tag i. With
    `selected` given only paths with truthy selected[i-1] are processed.

    Returns (edge_lookup, edge_f, paths_as_edges): mapping from sorted
    vertex pair to edge index, the size_t edge function holding the tags
    and, for each processed path, the list of its edge indices.
    '''
    embedding_mesh.init(1, 0)
    e2v = embedding_mesh.topology()(1, 0)
    edge_lookup = {tuple(sorted(e2v(e))): e for e in range(embedding_mesh.num_entities(1))}

    edge_f = df.MeshFunction('size_t', embedding_mesh, 1, 0)
    paths_as_edges = []
    for tag, path in enumerate(paths, 1):
        if selected is not None and not selected[tag-1]:
            continue
        the_edge = []
        for e in zip(path[:-1], path[1:]):
            edge_index = edge_lookup[tuple(sorted(e))]
            # assert edge_f[edge_index] == 0  # Never seen
            edge_f[edge_index] = tag
            the_edge.append(edge_index)
        paths_as_edges.append(the_edge)

    return edge_lookup, edge_f, paths_as_edges


def point_embed_mesh1d(model, mesh1d, bounding_shape, **kwargs):
    '''
    Embed points of mesh1d into Xd bounding shape. An attempt is made
    to insert intermediate points so that also edges are embedded.

    At most `niters` times a mesh embedding the current points is generated
    and edges of mesh1d which are not edges of that mesh get auxiliary
    points inserted. If some edges are still not embedded afterwards they
    are approximated by paths of existing mesh edges (non-conforming
    embedding).

    Keyword arguments read here (the whole dict is also forwarded to
    _embed_points):
      niters         - optional, max number of refinement sweeps (default 5)
      save_geo       - required; if truthy it is used as base name which
                       gets the iteration number appended
      debug          - required; if truthy the gmsh GUI is run in the last
                       iteration
      monitor        - required; if truthy it is a directory to which
                       per-iteration diagnostics are written
      save_embedding - required; if truthy it is passed as the second
                       argument of utils.save_embedding

    Returns utils.LineMeshEmbedding. As a side effect 'foo.pvd' and
    'foo_final.pvd' are written to the current directory.
    '''
    x = mesh1d.coordinates()

    # Debugging output: cells of mesh1d tagged by their (1-based) index
    foo = df.MeshFunction('size_t', mesh1d, 1, 0)
    foo.array()[:] = np.arange(1, 1 + mesh1d.num_cells())
    df.File('foo.pvd') << foo

    mesh1d.init(1, 0)
    e2v = mesh1d.topology()(1, 0)
    topology = [list(e2v(e)) for e in range(mesh1d.num_entities(1))]

    target_l = trim.edge_lengths(mesh1d).vector().get_local()

    converged, nneeds = False, [mesh1d.num_cells()]
    niters = kwargs.get('niters', 5)
    base_geo = kwargs['save_geo']
    for k in range(niters):
        # Some mesh which embeds points but where these points are not
        # necessarily edges
        if base_geo:
            kwargs['save_geo'] = '_'.join([base_geo, str(k)])
        t = utils.Timer('%d-th iteration of %d point embedding' % (k, len(x)), 1)
        embedding_mesh, vmap = _embed_points(model, x, bounding_shape, **kwargs)
        t.done()

        assert _embeds_points(embedding_mesh, x, vmap)
        # See which edges need to be improved
        needs_embedding = _not_embedded_edges(topology, vmap, embedding_mesh)
        # Count explicitly; len(filter(...)) only works where filter
        # returns a list (Python 2)
        nneeds.append(sum(1 for flag in needs_embedding if flag))
        utils.print_green(' ', '# edges need embedding %d (was %r)' % (nneeds[-1], nneeds[:-1]))
        converged = not any(needs_embedding)

        if kwargs['debug'] and k == niters - 1:
            gmsh.fltk.initialize()
            gmsh.fltk.run()

        # Here's some debugging functionality which saves progress on embedding
        if kwargs['monitor']:
            # Force current mesh1d embedding
            help_topology = _force_embed_edges(deepcopy([list(vmap[edge]) for edge in topology]),
                                               embedding_mesh,
                                               needs_embedding,
                                               defaultdict(list))
            # And see about the length of edges under that embedding
            new_l = _edge_lengths(embedding_mesh.coordinates(),
                                  help_topology, needs_embedding)
            # NOTE(review): normalized by new_l here but by target_l in the
            # final fallback below, and the printed maximum is that of new_l
            # itself - confirm which is intended
            np.savetxt(os.path.join(kwargs['monitor'], 'length_diff_iter%d.txt' % k), (new_l-target_l)/new_l)
            utils.print_green(' ', 'Max relative length error', np.max(new_l))

            # And distance
            new_d = _edge_distances(embedding_mesh.coordinates(),
                                    help_topology, needs_embedding)
            np.savetxt(os.path.join(kwargs['monitor'], 'distance_diff_iter%d.txt' % k), new_d)
            utils.print_green(' ', 'Max relative distance error', np.max(new_d))

            old_l = target_l.sum()
            new_l = new_l.sum()
            utils.print_green(' ', 'Target %g, Current %g, Relative Error %g' % (old_l, new_l, (new_l-old_l)/old_l))

            # Save the edges which needed embedding
            _, edge_f, _ = _tag_path_edges(embedding_mesh, help_topology, needs_embedding)
            df.File(os.path.join(kwargs['monitor'], 'need_embedding_iter%d.pvd' % k)) << edge_f

        if converged:
            break

        # Insert auxiliary points and retry
        t = utils.Timer('%d-th iteration of point insert' % k, 1)
        x, topology = _embed_edges(topology, x, needs_embedding)
        t.done()
        assert len(topology) == mesh1d.num_cells()
        utils.print_green(' ', '# num points increased to %d' % len(x))

    skew_embed_vertex = defaultdict(list)
    # We capitulate and make approximations;
    if not converged:
        utils.print_red(' ', 'Falling back to non-conforming `embedding`')
        if base_geo:
            kwargs['save_geo'] = '_'.join([base_geo, str(niters)])

        embedding_mesh, vmap = _embed_points(model, x, bounding_shape, **kwargs)
        assert _embeds_points(embedding_mesh, x, vmap)

        needs_embedding = _not_embedded_edges(topology, vmap, embedding_mesh)
        # We "embed" the mesh using __only__ existing vertices - translate topology
        topology = [list(vmap[edge]) for edge in topology]
        # An edge that needs embedding is a branch with terminal vertices - so the
        # idea is to insert the interior path vertices
        t = utils.Timer('Force embedding edges', 1)
        topology = _force_embed_edges(topology, embedding_mesh, needs_embedding, skew_embed_vertex)
        t.done()

        if kwargs['monitor']:
            # And see about the length of edges under that embedding
            new_l = _edge_lengths(embedding_mesh.coordinates(), topology, needs_embedding)
            np.savetxt(os.path.join(kwargs['monitor'], 'length_diff_final.txt'), (new_l-target_l)/target_l)
            utils.print_green(' ', 'Max relative length error', np.max(new_l))

            # And distance
            new_d = _edge_distances(embedding_mesh.coordinates(), topology, needs_embedding)
            np.savetxt(os.path.join(kwargs['monitor'], 'distance_diff_final.txt'), new_d)
            utils.print_green(' ', 'Max relative distance error', np.max(new_d))

            old_l = target_l.sum()
            new_l = new_l.sum()
            utils.print_green(' ', 'Target %g, Current %g, Relative Error %g' % (old_l, new_l, (new_l-old_l)/old_l))

            # Save the edges which needed embedding
            _, edge_f, _ = _tag_path_edges(embedding_mesh, topology, needs_embedding)
            df.File(os.path.join(kwargs['monitor'], 'need_embedding_final.pvd')) << edge_f
    else:
        # Since the original 1d mesh likely has been changed we give
        # topology wrt to node numbering of the embedding mesh
        topology = [list(vmap[edge]) for edge in topology]
    assert len(topology) == mesh1d.num_cells()

    t = utils.Timer('Fishing for edges', 1)
    # Need to color the edge function;
    edge_lookup, edge_f, topology_as_edge = _tag_path_edges(embedding_mesh, topology)

    # Finally encode skew edges as edges. Lists are built eagerly so that
    # the result can be traversed repeatedly (a lazy map could be consumed
    # only once on Python 3)
    skew_embed_edge = {
        k: [[edge_lookup[tuple(sorted(e))] for e in zip(path[:-1], path[1:])]
            for path in edge_as_vertex]
        for k, edge_as_vertex in skew_embed_vertex.items()
    }
    t.done()

    df.File('foo_final.pvd') << edge_f

    ans = utils.LineMeshEmbedding(embedding_mesh,
                                  # The others were not part of original data
                                  vmap[:mesh1d.num_vertices()],
                                  edge_f,
                                  utils.EdgeMap(topology, topology_as_edge),
                                  utils.EdgeMap(skew_embed_vertex, skew_embed_edge))

    if kwargs['save_embedding']:
        utils.save_embedding(ans, kwargs['save_embedding'])

    return ans
Exemplo n.º 7
0
# Embedding layer initialised from the pre-computed embedding matrix; kept in
# its own variable because its weights are exported after training.
embedding = Embedding(vocab_size,
                      embedding_size,
                      input_length=max_len,
                      weights=[embedding_matrix])

# Classifier: embedding -> flatten -> 100 sigmoid units -> 1 sigmoid output,
# with dropout (0.2) in front of both dense layers.
model = Sequential([
    embedding,
    Flatten(),
    Dropout(0.2),
    Dense(100, activation='sigmoid'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])

model.summary()
model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
model.fit(
    data,
    labels,
    epochs=100,
    verbose=1,
    batch_size=32,
    shuffle=True,
    validation_data=(test_data, test_labels),
)

# Export the trained embedding weights and plot them
save_embedding('glove-embedding_labeled.txt',
               embedding.get_weights()[0],
               vocab)
tsne_plot(embedding,
          vocab,
          figure_name='glove-embedding_labeled',
          max_words=200,
          pos=['ADJ', 'VERB', 'NOUN'])
Exemplo n.º 8
0
def test_mlp(learning_rate=0.01, L2_reg=0.00000001,  n_epochs=2000,
             dataset='theano.join.data', ref_dataset = 'ref.theano.join.data', batch_size=10000, max_iter = 5000,
             output='theano.model.out', validation_freq = 100, ada_epsilon = 0.000001, alpha_share = 0.9, reg_join = 10, map_file = "labels.mapping",
             bidict_file = 'theano.en.sv.translation', english_file = ''):
    """
    Train the MLP jointly on a target and a reference dataset.

    The cost is a weighted sum of the negative log likelihoods on the two
    datasets plus two L2 penalties. Parameters are updated with AdaGrad;
    training uses patience based early stopping and the model is saved
    whenever the validation error improves.

    :type learning_rate: float
    :param learning_rate: base step size of the AdaGrad update

    :type L2_reg: float
    :param L2_reg: weight of ``classifier.L2_sqr`` in the cost

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path of the target data passed to ``load_data``

    :type ref_dataset: string
    :param ref_dataset: path of the reference data passed to ``load_data``

    :type batch_size: int
    :param batch_size: number of samples per minibatch

    :type max_iter: int
    :param max_iter: training stops once the iteration count exceeds this

    :type output: string
    :param output: destination passed to ``save_model``

    :type validation_freq: int
    :param validation_freq: validate every this many iterations

    :type ada_epsilon: float
    :param ada_epsilon: constant added under the square root in AdaGrad

    :type alpha_share: float
    :param alpha_share: weight in [0, 1] of the target likelihood; the
    reference likelihood gets ``1 - alpha_share``

    :type reg_join: float
    :param reg_join: weight of ``classifier.reg_L2_sqr`` in the cost

    :type map_file: string
    :param map_file: file passed to ``load_mapping_matrix``

    :type bidict_file: string
    :param bidict_file: file passed to ``load_translation_vector``

    :type english_file: string
    :param english_file: if non-empty, both embedding matrices are passed to
    ``save_embedding`` with this name on every validation improvement

    :raises ValueError: if ``alpha_share`` is outside [0, 1]
    """
    # Local import; time.clock() used previously was removed in Python 3.8
    import timeit

    print(" Learning with params : ")
    # Was `learn_rate`, an undefined name raising NameError on entry
    print(" Learning rate : " + str(learning_rate))
    print(" Regularlization params : " + str(L2_reg))
    print(" Alpha  of tieing together : " + str(alpha_share))
    print(" Batch size : " + str(batch_size))
    print(" Max Iter : " + str(max_iter))
    print(" Evaluation frequency  : " + str(validation_freq))

    # Fail before any expensive loading; the lower bound promised by the
    # message is now enforced as well
    if not 0 <= alpha_share <= 1:
        raise ValueError(" Value of Alpha must be [0,1] ")

    print('... loading data ')

    ##### LOAD DATASET ORIGINAL and REF ##############
    print(' ----> load the mapping matrix ')
    mapping_matrix = load_mapping_matrix(map_file)

    print(' ----> load translation vectors  ')
    ref_tras_idx, tras_idx = load_translation_vector(bidict_file)

    print(' ----> load the original data ')
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    E = datasets[2]
    W1 = datasets[3]
    B1 = datasets[4]
    W2 = datasets[5]
    print(' ----> load the ref data ')
    ref_datasets = load_data(ref_dataset)
    ref_train_set_x, ref_train_set_y = ref_datasets[0]
    refE = ref_datasets[2]
    refW1 = ref_datasets[3]
    refB1 = ref_datasets[4]
    refW2 = ref_datasets[5]

    # compute number of minibatches for training and validation
    # (the x variables are presumably casts of shared variables, hence the
    # detour through .owner.inputs[0] - TODO confirm against load_data)
    n_train = train_set_x.owner.inputs[0].get_value(borrow=True).shape[0]
    n_ref_train = ref_train_set_x.owner.inputs[0].get_value(borrow=True).shape[0]
    n_valid = valid_set_x.owner.inputs[0].get_value(borrow=True).shape[0]

    # Floor division keeps the counts integral on Python 3 as well
    n_train_batches = n_train // batch_size
    n_ref_train_batches = n_ref_train // batch_size
    n_valid_batches = n_valid // batch_size

    # A trailing partial batch is used only if it has more than 100 samples
    if n_train % batch_size > 100:
        n_train_batches += 1
    if n_valid % batch_size > 100:
        n_valid_batches += 1

    print('Training batches : ' + str(n_train_batches))
    print('Ref training batches : ' + str(n_ref_train_batches))
    print('Valid batches : ' + str(n_valid_batches))

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    ref_index = T.lscalar()  # index to a reference [mini]batch

    x = T.imatrix('x')  # target inputs
    xref = T.imatrix('xref')  # reference inputs
    yref = T.ivector('yref')  # reference labels, 1D vector of [int] labels
    y = T.ivector('y')  # target labels, 1D vector of [int] labels
    rng = numpy.random.RandomState(1234)

    ###### DROP OUT RATE #############
    dropout_rate_hidden = 0.5
    dropout_rate_visible = 0.2
    #############################

    # construct the MLP class
    classifier = MLP(rng,
        input=x,
        refInput=xref,
        E=E,
        W1=W1,
        B1=B1,
        W2 = W2,
        refE = refE,
        refW1 = refW1,
        refB1 = refB1,
        refW2 = refW2,
        mapping = mapping_matrix,
        drop_out_rate=dropout_rate_hidden,
        drop_out_embedding_rate=dropout_rate_visible,
        ref_tras_idx = ref_tras_idx,
        tras_idx =  tras_idx,
    )

    train_errors = (classifier.errors(y))

    cost = (
         alpha_share * classifier.negative_log_likelihood(y)
        + (1 - alpha_share) * classifier.refNegative_log_likelihood(yref)
        + L2_reg * classifier.L2_sqr
        + reg_join * classifier.reg_L2_sqr
    )

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a validation minibatch
    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],   # x,y here is symbolic variable
            y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    # compute the gradient of cost with respect to theta
    gparams = [T.grad(cost, param) for param in classifier.params]

    # AdaGrad: accumulate squared gradients and scale each step by the
    # inverse square root of the accumulator
    updates = OrderedDict()
    for accugrad, param, gparam in zip(classifier._accugrads, classifier.params, gparams):
        agrad = accugrad + gparam * gparam
        dx = - (learning_rate / T.sqrt(agrad + ada_epsilon)) * gparam
        updates[param] = param + dx
        updates[accugrad] = agrad

    # compiling a Theano function `train_model` that returns the cost, but
    # in the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index,ref_index],
        outputs=(cost, train_errors),
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],   # x,y here is symbolic variable
            y: train_set_y[index * batch_size: (index + 1) * batch_size],
            xref: ref_train_set_x[ref_index * batch_size: (ref_index + 1) * batch_size],
            yref: ref_train_set_y[ref_index * batch_size: (ref_index + 1) * batch_size]
        }
    )

    ###############
    # TRAIN MODEL #
    ###############
    print('... training ')

    # early-stopping parameters
    patience = 2000  # run at least this many iterations
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    # check the network on the validation set every this many minibatches
    validation_frequency = validation_freq

    best_validation_loss = numpy.inf
    best_iter = 0
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False
    ref_batch_idx = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            (minibatch_avg_cost, minibatch_avg_error) = train_model(minibatch_index, ref_batch_idx)
            # Reference batches are cycled independently of target batches
            ref_batch_idx += 1
            if ref_batch_idx >= n_ref_train_batches:
                ref_batch_idx = 0

            # iteration number = number of parameter updates done before
            # this one
            iteration = (epoch - 1) * n_train_batches + minibatch_index
            print(' Iteration :  ' + str(iteration) + ' with Cost  (join) = ' + str(minibatch_avg_cost) + '  with errors (target only) = ' + str(minibatch_avg_error))

            if (iteration + 1) % validation_frequency == 0:
                validation_losses = [validate_model(i) for i
                                     in range(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)

                print(
                    'epoch %i, minibatch %i/%i, validation error %f %%' %
                    (
                        epoch,
                        minibatch_index ,
                        n_train_batches,
                        this_validation_loss * 100.
                    )
                )

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if (this_validation_loss < best_validation_loss * improvement_threshold):
                        patience = max(patience, iteration * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iteration
                    # Save the model. Weights are rescaled by (1 - rate) of
                    # the dropout applied to their layer's input - presumably
                    # to get inference-time weights; TODO confirm
                    save_model(output,classifier.embeddingLayer.E.get_value(),
                                       (classifier.dropout_HiddenLayer.W.get_value() * (1-dropout_rate_visible )).T,
                                       classifier.dropout_HiddenLayer.b.get_value(),
                                       (classifier.dropout_LogRegressionLayer.W.get_value() * (1- dropout_rate_hidden)).T)

                    # The default is '' (not None), so test truthiness:
                    # nothing is written unless a file name was given
                    if english_file:
                        save_embedding(english_file,classifier.refEmbeddingLayer.E.get_value(),classifier.embeddingLayer.E.get_value())

            # stop when out of patience or past max_iter
            if (patience <= iteration) or (iteration > max_iter):
                done_looping = True
                break

    end_time = timeit.default_timer()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i') %
          (best_validation_loss * 100., best_iter + 1))
    sys.stderr.write('The code for file ' +
                     os.path.split(__file__)[1] +
                     ' ran for %.2fm' % ((end_time - start_time) / 60.) +
                     '\n')
# Skip-gram style training script: one embedding matrix is shared by target
# and context words, their dot product is fed to a single sigmoid unit, and
# the trained embedding weights are saved at the end.
vocab, couples, labels = process_data(train_filename, window_size)


# generate model
vocab_size = len(vocab)
word_target, word_context = zip(*couples)

input_target = Input((1,))
input_context = Input((1,))

# The same layer instance is applied to both inputs, so weights are shared
embedding = Embedding(vocab_size, embedding_size, input_length=1)
target = embedding(input_target)
target = Reshape((embedding_size,))(target)
context = embedding(input_context)
context = Reshape((embedding_size,))(context)

dot_product = dot([target, context], 1)
dot_product = Reshape((1,))(dot_product)

output = Dense(1, activation='sigmoid')(dot_product)

# Keras 2 keyword names; `input=`/`output=` are the legacy Keras 1 spelling
# while the rest of this script already uses the Keras 2 API
model = Model(inputs=[input_target, input_context], outputs=output)
model.summary()
model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

# NOTE(review): steps_per_epoch is set to batch_size, i.e. the number of
# batches per epoch equals the batch size - confirm this is intended rather
# than a count derived from the number of training couples.
model.fit_generator(batch_generator(word_target, word_context, labels, batch_size),
                    steps_per_epoch=batch_size, epochs=epochs)

save_embedding('skipgram-embedding_labeled.txt',
               embedding.get_weights()[0], vocab)
Exemplo n.º 10
0
def line_embed_mesh1d(model, mesh1d, bounding_shape, **kwargs):
    '''
    Embed edges of mesh1d as lines in a mesh of the bounding shape.

    Vertices and edges of mesh1d are added to the gmsh model as points
    and lines, the lines are embedded in the entity created by
    bounding_shape and the mesh is generated and converted. Each edge of
    mesh1d is then encoded in terms of vertices and edges of the new mesh.

    Keyword arguments `debug`, `save_geo`, `save_msh` and `save_embedding`
    are required (each may be falsy to switch the feature off). With truthy
    `return_mesh_only` just the converted mesh is returned, otherwise
    the result is utils.LineMeshEmbedding.
    '''
    model_timer = utils.Timer('Line embedding model definition', 1)
    npoints, tdim = mesh1d.coordinates().shape
    # Figure out how to bound it
    counts = bounding_shape.create_volume(model, mesh1d.coordinates())

    # In gmsh Point(4) will be returned as fourth node; 2d points are padded
    # by zero third coordinate
    if tdim == 2:
        point_tags = [model.geo.addPoint(*np.r_[xi, 0])
                      for xi in mesh1d.coordinates()]
    else:
        point_tags = [model.geo.addPoint(*xi)
                      for xi in mesh1d.coordinates()]
    # Dolfin to gmsh: mesh_1d.x[i] is embedding_mesh[vertex_map[i]]
    vertex_map = np.array(point_tags) - 1

    # Add lines of 1d
    mesh1d.init(1, 0)
    e2v = mesh1d.topology()(1, 0)
    lines = []
    edge_encoding = []
    for edge in tqdm.tqdm(range(mesh1d.num_entities(1))):
        first, last = vertex_map[e2v(edge)] + 1
        line = model.geo.addLine(first, last)
        # There will be a edge function such that edge corresponding
        # to edge `i` in mesh1d will have tag `i`
        model.addPhysicalGroup(1, [line], edge + 1)
        lines.append(line)
        # FIXME:
        edge_encoding.append([first - 1, last - 1])

    model.addPhysicalGroup(tdim, [counts[tdim]], 1)

    model.geo.synchronize()
    model.mesh.embed(1, lines, tdim, counts[tdim])
    model.geo.synchronize()
    # --
    model_timer.done()

    if kwargs['debug']:
        gmsh.fltk.initialize()
        gmsh.fltk.run()

    if kwargs['save_geo']:
        gmsh.write('%s.geo_unrolled' % kwargs['save_geo'])

    generation_timer = utils.Timer('Generation line embedded mesh', 1)
    model.mesh.generate(tdim)
    generation_timer.done()

    if kwargs['save_msh']:
        gmsh.write('%s.msh' % kwargs['save_msh'])

    conversion_timer = utils.Timer('Mesh conversion', 1)
    # FIXME: as part of debugging do this with mesh convert
    if kwargs.get('return_mesh_only', False):
        mesh_only = conversion.mesh_from_gmshModel(model,
                                                   include_mesh_functions=None)
        return mesh_only[0]

    # maybe the mesh_fs[1] is wrong
    embedding_mesh, mesh_fs = conversion.mesh_from_gmshModel(
        model, include_mesh_functions=1)
    conversion_timer.done()

    gmsh.clear()

    encode_timer = utils.Timer('Fishing for embedded edges', 1)
    edge_f = mesh_fs[1]
    edge_values = edge_f.array()

    embedding_mesh.init(1, 0)
    e2v = embedding_mesh.topology()(1, 0)
    x = embedding_mesh.coordinates()
    # It remains to account for the nodes that might have been inserted
    # on the edge
    E2V = mesh1d.topology()(1, 0)
    topology_as_edge = []
    # FIXME: rewrite in terms of mesh1d?
    for tag, edge in enumerate(edge_encoding, 1):
        edges, = np.where(edge_values == tag)
        topology_as_edge.append(list(edges))
        # Edge which was not split needs no further work
        if len(edges) <= 1:
            continue

        nodes = np.unique(np.hstack([e2v(e) for e in edges]))
        assert set(edge) <= set(nodes), (
            edge, nodes, tag, embedding_mesh.coordinates()[edge],
            embedding_mesh.coordinates()[nodes],
            mesh1d.coordinates()[E2V(tag - 1)])

        # NOTE: Here we use the fact that we have a straight line so
        # we simply order interior nodes of the edge by their distance
        # from start
        distance = np.linalg.norm(x[nodes] - x[edge[0]], 2, axis=1)
        nodes = nodes[np.argsort(distance)]
        assert nodes[-1] == edge[1], (tag, edge, nodes)
        # Put the interior nodes, in order, between the two end points
        edge[1:1] = nodes[1:-1]
    encode_timer.done()

    # Combine
    edge_encoding = utils.EdgeMap(edge_encoding, topology_as_edge)
    skew_encoding = utils.EdgeMap({}, {})

    ans = utils.LineMeshEmbedding(embedding_mesh, vertex_map, edge_f,
                                  edge_encoding, skew_encoding)

    if kwargs['save_embedding']:
        utils.save_embedding(ans, kwargs['save_embedding'])

    return ans
def test_mlp(learning_rate=0.01, L2_reg=0.00000001, n_epochs=2000,
             dataset='theano.join.data', ref_dataset='ref.theano.join.data',
             batch_size=10000, max_iter=5000,
             output='theano.model.out', validation_freq=100,
             ada_epsilon=0.000001, alpha_share=0.9,
             map_file="labels.mapping",
             english_file=''):
    """
    Jointly train a target model and a reference model with AdaGrad.

    The minimised cost is

        alpha_share * NLL(target) + (1 - alpha_share) * NLL(reference)
        + L2_reg * classifier.L2_sqr

    Every `validation_freq` parameter updates the target validation error is
    measured; whenever it improves, the current weights are written with
    `save_model` (and, if requested, the embeddings with `save_embedding`).

    :type learning_rate: float
    :param learning_rate: base step size of the AdaGrad update

    :type L2_reg: float
    :param L2_reg: weight of the squared L2 term added to the cost

    :type n_epochs: int
    :param n_epochs: maximal number of passes over the target training set

    :type dataset: string
    :param dataset: path handed to `load_data` for the target data; the
    result is indexed as [0]=(train x, train y), [1]=(valid x, valid y),
    [2..5]=initial E, W1, B1, W2 passed on to `MLP`

    :type ref_dataset: string
    :param ref_dataset: path handed to `load_data` for the reference data
    (same layout as `dataset`)

    :type batch_size: int
    :param batch_size: number of rows per minibatch

    :type max_iter: int
    :param max_iter: training stops once the update counter exceeds this

    :type output: string
    :param output: destination passed to `save_model`

    :type validation_freq: int
    :param validation_freq: number of updates between two validation runs

    :type ada_epsilon: float
    :param ada_epsilon: constant added under the square root of the
    accumulated squared gradient

    :type alpha_share: float
    :param alpha_share: weight of the target loss, must lie in [0, 1]; the
    reference loss gets 1 - alpha_share

    :type map_file: string
    :param map_file: path handed to `load_mapping_matrix`

    :type english_file: string
    :param english_file: destination passed to `save_embedding` for the
    reference and target embedding matrices; an empty string or None
    disables that export

    :raises ValueError: if `alpha_share` is outside [0, 1]
    """
    # Fail before any data is loaded; the original only rejected values > 1
    # even though the message promises the closed interval [0, 1].
    if not 0 <= alpha_share <= 1:
        raise ValueError(" Value of Alpha must be [0,1] ")

    print(" Learning with params : ")
    print(" Learning rate : " + str(learning_rate))
    print(" Regularlization params : " + str(L2_reg))
    print(" Alpha  of tieing together : " + str(alpha_share))
    print(" Batch size : " + str(batch_size))
    print(" Max Iter : " + str(max_iter))
    print(" Evaluation frequency  : " + str(validation_freq))

    print('... loading data ')

    ##### LOAD DATASET ORIGINAL and REF ##############
    print(' ----> load the mapping matrix ')
    mapping_matrix = load_mapping_matrix(map_file)

    print(' ----> load the original data ')
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    E = datasets[2]
    W1 = datasets[3]
    B1 = datasets[4]
    W2 = datasets[5]

    print(' ----> load the ref data ')
    ref_datasets = load_data(ref_dataset)
    ref_train_set_x, ref_train_set_y = ref_datasets[0]
    # The reference validation split is loaded but not used below.
    ref_valid_set_x, ref_valid_set_y = ref_datasets[1]
    refE = ref_datasets[2]
    refW1 = ref_datasets[3]
    refB1 = ref_datasets[4]
    refW2 = ref_datasets[5]

    # Number of rows in each split.
    # NOTE(review): the x variables are presumably casts of shared variables,
    # hence the `.owner.inputs[0]` hop to reach the stored array - confirm
    # against load_data.
    n_train_rows = train_set_x.owner.inputs[0].get_value(borrow=True).shape[0]
    n_ref_train_rows = ref_train_set_x.owner.inputs[0].get_value(borrow=True).shape[0]
    n_valid_rows = valid_set_x.owner.inputs[0].get_value(borrow=True).shape[0]

    # Explicit floor division: the batch counts are used as loop bounds.
    n_train_batches = n_train_rows // batch_size
    n_ref_train_batches = n_ref_train_rows // batch_size
    n_valid_batches = n_valid_rows // batch_size

    # A trailing partial batch is only kept when it holds more than 100 rows.
    if n_train_rows % batch_size > 100:
        n_train_batches += 1
    if n_valid_rows % batch_size > 100:
        n_valid_batches += 1

    print('Training batches : ' + str(n_train_batches))
    print('Ref training batches : ' + str(n_ref_train_batches))
    print('Valid batches : ' + str(n_valid_batches))

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # allocate symbolic variables for the data
    index = T.lscalar()      # index to a target [mini]batch
    ref_index = T.lscalar()  # index to a reference [mini]batch

    x = T.imatrix('x')        # integer input matrix of the target model
    xref = T.imatrix('xref')  # integer input matrix of the reference model
    yref = T.ivector('yref')  # reference labels, 1D vector of [int] labels
    y = T.ivector('y')        # target labels, 1D vector of [int] labels
    rng = numpy.random.RandomState(1234)

    ###### DROP OUT RATE #############
    dropout_rate_hidden = 0.5
    dropout_rate_visible = 0.2

    # construct the MLP class
    classifier = MLP(rng,
        input=x,
        refInput=xref,
        E=E,
        W1=W1,
        B1=B1,
        W2=W2,
        refE=refE,
        refW1=refW1,
        refB1=refB1,
        refW2=refW2,
        mapping=mapping_matrix,
        drop_out_rate=dropout_rate_hidden,
        drop_out_embedding_rate=dropout_rate_visible
    )

    train_errors = (classifier.errors(y))

    cost = (
        alpha_share * classifier.negative_log_likelihood(y)
        + (1 - alpha_share) * classifier.refNegative_log_likelihood(yref)
        + L2_reg * classifier.L2_sqr
    )

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a target validation minibatch
    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
            y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    # compute the gradient of cost with respect to every parameter
    gparams = [T.grad(cost, param) for param in classifier.params]

    # AdaGrad: accumulate squared gradients per parameter and scale the step
    # by the inverse square root of that running sum.
    updates = OrderedDict()
    for accugrad, param, gparam in zip(classifier._accugrads, classifier.params, gparams):
        agrad = accugrad + gparam * gparam
        dx = - (learning_rate / T.sqrt(agrad + ada_epsilon)) * gparam
        updates[param] = param + dx
        updates[accugrad] = agrad

    # compiling a Theano function `train_model` that returns the cost and the
    # target error, and at the same time updates the parameters of the model
    # based on the rules defined in `updates`
    train_model = theano.function(
        inputs=[index, ref_index],
        outputs=(cost, train_errors),
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size],
            xref: ref_train_set_x[ref_index * batch_size: (ref_index + 1) * batch_size],
            yref: ref_train_set_y[ref_index * batch_size: (ref_index + 1) * batch_size]
        }
    )

    ###############
    # TRAIN MODEL #
    ###############
    print('... training ')

    # early-stopping parameters
    patience = 2000  # run at least this many updates
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = validation_freq

    best_validation_loss = numpy.inf
    best_iter = 0
    start_time = time.clock()

    epoch = 0
    done_looping = False
    ref_batch_idx = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            (minibatch_avg_cost, minibatch_avg_error) = train_model(minibatch_index, ref_batch_idx)

            # The reference batches are cycled independently of the target
            # batches, wrapping around when the reference set is exhausted.
            ref_batch_idx += 1
            if ref_batch_idx >= n_ref_train_batches:
                ref_batch_idx = 0

            # number of parameter updates performed so far (0-based)
            iteration = (epoch - 1) * n_train_batches + minibatch_index
            print(' Iteration :  ' + str(iteration) + ' with Cost  (join) = ' + str(minibatch_avg_cost) + '  with errors (target only) = ' + str(minibatch_avg_error))

            if (iteration + 1) % validation_frequency == 0:
                validation_losses = [validate_model(i) for i
                                     in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)

                print(
                    'epoch %i, minibatch %i/%i, validation error %f %%' %
                    (
                        epoch,
                        minibatch_index,
                        n_train_batches,
                        this_validation_loss * 100.
                    )
                )

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # extend patience if the improvement is significant, so
                    # training keeps going long enough to look for a further
                    # improvement
                    if (this_validation_loss < best_validation_loss * improvement_threshold):
                        patience = max(patience, iteration * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iteration

                    # Save the model. The weight matrices are scaled by the
                    # keep probability of the dropout applied to their input
                    # and transposed before being written.
                    save_model(output, classifier.embeddingLayer.E.get_value(),
                               (classifier.dropout_HiddenLayer.W.get_value() * (1 - dropout_rate_visible)).T,
                               classifier.dropout_HiddenLayer.b.get_value(),
                               (classifier.dropout_LogRegressionLayer.W.get_value() * (1 - dropout_rate_hidden)).T)

                    # Export the reference and target embeddings only when a
                    # destination was given; the default '' means "skip".
                    if english_file:
                        save_embedding(english_file, classifier.refEmbeddingLayer.E.get_value(), classifier.embeddingLayer.E.get_value())

            # stop when patience is exhausted or the update budget is spent
            if (patience <= iteration) or (iteration > max_iter):
                done_looping = True
                break

    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i') %
          (best_validation_loss * 100., best_iter + 1))
    sys.stderr.write('The code for file ' +
                     os.path.split(__file__)[1] +
                     ' ran for %.2fm' % ((end_time - start_time) / 60.) +
                     '\n')
Exemplo n.º 12
0
def get_image():
    """Embed every face image uploaded in the 'file' field of a POST request.

    For each uploaded file with an allowed name, the image is read, a face is
    cropped out of it with `get_face`, and the crop is encoded with
    `img_to_encoding`. The cropped image and the embedding are saved to their
    designated folders and the embedding is also stored in `embedding_dict`
    under the extension-less file name.

        'uploads' folder: for image files
        'embeddings' folder: for embedding numpy files.

    Files with a disallowed name, and images in which no face is found, are
    skipped silently. Processing stops at the first file with an empty name
    or the first file that raises while being processed.

    Returns a plain-text status string in every case.
    """
    if request.method != 'POST':
        return "POST HTTP method required!"

    if 'file' not in request.files:
        return "No 'file' field in POST request!"

    list_success = []
    for upload in request.files.getlist('file'):
        filename = upload.filename
        if filename == "":
            return "No selected file!"

        if not (upload and allowed_file(filename=filename, allowed_set=allowed_set)):
            continue

        filename = secure_filename(filename=filename)
        try:
            # Read image file as numpy array of RGB dimension
            img = imread(name=upload, mode='RGB')

            # Detect and crop a 160 x 160 image containing a human face in the image file
            img = get_face(
                img=img,
                pnet=pnet,
                rnet=rnet,
                onet=onet,
                image_size=image_size
            )

            # If a human face is detected
            if img is not None:
                embedding = img_to_encoding(img, FRmodel)

                # Save cropped face image to 'uploads/' folder
                save_image(img=img, filename=filename, uploads_path=uploads_path)

                # Remove file extension from image filename for numpy file storage being based on image filename
                filename = remove_file_extension(filename=filename)

                # Save embedding to 'embeddings/' folder
                save_embedding(
                    embedding=embedding,
                    filename=filename,
                    embeddings_path=embeddings_path
                )
                embedding_dict[filename] = embedding

                list_success.append(filename)
        except Exception:
            # Any processing failure aborts the batch and reports how many
            # files were embedded before it. `Exception` (rather than a bare
            # `except`) lets SystemExit / KeyboardInterrupt propagate.
            return 'error'+str(filename)+'Image uploaded and embedded successfully ' +str(len(list_success))

    return "Image uploaded and embedded successfully:- "+str(len(list_success))
Exemplo n.º 13
0
def detail():
    ''' Detects text and face in Aadhaar Card

    On POST, reads the 'photo-front' and 'photo-back' uploads, stores them
    under timestamped names, runs text recognition on both sides, merges the
    extracted fields, appends the result to 'myfile.json' and the raw text to
    'outputs.txt', and saves a face embedding named after the extracted
    'Name' field.

    Returns a JSON response: the extracted data with 'status': True on POST,
    or {'status': False} for any other method.
    '''

    if request.method != 'POST':
        # if not POST, terminate
        return jsonify({'status': False})

    # saving current timestamp
    current_time = str(datetime.datetime.now()).replace('-', '_').replace(
        ':', '_')

    # The type of image i.e. Front or Back image
    image_type1 = 'Front'
    image_type2 = 'Back'

    # Path for Front image and the face image that will be croppped
    filename1 = uploads_path + image_type1 + '/' + current_time + '.jpg'
    photo_path = uploads_path + image_type1 + '/' + 'faces' + '/' + current_time + '.png'

    # Path for Back image and the face image that will be croppped
    filename2 = uploads_path + image_type2 + '/' + current_time + '.jpg'
    crop_path = uploads_path + image_type2 + '/temp/' + current_time + '.png'

    # Make sure both leaf folders exist. Each one is checked on its own so a
    # pre-existing 'Front'/'Back' folder that lacks its sub-folder is
    # repaired instead of causing a failure when files are written later.
    for directory in (uploads_path + image_type1 + '/' + 'faces',
                      uploads_path + image_type2 + '/temp'):
        if not os.path.isdir(directory):
            os.makedirs(directory)

    # get Front Card Photo from user
    photo1 = request.files['photo-front']
    photo1.save(filename1)

    # get Back Card Photo from user
    photo2 = request.files['photo-back']
    photo2.save(filename2)

    print("Processing Front Image ......")

    # Process The Front Card Image
    data, photo_path = recognise_text(filename1, photo_path)
    details = get_labels_from_aadhar(data)
    print("Processing Front Image ...... DONE")

    print("Processing Back Image .......")

    # Process The Back Card Image
    crop_aadhar(filename2, crop_path)
    data2, _ = recognise_text(crop_path, 'none')
    details.update(get_address(data2))
    print("Processing Back Image ....... DONE")

    # the cropped back image is only a temporary input for text recognition
    os.remove(crop_path)

    data_dict = {
        'status': True,
        'fields': details,
        'image_path_front': filename1,
        'image_path_back': filename2,
        'photo_path': photo_path
    }

    print("save into json files")
    # the json file where the output must be stored
    with open('myfile.json', 'a+') as out_file:
        json.dump(data_dict, out_file, indent=6)

    img = imread(name=photo_path, mode='RGB')
    print("Processing Face Image .......")
    # Detect and crop a 160 x 160 image containing a human face in the image file
    img = get_face(img=img,
                   pnet=pnet,
                   rnet=rnet,
                   onet=onet,
                   image_size=image_size)

    # get_face yields None when no face is found; in that case there is
    # nothing to embed, so the embedding step is skipped.
    if img is not None:
        embedding = forward_pass(
            img=img,
            session=facenet_persistent_session,
            images_placeholder=images_placeholder,
            embeddings=embeddings,
            phase_train_placeholder=phase_train_placeholder,
            image_size=image_size)

        print("Processing Face Image ....... DONE")
        # Save The Face embedding as the name of the Person
        filename = data_dict['fields']['Name']
        filename = secure_filename(filename=filename)
        # Save embedding to 'embeddings/' folder
        save_embedding(embedding=embedding,
                       filename=filename,
                       embeddings_path=embeddings_path)
    else:
        print("Processing Face Image ....... no face detected, embedding skipped")

    # Write the Raw and Cleaned Text detected from the Card
    with open('outputs.txt', 'a+') as f:
        f.write(
            "##########################################################################\n\n"
        )
        f.write(
            '######################## Raw Output for Front Card Image #############################\n\n'
        )
        for value in data:
            f.write(str(value) + '\n')
        f.write(
            "##########################################################################\n\n"
        )
        f.write(
            '######################## Raw Output for Back Card Image #############################\n\n'
        )
        for value in data2:
            f.write(str(value) + '\n')
        f.write(
            '\n\n######################## Cleaned Output #############################\n\n'
        )
        for key, value in details.items():
            f.write(str(key) + ' : ' + str(value) + '\n')
        f.write(
            "##########################################################################\n\n"
        )

    return jsonify(data_dict)