Example #1
def build_model(params, with_dis):
    """
    Build all components of the model.
    """
    # source embeddings
    src_dico, _src_emb = load_embeddings(params, source=True)
    params.src_dico = src_dico
    src_emb = nn.Embedding(len(src_dico), params.emb_dim, sparse=True)
    src_emb.weight.data.copy_(_src_emb)

    # target embeddings
    if params.tgt_lang:
        tgt_dico, _tgt_emb = load_embeddings(params, source=False)
        params.tgt_dico = tgt_dico
        tgt_emb = nn.Embedding(len(tgt_dico), params.emb_dim, sparse=True)
        tgt_emb.weight.data.copy_(_tgt_emb)
    else:
        tgt_emb = None

    # mapping
    mapping = nn.Linear(params.emb_dim, params.emb_dim, bias=False)
    if getattr(params, 'map_id_init', True):
        mapping.weight.data.copy_(torch.diag(torch.ones(params.emb_dim)))

    # normalize embeddings
    params.src_mean = normalize_embeddings(src_emb.weight.data,
                                           params.normalize_embeddings)
    if params.tgt_lang:
        params.tgt_mean = normalize_embeddings(tgt_emb.weight.data,
                                               params.normalize_embeddings)

    return src_emb, tgt_emb, mapping
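A minimal usage sketch, assuming a hypothetical params namespace carrying the attributes build_model reads (the real object comes from the project's argument parser):

# Hypothetical usage (sketch): attribute values are placeholders.
from types import SimpleNamespace

params = SimpleNamespace(src_lang='en', tgt_lang='fr', emb_dim=300,
                         normalize_embeddings='', map_id_init=True)
src_emb, tgt_emb, mapping = build_model(params, with_dis=False)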
Example #2
    def export(self):
        """
        Export embeddings.
        """
        params = self.params

        # load all embeddings
        params.src_dico, src_emb = load_embeddings(params,
                                                   source=True,
                                                   full_vocab=True)
        params.tgt_dico, tgt_emb = load_embeddings(params,
                                                   source=False,
                                                   full_vocab=True)

        # apply same normalization as during training
        normalize_embeddings(src_emb,
                             params.normalize_embeddings,
                             mean=params.src_mean)
        normalize_embeddings(tgt_emb,
                             params.normalize_embeddings,
                             mean=params.tgt_mean)

        # map source embeddings to the target space
        bs = 4096
        for k in range(0, len(src_emb), bs):
            x = Variable(src_emb[k:k + bs], volatile=True)
            src_emb[k:k + bs] = self.mapping(x).data

        # write embeddings to the disk
        export_embeddings(src_emb, tgt_emb, params)
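The Variable(..., volatile=True) idiom above is pre-0.4 PyTorch. On modern PyTorch the same batched mapping would use torch.no_grad(); a sketch of the equivalent loop, not part of the original project:

# Modern-PyTorch equivalent of the mapping loop above (sketch);
# self.mapping is the trained linear map from the enclosing class.
import torch

bs = 4096
with torch.no_grad():
    for k in range(0, len(src_emb), bs):
        src_emb[k:k + bs] = self.mapping(src_emb[k:k + bs])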
Example #3
def load_embedding_dict(vocab_path="",
                        vector_path="",
                        embeddings_path="",
                        glove=False,
                        postspec=False):
    """
    Load an embedding dictionary from one of several sources.

    :param vocab_path:
    :param vector_path:
    :return: embd_dict
    """
    if glove and postspec:
        raise ValueError("Glove and postspec cannot both be true")
    elif glove:
        if os.name == "nt":
            embd_dict = utils.load_embeddings(
                "C:/Users/anlausch/workspace/embedding_files/glove.6B/glove.6B.300d.txt",
                word2vec=False)
        else:
            embd_dict = utils.load_embeddings(
                "/work/anlausch/glove.6B.300d.txt", word2vec=False)
        return embd_dict
    elif postspec:
        embd_dict_temp = utils.load_embeddings(
            "/work/anlausch/ft_postspec.txt", word2vec=False)
        embd_dict = {}
        for key, value in embd_dict_temp.items():
            embd_dict[key.split("en_")[1]] = value
        assert ("test" in embd_dict)
        assert ("house" in embd_dict)
        return embd_dict
    elif embeddings_path.endswith("p"):
        # load from a pickle file
        with open(embeddings_path, 'rb') as handle:
            embd_dict = pickle.load(handle)
        return embd_dict
    elif embeddings_path != "":
        embd_dict = utils.load_embeddings(embeddings_path, word2vec=False)
        return embd_dict

    else:
        embd_dict = {}
        vocab = load_vocab_goran(vocab_path)
        vectors = load_vectors_goran(vector_path)
        for term, index in vocab.items():
            embd_dict[term] = vectors[index]
        assert len(embd_dict) == len(vocab)
        return embd_dict
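Hedged usage sketches; every path below is a placeholder, not the author's:

# Illustrative calls (sketch); paths are placeholders.
embd_dict = load_embedding_dict(embeddings_path="./data/embedding_dict.p")
embd_dict = load_embedding_dict(vocab_path="vocab.json", vector_path="vectors.npy")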
Example #4
    def _init(self):
        logger.info("Initializing ...")
        self.entity2id, self.id2entity, self.entid2tags = utils.generate_entity_property_idx(
            self.entityPath)
        self.property2id, self.id2property, self.proid2tags = utils.generate_entity_property_idx(
            self.propertyPath)
        self.entid2tycid = utils.generate_entity_tyc_idx(
            self.tycWordsPath, self.entity2id)
        self.train2id = utils.generate_data_idx(self.trainPath, self.entity2id,
                                                self.property2id)
        self.train2id_set = set([' '.join(map(str, t))
                                 for t in self.train2id])  # use for sampling
        self.conid2attrid = utils.generate_conceptid_to_attributesid(
            self.conceptAttrPath, self.entity2id, self.property2id,
            self.max_attr_size)
        self.conAttr2id, self.conAttr2id_set = utils.generate_concept_attributes_idx(
            self.conceptAttrPath, self.entity2id, self.property2id)
        self.dev2id = utils.generate_data_idx(self.devPath, self.entity2id,
                                              self.property2id)
        self.test2id = utils.generate_data_idx(self.testPath, self.entity2id,
                                               self.property2id)

        self.test_entity_candidate_ids = utils.read_sample_candidates(
            self.test_entity_candi_path, self.entity2id)
        self.test_attr_candidate_ids = utils.read_sample_candidates(
            self.test_attr_candi_path, self.property2id)

        self.sample_ent_cand_ids = utils.read_sample_candidates(
            self.sample_ent_candi_path, self.entity2id)
        self.sample_attr_cand_ids = utils.read_sample_candidates(
            self.sample_attr_candi_path, self.property2id)

        self.trainTotal = len(self.train2id)
        self.conceptAttrTotal = len(self.conid2attrid)
        self.devTotal = len(self.dev2id)
        self.testTotal = len(self.test2id)
        self.entityTotal = len(self.entity2id)
        self.propertyTotal = len(self.property2id)

        # tencent init
        if self.embeddingPath is not None:
            self.ent_embeddings = utils.load_embeddings(
                self.entity2id, self.embeddingPath, self.entityTotal,
                self.ent_size)
            self.rel_embeddings = utils.load_embeddings(
                self.property2id, self.embeddingPath, self.propertyTotal,
                self.rel_size)

        self.dev2id_batches = utils.get_batches(self.dev2id, self.batch_size)
        self.test2id_batches = utils.get_batches(self.test2id, self.batch_size)
Example #5
def main(_):
    config = load_config(FLAGS.config)

    # Load saved model
    print "Loading model"
    model_path = os.path.join(config.data.ckpt, 'model.pt')
    model = torch.load(model_path)
    model.eval()

    # Load embeddings and (test) datasets
    l1_embeddings, l1_vocab = load_embeddings(path=config.data.l1_embeddings)
    l2_embeddings, l2_vocab = load_embeddings(path=config.data.l2_embeddings)

    # Translate all test files
    start = time.time()
    beam_size = 12

    test_dirs = ['data/test_en', 'data/test_fr']

    for test_dir in test_dirs:
        src_lang = test_dir.split('_')[-1]
        if src_lang == 'en':
            src_lang = 'l1'
            src_vocab = l1_vocab
            tgt_lang = 'l2'
            tgt_vocab = l2_vocab
        elif src_lang == 'fr':
            src_lang = 'l2'
            src_vocab = l2_vocab
            tgt_lang = 'l1'
            tgt_vocab = l1_vocab
        else:
            raise ValueError('Unsupported source language')

        test_dataset = MonolingualDataset(folder=test_dir, vocab=src_vocab)
        test_loader = MonolingualDataLoader(test_dataset)
        test_file = test_dataset._paths[0].split('/')[2]
        print test_file, src_lang
        f = open('test_translated/' + test_file + '_translated', 'w')
        for i, sample in enumerate(test_loader):
            sample = {k: v.cuda() for k, v in sample.items() if v is not None}
            src, lengths, _, _ = transform_inputs(
                src=sample['src'],
                lengths=sample['src_len'],
                tgt=sample['src'])
            translated = translate(model, src, src_lang, lengths.data, beam_size,
                                   config.data.max_length, tgt_vocab)
            f.write(translated+'\n')
        f.close()
        print "Time to translate file (secs): ", time.time() - start
Example #6
def _get_data(config, logger, name):
    """ get all the required data for training/testing the classifier """

    # load bert-related stuff
    bert_models = {'bert':'allenai/scibert_scivocab_uncased',
                   'roberta' : 'allenai/biomed_roberta_base'}
    vocab = bert_models[config['bert_model']]

    # load data
    partition, training_generator, validation_generator = load_data(config, vocab, max_len = config['max_len'])

    # get the embeddings: either from scratch, or from cache
    logger.info(f" Getting {config['embedding_type']} embeddings ...")
    
    if config['embedding_type'] in ('bert', 'roberta'):
        embed_shape, train_embeddings, valid_embeddings = load_embeddings(config, name, vocab, training_generator, validation_generator)
    elif config['embedding_type'] == 'specter':
        # load precomputed embeddings from the configured pickle path
        with open(config["precomputed_embedding_path"], 'rb') as handle:
            embed_shape, train_embeddings, valid_embeddings = pickle.load(handle)
    else:
        logger.error("Only BERT, RoBERTa, and SPECTER embeddings are accepted.")
        raise ValueError("Only BERT, RoBERTa, and SPECTER embeddings are accepted.")

    # dimension reduction: PCA (either from scratch, or from cache)
    if config["do_pca"]:
        logger.info(' Reducing embedding dimensions...')
        embed_shape, train_embeddings, valid_embeddings = get_pca_embeddings(config, name, train_embeddings, valid_embeddings)

    logger.info(' Dataset is: {} and PCA was performed: {}'.format(name, config["do_pca"]))
    logger.info(f'\n Num. training samples: {len(training_generator)} \
                  \n Num. validation samples: {len(validation_generator)}')

    return embed_shape, train_embeddings, valid_embeddings
Example #7
def load_data(feature_type='identity', embedding_file=None):
    # Load graph.
    graph = utils.load_graph()
    node_ids = list(range(len(graph.nodes)))

    # Choose node features from identity, adjacency matrix, or embeddings.
    if feature_type == 'identity':
        node_features = np.eye(len(graph.nodes))
    elif feature_type == 'adjacency':
        node_features = nx.to_numpy_matrix(graph, node_ids)
    elif feature_type == 'embedding':
        embedding_path = 'node2vec/embeddings/' + embedding_file
        embeddings = utils.load_embeddings(embedding_path)
        node_features = np.array([embeddings[nid] for nid in node_ids])

    # Extract graph info to create torch geometric data object.
    x = torch.tensor(node_features, dtype=torch.float)
    y = torch.tensor(get_labels(graph), dtype=torch.long)
    edge_index, edge_attr = get_edges(graph)
    data = Data(x=x, edge_index=edge_index, y=y)

    # Obtain train/val/test splits.
    get_masks(data)

    return data
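A brief usage sketch for the loader above; the embedding filename is a placeholder:

# Sketch: 'karate.emb' is an illustrative embedding file name.
data = load_data(feature_type='embedding', embedding_file='karate.emb')
print(data.num_nodes, data.edge_index.shape)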
Example #8
    def test_kmedoids(self, emb_filename, res_filename, budget):

        print(res_filename)

        # stats = ut.graph_stats(self.G, print_stats=False)
        v, em = ut.load_embeddings(emb_filename, self.G.nodes())

        influenced, influenced_grouped = [], []
        seeds = []
        for k in range(1, budget + 1):
            print('--------', k)
            S = ut.get_kmedoids_centers(em, k, v)

            I, I_grouped = map_fair_IC((self.G, S))
            influenced.append(I)
            influenced_grouped.append(I_grouped)

            S_g = {
                c: []
                for c in np.unique(
                    [self.G.nodes[v]['color'] for v in self.G.nodes])
            }
            for n in S:
                c = self.G.nodes[n]['color']
                S_g[c].append(n)

            # record the seed ids so the influence spread can be recreated
            seeds.append(S_g)

        ut.write_files(res_filename, influenced, influenced_grouped, seeds)
Example #9
def read_files(folder, parent):
    print("[Local-embedding] Reading file:", parent)
    emb_file = '%s/embeddings.txt' % folder
    hier_file = '%s/hierarchy.txt' % folder
    keyword_file = '%s/keywords.txt' % folder  # only the remaining keywords are considered here

    embs = utils.load_embeddings(emb_file)
    keywords = set()
    cates = {}

    with open(keyword_file) as f:
        for line in f:
            keywords.add(line.strip('\r\n'))

    tmp_embs = {}
    for k in keywords:
        if k in embs:
            tmp_embs[k] = embs[k]
    embs = tmp_embs

    with open(hier_file) as f:
        for line in f:
            segs = line.strip('\r\n').split(' ')
            if segs[1] == parent:
                cates[segs[0]] = set()

    print(
        '[Local-embedding] Finish reading embedding, hierarchy and keywords files.'
    )

    return embs, keywords, cates
Example #10
def main():

    # Build unigram and bigram vocabularies from the training set
    word2id = load_word2id(length=VOCAB_SIZE)

    # Prepare data loaders for the deep-learning models
    train_loader_dl = DataLoader(
        dataset=DianPingDataSet("train"),
        batch_size=64,
        collate_fn=partial(collate_fn_dl, word2id, SENT_MAX_LEN)
    )
    test_loader_dl = DataLoader(
        dataset=DianPingDataSet("test"),
        batch_size=64,
        collate_fn=partial(collate_fn_dl, word2id, SENT_MAX_LEN)
    )
    vocab_size = len(word2id)
    print("Vocab Size:", vocab_size)
    print("Loading word vectors....")
    try:
        embedding = load_embeddings(word2id)
    except FileNotFoundError:
        embedding = None

    # Train and evaluate the deep-learning models (CNN, LSTM)
    print("Training the BiLSTM model...")
    lstm_model = DeepModel(vocab_size, embedding, method="lstm")
    lstm_model.train_and_eval(train_loader_dl, test_loader_dl)

    print("Training the CNN model...")
    cnn_model = DeepModel(vocab_size, embedding, method="cnn")
    cnn_model.train_and_eval(train_loader_dl, test_loader_dl)
Example #11
def get_related_figures(identifiers_dir, text_data_dir, embeddings_dir,
                        test_identifiers_dir, output_dir):
    """Get semantically related figures for a set of test figures.

    Args:
      identifiers_dir: (string) identifiers of all figures in the collection.
      text_data_dir: (string) the file with the text for each figure (for keyword retrieval purposes).
      embeddings_dir: (string) the embedding vectors for all figures in the collection.
      test_identifiers_dir: (string) the figures for which we want to find related figures (a subset of full collection)
      output_dir: (string) directory for the output data.

    Returns:
      None. Outputs the related figures to a file.
    """
    test_identifiers = utils.read_lines_from_file(test_identifiers_dir)
    all_identifiers = utils.read_lines_from_file(identifiers_dir)
    tf_idf_matrix = KnnSearcher.get_tf_idf_embeddings(text_data_dir)
    searcher = KnnSearcher(tf_idf_matrix, all_identifiers, 100)
    initial_result_list = searcher.perform_keyword_retrieval(test_identifiers)
    embedding_matrix = utils.load_embeddings(embeddings_dir)
    final_result_list = re_rank_with_embeddings(initial_result_list,
                                                embedding_matrix,
                                                all_identifiers)

    with open(output_dir, 'w+') as output_file:
        for figure_id in final_result_list:
            line = figure_id
            for other_figure in final_result_list[figure_id]:
                if other_figure[0] != figure_id:
                    line += ',' + other_figure[0]
            output_file.write(line + '\n')
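A hedged call sketch; all five paths are placeholders:

# Sketch: every path below is illustrative.
get_related_figures(identifiers_dir='ids.txt',
                    text_data_dir='figure_text.tsv',
                    embeddings_dir='figure_embs.txt',
                    test_identifiers_dir='test_ids.txt',
                    output_dir='related_figures.csv')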
Example #12
def predict():
    source_data,target_data,test_data,word2id=utils.load_data()
    embeddings=utils.load_embeddings(word2id)

    print "Test set size: %d" % len(test_data)

    results=[]

    # HybridCNNSS: average predictions from ten independently trained checkpoints
    for i in range(1, 11):
        g = Graph('HybridCNNSS', 'HybridCNNSS%d' % i, embeddings)
        results.append(g.run(test_data))

    predicts = []
    for predict in np.stack(results, axis=1):
        predicts.append(1.0 * sum(predict) / len(predict))

    utils.generate_file(predicts)
Example #13
def recursion(root, lvl):
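    # Breadth-first traversal of the cluster-folder hierarchy: compute a
    # Davies-Bouldin index (DBI) for every node above the level cutoff.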

    q = Queue.Queue()
    q.put((root, -1, 1, '*'))

    dbi_scores = {}

    while not q.empty():

        (c_folder, c_id, level, c_name) = q.get()

        if level >= int(lvl):
            continue

        hier_f = '%s/hierarchy.txt' % c_folder
        clus_kws_f = '%s/cluster_keywords.txt' % c_folder
        emb_f = '%s/embeddings.txt' % c_folder
        if not exists(hier_f):
            continue

        hier_map = utils.load_hier_f(hier_f)
        clus_map = get_clus_keywords(clus_kws_f)
        embs = utils.load_embeddings(emb_f)

        for cluster in hier_map:
            cc_id = hier_map[cluster]
            cluster_folder = '%s/%s' % (c_folder, cluster)
            cluster_namespace = '%s/%s' % (c_name, cluster)
            q.put((cluster_folder, cc_id, level + 1, cluster_namespace))

        # handle current
        dbi = compute_dbi(embs, clus_map, hier_map)
        print 'Computing DBI for %s: %f' % (c_name, dbi)
        dbi_scores[c_name] = (dbi, level)
    output_dbi(dbi_scores)
Example #14
def init():
    print 'Loading training samples..'
    training_samples = utils.load_samples('../data/askubuntu/train_random.txt')
    print len(training_samples)

    print 'Loading dev samples..'
    dev_samples = utils.load_samples('../data/askubuntu/dev.txt')
    print len(dev_samples)

    print 'Loading test samples..'
    test_samples = utils.load_samples('../data/askubuntu/test.txt')
    print len(test_samples)

    print 'Loading corpus..'
    question_map = utils.load_corpus('../data/askubuntu/text_tokenized.txt')
    print len(question_map)

    print 'Loading stop words..'
    stop_words = utils.load_stop_words('../data/english_stop_words.txt')
    print len(stop_words)

    corpus_texts = map(lambda (t, b): t + ' ' + b, question_map.values())

    print 'Loading embeddings..'
    embedding_map = utils.load_embeddings(
        '../data/pruned_askubuntu_android_vector.txt', corpus_texts,
        stop_words)
    print len(embedding_map)
    print

    utils.store_embedding_map(embedding_map)

    return (training_samples, dev_samples, test_samples, question_map,
            embedding_map)
Example #15
def predict_image():
    """Gets an image file via POST request, feeds the image to the FaceNet model, the resulting embedding is then
    sent to be compared with the embeddings database. The image file is not stored.

    An html page is then rendered showing the prediction result.
    """
    if request.method == 'POST':
        if 'file' not in request.files:
            return "No file part"

        file = request.files['file']
        filename = file.filename

        if filename == "":
            return "No selected file"

        if file and allowed_file(filename=filename, allowed_set=allowed_set):
            # Read image file as numpy array of RGB dimension
            img = imread(name=file, mode='RGB')
            # Detect and crop a 160 x 160 image containing a human face in the image file
            img = get_face(img=img,
                           pnet=pnet,
                           rnet=rnet,
                           onet=onet,
                           image_size=image_size)

            # If a human face is detected
            if img is not None:

                embedding = forward_pass(
                    img=img,
                    session=facenet_persistent_session,
                    images_placeholder=images_placeholder,
                    embeddings=embeddings,
                    phase_train_placeholder=phase_train_placeholder,
                    image_size=image_size)

                embedding_dict = load_embeddings()
                if embedding_dict:
                    # Compare euclidean distance between this embedding and the embeddings in 'embeddings/'
                    identity = identify_face(embedding=embedding,
                                             embedding_dict=embedding_dict)
                    return render_template('predict_result.html',
                                           identity=identity)

                else:
                    return render_template(
                        'predict_result.html',
                        identity=
                        "No embedding files detected! Please upload image files for embedding!"
                    )

            else:
                return render_template(
                    'predict_result.html',
                    identity=
                    "Operation was unsuccessful! No human face was detected.")
    else:
        return "POST HTTP method required!"
Example #16
    def __init__(self, emb_size, vocab_size=11004):
        super(Baseline_Embeddings, self).__init__()
        self.embedding_prem = nn.Embedding(vocab_size, emb_size)
        self.embedding_hypo = nn.Embedding(vocab_size, emb_size)
        self.linear = nn.Linear(emb_size * 2, 3)
        embeddings_mat = load_embeddings()
        self.embedding_prem.weight.data.copy_(embeddings_mat)
        self.embedding_hypo.weight.data.copy_(embeddings_mat)
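The snippet shows only the constructor; no forward pass is included. A minimal sketch of one plausible forward, assuming batched premise/hypothesis index tensors and mean pooling over the sequence dimension (an assumption, not the project's confirmed implementation):

    def forward(self, premise_ids, hypothesis_ids):
        # Hypothetical sketch: mean-pool each side, concatenate, classify.
        prem = self.embedding_prem(premise_ids).mean(dim=1)
        hypo = self.embedding_hypo(hypothesis_ids).mean(dim=1)
        return self.linear(torch.cat([prem, hypo], dim=1))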
Example #17
def label_emb_centric(folder, c_id):
    print 'Start labeling for %s, %s ========================' % (folder, c_id)
    # print folder
    par_folder = dirname(folder)
    cur_label = basename(folder)
    p_case_f = '%s/caseolap.txt' % par_folder
    c_case_f = '%s/caseolap.txt' % folder
    emb_f = '%s/embeddings.txt' % par_folder

    # generate word2vec phrases
    embs = utils.load_embeddings(emb_f)
    if cur_label not in embs:
        print 'Error: label %s not found in embeddings' % cur_label
        exit(1)
    N = 100
    worst = -100
    bestw = [-100] * (N + 1)
    bestp = [''] * (N + 1)

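    # Keep the N most similar phrases in sorted order: insert each new score
    # at its rank and shift lower-ranked entries down one slot.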
    for ph in embs:
        sim = utils.cossim(embs[cur_label], embs[ph])
        if sim > worst:
            for i in range(N):
                if sim >= bestw[i]:
                    for j in range(N - 1, i - 1, -1):
                        bestw[j + 1] = bestw[j]
                        bestp[j + 1] = bestp[j]
                    bestw[i] = sim
                    bestp[i] = ph
                    worst = bestw[N - 1]
                    break

    cands = list(zip(bestp, bestw))

    phrase_map_p, cell_map_p, tmp = read_caseolap_result(p_case_f)
    parent_dist_ranking = cell_map_p[c_id]
    parent_dist_map = {ph: float(dist) for (ph, dist) in parent_dist_ranking}
    child_kl_ranking = rank_phrase(c_case_f)
    child_kl_map = {ph: dist for (ph, dist) in child_kl_ranking}
    min_score = 0.12
    label_cands = {}

    # for (ph, score) in parent_dist_ranking:
    for (ph, score) in cands:
        if ph not in parent_dist_map:
            continue

        if ph in child_kl_map:
            continue

        label_cands[ph] = score

    ranked_list = sorted(label_cands.items(),
                         key=operator.itemgetter(1),
                         reverse=True)
    print ranked_list

    return ranked_list[0][0]
Example #18
File: camera.py Project: Eklavya42/AuthX
    def get_frame(self):
        success, frame_orig = self.video.read()
        # We are using Motion JPEG, but OpenCV defaults to capture raw images,
        # so we must encode it into JPEG in order to correctly display the
        # video stream.
        # faces = face_cascade.detectMultiScale(image, 1.3, 5)
        frame = cv2.resize(src=frame_orig, dsize=(0, 0), fx=0.5, fy=0.5)
        embedding_dict = load_embeddings()

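        # Convert the BGR frame from OpenCV capture to RGB for face detection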
        frame = frame[:, :, ::-1]

        if frame.size > 0:
            faces, rects = get_faces_live(img=frame,
                                          pnet=pnet,
                                          rnet=rnet,
                                          onet=onet,
                                          image_size=image_size)

            # If there are human faces detected
            if faces:
                for i in range(len(faces)):
                    face_img = faces[i]
                    rect = rects[i]

                    # Scale coordinates of face locations by the resize ratio
                    rect = [coordinate * 2 for coordinate in rect]

                    face_embedding = forward_pass(
                        img=face_img,
                        session=facenet_persistent_session,
                        images_placeholder=images_placeholder,
                        embeddings=embeddings,
                        phase_train_placeholder=phase_train_placeholder,
                        image_size=image_size)

                    # Compare euclidean distance between this embedding and the embeddings in 'embeddings/'
                    identity = identify_face(embedding=face_embedding,
                                             embedding_dict=embedding_dict)

                    cv2.rectangle(img=frame_orig,
                                  pt1=(rect[0], rect[1]),
                                  pt2=(rect[2], rect[3]),
                                  color=(255, 215, 0),
                                  thickness=2)

                    W = int(rect[2] - rect[0]) // 2

                    cv2.putText(img=frame_orig,
                                text=identity,
                                org=(rect[0] + W - (W // 2), rect[1] - 7),
                                fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                                fontScale=0.5,
                                color=(255, 215, 0),
                                thickness=1,
                                lineType=cv2.LINE_AA)

        ret, jpeg = cv2.imencode('.jpg', frame_orig)
        return jpeg.tobytes()
Example #19
File: generate.py Project: wbakst/food
def main(analysis=False):
    # Load mappings and embeddings for specified network(s)
    mappings = ut.load_mappings()
    embeddings = ut.load_embeddings()
    # Extract IIDs for specified seed ingredients
    ingredient_to_iid = {
        ingredient: iid
        for iid, ingredient in
        mappings['IID_to_Ingredient_Mapping'].iteritems()
    }
    if args.seed_ingredients is not None:
        args.seed_ingredients = [
            ingredient_to_iid[ingredient]
            for ingredient in args.seed_ingredients
        ]

    if args.accent > 0 and not args.network == 'ocn_fph':
        raise Exception(
            'You set accent > 0 but did not use network \'ocn_fph\'.')

    num_ingredients = np.random.randint(args.min, args.max + 1)
    if args.accent > args.min:
        raise Exception(
            'Number of accent ingredients cannot be greater than the minimum number of ingredients.'
        )

    if args.cuisine == 'random':
        cuisine = np.random.choice(
            mappings['Cuisine_to_List_of_Ingredients_Mapping'].keys())
        print 'Randomly Chosen Cuisine: {}'.format(cuisine)
    else:
        cuisine = args.cuisine

    if args.network == 'ocn_fph':
        recipe = base_accent_generate(get_embeddings(embeddings, 'ocn', mappings, cuisine), \
                get_embeddings(embeddings, 'fph', mappings, cuisine), args.seed_ingredients, num_ingredients, args.accent)
    elif args.network == 'ucn':
        recipe = generate(get_embeddings(embeddings, 'ucn', mappings, cuisine),
                          args.seed_ingredients, num_ingredients)
    else:
        raise NotImplementedError

    if args.avoids is not None:
        avoid_iids = [ingredient_to_iid[a] for a in args.avoids]
        SN, SW = ut.load_sn()
        recipe = substitute_avoids(SW, \
                get_embeddings(embeddings, 'ocn', mappings, cuisine), avoid_iids, recipe)

    if analysis:
        return recipe

    base_ingredients = num_ingredients - args.accent
    for i, iid in enumerate(recipe):
        if args.network == 'ocn_fph' and i >= base_ingredients:
            print mappings['IID_to_Ingredient_Mapping'][iid], '(accent)'
        else:
            print mappings['IID_to_Ingredient_Mapping'][iid]
Example #20
    def __init__(self,
                 pickle_path,
                 eval_path,
                 encoding="utf8"):

        self.eval_path = eval_path
        self.encoding = encoding
        self.embeddings, self.word2index = load_embeddings(pickle_path)
        self.top_results = min(self.embeddings.shape[0] - 2, 10)
Example #21
    def load_all_embeddings(self, word_index, num_words):
        # Word cover rate in the embedding is: 0.8724167059563099
        glove_embeddings = load_embeddings(GLOVE_PATH, word_index, num_words)
        # Word cover rate in the embedding is: 0.6717114568599717
        wiki_embeddings = load_embeddings(WIKI_PATH, word_index, num_words)
        # google_new_embeddings = load_embeddings(GOOGLE_NEWS_PATH, word_index, num_words)
        # paragram_embeddings = load_embeddings(PARAGRAM_PATH, word_index, num_words)

        embedding_matrix = np.concatenate(
            (
                glove_embeddings,
                wiki_embeddings,
                # google_new_embeddings,
                # paragram_embeddings,
            ),
            axis=1)

        return torch.tensor(embedding_matrix, dtype=torch.float32)
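A sketch of how the returned tensor might seed a frozen embedding layer; model, word_index, and num_words are placeholders for the caller's objects:

# Illustrative (sketch): wrap the pretrained matrix in a frozen layer.
weights = model.load_all_embeddings(word_index, num_words)
embedding_layer = torch.nn.Embedding.from_pretrained(weights, freeze=True)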
Example #22
def find_every_words_not_in_embeddings(embedding_path, vocab):
    oov = set()
    embeddings = load_embeddings(embedding_path)
    for token in vocab:
        # try the raw token plus common casings before declaring it OOV
        variants = (token, token.capitalize(), token.upper(), token.lower())
        if not any(variant in embeddings for variant in variants):
            oov.add(token)
    return oov
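Usage sketch; the embedding path and vocabulary are illustrative:

# Sketch: report the out-of-vocabulary count for a toy vocabulary.
vocab = ["the", "Transformer", "floccinaucinihilipilification"]
oov = find_every_words_not_in_embeddings("glove.6B.300d.txt", vocab)
print("%d/%d tokens missing" % (len(oov), len(vocab)))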
Example #23
def init():
    print 'Loading askubuntu training samples..'
    askubuntu_training_samples = utils.load_samples(
        '../data/askubuntu/train_random.txt')
    print len(askubuntu_training_samples)

    print 'Loading askubuntu dev samples..'
    askubuntu_dev_samples = utils.load_samples('../data/askubuntu/dev.txt')
    print len(askubuntu_dev_samples)

    print 'Loading askubuntu test samples..'
    askubuntu_test_samples = utils.load_samples('../data/askubuntu/test.txt')
    print len(askubuntu_test_samples)

    print 'Loading askubuntu corpus..'
    askubuntu_question_map = utils.load_corpus(
        '../data/askubuntu/text_tokenized.txt')
    print len(askubuntu_question_map)

    print 'Loading android dev samples..'
    android_dev_samples = utils.load_samples_stupid_format(
        '../data/android/dev.pos.txt', '../data/android/dev.neg.txt')
    print len(android_dev_samples)

    print 'Loading android test samples..'
    android_test_samples = utils.load_samples_stupid_format(
        '../data/android/test.pos.txt', '../data/android/test.neg.txt')
    print len(android_test_samples)

    print 'Loading android corpus..'
    android_question_map = utils.load_corpus('../data/android/corpus.tsv')
    print len(android_question_map)
    
    print 'Loading stop words..'
    stop_words = utils.load_stop_words('../data/english_stop_words.txt')
    print len(stop_words)

    corpus_texts = map(lambda (t, b): t + ' ' + b,
                       askubuntu_question_map.values() + android_question_map.values())
    
    print 'Loading embeddings..'
    embedding_map = utils.load_embeddings(
        '../data/pruned_android_vector.txt', corpus_texts, stop_words)  # pruned_askubuntu_android_vector.txt
    print len(embedding_map)
    print

    utils.store_embedding_map(embedding_map)

    return (
        askubuntu_training_samples,
        askubuntu_dev_samples,
        askubuntu_test_samples,
        askubuntu_question_map,
        android_dev_samples,
        android_test_samples,
        android_question_map,
        embedding_map)
Example #24
def train_network(vectorizer, network_type, task_type, train_table,
                  setting_name):
    """
    Main function of vectorization for neural network

    network_type : str
        type of the network, which should be presented in NETWORKS dictionary.
    task_type : str
        TTK_TASK or BANK_TASK
    train_table : str
        Train table filepath

    returns : None
    """
    message_settings, features_settings = utils.load_embeddings()

    features = Features(
            TwitterMessageParser(message_settings, task_type),
            features_settings)

    term_vocabulary = TermVocabulary()
    doc_vocabulary = DocVocabulary()

    problem = utils.create_problem(task_type,
                                   'train',
                                   train_table,
                                   vectorizer,
                                   features,
                                   term_vocabulary,
                                   doc_vocabulary,
                                   message_settings)

    assert(len(problem) > 0)

    X, y = get_problem(problem, get_results=True)

    embedding_size = X.shape[1]
    logging.info("embedding_size: {}".format(embedding_size))
    logging.info("Create RNN network model ...")

    # TODO:
    # Network settings should be kept in a json configuration (apparently
    # rnn.conf)
    hidden_size = 400
    model = get_network(network_type, embedding_size, hidden_size)
    paths = get_model_paths(task_type, network_type, setting_name)

    logging.info("Pack embedding settings: {} ...".format(
        paths['embedding_output']))
    save_embeddings(paths['embedding_output'])

    logging.info("Save term vocabulary: {} ...".format(
        paths['term_vocabulary']))
    term_vocabulary.save(paths['term_vocabulary'])

    optimizer.train_network(model, X, y, paths['model_output'])
Example #25
    def __init__(self, csvpath, mode='train'):
        self.mode = mode
        df = pd.read_csv(csvpath)
        le = LabelEncoder()
        X = list(df['text'])
        if self.mode == 'train':
            # Tokenize the training text
            tok = keras.preprocessing.text.Tokenizer(num_words=1000)
            tok.fit_on_texts(X)
            self.tok = tok
            # integer encode documents
            X_train = tok.texts_to_sequences(X)
            # pad so all sequences have the same length
            X_train = keras.preprocessing.sequence.pad_sequences(
                X_train, padding='post')
            self.maxpad = X_train.shape[1]
            self.inp = X_train
            self.oup = list(df['target'])
            # Load word embeddings
            word_counts = pd.DataFrame(
                dict(tok.word_counts),
                index=['count']).transpose().sort_values(by='count',
                                                         ascending=False)
            num_words = len(word_counts)
            tok_dict = dict(tok.index_word)
            word_embeddings_dict = utils.load_embeddings(
                './data/non_tracked/glove.6B.100d.txt')
            # Create the embedding_matrix for the words in our vocabulary
            embeddings_words = list(word_embeddings_dict.keys())
            wordvec_dim = word_embeddings_dict[
                embeddings_words[0]].shape[0]
            # +1 because Keras tokenizer indices start at 1
            embedding_matrix = np.zeros((num_words + 1, wordvec_dim))
            for i, word in tok_dict.items():
                # Look up the word embedding
                vector = word_embeddings_dict.get(word, None)
                # Record in matrix
                if vector is not None:
                    embedding_matrix[i, :] = vector
            self.embedding_matrix = embedding_matrix
        else:
            # transform test data with the tokenizer fitted during training
            # (assumes self.tok and self.maxpad were set in train mode)
            X_test = self.tok.texts_to_sequences(X)
            X_test = keras.preprocessing.sequence.pad_sequences(
                X_test, padding='post', maxlen=self.maxpad)
            self.inp = X_test
Example #26
def create_oov(dataset, embeddings_path):
    sentences = dataset.get_train_sentences + dataset.get_valid_sentences + dataset.get_test_sentences
    
    embeddings = load_embeddings(embeddings_path)

    oov = sorted(set(word for sentence in sentences for word in sentence if word not in embeddings))

    filepath = './' + dataset.dataset_name + '/oov.txt'
    with open(filepath, 'w', encoding='utf-8') as file:
        file.write('\n'.join(oov)+'\n')
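Usage sketch; the dataset object and embedding path are assumptions:

# Sketch: dataset must expose the sentence lists and dataset_name used above.
create_oov(dataset, './embeddings/glove.6B.100d.txt')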
Example #27
def main():
    print("Loading embeddings")
    #load embeddings
    if os.name == "nt":
        # embd_dict = utils.load_embeddings(
        #    "C:/Users/anlausch/workspace/cnn-text-classification/data/GoogleNews-vectors-negative300.bin", word2vec=True)
        embd_dict = utils.load_embeddings(
            "C:/Users/anlausch/workspace/embedding_files/glove.6B/glove.6B.50d.txt",
            word2vec=False)
    else:
        # embd_dict = utils.load_embeddings("~/GoogleNews-vectors-negative300.bin", word2vec=True)
        embd_dict = utils.load_embeddings("./glove.6B.300d.txt",
                                          word2vec=False)

    # print("Grid Search with SVM for TFIDF Embedding Features")
    # print("===========================================")
    # for task in ["discourse", "aspect", "summary"]:
    #     #grid_search_linear_svm_tfidf(task=task)
    #     grid_search_rbf_svm_tfidf_embeddings(embd_dict=embd_dict, task=task)
    #
    # print("Grid Search with SVM for Embedding Features")
    # print("===========================================")
    # for task in ["discourse", "aspect", "summary"]:
    #     #grid_search_linear_svm_tfidf(task=task)
    #     grid_search_rbf_svm_embeddings(embd_dict=embd_dict, task=task)

    print("Grid Search with SVM for TFIDF")
    print("===========================================")
    for task in ["discourse", "aspect", "summary"]:
        #grid_search_linear_svm_tfidf(task=task)
        grid_search_rbf_svm_tfidf(task=task)

    print("Grid Search with SVM linear embeddings")
    print("===========================================")
    for task in ["discourse", "aspect", "summary"]:
        #grid_search_linear_svm_tfidf(task=task)
        grid_search_linear_svm_embeddings(embd_dict=embd_dict, task=task)

    print("Grid Search with SVM linear for TFIDF")
    print("===========================================")
    for task in ["discourse", "aspect", "summary"]:
        #grid_search_linear_svm_tfidf(task=task)
        grid_search_linear_svm_tfidf(task=task)
Example #28
def load_embedding_dict(vocab_path="",
                        vector_path="",
                        embeddings_path="",
                        glove=False,
                        postspec=False):
    """
  >>> _load_embedding_dict()
  :param vocab_path:
  :param vector_path:
  :return: embd_dict
  """
    if glove and postspec:
        raise ValueError("Glove and postspec cannot both be true")
    elif glove:
        if os.name == "nt":
            embd_dict = utils.load_embeddings(
                "C:/Users/anlausch/workspace/embedding_files/glove.6B/glove.6B.300d.txt",
                word2vec=False)
        else:
            embd_dict = utils.load_embeddings(
                "/work/anlausch/glove.6B.300d.txt", word2vec=False)
        return embd_dict
    elif postspec:
        embd_dict_temp = utils.load_embeddings(
            "/work/anlausch/ft_postspec.txt", word2vec=False)
        embd_dict = {}
        for key, value in embd_dict_temp.items():
            embd_dict[key.split("en_")[1]] = value
        assert ("test" in embd_dict)
        assert ("house" in embd_dict)
        return embd_dict
    elif embeddings_path != "":
        embd_dict = utils.load_embeddings(embeddings_path, word2vec=True)
        return embd_dict
    else:
        embd_dict = {}
        vocab = load_vocab_goran(vocab_path)
        vectors = load_vectors_goran(vector_path)
        for term, index in vocab.items():
            embd_dict[term] = vectors[index]
        assert len(embd_dict) == len(vocab)
        return embd_dict
Example #29
    def __init__(self, task):
        self.ckpt_path = './ckpt/{}/'.format(task)
        if not os.path.exists(self.ckpt_path):
            os.makedirs(self.ckpt_path)
        source_dir = os.path.join('.', 'dataset', 'data', task)
        self.word_vocab, _ = load_vocab(os.path.join(source_dir, 'words.vocab'))
        self.char_vocab, _ = load_vocab(os.path.join(source_dir, 'chars.vocab'))
        self.vocab_size = len(self.word_vocab)
        self.char_vocab_size = len(self.char_vocab)
        self.label_size = load_json(os.path.join(source_dir, 'label.json'))["label_size"]
        self.word_emb = load_embeddings(os.path.join(source_dir, 'glove.filtered.npz'))
Example #30
def predict_image(file):
    # file = request.files['file']
    # file = os.path.join(APP_ROOT, 'uploads/Abdulrahman Safh.png')
    # Read image file as numpy array of RGB dimension
    #img = io.imread(fname=file)
    img = imread(name=file, mode='RGB')
    # Detect and crop a 160 x 160 image containing a human face in the image file
    faces, rects = get_faces_live(img=img, pnet=pnet, rnet=rnet,
                           onet=onet, image_size=image_size)
    #global d
    # If there are human faces detected
    if faces:
        embedding_dict = load_embeddings()
        if embedding_dict:
            people_found = []
            for i in range(len(faces)):
                face_img = faces[i]
                rect = rects[i]

                face_embedding = forward_pass(
                    img=face_img, session=facenet_persistent_session,
                    images_placeholder=images_placeholder, embeddings=embeddings,
                    phase_train_placeholder=phase_train_placeholder,
                    image_size=image_size
                )

                # Compare euclidean distance between this embedding and the embeddings in 'embeddings/'
                identity = identify_face(
                    embedding=face_embedding, embedding_dict=embedding_dict)
                people_found.append(identity)

                cv2.rectangle(img, (rect[0], rect[1]), (rect[2], rect[3]), (0, 255, 0), 3)

                W = int(rect[2] - rect[0]) // 2
                H = int(rect[3] - rect[1]) // 2

                cv2.putText(img, identity, (rect[0] + W - (W // 2), rect[1] - 7),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 255), 1, cv2.LINE_AA)

            # code for saving the output images
            # cv2.imwrite("SavedImgesFull/file_%d.jpg" % d, img)
            #d += 1
            return people_found

        else:
            # return ["No Face"]
            return None
            # return render_template(
            #     'predict_result.html',
            #     identity="No embedding files detected! Please upload image files for embedding!"
            # )
    else:
        # return ["No Image"]
        return None
Example #31
    def __init__(self):

        # Logger
        self.logger = logger_init()
        # Use Cuda
        Config.cuda = True
        self.device = None
        if Config.cuda and torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')

        ################## Data ###################
        # Load Sparse Adjacency Matrix
        file_name = 'adj_input.pkl'
        (data, rows, columns, vocab_dict) = pd.read_pickle(file_name)
        id_word_map = {v: k for k, v in vocab_dict.items()}
        rel_list = ['ISA']
        num_entities = len(vocab_dict)
        num_relations = len(rel_list)

        # Build the adjacency matrix and remove the edges which fre < 10.
        rows = rows + [i for i in range(num_entities)]
        columns = columns + [i for i in range(num_entities)]
        data = data + [1 for i in range(num_entities)]
        adjs = coo_matrix((data, (rows, columns)),
                          shape=(num_entities, num_entities)).toarray()
        # only hyponym-hypernym candidate pairs observed more than 10 times are used to create a noisy graph.
        adjs = np.where(adjs >= 10, 1, 0)
        self.adjs = torch.FloatTensor(adjs).to(device=self.device)
        del rows
        del columns
        del data

        # Use X as index for the randomly initialized embeddings
        self.X = torch.LongTensor([i for i in range(num_entities)
                                   ]).to(device=self.device)
        # Load the word embedding if we use it.
        self.word_embs = load_embeddings(vocab_dict).to(device=self.device)
        logging.info('Finished the preprocessing')

        ################## Model, Optimizer, LossFunction ###################
        self.model = GRAPH2TAXO(num_entities,
                                num_relations).to(device=self.device)
        self.opt = torch.optim.Adam(self.model.parameters(),
                                    lr=Config.learning_rate,
                                    weight_decay=Config.L2)
        self.f1_loss = F1_Loss().to(device=self.device)

        ################## Part of Hyperparameters ###################
        # Hyperparameters for the constraints
        self.lambda_A = 1.0  # 1.0
        self.c_A = 0.5  # 0.5
        self.tau_A = 1.0  # 1.0
Example #32
File: train.py Project: MERCURYCOA/ESIM
def train(preproc_dir, n_classes, max_length, hidden_units, dropout,
          batch_size, epochs, output_dir):
    """
    Train the ESIM model on some dataset and save the learned weights.

    Args:
        preproc_dir: The directory where the preprocessed data is saved.
        n_classes: The number of classes in the problem.
        max_length: The maximum length of the sentences in the premises and
                    hypotheses of the dataset.
        hidden_units: The number of hidden units to use in the various layers
                      of the model.
        dropout: The dropout rate to use in the model.
        batch_size: The size of the batches to use for training.
        epochs: The number of epochs to apply during training.
        output_dir: The path to the directory where the weights learned during
                    training must be saved.
    """
    print("Loading training and validation data...")
    train_premises, train_hyps, train_labels = prepare_data(
        preproc_dir, 'train', n_classes, max_length)
    valid_premises, valid_hyps, valid_labels = prepare_data(
        preproc_dir, 'dev', n_classes, max_length)
    # train_premises has the following form:
    # [[5, 6, 7, 8, 9, 3, 10, 11, 12, 13, 14, 2, 15, 16, 3,0,0,0,0],
    #  [17, 18, 19, 20, 21, 22, 4, 23, 2, 24,0,0,0,0,0,0,0,0,0],
    #  [25, 26, 27,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]]

    print("Loading embedding weights...")
    embedding_weights = load_embeddings(
        os.path.join(preproc_dir, "embedding_weights.pkl"))

    # Build the model.
    esim = ESIM(n_classes, embedding_weights, max_length, hidden_units,
                dropout)
    model = esim.build_model()

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    filepath = os.path.join(output_dir,
                            "weights-{epoch:02d}-{val_acc:.2f}.hdf5")
    checkpoint = ModelCheckpoint(filepath,
                                 monitor='val_acc',
                                 verbose=1,
                                 save_best_only=True,
                                 mode='max')

    model.fit(x=[train_premises, train_hyps],
              y=train_labels,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=([valid_premises, valid_hyps], valid_labels),
              callbacks=[checkpoint],
              shuffle=True)
Example #33
def prepare_problem(vectorizer, task_type, train_table, test_table,
                    etalon_table):
    """
    Main function of vectorization for neural network
    """
    message_settings, features_settings = utils.load_embeddings()

    features = Features(
            TwitterMessageParser(message_settings, task_type),
            features_settings)

    term_vocabulary = TermVocabulary()
    doc_vocabulary = DocVocabulary()

    train_problem = utils.create_problem(task_type, 'train', train_table,
                                         vectorizer, features, term_vocabulary,
                                         doc_vocabulary, message_settings)

    test_problem = utils.create_problem(task_type, 'test', test_table,
                                        vectorizer, features, term_vocabulary,
                                        doc_vocabulary, message_settings)

    return (train_problem, test_problem)
Example #34
cwd = os.getcwd()
vectorizer = Vectorizer(min_frequency=config.min_freq)

validation_data_path = cwd + config.relative_dev_path
validation_abstracts = headline2abstractdataset(validation_data_path, vectorizer, args.cuda, max_len=1000)

data_path = cwd + config.relative_data_path
abstracts = headline2abstractdataset(data_path, vectorizer, args.cuda, max_len=1000)
print("number of training examples: %d" % len(abstracts))

vocab_size = abstracts.vectorizer.vocabulary_size
embedding = nn.Embedding(vocab_size, config.emsize, padding_idx=0)

if config.pretrained:
    embedding = load_embeddings(embedding, abstracts.vectorizer.word2idx, config.pretrained, config.emsize)

context_encoder = ContextEncoder(config.context_dim, len(abstracts.context_vectorizer), config.emsize)

encoder_title = EncoderRNN(vocab_size, embedding, abstracts.head_len, config.emsize, input_dropout_p=config.dropout,
                     n_layers=config.nlayers, bidirectional=config.bidirectional, rnn_cell=config.cell)
encoder = EncoderRNN(vocab_size, embedding, abstracts.abs_len, config.emsize, input_dropout_p=config.dropout, variable_lengths = False,
                  n_layers=config.nlayers, bidirectional=config.bidirectional, rnn_cell=config.cell)
decoder = DecoderRNNFB(vocab_size, embedding, abstracts.abs_len, config.emsize, sos_id=2, eos_id=1,
                     n_layers=config.nlayers, rnn_cell=config.cell, bidirectional=config.bidirectional,
                     input_dropout_p=config.dropout, dropout_p=config.dropout)
model = FbSeq2seq(encoder_title, encoder, context_encoder, decoder)
total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1 else x.size()[0] for x in model.parameters())
print('Model total parameters:', total_params, flush=True)

if config.dataparallel and torch.cuda.device_count() > 1:
Example #35
def train_network(vectorizer, network_type, task_type, train_table,
                  test_table, etalon_table, setting_name):
    """
    Main function of vectorization for neural network
    """
    message_settings, features_settings = utils.load_embeddings()

    features = Features(
            TwitterMessageParser(message_settings, task_type),
            features_settings)

    term_vocabulary = TermVocabulary()
    doc_vocabulary = DocVocabulary()

    train_problem = utils.create_problem(task_type, 'train', train_table,
                                         vectorizer, features, term_vocabulary,
                                         doc_vocabulary, message_settings)

    test_problem = utils.create_problem(task_type, 'test', test_table,
                                        vectorizer, features, term_vocabulary,
                                        doc_vocabulary, message_settings)

    assert(len(train_problem) > 0 and len(test_problem) > 0)

    # Transform into appliable for neural network collections
    X_test = get_problem(test_problem, get_results=False)
    X_train, Y = get_problem(train_problem, get_results=True)

    assert(X_test.shape[1] == X_train.shape[1])

    embedding_size = X_test.shape[1]
    logging.info("embedding_size: {}".format(embedding_size))
    logging.info("Create {} network model ...".format(network_type))

    # TODO:
    # Network settings should be kept in a json configuration (apparently
    # rnn.conf)
    hidden_layer_size = 400
    model = get_network(network_type, embedding_size, hidden_layer_size)
    paths = get_model_paths(task_type, network_type, setting_name)

    diagnostic_output = join(configs.NETWORK_MODELS_ROOT,
                             "{}.diag".format(setting_name))

    logging.info("Pack embedding settings: {} ...".format(
        paths['embedding_output']))
    save_embeddings(paths['embedding_output'])

    def callback(model, X_test, X_train, Y, task_type, result_table,
                 etalon_table, diagnostic_output):
        """
        Test model
        """
        logging.info("Testing model ...")
        loss = model.calculate_loss(X_train, Y)
        predict(model, X_test, task_type, result_table)
        result = check(task_type, result_table, etalon_table)
        logging.info("Appending results: {} ...".format(diagnostic_output))
        with open(diagnostic_output, 'a') as output:
            output.writelines("{} {} {}\n".format(
                loss, result["F_macro"], result["F_micro"]))

    model_output = paths['model_output']
    if exists(model_output):
        logging.info("Loading existed model: {} ...".format(model_output))
        model.load(model_output)

    output_table = test_table + '.result.csv'
    prepare_result_table(test_table, output_table)

    test = lambda: callback(model, X_test, X_train, Y, task_type,
                            output_table, etalon_table, diagnostic_output)

    optimizer.train_network(model, X_train, Y, model_output, callback=test)

    with open(diagnostic_output, 'a') as output:
        output.writelines("-----")
Example #36
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")
FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================

# Load data
print("Loading data...")
vocab, embeddings = utils.load_embeddings()
train_data = utils.load_train_data(vocab, FLAGS.sequence_length)
test_data = utils.load_test_data(vocab, FLAGS.sequence_length)
print("Load done...")

# Training
# ==================================================

prev_auc = 0
with tf.Graph().as_default():
  with tf.device("/gpu:1"):
    session_conf = tf.ConfigProto(
      allow_soft_placement=FLAGS.allow_soft_placement,
      log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():