示例#1
0
    def __init__(self,
                 input_size: int = INPUT_SIZE,
                 output_size: int = OUTPUT_SIZE,
                 hidden_size: int = HIDDEN_SIZE,
                 embed_size: int = EMBED_SIZE,
                 lr: float = LEARNING_RATE,
                 clip_grad: float = CLIP_GRAD,
                 init_range: float = INIT_RANGE):
        """Create the input and output layer stacks and keep the hyper-parameters.

        Args:
            input_size: First argument of the input-side ``Embedding``.
            output_size: First argument of the output-side ``Embedding`` and
                second argument of the ``Softmax`` layer.
            hidden_size: Size passed to both ``LSTM`` layers and to ``Softmax``.
            embed_size: Size passed to both ``Embedding`` layers and both LSTMs.
            lr: Stored as ``self.lr``.
            clip_grad: Stored as ``self.clip_grad``.
            init_range: Forwarded to every layer constructor; not stored.
        """
        # Layers are instantiated in the same order as before so any
        # construction-time side effects happen in the same sequence.
        in_embedding = Embedding(input_size, embed_size, init_range)
        in_lstm = LSTM(embed_size, hidden_size, init_range)

        out_embedding = Embedding(output_size, embed_size, init_range)
        # The output-side LSTM receives the input-side LSTM via ``previous``.
        out_lstm = LSTM(embed_size, hidden_size, init_range,
                        previous=in_lstm)
        out_softmax = Softmax(hidden_size, output_size, init_range)

        self.input_layers = [in_embedding, in_lstm]
        self.output_layers = [out_embedding, out_lstm, out_softmax]

        self.hidden_size = hidden_size
        self.embed_size = embed_size
        self.input_size = input_size
        self.output_size = output_size
        self.lr = lr
        self.clip_grad = clip_grad
示例#2
0
def get_image_feature(img_path, img_list_path, model_path, epoch, gpu_id):
    """Compute an embedding and a faceness score for images named in a list file.

    Each line of the list file is split on single spaces: the first field is
    the image name (joined onto ``img_path``), the last field is the faceness
    score, and the fields in between are landmark coordinates that are
    reshaped to ``(5, 2)``.

    Args:
        img_path: Directory the image names are joined onto.
        img_list_path: Path of the text file listing the images.
        model_path: Forwarded to ``Embedding``.
        epoch: Forwarded to ``Embedding``.
        gpu_id: Forwarded to ``Embedding``.

    Returns:
        ``(img_feats, faceness_scores)``, both converted to float32 numpy arrays.
    """
    # Read the list inside a ``with`` block so the handle is closed; it was
    # previously left open for the lifetime of the process.
    with open(img_list_path) as img_list:
        files = img_list.readlines()
    embedding = Embedding(model_path, epoch, gpu_id)
    print('files:', len(files))
    faceness_scores = []
    img_feats = []
    for img_index, each_line in enumerate(files):
        if img_index % 500 == 0:
            print('processing', img_index)
            # NOTE(review): processing stops when index 2000 is reached, so at
            # most the first 2000 entries are embedded. This looks like a
            # debugging cap -- confirm before relying on full coverage.
            if img_index == 2000:
                break
        name_lmk_score = each_line.strip().split(' ')
        img_name = os.path.join(img_path, name_lmk_score[0])
        img = cv2.imread(img_name)
        lmk = np.array([float(x) for x in name_lmk_score[1:-1]],
                       dtype=np.float32)
        lmk = lmk.reshape((5, 2))
        img_feats.append(embedding.get(img, lmk))
        faceness_scores.append(name_lmk_score[-1])
    img_feats = np.array(img_feats).astype(np.float32)
    faceness_scores = np.array(faceness_scores).astype(np.float32)
    return img_feats, faceness_scores
def build_model(vocab: Vocabulary,
                args,
                **kwargs) -> Model:
    """Assemble a ``MortalityClassifier`` from an embedder, a CNN encoder and an optional regularizer.

    Args:
        vocab: Vocabulary; its ``"tokens"`` namespace size sets the number of
            embeddings.
        args: Object providing ``pretrained_WE_path`` and ``use_reg``.
        **kwargs: Forwarded unchanged to ``MortalityClassifier``.

    Returns:
        The constructed ``MortalityClassifier``.
    """
    print("Building the model")
    vocab_size = vocab.get_vocab_size("tokens")
    EMBED_DIMS = 200

    # Token embedding: initialised from a pretrained file when a path is given.
    if args.pretrained_WE_path:
        token_embedding = Embedding(embedding_dim=EMBED_DIMS, num_embeddings=vocab_size,
                                    pretrained_file=args.pretrained_WE_path, vocab=vocab)
    else:
        token_embedding = Embedding(embedding_dim=EMBED_DIMS, num_embeddings=vocab_size)
    embedder = BasicTextFieldEmbedder({"tokens": token_embedding})

    # ``num_filters`` applies to each n-gram size listed here.
    encoder = CnnEncoder(embedding_dim=EMBED_DIMS,
                         ngram_filter_sizes=(2, 3, 5),
                         num_filters=5)

    # One shared L2 regularizer for the three named parts, when enabled.
    if args.use_reg:
        l2_reg = L2Regularizer()
        regularizer_applicator = RegularizerApplicator(
            [(part, l2_reg) for part in ("embedder", "encoder", "classifier")])
    else:
        regularizer_applicator = None

    return MortalityClassifier(vocab, embedder, encoder, regularizer_applicator, **kwargs)
示例#4
0
 def __init__(self, mode=None, tuner=None, model_name="lgb") -> None:
     """Load stop words, embeddings, the label map and the data for the chosen mode.

     Args:
         mode: ``'train'`` or ``'predict'``; any falsy value means ``'train'``.
         tuner: Optional; when truthy it must be ``'bayes'`` or ``'grid'``.
         model_name: Stored as ``self.model_name``.

     Raises:
         AssertionError: If ``mode`` or ``tuner`` is not one of the accepted
             values (these checks are ``assert`` statements).
     """
     # Use context managers so both files are closed right after reading;
     # the handles were previously left open.
     with open(config.stopwords, encoding='utf-8', mode='r') as stop_file:
         self.stopWords = [line.strip() for line in stop_file]
     self.embedding = Embedding()
     self.embedding.load()
     with open(config.label2id_file, encoding='utf-8') as label_file:
         self.labelToIndex = json.load(label_file)
     # Reverse mapping: index -> label.
     self.ix2label = {v: k for k, v in self.labelToIndex.items()}
     self.mode = mode
     if not self.mode:
         self.mode = 'train'
     self.tuner = tuner
     assert self.mode in ['train', 'predict']
     if self.tuner:
         assert self.tuner in ['bayes', 'grid']
     # Rows containing NaN are dropped and the index is rebuilt for each file.
     if self.mode == "train":
         self.train_data = pd.read_csv(
             config.train_data_file,
             sep='\t').dropna().reset_index(drop=True)
         self.dev_data = pd.read_csv(
             config.eval_data_file,
             sep='\t').dropna().reset_index(drop=True)
     else:
         self.test_data = pd.read_csv(
             config.test_data_file,
             sep='\t').dropna().reset_index(drop=True)
     self.exclusive_col = ['text', 'lda', 'bow', 'label']
     self.model = None
     self.model_name = model_name
示例#5
0
def main():
    """Manual test entry point: load a patient and inspect the pre-season session.

    The patient ID is read from the first command line argument. Session
    lengths are printed when a pre-test exists, then the pre-test goes through
    STFT, a PCA embedding, wave extraction and window plotting before the
    function drops into the debugger.
    """
    pid = sys.argv[1]
    patient = Patient(pid)
    if patient.pre_test is not None:
        print("season start: {}".format(len(patient.pre_test.raw)))
        for i, session in enumerate(patient.intermediate_tests):
            print("concussion {}: {}".format(i, len(session.raw)))
        print("season end: {}".format(len(patient.post_test.raw)))

    prep.stft(patient.pre_test)
    examples = patient.pre_test.get_examples()
    emb = Embedding("pca")
    emb.train(examples)
    # Not used below, but left in scope for inspection in the debugger.
    emb_examples = emb.embed(examples)

    prep.extractWaves(patient.pre_test, n=4001, samplingRate=256, wave='alpha')
    patient.pre_test.extract_windows()
    channels = ["c3", "cz", "c4", "p3", "pz", "p4"]
    patient.pre_test.plot_windows(windows=np.arange(10), channels=channels)

    import pdb
    pdb.set_trace()
def create_representation(args):
    """Build the representation selected by a command line argument mapping.

    Args:
        args: Mapping with the keys ``'<representation>'``,
            ``'<representation_path>'``, ``'--w+c'`` and ``'--eig'``.

    Returns:
        The representation object for the requested type; any type other than
        ``'PPMI'`` and ``'SVD'`` is loaded through ``Embedding.load``.

    Raises:
        Exception: If ``--w+c`` is set together with the ``'PPMI'`` type.
    """
    rep_type = args['<representation>']
    path = args['<representation_path>']
    w_c = args['--w+c']
    eig = float(args['--eig'])

    if rep_type == 'PPMI':
        if w_c:
            raise Exception('w+c is not implemented for PPMI.')
        return Explicit.load(path, True)

    if rep_type == 'SVD':
        if not w_c:
            return SVDEmbedding(path, True, eig)
        first = SVDEmbedding(path, False, eig, False)
        second = SVDEmbedding(path, False, eig, True)
        return EnsembleEmbedding(first, second, True)

    if not w_c:
        return Embedding.load(path, True)
    words = Embedding.load(path + '.words', False)
    contexts = Embedding.load(path + '.contexts', False)
    return EnsembleEmbedding(words, contexts, True)
示例#7
0
    def __init__(self,
                 logger=Log(print),
                 embedding_file='data/wiki-news-300d-1M.vec',
                 bots_file='data/bots_tweets.txt',
                 human_file='data/human_tweets.txt',
                 validation_split=0.2,
                 test_split=0.2,
                 batch_size=50,
                 epochs=25,
                 additional_feats_enabled=True,
                 custom_callback=None,
                 early_stopping=5,
                 dataset_config=DatasetConfig.USER_STATE):
        """Store the training configuration and build the dataset and embedding helpers.

        Args:
            logger: Passed to ``DatasetBuilder`` and ``Embedding``; also stored.
            embedding_file: Path handed to ``Embedding``.
            bots_file: Stored as ``self.bots_file``.
            human_file: Stored as ``self.human_file``.
            validation_split: Stored as ``self.validation_split``.
            test_split: Stored as ``self.test_split``.
            batch_size: Stored as ``self.batch_size``.
            epochs: Stored as ``self.epochs``.
            additional_feats_enabled: Stored as ``self.additional_feats_enabled``.
            custom_callback: Stored as ``self.custom_callback``.
            early_stopping: Stored as ``self.early_stopping``.
            dataset_config: Two-element value; its second element is kept as
                ``self.dataset_config_name``.
        """
        # Helper objects (constructed in the same order as before).
        self.dataset = DatasetBuilder(logger, dataset_config)
        _, self.dataset_config_name = dataset_config
        self.logger = logger
        self.custom_callback = custom_callback
        self.embedding = Embedding(logger, embedding_file)
        self.model = None  # initialize later

        # Training configuration.
        self.additional_feats_enabled = additional_feats_enabled
        self.batch_size, self.epochs = batch_size, epochs
        self.early_stopping = early_stopping
        self.validation_split, self.test_split = validation_split, test_split

        # Input files.
        self.bots_file, self.human_file = bots_file, human_file

        # Containers that start out empty.
        self.x_bot_tweets = []
        self.bot_tweets = []
        self.bot_test_tweets = []
        self.doc_test_tweets = []
        self.labels_test = []
        self.labels_test = []
def create_representation(args):
    """Build the representation selected by a command line argument mapping.

    Args:
        args: Mapping with the keys ``'<representation>'``,
            ``'<representation_path>'``, ``'--neg'``, ``'--w+c'``, ``'--eig'``
            and ``'--normalize'``.

    Returns:
        The representation object for the requested type; any type other than
        ``'PPMI'`` and ``'SVD'`` yields an ``Embedding`` (or an ensemble of two
        when ``--w+c`` is set).

    Raises:
        Exception: If ``--w+c`` is set together with the ``'PPMI'`` type.
    """
    rep_type = args['<representation>']
    path = args['<representation_path>']
    neg = int(args['--neg'])
    w_c = args['--w+c']
    eig = float(args['--eig'])
    normalize = args['--normalize']

    if rep_type == 'PPMI':
        if w_c:
            raise Exception('w+c is not implemented for PPMI.')
        return PositiveExplicit(path, normalize, neg)

    if rep_type == 'SVD':
        if not w_c:
            return SVDEmbedding(path, normalize, eig)
        first = SVDEmbedding(path, normalize, eig, False)
        second = SVDEmbedding(path, normalize, eig, True)
        return EnsembleEmbedding(first, second, normalize)

    if not w_c:
        return Embedding(path + '.words', normalize)
    words = Embedding(path + '.words', normalize)
    contexts = Embedding(path + '.contexts', normalize)
    return EnsembleEmbedding(words, contexts, normalize)
示例#9
0
def dimension_afn(x):
    """Run ``Embedding.afn`` on ``x`` for dimensions 1..21 and plot the ratios.

    The two arrays returned by ``afn`` are turned into ratios of consecutive
    entries (entry ``d + 1`` divided by entry ``d``) before being handed to
    ``plot_afn``.
    """
    embedder = Embedding(x)

    dim = np.arange(1, 20 + 2)
    E, Es = embedder.afn(x, tau=138, dim=dim, window=45, metric='chebyshev')
    ratio_e = E[1:] / E[:-1]
    ratio_es = Es[1:] / Es[:-1]
    embedder.plot_afn(dim, ratio_e, ratio_es)
    def __init__(self,
                 char_dict_size: int = 10,
                 char_embedding_dim: int = 8,
                 word_dict_size: int = 10,
                 word_embedding_dim: int = 300,
                 char_filter_dim: int = 100,
                 n_gram_sizes: Tuple[int, ...] = [5],
                 rnn_dim: int = 100,
                 keep_prob: float = .8,
                 bidirectional: bool = True):
        """Create the sub-modules of the Bidaf model.

        Args:
            char_dict_size: First argument of the character ``Embedding``.
            char_embedding_dim: Character embedding size; also the first
                argument of the character CNN encoder.
            word_dict_size: First argument of the word ``Embedding``.
            word_embedding_dim: Word embedding size.
            char_filter_dim: Second argument of the character CNN encoder.
            n_gram_sizes: Passed to ``CNNEncoder``.
                NOTE(review): the default is a list although the annotation
                says tuple, and it is a shared mutable default -- it is not
                mutated in this method.
            rnn_dim: Second argument of both ``RNNBaseModule`` instances.
            keep_prob: Forwarded as ``keep_prob`` to both ``RNNBaseModule``
                instances.
            bidirectional: Forwarded to both ``RNNBaseModule`` instances.
        """
        super(Bidaf, self).__init__()

        self.char_embedding = Embedding(char_dict_size, char_embedding_dim)
        self.word_embedding = Embedding(word_dict_size, word_embedding_dim)

        self.char_cnn_encoder = CNNEncoder(char_embedding_dim, char_filter_dim,
                                           n_gram_sizes)

        # The highway and contextual layers are both sized to
        # char_filter_dim + word_embedding_dim.
        self.highway = Highway(char_filter_dim + word_embedding_dim,
                               num_layers=2)

        self.contextual_embedding = RNNBaseModule(char_filter_dim +
                                                  word_embedding_dim,
                                                  rnn_dim,
                                                  keep_prob=keep_prob,
                                                  bidirectional=bidirectional)

        # NOTE(review): the input size rnn_dim * 2 is used regardless of the
        # ``bidirectional`` flag -- confirm it is correct when that is False.
        self.model_layers = RNNBaseModule(rnn_dim * 2,
                                          rnn_dim,
                                          num_layers=2,
                                          keep_prob=keep_prob,
                                          bidirectional=bidirectional)

        self.output_module = MultiDimLinear(rnn_dim * 2, 1)
def create_representation(args):
    """Build the representation selected by a command line argument mapping.

    Args:
        args: Mapping with the keys ``'<representation>'``,
            ``'<representation_path>'``, ``'--neg'``, ``'--w+c'`` and
            ``'--eig'``.

    Returns:
        The representation for ``'PPMI'``, ``'SVD'``, ``'SGNS'``,
        ``'discriminative'``, ``'discriminative_SGNS'`` or ``'projective'``;
        ``None`` for any other type.

    Raises:
        Exception: If ``--w+c`` is set together with the ``'PPMI'`` type.
    """
    rep_type = args['<representation>']
    path = args['<representation_path>']
    neg = int(args['--neg'])
    w_c = args['--w+c']
    eig = float(args['--eig'])

    if rep_type == 'PPMI':
        if w_c:
            raise Exception('w+c is not implemented for PPMI.')
        return PositiveExplicit(path, True, neg)

    if rep_type == 'SVD':
        if not w_c:
            return SVDEmbedding(path, True, eig)
        first = SVDEmbedding(path, False, eig, False)
        second = SVDEmbedding(path, False, eig, True)
        return EnsembleEmbedding(first, second, True)

    if rep_type == 'SGNS':
        if not w_c:
            return Embedding(path + '.words', True)
        words = Embedding(path + '.words', False)
        contexts = Embedding(path + '.contexts', False)
        return EnsembleEmbedding(words, contexts, True)

    if rep_type == 'discriminative':
        return discriminative_embedding(path, True, eig)
    if rep_type == 'discriminative_SGNS':
        return discriminative_SGNS(path, True)
    if rep_type == 'projective':
        return Projective_embedding(path)

    # Unrecognised types fall through and yield None.
    return None
示例#12
0
def main(model_num=1):
    """Train ``CustomModel`` on the train split and predict on the dev set.

    Args:
        model_num: Passed to the ``CustomModel`` constructor.
    """
    # Hyper-parameters.
    MAX_SEQUENCE_LENGTH = 24
    LSTM_DIM = 64
    HIDDEN_LAYER_DIM = 30
    NUM_CLASSES = 4
    GAUSSIAN_NOISE = 0.1
    DROPOUT = 0.2
    DROPOUT_LSTM = 0.2
    BATCH_SIZE = 200

    # Load the raw texts and labels.
    preprocess = Preprocess()
    texts_train, labels_train = preprocess.preprocessData(
        '../projet2/train.txt', mode="train")
    texts_dev, labels_dev = preprocess.preprocessData(
        '../projet2/dev.txt', mode="train")

    # Hold out 20% of the training data for validation.
    X_train, X_val, y_train, y_val = train_test_split(
        texts_train, labels_train, test_size=0.2, random_state=42)

    onehot_train = to_categorical(np.asarray(y_train))
    onehot_val = to_categorical(np.asarray(y_val))
    onehot_dev = to_categorical(np.asarray(labels_dev))

    embedding = Embedding('../projet2/emosense.300d.txt')
    embeddings = embedding.getMatrix()
    tokenizer = embedding.getTokenizer()

    # get_sequences yields three sequence sets per data split.
    train_1, train_2, train_3 = get_sequences(
        X_train, MAX_SEQUENCE_LENGTH, tokenizer)
    val_1, val_2, val_3 = get_sequences(
        X_val, MAX_SEQUENCE_LENGTH, tokenizer)
    dev_1, dev_2, dev_3 = get_sequences(
        texts_dev, MAX_SEQUENCE_LENGTH, tokenizer)

    model = CustomModel(model_num)
    model.build(embeddings,
                MAX_SEQUENCE_LENGTH,
                LSTM_DIM,
                HIDDEN_LAYER_DIM,
                NUM_CLASSES,
                noise=GAUSSIAN_NOISE,
                dropout_lstm=DROPOUT_LSTM,
                dropout=DROPOUT)
    model.summary()
    history = model.train(train_1, train_2, train_3, onehot_train,
                          val_1, val_2, val_3, onehot_val)

    y_pred = model.predict([dev_1, dev_2, dev_3])
def main(model_dir):
    """Load an ``Embedding`` from ``model_dir`` and print three sample queries.

    Both the numpy and torch random generators are seeded with 0 first.
    """
    np.random.seed(0)
    torch.manual_seed(0)
    model = Embedding(model_dir)

    neighbours = model.nearest_k('god')
    print(neighbours)

    score = model.similarity('god', 'wickedly')
    print(score)

    analogies = model.analogy('god', 'love', 'satan', k=5)
    print(analogies)

    print()
示例#14
0
 def __init__(self):
     """Instantiate every component used by GeoWINE and report success."""
     # Components are constructed eagerly, one after another, with default
     # arguments.
     self.img_cropper = ImageCropper()
     self.geolocation_model = GeolocationEstimator()
     self.embedding_model = Embedding()
     self.entity_retriever = EntityRetriever()
     self.news_api = NewsArticlesApi()
     self.events_api = OekgEventsApi()
     # Only reached when all constructors above returned without raising.
     print('Loaded GeoWINE successfully.')
示例#15
0
def dimension_fnn(x):
    """Run ``Embedding.fnn`` on ``x`` for dimensions 1..20 and plot the result.

    The three values returned by ``fnn`` are passed straight to ``plot_fnn``.
    """
    embedder = Embedding(x)

    dim = np.arange(1, 20 + 1)
    fnn_kwargs = dict(tau=14, dim=dim, window=10, metric='cityblock')
    first, second, third = embedder.fnn(x, **fnn_kwargs)
    embedder.plot_fnn(dim, first, second, third)
示例#16
0
 def forward(self, xs):
     """Apply a fresh ``Embedding`` layer to every column of ``xs``.

     ``xs`` is two-dimensional; each column ``xs[:, t]`` is fed through its
     own ``Embedding(self.W)`` and the result is written to ``out[:, t, :]``.
     The per-column layers are kept in ``self.layers``.

     Returns:
         Array of shape ``(xs.shape[0], xs.shape[1], self.W.shape[1])`` with
         dtype ``'f'``.
     """
     batch_size, num_steps = xs.shape
     _, embed_dim = self.W.shape
     out = np.empty((batch_size, num_steps, embed_dim), dtype='f')
     self.layers = []
     for step in range(num_steps):
         step_layer = Embedding(self.W)
         out[:, step, :] = step_layer.forward(xs[:, step])
         self.layers.append(step_layer)
     return out
示例#17
0
def test_embedding():
    """Check ``Embedding.compute`` and ``pair_embed`` on fixed inputs."""
    np.random.seed(42)

    # With all-ones random bits the two bit strings must sum to 0 and 1.
    all_ones = np.ones(100, dtype=int)
    for bit_string, expected_sum in (("00001111", 0), ("10001111", 1)):
        assert np.sum(Embedding(bit_string, all_ones).compute()) == expected_sum

    # Seeded random bits: expected pair distances for two input pairs.
    random_bits = np.random.randint(2, size=80)
    s1 = "1" * 10 + "0" * 5
    s2 = "1" * 11 + "0" * 4
    assert pair_embed(s1, s2, random_bits) == 1
    assert pair_embed("bad", "boy", random_bits) == 16
示例#18
0
 def _preprocess(self):
     """Clean ``self.data``, segment the questions and attach sentence embeddings.

     Adds the columns ``'qs_processed'`` (space-joined ``jieba`` tokens of
     ``'question'``) and ``'qs_embed'`` (sentence embedding of those tokens),
     and sets ``self.embedding``.
     """
     def segment(question):
         # Space-join the tokens produced by jieba.
         return ' '.join(jieba.cut(question))

     def embed(processed):
         return self.embedding.sentence_embedding(processed.split())

     self.data.dropna(inplace=True)
     self.data['qid'] = self.data['qid'].astype(int)
     self._gen_vocab()
     self.data['qs_processed'] = self.data['question'].apply(segment)
     self.embedding = Embedding(self.config['vocab_path'],
                                self.config['w2v_path'])
     # The 'qid' cast is repeated here, exactly as it was before.
     self.data['qid'] = self.data['qid'].astype(int)
     self.data['qs_embed'] = self.data['qs_processed'].apply(embed)
示例#19
0
def get_embedding_matrix_and_vectorizer(conversations):
    """Fit a vectorizer on ``conversations`` and build its embedding matrix.

    The fitted tokenizer is saved to ``TOKENIZER_PATH`` as a side effect.

    Returns:
        ``(embedding_matrix, vectorizer)``.
    """
    from vectorize import Vectorize
    fitted = Vectorize(conversations, MAX_VOCAB_SIZE)
    vocab_index = fitted.word2idx
    fitted.save_tokenizer(TOKENIZER_PATH)

    from embedding import Embedding
    matrix = Embedding(vocab_index).get_embedding_matrix()

    return matrix, fitted
class TestEmbedding(unittest.TestCase):
    """Unit tests for ``Embedding`` built on a 7x3 weight matrix holding 0..20."""

    def setUp(self):
        weights = np.arange(21).reshape(7, 3)
        self.embedding = Embedding(weights)
        self.index = np.array([0, 2, 0, 4])

    def test_params(self):
        # ``params`` holds exactly one array, equal to the weight matrix.
        (params,) = self.embedding.params
        expected = np.arange(21).reshape(7, 3)
        assert_array_equal(expected, params)

    def test_grads(self):
        # Before any backward pass the single gradient array is all zeros.
        (grads,) = self.embedding.grads
        expected = np.zeros((7, 3), dtype=int)
        assert_array_equal(expected, grads)

    def test_forward(self):
        # Forward selects rows 0, 2, 0 and 4 of the weight matrix.
        expected = np.array([
            [0, 1, 2],
            [6, 7, 8],
            [0, 1, 2],
            [12, 13, 14],
        ])
        out = self.embedding.forward(self.index)
        assert_array_equal(expected, out)

    def test_backward(self):
        # The forward output is reused as the upstream gradient; row 0 is
        # selected twice, so its gradient is the sum of both contributions.
        expected = np.array([
            [0, 2, 4],
            [0, 0, 0],
            [6, 7, 8],
            [0, 0, 0],
            [12, 13, 14],
            [0, 0, 0],
            [0, 0, 0],
        ])
        dout = self.embedding.forward(self.index)
        self.embedding.backward(dout)
        (grads,) = self.embedding.grads
        assert_array_equal(expected, grads)
示例#21
0
def check_embedding_quality(conversations):
    """Return the result of ``Embedding.check_coverage`` for ``conversations``.

    The vectorizer's word counts are compared with the embedding vocabulary.
    """
    from vectorize import Vectorize
    fitted = Vectorize(conversations, MAX_VOCAB_SIZE)
    vocab_index = fitted.word2idx

    from embedding import Embedding
    embedder = Embedding(vocab_index)

    document_counts = fitted.word_counts
    known_vocab = embedder.get_embedding_vocab()
    missing = embedder.check_coverage(document_counts, known_vocab)
    print('Collected oov words.')
    return missing
示例#22
0
文件: lib.py 项目: moomou/mlab
def raw_txt_to_embedding(embedding_file, content):
    """Tokenize ``content`` with spaCy and print the embeddings of its tokens.

    Args:
        embedding_file: Passed to the ``Embedding`` constructor.
        content: Raw text run through the spaCy pipeline.
    """
    # spaCy pipeline using the custom tokenizer factory.
    pipeline = spacy.load('en', create_make_doc=PlangTokenizer)
    embedder = Embedding(embedding_file)

    doc = pipeline(content)
    vectors = embedder.words_to_embeddings(doc)

    # TODO: pipe to this to a keras model
    print(vectors)
示例#23
0
    def __init__(self,
                 n_token,
                 n_layer,
                 n_head,
                 d_model,
                 d_head,
                 d_inner,
                 dropout,
                 dropatt,
                 tie_weight=True,
                 d_embed=None,
                 div_val=1,
                 tie_projs=[False],
                 pre_lnorm=False,
                 tgt_len=None,
                 ext_len=None,
                 mem_len=None,
                 cutoffs=[],
                 adapt_inp=False,
                 same_length=False,
                 clamp_len=-1,
                 sample_softmax=-1,
                 demographics_len=0):
        """Build the embedding, decoder layers, positional embedding and output head.

        Args:
            n_token: Number of tokens; sizes the word embedding and the output
                of the final linear layer.
            n_layer: Number of ``DecoderLayer`` instances to stack.
            n_head, d_model, d_head, d_inner: Forwarded to each ``DecoderLayer``.
            dropout: Rate for ``self.drop``; also forwarded to each layer.
            dropatt: Forwarded to each ``DecoderLayer``.
            d_embed: Embedding size; falls back to ``d_model`` when ``None``.
            pre_lnorm: Forwarded to each ``DecoderLayer``.
            tgt_len, ext_len: Stored; their sum becomes ``self.max_klen``.
            clamp_len: Stored as ``self.clamp_len``.
            demographics_len: Added to ``d_embed`` for the input size of
                ``self.fc``.
            tie_weight, div_val, tie_projs, mem_len, cutoffs, adapt_inp,
            same_length, sample_softmax: Accepted but not used in this method.

        NOTE(review): ``tie_projs`` and ``cutoffs`` have mutable list defaults;
        they are not mutated here.
        """
        super(MemTransformerLM, self).__init__()
        self.n_token = n_token

        d_embed = d_model if d_embed is None else d_embed
        self.d_embed = d_embed
        self.d_model = d_model
        self.n_head = n_head
        self.d_head = d_head

        self.word_emb = Embedding(n_token, d_embed)
        self.drop = nn.Dropout(dropout)

        self.n_layer = n_layer

        self.tgt_len = tgt_len
        self.ext_len = ext_len
        # NOTE(review): with the default tgt_len=None / ext_len=None this
        # addition raises TypeError, so callers must pass both as numbers.
        self.max_klen = tgt_len + ext_len

        self.clamp_len = clamp_len
        self.layers = nn.ModuleList()
        for i in range(n_layer):
            self.layers.append(
                DecoderLayer(n_head,
                             d_model,
                             d_head,
                             d_inner,
                             dropout,
                             dropatt=dropatt,
                             pre_lnorm=pre_lnorm))

        self.pos_emb = PositionalEmbedding(self.d_model)
        self.loss = nn.BCEWithLogitsLoss()
        self.demographics_len = demographics_len
        # Final projection from (d_embed + demographics_len) features to
        # n_token outputs.
        self.fc = nn.Linear(self.d_embed + self.demographics_len,
                            self.n_token,
                            bias=True)
        # Applied last, once every sub-module above exists.
        weights_init(self)
示例#24
0
    def __init__(
        self,
        dataset='data/185_baseball.csv',
        columns=None,
        tree='ontologies/class-tree_dbpedia_2016-10.json',
        embedding='models/wiki2vec/en.model',
        row_agg_func=mean_of_rows,
        tree_agg_func=np.mean,
        source_agg_func=mean_of_rows,
        max_num_samples=1e6,
        verbose=False,
    ):
        """Set up the embedding, the embedded dataset and the embedded class tree.

        ``embedding``, ``dataset`` and ``tree`` may each be given either as a
        ready-made object (``Embedding``, ``EmbeddedDataset``,
        ``EmbeddedClassTree``) or as a path from which that object is built.

        Args:
            dataset: ``EmbeddedDataset`` instance or dataset path.
            columns: Forwarded to ``EmbeddedDataset`` when it is built here.
            tree: ``EmbeddedClassTree`` instance or tree path.
            embedding: ``Embedding`` instance or embedding path.
            row_agg_func: Stored as ``self.row_agg_func``.
            tree_agg_func: Stored as ``self.tree_agg_func``.
            source_agg_func: Stored as ``self.source_agg_func``.
            max_num_samples: Stored as ``self.max_num_samples``.
            verbose: Enables ``self.vprint`` and is forwarded to built objects.
        """
        # print function that works only when verbose is true
        self.vprint = print if verbose else no_op
        self.max_num_samples = max_num_samples

        if isinstance(embedding, Embedding):
            self.embedding = embedding
        else:
            self.embedding = Embedding(embedding_path=embedding,
                                       verbose=verbose)

        if isinstance(dataset, EmbeddedDataset):
            self.dataset = dataset
        else:
            self.dataset = EmbeddedDataset(self.embedding,
                                           columns=columns,
                                           dataset_path=dataset,
                                           verbose=verbose)

        if isinstance(tree, EmbeddedClassTree):
            self.tree = tree
        else:
            self.tree = EmbeddedClassTree(self.embedding,
                                          tree_path=tree,
                                          verbose=verbose)

        self.row_agg_func = row_agg_func
        self.source_agg_func = source_agg_func
        self.tree_agg_func = tree_agg_func

        self.similarity_matrices = {}
示例#25
0
 def _create_embeddings(self):
     """Create one ``Embedding`` layer per entry of ``self.table_sizes``.

     The layers are collected, in order, in ``self.embedding_layers``.
     """
     self.embedding_layers = []
     for table_size in self.table_sizes:
         layer = Embedding(input_dim=table_size,
                           output_dim=self.local_embedding_dim,
                           trainable=self.embedding_trainable)
         self.embedding_layers.append(layer)
示例#26
0
 def __init__(self,
              vocab_sizes,
              embedding_dims,
              merge_methods,
              padding_indices,
              fix_embedding,
              out_method='none',
              out_dim=None):
     """Create one embedding per feature plus an optional output projection.

     Args:
         vocab_sizes: Vocabulary size for each feature.
         embedding_dims: Embedding dimension for each feature.
         merge_methods: Per-feature merge method; only features marked
             ``'cat'`` count towards the merged embedding size.
         padding_indices: Padding index for each feature.
         fix_embedding: Forwarded to every ``Embedding``.
         out_method: ``'none'`` (no projection), ``'linear'`` (``nn.Linear``)
             or anything else (an ``MLP``).
         out_dim: Output size used when ``out_method`` is not ``'none'``.
     """
     super(MultiFeatureEmbedding, self).__init__()
     self._vocab_sizes = vocab_sizes
     self._embedding_dims = embedding_dims
     self._n_feature = len(vocab_sizes)
     self._merge_methods = merge_methods
     self._padding_indices = padding_indices
     self._fix_embedding = fix_embedding

     feature_specs = zip(vocab_sizes, embedding_dims, padding_indices)
     self.emb_list = nn.ModuleList(
         Embedding(vocab_size, embedding_dim, padding_idx, fix_embedding)
         for vocab_size, embedding_dim, padding_idx in feature_specs)

     self._out_method = out_method
     # Total width of the features merged by concatenation.
     self._emb_out_dim = sum(dim for index, dim in enumerate(embedding_dims)
                             if merge_methods[index] == 'cat')

     if out_method == 'none':
         self._out_dim = self._emb_out_dim
     else:
         self._out_dim = out_dim
         if out_method == 'linear':
             self.out_module = nn.Linear(self._emb_out_dim, self._out_dim)
         else:
             hidden_dim = int(self._emb_out_dim / 2)
             self.out_module = MLP(self._emb_out_dim,
                                   [hidden_dim, self._out_dim],
                                   ['prelu', 'prelu'])
示例#27
0
    def __init__(self, config, vocab):
        """Create the embedding, six groups of 1-D convolutions, dropouts and projections.

        Args:
            config: Provides ``q_seq_len``, ``c_seq_len``, ``kernel_sizes``,
                ``output_channels`` and ``dropout``.
            vocab: Forwarded, together with ``config``, to ``Embedding``.
        """
        super(Net, self).__init__()
        self.embed = Embedding(config, vocab)

        def gen_convs(in_channel, kernel_sizes, output_channels):
            # One Conv1d per (kernel size, output channels) pair, each padded
            # with (kernel_size - 1) // 2.
            return nn.ModuleList([
                nn.Conv1d(in_channels=in_channel,
                          out_channels=oc,
                          kernel_size=kz,
                          padding=((kz - 1) // 2))
                for kz, oc in zip(kernel_sizes, output_channels)
            ])

        # Sum of all output channel counts; used as the input channel count of
        # the PQ/PC convolutions and as the size of the projections.
        full_size = sum(config.output_channels)

        # QA/QR take q_seq_len input channels, CA/CR take c_seq_len.
        self.convs_QA = gen_convs(config.q_seq_len, config.kernel_sizes,
                                  config.output_channels)
        self.convs_QR = gen_convs(config.q_seq_len, config.kernel_sizes,
                                  config.output_channels)
        self.convs_CA = gen_convs(config.c_seq_len, config.kernel_sizes,
                                  config.output_channels)
        self.convs_CR = gen_convs(config.c_seq_len, config.kernel_sizes,
                                  config.output_channels)
        self.convs_PQ = gen_convs(full_size, config.kernel_sizes,
                                  config.output_channels)
        self.convs_PC = gen_convs(full_size, config.kernel_sizes,
                                  config.output_channels)
        # One dropout module per convolution group, all with the same rate.
        self.drop_QA = nn.Dropout(config.dropout)
        self.drop_QR = nn.Dropout(config.dropout)
        self.drop_CA = nn.Dropout(config.dropout)
        self.drop_CR = nn.Dropout(config.dropout)
        self.drop_PQ = nn.Dropout(config.dropout)
        self.drop_PC = nn.Dropout(config.dropout)
        # Two linear layers: full_size -> full_size, then full_size -> 1.
        self.proj1 = nn.Linear(full_size, full_size)
        self.proj2 = nn.Linear(full_size, 1)
示例#28
0
    def __init__(self, data_set: DataSet, ngram_size: int, emb_size: int,
                 hid_size: int):
        """Set up the language recognition module.

        Args:
            data_set: dataset supplying the input symbols and the output
                classes (languages)
            ngram_size: n-gram feature size (1 means unigrams, 2 bigrams,
                and so on)
            emb_size: dimensionality of the embedding vectors
            hid_size: hidden layer size of the scoring FFN
        """
        self.ngram_size = ngram_size

        # Embedding sub-module over the feature alphabet of the dataset.
        features = self.alphabet(data_set)
        self.register("emb", Embedding(features, emb_size))

        # Languages seen in the dataset and their integer encoding.
        languages = {lang for _, lang in data_set}
        self.enc = Encoding(languages)

        # Scoring FFN with one output per language.
        scorer = FFN(idim=emb_size, hdim=hid_size, odim=len(languages))
        self.register("ffn", scorer)

        # Sanity check: every registered parameter must require gradients.
        # This is what exposes the "bug" in the embedding module.
        assert all(param.requires_grad is True for param in self.params())
示例#29
0
文件: lib.py 项目: moomou/mlab
def convert_to_word_embedding(rank, glove_h5, prefix, *inputs):
    """Embed the content column of CSV files and store the results in HDF5 files.

    Args:
        rank: Identifier used only as a prefix in progress messages.
        glove_h5: Passed to the ``Embedding`` constructor.
        prefix: Passed to ``_plang_h5`` and used to build the per-language
            file key ``prefix + <language name> + '.h5'``.
        *inputs: Paths of the CSV files to process.
    """
    files = _plang_h5(prefix, driver='mpio', comm=MPI.COMM_WORLD)
    embedding = Embedding(glove_h5)

    # Cap on the number of datasets created per output file in this call.
    MAX_COUNTER = 900000
    files_counter = {key: 0 for key in files.keys()}

    print('[%d] Handling:: %s' % (rank, inputs), flush=True)

    with GracefulInterruptHandler() as h:
        for counter, f in enumerate(inputs):
            with open(f) as csvfile:
                reader = csv.DictReader(csvfile)

                for row in reader:
                    # Stop reading rows as soon as an interrupt was flagged.
                    if h.interrupted: break

                    plang_enum = get_type_by_name(row[PATH_COL])

                    key = prefix + plang_enum.name + '.h5'
                    h5 = files[key]

                    # Skip rows for files that already exceeded the cap.
                    if files_counter[key] > MAX_COUNTER:
                        continue
                    # Progress message whenever the counter is a multiple of 1000.
                    if files_counter[key] % 1000 == 0:
                        print('[%d] %s:: %d' % (rank, key, files_counter[key]),
                              flush=True)

                    # Rows whose ID already exists in the file are left untouched.
                    idx = '/%s' % row[ID_COL]
                    if idx not in h5:
                        files_counter[key] += 1

                        content = row[CONTENT_COL]

                        content_embedding = embedding.words_to_embeddings(
                            content)

                        h5.create_dataset(idx,
                                          data=content_embedding,
                                          dtype=content_embedding.dtype)

            print('[%d] finished %s' % (rank, counter + 1), flush=True)
            # Also leave the outer loop over input files when interrupted.
            if h.interrupted: break

    print('[%d] exiting' % rank, flush=True)
    archiver.close_multi(files)
def get_image_feature(img_path, img_list_path, model_path, epoch_num, gpu_id):
    """Compute an embedding and a faceness score for every image in a list file.

    Each line of the list file is split on single spaces: the first field is
    the image name (joined onto ``img_path``), the last field is the faceness
    score, and the fields in between are landmark coordinates that are
    reshaped to ``(5, 2)``.

    Args:
        img_path: Directory the image names are joined onto.
        img_list_path: Path of the text file listing the images.
        model_path: Forwarded to ``Embedding``.
        epoch_num: Forwarded to ``Embedding``.
        gpu_id: Forwarded to ``Embedding``.

    Returns:
        ``(img_feats, faceness_scores)``, both converted to float32 numpy arrays.
    """
    # Read the list inside a ``with`` block so the handle is closed; it was
    # previously left open for the lifetime of the process.
    with open(img_list_path) as img_list:
        files = img_list.readlines()
    embedding = Embedding(model_path, epoch_num, gpu_id)
    img_feats = []
    faceness_scores = []
    # The loop index was never used, so iterate over the lines directly.
    for each_line in print_progress(files):
        name_lmk_score = each_line.strip().split(' ')
        img_name = os.path.join(img_path, name_lmk_score[0])
        img = cv2.imread(img_name)
        lmk = np.array([float(x) for x in name_lmk_score[1:-1]], dtype=np.float32)
        lmk = lmk.reshape((5, 2))
        img_feats.append(embedding.get(img, lmk))
        faceness_scores.append(name_lmk_score[-1])
    img_feats = np.array(img_feats).astype(np.float32)
    faceness_scores = np.array(faceness_scores).astype(np.float32)
    return img_feats, faceness_scores
def create_representation(rep_type, path, *args, **kwargs):
    """Build the representation named by ``rep_type`` from ``path``.

    Extra positional and keyword arguments are forwarded to the loader or
    constructor. ``'Explicit'`` and ``'PPMI'`` use ``Explicit.load``, ``'SVD'``
    and ``'GIGA'`` use their own classes, and any other truthy type uses
    ``Embedding.load``. A falsy ``rep_type`` yields ``None``.
    """
    if rep_type == 'Explicit' or rep_type == 'PPMI':
        return Explicit.load(path, *args, **kwargs)
    if rep_type == 'SVD':
        return SVDEmbedding(path, *args, **kwargs)
    if rep_type == 'GIGA':
        return GigaEmbedding(path, *args, **kwargs)
    if not rep_type:
        return None
    return Embedding.load(path, *args, **kwargs)
def create_representation(args):
    """Build the representation selected by a command line argument mapping.

    Args:
        args: Mapping with the keys ``'<representation>'``,
            ``'<representation_path>'``, ``'--w+c'`` and ``'--eig'``.

    Returns:
        The representation object for the requested type; any type other than
        ``'PPMI'`` and ``'SVD'`` is loaded through ``Embedding.load``.

    Raises:
        Exception: If ``--w+c`` is set together with the ``'PPMI'`` type.
    """
    rep_type = args['<representation>']
    path = args['<representation_path>']
    w_c = args['--w+c']
    eig = float(args['--eig'])

    if rep_type == 'PPMI':
        if w_c:
            raise Exception('w+c is not implemented for PPMI.')
        return Explicit.load(path, True)

    if rep_type == 'SVD':
        if w_c:
            parts = (SVDEmbedding(path, False, eig, False),
                     SVDEmbedding(path, False, eig, True))
            return EnsembleEmbedding(parts[0], parts[1], True)
        return SVDEmbedding(path, True, eig)

    if w_c:
        parts = (Embedding.load(path + '.words', False),
                 Embedding.load(path + '.contexts', False))
        return EnsembleEmbedding(parts[0], parts[1], True)
    return Embedding.load(path, True)
def simple_create_representation(rep_type, path, restricted_context=None, thresh=None, normalize=True):
    """Load an ``Explicit`` representation for ``'PPMI'``, otherwise an ``Embedding``.

    ``restricted_context``, ``thresh`` and ``normalize`` are only forwarded on
    the ``'PPMI'`` path; the ``Embedding`` path always loads with ``True``.
    """
    if rep_type != 'PPMI':
        return Embedding.load(path, True)
    return Explicit.load(path,
                         normalize=normalize,
                         restricted_context=restricted_context,
                         thresh=thresh)
示例#34
0
import sys
import numpy as np
import datetime
sys.path.append('../SSH')
sys.path.append('../alignment')
from ssh_detector import SSHDetector
from alignment import Alignment
from embedding import Embedding

#short_max = 800
# scales[0] is used below as the target size, scales[1] as the maximum size.
scales = [1200, 1600]
t = 2

# Load the three models once at start-up.
detector = SSHDetector('../SSH/model/e2ef', 0)
alignment = Alignment('../alignment/model/3d_I5', 12)
embedding = Embedding('./model/model', 0)
out_filename = './out.png'

# Default input image; overridden by the first command line argument.
f = '../sample-images/t1.jpg'
if len(sys.argv)>1:
  f = sys.argv[1]
# NOTE(review): cv2 is used here but is not among the imports visible above --
# confirm it is imported elsewhere in the file.
img = cv2.imread(f)
im_shape = img.shape
print(im_shape)
target_size = scales[0]
max_size = scales[1]
# Shorter and longer side of the image (first two shape entries).
im_size_min = np.min(im_shape[0:2])
im_size_max = np.max(im_shape[0:2])
if im_size_min>target_size or im_size_max>max_size:
  im_scale = float(target_size) / float(im_size_min)
  # prevent bigger axis from being more than max_size: