Example #1
    def run(self):

        fpath = self.args.text
        fpath_pickled = self.args.text + ".pickled"

        max_len, nchars, nwords, words, charmap = self.get_char_to_int(fpath)

        disk_array = DiskArray(self.args.out_f,
                               shape=(0, ),
                               dtype=[('vec', np.float32, 128)])
        '''
        if not os.path.exists(fpath_pickled):
            data, labels = self.load_data(max_len, nchars, nwords, words, charmap)
            pickle.dump((data, labels), open(fpath_pickled, 'wb'))
        else:
            data, labels = pickle.load(open(fpath_pickled, 'rb'))
        '''

        if not os.path.exists(self.args.model_name):
            model = self.create_model(128, max_len, nchars)

            #history = model.fit(data, labels, epochs=self.args.epochs, batch_size=128)
            generator = self.generator(max_len, nchars, nwords, words, charmap,
                                       2048)
            model.fit_generator(generator,
                                steps_per_epoch=max(1, nwords // 2048),
                                epochs=self.args.epochs)
        else:
            model = load_model(self.args.model_name)

        model.save(self.args.model_name)

        if self.args.layer == 'lstm_layer':
            self.log.info('Accessing the layer weights')
            new_model = Sequential()
            new_model.add(LSTM(128, input_shape=(max_len, nchars),
                               unroll=True))
            weights = model.layers[0].get_weights()
            new_model.set_weights(weights)
            model_p = new_model
        else:
            model_p = model

        self.log.info('started predicting')
        for word in words:
            test_data, test_labels = self.load_data(max_len, nchars, 1, [word],
                                                    charmap)
            p_out = model_p.predict(test_data)
            disk_array.append((p_out[0], ))

        disk_array.flush()
Example #2
    def run(self):

        fpath = self.args.text

        max_len, nchars, nwords, words, charmap = self.get_char_to_int(fpath)

        disk_array = DiskArray(self.args.out_f, shape=(0,), dtype=[('vec', np.float32, 108)])

        model = load_model(self.args.model_name)

        self.log.info('started predicting')
        for word in words:
            test_data, test_labels = self.load_data(max_len, nchars, 1, [word], charmap)
            p_out = model.predict(test_data)
            disk_array.append((p_out[0],))

        disk_array.flush()
Example #3
    def run(self):

        fpath = self.args.text

        max_len, nchars, nwords, charmap = self.get_char_to_int(fpath)

        disk_array = DiskArray(self.args.out_f,
                               shape=(0, ),
                               dtype=[('vec', np.float32, 128)])
        if not os.path.exists(self.args.training_data):
            data, labels = self.load_data(max_len, nchars, nwords, charmap)
        else:
            data = DiskArray(self.args.training_data, dtype=np.float32)
            labels = DiskArray(self.args.labels_data, dtype=np.float32)

        if not os.path.exists(self.args.model_name):
            model = self.create_model(128, max_len, nchars)
            self.log.info('Started training the model')
            history = model.fit(data[:],
                                labels[:],
                                epochs=self.args.epochs,
                                batch_size=128)
            plt.plot(history.history['loss'])
            plt.savefig(self.args.image_name)
        else:
            model = load_model(self.args.model_name)

        model.save(self.args.model_name)

        self.log.info('Accessing the layer weights')
        new_model = Sequential()
        new_model.add(LSTM(128, input_shape=(max_len, nchars), unroll=True))
        weights = model.layers[0].get_weights()
        new_model.set_weights(weights)

        self.log.info('started predicting')
        for word in open(fpath):
            word = word.strip()
            test_data, test_labels = self.get_test_data(
                max_len, nchars, 1, [word], charmap)
            p_out = new_model.predict(test_data)
            disk_array.append((p_out[0], ))

        disk_array.flush()
Example #4
class GW2VectoWordVecSpaceFile(object):
    """
    Abstraction that helps in converting word vector space data
    (vectors and vocabulary) from Google Word2Vec format to
    WordVecSpaceFile format. (A short usage sketch follows this class.)
    """
    def __init__(self,
                 in_dir,
                 outdir,
                 nvecs_per_shard=0,
                 shard_name="shard",
                 full_name="full"):

        self.in_dir = in_dir
        self.outdir = outdir
        self.nvecs_per_shard = nvecs_per_shard
        self.shard_name = shard_name

        self.do_sharding = bool(self.nvecs_per_shard)
        if self.do_sharding:
            self.full_fpath = self.J(self.outdir, full_name)
            os.makedirs(self.full_fpath)

            map_itow = self.J(self.full_fpath, "indextoword")
            self.itow = DiskDict(map_itow)

            map_wtoi = self.J(self.full_fpath, "wordtoindex")
            self.wtoi = DiskDict(map_wtoi)

            self.mags = DiskArray(
                self.J(self.full_fpath, "magnitudes"),
                shape=(0, ),
                dtype=np.float32,
                growby=1000000,
            )
            self.occurs = DiskArray(
                self.J(self.full_fpath, "occurrences"),
                shape=(0, ),
                dtype=np.uint64,
                growby=1000000,
            )

    def J(self, p1, p2):
        return os.path.join(p1, p2)

    def _iter_vecs(self, vfile, vocabfile):
        for word, vec in vfile.iter_vectors():
            vec = np.frombuffer(vec, dtype="float32")  # np.fromstring is deprecated
            mag = np.linalg.norm(vec)
            vec = vec / mag
            _line = vocabfile.readline().split(" ")

            word, occur = _line[0], int(_line[1])
            yield vec, word, mag, occur

    def _build_writer(self, vidx, dim):
        if self.do_sharding:
            shard_num = vidx // self.nvecs_per_shard
            shard_name = "{}{}".format(self.shard_name, shard_num)
            fpath = self.J(self.outdir, shard_name)
            return GWVecBinWriter(fpath, dim, sharding=True)
        else:
            return GWVecBinWriter(self.outdir, dim)

    def _create_manifest(
        self,
        out_fpath,
        nvecs,
        dim,
        N,
        t_occur,
        in_fpath,
        m_info=None,
        full=False,
        num_vecs=None,
        nvps=None,
    ):
        m_info = m_info or {}  # avoid a mutable default argument
        if full:
            mfc = dict(
                num_shards=N,
                num_vectors=nvecs,
                dimension=dim,
                num_words=t_occur,
                dt_creation=datetime.utcnow().isoformat(),
                input_path=in_fpath,
                manifest_info=m_info,
                num_vecs_per_shard=self.nvecs_per_shard,
            )
        else:
            mfc = dict(
                num_shards=N,
                num_vecs_in_shard=nvecs,
                num_vecs=num_vecs,
                num_vecs_per_shard=nvps,
                dimension=dim,
                num_words=t_occur,
                dt_creation=datetime.utcnow().isoformat(),
                input_path=in_fpath,
                manifest_info=m_info,
            )

        with open(self.J(out_fpath, "manifest.json"), "w") as fp:
            json.dump(mfc, fp)

    def _find_manifest_info(self, fpath):
        m_file = self.J(fpath, "manifest.json")
        c = {}
        if os.path.isfile(m_file):
            with open(m_file, "r") as fp:
                c = json.load(fp)
        return c

    def start(self):
        inp_vec_f = open(self.J(self.in_dir, "vectors.bin"), "rb")
        inp_vecs = GWVecBinReader(inp_vec_f)
        dim = inp_vecs.dim
        nvecs = inp_vecs.nvecs

        vocab_file = open(self.J(self.in_dir, "vocab.txt"),
                          "r",
                          encoding="utf-8",
                          errors="ignore")
        m_info = self._find_manifest_info(self.in_dir)

        w = None
        vecs = self._iter_vecs(inp_vecs, vocab_file)
        N = self.nvecs_per_shard
        if N:
            num_shards = math.ceil(nvecs / N)
        else:
            num_shards = 1

        t_occur = 0
        count = -1
        for index, (vec, word, mag, occur) in enumerate(vecs):
            if self.do_sharding and index % N == 0:
                if w:
                    count += 1
                    t_occur += s_occur
                    self._create_manifest(
                        w.outdir,
                        (index - (count * N)),
                        dim,
                        num_shards,
                        s_occur,
                        self.in_dir,
                        m_info,
                        num_vecs=nvecs,
                        nvps=N,
                    )
                    w.close()
                    w = None

            if not w:
                s_occur = 0
                w = self._build_writer(index, dim)

            if self.do_sharding:
                self.wtoi[word] = index
                self.itow[index] = word

                self.mags.append(mag)
                self.occurs.append(occur)

                w.write(vec=vec, mag=mag, occur=occur)

            else:
                w.write(vec=vec, mag=mag, word=word, index=index, occur=occur)

            s_occur += occur

        if w:
            w.close()
            count += 1
            t_occur += s_occur
            self._create_manifest(
                w.outdir,
                (index - (count * N) + 1),  # +1: index is zero-based
                dim,
                num_shards,
                s_occur,
                self.in_dir,
                m_info,
                num_vecs=nvecs,
                nvps=N,
            )

        if self.do_sharding:
            self.wtoi.close()
            self.itow.close()

            self.mags.flush()
            self.mags.close()

            self.occurs.flush()
            self.occurs.close()

            self._create_manifest(
                self.full_fpath,
                nvecs,
                dim,
                num_shards,
                t_occur,
                self.in_dir,
                m_info,
                full=True,
            )
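
A minimal usage sketch for GW2VectoWordVecSpaceFile, added for illustration: it assumes the surrounding project provides GWVecBinReader/GWVecBinWriter, DiskDict, and DiskArray, that in_dir holds the vectors.bin and vocab.txt files start() reads, and the paths shown are hypothetical.

# Hypothetical usage of the converter class above.
# in_dir must contain vectors.bin and vocab.txt (read by start()).
converter = GW2VectoWordVecSpaceFile(
    in_dir="/data/w2v",
    outdir="/data/wvspace",
    nvecs_per_shard=1000000,  # 0 disables sharding
)
converter.start()  # writes shard files plus the "full" manifest and index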
Example #5
class CorrectionalTraining(BaseScript):
    VEC_DIM = 300
    LABELS = [0, 1]

    def __init__(self):
        super(CorrectionalTraining, self).__init__()
        self.train_f = DiskArray(self.args.train_f,
                                 shape=(self.get_shape(), ),
                                 dtype=self.get_dtype())
        self.wv = WordVecSpaceMem(self.args.wvspace_f)

    def get_user_token(self):
        token = input("Enter the search token: ")

        return token

    def get_shape(self):
        if not os.path.exists(self.args.train_f):
            return 0

        dtype = self.get_dtype()
        shape = os.stat(self.args.train_f).st_size // np.dtype(dtype).itemsize
        return shape

    def get_nearest_token(self, token):
        url = 'http://dev0.servers.deepcompute.com:8888/api/v1/get_k_nearest_cosine?word={}&k=10'.format(
            token)
        #url = 'http://dev0.servers.deepcompute.com:8888/api/v1/get_nn_model_k_nearest?word={}&k=10'.format(token)
        response = requests.get(url)
        response = response.json()
        result = response.get('result')

        return result

    def get_user_label(self, token, nearest_token):
        print('The nearest token is %s' % nearest_token)
        label = input("Mark the distance between {} and {}: ".format(
            token, nearest_token))

        return int(label)

    def get_token_vector(self, token, nearest_token):
        token_vec = self.wv.get_word_vector(token)
        nearest_tok_vec = self.wv.get_word_vector(nearest_token)

        return token_vec, nearest_tok_vec

    def append_label_to_diskarray(self, vec1, vec2, word1, word2, label):
        self.train_f.append((vec1, vec2, word1, word2, label))

    def get_dtype(self):
        return [
            ('vec1', np.float32, self.VEC_DIM),
            ('vec2', np.float32, self.VEC_DIM),
            # fixed-width byte strings; the original ('word1', 'S', VEC_DIM)
            # form yields zero-length strings
            ('word1', 'S{}'.format(self.VEC_DIM)),
            ('word2', 'S{}'.format(self.VEC_DIM)),
            ('label', int),  # np.int is removed in recent NumPy
        ]

    def run(self):
        try:
            while True:
                token = self.get_user_token()
                nearest_tokens = self.get_nearest_token(token)
                for nearest_token in nearest_tokens:
                    label = int(self.get_user_label(token, nearest_token))
                    if label not in self.LABELS:
                        continue
                    vec1, vec2 = self.get_token_vector(token, nearest_token)
                    self.append_label_to_diskarray(vec1, vec2, token,
                                                   nearest_token, label)
        finally:
            self.train_f.flush()

    def define_args(self, parser):
        parser.add_argument('train_f', help='diskarray train file')
        parser.add_argument('wvspace_f', help='wvspace file')
Example #6
class TrainData(BaseScript):

    VEC_DIM = 300

    def __init__(self):
        super(TrainData, self).__init__()
        self.wvspace = WordVecSpaceMem(self.args.wvspace)
        self.train_f = DiskArray(self.args.train_file,
                                 shape=(self.get_shape(), ),
                                 dtype=self.get_dtype())
        self.words_f = open(self.args.words_file, 'w')
        self.model = load_model(self.args.model)  # needed by add_pair()

    def get_shape(self):
        if not os.path.exists(self.args.train_file):
            return 0

        dtype = self.get_dtype()
        shape = os.stat(
            self.args.train_file).st_size // np.dtype(dtype).itemsize
        return shape

    def get_dtype(self):
        return [
            ('vec1', np.float32, self.VEC_DIM),
            ('vec2', np.float32, self.VEC_DIM),
            ('label', int),  # np.int is removed in recent NumPy
        ]

    def get_random_point(self):
        return random.randint(0, len(self.wvspace) - 1)  # randint is inclusive

    def near_pair(self):
        index = self.get_random_point()
        word1 = self.wvspace.get_word_at_index(index)
        nearest = self.wvspace.get_nearest(word1, 10)
        n_words = self.wvspace.get_word_at_indices(nearest)
        word2 = n_words[1]
        self.add_pair(word1, word2)

    def add_pair(self, word1, word2):
        vec1 = self.wvspace.get_word_vector(word1)
        vec2 = self.wvspace.get_word_vector(word2)
        # Assumes a two-input model; batch each vector before predicting.
        p_value = self.model.predict([vec1.reshape(1, -1), vec2.reshape(1, -1)])
        p_value = 0 if float(p_value) < 3 else 1
        self.train_f.append((vec1, vec2, p_value))
        self.words_f.write(word1 + '<====>' + word2 + '<======>' + str(p_value) + '\n')

    def far_pair(self):
        index1 = self.get_random_point()
        word1 = self.wvspace.get_word_at_index(index1)
        index2 = self.get_random_point()
        word2 = self.wvspace.get_word_at_index(index2)
        self.add_pair(word1, word2)

    def run(self):
        for i in range(self.args.n_samples):
            self.near_pair()

    def define_args(self, parser):
        parser.add_argument('train_file', metavar='training-file')
        parser.add_argument('wvspace', metavar='vector-space')
        parser.add_argument('words_file', metavar='words-file')
        parser.add_argument('model', metavar='model-file')
        parser.add_argument('n_samples', type=int, metavar='num-of-pairs')