Example #1
def dumpInput(outputPath,data_spec):
    x = sorted(glob.glob(data_spec))
    tmp = bp.unpack_ndarray_file(x[0])
    dims = len(tmp[0])
    count = len(tmp)    
    for i in range (1,len(x)):
        tmp = bp.unpack_ndarray_file(x[i])
        if len(tmp[0]) != dims:    
            raise ValueError("Dimension "+str(len(tmp[0]))+" not same as first file "+str(dims))
        count =  count + len(tmp)
    with open(outputPath+'/originalData.txt', 'w') as fout:
      c = 0
      fout.write(str(count)+" "+str(dims)+"\n")
      for inputFile in x:
        tmp = bp.unpack_ndarray_file(inputFile)
        for xi in range (0,tmp.shape[0]):
          x_vector = tmp[xi]
          xstr = numpy.char.mod('%f', x_vector)
          output = " ".join(xstr)+"\n"
          fout.write(output)
        c = c + len(tmp)
      if c != count:
        raise ValueError("Error output count ("+str(c)+") is not equal to calculated output count ("+str(count)+")")
    
    return (count,dims)
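
A minimal usage sketch for dumpInput; the glob pattern, the output directory, and the imports below are assumptions, not part of the original example:

import glob
import numpy
import bloscpack as bp

# Hypothetical shard layout: every .blp file holds a 2-D array with the same number of columns.
count, dims = dumpInput('/tmp/output', '/tmp/data/partition*.blp')
print("Dumped %d rows of dimension %d to /tmp/output/originalData.txt" % (count, dims))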
Example #2
def _get_neural_features_dataset(reshape_style, classes):
    print("Loading neural features data...")
    # Paths to pickled data
    root = '/Users/miljan/PycharmProjects/entity-dependent-sentiment-mining/data/blp/glove/300d/' + classes + '/'
    data_path_train_x = root + 'train_x.blp'
    data_path_train_y = root + 'train_y.blp'
    data_path_test_x = root + 'test_x.blp'
    data_path_test_y = root + 'test_y.blp'

    # Load train data
    print("\nUnpickling training data...")
    np_train_x = bp.unpack_ndarray_file(data_path_train_x)
    # np_train_x = np_train_x[:np.floor(np_train_x.shape[0]/2.0), :, :]
    np_train_y = bp.unpack_ndarray_file(data_path_train_y)
    # np_train_y = np_train_y[:np.floor(np_train_y.shape[0]/2.0)]
    gc.collect()

    # Load test data
    print("\nUnpickling testing data...")
    np_test_x = bp.unpack_ndarray_file(data_path_test_x)
    np_test_y = bp.unpack_ndarray_file(data_path_test_y)
    gc.collect()

    # reshape data matrices into single row per sample matrices (3D > 2D reshaping)
    np_train_x, np_test_x = _reshape_vectors(reshape_style, np_train_x, np_test_x)

    print("\nDone")
    return np_train_x, np_train_y, np_test_x, np_test_y
Example #3
def _file2nnet(layers, set_layer_num = -1, path="dnn.tmp",  factor=1.0):
    n_layers = len(layers)
    nnet_dict = {}
    if set_layer_num == -1:
        set_layer_num = n_layers
        log("file2nnet set_layer_num is -1 so set it to "+str(set_layer_num))
    
    with open(path + '/metadata.tmp', 'rb') as fp:
        nnet_dict = pickle.load(fp)
    for i in range(set_layer_num):
        dict_a = 'W' + str(i)
        layer = layers[i]
        if layer.type == 'fc':
            mat_shape = layer.W.get_value().shape
            f = path + "/" + os.path.split(nnet_dict[dict_a])[1]
            layer.W.set_value(factor * np.asarray(bp.unpack_ndarray_file(f), dtype=theano.config.floatX).reshape(mat_shape))
        elif layer.type == 'conv':
            filter_shape = layer.filter_shape
            W_array = layer.W.get_value()
            for next_X in range(filter_shape[0]):
                for this_X in range(filter_shape[1]):
                    new_dict_a = dict_a + ' ' + str(next_X) + ' ' + str(this_X)
                    mat_shape = W_array[next_X, this_X, :, :].shape
                    f = path + "/" + os.path.split(nnet_dict[new_dict_a])[1]
                    W_array[next_X, this_X, :, :] = factor * np.asarray(bp.unpack_ndarray_file(f), dtype=theano.config.floatX).reshape(mat_shape)
            layer.W.set_value(W_array)
        dict_a = 'b' + str(i)
        f = path + "/" + os.path.split(nnet_dict[dict_a])[1]
        layer.b.set_value(np.asarray(bp.unpack_ndarray_file(f), dtype=theano.config.floatX))
Example #4
def _load_train_data(divisor, offset):
    '''
    Load subset of data
    :param divisor: number of equal chunks the data is split into
    :param offset: zero-based index of the chunk to load
    :return: (x_train, y_train) for the selected chunk
    '''

    start = offset / float(divisor)
    end = (1 + offset) / float(divisor)

    x_train = bp.unpack_ndarray_file(data_path_train_x)
    # np.floor returns a float, which cannot be used as an index; cast to int for slicing
    data_temp = np.empty_like(
        x_train[int(np.floor(x_train.shape[0] * start)):int(np.floor(x_train.shape[0] * end)), :, :])
    np.copyto(
        data_temp,
        x_train[int(np.floor(x_train.shape[0] * start)):int(np.floor(x_train.shape[0] * end)), :, :])
    x_train = -1
    gc.collect()
    x_train = data_temp
    y_train = bp.unpack_ndarray_file(data_path_train_y)
    y_train = deepcopy(y_train[int(np.floor(y_train.shape[0] * start)):int(np.floor(y_train.shape[0] * end))])

    return x_train, y_train
Example #5
def load_data(fp, polydata_instead_of_face_vertex_list=True, download_s3=True):
    from .vis3d_utilities import load_mesh_stl
    from .distributed_utilities import download_from_s3

    if ENABLE_DOWNLOAD_S3 and download_s3:
        download_from_s3(fp)

    if fp.endswith('.bp'):
        try:
            data = bp.unpack_ndarray_file(fp)
        except:
            fp = fp.replace('.bp','.npy')
            data = np.load(fp)
    elif fp.endswith('.npy'):
        data = np.load(fp)
    elif fp.endswith('.json'):
        data = load_json(fp)
    elif fp.endswith('.pkl'):
        data = load_pickle(fp)
    elif fp.endswith('.stl'):
        data = load_mesh_stl(fp, return_polydata_only=polydata_instead_of_face_vertex_list)
    elif fp.endswith('.txt'):
        data = np.loadtxt(fp)
    elif fp.endswith('.png') or fp.endswith('.tif'):
        data = imread(fp)
    elif fp.endswith('.ini'):
        data = load_ini(fp)
    elif fp.endswith('.csv'):
        data = csv_to_dict(fp)
    else:
        raise Exception('Unrecognized file type: %s' % fp)

    return data
Example #6
 def get_total_size(self):
     if self.total_size != None:
         return self.total_size
     self.total_size = 0
     for f in self.pfile_path_list:
         l = bp.unpack_ndarray_file(f+".labels")
         self.total_size += len(l)
     return self.total_size 
Example #7
def load_Xy():
    X_np = np.hstack([bp.unpack_ndarray_file(fn) for fn in FEATURE_LIST])
    X_ss = None
    for fn in SPARSE_LIST:
        print("Loading {}".format(fn))
        if X_ss is None:
            X_ss = loadmat(fn)
            X_ss = ss.csr_matrix(X_ss)
        else:
            X_ss = ss.hstack([X_ss, loadmat(fn)])
            X_ss = ss.csr_matrix(X_ss)
    X_np = ss.csr_matrix(X_np)
    print("Concatinate X_ss and X_np")
    X = ss.hstack([X_np, X_ss])
    print("Convert X into CSC matrix")
    X = ss.csc_matrix(X)
    print("done")
    del X_np, X_ss
    y_train = bp.unpack_ndarray_file("feat.y.blp")
    return X, y_train
Example #8
def make_dataset_from_name(size, data_name):

    base = 2**(20)
    to_load = {base: 'small',
               base*10: 'mid',
               base*100: 'large',
               base*1000: 'xlarge',
               }

    return bp.unpack_ndarray_file(os.path.join(DATASET_ROOT,
                                  '%s_%s.blp' % (data_name, to_load[size])))
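
A hedged usage sketch for make_dataset_from_name; DATASET_ROOT, the dataset name, and the file on disk are assumptions:

import os
import bloscpack as bp

DATASET_ROOT = '/tmp/datasets'  # hypothetical location of the pre-packed .blp files

# Size keys map to suffixes: 2**20 -> 'small', 2**20*10 -> 'mid', 2**20*100 -> 'large', 2**20*1000 -> 'xlarge',
# so this call would read <DATASET_ROOT>/uniform_mid.blp (assuming such a file exists).
arr = make_dataset_from_name(2 ** 20 * 10, 'uniform')
print(arr.shape)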
Example #9
def load_Xy():
    X_np = np.hstack([bp.unpack_ndarray_file(fn) for fn in FEATURE_LIST])
    X_ss = None
    for fn in SPARSE_LIST:
        print("Loading {}".format(fn))
        if X_ss is None:
            X_ss = loadmat(fn)
            X_ss = ss.csr_matrix(X_ss)
        else:
            X_ss = ss.hstack([X_ss, loadmat(fn)])
            X_ss = ss.csr_matrix(X_ss)
    X_np = ss.csr_matrix(X_np)
    print("Concatinate X_ss and X_np")
    X = ss.hstack([X_np, X_ss])
    print("Convert X into CSC matrix")
    X = ss.csc_matrix(X)
    print("done")
    del X_np, X_ss
    y_train = bp.unpack_ndarray_file("feat.y.blp")
    return X, y_train
def _load_train_data(divisor, offset):
    '''
    Load subset of data
    :param divisor: number of equal chunks the data is split into
    :param offset: zero-based index of the chunk to load
    :return: (x_train, y_train) for the selected chunk
    '''

    start = offset / float(divisor)
    end = (1 + offset) / float(divisor)

    x_train = bp.unpack_ndarray_file(data_path_train_x)
    # np.floor returns a float, which cannot be used as an index; cast to int for slicing
    data_temp = np.empty_like(x_train[int(np.floor(x_train.shape[0]*start)):int(np.floor(x_train.shape[0]*end)), :, :])
    np.copyto(data_temp, x_train[int(np.floor(x_train.shape[0]*start)):int(np.floor(x_train.shape[0]*end)), :, :])
    x_train = -1
    gc.collect()
    x_train = data_temp
    y_train = bp.unpack_ndarray_file(data_path_train_y)
    y_train = deepcopy(y_train[int(np.floor(y_train.shape[0]*start)):int(np.floor(y_train.shape[0]*end))])

    return x_train, y_train
Example #11
def load_data(fp):

    if fp.endswith('bp'):
        data = bp.unpack_ndarray_file(fp)
    elif fp.endswith('jpg'):
        data = imread(fp)
    elif fp.endswith('hdf'):
        data = load_hdf_v2(fp).tolist()
    elif fp.endswith('pkl'):
        data = pickle.load(open(fp, 'r'))
    else:
        raise Exception('Not recognized.')

    return data
Example #12
    def load_next_partition(self, shared_xy):
        pfile_path = self.pfile_path_list[self.cur_pfile_index]
        if self.feat_mat is None or len(self.pfile_path_list) > 1:
            #log("Start reading partition "+pfile_path) 
            self.feat_mat = bp.unpack_ndarray_file(pfile_path)
            self.label_vec = bp.unpack_ndarray_file(pfile_path+".labels")  
            shared_x, shared_y = shared_xy

            self.feat_mat, self.label_vec = \
                preprocess_feature_and_label(self.feat_mat, self.label_vec, self.read_opts)
            if self.read_opts['random']:
                shuffle_feature_and_label(self.feat_mat, self.label_vec)

            shared_x.set_value(self.feat_mat, borrow=True)
            shared_y.set_value(self.label_vec.astype(theano.config.floatX), borrow=True)
            #log("Finished reading partition "+pfile_path)
        self.cur_frame_num = len(self.feat_mat)
        self.cur_pfile_index += 1

        if self.cur_pfile_index >= len(self.pfile_path_list):   # the end of one epoch
            self.end_reading = True
            self.cur_pfile_index = 0
        return pfile_path
Example #13
def load_data(fp):

    if fp.endswith('bp'):
        data = bp.unpack_ndarray_file(fp)
    elif fp.endswith('jpg'):
        data = imread(fp)
    elif fp.endswith('hdf'):
        data = load_hdf_v2(fp).tolist()
    elif fp.endswith('pkl'):
        data = pickle.load(open(fp, 'r'))
    else:
        raise Exception('Not recognized.')

    return data
Example #14
def dumpInput(outputPath,perplexity,data_spec):
    x = sorted(glob.glob(data_spec))
    tmp = bp.unpack_ndarray_file(x[0])
    dims = len(tmp[0])
    count = len(tmp)    
    for i in range (1,len(x)):
        tmp = bp.unpack_ndarray_file(x[i])
        if len(tmp[0]) != dims:    
            raise ValueError("Dimension "+str(len(tmp[0]))+" not same as first file "+str(dims))
        count =  count + len(tmp)
    files = []
    for p in perplexity:
        files.append(createDataFile(outputPath,'OriginalData',count,2,dims,p))
    c = 0
    for inputFile in x:
        tmp = bp.unpack_ndarray_file(inputFile)
        for f in files:
            appendData(f,tmp)
        c = c + len(tmp)
    for data_file in files:
        closeDataFile(data_file)
    if c != count:
        raise ValueError("Error output count ("+str(c)+") is not equal to calculated output count ("+str(count)+")")
    return (count,dims)
Example #15
    def __init__(self):
        scene.SceneCanvas.__init__(self, keys='interactive', size=(960, 960), show=True, bgcolor='black', title='MRI', vsync=False)

        self.unfreeze()
        self.view = self.central_widget.add_view()
        
        # self.vol_data = np.load('/home/yuncong/CSHL_volumes/volume_MD589_thumbnail.npz')['arr_0']

        # self.vol_data = np.load('/home/yuncong/CSHL_volumes/volume_MD589_scoreMap_2.npz')['arr_0']
        # self.vol_data = np.load('/home/yuncong/CSHL_volumes/volume_MD589_scoreMap_1.npz')['arr_0']
        # self.vol_data = np.load('/home/yuncong/CSHL_volumes/volume_MD589_scoreMap_9.npz')['arr_0']
        # self.vol_data = self.vol_data / self.vol_data.max()
        # self.vol_data[self.vol_data < .5] = 0

        self.vol_data = bp.unpack_ndarray_file('/home/yuncong/CSHL_volumes/volume_MD589_annotation.bp')
        # self.vol_data = np.load('/home/yuncong/CSHL_volumes/volume_MD589_annotationAllClasses.npz')['arr_0']
        # self.vol_data = np.load('/home/yuncong/CSHL_volumes/volume_MD589_labelmap.npz')['arr_0']
        # self.vol_data = np.load('/home/yuncong/CSHL_volumes/volume_MD594_predMap.npz')['arr_0']
        # self.vol_data = np.load('/home/yuncong/CSHL_volumes/volume_MD589_predMap.npz')['arr_0']
        self.vol_data = self.vol_data[::2,::2,::2].astype(np.float)/9.


        # self.vol_data = np.flipud(np.rollaxis(self.vol_data, 1))

        # self.sectionTo = 150
        self.sectionTo = 50

        colors = np.loadtxt('/home/yuncong/Brain/visualization/100colors.txt')

        # self.volume = scene.visuals.Volume(self.vol_data[:,0:self.sectionTo,:], parent=self.view.scene, cmap=get_colormap('coolwarm'))
        self.volume = scene.visuals.Volume(self.vol_data[:,0:self.sectionTo,:], parent=self.view.scene, method='mip', cmap=Colormap([(0,0,0),(0,1,0), (1,0,0), (0,1,0), (0,0,1), (1,1,0),
            (0,1,1), (1,1,0),(1,0.5,0),(0,0.5,0),(0,0,1)], interpolation='linear'))
        self.volume.transform = scene.STTransform(translate=(0,0,0))
        CMAP = self.volume.cmap

        self.section2D = self.vol_data[:,self.sectionTo,:]

        self.plane = scene.visuals.Image(self.section2D, parent=self.view.scene, cmap=CMAP, relative_step_size=1.5)
        # self.plane.transform = scene.STTransform(translate=(0,self.sectionTo,0))
        # self.plane.transform = scene.STTransform(translate=(0,0,0))
        self.plane.transform = MatrixTransform()
        self.plane.transform.rotate(90, (1,0,0))
        self.plane.transform.translate((0,self.sectionTo,0))
        
        self.plane.attach(BlackToAlpha())

        self.view.camera = scene.cameras.ArcballCamera(parent=self.view.scene)
Example #16
def get_train_X_y(train_filepath_prefix, location, fold_id=None):
    df_train = pd.read_csv(
        '.'.join([train_filepath_prefix, location + ".tsv"]),
        **KWARGS_READ_CSV)

    if isinstance(fold_id, int):
        crossval_index_filename = '.'.join([
            train_filepath_prefix, "index.crossval{f}".format(f=fold_id),
            location + ".blp"
        ])
        removal_index = pd.DatetimeIndex(
            bp.unpack_ndarray_file(crossval_index_filename))
        df_train.drop(removal_index, axis=0, inplace=True)

    df_train.dropna(axis=0, how="any", inplace=True)

    return df_train.values
Example #17
def unpack_file(fn, encoding='utf8'):
    """ Unpack numpy array from filename

    Supports binary data with bloscpack and text data with msgpack+blosc

    >>> unpack_file('foo.blp')  # doctest: +SKIP
    array([1, 2, 3])

    See also:
        pack_file
    """
    try:
        return bloscpack.unpack_ndarray_file(fn)
    except ValueError:
        with open(fn, 'rb') as f:
            return np.array(msgpack.unpackb(blosc.decompress(f.read()),
                                            encoding=encoding))
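
The docstring refers to a companion pack_file; the sketch below is only an assumption of how that symmetric write path could look, not the library's actual implementation:

import blosc
import bloscpack
import msgpack
import numpy as np

def pack_file_sketch(x, fn):
    """Write an ndarray with bloscpack, or other (e.g. text) data with msgpack+blosc."""
    if isinstance(x, np.ndarray) and x.dtype != object:
        # Binary path: read back by bloscpack.unpack_ndarray_file
        bloscpack.pack_ndarray_file(x, fn)
    else:
        # Text/object path: read back by the msgpack+blosc branch of unpack_file
        payload = blosc.compress(msgpack.packb(list(x)), typesize=1)
        with open(fn, 'wb') as f:
            f.write(payload)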
Example #18
def unpack_file(fn, encoding='utf8'):
    """ Unpack numpy array from filename

    Supports binary data with bloscpack and text data with msgpack+blosc

    >>> unpack_file('foo.blp')  # doctest: +SKIP
    array([1, 2, 3])

    See also:
        pack_file
    """
    try:
        return bloscpack.unpack_ndarray_file(fn)
    except ValueError:
        with open(fn, 'rb') as f:
            return np.array(
                msgpack.unpackb(blosc.decompress(f.read()), encoding=encoding))
Example #19
    def get_incr_classificator(self,
                               incr_datas,
                               incr_class_label,
                               test_datas,
                               test_class_label,
                               method="first"):
        """
        Process the incremental data set for the incremental Bayesian classifier
        :param incr_datas: [{"emotion-1-type": value, "sentence": {}},...]
                            (emotion-1-type and sentence are optional)
        :param incr_class_label:
        :param test_datas:
        :param test_class_label:
        :return:
        """
        def func(x, y):
            block.append(fit_incr_datas[x[3] + 1:y[3], :])
            label_block.append(incr_class_label[x[3] + 1:y[3]])
            block0.append(fit_incr_datas[y[3]:y[3] + 1, :])
            return y

        def handle(clf, method):
            if method == "zero":
                return handle_zero(clf)
            elif method == "first":
                return handle_first(clf)
            elif method == "second":
                return handle_second(clf)
            elif method == "third":
                return handle_third(clf)
            elif method == "four":
                return handle_four(clf)
            elif method == "five":
                return handle_five(clf)
            else:
                pass

        def handle_zero(clf):
            """
            Find the samples that the current classifier predicts correctly
            :param clf:
            :return:
            """
            incr_pre_label = clf.predict(fit_incr_datas)
            # Pick out the indices of the correctly predicted samples
            true_index = (incr_class_label == incr_pre_label).nonzero()

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            origin_proba = clf.predict_max_proba(test_datas)

            res = []
            for i0 in true_index[0]:
                text0 = fit_incr_datas.getrow(i0)
                c_pred0 = incr_pre_label[i0]

                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(
                    c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                loss0 = clf.metrics_another_zero_one_loss(
                    origin_proba, test_proba)

                res.append((loss0, text0, c_pred0, i0))

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            return res

        def handle_first(clf):
            # The most basic way of computing the classification loss
            # Classification loss; take the minimum
            loss = 9999
            # Text in the incremental set chosen first for updating the classifier parameters
            text = None
            # Class label corresponding to that chosen text
            c_pred = None
            # Index corresponding to that chosen text
            index = 0

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            for i0 in range(fit_incr_datas.shape[0]):
                c_true0 = incr_class_label[i0:i0 + 1][0]
                text0 = fit_incr_datas.getrow(i0)
                c_pred0 = clf.predict(text0)[0]
                if c_true0 == c_pred0:
                    loss = 0
                    text = text0
                    c_pred = c_pred0
                    index = i0
                    break
                else:
                    clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(
                        c_pred0, text0, copy=True)
                    test_proba = clf.predict_max_proba(test_datas)
                    loss0 = clf.metrics_my_zero_one_loss(test_proba)
                    if loss0 < loss:
                        loss = loss0
                        text = text0
                        c_pred = c_pred0
                        index = i0

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            return [(loss, text, c_pred, index)]

        def handle_second(clf):
            # An alternative way of computing the classification loss
            #            predict_true = handle(clf, "zero")
            #            if predict_true:
            #                return predict_true

            # Classification loss; take the minimum
            loss = 9999
            # Text in the incremental set chosen first for updating the classifier parameters
            text = None
            # Class label corresponding to that chosen text
            c_pred = None
            # Index corresponding to that chosen text
            index = 0

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            origin_proba = clf.predict_max_proba(test_datas)
            for i0 in range(fit_incr_datas.shape[0]):
                c_true0 = incr_class_label[i0:i0 + 1][0]
                text0 = fit_incr_datas.getrow(i0)
                c_pred0 = clf.predict(text0)[0]

                if c_true0 == c_pred0:
                    loss = 0
                    text = text0
                    c_pred = c_pred0
                    index = i0
                    break
                else:
                    clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(
                        c_pred0, text0, copy=True)
                    test_proba = clf.predict_max_proba(test_datas)
                    loss0 = clf.metrics_another_zero_one_loss(
                        origin_proba, test_proba)
                    if loss0 < loss:
                        loss = loss0
                        text = text0
                        c_pred = c_pred0
                        index = i0

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            return [(loss, text, c_pred, index)]

        def handle_third(clf):
            # todo
            # How to pick a suitable threshold
            def get_fit(e0):
                # Pick a suitable threshold
                return 20
#                while len((r >= e0).nonzero()[0]) == 0:
#                    e0 = int(e0 / 2)
#                return e0

            global e
            # Compute the class support
            proba = clf.predict_proba(fit_incr_datas)
            label = clf.predict(fit_incr_datas)
            max_proba = np.max(proba, axis=1).reshape(-1, 1)
            second_max_proba = -np.partition(-proba, kth=1, axis=1)[:, 1:2]
            # Support ratio
            r = np.divide(max_proba, second_max_proba)
            # Threshold
            e = get_fit(e)
            # select
            select_indices = (r >= e).nonzero()
            return [(0.0, fit_incr_datas.getrow(indice), label[indice], indice,
                     max_proba[indice][0]) for indice in select_indices[0]]

        def handle_third_another(clf):
            # Compute the class support
            proba = clf.predict_proba(fit_incr_datas)
            label = clf.predict(fit_incr_datas)
            max_proba = np.max(proba, axis=1).reshape(-1, 1)
            leave_proba = np.sum(proba, axis=1).reshape(-1, 1) - max_proba
            # Support ratio
            r = np.divide(max_proba, leave_proba)
            # Threshold
            e = 5
            # select
            select_indices = (r >= e).nonzero()
            return [(0.0, fit_incr_datas.getrow(indice), label[indice], indice,
                     max_proba[indice][0]) for indice in select_indices[0]]

        def handle_four(clf):
            # My Own Idea
            # Store the test results
            predict_true = handle(clf, "zero")
            if predict_true:
                return predict_true

            f_res = []

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            origin_proba = clf.predict_max_proba(test_datas)
            origin_label = clf.predict(test_datas)
            for i0 in range(fit_incr_datas.shape[0]):
                text0 = fit_incr_datas.getrow(i0)
                c_pred0 = clf.predict(text0)[0]
                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(
                    c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                label = clf.predict(test_datas)
                # Take the predicted class into account.
                # The following can happen: a sample has a high probability for some class, and after the
                # update it still has a high probability for some class, but the two classes may differ.
                smooth = np.asarray([
                    1 if origin_label[j] == label[j] else -1
                    for j in range(len(origin_label))
                ])
                np.multiply(test_proba, smooth, test_proba)

                f_test0 = pair_test(origin_proba, test_proba)
                if f_test0:
                    loss0 = clf.metrics_another_zero_one_loss(
                        origin_proba, test_proba)
                else:
                    loss0 = -1
                f_res.append((loss0, text0, c_pred0, i0, f_test0))

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            res = filter(lambda x: x[4], f_res)
            return [(r[0], r[1], r[2], r[3]) for r in res]

        def handle_five(clf):
            """
            Combine class support with the no-significant-difference test
            :param clf:
            :return:
            """
            predict_true = handle(clf, "zero")
            if predict_true:
                return predict_true

            fit_for_class_support = handle(clf, "third")
            print "The result of class-support: %d samples" % len(
                fit_for_class_support)

            #            fit_for_class_support = filter(lambda x: x[4] > clf.bayes.class_log_prior_[np.where(clf.bayes.classes_ == x[2])[0][0]], fit_for_class_support)
            #            print "The result of class-support: %d samples" % len(fit_for_class_support)
            # My Own Idea
            # Store the test results
            f_res = []

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            origin_proba = clf.predict_max_proba(test_datas)
            origin_label = clf.predict(test_datas)

            for i0 in range(len(fit_for_class_support)):
                text0 = fit_for_class_support[i0][1]
                c_pred0 = fit_for_class_support[i0][2]
                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(
                    c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                label = clf.predict(test_datas)
                # Take the predicted class into account.
                # The following can happen: a sample has a high probability for some class, and after the
                # update it still has a high probability for some class, but the two classes may differ.
                smooth = np.asarray([
                    1 if origin_label[j] == label[j] else -1
                    for j in range(len(origin_label))
                ])
                np.multiply(test_proba, smooth, test_proba)

                f_test0 = pair_test(origin_proba, test_proba)
                if f_test0:
                    loss0 = clf.metrics_another_zero_one_loss(
                        origin_proba, test_proba)
                else:
                    loss0 = -1
                f_res.append((loss0, text0, c_pred0, i0, f_test0))

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            res = filter(lambda x: x[4], f_res)
            return [(r[0], r[1], r[2], r[3]) for r in res]

        method_options = ("first", "second", "third", "four", "five")
        if method not in method_options:
            raise ValueError("method has to be one of " + str(method_options))

        print "Begin Increment Classification: ", time.strftime(
            '%Y-%m-%d %H:%M:%S')
        # Write the parameters to disk / read them back
        dir_ = os.path.join(TEXT_OUT, "bayes_args")
        FileUtil.mkdirs(dir_)

        suffix = ".blp"
        class_count_out = os.path.join(dir_, "class_count_" + method + suffix)
        class_log_prob_out = os.path.join(dir_,
                                          "class_log_prob_" + method + suffix)
        feature_count_out = os.path.join(dir_,
                                         "feature_count_" + method + suffix)
        feature_log_prob_out = os.path.join(
            dir_, "feature_log_prob_" + method + suffix)

        out = (class_count_out, class_log_prob_out, feature_count_out,
               feature_log_prob_out)

        if self.f or not FileUtil.isexist(out) or FileUtil.isempty(out):
            if not hasattr(self.bayes, "feature_log_prob_") or not hasattr(
                    self.bayes, "class_log_prior_"):
                raise ValueError(
                    "please use get_classificator() to get classificator firstly"
                )

            fit_incr_datas = self.fit_data(incr_datas)
            incr_class_label = np.asanyarray(incr_class_label)
            # Save the data that needs to be appended to key_words.txt
            add_to_key_words = []

            i = 0
            while fit_incr_datas.nnz > 0:
                print
                print "Begin Increment Classification_%d: %s" % (
                    i, time.strftime('%Y-%m-%d %H:%M:%S'))

                need_to_update = handle(self, method)
                # If nothing can be updated, the remaining incremental set does not suit the current classifier, so discard it
                # During the updates the incremental set keeps shrinking
                block = []
                label_block = []
                # During the updates the training set keeps growing
                block0 = []
                if need_to_update:
                    # Sort by loss in ascending order
                    accord_to_loss = sorted(need_to_update, key=lambda x: x[0])
                    for data in accord_to_loss:
                        self.bayes.update(data[2], data[1])
                    # Sort by index
                    accord_to_index = sorted(need_to_update,
                                             key=lambda x: x[3])

                    #                    index = [index0[3] for index0 in accord_to_index]
                    #                    [add_to_key_words.append(raw_incr_datas[index0]) for index0 in index]
                    #                    raw_incr_datas = [raw for index0, raw in enumerate(raw_incr_datas) if index0 not in index]

                    block0.append(test_datas)
                    reduce(func, accord_to_index, (0.0, "", "", -1))
                    block.append(fit_incr_datas[accord_to_index[-1][3] +
                                                1:, :])
                    label_block.append(
                        incr_class_label[accord_to_index[-1][3] + 1:])
                    test_datas = sp.vstack(block0)
                    print "This times updates %d samples" % len(need_to_update)
                else:
                    block.append(fit_incr_datas[0:0, :])
                    label_block.append(incr_class_label[0:0])
                    print "Finally leaving %d samples that unnecessary added to train sets" % fit_incr_datas.shape[
                        0]
                fit_incr_datas = sp.vstack(block)
                incr_class_label = np.concatenate(label_block)
                i += 1

            bayes_args = (self.bayes.class_count_, self.bayes.class_log_prior_,
                          self.bayes.feature_count_,
                          self.bayes.feature_log_prob_)
            # Save to disk
            map(lambda x: bp.pack_ndarray_file(x[0], x[1]),
                zip(bayes_args, out))
            # Append
#            path = os.path.join(TEXT_OUT, "key_words/CHIFeature.txt")
#            FileUtil.write(path, add_to_key_words, "a")
        else:
            # speed up
            self.bayes.class_count_ = bp.unpack_ndarray_file(out[0])
            self.bayes.class_log_prior_ = bp.unpack_ndarray_file(out[1])
            self.bayes.feature_count_ = bp.unpack_ndarray_file(out[2])
            self.bayes.feature_log_prob_ = bp.unpack_ndarray_file(out[3])


#            self.bayes.class_count_ = np.loadtxt(out[0])
#            self.bayes.class_log_prior_ = np.loadtxt(out[1])
#            self.bayes.feature_count_ = np.loadtxt(out[2])
#            self.bayes.feature_log_prob_ = np.loadtxt(out[3])

        print "Increment Classification Done: ", time.strftime(
            '%Y-%m-%d %H:%M:%S')
        print
        return self
 def __call__(self):
     if path.exists(self.filename):
         return bloscpack.unpack_ndarray_file(self.filename)
     result = self.f()
     bloscpack.pack_ndarray_file(result, self.filename)
     return result
 def wrapped_f():
     if path.exists(self.filename):
         return bloscpack.unpack_ndarray_file(self.filename)
     result = f()
     bloscpack.pack_ndarray_file(result, self.filename)
     return result
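
The two fragments above look like pieces of a bloscpack-backed on-disk cache; a self-contained sketch of that pattern follows (the class name and decorator shape are assumptions):

from os import path
import bloscpack

class blp_cached(object):
    """Hypothetical decorator: cache a zero-argument function's ndarray result in a .blp file."""
    def __init__(self, filename):
        self.filename = filename

    def __call__(self, f):
        def wrapped_f():
            if path.exists(self.filename):
                return bloscpack.unpack_ndarray_file(self.filename)
            result = f()
            bloscpack.pack_ndarray_file(result, self.filename)
            return result
        return wrapped_f

Used as @blp_cached('/tmp/features.blp'), the wrapped function computes once and later calls reuse the packed file.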
Example #22
    features_rotated = np.reshape([
        np.roll(features_tabular[i], -ai, axis=-1)
        for i, ai in enumerate(max_angle_indices)
    ], (fs.shape[0], dm.n_freq * dm.n_angle))

    return features_rotated


t = time.time()
sys.stderr.write('load filtered values ...')

features = []
for i in range(dm.n_kernel):

    sys.stderr.write('%d\n' % i)
    a = bp.unpack_ndarray_file(os.environ['GORDON_RESULT_DIR'] +
                               '/feature_%03d.bp' % i).reshape((-1, ))

    if which_part == 0:
        features.append(a[:len(a) / 2])
    else:
        features.append(a[len(a) / 2:])

features = np.asarray(features).T

sys.stderr.write('done in %f seconds\n' % (time.time() - t))

t = time.time()
sys.stderr.write('rotate features ...')

items_per_job = 100
Example #23
 def read_blp(serialized_filepath):
     return bp.unpack_ndarray_file(serialized_filepath)
b_conf_matrix = True
word_vector_size = 300
sentence_length = 43

# Load train data
print "\nunpickling training data..."
x_train, y_train = _load_train_data(data_ratio, 0.0)
# pdb.set_trace()

print "\nTraining matrix shape:"
print x_train.shape
print "\nTraining matrix size in bytes: "
print x_train.nbytes
# Load test data
print "\nunpickling testing data..."
x_test = bp.unpack_ndarray_file(data_path_test_x)
y_test = bp.unpack_ndarray_file(data_path_test_y)
N_test = x_test.shape[0]

print N_test

# data_info
print "\nreading info..."
N = x_train.shape[0]
dim = x_train.shape[1]
max_len = x_train.shape[2]

# reshape data to match chainer format
x_train = np.reshape(x_train, (x_train.shape[0], 1, word_vector_size, max_len))

# Hyper search params
Example #25
import pylab as Plot
#from OriginalTSNE import tsne
from BHTSNEDropInReplacementTSNE import processResultFileName
import bloscpack as bp
import pandas as pd
import seaborn as sns
import glob
# The body below uses numpy (aliased as Math) and matplotlib.pyplot (as plt); add the missing imports
import numpy as Math
import matplotlib.pyplot as plt

if __name__ == "__main__":
  outputLabels = '/tmp/data/testpartition*.blp.labels'
  y = sorted(glob.glob(outputLabels))
  print("Found labels "+str(y))
  files = ['resultLayer7Perplexity20.000000.dat','resultLayer7Perplexity30.000000.dat','resultLayer7Perplexity5.000000.dat','resultLayer7Perplexity50.000000.dat']
  for f in files: 
        inputData =  f
        labels = bp.unpack_ndarray_file(y[0])
        for i in range(1,len(y)):
            tmpl = bp.unpack_ndarray_file(y[i])
            labels = Math.concatenate((labels,tmpl))
        Y = processResultFileName(inputData)
        if Y.shape[0] != labels.shape[0]:
            raise ValueError("X shape does not match label shape!!")    
        print(Y.shape)
        print(labels.shape)
        df = pd.DataFrame(Y, columns=['x', 'y'])
        df['label']=labels
        class1 = df.query("label == 1.0")
        class2 = df.query("label == 0.0")
        sns.set(style="darkgrid")   
        f, ax = plt.subplots(figsize=(8, 8))
        ax.set_aspect("equal")
Example #26
pred_file = sys.argv[1]

if '.gz' in pred_file:
    pred_mat = pickle.load(gzip.open(pred_file, 'rb'))
else:
    pred_mat = pickle.load(open(pred_file, 'rb'))
 
l = sorted(glob.glob(sys.argv[2]))
if len(l) == 0:
    log("ERROR in show_results. Test partitions is empty. Argument "+sys.argv[2])

subclassificationMapping = pd.read_csv("/ssd/subclassificationMapping",sep="=",names=["NUMBER","NAME"],index_col="NUMBER")
print(subclassificationMapping.loc[121][0])


test_labels = bp.unpack_ndarray_file(l[0]+".labels")
test_labels = test_labels.astype(numpy.int32)

# Read the subclassifications
assert l[0][-4:] == '.blp' , "Invalid extension "+l[0][-4:]
fn = l[0][:-4]+".ignored.csv.gz"
df = pd.read_csv(fn,sep=';',usecols=['SUBCLASSIFICATION'],dtype=numpy.int32)
assert df.shape[0] == test_labels.shape[0],"Shapes not equal"

# End read the subclassification

for i in range (1,len(l)):
    stop_if_stop_is_requested()
    lab = bp.unpack_ndarray_file(l[i]+".labels")
    lab = lab.astype(numpy.int32)
    test_labels = numpy.concatenate((test_labels,lab))
Example #27
 def decompress(self):
     it = bp.unpack_ndarray_file(self.storage)
Example #28
b_conf_matrix = True
word_vector_size = 300
sentence_length = 43

# Load train data
print "\nunpickling training data..."
x_train, y_train = _load_train_data(data_ratio, 0.0)
# pdb.set_trace()

print "\nTraining matrix shape:"
print x_train.shape
print "\nTraining matrix size in bytes: "
print x_train.nbytes
# Load test data
print "\nunpickling testing data..."
x_test = bp.unpack_ndarray_file(data_path_test_x)
y_test = bp.unpack_ndarray_file(data_path_test_y)
N_test = x_test.shape[0]

print N_test

# data_info
print "\nreading info..."
N = x_train.shape[0]
dim = x_train.shape[1]
max_len = x_train.shape[2]

# reshape data to match chainer format
x_train = np.reshape(x_train, (x_train.shape[0], 1, word_vector_size, max_len))

# Hyper search params
Example #29
    def get_incr_classificator(self, incr_datas, incr_class_label, test_datas, test_class_label, method="first"):
        """
        Process the incremental data set for the incremental Bayesian classifier
        :param incr_datas: [{"emotion-1-type": value, "sentence": {}},...]
                            (emotion-1-type and sentence are optional)
        :param incr_class_label:
        :param test_datas:
        :param test_class_label:
        :return:
        """
        def func(x, y):
            block.append(fit_incr_datas[x[3] + 1: y[3], :])
            label_block.append(incr_class_label[x[3] + 1: y[3]])
            block0.append(fit_incr_datas[y[3]:y[3] + 1, :])
            return y

        def handle(clf, method):
            if method == "zero":
                return handle_zero(clf)
            elif method == "first":
                return handle_first(clf)
            elif method == "second":
                return handle_second(clf)
            elif method == "third":
                return handle_third(clf)
            elif method == "four":
                return handle_four(clf)
            elif method == "five":
                return handle_five(clf)
            else:
                pass

        def handle_zero(clf):
            """
            Find the samples that the current classifier predicts correctly
            :param clf:
            :return:
            """
            incr_pre_label = clf.predict(fit_incr_datas)
            # Pick out the indices of the correctly predicted samples
            true_index = (incr_class_label == incr_pre_label).nonzero()

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            origin_proba = clf.predict_max_proba(test_datas)

            res = []
            for i0 in true_index[0]:
                text0 = fit_incr_datas.getrow(i0)
                c_pred0 = incr_pre_label[i0]

                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)

                res.append((loss0, text0, c_pred0, i0))

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            return res

        def handle_first(clf):
            # The most basic way of computing the classification loss
            # Classification loss; take the minimum
            loss = 9999
            # Text in the incremental set chosen first for updating the classifier parameters
            text = None
            # Class label corresponding to that chosen text
            c_pred = None
            # Index corresponding to that chosen text
            index = 0

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            for i0 in range(fit_incr_datas.shape[0]):
                c_true0 = incr_class_label[i0: i0 + 1][0]
                text0 = fit_incr_datas.getrow(i0)
                c_pred0 = clf.predict(text0)[0]
                if c_true0 == c_pred0:
                    loss = 0
                    text = text0
                    c_pred = c_pred0
                    index = i0
                    break
                else:
                    clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
                    test_proba = clf.predict_max_proba(test_datas)
                    loss0 = clf.metrics_my_zero_one_loss(test_proba)
                    if loss0 < loss:
                        loss = loss0
                        text = text0
                        c_pred = c_pred0
                        index = i0

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            return [(loss, text, c_pred, index)]

        def handle_second(clf):
            # An alternative way of computing the classification loss
#            predict_true = handle(clf, "zero")
#            if predict_true:
#                return predict_true

            # Classification loss; take the minimum
            loss = 9999
            # Text in the incremental set chosen first for updating the classifier parameters
            text = None
            # Class label corresponding to that chosen text
            c_pred = None
            # Index corresponding to that chosen text
            index = 0

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            origin_proba = clf.predict_max_proba(test_datas)
            for i0 in range(fit_incr_datas.shape[0]):
                c_true0 = incr_class_label[i0: i0 + 1][0]
                text0 = fit_incr_datas.getrow(i0)
                c_pred0 = clf.predict(text0)[0]

                if c_true0 == c_pred0:
                    loss = 0
                    text = text0
                    c_pred = c_pred0
                    index = i0
                    break
                else:
                    clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
                    test_proba = clf.predict_max_proba(test_datas)
                    loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
                    if loss0 < loss:
                        loss = loss0
                        text = text0
                        c_pred = c_pred0
                        index = i0

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            return [(loss, text, c_pred, index)]

        def handle_third(clf):
            # todo
            # How to pick a suitable threshold
            def get_fit(e0):
                # Pick a suitable threshold
                return 20
#                while len((r >= e0).nonzero()[0]) == 0:
#                    e0 = int(e0 / 2)
#                return e0

            global e
            # Compute the class support
            proba = clf.predict_proba(fit_incr_datas)
            label = clf.predict(fit_incr_datas)
            max_proba = np.max(proba, axis=1).reshape(-1, 1)
            second_max_proba = -np.partition(-proba, kth=1, axis=1)[:, 1:2]
            # Support ratio
            r = np.divide(max_proba, second_max_proba)
            # Threshold
            e = get_fit(e)
            # select
            select_indices = (r >= e).nonzero()
            return [(0.0, fit_incr_datas.getrow(indice), label[indice], indice, max_proba[indice][0]) for indice in select_indices[0]]

        def handle_third_another(clf):
            # Compute the class support
            proba = clf.predict_proba(fit_incr_datas)
            label = clf.predict(fit_incr_datas)
            max_proba = np.max(proba, axis=1).reshape(-1, 1)
            leave_proba = np.sum(proba, axis=1).reshape(-1, 1) - max_proba
            # Support ratio
            r = np.divide(max_proba, leave_proba)
            # Threshold
            e = 5
            # select
            select_indices = (r >= e).nonzero()
            return [(0.0, fit_incr_datas.getrow(indice), label[indice], indice, max_proba[indice][0]) for indice in select_indices[0]]

        def handle_four(clf):
            # My Own Idea
            # Store the test results
            predict_true = handle(clf, "zero")
            if predict_true:
                return predict_true

            f_res = []

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            origin_proba = clf.predict_max_proba(test_datas)
            origin_label = clf.predict(test_datas)
            for i0 in range(fit_incr_datas.shape[0]):
                text0 = fit_incr_datas.getrow(i0)
                c_pred0 = clf.predict(text0)[0]
                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                label = clf.predict(test_datas)
                # Take the predicted class into account.
                # The following can happen: a sample has a high probability for some class, and after the
                # update it still has a high probability for some class, but the two classes may differ.
                smooth = np.asarray([1 if origin_label[j] == label[j] else -1 for j in range(len(origin_label))])
                np.multiply(test_proba, smooth, test_proba)

                f_test0 = pair_test(origin_proba, test_proba)
                if f_test0:
                    loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
                else:
                    loss0 = -1
                f_res.append((loss0, text0, c_pred0, i0, f_test0))

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            res = filter(lambda x: x[4], f_res)
            return [(r[0], r[1], r[2], r[3]) for r in res]

        def handle_five(clf):
            """
            Combine class support with the no-significant-difference test
            :param clf:
            :return:
            """
            predict_true = handle(clf, "zero")
            if predict_true:
                return predict_true

            fit_for_class_support = handle(clf, "third")
            print "The result of class-support: %d samples" % len(fit_for_class_support)

#            fit_for_class_support = filter(lambda x: x[4] > clf.bayes.class_log_prior_[np.where(clf.bayes.classes_ == x[2])[0][0]], fit_for_class_support)
#            print "The result of class-support: %d samples" % len(fit_for_class_support)
            # My Own Idea
            # Store the test results
            f_res = []

            origin_class_log_prob_ = clf.bayes.class_log_prior_
            origin_feature_log_prob_ = clf.bayes.feature_log_prob_
            origin_proba = clf.predict_max_proba(test_datas)
            origin_label = clf.predict(test_datas)

            for i0 in range(len(fit_for_class_support)):
                text0 = fit_for_class_support[i0][1]
                c_pred0 = fit_for_class_support[i0][2]
                clf.bayes.class_log_prior_, clf.bayes.feature_log_prob_ = clf.bayes.update(c_pred0, text0, copy=True)
                test_proba = clf.predict_max_proba(test_datas)
                label = clf.predict(test_datas)
                # Take the predicted class into account.
                # The following can happen: a sample has a high probability for some class, and after the
                # update it still has a high probability for some class, but the two classes may differ.
                smooth = np.asarray([1 if origin_label[j] == label[j] else -1 for j in range(len(origin_label))])
                np.multiply(test_proba, smooth, test_proba)

                f_test0 = pair_test(origin_proba, test_proba)
                if f_test0:
                    loss0 = clf.metrics_another_zero_one_loss(origin_proba, test_proba)
                else:
                    loss0 = -1
                f_res.append((loss0, text0, c_pred0, i0, f_test0))

                clf.bayes.class_log_prior_ = origin_class_log_prob_
                clf.bayes.feature_log_prob_ = origin_feature_log_prob_

            res = filter(lambda x: x[4], f_res)
            return [(r[0], r[1], r[2], r[3]) for r in res]

        method_options = ("first", "second", "third", "four", "five")
        if method not in method_options:
            raise ValueError("method has to be one of " + str(method_options))

        print "Begin Increment Classification: ", time.strftime('%Y-%m-%d %H:%M:%S')
        # Write the parameters to disk / read them back
        dir_ = os.path.join(TEXT_OUT, "bayes_args")
        FileUtil.mkdirs(dir_)

        suffix = ".blp"
        class_count_out = os.path.join(dir_, "class_count_" + method + suffix)
        class_log_prob_out = os.path.join(dir_, "class_log_prob_" + method + suffix)
        feature_count_out = os.path.join(dir_, "feature_count_" + method + suffix)
        feature_log_prob_out = os.path.join(dir_, "feature_log_prob_" + method + suffix)

        out = (class_count_out, class_log_prob_out, feature_count_out, feature_log_prob_out)

        if self.f or not FileUtil.isexist(out) or FileUtil.isempty(out):
            if not hasattr(self.bayes, "feature_log_prob_") or not hasattr(self.bayes, "class_log_prior_"):
                raise ValueError("please use get_classificator() to get classificator firstly")

            fit_incr_datas = self.fit_data(incr_datas)
            incr_class_label = np.asanyarray(incr_class_label)
            # Save the data that needs to be appended to key_words.txt
            add_to_key_words = []

            i = 0
            while fit_incr_datas.nnz > 0:
                print
                print "Begin Increment Classification_%d: %s" % (i, time.strftime('%Y-%m-%d %H:%M:%S'))

                need_to_update = handle(self, method)
                # If nothing can be updated, the remaining incremental set does not suit the current classifier, so discard it
                # During the updates the incremental set keeps shrinking
                block = []
                label_block = []
                # During the updates the training set keeps growing
                block0 = []
                if need_to_update:
                    # Sort by loss in ascending order
                    accord_to_loss = sorted(need_to_update, key=lambda x: x[0])
                    for data in accord_to_loss:
                        self.bayes.update(data[2], data[1])
                    # Sort by index
                    accord_to_index = sorted(need_to_update, key=lambda x: x[3])

#                    index = [index0[3] for index0 in accord_to_index]
#                    [add_to_key_words.append(raw_incr_datas[index0]) for index0 in index]
#                    raw_incr_datas = [raw for index0, raw in enumerate(raw_incr_datas) if index0 not in index]

                    block0.append(test_datas)
                    reduce(func, accord_to_index, (0.0, "", "", -1))
                    block.append(fit_incr_datas[accord_to_index[-1][3] + 1:, :])
                    label_block.append(incr_class_label[accord_to_index[-1][3] + 1:])
                    test_datas = sp.vstack(block0)
                    print "This times updates %d samples" % len(need_to_update)
                else:
                    block.append(fit_incr_datas[0:0, :])
                    label_block.append(incr_class_label[0:0])
                    print "Finally leaving %d samples that unnecessary added to train sets" % fit_incr_datas.shape[0]
                fit_incr_datas = sp.vstack(block)
                incr_class_label = np.concatenate(label_block)
                i += 1

            bayes_args = (self.bayes.class_count_, self.bayes.class_log_prior_,
                          self.bayes.feature_count_, self.bayes.feature_log_prob_)
            # Save to disk
            map(lambda x: bp.pack_ndarray_file(x[0], x[1]), zip(bayes_args, out))
            # Append
#            path = os.path.join(TEXT_OUT, "key_words/CHIFeature.txt")
#            FileUtil.write(path, add_to_key_words, "a")
        else:
            # speed up
            self.bayes.class_count_ = bp.unpack_ndarray_file(out[0])
            self.bayes.class_log_prior_ = bp.unpack_ndarray_file(out[1])
            self.bayes.feature_count_ = bp.unpack_ndarray_file(out[2])
            self.bayes.feature_log_prob_ = bp.unpack_ndarray_file(out[3])

#            self.bayes.class_count_ = np.loadtxt(out[0])
#            self.bayes.class_log_prior_ = np.loadtxt(out[1])
#            self.bayes.feature_count_ = np.loadtxt(out[2])
#            self.bayes.feature_log_prob_ = np.loadtxt(out[3])

        print "Increment Classification Done: ", time.strftime('%Y-%m-%d %H:%M:%S')
        print
        return self
    pickle.dump((l_overall_sentiment, d_source_sentiment),
                open('./stats_july.pkl', 'wb'))


if __name__ == '__main__':
    # get the models
    ensemble_model_data = conv_net_model.get_ensemble()

    # load test data
    root_path = '/Users/miljan/PycharmProjects/entity-dependent-sentiment-mining/data/blp/word2vec/two_classes/'
    s_x_test = 'test_x.blp'
    s_y_test = 'test_y.blp'

    # Load test data
    print "\nunpickling testing data..."
    x_test = bp.unpack_ndarray_file(root_path + s_x_test)
    y_test = bp.unpack_ndarray_file(root_path + s_y_test)

    # reshape data to match chainer format
    x_test = np.reshape(x_test, (x_test.shape[0], 1, 300, 43))

    print 'Predicting'

    predictions = []
    for entry in x_test:
        predictions.append(
            conv_net_model.ensemble_predict(entry, ensemble_model_data))

    counter = 0
    for real, pred in zip(y_test, predictions):
        if real == pred:
            counter += 1
    tf_parameter_dict = load_alignment_parameters_v2(
        stack_f=atlas_name,
        stack_m=stack,
        warp_setting=24,
        vol_type_f='annotationAsScore',
        vol_type_m='annotationAsScore',
        downscale=32)
    cf = tf_parameter_dict['centroid_f']
    cm = tf_parameter_dict['centroid_m']
    of = tf_parameter_dict['crop_origin_f']
    om = tf_parameter_dict['crop_origin_m']
    params = tf_parameter_dict['params']
    Rt = np.reshape(params, (3, 4))
    R = Rt[:3, :3]
    t = Rt[:3, 3]
    moving_brain_markers_raw = bp.unpack_ndarray_file(
        get_stacy_markers_filepath(stack=stack, structure='All'))
    brain_markers_aligned2atlas = np.dot(
        R, (moving_brain_markers_raw - om - cm).T).T + t + of + cf
    all_markers[stack] = brain_markers_aligned2atlas

#%%
thickness = 5
cut_plane_normal = (0., 0., 1.)

testplot = 0
markers_by_struct = pd.DataFrame()

for OZ in range(-440, 440, 10):
    cut_plane_origin = (0., 0., OZ)

    ######
Example #32
 def decompress(self):
     it = bp.unpack_ndarray_file(self.storage)
Example #33
def read_numpy(
    path
    ):
    return bp.unpack_ndarray_file(path)
Example #34
    module = importlib.import_module(args.module)
    coder_class = getattr(module, args.model)


    # load data
    # ---------

    # Primary dataset
    if args.data is None:
        #load mnist as default
        train_data = mnist_data.read_data_sets('MNIST_data').train
        datas = [train_data.images]
        if hasattr(coder_class, 'prep_mnist'):
            datas = [coder_class.prep_mnist(datas[0])]
    else:
        datas = [bp.unpack_ndarray_file(fname) for fname in args.data]
    if data_fn: datas = data_fn(datas)
    train_mode = 'recode'
    
    # auxiliary optional datasets
    if args.labels:
        labels = bp.unpack_ndarray_file(args.labels)
        datas.append(labels)
        train_mode = 'label'
    if args.targets:
        targets = bp.unpack_ndarray_file(args.targets)
        if target_fn: targets = target_fn(targets)
        datas.append(targets)
        train_mode = 'target'