Example #1
    def initialize(self, args):
        """`initialize` is called only once when the model is being loaded.
        Implementing `initialize` function is optional. This function allows
        the model to initialize any state associated with this model.

        Parameters
        ----------
        args : dict
          Both keys and values are strings. The dictionary keys and values are:
          * model_config: A JSON string containing the model configuration
          * model_instance_kind: A string containing model instance kind
          * model_instance_device_id: A string containing model instance device ID
          * model_repository: Model repository path
          * model_version: Model version
          * model_name: Model name
        """

        # model_config is passed in as a JSON string, so parse it here
        # (requires `import json` and `from pathlib import Path` at module level)
        self.model_config = json.loads(args['model_config'])
        self.model_instance_device_id = json.loads(
            args['model_instance_device_id'])
        import numba.cuda as cuda
        cuda.select_device(self.model_instance_device_id)
        import cudf
        from cudf.core.subword_tokenizer import SubwordTokenizer

        # get vocab
        v_p = Path(__file__).with_name('vocab_hash.txt')

        self.cudf_tokenizer = SubwordTokenizer(v_p, do_lower_case=True)
        self.cudf_lib = cudf
        self.seq_len = 256
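Outside Triton, the `args` dict can be mocked for quick local testing. A minimal sketch, assuming the enclosing class follows the Python backend's `TritonPythonModel` naming convention; all config values below are illustrative:

import json

args = {
    'model_config': json.dumps({'name': 'tokenizer', 'max_batch_size': 8}),
    'model_instance_kind': 'GPU',
    'model_instance_device_id': '0',
    'model_repository': '/models/tokenizer',
    'model_version': '1',
    'model_name': 'tokenizer',
}

model = TritonPythonModel()  # hypothetical: the class this `initialize` belongs to
model.initialize(args)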
Example #2
def box2d_rotate_iou(boxes2d, gt_boxes2d, device_id=0):

    # Inputs:
    #   boxes2d: (N1, 5) x,y,w,l,r
    #   gt_boxes2d: (N2, 5) x,y,w,l,r
    # Outputs:
    #   iou: (N1, N2)
    # remember the caller's dtype before casting for the kernel
    box_dtype = boxes2d.dtype
    boxes2d = boxes2d.astype(np.float32)
    gt_boxes2d = gt_boxes2d.astype(np.float32)
    N1 = boxes2d.shape[0]
    N2 = gt_boxes2d.shape[0]
    iou = np.zeros((N1, N2), dtype=np.float32)
    if N1 == 0 or N2 == 0:
        return iou

    threadsPerBlock = 8 * 8
    cuda.select_device(device_id)
    blockspergrid = (DIVUP(N1, threadsPerBlock), DIVUP(N2, threadsPerBlock))

    stream = cuda.stream()
    with stream.auto_synchronize():
        boxes_dev = cuda.to_device(boxes2d.reshape([-1]), stream)
        query_boxes_dev = cuda.to_device(gt_boxes2d.reshape([-1]), stream)
        iou_dev = cuda.to_device(iou.reshape([-1]), stream)
        rotate_iou_kernel[blockspergrid, threadsPerBlock,
                          stream](N1, N2, boxes_dev, query_boxes_dev, iou_dev)
        iou_dev.copy_to_host(iou.reshape([-1]), stream=stream)

    return iou.astype(box_dtype)  # restore the caller's original dtype
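`DIVUP` (spelled `div_up` in later examples) is not defined in the snippet; a plausible implementation, assuming it is the usual integer ceiling division used for grid sizing:

def DIVUP(m, n):
    # ceiling division: smallest number of blocks of size n that covers m items
    return (m + n - 1) // n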
Example #3
def rotate_iou_gpu_eval(boxes, query_boxes, criterion=-1, device_id=0):
    """
    rotated box iou running in gpu. 8x faster than cpu version (take 5ms in one example with numba.cuda code).
    convert from [this project](https://github.com/hongzhenwang/RRPN-revise/tree/master/lib/rotation).
    :param boxes: rbboxes, format: centers, dims, angles(clockwise when positive), FloatTensor[N, 5]
    :param query_boxes: FloatTensor[K, 5]
    :param criterion: optional, default: -1
    :param device_id: int, optional, default: 0
    :return:
    """
    box_dtype = boxes.dtype  # remember the caller's dtype before casting
    boxes = boxes.astype(np.float32)
    query_boxes = query_boxes.astype(np.float32)
    N = boxes.shape[0]
    K = query_boxes.shape[0]
    iou = np.zeros((N, K), dtype=np.float32)
    if N == 0 or K == 0:
        return iou
    threads_per_block = 8 * 8
    cuda.select_device(device_id)
    blocks_per_grid = (div_up(N,
                              threads_per_block), div_up(K, threads_per_block))

    stream = cuda.stream()
    with stream.auto_synchronize():
        boxes_dev = cuda.to_device(boxes.reshape([-1]), stream)
        query_boxes_dev = cuda.to_device(query_boxes.reshape([-1]), stream)
        iou_dev = cuda.to_device(iou.reshape([-1]), stream)
        rotate_iou_kernel_eval[blocks_per_grid, threads_per_block,
                               stream](N, K, boxes_dev, query_boxes_dev,
                                       iou_dev, criterion)
        iou_dev.copy_to_host(iou.reshape([-1]), stream=stream)
    return iou.astype(box_dtype)  # restore the caller's original dtype
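A hypothetical call, assuming `rotate_iou_kernel_eval` is compiled elsewhere in the module:

import numpy as np

boxes = np.array([[0.0, 0.0, 2.0, 4.0, 0.0]], dtype=np.float32)   # x, y, w, l, r
query = np.array([[0.0, 0.0, 2.0, 4.0, 0.0],
                  [9.0, 9.0, 2.0, 4.0, 0.5]], dtype=np.float32)
iou = rotate_iou_gpu_eval(boxes, query)  # shape (1, 2); identical boxes give IoU 1.0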
Example #4
def gpuWork(stuff):
    batch, batch_size, gpu_id, data_len, data, parser = stuff
    # integer division so ThreadPool receives an int worker count
    workers = multiprocessing.cpu_count() // len(cuda.list_devices())
    pool = ThreadPool(workers)
    cuda.select_device(gpu_id)

    start = batch * batch_size
    end = (batch + 1) * batch_size
    if end > data_len:
        end = data_len
    results = pool.map(
        randRotateAndTranslateWrap,
        zip([parser] * (end - start), data[start:end], [
            gpu_id,
        ] * len(data[start:end]), [True] * len(data[start:end])))

    images, energies = [], []
    for im, e in results:
        images.append(im)
        energies.append(e)

    images = np.array(images, dtype=np.float32)
    energies = np.array(energies, dtype=np.float32)
    pool.close()
    pool.join()
    return images, energies
Example #5
def gpu_dmt_timeseries(dedisp_times, psr_data, max_delay, device=0):
    """
    :param cand: Candidate object
    :param device: GPU id
    :return:
    """
    cuda.select_device(device)
    dm_time = np.zeros((dedisp_times.shape[1], int(psr_data.shape[0]-max_delay)), dtype=np.float32)

    @cuda.jit(fastmath=True)
    def gpu_dmt(cand_data_in, all_delays, cand_data_out):
        ii, jj, kk = cuda.grid(3)
        if ii < cand_data_in.shape[0] and jj < cand_data_out.shape[1] and kk < all_delays.shape[1]:
            cuda.atomic.add(cand_data_out, (kk, jj), cand_data_in[ii, jj + all_delays[ii, kk]])

    #with cuda.pinned(dedisp_times, dm_time, psr_data):
    all_delays = cuda.to_device(dedisp_times)
    dmt_return = cuda.device_array(dm_time.shape, dtype=np.float32)

    cand_data_in = cuda.to_device(np.array(psr_data.T, dtype=psr_data.dtype))

    threadsperblock = (4, 8, 32)
    blockspergrid_x = math.ceil(cand_data_in.shape[0] / threadsperblock[0])
    blockspergrid_y = math.ceil(cand_data_in.shape[1] / threadsperblock[1])
    blockspergrid_z = math.ceil(dedisp_times.shape[1] / threadsperblock[2])
    blockspergrid = (blockspergrid_x, blockspergrid_y, blockspergrid_z)

    gpu_dmt[blockspergrid, threadsperblock](cand_data_in, all_delays,  dmt_return)
    dm_time = dmt_return.copy_to_host()
    #print(all_delays.shape)
    cuda.close()
    return dm_time
Example #6
def gpu_dmt(cand, device=0):
    """
    :param cand: Candidate object
    :param device: GPU id
    :return:
    """
    cuda.select_device(device)
    chan_freqs = cuda.to_device(np.array(cand.chan_freqs, dtype=np.float32))
    dm_list = cuda.to_device(np.linspace(0, 2 * cand.dm, 256, dtype=np.float32))
    dmt_return = cuda.to_device(np.zeros((256, cand.data.shape[0]), dtype=np.float32))
    cand_data_in = cuda.to_device(np.array(cand.data.T, dtype=np.uint8))

    @cuda.jit
    def gpu_dmt(cand_data_in, chan_freqs, dms, cand_data_out, tsamp):
        ii, jj, kk = cuda.grid(3)
        if ii < cand_data_in.shape[0] and jj < cand_data_in.shape[1] and kk < dms.shape[0]:
            disp_time = int(
                -1 * 4148808.0 * dms[kk] * (1 / (chan_freqs[0]) ** 2 - 1 / (chan_freqs[ii]) ** 2) / 1000 / tsamp)
            cuda.atomic.add(cand_data_out, (kk, jj), cand_data_in[ii, (jj + disp_time) % cand_data_in.shape[1]])

    threadsperblock = (16, 8, 8)
    blockspergrid_x = math.ceil(cand_data_in.shape[0] / threadsperblock[0])
    blockspergrid_y = math.ceil(cand_data_in.shape[1] / threadsperblock[1])
    blockspergrid_z = math.ceil(dm_list.shape[0] / threadsperblock[2])

    blockspergrid = (blockspergrid_x, blockspergrid_y, blockspergrid_z)

    gpu_dmt[blockspergrid, threadsperblock](cand_data_in, chan_freqs, dm_list, dmt_return, float(cand.tsamp))

    cand.dmt = dmt_return.copy_to_host()

    cuda.close()

    return cand
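The kernels in the two examples above index time by the cold-plasma dispersion delay: 4148808.0 / 1000 ≈ 4.149 × 10³ s·MHz²·pc⁻¹·cm³ is the usual dispersion constant, so the `disp_time` value can be reproduced on the host. A worked instance with illustrative values:

dm = 100.0                        # pc cm^-3
f_ref, f_chan = 1500.0, 1200.0    # MHz (reference and channel frequencies)
tsamp = 256e-6                    # s

delay_s = 4148808.0 * dm * (1 / f_ref**2 - 1 / f_chan**2) / 1000
print(delay_s)                    # ~ -0.1037 s (the lower frequency arrives later)
print(int(delay_s / tsamp))       # ~ -405 samples, matching disp_time in the kernel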
Example #7
def release_GPU_memory():
    K.clear_session()
    cuda.select_device(0)
    cuda.close()
    ses = K.get_session()
    config = tf.ConfigProto()
    K.tensorflow_backend.set_session(tf.Session(config=config))
Example #8
def cuda_select_device(dev_i):
    # close any existing context before binding this thread to the device
    try:
        cuda.close()
    except Exception as e:
        print(e)
    cuda.select_device(dev_i)
Example #9
def inference(file_path, file_urls):
    result = dict()

    config = yaml.load(open("./assets/config.yaml", 'r'),
                       Loader=yaml.FullLoader)
    pse = InferencePSE(config["pse_evaluation_parameter"], file_path)
    ocr = InferenceOCR(config["ocr_evaluation_parameter"], file_path)
    # Run PSE inference
    pse_time = pse.run()
    # Release the GPU memory held by PSE
    cuda.select_device(int(config["pse_evaluation_parameter"]["gpu_list"]))
    cuda.close()
    print(file_path)
    # Crop images based on the PSE output
    CropPSE(file_path)
    # Run OCR inference
    ocr_time = ocr.run()
    # Combine results
    #CreateTxt(file_urls)
    for file_name in file_urls:
        result[file_name] = dict()
        txt_file = "./assets/demo/text/" + file_name.replace("jpg", "txt")
        img_file = file_path + file_name
        df, _, _, _ = create_df(txt_file)
        dict_cells, list_infos = create_cells(df)
        result[file_name]['df'] = create_DB(dict_cells, list_infos).drop(
            'idx', axis=1).to_html(header="true")
        # Visualizer
        result[file_name]['img'] = connect_and_save(img_file, dict_cells,
                                                    list_infos)
    return result
Example #10
def rotate_iou_gpu_eval(boxes, query_boxes, criterion=-1, device_id=0):
    """
    :param boxes:这个是gtbox
    :param query_boxes: 这个是det box
    :param criterion: 这个是
    :param device_id:
    :return:
    """
    box_dtype = boxes.dtype
    boxes = boxes.astype(np.float32)
    query_boxes = query_boxes.astype(np.float32)
    N = boxes.shape[0]
    K = query_boxes.shape[0]
    iou = np.zeros((N, K), dtype=np.float32)
    if N == 0 or K == 0:
        return iou
    threadsPerBlock = 8 * 8
    cuda.select_device(device_id)
    blockspergrid = (div_up(N, threadsPerBlock), div_up(K, threadsPerBlock))

    stream = cuda.stream()
    with stream.auto_synchronize():
        boxes_bev = cuda.to_device(boxes.reshape([-1]), stream)
        query_boxes_dev = cuda.to_device(query_boxes.reshape([-1]), stream)
        iou_dev = cuda.to_device(iou.reshape([-1]), stream)
        rotate_iou_kernel_eval[blockspergrid, threadsPerBlock,
                               stream](N, K, boxes_bev, query_boxes_dev,
                                       iou_dev, criterion)
        iou_dev.copy_to_host(iou.reshape([-1]), stream=stream)
    return iou.astype(box_dtype)  # restore the dtype captured before the float32 cast
Example #11
def stupidconv_gpu(img, filt, padval):
    """
    does convolution without using FFT because FFT is pissing me off and giving me weird answers
    :param img:
    :param filt:
    :param padval:
    :return:
    """
    cuda.close()
    cuda.select_device(1)
    # get the number of nonzero entries in the filter for later averaging of result
    filt_nnz = np.count_nonzero(filt)

    # pad the images
    s_filt = filt.shape
    s_img = img.shape

    # appropriate padding depends on context
    # pad with filt size all around img
    pad_img = np.ones((s_img[0] + (2 * s_filt[0]), s_img[1] + (2 * s_filt[1])), dtype=np.float32) * padval

    pad_img[s_filt[0]: s_img[0] + s_filt[0], s_filt[1]: s_img[1] + s_filt[1]] = img

    output = np.zeros(pad_img.shape, dtype=np.float32)

    d_pad_img = cuda.to_device(pad_img)
    d_filt = cuda.to_device(filt)
    d_output = cuda.to_device(output)

    stupidconv_gpu_helper(d_pad_img, d_filt, s_img[0], s_img[1], s_filt[0], s_filt[1], d_output)

    output = d_output.copy_to_host()
    output = output[s_filt[0]:s_filt[0] + s_img[0], s_filt[1]:s_filt[1] + s_img[1]]

    return output / filt_nnz
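`stupidconv_gpu_helper` is not shown. A minimal sketch of what it might look like, under the assumption that it is a host wrapper launching a direct sliding-window kernel over the padded image (this computes a correlation; flip the filter for a true convolution):

import math
from numba import cuda

@cuda.jit
def _direct_conv_kernel(pad_img, filt, img_h, img_w, filt_h, filt_w, output):
    # one thread per output pixel inside the padded region
    i, j = cuda.grid(2)
    if filt_h <= i < img_h + filt_h and filt_w <= j < img_w + filt_w:
        acc = 0.0
        for u in range(filt_h):
            for v in range(filt_w):
                acc += pad_img[i + u - filt_h // 2, j + v - filt_w // 2] * filt[u, v]
        output[i, j] = acc

def stupidconv_gpu_helper(d_pad_img, d_filt, img_h, img_w, filt_h, filt_w, d_output):
    tpb = (16, 16)
    bpg = (math.ceil(d_pad_img.shape[0] / tpb[0]),
           math.ceil(d_pad_img.shape[1] / tpb[1]))
    _direct_conv_kernel[bpg, tpb](d_pad_img, d_filt, img_h, img_w, filt_h, filt_w, d_output)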
Example #12
def nms_gpu(dets, nms_overlap_thresh, device_id=0):
    """nms in gpu. 
    
    Args:
        dets ([type]): [description]
        nms_overlap_thresh ([type]): [description]
        device_id ([type], optional): Defaults to 0. [description]
    
    Returns:
        [type]: [description]
    """

    boxes_num = dets.shape[0]
    keep_out = np.zeros([boxes_num], dtype=np.int32)
    scores = dets[:, 4]
    order = scores.argsort()[::-1].astype(np.int32)
    boxes_host = dets[order, :]

    threadsPerBlock = 8 * 8
    col_blocks = div_up(boxes_num, threadsPerBlock)
    cuda.select_device(device_id)
    mask_host = np.zeros((boxes_num * col_blocks, ), dtype=np.uint64)
    blockspergrid = (div_up(boxes_num, threadsPerBlock),
                     div_up(boxes_num, threadsPerBlock))
    stream = cuda.stream()
    with stream.auto_synchronize():
        boxes_dev = cuda.to_device(boxes_host.reshape([-1]), stream)
        mask_dev = cuda.to_device(mask_host, stream)
        nms_kernel[blockspergrid, threadsPerBlock, stream](
            boxes_num, nms_overlap_thresh, boxes_dev, mask_dev)
        mask_dev.copy_to_host(mask_host, stream=stream)
    # stream.synchronize()
    num_out = nms_postprocess(keep_out, mask_host, boxes_num)
    keep = keep_out[:num_out]
    return list(order[keep])
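`nms_postprocess` walks the bitmask produced by `nms_kernel`. A sketch of the standard second pass used by CUDA NMS implementations (an assumption; the original helper is not shown):

import numpy as np

def nms_postprocess(keep_out, mask_host, boxes_num):
    threads_per_block = 8 * 8
    col_blocks = (boxes_num + threads_per_block - 1) // threads_per_block
    remv = np.zeros(col_blocks, dtype=np.uint64)   # suppression bits seen so far
    num_out = 0
    for i in range(boxes_num):
        nblock, inblock = divmod(i, threads_per_block)
        if not remv[nblock] & np.uint64(1 << inblock):
            keep_out[num_out] = i                  # box i survives
            num_out += 1
            for j in range(nblock, col_blocks):    # mark everything it overlaps
                remv[j] |= mask_host[i * col_blocks + j]
    return num_out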
Example #13
def gpu_dedisperse(cand, device=0):
    """
    :param cand: Candidate object
    :param device: GPU id
    :return:
    """
    cuda.select_device(device)
    chan_freqs = cuda.to_device(np.array(cand.chan_freqs, dtype=np.float32))
    cand_data_in = cuda.to_device(np.array(cand.data.T, dtype=np.uint8))
    cand_data_out = cuda.to_device(np.zeros_like(cand.data.T, dtype=np.uint8))

    @cuda.jit
    def gpu_dedisp(cand_data_in, chan_freqs, dm, cand_data_out, tsamp):
        ii, jj = cuda.grid(2)
        if ii < cand_data_in.shape[0] and jj < cand_data_in.shape[1]:
            disp_time = int(-4148808.0 * dm * (1 / (chan_freqs[0]) ** 2 - 1 / (chan_freqs[ii]) ** 2) / 1000 / tsamp)
            cand_data_out[ii, jj] = cand_data_in[ii, (jj + disp_time) % cand_data_in.shape[1]]

    threadsperblock = (32, 32)
    blockspergrid_x = math.ceil(cand_data_in.shape[0] / threadsperblock[0])
    blockspergrid_y = math.ceil(cand_data_in.shape[1] / threadsperblock[1])

    blockspergrid = (blockspergrid_x, blockspergrid_y)

    gpu_dedisp[blockspergrid, threadsperblock](cand_data_in, chan_freqs, float(cand.dm), cand_data_out,
                                               float(cand.tsamp))

    cand.dedispersed = cand_data_out.copy_to_host().T

    cuda.close()

    return cand
Example #14
    def CREATE_MODEL(self):
        try:
            cuda.select_device(0)
            cuda.close()
        except Exception:
            pass
        try:
            tf.keras.backend.clear_session()
        except Exception:
            pass

        self.outsize = self.dict['OTHERS']['1']['OUT_SIZE']
        self.windowlength = self.dict['OTHERS']['1']['WINDOW_LEN']
        self.MAX_window = self.dict['OTHERS']['1']['WINDOW_LEN']
        self.batch = self.dict['OTHERS']['1']['BATCH_SIZE']
        self.period = self.dict['OTHERS']['1']['PERIOD']
        self.optimizer.learning_rate = self.dict['OTHERS']['1']['LR']
        self.epochz = self.dict['OTHERS']['1']['EPOCHS']

        if self.FIRST_ITER:
            self.CREATE_DATA()
            self.FIRST_ITER = False


        # `~` on a Python bool is bitwise NOT and is always truthy; `not` is intended
        keys = list(self.VARS_EX['OTHERS'].keys())
        if not (len(keys) == 0 or keys == ['LR']
                or keys == ['LR', 'EPOCHS'] or keys == ['EPOCHs']):
            self.CREATE_DATA()

        self.model_parallel()
        self.trainingz()
        self.SAVE_PLOTS()
        print(self.epochz)
Example #15
def GPUWrapper(data_out, device_id, photons_req_per_device,
               max_photons_per_device, muA, muS, g, source_type, source_param1,
               source_param2, detector_params, max_N, max_distance_from_det,
               target_type, target_mask, target_gridsize, z_target, z_bounded,
               z_range, ret_cols, absorb_threshold, absorb_chance):

    # TODO: These numbers can be optimized based on the device / architecture / number of photons
    threads_per_block = 256
    blocks = 64
    photons_per_thread = int(
        np.ceil(float(photons_req_per_device) / (threads_per_block * blocks)))
    max_photons_per_thread = int(
        np.ceil(float(max_photons_per_device) / (threads_per_block * blocks)))

    cuda.select_device(device_id)
    stream = cuda.stream()  # use stream to trigger async memory transfer

    # Keeping this piece of code here for now -potentially we need this in the future
    #  with compiler_lock:                        # lock the compiler
    # prepare function for this thread
    # the jitted CUDA kernel is loaded into the current context
    # TODO: ideally we should call cuda.jit(signature)(propPhotonGPU), where
    # signature is the call to the function. So far I couldn't figure out what is the signature of the
    # rng_states, closest I got to was: array(Record([('s0', '<u8'), ('s1', '<u8')]), 1d, A)
    # But I couldn't get it to work yet.
    #     MC_cuda_kernel = cuda.jit(propPhotonGPU)

    data = np.ndarray(shape=(threads_per_block * blocks, photons_per_thread,
                             12),
                      dtype=np.float32)
    photon_counters = np.ndarray(shape=(threads_per_block * blocks, 5),
                                 dtype=np.int64)  # np.int was removed in NumPy 1.24
    data_out_device = cuda.device_array_like(data, stream=stream)
    photon_counters_device = cuda.device_array_like(photon_counters,
                                                    stream=stream)

    # Used to initialize the threads random states.
    rng_states = create_xoroshiro128p_states(
        threads_per_block * blocks,
        seed=(np.random.randint(sys.maxsize) - 128) + device_id,
        stream=stream)

    # Actual kernel call
    propPhotonGPU[blocks, threads_per_block](
        rng_states, data_out_device, photon_counters_device,
        photons_per_thread, max_photons_per_thread, muA, muS, g, source_type,
        source_param1, source_param2, detector_params, max_N,
        max_distance_from_det, target_type, target_mask, target_gridsize,
        z_target, z_bounded, z_range, absorb_threshold, absorb_chance)
    # Copy data back
    data_out_device.copy_to_host(data, stream=stream)
    photon_counters_device.copy_to_host(photon_counters, stream=stream)
    stream.synchronize()

    data = data.reshape(data.shape[0] * data.shape[1], data.shape[2])
    data = data[:, ret_cols]
    data_out[device_id][0] = data

    photon_counters_aggr = np.squeeze(np.sum(photon_counters, axis=0))
    data_out[device_id][1] = photon_counters_aggr
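For concreteness, the per-thread workload implied by the sizing above (illustrative numbers):

import numpy as np

threads_per_block, blocks = 256, 64        # 16384 threads total
photons_req_per_device = 1_000_000
photons_per_thread = int(np.ceil(float(photons_req_per_device) /
                                 (threads_per_block * blocks)))
print(photons_per_thread)                  # ceil(1e6 / 16384) = 62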
Example #16
def get_prediction_real_time(sparkEngine, model=None, url_weight="", dim=15, prediction_weight="", encoder_length=24, decoder_length=24, attention_length=24, is_close_cuda=True):
    # continuously crawl AWS, AQI, and weather data
    end = utils.get_datetime_now()
    end = end - timedelta(hours=1)
    # end = datetime.strptime("2018-06-19 11:01:00", p.fm)
    # e_ = end.strftime(p.fm)
    start = end - timedelta(hours=23)
    start = start.replace(minute=0, second=0, microsecond=0)
    # s_ = start.strftime(p.fm)
    # 2. process normalize data
    vectors, w_pred, china_vectors, timestamp = sparkEngine.process_vectors(start, end, dim)
    v_l = len(vectors)
    if v_l:
        sp_vectors = psv.convert_data_to_grid_exe(vectors)
        if v_l < encoder_length:
            sp_vectors = np.pad(sp_vectors, ((encoder_length - v_l,0), (0,0), (0,0), (0, 0)), 'constant', constant_values=0)
        # repeat for 25 districts
        if w_pred:
            w_pred = np.repeat(np.expand_dims(w_pred, 1), p.grid_size, 1)
            de_vectors = psv.convert_data_to_grid_exe(w_pred)
            # pad to fill top elements of decoder vectors
            de_vectors = np.pad(de_vectors, ((0, 0), (0, 0), (0, 0), (6, 0)), 'constant', constant_values=0)
        else:
            # know nothing about future weather forecast
            de_vectors = np.zeros((decoder_length, p.grid_size, p.grid_size, dim))
        sp_vectors = np.concatenate((sp_vectors, de_vectors), axis=0)

        c_l = len(china_vectors)
        if c_l < attention_length:
            # print(attention_length - c_l)
            china_vectors = np.pad(china_vectors, ((attention_length - c_l, 0), (0, 0)), 'constant', constant_values=0)

        # 4. Feed to model
        if model is None:
            # model = BaselineModel(encoder_length=encoder_length, encode_vector_size=12, batch_size=1, decoder_length=decoder_length, rnn_layers=1,
            #                 dtype='grid', grid_size=25, use_cnn=True)
            # model.set_data(sp_vectors, [0], None)
            # model = MaskGan(encoder_length=encoder_length, encode_vector_size=15, batch_size=1, decode_vector_size=9, grid_size=25, use_cnn=True)
            model = APGan(encoder_length=24, decoder_length=24, encode_vector_size=15, batch_size=1, decode_vector_size=9, grid_size=25, forecast_factor=0)
            # model = APNet(encoder_length=24, decoder_length=24, encode_vector_size=15, batch_size=1, decode_vector_size=9, grid_size=25, forecast_factor=0)
        model.set_data(sp_vectors, [0], None, china_vectors)
        with tf.device('/%s' % p.device):
            model.init_ops(is_train=False)
            saver = tf.train.Saver()
        tconfig = get_gpu_options(False)        
        with tf.Session(config=tconfig) as session:
            model.assign_datasets(session)    
            preds_pm25 = realtime_execute(model, session, saver, decoder_length, p.prediction_weight_pm25)
            model.forecast_factor = 1
            preds_pm10 = realtime_execute(model, session, saver, decoder_length, p.prediction_weight_pm10)
            china_vectors = np.array(china_vectors)
            # print("china", china_vectors.shape)
            # tf.reset_default_graph()
            # session.close()
            if is_close_cuda:
                cuda.select_device(0)
                cuda.close()
        return (preds_pm25, preds_pm10), timestamp, np.transpose(china_vectors[:,:2] * 500)
    else:
        return ([],[]), [], []
Example #17
    def clear(self):
        K.clear_session()
        gc.collect()
        del self.model
        for gpu in range(len(cuda.gpus)):
            cuda.select_device(gpu)
            cuda.close()
Example #18
    def init(self):

        # NOTE: do this at first of all
        if self.gpu_i is not None:
            cuda.select_device(self.gpu_i)

        self.nk = Nccl()
Example #19
def worker(input_q, output_q):
    # Load a (frozen) Tensorflow model into memory.
    detection_graph = tf.Graph()
    with detection_graph.as_default():
        od_graph_def = tf.GraphDef()
        with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
            serialized_graph = fid.read()
            od_graph_def.ParseFromString(serialized_graph)
            tf.import_graph_def(od_graph_def, name='')

        sess = tf.Session(graph=detection_graph)
        mtcnn = detect_and_align.create_mtcnn(sess, None)

    fps = FPS().start()
    while True:
        fps.update()
        frame = input_q.get()
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)        
        face_patches, padded_bounding_boxes, landmarks = detect_and_align.detect_faces(frame_rgb, mtcnn)
        output = dict(face_boxes=padded_bounding_boxes)
        output_q.put(output)

    # NOTE: unreachable while the loop above runs forever
    fps.stop()
    sess.close()
    cuda.select_device(0)
    cuda.close()
Example #20
def crack_gpu(password_list, target_hash: str,
              gpu_id: int) -> typing.List[str]:
    cuda.select_device(gpu_id)
    arr = password_list
    target_hash_arr = [
        int(target_hash[i:i + 8], 16) for i in range(0, len(target_hash), 8)
    ]
    target_hash_arr = np.array([
        struct.unpack(">I", struct.pack("<I", i))[0] for i in target_hash_arr
    ],
                               dtype=np.uint32)
    matching_hash_index = np.array([-1], dtype=np.int32)
    THREADS_PER_BLOCK = 512
    BLOCKS_PER_GRID = (
        arr.shape[0] +
        (THREADS_PER_BLOCK - 1)) // THREADS_PER_BLOCK  # only 1 "Grid"
    target_hash_arr = cuda.to_device(target_hash_arr)
    matching_hash_index = cuda.to_device(matching_hash_index)
    arr = cuda.to_device(arr)
    print(f"cracking phase (gpu{gpu_id})")
    crack_password[BLOCKS_PER_GRID, THREADS_PER_BLOCK](arr, target_hash_arr,
                                                       matching_hash_index)
    print(f"finished (gpu{gpu_id})")
    matching_hash_index = matching_hash_index.copy_to_host()
    if matching_hash_index[0] != -1:
        return [int_arr_to_str(password_list[matching_hash_index[0]])]
    return []


# cracker = PasswordCracker(['a'])
# cracker.crack_gpu()
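The `struct` round trip above byte-swaps each 32-bit word of the hex digest so the words match the GPU kernel's layout; a quick check of the idiom:

import struct

word = 0x01020304
swapped = struct.unpack(">I", struct.pack("<I", word))[0]
assert swapped == 0x04030201  # byte order reversed within the 32-bit word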
Example #21
    def parse(self):
        if not self.initialized:
            self.initialize()
        self.opt = self.parser.parse_args()

        self.opt.device = torch.device(
            "cuda:%d" %
            (self.opt.gpu_id) if torch.cuda.is_available() else "cpu")
        cuda.select_device(self.opt.gpu_id)
        # torch.cuda.set_device(self.opt.gpu_id)

        args = vars(self.opt)

        print('------------ Options -------------')
        for k, v in sorted(args.items()):
            print('%s: %s' % (str(k), str(v)))
        print('-------------- End ----------------')

        # save to the disk
        expr_dir = os.path.join(self.opt.checkpoints_dir, self.opt.name)
        util.mkdirs(expr_dir)
        file_name = os.path.join(expr_dir, 'opt.txt')
        with open(file_name, 'wt') as opt_file:
            opt_file.write('------------ Options -------------\n')
            for k, v in sorted(args.items()):
                opt_file.write('%s: %s\n' % (str(k), str(v)))
            opt_file.write('-------------- End ----------------\n')
        return self.opt
Example #22
    def detect(self, image):
        cuda.select_device(0)

        config = ConfigProto()
        config.gpu_options.allow_growth = True
        session = InteractiveSession(config=config)
        ROOT_DIR = "/home/bernihoh/Bachelor/SMS/MaskRCNN/samples/SMSNetworks/face_feature_detection/"
        MODEL_DIR = os.path.join(ROOT_DIR, "logsFaceFeatureDetection")
        COCO_MODEL_PATH = "/home/bernihoh/Bachelor/SMS/MaskRCNN/samples/SMSNetworks/face_feature_detection/mask_rcnn_face_feature_detection_0029.h5"
        config = InferenceConfig()
        config.display()

        # Create model object in inference mode.
        model = modellib.MaskRCNN(mode="inference", model_dir=MODEL_DIR, config=config)

        # Load weights trained on MS-COCO
        model.load_weights(COCO_MODEL_PATH, by_name=True)

        class_names = ["bg", "iris_l", "inner_eye_l", "outer_eye_l", "eye_brow_l", "cheek_l", "iris_r",
                       "inner_eye_r", "outer_eye_r", "eye_brow_r", "cheek_r", "nose_tip", "nose", "mouth",
                       "chin", "face", "head", "distortion"]

        results = model.detect([image], verbose=1)
        r = results[0]
        session.close()
        cuda.close()
        return r
Example #23
def configure_node(dist_backend, init_method):
    args = Param()
    args.dist_backend = dist_backend

    is_using_slurm = os.environ.get('SLURM_NTASKS') is not None
    if is_using_slurm:
        SLURM_LOCALID = int(os.environ.get('SLURM_LOCALID'))
        SLURM_PROCID = int(os.environ.get('SLURM_PROCID'))
        SLURM_NTASKS = int(os.environ.get('SLURM_NTASKS'))
        SLURM_NTASKS_PER_NODE = int(os.environ.get('SLURM_NTASKS_PER_NODE'))
        # logging.info(f'SLURM_LOCALID: {SLURM_LOCALID}')
        # logging.info(f'SLURM_PROCID: {SLURM_PROCID}')
        # logging.info(f'SLURM_NTASKS: {SLURM_NTASKS}')
        # logging.info(f'SLURM_NTASKS_PER_NODE: {SLURM_NTASKS_PER_NODE}')
        args.slurm_tasks_per_node = SLURM_NTASKS_PER_NODE if SLURM_NTASKS_PER_NODE is not None else 0
        args.rank = SLURM_PROCID if SLURM_PROCID is not None else 0
        args.gpu = SLURM_LOCALID if SLURM_LOCALID is not None else args.rank
        args.world_size = SLURM_NTASKS if SLURM_NTASKS is not None else 1
        args.is_distributed = True
        args.scheduler = 'slurm'
    else:
        args.rank = 0
        args.world_size = 1
        args.gpu = 0
        args.is_distributed = False
        args.scheduler = 'local'

    args.device = th.device(f'cuda:{args.gpu}')
    # if args.world_size > 1:
    # let numba know which device we are running the kernels on
    logging.info(
        f'rank {args.rank} avail gpus: {[x.id for x in GPUtil.getGPUs()]}')
    logging.info(f'Selecting device: {args.gpu}')
    cuda.select_device(args.gpu)
    dist.init_process_group(backend=args.dist_backend,
                            rank=args.rank,
                            world_size=args.world_size,
                            init_method=init_method)

    ram_cpu_free_GB = psutil.virtual_memory().available / 2**30
    gpus = GPUtil.getGPUs()
    gpu = gpus[args.gpu]
    ram_gpu_free_GB = gpu.memoryFree / 1000
    if args.rank == 0:
        logging.info(f'Scheduler: {args.scheduler}')
        logging.info(f'System resources:')
    logging.info(f'Rank {args.rank}    Free CPU RAM: {ram_cpu_free_GB} GB')
    # if args.world_size > 1:
    #     dist.barrier()
    logging.info(f'Rank {args.rank}    Free GPU RAM: {ram_gpu_free_GB} GB')
    # if args.world_size > 1:
    #     dist.barrier()
    logging.info(
        f'Rank {args.rank} is using device {args.gpu}/{len(gpus)}: {gpu.name} driver: v{gpu.driver}'
    )
    # if args.world_size > 1:
    #     dist.barrier()
    return args
Example #24
    def __exit__(self, *args):

        cuda.select_device(self.gpu)
        suffix = 'ms (' + self.label + ')' if self.label else 'ms'
        self.end.record()
        self.end.synchronize()
        time = cuda.event_elapsed_time(self.start, self.end)
        print('elapsed time:', int(time), suffix)
Example #25
    def __init__(self, x, box, r_cut, gpu=0):
        cuda.select_device(gpu)
        self.gpu = gpu
        self.x = x
        self.box = box
        self.r_cut = r_cut
        self.r_cut2 = self.r_cut**2
        self.update()
Example #26
    def __init__(self, label='', gpu=0):

        self.label = label
        self.gpu = gpu
        self.start = cuda.event()
        self.end = cuda.event()
        cuda.select_device(self.gpu)
        self.start.record()
Example #27
def cleanup():
    from keras import backend as K

    K.clear_session()

    from numba import cuda
    cuda.select_device(0)
    cuda.close()
Example #28
def newthread():
    cuda.select_device(0)
    stream = cuda.stream()
    A = np.arange(100)
    dA = cuda.to_device(A, stream=stream)
    stream.synchronize()
    del dA
    del stream
    cuda.close()
Example #29
def newthread(exception_queue):
    try:
        devices = range(driver.get_device_count())
        for _ in range(2):
            for d in devices:
                cuda.select_device(d)
                cuda.close()
    except Exception as e:
        exception_queue.put(e)
Example #30
def newthread():
    devices = range(driver.get_device_count())
    print('Devices', devices)
    for _ in range(2):
        for d in devices:
            cuda.select_device(d)
            print('Selected device', d)
            cuda.close()
            print('Closed device', d)
Example #32
def gpu_dmt(cand, device=0):
    """

    GPU DM-Time bow-tie (by rolling the array)

    Args:
        cand: Candidate instance
        device (int): GPU ID

    Returns:
        candidate object

    """
    cuda.select_device(device)
    chan_freqs = cuda.to_device(np.array(cand.chan_freqs, dtype=np.float32))
    dm_list = cuda.to_device(np.linspace(0, 2 * cand.dm, 256, dtype=np.float32))
    dmt_return = cuda.to_device(np.zeros((256, cand.data.shape[0]), dtype=np.float32))
    cand_data_in = cuda.to_device(np.array(cand.data.T, dtype=cand.data.dtype))

    @cuda.jit
    def gpu_dmt(cand_data_in, chan_freqs, dms, cand_data_out, tsamp):
        ii, jj, kk = cuda.grid(3)
        if (
            ii < cand_data_in.shape[0]
            and jj < cand_data_in.shape[1]
            and kk < dms.shape[0]
        ):
            disp_time = int(
                -1
                * 4148808.0
                * dms[kk]
                * (1 / (chan_freqs[0]) ** 2 - 1 / (chan_freqs[ii]) ** 2)
                / 1000
                / tsamp
            )
            cuda.atomic.add(
                cand_data_out,
                (kk, jj),
                cand_data_in[ii, (jj + disp_time) % cand_data_in.shape[1]],
            )

    threadsperblock = (16, 8, 8)
    blockspergrid_x = math.ceil(cand_data_in.shape[0] / threadsperblock[0])
    blockspergrid_y = math.ceil(cand_data_in.shape[1] / threadsperblock[1])
    blockspergrid_z = math.ceil(dm_list.shape[0] / threadsperblock[2])

    blockspergrid = (blockspergrid_x, blockspergrid_y, blockspergrid_z)

    gpu_dmt[blockspergrid, threadsperblock](
        cand_data_in, chan_freqs, dm_list, dmt_return, float(cand.your_header.tsamp)
    )

    cand.dmt = dmt_return.copy_to_host()

    cuda.close()

    return cand
Example #33
def newthread(exception_queue):
    try:
        devices = range(driver.get_device_count())
        print('Devices', devices)
        for _ in range(2):
            for d in devices:
                cuda.select_device(d)
                print('Selected device', d)
                cuda.close()
                print('Closed device', d)
    except Exception as e:
        exception_queue.put(e)
Example #34
def newthread(exception_queue):
    try:
        cuda.select_device(0)
        stream = cuda.stream()
        A = np.arange(100)
        dA = cuda.to_device(A, stream=stream)
        stream.synchronize()
        del dA
        del stream
        cuda.close()
    except Exception as e:
        exception_queue.put(e)
Example #35
def device_controller(cid):
    cuda.select_device(cid)                    # bind device to thread
    device = cuda.get_current_device()         # get current device

    # print some information about the CUDA card
    prefix = '[%s]' % device
    print(prefix, 'device_controller', cid, '| CC', device.COMPUTE_CAPABILITY)

    max_thread = device.MAX_THREADS_PER_BLOCK

    with compiler_lock:                        # lock the compiler
        # prepare function for this thread
        # the jitted CUDA kernel is loaded into the current context
        cuda_kernel = cuda.jit(signature)(kernel)

    # prepare data
    N = 12345
    data = np.arange(N, dtype=np.int32) * (cid + 1)
    orig = data.copy()

    # determine number of threads and blocks
    if N >= max_thread:
        ngrid = int(ceil(float(N) / max_thread))
        nthread = max_thread
    else:
        ngrid = 1
        nthread = N

    print(prefix, 'grid x thread = %d x %d' % (ngrid, nthread))

    # real CUDA work
    d_data = cuda.to_device(data)                   # transfer to device
    cuda_kernel[ngrid, nthread](d_data, d_data)     # compute inplace
    d_data.copy_to_host(data)                       # transfer to host

    # check result
    if not np.all(data == orig + 1):
        raise ValueError
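The snippet relies on module-level names (`compiler_lock`, `signature`, `kernel`) defined elsewhere; a plausible setup consistent with the `data == orig + 1` check (an assumption, not from the original source):

import threading
from numba import cuda

compiler_lock = threading.Lock()          # serializes JIT compilation across threads
signature = 'void(int32[:], int32[:])'

def kernel(src, dst):
    # element-wise increment, matching the `orig + 1` check above
    i = cuda.grid(1)
    if i < dst.size:
        dst[i] = src[i] + 1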
Example #36
from __future__ import print_function

from timeit import default_timer as timer
import math

import numpy as np
import pylab

from numba import cuda
# For machine with multiple devices
cuda.select_device(0)


@cuda.jit('float32(float32, float32)', device=True)
def core(a, b):
    return 1


@cuda.jit('void(float32[:], float32[:], float32[:])')
def vec_add(a, b, c):
    i = cuda.grid(1)
    c[i] = core(a[i], b[i])


@cuda.jit('void(float32[:], float32[:], float32[:])')
def vec_add_ilp_x2(a, b, c):
    # read
    i = cuda.grid(1)
    ai = a[i]
    bi = b[i]