Example #1
    def compute_residual(self, save_dir,
                         multi_processing=False,
                         n_processors=1):
        '''
        Compute the residual for each batch and save the per-segment
        results to save_dir, optionally using multiprocessing.
        '''
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        batch_ids = []
        fnames_seg = []
        for batch_id in range(self.reader.n_batches):
            batch_ids.append(batch_id)
            fnames_seg.append(
                os.path.join(save_dir,
                             'residual_seg{}.npy'.format(batch_id)))

        #self.logger.info("computing residuals")
        if multi_processing:
            batches_in = np.array_split(batch_ids, n_processors)
            fnames_in = np.array_split(fnames_seg, n_processors)
            parmap.starmap(self.subtract_parallel, 
                         list(zip(batches_in, fnames_in)),
                         processes=n_processors,
                         pm_pbar=True)

        else:
            for ctr in range(len(batch_ids)):
                self.subtract_parallel(
                    [batch_ids[ctr]], [fnames_seg[ctr]])

        self.fnames_seg = fnames_seg
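
The snippet above shows the calling pattern used throughout these examples: parmap.starmap takes an iterable of argument tuples, any number of fixed trailing arguments, and keyword options such as pm_processes/processes and pm_pbar. A minimal, self-contained sketch of that pattern (scale, pairs and the factor argument are illustrative names, not part of the example above):

import parmap

def scale(x, y, factor):
    # x and y come from each tuple; factor is a fixed trailing argument
    return (x + y) * factor

pairs = [(1, 2), (3, 4), (5, 6)]
# every tuple is unpacked into scale(x, y), and 10 is appended as factor
result = parmap.starmap(scale, pairs, 10, pm_processes=2, pm_pbar=True)
print(result)  # [30, 70, 110]
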
Example #2
def sen2cor_L2A_batch(res, L1Cdir):
    """
    Batch processing of S2 L1C files in a directory to S2 L2A
    
    Args:
        res (str/num): resolution, accepts 10, 20, 60, or all
        L1Cdir (str): location of S2 L1C products
    """
    # Put S2 L1C directory names in a list
    L1C_files = list(filter(
        re.compile(r'^S2.*L1C.*SAFE$').search, os.listdir(L1Cdir)))
    print("{} L1C files found in directory".format(len(L1C_files)))
    l1cList = []
    for L1C_file in L1C_files:  # Iterate over directory names
        # Check if the file exists
        checker = r'S2._MSIL2A_' + L1C_file[11:]
        checker_list = list(filter(
            re.compile(checker).search, os.listdir(L1Cdir)))
        checker2 = r'S2._USER_PRD_MSIL2A_' + L1C_file[20:]
        checker_list = checker_list + list(filter(
            re.compile(checker2).search, os.listdir(L1Cdir)))

        if len(checker_list) == 0:
            # Call sen2cor function for individual product
            print("{} is set for processing".format(L1C_file))
            #sen2cor_L2A(res, L1Cdir+L1C_file)
            l1cList.append((res, L1Cdir + L1C_file))
        else:
            print("{} was already processed, removing from list".format(
                L1C_file))

    parmap.starmap(sen2cor_L2A, l1cList, pm_chunksize=12)
def download_hub(download_dir,
                 file_path,
                 accounts_file,
                 start_date='NOW-30DAYS',
                 end_date='NOW',
                 downloads_per_account=1,  # maximum allowed is 2
                 max_downloads=10):
    start_date = str(start_date)
    end_date = str(end_date)

    if not os.path.isdir(download_dir):
        raise ValueError('download_dir: '+ download_dir + ' does not exist or is inaccessible. Your current working directory is '+os.getcwd()+'.')
    if not os.path.exists(file_path):
        raise ValueError('file_path: '+ file_path + ' does not exist or is inaccessible. Your current working directory is '+os.getcwd()+'.')
    if not os.path.exists(accounts_file):
        raise ValueError('accounts_file: '+ accounts_file + ' does not exist or is inaccessible. Your current working directory is '+os.getcwd()+'.')

    products, credentials = get_products_aoi(file_path, accounts_file, start_date=start_date, end_date=end_date)
    # Creates directory for download files
    owd = os.getcwd()  # original working directory (owd)
    new_dir = download_dir+'/hub%s' % time.strftime('%a%d%b%Y%H%M%S')
    os.mkdir(new_dir)
    os.chdir(new_dir)

    n_accounts = len(credentials)
    n_products = len(products)

    n_threads = n_products // n_accounts * downloads_per_account
    if n_threads < 1:
        n_threads = 1
    if n_threads >= max_downloads:
        n_threads = max_downloads

    div_products = dict_divider(products, n_threads)

    if n_products > 1:
        div_credentials = credentials.values() * (n_accounts//n_threads)
    else:
        div_credentials = [credentials.values()[0]]

    parmap.starmap(download, izip(div_products, div_credentials))
    # try zip
    #for elem in products:
    #    print elem
    #    print products
    #    os.system('unzip ' + products[elem]['title'] + '.zip')
    #    os.remove(products[elem]['title'] + '.zip')

    os.chdir(owd)

    text_file = open(download_dir+'/TIME_DIR.txt','w')
    text_file.write(new_dir)
    text_file.close()
    return new_dir
Example #4
def run_voltage_treshold(standardized_path,
                         standardized_dtype,
                         output_directory,
                         run_chunk_sec='full'):
    """Run detection that thresholds on amplitude
    """
    logger = logging.getLogger(__name__)

    CONFIG = read_config()

    # get data reader
    #n_sec_chunk = CONFIG.resources.n_sec_chunk*CONFIG.resources.n_processors
    batch_length = CONFIG.resources.n_sec_chunk
    n_sec_chunk = 0.5
    print("   batch length to (sec): ", batch_length,
          " (longer increase speed a bit)")
    print("   length of each seg (sec): ", n_sec_chunk)
    buffer = CONFIG.spike_size
    if run_chunk_sec == 'full':
        chunk_sec = None
    else:
        chunk_sec = run_chunk_sec

    reader = READER(standardized_path, standardized_dtype, CONFIG,
                    batch_length, buffer, chunk_sec)

    # number of processed chunks
    n_mini_per_big_batch = int(np.ceil(batch_length / n_sec_chunk))
    total_processing = int(reader.n_batches * n_mini_per_big_batch)

    # neighboring channels
    channel_index = make_channel_index(CONFIG.neigh_channels,
                                       CONFIG.geom,
                                       steps=2)

    if CONFIG.resources.multi_processing:
        parmap.starmap(run_voltage_threshold_parallel,
                       list(zip(np.arange(reader.n_batches))),
                       reader,
                       n_sec_chunk,
                       CONFIG.detect.threshold,
                       channel_index,
                       output_directory,
                       processes=CONFIG.resources.n_processors,
                       pm_pbar=True)
    else:
        for batch_id in range(reader.n_batches):
            run_voltage_threshold_parallel(batch_id, reader, n_sec_chunk,
                                           CONFIG.detect.threshold,
                                           channel_index, output_directory)
Example #5
    def apply_heuristics(self,
                         heuristics,
                         primitive_matrix,
                         feat_combos,
                         beta_opt,
                         mode=None):
        """
        Apply given heuristics to given feature matrix X and abstain by beta
        heuristics: list of pre-trained logistic regression models
        feat_combos: primitive indices to apply heuristics to
        beta_opt: best beta values for the associated heuristics
        """

        if f'{heuristics[0].__class__}' == "<class 'program_synthesis.synthesizer.dummyKneighborClassifier'>":
            if self.cuda:
                #########gpu
                L = np.zeros((np.shape(primitive_matrix)[0], len(heuristics)))
                for i, hf in enumerate(heuristics):
                    L[:, i] = marginals_to_labels_cuda(
                        hf, beta_opt[i], feat_combos[i],
                        primitive_matrix[:, feat_combos[i]], self)
                return L
            else:
                ########using parmap
                L_ = parmap.starmap(marginals_to_labels,
                                    list(zip(heuristics, beta_opt,
                                             feat_combos)),
                                    primitive_matrix,
                                    self,
                                    pm_pbar=True)

                return np.transpose(np.array(L_))

                ########single-core
                # L = np.zeros((np.shape(primitive_matrix)[0], len(heuristics)))
                # for i, hf in enumerate(heuristics):
                #     L[:, i] = marginals_to_labels(hf, beta_opt[i], feat_combos[i], primitive_matrix,
                #                                      self, mode=mode)
            return L

        else:
            ########using parmap
            L_ = parmap.starmap(marginals_to_labels,
                                list(zip(heuristics, beta_opt, feat_combos)),
                                primitive_matrix,
                                self,
                                mode=mode,
                                pm_pbar=True)

            return np.transpose(np.array(L_))
Example #6
def append_price():

    codes = pd.read_excel('코드리스트_test.xlsx', converters={'종목코드': str})['종목코드']
    # codes = ['095570']
    pages = range(1, 1000)

    for code in codes:
        csv = pd.DataFrame(
            columns=['날짜', '종가', '전일비', '시가', '고가', '저가', '거래량'])
        path = "./temp/" + str(code) + ".db"
        con = sqlite3.connect(path)
        csv.to_sql('price', con, if_exists='replace')

        input_list = list(itertools.product([code], pages))
        parmap.starmap(get_price, input_list, pm_pbar=True)
def get_evoked_map(mouse):
    
    lofiles, lofilenames = get_file_list(base_dir, mouse)
    print lofilenames
    lop = get_distance_var(lofiles)

    all_frames = get_video_frames(lofiles)
    print "Alligning all video frames..."

    all_frames = parmap.starmap(shift_frames, zip(all_frames, lop))
    all_frames = np.asarray(all_frames, dtype=np.float32)
    print np.shape(all_frames)

    new_all_frames = parmap.map(process_frames_evoked, all_frames)
    
    all_frames = np.reshape(all_frames,
                            (all_frames.shape[0]*all_frames.shape[1],
                            all_frames.shape[2],
                            all_frames.shape[3])) 
    save_to_file("conc_RAW.raw", all_frames, np.float32)


    print "Creating array.."
    new_all_frames = np.asarray(new_all_frames, dtype=np.float32)
    print "Averaging together..."
    new_all_frames = np.mean(new_all_frames, axis=0)

    print np.shape(new_all_frames)


    save_to_file("evoked_trial_noBP_GSR.raw", new_all_frames, np.float32)
    def get_location(self):
        """

        Extracts the location of each pixel in the satellite image

        """
        self.ncols = self.satellite_gdal.RasterXSize / 2
        self.nrows = self.satellite_gdal.RasterYSize / 2
        self.length_df = self.nrows * self.ncols
        print 'Columns, rows', self.ncols, self.nrows
        cols_grid, rows_grid = np.meshgrid(range(0, self.ncols),
                                           range(0, self.nrows))
        self.cols_grid = cols_grid.flatten()
        self.rows_grid = rows_grid.flatten()
        print 'Checking the meshgrid procedure works'
        # getting a series of lat lon points for each pixel
        self.geotransform = self.satellite_gdal.GetGeoTransform()
        print 'Getting locations'
        self.location_series = np.array(
            parmap.starmap(pixel_to_coordinates,
                           zip(self.cols_grid, self.rows_grid),
                           self.geotransform,
                           processes=self.processes))
        print 'Converting to Points'
        pool = Pool(self.processes)
        self.location_series = pool.map(point_wrapper, self.location_series)
    def get_location(self):
        """

        Extracts the location of each pixel in the satellite image

        """
        self.ncols = self.satellite_gdal.RasterXSize / 2
        self.nrows = self.satellite_gdal.RasterYSize / 2
        self.length_df = self.nrows * self.ncols
        print 'Columns, rows', self.ncols, self.nrows
        cols_grid, rows_grid = np.meshgrid(
                    range(0, self.ncols), 
                    range(0, self.nrows))
        self.cols_grid = cols_grid.flatten()
        self.rows_grid = rows_grid.flatten()
        print 'Checking the meshgrid procedure works'
        # getting a series of lat lon points for each pixel
        self.geotransform = self.satellite_gdal.GetGeoTransform()
        print 'Getting locations'
        self.location_series = np.array(parmap.starmap(
                        pixel_to_coordinates, 
                        zip(self.cols_grid, self.rows_grid), 
                        self.geotransform,
                        processes = self.processes))
        print 'Converting to Points'
        pool = Pool(self.processes)
        self.location_series = pool.map(
                        point_wrapper, 
                        self.location_series)
Example #10
    def pairwise(self, pairwiseList, nprocs=1, **kwargs):
        """Pairwise read mapping based on list of references and reads to map.

        Args:
        pairwiseList -- list of tuples (i,j,refIndex,readFile)
                        'i' and 'j' are comparison indices
        nprocs -- max number of parallel mapper calls
        kwargs -- passed to mapper method
        """
        # adding kwargs to function
        new_mapper = partial(self, **kwargs)

        # making trimmed list of tuples
        trimmed = [(i[2],i[3],) for i in pairwiseList]

        # calling mapper
        samFiles = parmap.starmap(new_mapper, trimmed, processes=nprocs)

        # creating a numpy array for output
        #simSamFiles = np.array([['' for i in range(n_refs)] for j in range(n_refs)], dtype=object)
        
        # appending samFiles to tuple
        pairwiseList2 = []
        for i,samFile in enumerate(samFiles):
            pairwiseList2.append(pairwiseList[i] + (samFile,))

        # return
        return pairwiseList2        
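
The pairwise method above binds keyword arguments to the mapper with functools.partial before handing it to parmap.starmap. A reduced sketch of that pattern, with a made-up mapper and inputs:

from functools import partial
import parmap

def mapper(ref, reads, threads=1):
    # stand-in for a real read-mapping call
    return "{} vs {} ({} threads)".format(ref, reads, threads)

run = partial(mapper, threads=4)  # bind kwargs once, as pairwise() does
tasks = [("refA.fa", "reads1.fq"), ("refB.fa", "reads2.fq")]
print(parmap.starmap(run, tasks, pm_parallel=False))  # serial, for the sketch
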
Example #11
    def parallel(self, names, mg, nprocs=1, **kwargs):
        """Calling mapper using multiple processors.

        Args:
        names -- NameFile instance with iter_names() method
        mg -- MetaFile instance with readFile attrib
        nprocs -- number of parallel calls
        kwargs -- passed to mapper call
        
        Return:
        refSamFile attrib set for each name in names 
        """
        # making list of tuples (indexFile, readFile)
        lt = [(name.get_indexFile(), mg.get_readFile())
              for name in names.iter_names()]

        # altering function kwargs
        new_mapper = partial(self, **kwargs)

        # calling mapper
        samFiles = parmap.starmap(new_mapper, lt, processes=nprocs)

        # adding samFile attrib to name instances
        for i, name in enumerate(names.iter_names()):
            name.set_refSamFile(samFiles[i])
def NCC_best_template_search(c1,
                             c2,
                             im1,
                             im2,
                             width=60,
                             c1_init=None,
                             c2_init=None,
                             search_w=100):
    ''' Finds the best center according to block matching search
    '''
    searchx, searchy = find_search_pixel(c1, c2, search_w)
    if c1_init is None:
        xv, yv = find_template_pixel(c1, c2, width, im1.shape[1], im1.shape[0])
    else:
        xv, yv = find_template_pixel(c1_init, c2_init, width, im1.shape[1],
                                     im1.shape[0])
    NCC_all = parmap.starmap(get_NCC,
                             zip(np.ravel(searchx), np.ravel(searchy)),
                             im1,
                             im2,
                             width,
                             yv,
                             xv,
                             pm_parallel=True,
                             pm_processes=2)
    maxNCC = np.max(NCC_all)
    if np.sum(NCC_all == -1) > 0:
        print('Number of weird NCC {}'.format(np.sum(NCC_all == -1)))
    if maxNCC == -1:
        print('VERY WEIRD ALL NCC ARE -1')
    idx = np.argmax(NCC_all)
    best_c1, best_c2 = np.ravel(searchx)[idx], np.ravel(searchy)[idx]
    return best_c1, best_c2, maxNCC
Example #13
def compute_norm_dist_to_tissue_parallel(locs, tissue_mat_new, target_df,
                                         result_df):

    num_cores = mp.cpu_count()
    if num_cores > math.floor(target_df.shape[0] / 2):
        num_cores = int(math.floor(target_df.shape[0] / 2))
    ttt = np.array_split(target_df, num_cores, axis=0)
    tuples = [(l, d, u, c) for l, d, u, c in zip(
        repeat(locs, num_cores), repeat(tissue_mat_new, num_cores), ttt,
        repeat(result_df, num_cores))]
    # repeat(p_cutoff, num_cores))]

    dist_results = parmap.starmap(compute_norm_dist_to_tissue,
                                  tuples,
                                  pm_processes=num_cores,
                                  pm_pbar=True)

    dd = [dist_results[i][0] for i in range(len(dist_results))]
    dist_val = reduce(operator.add, dd)

    gg = [dist_results[i][1] for i in range(len(dist_results))]
    genes = reduce(operator.add, gg)

    re = [dist_results[i][2] for i in range(len(dist_results))]
    recal_genes = reduce(operator.add, re)

    dist_norm = [val / max(dist_val) for val in dist_val]
    dist_df = pd.DataFrame([genes, dist_val, dist_norm]).T
    dist_df.columns = ['geneID', 'dist', 'norm_dist']
    dist_df.index = genes

    return dist_df, recal_genes
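
compute_norm_dist_to_tissue_parallel splits its work with np.array_split and uses itertools.repeat to broadcast the constant arguments into every tuple; the same chunk-and-repeat idiom appears in several later examples. A minimal, hypothetical version of it:

import numpy as np
import parmap
from itertools import repeat

def chunk_sum(chunk, offset):
    # chunk differs per task; offset is repeated into every tuple
    return chunk.sum() + offset

data = np.arange(12)
num_cores = 3
chunks = np.array_split(data, num_cores)
tasks = list(zip(chunks, repeat(10, num_cores)))  # [(chunk, 10), ...]
partial_sums = parmap.starmap(chunk_sum, tasks, pm_processes=num_cores)
print(sum(partial_sums))  # 66 + 3 * 10 = 96
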
Example #14
    def pairwise(self, pairwiseList, nprocs=1, **kwargs):
        """Pairwise read mapping based on list of references and reads to map.

        Args:
        pairwiseList -- list of tuples (i,j,refIndex,readFile)
                        'i' and 'j' are comparison indices
        nprocs -- max number of parallel mapper calls
        kwargs -- passed to mapper method
        """
        # adding kwargs to function
        new_mapper = partial(self, **kwargs)

        # making trimmed list of tuples
        trimmed = [(
            i[2],
            i[3],
        ) for i in pairwiseList]

        # calling mapper
        samFiles = parmap.starmap(new_mapper, trimmed, processes=nprocs)

        # creating a numpy array for output
        #simSamFiles = np.array([['' for i in range(n_refs)] for j in range(n_refs)], dtype=object)

        # appending samFiles to tuple
        pairwiseList2 = []
        for i, samFile in enumerate(samFiles):
            pairwiseList2.append(pairwiseList[i] + (samFile, ))

        # return
        return pairwiseList2
Example #15
def recalc_dist_to_tissue_parallel(locs,
                                   data_norm,
                                   cellGraph,
                                   gmmDict,
                                   test_genes,
                                   tissue_mat_new,
                                   add_sf=30,
                                   unary_scale_factor=100,
                                   label_cost=10,
                                   algorithm='expansion'):
    num_cores = mp.cpu_count()
    if num_cores > math.floor(len(test_genes) / 2):
        num_cores = int(math.floor(len(test_genes) / 2))
    ttt = np.array_split(test_genes, num_cores, axis=0)
    tuples = [
        (l, d, c, g, t, ts, a, u, ll, al)
        for l, d, c, g, t, ts, a, u, ll, al in zip(
            repeat(locs, num_cores), repeat(data_norm, num_cores),
            repeat(cellGraph, num_cores), repeat(gmmDict, num_cores), ttt,
            repeat(tissue_mat_new, num_cores), repeat(add_sf, num_cores),
            repeat(unary_scale_factor, num_cores), repeat(
                label_cost, num_cores), repeat(algorithm, num_cores))
    ]
    dist_results = parmap.starmap(recalc_dist_to_tissue,
                                  tuples,
                                  pm_processes=num_cores,
                                  pm_pbar=True)

    result_df_new = pd.DataFrame()
    best_dist_df = pd.DataFrame()
    for i in range(len(dist_results)):
        result_df_new = pd.concat([result_df_new, dist_results[i][0]])
        best_dist_df = pd.concat([best_dist_df, dist_results[i][1]])

    return result_df_new, best_dist_df
Example #16
def identify_spatial_genes(locs, data_norm, cellGraph, gmmDict, smooth_factor=10,
                      unary_scale_factor=100, label_cost=10, algorithm='expansion'):
#    pool = mp.Pool()
    '''
    main function to identify spatially variable genes
    :param locs: spatial coordinates (n, 2); data_norm: normalized gene expression;
        smooth_factor=10; unary_scale_factor=100; label_cost=10; algorithm='expansion' 
    :rtype: prediction: a dataframe
    '''    
    
    num_cores = mp.cpu_count()
    if num_cores > math.floor(data_norm.shape[1]/2):
         num_cores=int(math.floor(data_norm.shape[1]/2))
    ttt = np.array_split(data_norm,num_cores,axis=1)
    tuples = [(l, d, c, g, s, u, b, a) for l, d, c, g, s, u, b, a in zip(
                                    repeat(locs, num_cores), 
                                    ttt,
                                    repeat(cellGraph, num_cores),
                                    repeat(gmmDict, num_cores),
                                    repeat(smooth_factor, num_cores),
                                    repeat(unary_scale_factor, num_cores), 
                                    repeat(label_cost, num_cores),
                                    repeat(algorithm, num_cores))] 
    
    results = parmap.starmap(compute_spatial_genomewise_optimize_gmm, tuples,
                                pm_processes=num_cores, pm_pbar=True)
    
#    pool.close()
# p_values, genes, diff_p_values, exp_diff, smooth_factors, pred_labels
    nnn = [results[i][0] for i in np.arange(len(results))]
    nodes = reduce(operator.add, nnn)
    ppp = [results[i][1] for i in np.arange(len(results))]
    p_values=reduce(operator.add, ppp)
    ggg = [results[i][2] for i in np.arange(len(results))]
    genes = reduce(operator.add, ggg)
    # exp_ppp = [results[i][3] for i in np.arange(len(results))]
    # exp_pvalues = reduce(operator.add, exp_ppp)  
    # exp_ddd = [results[i][4] for i in np.arange(len(results))]
    # exp_diffs = reduce(operator.add, exp_ddd)      
    fff = [results[i][3] for i in np.arange(len(results))]
    s_factors = reduce(operator.add, fff)
    lll = [results[i][4] for i in np.arange(len(results))]
    pred_labels = reduce(operator.add, lll)

    best_p_values=[min(i) for i in p_values]
    fdr = multi.multipletests(np.array(best_p_values), method='fdr_bh')[1]
    #exp_fdr = multi.multipletests(np.array(exp_pvalues), method='fdr_bh')[1]    
    
    labels_array = np.array(pred_labels).reshape(len(genes), pred_labels[0].shape[0])
    data_array = np.array((genes, p_values, fdr,s_factors, nodes), dtype=object).T
    t_array = np.hstack((data_array, labels_array))
    c_labels = ['p_value', 'fdr',  'smooth_factor', 'nodes']
    for i in np.arange(labels_array.shape[1]) + 1:
        temp_label = 'label_cell_' + str(i)
        c_labels.append(temp_label)
    result_df = pd.DataFrame(t_array[:,1:], index=t_array[:,0], 
                      columns=c_labels)
    
    return result_df
 def sampling(self):
     """
     Constructs a weighted sample of images from the GeoDataFrame
     
     Returns:
         (array) sample_idx: index of sampled images 
     
     Note: Keras uses the last x percent of data in cross validation.
         Have to shuffle here to ensure that the last ten percent isn't just
         the southernmost rows of information.
     """
     # Getting the sum of urban pixels for each patch
     self.pop_array = self.df_image['pop_density'].fillna(0)
     self.pop_array = np.array(
             self.pop_array).reshape((self.nrows, self.ncols))
     print 'extract patches'
     self.image_slicer(self.pop_array)
     print 'get locations for individual frames'
     pool = Pool(self.processes)
     cols_grid = pool.map(adder, self.indices[:,0])
     rows_grid = pool.map(adder, self.indices[:,1])
     print 'Max of cols grid after slicing:',  max(cols_grid)
     print 'Max of rows grid after slicing:',  max(rows_grid)
     self.frame_location_series = parmap.starmap(
             pixel_to_coordinates,
             zip(cols_grid, rows_grid), self.geotransform, 
             processes=self.processes)
     print 'converting locations to Points'
     self.frame_location_series = \
             pool.map(Point, self.frame_location_series)
     pop_count = np.array([np.mean(patch) for patch in self.patches])
     self.df_sample = pd.DataFrame(pop_count, columns=['pop_ave'])
     # Getting the locations
     self.df_sample['location'] = self.frame_location_series
     # Creating sample weights 
     seed = 1975
     self.pop_mean_sample = self.df_sample.sample(
             frac=self.sample_rate,
             replace=True,
             weights='pop_ave',
             random_state = seed)
     self.sample_idx = np.array(self.pop_mean_sample.index.values)
     # ensuring that we get some values with zero population
     self.df_zero_sample = self.df_sample[self.df_sample['pop_ave']==0]
     self.pop_mean_sample_zero = self.df_zero_sample.sample(
             frac = self.sample_rate,
             replace = True,
             random_state = seed)
     self.sample_idx_zero = np.array(self.pop_mean_sample_zero.index.values)
     # combining with >0 sample
     self.sample_idx = np.concatenate([self.sample_idx, 
                                       self.sample_idx_zero])
     self.pop_output_data = np.concatenate([self.pop_mean_sample, 
                                            self.pop_mean_sample_zero]).T[0]
     # shuffling so that we don't have the zero pop areas at end of sample
     p = np.random.permutation(len(self.sample_idx))
     self.sample_idx = self.sample_idx[p]
     self.pop_output_data = np.array(
             self.pop_output_data[p]).reshape((len(self.pop_output_data),1))
Example #18
def exec18():
    # load dates
    dates = pd.read_excel('workingdays_201912.xlsx', converters={'영업일': str})
    date1 = dates['영업일']
    # date1 = ['2020.06.03']    #### for testing

    # load codes
    codes = pd.read_excel('코드리스트.xlsx', converters={'종목코드': str})
    code = codes['종목코드']
    # code = ['011790']    #### for testing

    # run line by line over the loaded dates
    for date in date1:
        print(date)
        date = [date]
        input_list = list(itertools.product(date, code))
        parmap.starmap(excute, input_list, pm_pbar=True)
Example #19
def parallel_starmap(f, args, processes = 1):
    """
    Wrapper function for 'parmap.starmap': parallelises the computations in
    'starmap' form if required. If only one process is needed, computations
    are performed serially.
    """
    if processes == 1:
        return [f(*arg) for arg in args]
    return parmap.starmap(f, args, processes = processes)
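
A short usage sketch for the wrapper above (add and points are placeholder names):

def add(x, y):
    return x + y

points = [(1, 2), (3, 4)]
print(parallel_starmap(add, points))               # serial path: [3, 7]
print(parallel_starmap(add, points, processes=2))  # delegates to parmap.starmap
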
 def sampling(self):
     """
     Constructs a weighted sample of images from the GeoDataFrame
     
     Returns:
         (array) sample_idx: index of sampled images 
     
     Note: Keras uses the last x percent of data in cross validation.
         Have to shuffle here to ensure that the last ten percent isn't just
         the southernmost rows of information.
     """
     # Getting the sum of urban pixels for each patch
     self.pop_array = self.df_image['pop_density'].fillna(0)
     self.pop_array = np.array(self.pop_array).reshape(
         (self.nrows, self.ncols))
     print 'extract patches'
     self.image_slicer(self.pop_array)
     print 'get locations for individual frames'
     pool = Pool(self.processes)
     cols_grid = pool.map(adder, self.indices[:, 0])
     rows_grid = pool.map(adder, self.indices[:, 1])
     print 'Max of cols grid after slicing:', max(cols_grid)
     print 'Max of rows grid after slicing:', max(rows_grid)
     self.frame_location_series = parmap.starmap(pixel_to_coordinates,
                                                 zip(cols_grid, rows_grid),
                                                 self.geotransform,
                                                 processes=self.processes)
     print 'converting locations to Points'
     self.frame_location_series = \
             pool.map(Point, self.frame_location_series)
     pop_count = np.array([np.mean(patch) for patch in self.patches])
     self.df_sample = pd.DataFrame(pop_count, columns=['pop_ave'])
     # Getting the locations
     self.df_sample['location'] = self.frame_location_series
     # Creating sample weights
     seed = 1975
     self.pop_mean_sample = self.df_sample.sample(frac=self.sample_rate,
                                                  replace=True,
                                                  weights='pop_ave',
                                                  random_state=seed)
     self.sample_idx = np.array(self.pop_mean_sample.index.values)
     # ensuring that we get some values with zero population
     self.df_zero_sample = self.df_sample[self.df_sample['pop_ave'] == 0]
     self.pop_mean_sample_zero = self.df_zero_sample.sample(
         frac=self.sample_rate, replace=True, random_state=seed)
     self.sample_idx_zero = np.array(self.pop_mean_sample_zero.index.values)
     # combining with >0 sample
     self.sample_idx = np.concatenate(
         [self.sample_idx, self.sample_idx_zero])
     self.pop_output_data = np.concatenate(
         [self.pop_mean_sample, self.pop_mean_sample_zero]).T[0]
     # shuffling so that we don't have the zero pop areas at end of sample
     p = np.random.permutation(len(self.sample_idx))
     self.sample_idx = self.sample_idx[p]
     self.pop_output_data = np.array(self.pop_output_data[p]).reshape(
         (len(self.pop_output_data), 1))
def calculate_mean_utt_length(raw_wavs, sr):
    n_worker = int(cpu_count() * cpu_rate)
    logging.info(
        "{}% resources ({} cpu core(s)) will be used for mean utterance length calculation"
        .format(cpu_rate * 100, n_worker))

    # return pool.starmap(_calculate_mean_utt_length, zip(raw_wavs, repeat(sr)))
    return parmap.starmap(_calculate_mean_utt_length,
                          list(zip(raw_wavs, repeat(sr))),
                          pm_processes=n_worker,
                          pm_pbar=True)
Example #22
def pre_landsat_batch(data_dir, out_dir, ref_raster):

    # Get a list of Landsat product directory names
    landsat_products = filter(
        re.compile(r'^L.*[0-9]$').search, os.listdir(data_dir))

    for key in landsat_products:

        ldir = filter(re.compile('tif$').search, os.listdir(data_dir + key))

        # Create directories in dest location
        if not os.path.exists(out_dir + key):
            os.makedirs(out_dir + key)

        # Put bands in list
        bands = list(
            map(
                lambda x: (data_dir + key + '/' + x, out_dir + key + '/ref_' +
                           x.split('_')[-2] + x.split('_')[-1], ref_raster),
                ldir))

        parmap.starmap(pre_process_landsat, bands)
def uncompress_files(data_dir, unzip_dir=None):
    """
    Unzips every zipfile in the path, and stores in directory with zipfile name+.SAFE
    
    Args:
        data_dir (str): directory where zipfiles are located
        unzip_dir (str): directory where files will be unzipped, default is data_dir
    """

    # List all zip files in directory
    eo_zip_files = filter(re.compile('zip$').search, os.listdir(data_dir))

    # List all tar files in directory
    eo_tar_files = filter(re.compile('tar.gz$').search, os.listdir(data_dir))

    # Fall back to data_dir if no unzip directory was given
    if unzip_dir is None:
        unzip_direc = data_dir
    else:
        unzip_direc = unzip_dir

    if not os.path.exists(unzip_direc):
        os.makedirs(unzip_direc)
        print('New directory {} was created'.format(unzip_direc))

    # Make sure uncompress path ends with slash
    #if unzip_direc[-1] != '/':
    #    unzip_direc = unzip_direc + '/'

    # Put parameter sets in tuples
    eo_zip_files = list(map(lambda x: (unzip_direc, data_dir, x),
                            eo_zip_files))
    eo_tar_files = list(map(lambda x: (unzip_direc, data_dir, x),
                            eo_tar_files))

    parmap.starmap(unzip_eo, eo_zip_files)
    parmap.starmap(untar_eo, eo_tar_files)
Example #24
def multiGMM(data_norm):
    num_cores = mp.cpu_count()
    if num_cores > math.floor(data_norm.shape[1]/2):
        num_cores=int(math.floor(data_norm.shape[1]/2))
   # print(num_cores)
    ttt = np.array_split(data_norm,num_cores,axis=1)
    #print(ttt)

    tuples = [d for d in zip(ttt)] 
    gmmDict_=parmap.starmap(gmm_model, tuples,
                             pm_processes=num_cores, pm_pbar=True)
    gmmDict={}
    for i in np.arange(len(gmmDict_)):
        gmmDict.update(gmmDict_[i])   ## dict.update()  add dict to dict
    return gmmDict
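
Because multiGMM's worker takes a single chunk argument, the zip(ttt) call only builds 1-tuples for starmap; parmap.map over the chunks is the simpler equivalent. A small sketch of that alternative (describe and the array shape are illustrative):

import numpy as np
import parmap

def describe(chunk):
    return chunk.shape

ttt = np.array_split(np.random.rand(5, 8), 4, axis=1)
print(parmap.map(describe, ttt, pm_processes=4))  # one call per column chunk
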
def calculate_speaking_rate(raw_wavs, texts, sr, dictionary):
    n_worker = int(cpu_count() * cpu_rate)
    logging.info(
        "{}% resources ({} cpu core(s)) will be used for speaking_rate calculation"
        .format(cpu_rate * 100, n_worker))

    # rates = pool.starmap(_calculate_speaking_rate, zip(raw_wavs, texts, repeat(sr), repeat(dictionary)))
    rates = parmap.starmap(_calculate_speaking_rate,
                           list(
                               zip(raw_wavs, texts, repeat(sr),
                                   repeat(dictionary))),
                           pm_processes=n_worker,
                           pm_pbar=True)

    return rates
def sen2_batch(res, dir):  # Creates a list of arguments based on number of files to run
    os.chdir(dir)
    datafiles = os.listdir(dir)
    slist = []

    for files in datafiles: # checks for L1C folders
        checker1 = "L1C"
        if files[7:10] == checker1 or files[16:19] == checker1:
            slist.append((res, files))
        checker2 = "L2A" # checks for unfinished L2A folders an deletes it
        if files[7:10] == checker2 or files[16:19] == checker2:
            each_folder_dir = os.listdir(dir + '/' + files)  # contents of the L2A folder
            for subf in each_folder_dir:
                if len(subf) <= 8:
                    shutil.rmtree(files, ignore_errors=True) #resolving the bug that shows error after deleting files

    if not os.path.isdir(dir):
        raise ValueError('working_directory: '+ dir + ' does not exist or is inaccessible. Your current working directory is '+os.getcwd()+'.')



    parmap.starmap(sen2_single, slist) #goes for optimal processing setup since GIPP parallelizing is set to AUTO
    dir_L1C = dir
    return dir_L1C
Example #27
    def find_optimal_beta(self, heuristics,  X, feat_combos, ground):
        """
        Returns optimal beta for given heuristics
        heuristics: list of pre-trained logistic regression models
        X: primitive matrix
        feat_combos: feature indices to apply heuristics to
        ground: ground truth associated with X data
        """


        if f'{heuristics[0].__class__}' == "<class 'program_synthesis.synthesizer.dummyKneighborClassifier'>":
            ########single-core, gpu
            if self.cuda:
                beta_opt = []
                # for i, hf in enumerate(heuristics):
                #     X_.append(all_pair_dist_cuda(self.val_primitive_matrix[:, feat_combos[i]], X[:, feat_combos[i]], feat_combos[i]))
                #
                # marginals = parmap.map(heuristics[0].predict_proba, X_, self.val_ground)
                #
                # for i, hf in enumerate(heuristics):
                #     beta_opt.append((self.beta_optimizer(marginals[i], ground)))

                for i, hf in enumerate(heuristics):
                    X_ = all_pair_dist_cuda(cp.array(self.val_primitive_matrix[:, feat_combos[i]]), cp.array(X[:, feat_combos[i]]), feat_combos[i])
                    marginals = hf.predict_proba_cuda(X_, self.val_ground)
                    beta_opt.append(self.beta_optimizer_cuda(marginals, ground))
                    self.betas.append(beta_opt[-1])
            else:

                #######with parmap
                # beta_opt = parmap.starmap(self.find_dist_and_proba_and_beta, list(zip(heuristics, feat_combos)), X, ground,
                #                           pm_pbar=True)
                # self.betas = beta_opt

                #######single-core, numba
                beta_opt = []
                for i, hf in enumerate(heuristics):
                    #print(i, end='\r')
                    X_ = all_pair_dist(self.val_primitive_matrix[:, feat_combos[i]], X[:, feat_combos[i]], feat_combos[i])
                    marginals = hf.predict_proba(X_, self.val_ground)
                    beta_opt.append(self.beta_optimizer(marginals, ground))
                    self.betas.append(beta_opt[-1])
        else:
            #######with parmap
            beta_opt = parmap.starmap(self.find_proba_and_beta, list(zip(heuristics, feat_combos)), X, ground, pm_pbar=True)
            self.betas = beta_opt
        return beta_opt
Example #28
def cross_val_proba_score_parmap(estimator,
                                 X,
                                 y,
                                 scoring=multilabel_prec,
                                 scoring_arg1=1,
                                 scoring_arg2=5,
                                 n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True)
    scores = parmap.starmap(parmap_wrap,
                            kf.split(X),
                            X,
                            y,
                            estimator,
                            scoring=scoring,
                            scoring_arg1=scoring_arg1,
                            scoring_arg2=scoring_arg2)
    cv_score = np.asarray(scores).mean()
    return cv_score
def calc_silhouette_per_gene(genes=None,
                             expr=None,
                             dissim=None,
                             examine_top=0.1,
                             seed=-1,
                             num_cores=8,
                             bar=True):
    if genes is None or expr is None or dissim is None:
        sys.stderr.write("Need genes, expr, dissim\n")
        return
    if seed != -1 and seed >= 0:
        np.random.seed(seed)
    sys.stdout.write("Started 2 " + "\n")
    sil = []
    ncell = expr.shape[1]
    ex = int((1.0 - examine_top) * 100.0)
    subexpr = np.array_split(expr, num_cores, axis=0)
    subgenes = np.array_split(np.array(genes), num_cores)
    subgenes = [i.tolist() for i in subgenes]
    print('number of cores : ' + str(num_cores))
    dis_list = []
    for i in range(num_cores):
        dis_list.append(dissim)
    assert len(subexpr) == len(subgenes)
    tuples = [(expr, ncell, genes, exa, dis)
              for expr, ncell, genes, exa, dis in zip(
                  subexpr, repeat(ncell, num_cores), subgenes,
                  repeat(examine_top, num_cores), dis_list)]
    results = parmap.starmap(process,
                             tuples,
                             pm_processes=num_cores,
                             pm_pbar=bar)
    for i in np.arange(len(results)):
        sil += results[i]

    res = []
    for ig, g in enumerate(genes):
        this_avg = sil[ig][1]
        this_sil = sil[ig][2]
        res.append((g, this_sil))
    res.sort(key=itemgetter(1), reverse=True)
    return res
def apply_rotation_correction(SHAPE_ROTATION_RESULTS_PATH,
                              APPLY_SHAPE_ROTATION_RESULTS_PATH):
    '''
    > 2.4
    SHAPE_ROTATION_RESULTS_PATH - source
    APPLY_SHAPE_ROTATION_RESULTS_PATH - export
    '''

    import parmap

    ## read correction results
    df_corrections = pd.read_csv(SHAPE_ROTATION_RESULTS_PATH)
    #display(df_corrections.head())
    #display(df_corrections.Rotate.value_counts())

    # apply only once
    if not os.path.exists(APPLY_SHAPE_ROTATION_RESULTS_PATH):
        print(APPLY_SHAPE_ROTATION_RESULTS_PATH,
              ' does not exist -> apply rotation on all images')
        '''
        apply rotation to ALL files
        takes 2 min (threaded with the settings below)
        '''
        N_CPU = multiprocessing.cpu_count() - 10

        args_1 = df_corrections.file.to_list()
        args_2 = df_corrections.Rotate.to_list()
        all_args = list(zip(args_1, args_2))

        results = parmap.starmap(correct_img_rotation,
                                 all_args,
                                 pm_pbar=True,
                                 pm_parallel=True,
                                 pm_processes=N_CPU)

        pd.DataFrame(list(zip(args_1, args_2, results)),
                     columns=['file', 'was_rotated', 'outcome'
                              ]).to_csv(APPLY_SHAPE_ROTATION_RESULTS_PATH,
                                        index=False)

    else:
        print('already applied rotation correction to data!')
Example #31
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-ann_dir", help="Annotation directory")
    parser.add_argument("-clueweb_dir", help="Clueweb directory")
    parser.add_argument("-output_dir", help="Output directory")
    parser.add_argument("-num_processes", help="Number of processes to run")
    args = parser.parse_args()

    num_processes = int(args.num_processes)

    # Iterate over each subdirectory in the clueweb dir
    for subdir, dirs, files in os.walk(args.clueweb_dir):
        for folder in dirs:
            ann_dir = os.path.join(args.ann_dir, folder)
            clueweb_dir = os.path.join(args.clueweb_dir, folder)
            output_dir = os.path.join(args.output_dir, folder)
            
            ann_iter = (sorted(os.listdir(ann_dir)))
            clueweb_iter = (sorted(os.listdir(clueweb_dir)))
            # Ann_dir and clueweb_dir sometimes have different number of files
            # Loop through, and create new ann list with correct files
            ann_list = []
            for cw_file in clueweb_iter:
                for ann_file in ann_iter:
                    # Split to only get filename, and not file extensions
                    if cw_file.split(".")[0] == ann_file.split(".")[0]:
                        ann_list.append(ann_file)
            kwargs = {'processes': num_processes}
            
            start = time.time()
            # Read and clean files in parallel
            results = parmap.starmap(read_and_clean_files, zip(clueweb_iter, ann_list),
                                     clueweb_dir, ann_dir, **kwargs)
            end = time.time()
            print "Time used reading and cleaning all files", end - start
            start = time.time()
            # Initiate indexer
            indexer = Indexer(output_dir)
            # Index all the cleaned records
            indexer.index_files(results)
            end = time.time()
            print "Time used indexing all files", end - start
def calculate_f0(raw_wavs, sr, frame_period=5, parallel=True):
    f0s = []
    logging.info("Calculating f0 ...")

    if not parallel:
        for wav in raw_wavs:
            f0s.append(_calculate_f0(wav, sr, frame_period))
    else:
        n_worker = int(cpu_count() * cpu_rate)
        logging.info(
            "{}% resources ({} cpu core(s)) will be used for f0 calculation".
            format(cpu_rate * 100, n_worker))
        # f0s = pool.starmap(_calculate_f0, zip(raw_wavs, repeat(sr), repeat(frame_period)))
        f0s = parmap.starmap(_calculate_f0,
                             list(
                                 zip(raw_wavs, repeat(sr),
                                     repeat(frame_period))),
                             pm_processes=n_worker,
                             pm_pbar=True)

    return f0s
Example #33
def max_posterior(gmm, U, coords, covar=None):
    import multiprocessing, parmap
    pool = multiprocessing.Pool()
    n_chunks, chunksize = gmm._mp_chunksize()
    log_p = [[] for k in range(gmm.K)]
    log_S = np.zeros(len(coords))
    H = np.zeros(len(coords), dtype="bool")
    k = 0
    for log_p[k], U[k], _ in \
    parmap.starmap(pygmmis._Estep, zip(range(gmm.K), U), gmm, data, covar, None, pool=pool, pm_chunksize=chunksize):
        log_S[U[k]] += np.exp(log_p[k]) # actually S, not logS
        H[U[k]] = 1
        k += 1
    log_S[H] = np.log(log_S[H])

    max_q = np.zeros(len(coords))
    max_k = np.zeros(len(coords), dtype='uint32')
    for k in range(gmm.K):
        q_k = np.exp(log_p[k] - log_S[U[k]])
        max_k[U[k]] = np.where(max_q[U[k]] < q_k, k, max_k[U[k]])
        max_q[U[k]] = np.maximum(max_q[U[k]],q_k)
    return max_k
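
The max_posterior snippet reuses one externally created multiprocessing.Pool across parmap calls (here via the legacy pool= keyword). A hedged sketch of that idea, assuming the pm_pool keyword of recent parmap releases:

import multiprocessing
import parmap

def add(x, y):
    return x + y

# reuse one pool for several starmap calls instead of spawning a new one each time
with multiprocessing.Pool(2) as pool:
    print(parmap.starmap(add, [(1, 2), (3, 4)], pm_pool=pool))  # [3, 7]
    print(parmap.starmap(add, [(5, 6), (7, 8)], pm_pool=pool))  # [11, 15]
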
Example #34
    def parallel(self, names, mg, nprocs=1, **kwargs):
        """Calling mapper using multiple processors.

        Args:
        names -- NameFile instance with iter_names() method
        mg -- MetaFile instance with readFile attrib
        nprocs -- number of parallel calls
        kwargs -- passed to mapper call
        
        Return:
        refSamFile attrib set for each name in names 
        """
        # making list of tuples (indexFile, readFile)
        lt = [(name.get_indexFile(), mg.get_readFile()) for name in names.iter_names()]

        # altering function kwargs
        new_mapper = partial(self, **kwargs)

        # calling mapper
        samFiles = parmap.starmap(new_mapper, lt, processes=nprocs)

        # adding samFile attrib to name instances
        for i,name in enumerate(names.iter_names()):
            name.set_refSamFile(samFiles[i])
Example #35
 def test_warn_wrong_argument_starmap(self):
     with warnings.catch_warnings(record=True) as w:
         parmap.starmap(range, [(0,2), (2,5)], processes=-3)
         assert len(w) == 1
Example #36
def training_and_testing(ARGS):
    # Check conditions
    if ARGS['list_columns']:
        list_columns = list(sorted(ARGS['list_columns']))
    if not ARGS['list_columns']:
        list_columns = [
            'CADD_phred', 'SIFTval', 'VEST4_score', 'gnomAD_exomes_AF'
        ]

    if ARGS['flag']:
        flag = list(sorted(ARGS['flag']))
    if not ARGS['flag']:
        flag = [
            "REVEL_score",
            "ClinPred_score",
            "M-CAP_score",
            "fathmm-XF_coding_score",
            "Eigen-raw_coding",
            "PrimateAI_score",
        ]

    if not os.path.exists(ARGS['output_dir'] +
                          '/TRAIN/training.csv.gz') or not os.path.exists(
                              ARGS['output_dir'] + '/TEST/testing.csv.gz'):
        logger.warn(
            '--train_and_test mode selected but training and testing file not found, creation with the following parameters: '
            '--ratio : ' + str(ARGS['ratio']) + ', --proportion : ' +
            str(ARGS['proportion']))
        ARGS['force_datasets'] = True
    if os.path.exists(ARGS['output_dir'] +
                      '/TRAIN/training.csv.gz') or os.path.exists(
                          ARGS['output_dir'] + '/TEST/testing.csv.gz'):
        logger.info('Training and testing file found')

    if ARGS['combinatory'] is True:
        pass

    # if enabled, erase previously generated training and testing files from a global dataframe and create new ones
    if ARGS['force_datasets'] is True:

        utils.mkdir(ARGS['output_dir'])
        utils.mkdir(ARGS['output_dir'] + '/TRAIN')
        utils.mkdir(ARGS['output_dir'] + '/TEST')
        logger.warn('Creating new files or overwriting old ones')
        prop = ARGS['proportion']
        t = float(round(prop / (1 - prop), 2))

        ratio = ARGS['ratio']
        tmp = pd.read_csv(filepath_or_buffer=ARGS['input'],
                          sep='\t',
                          compression='gzip',
                          encoding='utf-8',
                          low_memory=False)

        if list_columns and flag:
            # Selection of specific columns to be used from a global dataframe
            # Example : df with 10 columns, --list_columns column1 column2 column5
            tmp = select_columns_pandas.select_columns_pandas(
                tmp, list_columns, flag)
            logger.info(tmp)

        # Use of input parameters to build training and testing dataframes (proportion, ratio of data between train and test)
        # Special attention is paid to remove overlap between evaluation|test sets and training dataset to prevent any overfitting

        complete_data_path = tmp.loc[tmp['True_Label'] == 1]
        complete_data_path = complete_data_path.sample(frac=1)
        complete_data_begn = tmp.loc[tmp['True_Label'] == -1]
        complete_data_begn = complete_data_begn.sample(frac=1)
        max_size = max(complete_data_path.shape[0],
                       complete_data_begn.shape[0])
        min_size = min(complete_data_path.shape[0],
                       complete_data_begn.shape[0])

        if max_size > (t * min_size):
            max_size = min_size * t
        elif max_size < (t * min_size):
            min_size = max_size / t
        if min_size < 1000 and min(complete_data_path.shape[0], complete_data_begn.shape[0]) == \
          complete_data_path.shape[0]:
            logger.warn(
                'CAREFUL : Size of the pathogenic dataset will be < 1000 samples'
            )

        eval_test_size = ratio
        train_path = complete_data_path.head(
            n=int(round(min_size * (1 - eval_test_size))))
        train_begn = complete_data_begn.head(
            n=int(round(max_size * (1 - eval_test_size))))
        eval_path = complete_data_path.tail(
            n=int(round(min_size * eval_test_size)))
        eval_begn = complete_data_begn.tail(
            n=int(round(min_size * eval_test_size)))

        eval_path.dropna(inplace=True)
        eval_begn.dropna(inplace=True)

        complete_training = pd.concat([train_path, train_begn
                                       ]).drop_duplicates(keep='first')
        complete_training = complete_training[complete_training.columns.drop(
            list(complete_training.filter(regex='pred|flag')))]
        complete_training.dropna(inplace=True)

        # Some stats on Pathogenic and Benign variant numbers in both training and testing dataframes

        logger.info('Training - Path : ' + str(complete_training[
            complete_training['True_Label'] == 1].shape[0]))
        logger.info('Training - Benign : ' + str(complete_training[
            complete_training['True_Label'] == -1].shape[0]))

        min_size_eval = min(eval_path.shape[0], eval_begn.shape[0])
        complete_eval = pd.concat([
            eval_path.sample(frac=1).head(min_size_eval),
            eval_begn.sample(frac=1).head(min_size_eval)
        ]).drop_duplicates(keep='first')

        logger.info(
            'Testing - Path : ' +
            str(complete_eval[complete_eval['True_Label'] == 1].shape[0]))
        logger.info(
            'Testing - Benign : ' +
            str(complete_eval[complete_eval['True_Label'] == -1].shape[0]))

        # Dumping data

        complete_training.to_csv(path_or_buf=ARGS['output_dir'] +
                                 '/TRAIN/training.csv.gz',
                                 sep='\t',
                                 compression='gzip',
                                 encoding='utf-8',
                                 index=False)

        complete_eval.to_csv(path_or_buf=ARGS['output_dir'] +
                             '/TEST/testing.csv.gz',
                             sep='\t',
                             compression='gzip',
                             encoding='utf-8',
                             index=False)

    check_dir_train = False
    if os.path.isdir(ARGS['output_dir'] + '/TRAIN/Models'):
        check_dir_train = True
    if (ARGS['force_training'] is True) or (check_dir_train is False):
        # Training model
        # TrainingClassification(input_data=ARGS['output_dir'] + '/TRAIN/training.csv.gz',
        #                        classifiers=classifiers,
        #                        standardize=ARGS['standardize'],
        #                        output=ARGS["output_dir"],
        #                        logger=logger,
        #                        cv=ARGS['cross_validation']
        #                        )

        TestingClassification(input_data=ARGS['output_dir'] +
                              '/TEST/testing.csv.gz',
                              standardize=ARGS['standardize'],
                              output_dir=ARGS["output_dir"],
                              model_dir=ARGS['model'],
                              logger=logger,
                              threshold=ARGS['threshold'])

        # Generation of a histogram to see the most important features used in the built model
        # histo_weights.histo_and_metrics(folder=ARGS['output_dir'], logger=logger)

    # This parameter, if enabled, will build all possible combinations from a single dataframe if sources are mentioned
    # Example : A global dataframe based on 3 databases (2 pathogenic : Clinvar and HGMD and 1 benign : gnomAD) was generated
    # The following lines will generate 2 evaluation sets : (clinvar|gnomAD) and (HGMD|gnomAD) with various MAF thresholds (<0.01, <0.001, 0.0001, AC=1(singleton), AF=0)
    # and each of these combinations will be tested with the previously generated outputs. (Overlapping is checked between these combinations and training dataset)

    if ARGS['eval'] and ARGS['eval'].endswith('.csv.gz'):
        # TODO : CHANGE NAME
        print('\n\n')
        logger.info('--BUILDING & TESTING ON EVALUATION SETS--')

        output_dir = ARGS['output_dir']
        eval_output_dir = output_dir
        eval_output_dir = eval_output_dir.split('/')
        eval_output_dir[-1] = 'EVALUATION_SETS_' + eval_output_dir[-1]
        eval_output_dir = "/".join(eval_output_dir)

        if os.path.isdir(eval_output_dir):
            pass
        else:
            utils.mkdir(eval_output_dir)

            # if ARGS['list_columns'] and ARGS['flag']:
            combination_pandas.combination_pandas(
                ARGS['eval'],
                output_dir + '/TRAIN/training.csv.gz',
                eval_output_dir,
                logger,
                list_columns,
                flag,
                CV=ARGS['cross_validation_evaluation'])
        # else:
        # 	combination_pandas.combination_pandas(ARGS['eval'], ARGS['output_dir'] + '/TRAIN/training.csv.gz', output_dir, CV=ARGS['cross_validation_evaluation'])
        l_dir = os.listdir(eval_output_dir)
        print(list(zip(l_dir)))
        parmap.starmap(test_eval_mp,
                       list(zip(l_dir)),
                       pm_pbar=True,
                       pm_processes=ARGS['threads'])

    # Plots are automatically generated to visualize performance across various scenario for the different combinations
    print('\n\n')
    logger.info('--GENERATING PLOTS & STATS--')
    utils.mkdir(eval_output_dir + '/PLOTS_AND_MEAN_TABLE')
    # maf_plot.violin_plot_scores(eval_output_dir, logger)
    # maf_plot.maf_plot_maf_0(eval_output_dir, ARGS['cross_validation_evaluation'], logger)
    maf_plot.maf_plot_others(eval_output_dir,
                             ARGS['cross_validation_evaluation'], logger)
Example #37
dt = [0.2]
N = [1]
sigma = [0.5, 1, 2]
mu = [0]
corr_time = [1000]

repetitions = 1000
n_tau = 10
tau_list = [np.linspace(1, 20, n_tau)]
# seed_list = np.arange(n_tau * repetitions).reshape((repetitions, n_tau))
seed_list = np.arange(repetitions)

values = list(product([H], tau_list, dt, N, mu, sigma, corr_time, seed_list))

results = parmap.starmap(dd_wrapper, values, pm_chunksize=3, pm_pbar=True)

results = np.array(results)
print(results.shape)

# Adapt results to input
results = results.reshape((3, repetitions, n_tau))

results_mean = results.mean(axis=-2)
results_std = results.std(axis=-2) / np.sqrt(repetitions - 1)

plt.errorbar(tau_list[0], results_mean[0, :], results_std[0, :])
plt.errorbar(tau_list[0], results_mean[1, :], results_std[1, :])
plt.errorbar(tau_list[0], results_mean[2, :], results_std[2, :])
plt.show()
# plt.errorbar(tau_list[0], results_mean, results_std)
Example #38
 def test_starmap(self):
     items = [(1, 2), (3, 4), (5, 6)]
     pfalse = parmap.starmap(_identity, items, 5, 6, pm_parallel=False)
     ptrue = parmap.starmap(_identity, items, 5, 6, pm_parallel=True)
     self.assertEqual(pfalse, ptrue)
def main():

    """
    Demonstration Options
    """
    logging.basicConfig(level = logging.DEBUG)

    # If using parallel functionality, you must call this to set the appropriate
    # logging level
    gp.classifier.set_multiclass_logging_level(logging.DEBUG)

    # np.random.seed(50)
    # Feature Generation Parameters and Demonstration Options
    SAVE_OUTPUTS = True # We don't want to make files everywhere for a demo.
    SHOW_RAW_BINARY = True
    test_range_min = -2.0
    test_range_max = +2.0
    test_ranges = (test_range_min, test_range_max)
    n_train = 100
    n_query = 1000
    n_dims  = 2   # <- Must be 2 for vis
    n_cores = None # number of cores for multi-class (None -> default: c-1)
    walltime = 300.0
    approxmethod = 'laplace' # 'laplace' or 'pls'
    multimethod = 'OVA' # 'AVA' or 'OVA', ignored for binary problem
    fusemethod = 'EXCLUSION' # 'MODE' or 'EXCLUSION', ignored for binary
    responsename = 'probit' # 'probit' or 'logistic'
    batch_start = False
    entropy_threshold = None

    n_draws = 6
    n_draws_est = 2500
    rows_subplot = 2
    cols_subplot = 3

    assert rows_subplot * cols_subplot >= n_draws

    # Decision boundaries
    db1 = lambda x1, x2: (((x1 - 1)**2 + x2**2/4) * 
            (0.9*(x1 + 1)**2 + x2**2/2) < 1.6) & \
            ((x1 + x2) < 1.5)
    db2 = lambda x1, x2: (((x1 - 1)**2 + x2**2/4) * 
            (0.9*(x1 + 1)**2 + x2**2/2) > 0.3)
    db3 = lambda x1, x2: ((x1 + x2) < 2) & ((x1 + x2) > -2.2)
    db4 = lambda x1, x2: ((x1 - 0.75)**2 + (x2 + 0.8)**2 > 0.3**2)
    db5 = lambda x1, x2: ((x1/2)**2 + x2**2 > 0.3)
    db6 = lambda x1, x2: (((x1)/8)**2 + (x2 + 1.5)**2 > 0.2**2)
    db7 = lambda x1, x2: (((x1)/8)**2 + ((x2 - 1.4)/1.25)**2 > 0.2**2)
    db4a = lambda x1, x2: ((x1 - 1.25)**2 + (x2 - 1.25)**2 > 0.5**2) & ((x1 - 0.75)**2 + (x2 + 1.2)**2 > 0.6**2) & ((x1 + 0.75)**2 + (x2 + 1.2)**2 > 0.3**2) & ((x1 + 1.3)**2 + (x2 - 1.3)**2 > 0.4**2)
    db5a = lambda x1, x2: ((x1/2)**2 + x2**2 > 0.3) & (x1 > 0)
    db5b = lambda x1, x2: ((x1/2)**2 + x2**2 > 0.3) & (x1 < 0) & ((x1 + 0.75)**2 + (x2 - 1.2)**2 > 0.6**2)
    db1a = lambda x1, x2: (((x1 - 1)**2 + x2**2/4) * 
            (0.9*(x1 + 1)**2 + x2**2/2) < 1.6) & \
            ((x1 + x2) < 1.6) | ((x1 + 0.75)**2 + (x2 + 1.2)**2 < 0.6**2)
    db1b = lambda x1, x2: (((x1 - 1)**2 + x2**2/4) * 
            (0.9*(x1 + 1)**2 + x2**2/2) < 1.6) & ((x1/2)**2 + (x2)**2 > 0.4**2) & \
            ((x1 + x2) < 1.5) | ((x1 + 0.75)**2 + (x2 - 1.5)**2 < 0.4**2) | ((x1 + x2) > 2.25) & (x1 < 1.75) & (x2 < 1.75) # | (((x1 + 0.25)/4)**2 + (x2 + 1.5)**2 < 0.32**2) # & (((x1 + 0.25)/4)**2 + (x2 + 1.5)**2 > 0.18**2)
    db1c = lambda x1, x2: (((x1 - 1)**2 + x2**2/4) * 
            (0.9*(x1 + 1)**2 + x2**2/2) < 1.6) & ((x1/2)**2 + (x2)**2 > 0.4**2) & \
            ((x1 + x2) < 1.5) | ((x1 + 0.75)**2 + (x2 - 1.5)**2 < 0.4**2) | ((x1 + x2) > 2.25) & (x1 < 1.75) & (x2 < 1.75) | (((x1 + 0.25)/4)**2 + (x2 + 1.75)**2 < 0.32**2) & (((x1 + 0.25)/4)**2 + (x2 + 1.75)**2 > 0.18**2)
    db8 = lambda x1, x2: (np.sin(2*x1 + 3*x2) > 0) | (((x1 - 1)**2 + x2**2/4) * 
            (0.9*(x1 + 1)**2 + x2**2/2) < 1.4) & \
            ((x1 + x2) < 1.5) | (x1 < -1.9) | (x1 > +1.9) | (x2 < -1.9) | (x2 > +1.9) | ((x1 + 0.75)**2 + (x2 - 1.5)**2 < 0.3**2)
    # db9 = lambda x1, x2: ((x1)**2 + (x2)**2 < 0.3**2) | ((x1)**2 + (x2)**2 > 0.5**2) |
    decision_boundary  = [db5b, db1c, db4a] # [db5b, db1c, db4a, db8, db6, db7]

    """
    Data Generation
    """

    # # # Training Points
    # shrink = 0.8
    # test_range_min *= shrink
    # test_range_max *= shrink
    # X1 = np.random.normal(loc = np.array([test_range_min, test_range_min]), scale = 0.9*np.ones(n_dims), size = (int(n_train/8), n_dims))
    # X2 = np.random.normal(loc = np.array([test_range_min, test_range_max]), scale = 0.9*np.ones(n_dims), size = (int(n_train/8), n_dims))
    # X3 = np.random.normal(loc = np.array([test_range_max, test_range_min]), scale = 0.9*np.ones(n_dims), size = (int(n_train/8), n_dims))
    # X4 = np.random.normal(loc = np.array([test_range_max, test_range_max]), scale = 0.9*np.ones(n_dims), size = (int(n_train/8), n_dims))
    # X5 = np.random.normal(loc = np.array([0, test_range_min]), scale = 0.9*np.ones(n_dims), size = (int(n_train/8), n_dims))
    # X6 = np.random.normal(loc = np.array([test_range_min, 0]), scale = 0.9*np.ones(n_dims), size = (int(n_train/8), n_dims))
    # X7 = np.random.normal(loc = np.array([test_range_max, 0]), scale = 0.9*np.ones(n_dims), size = (int(n_train/8), n_dims))
    # X8 = np.random.normal(loc = np.array([0, test_range_max]), scale = 0.9*np.ones(n_dims), size = (int(n_train/8), n_dims))
    # test_range_min /= shrink
    # test_range_max /= shrink

    # X = np.concatenate((X1, X2, X3, X4, X5, X6, X7, X8), axis = 0)

    # X = np.random.uniform(test_range_min, test_range_max, 
    #     size = (n_train, n_dims))

    X_s = np.array([[0.0, 0.0], [-0.2, 0.3], [-0.1, -0.1], [0.05, 0.25], [-1.1, 0.0], [-0.5, 0.0], [-0.4, -0.7], [-0.1, -0.1], [test_range_min, test_range_min], [test_range_min, test_range_max], [test_range_max, test_range_max], [test_range_max, test_range_min]])
    X_f = np.array([[1.4, 1.6], [1.8, 1.2], [-1.24, 1.72], [-1.56, -1.9], [-1.9, 1.0], [-0.5, -1.2], [-1.4, -1.9], [0.4, -1.2], [test_range_min, test_range_max], [test_range_max, test_range_max], [test_range_max, test_range_min], [test_range_min, test_range_min]])
    n_track = 25
    X_s = np.random.uniform(test_range_min, test_range_max, size = (n_track, n_dims))
    X_f = test_range_max * np.random.standard_cauchy(n_track * n_dims).reshape(n_track, n_dims)
    X_f = np.random.uniform(test_range_min, test_range_max, size = (n_track, n_dims))
    X = generate_line_paths(X_s, X_f, n_points = 15)
    x1 = X[:, 0]
    x2 = X[:, 1]
    
    # Query Points
    Xq = np.random.uniform(test_range_min, test_range_max, 
        size = (n_query, n_dims))
    xq1 = Xq[:, 0]
    xq2 = Xq[:, 1]

    n_train = X.shape[0]
    n_query = Xq.shape[0]
    logging.info('Training Points: %d' % n_train)
    # Training Labels
    y = gp.classifier.utils.make_decision(X, decision_boundary)
    y_unique = np.unique(y)
    assert y_unique.dtype == int

    if y_unique.shape[0] == 2:
        mycmap = cm.get_cmap(name = 'bone', lut = None)
        mycmap2 = cm.get_cmap(name = 'BrBG', lut = None)
    else:
        mycmap = cm.get_cmap(name = 'gist_rainbow', lut = None)
        mycmap2 = cm.get_cmap(name = 'gist_rainbow', lut = None)
    """
    Classifier Training
    """

    # Training
    fig = plt.figure()
    gp.classifier.utils.visualise_decision_boundary(
        test_range_min, test_range_max, decision_boundary)
    
    plt.scatter(x1, x2, c = y, marker = 'x', cmap = mycmap)
    plt.title('Training Labels')
    plt.xlabel('x1')
    plt.ylabel('x2')
    cbar = plt.colorbar()
    cbar.set_ticks(y_unique)
    cbar.set_ticklabels(y_unique)
    plt.xlim((test_range_min, test_range_max))
    plt.ylim((test_range_min, test_range_max))
    plt.gca().patch.set_facecolor('gray')
    print('Plotted Training Set')

    plt.show()

    # Training
    print('===Begin Classifier Training===')
    optimiser_config = gp.OptConfig()
    optimiser_config.sigma = gp.auto_range(kerneldef)
    optimiser_config.walltime = walltime

    # User can choose to batch start each binary classifier with different
    # initial hyperparameters for faster training
    if batch_start:
        if y_unique.shape[0] == 2:
            initial_hyperparams = [100, 0.1, 0.1]
        elif multimethod == 'OVA':
            initial_hyperparams = [  [356.468, 0.762, 0.530], \
                                     [356.556, 0.836, 0.763], \
                                     [472.006, 1.648, 1.550], \
                                     [239.720, 1.307, 0.721] ]
        elif multimethod == 'AVA':
            initial_hyperparams = [ [14.9670, 0.547, 0.402],  \
                                    [251.979, 1.583, 1.318], \
                                    [420.376, 1.452, 0.750], \
                                    [780.641, 1.397, 1.682], \
                                    [490.353, 2.299, 1.526], \
                                    [73.999, 1.584, 0.954]]
        else:
            raise ValueError
        batch_config = gp.batch_start(optimiser_config, initial_hyperparams)
    else:
        batch_config = optimiser_config

    # Obtain the response function
    responsefunction = gp.classifier.responses.get(responsename)

    # Train the classifier!
    learned_classifier = gp.classifier.learn(X, y, kerneldef,
        responsefunction, batch_config, 
        multimethod = multimethod, approxmethod = approxmethod,
        train = True, ftol = 1e-6, processes = n_cores)

    # Print learned kernels
    print_function = gp.describer(kerneldef)
    gp.classifier.utils.print_learned_kernels(print_function, 
                                            learned_classifier, y_unique)

    # Print the matrix of learned classifier hyperparameters
    logging.info('Matrix of learned hyperparameters')
    gp.classifier.utils.print_hyperparam_matrix(learned_classifier)
    
    """
    Classifier Prediction
    """
    # Prediction
    yq_prob = gp.classifier.predict(Xq, learned_classifier, 
        fusemethod = fusemethod)
    yq_pred = gp.classifier.classify(yq_prob, y)
    yq_entropy = gp.classifier.entropy(yq_prob)

    logging.info('Caching Predictor...')
    predictors = gp.classifier.query(learned_classifier, Xq)
    logging.info('Computing Expectance...')
    yq_exp_list = gp.classifier.expectance(learned_classifier, predictors)
    logging.info('Computing Covariance...')
    yq_cov_list = gp.classifier.covariance(learned_classifier, predictors)
    logging.info('Drawing from GP...')
    yq_draws = gp.classifier.draws(n_draws, yq_exp_list, yq_cov_list, 
        learned_classifier)
    logging.info('Computing Linearised Entropy...')
    yq_linearised_entropy = gp.classifier.linearised_entropy(
        yq_exp_list, yq_cov_list, learned_classifier)
    logging.info('Linearised Entropy is {0}'.format(yq_linearised_entropy))

    """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
                        THE GAP BETWEEN ANALYSIS AND PLOTS
    """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""







    """
    Classifier Prediction Results (Plots)
    """

    logging.info('Plotting... please wait')

    Xq_plt = gp.classifier.utils.query_map(test_ranges, n_points = 250)
    yq_truth_plt = gp.classifier.utils.make_decision(Xq_plt, decision_boundary)

    """
    Plot: Ground Truth
    """

    # Training
    fig = plt.figure(figsize = (15, 15))
    gp.classifier.utils.visualise_map(yq_truth_plt, test_ranges, cmap = mycmap)
    plt.title('Ground Truth')
    plt.xlabel('x1')
    plt.ylabel('x2')
    cbar = plt.colorbar()
    cbar.set_ticks(y_unique)
    cbar.set_ticklabels(y_unique)
    gp.classifier.utils.visualise_decision_boundary(
        test_range_min, test_range_max, decision_boundary)
    logging.info('Plotted Ground Truth')

    """
    Plot: Training Set
    """

    # Training
    fig = plt.figure(figsize = (15, 15))
    gp.classifier.utils.visualise_decision_boundary(
        test_range_min, test_range_max, decision_boundary)
    
    plt.scatter(x1, x2, c = y, marker = 'x', cmap = mycmap)
    plt.title('Training Labels')
    plt.xlabel('x1')
    plt.ylabel('x2')
    cbar = plt.colorbar()
    cbar.set_ticks(y_unique)
    cbar.set_ticklabels(y_unique)
    plt.xlim((test_range_min, test_range_max))
    plt.ylim((test_range_min, test_range_max))
    plt.gca().patch.set_facecolor('gray')
    logging.info('Plotted Training Set')

    """
    Plot: Query Computations
    """

    # Compute Linearised and True Entropy for plotting
    logging.info('Plot: Caching Predictor...')
    predictor_plt = gp.classifier.query(learned_classifier, Xq_plt)
    logging.info('Plot: Computing Expectance...')
    expectance_latent_plt = \
        gp.classifier.expectance(learned_classifier, predictor_plt)
    logging.info('Plot: Computing Variance...')
    variance_latent_plt = \
        gp.classifier.variance(learned_classifier, predictor_plt)
    logging.info('Plot: Computing Linearised Entropy...')
    entropy_linearised_plt = gp.classifier.linearised_entropy(
        expectance_latent_plt, variance_latent_plt, learned_classifier)
    logging.info('Plot: Computing Equivalent Standard Deviation')
    eq_sd_plt = gp.classifier.equivalent_standard_deviation(
        entropy_linearised_plt)
    logging.info('Plot: Computing Prediction Probabilities...')
    yq_prob_plt = gp.classifier.predict_from_latent(
        expectance_latent_plt, variance_latent_plt, learned_classifier, 
        fusemethod = fusemethod)
    logging.info('Plot: Computing True Entropy...')
    yq_entropy_plt = gp.classifier.entropy(yq_prob_plt)
    logging.info('Plot: Computing Class Predictions...')
    yq_pred_plt = gp.classifier.classify(yq_prob_plt, y_unique)

    if isinstance(learned_classifier, list):
        logging.info('Plot: Computing Naive Linearised Entropy...')
        args = [(expectance_latent_plt[i], variance_latent_plt[i], 
            learned_classifier[i]) for i in range(len(learned_classifier))]
        entropy_linearised_naive_plt = \
            np.array(parmap.starmap(gp.classifier.linearised_entropy, 
                args)).sum(axis = 0)


    Xq_meas = gp.classifier.utils.query_map(test_ranges, n_points = 10)

    predictor_meas = gp.classifier.query(learned_classifier, Xq_meas)
    exp_meas = gp.classifier.expectance(learned_classifier, predictor_meas)
    cov_meas = gp.classifier.covariance(learned_classifier, predictor_meas)

    logging.info('Objective Measure: Computing Joint Linearised Entropy...')
    entropy_linearised_meas = gp.classifier.linearised_entropy(
        exp_meas, cov_meas, learned_classifier)
    logging.info('Objective Measure: Computing Monte Carlo Joint Entropy...')

    # start_time = time.clock()
    # entropy_monte_carlo_meas = gp.classifier.monte_carlo_joint_entropy(exp_meas, cov_meas, learned_classifier, n_draws = n_draws_est)
    # logging.info('Sampling took %.4f seconds' % (time.clock() - start_time))

    entropy_linearised_mean_meas = entropy_linearised_plt.mean()
    entropy_true_mean_meas = yq_entropy_plt.mean()

    mistake_ratio = (yq_truth_plt - yq_pred_plt).nonzero()[0].shape[0] / yq_truth_plt.shape[0]

    if isinstance(learned_classifier, list) & False:

        """
        Plot: Latent Function Expectance
        """

        for i in range(len(expectance_latent_plt)):
            fig = plt.figure(figsize = (15, 15))
            gp.classifier.utils.visualise_map(
                expectance_latent_plt[i], test_ranges, 
                levels = [0.0], 
                vmin = -np.max(np.abs(expectance_latent_plt[i])), 
                vmax = np.max(np.abs(expectance_latent_plt[i])), 
                cmap = cm.coolwarm)
            plt.title('Latent Function Expectance %s'
                % gp.classifier.utils.binary_classifier_name(
                    learned_classifier[i], y_unique))
            plt.xlabel('x1')
            plt.ylabel('x2')
            plt.colorbar()
            plt.scatter(x1, x2, c = y, marker = 'x', cmap = mycmap)
            plt.xlim((test_range_min, test_range_max))
            plt.ylim((test_range_min, test_range_max))
            logging.info('Plotted Latent Function Expectance on Training Set')

        """
        Plot: Latent Function Variance
        """

        for i in range(len(variance_latent_plt)):
            fig = plt.figure(figsize = (15, 15))
            gp.classifier.utils.visualise_map(
                variance_latent_plt[i], test_ranges, 
                cmap = cm.coolwarm)
            plt.title('Latent Function Variance %s'
                % gp.classifier.utils.binary_classifier_name(
                    learned_classifier[i], y_unique))
            plt.xlabel('x1')
            plt.ylabel('x2')
            plt.colorbar()
            plt.scatter(x1, x2, c = y, marker = 'x', cmap = mycmap)
            plt.xlim((test_range_min, test_range_max))
            plt.ylim((test_range_min, test_range_max))
            logging.info('Plotted Latent Function Variance on Training Set')

        """
        Plot: Prediction Probabilities
        """

        for i in range(len(yq_prob_plt)):
            fig = plt.figure(figsize = (15, 15))
            gp.classifier.utils.visualise_map(yq_prob_plt[i], test_ranges, 
                levels = [0.5], cmap = cm.coolwarm)
            plt.title('Prediction Probabilities (Class %d)' % y_unique[i])
            plt.xlabel('x1')
            plt.ylabel('x2')
            plt.colorbar()
            plt.scatter(x1, x2, c = y, marker = 'x', cmap = mycmap)
            plt.xlim((test_range_min, test_range_max))
            plt.ylim((test_range_min, test_range_max))
            logging.info('Plotted Prediction Probabilities on Training Set')

    """
    Plot: Prediction Labels
    """

    # Query (Prediction Map)
    fig = plt.figure(figsize = (15, 15))
    gp.classifier.utils.visualise_map(yq_pred_plt, test_ranges, 
        boundaries = True, cmap = mycmap)
    plt.title('Prediction [Miss Ratio: %.3f %s]' % (100 * mistake_ratio, '%'))
    plt.xlabel('x1')
    plt.ylabel('x2')
    cbar = plt.colorbar()
    cbar.set_ticks(y_unique)
    cbar.set_ticklabels(y_unique)
    logging.info('Plotted Prediction Labels')

    """
    Plot: Prediction Entropy onto Training Set
    """

    # Query (Prediction Entropy)
    fig = plt.figure(figsize = (15, 15))
    gp.classifier.utils.visualise_map(yq_entropy_plt, test_ranges, 
        threshold = entropy_threshold, cmap = cm.coolwarm)
    plt.title('Prediction Entropy [ACE = %.4f]' 
            % (entropy_true_mean_meas))
    plt.xlabel('x1')
    plt.ylabel('x2')
    plt.colorbar()
    plt.scatter(x1, x2, c = y, marker = 'x', cmap = mycmap)
    plt.xlim((test_range_min, test_range_max))
    plt.ylim((test_range_min, test_range_max))
    logging.info('Plotted Prediction Entropy on Training Set')

    """
    Plot: Linearised Prediction Entropy onto Training Set
    """

    # Query (Linearised Entropy)
    fig = plt.figure(figsize = (15, 15))
    gp.classifier.utils.visualise_map(entropy_linearised_plt, test_ranges, 
        threshold = entropy_threshold, cmap = cm.coolwarm)
    plt.title('Linearised Prediction Entropy [FLE = %.4f, ALE = %.4f]' 
            % (entropy_linearised_meas, entropy_linearised_mean_meas))
    plt.xlabel('x1')
    plt.ylabel('x2')
    plt.colorbar()
    plt.scatter(x1, x2, c = y, marker = 'x', cmap = mycmap)
    plt.xlim((test_range_min, test_range_max))
    plt.ylim((test_range_min, test_range_max))
    logging.info('Plotted Linearised Prediction Entropy on Training Set')

    """
    Plot: Exponentiated Linearised Prediction Entropy onto Training Set
    """

    # Query (Linearised Entropy)
    fig = plt.figure(figsize = (15, 15))
    gp.classifier.utils.visualise_map(eq_sd_plt, test_ranges, 
        threshold = entropy_threshold, cmap = cm.coolwarm)
    plt.title('Equivalent standard deviation')
    plt.xlabel('x1')
    plt.ylabel('x2')
    plt.colorbar()
    plt.scatter(x1, x2, c = y, marker = 'x', cmap = mycmap)
    plt.xlim((test_range_min, test_range_max))
    plt.ylim((test_range_min, test_range_max))
    logging.info('Plotted Exponentiated Linearised Prediction Entropy (Equivalent Standard Deviation) on Training Set')

    """
    Plot: Naive Linearised Prediction Entropy onto Training Set
    """

    if isinstance(learned_classifier, list):
        # Query (Naive Linearised Entropy)
        fig = plt.figure(figsize = (15, 15))
        gp.classifier.utils.visualise_map(
            entropy_linearised_naive_plt, test_ranges, 
            threshold = entropy_threshold, cmap = cm.coolwarm)
        plt.title('Naive Linearised Prediction Entropy')
        plt.xlabel('x1')
        plt.ylabel('x2')
        plt.colorbar()
        plt.scatter(x1, x2, c = y, marker = 'x', cmap = mycmap)
        plt.xlim((test_range_min, test_range_max))
        plt.ylim((test_range_min, test_range_max))
        logging.info('Plotted Naive Linearised Prediction Entropy on Training Set')

    """
    Plot: Sample Query Predictions
    """  

    # Visualise Predictions
    fig = plt.figure(figsize = (15, 15))
    gp.classifier.utils.visualise_decision_boundary(
        test_range_min, test_range_max, decision_boundary)
    plt.scatter(xq1, xq2, c = yq_pred, marker = 'x', cmap = mycmap)
    plt.title('Predicted Query Labels')
    plt.xlabel('x1')
    plt.ylabel('x2')
    cbar = plt.colorbar()
    cbar.set_ticks(y_unique)
    cbar.set_ticklabels(y_unique)
    plt.xlim((test_range_min, test_range_max))
    plt.ylim((test_range_min, test_range_max))
    plt.gca().patch.set_facecolor('gray')
    logging.info('Plotted Sample Query Labels')

    """
    Plot: Sample Query Draws
    """  

    # Visualise Predictions
    fig = plt.figure(figsize = (19.2, 10.8))
    for i in range(n_draws):
        plt.subplot(rows_subplot, cols_subplot, i + 1)
        gp.classifier.utils.visualise_decision_boundary(
            test_range_min, test_range_max, decision_boundary)
        plt.scatter(xq1, xq2, c = yq_draws[i], marker = 'x', cmap = mycmap)
        plt.title('Query Label Draws')
        plt.xlabel('x1')
        plt.ylabel('x2')
        cbar = plt.colorbar()
        cbar.set_ticks(y_unique)
        cbar.set_ticklabels(y_unique)
        plt.xlim((test_range_min, test_range_max))
        plt.ylim((test_range_min, test_range_max))
        plt.gca().patch.set_facecolor('gray')
        logging.info('Plotted Sample Query Draws')

    """
    Save Outputs
    """  

    # Save all the figures
    if SAVE_OUTPUTS:
        save_directory = "response_%s_approxmethod_%s" \
        "_training_%d_query_%d_walltime_%d" \
        "_method_%s_fusemethod_%s/" \
            % ( responsename, approxmethod, 
                n_train, n_query, walltime, 
                multimethod, fusemethod)
        full_directory = gp.classifier.utils.create_directories(
            save_directory, home_directory = 'Figures/', append_time = True)
        gp.classifier.utils.save_all_figures(full_directory)
        shutil.copy2('./receding_horizon_path_planning.py', full_directory)

    logging.info('Modeling Done')

    """
    Path Planning
    """

    """ Setup Path Planning """
    xq_now = np.array([[0., 0.]])
    horizon = (test_range_max - test_range_min) + 0.5
    n_steps = 30

    theta_bound = np.deg2rad(30)
    theta_add_init = -np.deg2rad(10) * np.ones(n_steps)
    theta_add_init[0] = np.deg2rad(180)
    theta_add_low = -theta_bound * np.ones(n_steps)
    theta_add_high = theta_bound * np.ones(n_steps)
    theta_add_low[0] = 0.0
    theta_add_high[0] = 2 * np.pi
    r = horizon/n_steps
    choice_walltime = 1500.0
    xtol_rel = 1e-2
    ftol_rel = 1e-4

    k_step = 1

    """ Initialise Values """

    # The observed data till now
    X_now = X.copy()
    y_now = y.copy()

    # Observe the current location
    yq_now = gp.classifier.utils.make_decision(xq_now[[-1]], 
        decision_boundary)

    # Add the observed data to the training set
    X_now = np.concatenate((X_now, xq_now[[-1]]), axis = 0)
    y_now = np.append(y_now, yq_now)

    # Add the new location to the array of travelled coordinates
    xq1_nows = xq_now[:, 0]
    xq2_nows = xq_now[:, 1]
    yq_nows = yq_now.copy()

    # Plot the current situation
    fig1 = plt.figure(figsize = (15, 15))
    fig2 = plt.figure(figsize = (15, 15))
    fig3 = plt.figure(figsize = (15, 15))
    fig4 = plt.figure(figsize = (15, 15))
    fig5 = plt.figure(figsize = (20, 20))

    # Start exploring
    i_trials = 0
    n_trials = 2000
    entropy_linearised_array = np.nan * np.ones(n_trials)
    # entropy_monte_carlo_array = np.nan * np.ones(n_trials)
    entropy_linearised_mean_array = np.nan * np.ones(n_trials)
    entropy_true_mean_array = np.nan * np.ones(n_trials)
    entropy_opt_array = np.nan * np.ones(n_trials)
    mistake_ratio_array = np.nan * np.ones(n_trials)
    m_step = 0
    while i_trials < n_trials:

        """ Path Planning """

        print(m_step)
        print(k_step)
        if m_step <= k_step:
            # Propose a place to observe
            xq_abs_opt, theta_add_opt, entropy_opt = \
                go_optimised_path(theta_add_init, xq_now[-1], r, 
                    learned_classifier, test_ranges,
                    theta_add_low = theta_add_low, theta_add_high = theta_add_high, 
                    walltime = choice_walltime, xtol_rel = xtol_rel, 
                    ftol_rel = ftol_rel, globalopt = False, objective = 'LE',
                    n_draws = n_draws_est)
            logging.info('Optimal Joint Entropy: %.5f' % entropy_opt)

            m_step = keep_going_until_surprise(xq_abs_opt, learned_classifier, 
                decision_boundary)
            logging.info('Taking %d steps' % m_step)
        else:
            m_step -= 1
            theta_add_opt = theta_add_init.copy()
            xq_abs_opt = forward_path_model(theta_add_init, r, xq_now[-1])
            logging.info('%d steps left' % m_step)

        xq_now = xq_abs_opt[:k_step]

        theta_add_init = initiate_with_continuity(theta_add_opt, 
            k_step = k_step)
        np.clip(theta_add_init, theta_add_low + 1e-4, theta_add_high - 1e-4, 
            out = theta_add_init)

        # Observe the current location
        yq_now = gp.classifier.utils.make_decision(xq_now, 
            decision_boundary)

        # Add the observed data to the training set
        X_now = np.concatenate((X_now, xq_now), axis = 0)
        y_now = np.append(y_now, yq_now)

        # Add the new location to the array of travelled coordinates
        xq1_nows = np.append(xq1_nows, xq_now[:, 0])
        xq2_nows = np.append(xq2_nows, xq_now[:, 1])
        yq_nows = np.append(yq_nows, yq_now)

        # Update that into the model
        logging.info('Learning Classifier...')
        batch_config = \
            gp.classifier.batch_start(optimiser_config, learned_classifier)
        try:
            learned_classifier = gp.classifier.learn(X_now, y_now, kerneldef,
                responsefunction, batch_config, 
                multimethod = multimethod, approxmethod = approxmethod,
                train = True, ftol = 1e-6, processes = n_cores)
        except Exception as e:
            logging.warning(e)
            try:
                learned_classifier = gp.classifier.learn(X_now, y_now, kerneldef,
                    responsefunction, batch_config, 
                    multimethod = multimethod, approxmethod = approxmethod,
                    train = False, ftol = 1e-6, processes = n_cores)
            except Exception as e:
                logging.warning(e)
                pass    
        logging.info('Finished Learning')

        # This is the finite horizon optimal route
        xq1_proposed = xq_abs_opt[:, 0][k_step:]
        xq2_proposed = xq_abs_opt[:, 1][k_step:]
        yq_proposed = gp.classifier.classify(gp.classifier.predict(xq_abs_opt, 
            learned_classifier), y_unique)[k_step:]

        """ Computing Analysis Maps """

        # Compute Linearised and True Entropy for plotting
        logging.info('Plot: Caching Predictor...')
        predictor_plt = gp.classifier.query(learned_classifier, Xq_plt)
        logging.info('Plot: Computing Expectance...')
        expectance_latent_plt = \
            gp.classifier.expectance(learned_classifier, predictor_plt)
        logging.info('Plot: Computing Variance...')
        variance_latent_plt = \
            gp.classifier.variance(learned_classifier, predictor_plt)
        logging.info('Plot: Computing Linearised Entropy...')
        entropy_linearised_plt = gp.classifier.linearised_entropy(
            expectance_latent_plt, variance_latent_plt, learned_classifier)
        logging.info('Plot: Computing Equivalent Standard Deviation')
        eq_sd_plt = gp.classifier.equivalent_standard_deviation(
            entropy_linearised_plt)
        logging.info('Plot: Computing Prediction Probabilities...')
        yq_prob_plt = gp.classifier.predict_from_latent(
            expectance_latent_plt, variance_latent_plt, learned_classifier, 
            fusemethod = fusemethod)
        logging.info('Plot: Computing True Entropy...')
        yq_entropy_plt = gp.classifier.entropy(yq_prob_plt)
        logging.info('Plot: Computing Class Predictions...')
        yq_pred_plt = gp.classifier.classify(yq_prob_plt, y_unique)

        
        predictor_meas = gp.classifier.query(learned_classifier, Xq_meas)
        exp_meas = gp.classifier.expectance(learned_classifier, predictor_meas)
        cov_meas = gp.classifier.covariance(learned_classifier, predictor_meas)

        logging.info('Objective Measure: Computing Linearised Joint Entropy')
        start_time = time.clock()
        entropy_linearised_meas = gp.classifier.linearised_entropy(
            exp_meas, cov_meas, learned_classifier)
        logging.info('Computation took %.4f seconds' % (time.clock() - start_time))
        logging.info('Linearised Joint Entropy: %.4f' % entropy_linearised_meas)
        # logging.info('Objective Measure: Computing Monte Carlo Joint Entropy...')
        # start_time = time.clock()
        # entropy_monte_carlo_meas = gp.classifier.monte_carlo_joint_entropy(exp_meas, cov_meas, learned_classifier, n_draws = n_draws_est)
        # logging.info('Computation took %.4f seconds' % (time.clock() - start_time))
        # logging.info('Monte Carlo Joint Entropy: %.4f' % entropy_monte_carlo_meas)

        entropy_linearised_mean_meas = entropy_linearised_plt.mean()
        entropy_true_mean_meas = yq_entropy_plt.mean()

        mistake_ratio = (yq_truth_plt - yq_pred_plt).nonzero()[0].shape[0] / yq_truth_plt.shape[0]

        entropy_linearised_array[i_trials] = entropy_linearised_meas
        # entropy_monte_carlo_array[i_trials] = entropy_monte_carlo_meas
        entropy_linearised_mean_array[i_trials] = entropy_linearised_mean_meas
        entropy_true_mean_array[i_trials] = entropy_true_mean_meas
        entropy_opt_array[i_trials] = entropy_opt
        mistake_ratio_array[i_trials] = mistake_ratio
        

        # Find the bounds of the entropy predictions
        vmin1 = entropy_linearised_plt.min()
        vmax1 = entropy_linearised_plt.max()
        vmin2 = yq_entropy_plt.min()
        vmax2 = yq_entropy_plt.max()
        vmin3 = eq_sd_plt.min()
        vmax3 = eq_sd_plt.max()

        """ Linearised Entropy Map """

        # Prepare Figure 1
        plt.figure(fig1.number)
        plt.clf()
        plt.title('Linearised entropy [horizon = %.2f, FLE = %.2f, ALE = %.2f, ACE = %.2f, TPE = %.2f]' 
            % (horizon, entropy_linearised_meas, entropy_linearised_mean_meas, entropy_true_mean_meas, entropy_opt))
        plt.xlabel('x1')
        plt.ylabel('x2')
        plt.xlim((test_range_min, test_range_max))
        plt.ylim((test_range_min, test_range_max))

        # Plot linearised entropy
        gp.classifier.utils.visualise_map(entropy_linearised_plt, test_ranges, 
            cmap = cm.coolwarm, vmin = vmin1, vmax = vmax1)
        plt.colorbar()

        # Plot training set on top
        plt.scatter(x1, x2, c = y, s = 40, marker = 'x', cmap = mycmap)

        # Plot the path on top
        plt.scatter(xq1_nows, xq2_nows, c = yq_nows, s = 60, 
            facecolors = 'none',
            vmin = y_unique[0], vmax = y_unique[-1], cmap = mycmap)
        plt.plot(xq1_nows, xq2_nows, c = 'w')
        plt.scatter(xq_now[:, 0], xq_now[:, 1], c = yq_now, s = 120, 
            vmin = y_unique[0], vmax = y_unique[-1], 
            cmap = mycmap)

        # Plot the proposed path
        plt.scatter(xq1_proposed, xq2_proposed, c = yq_proposed, 
            s = 60, marker = 'D', 
            vmin = y_unique[0], vmax = y_unique[-1], cmap = mycmap)
        plt.plot(xq1_proposed, xq2_proposed, c = 'w')

        # Plot the horizon
        gp.classifier.utils.plot_circle(xq_now[-1], horizon, c = 'k', 
            marker = '.')

        # Save the plot
        plt.gca().set_aspect('equal', adjustable = 'box')
        plt.savefig('%sentropy_linearised_step%d.png' 
            % (full_directory, i_trials + 1))

        """ Equivalent Standard Deviation Map """

        # Prepare Figure 2
        plt.figure(fig2.number)
        plt.clf()
        plt.title('Equivalent SD [horizon = %.2f, FLE = %.2f, ALE = %.2f, ACE = %.2f, TPE = %.2f]' 
            % (horizon, entropy_linearised_meas, entropy_linearised_mean_meas, entropy_true_mean_meas, entropy_opt))
        plt.xlabel('x1')
        plt.ylabel('x2')
        plt.xlim((test_range_min, test_range_max))
        plt.ylim((test_range_min, test_range_max))

        # Plot linearised entropy
        gp.classifier.utils.visualise_map(eq_sd_plt, test_ranges, 
            cmap = cm.coolwarm, vmin = vmin3, vmax = vmax3)
        plt.colorbar()

        # Plot training set on top
        plt.scatter(x1, x2, c = y, s = 40, marker = 'x', cmap = mycmap)

        # Plot the path on top
        plt.scatter(xq1_nows, xq2_nows, c = yq_nows, s = 60, 
            facecolors = 'none',
            vmin = y_unique[0], vmax = y_unique[-1], cmap = mycmap)
        plt.plot(xq1_nows, xq2_nows, c = 'w')
        plt.scatter(xq_now[:, 0], xq_now[:, 1], c = yq_now, s = 120, 
            vmin = y_unique[0], vmax = y_unique[-1], 
            cmap = mycmap)

        # Plot the proposed path
        plt.scatter(xq1_proposed, xq2_proposed, c = yq_proposed, 
            s = 60, marker = 'D', 
            vmin = y_unique[0], vmax = y_unique[-1], cmap = mycmap)
        plt.plot(xq1_proposed, xq2_proposed, c = 'w')

        # Plot the horizon
        gp.classifier.utils.plot_circle(xq_now[-1], horizon, c = 'k', 
            marker = '.')

        # Save the plot
        plt.gca().set_aspect('equal', adjustable = 'box')
        plt.savefig('%seq_sd_step%d.png' 
            % (full_directory, i_trials + 1))

        """ True Entropy Map """

        # Prepare Figure 3
        plt.figure(fig3.number)
        plt.clf()
        plt.title('True entropy [horizon = %.2f, FLE = %.2f, ALE = %.2f, ACE = %.2f, TPE = %.2f]' 
            % (horizon, entropy_linearised_meas, entropy_linearised_mean_meas, entropy_true_mean_meas, entropy_opt))
        plt.xlabel('x1')
        plt.ylabel('x2')
        plt.xlim((test_range_min, test_range_max))
        plt.ylim((test_range_min, test_range_max))

        # Plot true entropy
        gp.classifier.utils.visualise_map(yq_entropy_plt, test_ranges, 
            cmap = cm.coolwarm, vmin = vmin2, vmax = vmax2)
        plt.colorbar()

        # Plot training set on top
        plt.scatter(x1, x2, c = y, s = 40, marker = 'x', cmap = mycmap)

        # Plot the path on top
        plt.scatter(xq1_nows, xq2_nows, c = yq_nows, s = 60, 
            facecolors = 'none',
            vmin = y_unique[0], vmax = y_unique[-1], cmap = mycmap)
        plt.plot(xq1_nows, xq2_nows, c = 'w')
        plt.scatter(xq_now[:, 0], xq_now[:, 1], c = yq_now, s = 120, 
            vmin = y_unique[0], vmax = y_unique[-1], 
            cmap = mycmap)

        # Plot the proposed path
        plt.scatter(xq1_proposed, xq2_proposed, c = yq_proposed, 
            s = 60, marker = 'D', 
            vmin = y_unique[0], vmax = y_unique[-1], cmap = mycmap)
        plt.plot(xq1_proposed, xq2_proposed, c = 'w')

        # Plot the horizon
        gp.classifier.utils.plot_circle(xq_now[-1], horizon, c = 'k', 
            marker = '.')

        # Save the plot
        plt.gca().set_aspect('equal', adjustable = 'box')
        plt.savefig('%sentropy_true_step%d.png' 
            % (full_directory, i_trials + 1))

        """ Class Prediction Map """

        # Prepare Figure 4
        plt.figure(fig4.number)
        plt.clf()
        plt.title('Class predictions [Miss Ratio: %.3f %s]' % (100 * mistake_ratio, '%'))
        plt.xlabel('x1')
        plt.ylabel('x2')
        plt.xlim((test_range_min, test_range_max))
        plt.ylim((test_range_min, test_range_max))

        # Plot class predictions
        gp.classifier.utils.visualise_map(yq_pred_plt, test_ranges, 
            boundaries = True, cmap = mycmap2, vmin = y_unique[0], vmax = y_unique[-1])
        cbar = plt.colorbar()
        cbar.set_ticks(y_unique)
        cbar.set_ticklabels(y_unique)


        # Plot training set on top
        plt.scatter(x1, x2, c = y, s = 40, marker = 'v', cmap = mycmap)

        # Plot the path on top
        plt.scatter(xq1_nows, xq2_nows, c = yq_nows, s = 60, marker = 'o', 
            vmin = y_unique[0], vmax = y_unique[-1], cmap = mycmap)
        plt.plot(xq1_nows, xq2_nows, c = 'w')
        plt.scatter(xq_now[:, 0], xq_now[:, 1], c = yq_now, s = 120, 
            vmin = y_unique[0], vmax = y_unique[-1], 
            cmap = mycmap)

        # Plot the proposed path
        plt.scatter(xq1_proposed, xq2_proposed, c = yq_proposed, 
            s = 60, marker = 'D', 
            vmin = y_unique[0], vmax = y_unique[-1], cmap = mycmap)
        plt.plot(xq1_proposed, xq2_proposed, c = 'w')

        # Plot the horizon
        gp.classifier.utils.plot_circle(xq_now[-1], horizon, c = 'k', 
            marker = '.')

        # Save the plot
        plt.gca().set_aspect('equal', adjustable = 'box')
        plt.savefig('%sclass_prediction_step%d.png' 
            % (full_directory, i_trials + 1))

        # Prepare Figure 5
        plt.figure(fig5.number)
        plt.clf()

        plt.subplot(5, 1, 1)
        plt.plot(np.arange(i_trials + 1), entropy_linearised_array[:(i_trials + 1)])
        plt.title('Field Linearised Entropy')
        plt.ylabel('Field Linearised Entropy')

        # plt.subplot(6, 1, 2)
        # plt.plot(np.arange(i_trials + 1), entropy_monte_carlo_array[:(i_trials + 1)])
        # plt.title('Field True Entropy through Monte Carlo')
        # plt.ylabel('Field True Entropy')

        plt.subplot(5, 1, 2)
        plt.plot(np.arange(i_trials + 1), entropy_linearised_mean_array[:(i_trials + 1)])
        plt.title('Average Linearised Entropy')
        plt.ylabel('Average Linearised Entropy')

        plt.subplot(5, 1, 3)
        plt.plot(np.arange(i_trials + 1), entropy_true_mean_array[:(i_trials + 1)])
        plt.title('Average True Entropy')
        plt.ylabel('Average True Entropy')

        plt.subplot(5, 1, 4)
        plt.plot(np.arange(i_trials + 1), entropy_opt_array[:(i_trials + 1)])
        plt.title('Joint entropy of path chosen each iteration')
        plt.ylabel('Joint Entropy')

        plt.subplot(5, 1, 5)
        plt.gca().get_xaxis().get_major_formatter().set_useOffset(False)
        plt.plot(np.arange(i_trials + 1), 100 * mistake_ratio_array[:(i_trials + 1)])
        plt.title('Prediction Miss Ratio')
        plt.ylabel('Prediction Miss Ratio (%)')
        
        plt.xlabel('Steps')
        # Save the plot
        plt.savefig('%sentropy_history%d.png' 
            % (full_directory, i_trials + 1))
        logging.info('Plotted and Saved Iteration')

        # Move on to the next step
        i_trials += 1

        # Save the learned classifier
        if i_trials % 50 == 0:
            np.savez('%slearned_classifier_trial%d.npz'
                % (full_directory, i_trials), 
                learned_classifier = learned_classifier)
            np.savez('%sentropy_linearised_array_trial%d.npz'
                % (full_directory, i_trials), 
                entropy_linearised_array = entropy_linearised_array)
            np.savez('%sentropy_linearised_mean_array_trial%d.npz'
                % (full_directory, i_trials), 
                entropy_linearised_mean_array = entropy_linearised_mean_array)
            np.savez('%sentropy_true_mean_array_trial%d.npz'
                % (full_directory, i_trials), 
                entropy_true_mean_array = entropy_true_mean_array)
            np.savez('%sentropy_opt_array_trial%d.npz'
                % (full_directory, i_trials), 
                entropy_opt_array = entropy_opt_array)
            np.savez('%smistake_ratio_array_trial%d.npz'
                % (full_directory, i_trials), 
                mistake_ratio_array = mistake_ratio_array)
    # When finished, save the learned classifier
    np.savez('%slearned_classifier_final.npz' % full_directory, 
        learned_classifier = learned_classifier)

    # Show everything!
    plt.show()
Example #40
0
def main(fasta, output_dir, bacterial_query, email, viral_db,
            keep_tmp=False, threads=20, evalue=0.001, outfmt=5):

    verbose_logging(**locals())

    fasta_name = op.basename(fasta)
    sample = get_sample(fasta_name)

    try:
        tmpdir = tf.mkdtemp("_tmp", "%s_" % sample, tf.tempdir)
        # rename the fa headers while writing to temp working dir
        tmpfasta = preprocess_fasta(fasta, tmpdir)

        blastp_xml = op.join(tmpdir, "%s.blastp.xml" % sample)
        blast_db = op.join(tmpdir, "%s.blastdb" % sample)

        tmp_viral_db = op.join(tmpdir, op.basename(viral_db))
        copytree(viral_db, tmp_viral_db)

        tmpquery = op.join(tmpdir, op.basename(bacterial_query))
        shutil.copyfile(bacterial_query, tmpquery)

        # prodigal
        p_proteins, p_genes, p_genbank, p_score = prodigal(tmpfasta, sample, tmpdir)

        # tRNAscan-SE
        trna_output = os.path.join(tmpdir, "%s.trnascan" % sample)
        trna_output = trnascan(tmpfasta, o=trna_output, B=None)

        # gc content and skew
        gc_output = gc_content(tmpfasta)

        # tetramerPCA
        tetramerPCA_output = tetramerPCA(tmpfasta, window=1600, step=200)

        # blastp
        blastp(query=p_proteins, db='nr', out=blastp_xml, evalue=evalue,
                num_alignments=10, num_threads=threads, outfmt=outfmt)

        # blast_db
        makeblastdb(**{'in':p_proteins, 'parse_seqids':None, 'dbtype':'prot', 'out':blast_db})

        # blastx; viral
        viral_xmls, viral_queries = batch_blastx(blast_db, tmp_viral_db, tmpdir, evalue, threads, outfmt)
        viral_bams = parmap.starmap(xml_to_bam, zip(viral_xmls, viral_queries), p_proteins, processes=12)
        viral_coverages = parmap.map(per_contig_coverage, viral_bams, p_proteins, processes=12)
        viral_pileups = parmap.map(samtools_mpileup, viral_bams, processes=12)

        # blastx; bacterial
        bacterial_xml = os.path.join(tmpdir, "%s.bacterial_fraction.xml" % sample)
        blastx(query=tmpquery, db=blast_db, out=bacterial_xml, evalue=evalue,
                max_target_seqs=1, num_threads=threads, outfmt=5, query_gencode=11)
        bacterial_bam = xml_to_bam(bacterial_xml, bacterial_query, p_proteins)
        bacterial_coverage = per_contig_coverage(bacterial_bam, p_proteins)
        bacterial_pileup = samtools_mpileup(bacterial_bam)

    except Exception as e:
        print(e.__doc__)
        print(e)
        raise

    finally:
        # always remove viral fastas; don't copy back from ram
        if op.exists(tmp_viral_db):
            shutil.rmtree(tmp_viral_db)
        # gzip all of the files in the temp dir
        gzip_all(tmpdir, ignore=['pdf', 'bam', 'bai'])
        # copy over the files
        runcmd("cp -R -v {src}/* {dst}".format(src=tmpdir, dst=output_dir))

        if not keep_tmp:
            # delete the temp working directory
            shutil.rmtree(tmpdir)

        if email:
            send_email(to=email, subject=op.basename(__file__),
                message="finished processing %s; results were copied to %s" % (fasta, output_dir))

        logging.info("Complete.")
Example #41
0
File: test_gen.py Project: HennyH/hpc-p1
                            arg.get("rows"),
                            arg.get("columns"),
                            arg.get("data_type") if op != "sm" else "float",
                            arg["values"]),
                file=tf)
    except Exception as e:
        print("Failed to generate", test_file, "with error", traceback.format_exc())
        try:
            os.unlink(test_file)
        except:
            raise

if __name__ == "__main__":
    op_to_args = defaultdict(list)

    for dirpath, dirnames, filenames in os.walk("."):
        if dirpath == ".":
            continue
        operation = os.path.basename(dirpath)
        for i, output_filename in enumerate(filenames):
            if ".out" not in output_filename:
                continue
            output_filepath = os.path.join(dirpath, output_filename)
            args = get_args(output_filepath)
            op_to_args[operation].append(args)
    file_params = chain.from_iterable(
        [(op, arg) for arg in args]
        for op, args in op_to_args.items()
    )
    list(parmap.starmap(gen_test_file, file_params))
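
The `file_params` generator flattens the `{operation: [args, ...]}` mapping into `(operation, arg)` pairs, so `parmap.starmap` calls `gen_test_file(operation, arg)` once per recorded output file. A self-contained sketch of that flattening with a stand-in for `gen_test_file` (which is defined earlier in test_gen.py):

import parmap
from collections import defaultdict
from itertools import chain

def gen_stub(op, arg):
    # Stand-in for gen_test_file(op, arg).
    return '%s: %d rows x %d columns' % (op, arg['rows'], arg['columns'])

op_to_args = defaultdict(list)
op_to_args['sm'].append({'rows': 2, 'columns': 3})
op_to_args['tr'].append({'rows': 4, 'columns': 4})

file_params = chain.from_iterable(
    [(op, arg) for arg in args] for op, args in op_to_args.items()
)
print(list(parmap.starmap(gen_stub, file_params, pm_parallel=False)))
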
def align_frames(mouse, dir, freq):
    lofiles, lofilenames = get_file_list(dir+"Videos/", mouse)
    print(lofilenames)
    lop = get_distance_var(lofiles)
    
    all_frames = np.asarray(get_video_frames(lofiles), dtype=np.uint8)
    print "Alligning all video frames..."

    all_frames = parmap.starmap(shift_frames, zip(all_frames, lop))
##    for i in range(len(lop)):
##        for frame in all_frames[i]:
##            frame = image_registration.fft_tools.shift2d(frame, lop[i].dx, lop[i].dy)

    print(np.shape(all_frames))

    count = 0
    new_all_frames = parmap.map(process_frames, all_frames, freq, mouse, dir)
    '''
    for frames in all_frames:
        print np.shape(frames)
        save_to_file("Green/"+lofilenames[count][:-4]+"_aligned.raw", frames, np.float32)

        print "Calculating mean..."
        avg_pre_filt = calculate_avg(frames)

        print "Temporal filter..."
        frames = cheby_filter(frames)
        frames += avg_pre_filt
        save_to_file("Green/Cheby/"+lofilenames[count][:-4]+"_BPFilter_0.1-1Hz.raw", frames, np.float32)


        print "Calculating DF/F0..."
        frames = calculate_df_f0(frames)
        save_to_file("Green/DFF/"+lofilenames[count][:-4]+"_DFF.raw", frames, np.float32)

        print "Applying MASKED GSR..."
        #frames = gsr(frames)
        frames = masked_gsr(frames, save_dir+"202_mask.raw")
        save_to_file("Green/GSR/"+lofilenames[count][:-4]+"_GSR.raw", frames, np.float32)


        print "Getting SD map..."
        sd = standard_deviation(frames)
        save_to_file("Green/SD_maps/"+lofilenames[count][:-4]+"_SD.raw", frames, np.float32)

        new_all_frames.append(frames)
        count += 1
    '''
    print "Creating array..."
    new_all_frames = np.asarray(new_all_frames, dtype=np.float32)
    all_frames = np.asarray(all_frames, dtype=np.float32)
    
    print "Joining Files..."
    new_all_frames = np.reshape(new_all_frames,
                            (new_all_frames.shape[0]*new_all_frames.shape[1],
                            new_all_frames.shape[2],
                            new_all_frames.shape[3]))
    all_frames = np.reshape(all_frames,
                            (all_frames.shape[0]*all_frames.shape[1],
                            all_frames.shape[2],
                            all_frames.shape[3]))

    print "Shapes: "
    print np.shape(all_frames)
    print np.shape(new_all_frames)

    where_are_NaNs = np.isnan(new_all_frames)
    new_all_frames[where_are_NaNs] = 0

    save_to_file("FULL_conc.raw", new_all_frames, np.float32)
    save_to_file("conc_RAW.raw", all_frames, np.float32)
    sd = standard_deviation(new_all_frames)
    save_to_file("FULL_SD.raw", sd, np.float32)

    print "Displaying correlation map..."
    mapper = CorrelationMapDisplayer(new_all_frames)
    mapper.display('spectral', -0.3, 1.0)