def swap_log(swap, error=True):
    """Forward every non-empty line of a text blob to the logging helpers.

    swap: str, text that is split on the newline character
    error: if truthy each line is sent to print_error, otherwise to print_info
    """
    # Collect first, then emit, so the input is fully split before any logging.
    kept = [line for line in swap.split('\n') if line != '']
    for line in kept:
        if error:
            print_error(line)
        else:
            print_info(line)
    return
def __init__(
    self,
    dfx,
    metric='correlation',
    info_distance=None,
):
    """
    parameters
    -----------------
    dfx: pandas DataFrame, one column per feature point
    metric: {'cosine', 'correlation', 'euclidean', 'jaccard', 'hamming', 'dice'}, default: 'correlation', measurement of feature distance
    info_distance: a vector-form distance vector of the feature points, shape should be: (n*(n-1)/2), where n is the number of the features.
        If None (or empty) the distances are calculated from dfx.
    """
    # isinstance (rather than an exact type comparison) also accepts
    # DataFrame subclasses.
    assert isinstance(dfx, pd.DataFrame), 'input dfx mush be pandas DataFrame!'
    super().__init__()

    self.metric = metric
    self.isfit = False
    self.alist = dfx.columns.tolist()
    self.ftype = 'feature points'
    self.cluster_flag = False

    n_features = dfx.shape[1]
    # Number of unordered feature pairs; integer division keeps it exact.
    info_distance_length = n_features * (n_features - 1) // 2
    values = dfx.values

    ## calculating distance
    # Decide on *presence* of a user supplied vector, not on its values:
    # testing `.any()` would discard a legitimate all-zero distance vector.
    has_distance = False
    if info_distance is not None:
        supplied = np.array(info_distance)
        has_distance = supplied.ndim > 0 and supplied.size > 0

    if has_distance:
        assert len(
            info_distance
        ) == info_distance_length, 'shape of info_distance must be (%s,)' % info_distance_length
        print_info('skip to calculate the distance')
        self.info_distance = np.array(info_distance)
    else:
        print_info('Calculating distance ...')
        D = calculator.pairwise_distance(values, n_cpus=16, method=metric)
        D = np.nan_to_num(D, copy=False)
        # squareform converts the pairwise result to the other
        # (square <-> condensed) representation.
        D_ = squareform(D)
        # Clamp negative values to zero.
        self.info_distance = D_.clip(0, np.inf)

    ## statistic info
    S = summary.Summary(n_jobs=10)
    res = [
        S._statistics_one(values, i)
        for i in tqdm(range(n_features), ascii=True)
    ]
    # One row of statistics per feature, indexed by feature name.
    self.info_scale = pd.DataFrame(res, index=self.alist)
def load_config(ftype='descriptor', metric='cosine'):
    """Load the pickled config table for a feature type / metric pair.

    The file '<ftype>_<metric>.cfg.gzip' is looked up next to this module.
    If it is missing it is downloaded, first from Google Drive and, when
    that fails, from the bidd mirror.

    ftype: str, feature type used in the file name
    metric: str, distance metric used in the file name
    return: the object stored in the pickle (read with pandas.read_pickle)
    raises: FileNotFoundError if the file is absent and both downloads fail
    """
    name = '%s_%s.cfg.gzip' % (ftype, metric)
    filename = os.path.join(os.path.dirname(__file__), name)

    if not os.path.exists(filename):
        downloaded = None
        try:
            print('try to down it from Google drive ...')
            url = googleids.get(name)
            print_info('downloading config file from google drive: %s' % url)
            downloaded = gdown.download(url, filename, quiet=False)
        except Exception:
            # Narrower than a bare except: Ctrl-C / SystemExit still propagate.
            downloaded = None

        # gdown may signal failure either by raising or by returning None,
        # depending on its version; both cases fall back to the mirror.
        if downloaded is None:
            print(
                'Max retries exceeded for Google Drive, will try to down it from bidd.group...'
            )
            url = biddids.get(name)
            print_info('downloading config file from bidd website: %s' % url)
            downloaded = gdown.download(url, filename, quiet=False)

        if downloaded is None:
            raise FileNotFoundError(
                'unable to download config file: %s' % name)
        print_info('finished...')
        filename = downloaded

    # NOTE(review): read_pickle executes arbitrary pickled code; the file
    # comes from a remote download, so the sources must be trusted.
    df = pd.read_pickle(filename, compression='gzip')
    return df
def MultiProcessUnorderedBarRun(func, deal_list, n_cpus=None):
    '''
    input:
        func: function to do with each element in the deal_list
        deal_list: list to be done (must support len())
        n_cpus: use the number of cpus, None means cpu_count()
    output:
        list of the return values, in completion order (not input order)
    '''
    N_CPUS = cpu_count() if n_cpus is None else int(n_cpus)
    print_info('the number of process is %s' % N_CPUS)

    pool = Pool(N_CPUS)
    res_list = []
    try:
        with pbar(total=len(deal_list), ascii=True) as pb:
            for res in pool.imap_unordered(func, deal_list):
                pb.update(1)
                res_list.append(res)
    except BaseException:
        # A failing task (or Ctrl-C) must not leave worker processes behind.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        pool.join()
    return res_list
def RunCmd(cmd):
    '''
    Run a shell command and echo its output through the logging helpers.

    input:
        cmd: str, passed to the shell as-is
    output:
        status: int, 0 for success
        stdout: str
        stderr: str
    '''
    print_info('run command : %s' % cmd)

    def swap_log(swap, error=True):
        # Emit each non-empty line at error or info level.
        for line in [piece for piece in swap.split('\n') if piece != '']:
            if error:
                print_error(line)
            else:
                print_info(line)
        return

    proc = subprocess.run(
        cmd,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    failed = proc.returncode != 0

    # stdout is echoed in both cases: as errors on failure, as info on success.
    if proc.stdout:
        swap_log(proc.stdout, error=failed)
    # stderr is only echoed when the command failed.
    if failed and proc.stderr:
        swap_log(proc.stderr, error=True)

    return proc.returncode, proc.stdout, proc.stderr
def MultiExecutorRun(func, deal_list, n_cpus=4, tqdm_args={'unit': 'one'}):
    '''
    input:
        func: function to do with each element in the deal_list
        deal_list: list (or any iterable) to be done
        n_cpus: use the number of cpus
        tqdm_args: args for tqdm, merged over the defaults below
            (the default dict is only read, never mutated)
    output:
        list of the return value for each func, in input order
    '''
    # Materialise once so that iterators/generators are supported too.
    lst = list(deal_list)
    series = pd.Series(lst)
    futures = _executor(func, series, n_cpus=n_cpus)

    args = {
        # len(lst), not len(deal_list): the latter fails for generators.
        'total': len(lst),
        'unit': 'one',
        'ascii': True,
        'unit_scale': True,
        'leave': True
    }
    if tqdm_args:
        args.update(tqdm_args)
    print_info(args)

    results = []
    indexs = []
    for f in tqdm(as_completed(futures), **args):
        # Each future resolves to an (index, result) pair.
        idx, result = f.result()
        indexs.append(idx)
        results.append(result)

    # Futures complete in arbitrary order; sort back by index.
    res = pd.Series(results, index=indexs)
    ordered_lst = res.sort_index().tolist()
    return ordered_lst
def __init__(self, dfx, metric='correlation'):
    """
    parameters
    -----------------
    dfx: pandas DataFrame, one column per feature point
    metric: {'cosine', 'correlation', 'euclidean', 'jaccard', 'hamming', 'dice'}, default: 'correlation', measurement of feature distance
    """
    # isinstance (rather than an exact type comparison) also accepts
    # DataFrame subclasses.
    assert isinstance(dfx, pd.DataFrame), 'input dfx mush be pandas DataFrame!'
    super().__init__()

    self.metric = metric
    self.isfit = False
    self.alist = dfx.columns.tolist()
    self.ftype = 'feature points'
    self.cluster_flag = False

    # Evaluate .values once instead of on every loop iteration.
    values = dfx.values

    ## calculating distance
    print_info('Calculating distance ...')
    D = calculator.pairwise_distance(values, n_cpus=16, method=metric)
    D = np.nan_to_num(D, copy=False)
    # squareform converts the pairwise result to the other
    # (square <-> condensed) representation.
    D_ = squareform(D)
    # Clamp negative values to zero.
    self.info_distance = D_.clip(0, np.inf)

    ## statistic info
    S = summary.Summary(n_jobs=10)
    res = [
        S._statistics_one(values, i)
        for i in tqdm(range(dfx.shape[1]), ascii=True)
    ]
    # One row of statistics per feature, indexed by feature name.
    self.info_scale = pd.DataFrame(res, index=self.alist)
def ImapUnorder(processor, iterator, max_workers=10,
                fail_in_file='./filed.lst'):
    '''
    Generator: run processor on every element in a process pool and yield
    the results.

    processor: function
    iterator: list or iterator, each element should be a tuple or dict, so
        that data can be used as ordered
    max_workers: number of worker processes
    fail_in_file: path of the file (overwritten) that collects the inputs
        that failed both in the pool and in the in-process retry

    All tasks are submitted and waited for before the first result is
    yielded. An input whose future raised is retried once in the current
    process; if the retry fails too, the input is written to fail_in_file
    and nothing is yielded for it.
    '''
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        with open(fail_in_file, 'w+') as f:
            futures = {
                executor.submit(processor, IdPlusSmile): IdPlusSmile
                for IdPlusSmile in iterator
            }
            success, _ = wait(futures)
            with pbar(total=len(futures)) as pb:
                for i in success:
                    IdPlusSmile = futures[i]
                    print_info('deal ' + str(IdPlusSmile))
                    # The yield is kept OUTSIDE the try blocks: a yield
                    # guarded by a bare except would swallow GeneratorExit
                    # when the consumer closes the generator early.
                    try:
                        data_dict = i.result()
                    except Exception as exc:
                        print_warn(
                            'because of the process is dead, input: %s is fialed when deal with %s: %s, so we will deal it automatically'
                            % (IdPlusSmile, processor, exc))
                        try:
                            data_dict = processor(IdPlusSmile)
                        except Exception:
                            f.write(str(IdPlusSmile) + '\n')
                            print_error(
                                ' input: %s is fialed when deal with %s: %s'
                                % (IdPlusSmile, processor, exc))
                            pb.update(1)
                            continue
                    yield data_dict
                    pb.update(1)
def MultiProcessRun(func, deal_list, n_cpus=None):
    '''
    input:
        func: function to do with each element in the deal_list
        deal_list: list to be done
        n_cpus: use the number of cpus, None means cpu_count()
    output:
        list of the return result for each func, in input order
    '''
    N_CPUS = cpu_count() if n_cpus is None else int(n_cpus)
    print_info('the number of process is %s' % N_CPUS)

    pool = Pool(N_CPUS)
    try:
        a = pool.map(func, deal_list)
    except BaseException:
        # A failing task (or Ctrl-C) must not leave worker processes behind.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        pool.join()
    return a
def fit(self, method='umap', min_dist=0.1, n_neighbors=30, verbose=2,
        random_state=1, **kwargs):
    """
    Embed the features in 2D, then fit the feature-map object on the result.

    parameters
    -----------------
    method: {'tsne', 'umap', 'mds'}, algorithm to embed high-D to 2D
    min_dist, n_neighbors, verbose, random_state: forwarded to the embedding
    kwargs: the extra parameters for the corresponding method
        (a caller supplied 'n_components' is dropped, 2 is always used)
    """
    # The embedding is always 2D, so any caller value is discarded.
    kwargs.pop('n_components', None)

    assert method in ['tsne', 'umap', 'mds'], 'no support such method!'
    self.method = method

    ## 2d embedding first
    self._fit_embedding(method=method,
                        n_neighbors=n_neighbors,
                        random_state=random_state,
                        min_dist=min_dist,
                        verbose=verbose,
                        n_components=2,
                        **kwargs)

    # Both map types run the same fit call; only the progress message differs.
    if self.fmap_type == 'scatter':
        banner = 'Applying naive scatter feature map...'
    else:
        banner = 'Applying grid feature map(assignment), this may take several minutes(1~30 min)'
    print_info(banner)
    self._S.fit(self.df_embedding, self.split_channels, channel_col='Channels')
    print_info('Finished')

    ## fit flag
    self.isfit = True
    self.fmap_shape = self._S.fmap_shape
def plot_grid(molmap, htmlpath='./', htmlname=None):
    '''
    Draw the grid assignment of a fitted molmap as a Highcharts heatmap.

    molmap: the object of molmap
    htmlpath: the figure path, created if it does not exist
    htmlname: optional prefix for the generated file name
    return: (df, H), the plotted table and the Highchart object
    '''
    if not os.path.exists(htmlpath):
        os.makedirs(htmlpath)

    ftype = molmap.ftype
    method = molmap.method
    metric = molmap.metric
    n_features = len(molmap.flist)

    title = 'Assignment of %s by %s emmbedding result' % (ftype, method)
    subtitle = 'number of %s: %s, metric method: %s' % (ftype, n_features,
                                                        metric)
    name = '_'.join(
        str(part) for part in (ftype, n_features, metric, method, 'molmap'))
    if htmlname:
        name = htmlname + '_' + name

    filename = os.path.join(htmlpath, name)
    print_info('generate file: %s' % filename)

    m, n = molmap.fmap_shape
    colormaps = molmap.extract.colormaps

    # Flat object array of m*n cells; unassigned cells keep the value 0.
    cells = np.zeros(molmap.fmap_shape, dtype='O').reshape(m * n, )
    cells[molmap._S.col_asses] = molmap.flist
    cells = cells.reshape(m, n)

    # Column-major walk over the grid: x is the column, y the row.
    x = [col for col in range(n) for _ in range(m)]
    y = list(range(m)) * n
    v = cells.reshape(m * n, order='f')
    df = pd.DataFrame(list(zip(x, y, v)), columns=['x', 'y', 'v'])

    bitsinfo = molmap.extract.bitsinfo
    subtypedict = bitsinfo.set_index('IDs')['Subtypes'].to_dict()
    # Cells left at 0 are mapped to the 'NaN' subtype.
    subtypedict[0] = 'NaN'
    df['Subtypes'] = df.v.map(subtypedict)
    df['colors'] = df['Subtypes'].map(colormaps)

    label_style = {'style': {'fontSize': 20}}
    x_axis = {
        'title': None,
        'min': 0,
        'max': molmap.fmap_shape[1],
        'startOnTick': False,
        'endOnTick': False,
        'allowDecimals': False,
        'labels': dict(label_style),
    }
    y_axis = {
        'title': {
            'text': ' ',
            'style': {
                'fontSize': 20
            }
        },
        'startOnTick': False,
        'endOnTick': False,
        'gridLineWidth': 0,
        'reversed': True,
        'min': 0,
        'max': molmap.fmap_shape[0],
        'allowDecimals': False,
        'labels': dict(label_style),
    }
    legend = {
        'align': 'right',
        'layout': 'vertical',
        'margin': 1,
        'verticalAlign': 'top',
        'y': 60,
        'symbolHeight': 12,
        'floating': False,
    }
    tooltip = {
        'headerFormat': '<b>{series.name}</b><br>',
        'pointFormat': '{point.v}'
    }

    H = Highchart(width=1000, height=850)
    H.set_options('chart', {'type': 'heatmap', 'zoomType': 'xy'})
    H.set_options('title', {'text': title})
    H.set_options('subtitle', {'text': subtitle})
    H.set_options('xAxis', x_axis)
    H.set_options('yAxis', y_axis)
    H.set_options('legend', legend)
    H.set_options('tooltip', tooltip)
    H.set_options('plotOptions', {'series': {'turboThreshold': 5000}})

    # One heatmap series per subtype that actually occurs on the grid.
    for subtype, color in colormaps.items():
        dfi = df[df['Subtypes'] == subtype]
        if not len(dfi):
            continue
        H.add_data_set(
            dfi.to_dict('records'),
            'heatmap',
            name=subtype,
            color=color,
        )

    H.save_file(filename)
    print_info('save html file to %s' % filename)
    return df, H
def plot_scatter(molmap, htmlpath='./', htmlname=None, radius=3):
    '''
    Draw the 2D embedding of a fitted molmap as a Highcharts scatter plot.

    molmap: the object of molmap
    htmlpath: the figure path, not include the prefix of 'html'
    htmlname: optional prefix for the generated file name
    radius: int, default: 3, the radius of scatter dot
    return: (df, H), the plotted table and the Highchart object
    '''
    ftype = molmap.ftype
    method = molmap.method
    metric = molmap.metric
    n_features = len(molmap.flist)

    title = '2D emmbedding of %s based on %s method' % (ftype, method)
    subtitle = 'number of %s: %s, metric method: %s' % (ftype, n_features,
                                                        metric)
    name = '_'.join(
        str(part) for part in (ftype, n_features, metric, method, 'scatter'))

    if not os.path.exists(htmlpath):
        os.makedirs(htmlpath)
    if htmlname:
        name = htmlname + '_' + name

    filename = os.path.join(htmlpath, name)
    print_info('generate file: %s' % filename)

    xy = molmap.embedded.embedding_
    colormaps = molmap.extract.colormaps

    df = pd.DataFrame(xy, columns=['x', 'y'])
    bitsinfo = molmap.extract.bitsinfo.set_index('IDs')
    # Select the feature rows in flist order and attach them positionally.
    selected = bitsinfo.loc[molmap.flist].reset_index()
    df = df.join(selected)
    df['colors'] = df['Subtypes'].map(colormaps)

    x_axis = {
        'title': {
            'enabled': True,
            'text': 'X',
            'style': {
                'fontSize': 20
            }
        },
        'labels': {
            'style': {
                'fontSize': 20
            }
        },
        'gridLineWidth': 1,
        'startOnTick': True,
        'endOnTick': True,
        'showLastLabel': True
    }
    y_axis = {
        'title': {
            'text': 'Y',
            'style': {
                'fontSize': 20
            }
        },
        'labels': {
            'style': {
                'fontSize': 20
            }
        },
        'gridLineWidth': 1,
    }
    legend = {
        'align': 'right',
        'layout': 'vertical',
        'margin': 1,
        'verticalAlign': 'top',
        'y': 40,
        'symbolHeight': 12,
        'floating': False,
    }
    plot_options = {
        'scatter': {
            'marker': {
                'radius': radius,
                'states': {
                    'hover': {
                        'enabled': True,
                        'lineColor': 'rgb(100,100,100)'
                    }
                }
            },
            'states': {
                'hover': {
                    'marker': {
                        'enabled': False
                    }
                }
            },
            'tooltip': {
                'headerFormat': '<b>{series.name}</b><br>',
                'pointFormat': '{point.IDs}'
            }
        },
        'series': {
            'turboThreshold': 5000
        }
    }

    H = Highchart(width=1000, height=850)
    H.set_options('chart', {'type': 'scatter', 'zoomType': 'xy'})
    H.set_options('title', {'text': title})
    H.set_options('subtitle', {'text': subtitle})
    H.set_options('xAxis', x_axis)
    H.set_options('yAxis', y_axis)
    H.set_options('legend', legend)
    H.set_options('plotOptions', plot_options)

    # One scatter series per subtype that actually has points.
    for subtype, color in colormaps.items():
        dfi = df[df['Subtypes'] == subtype]
        if not len(dfi):
            continue
        H.add_data_set(dfi.to_dict('records'), 'scatter', subtype,
                       color=color)

    H.save_file(filename)
    print_info('save html file to %s' % filename)
    return df, H
def fit(self,
        feature_group_list = [],
        cluster_channels = 3,
        var_thr = -1,
        split_channels = True,
        fmap_type = 'grid',
        fmap_shape = None,
        emb_method = 'umap',
        min_dist = 0.1,
        n_neighbors = 15,
        verbose = 2,
        random_state = 32,
        group_color_dict = {},
        lnk_method = 'complete',
        **kwargs):
    """
    Group the features, embed them in 2D and fit the feature-map object.

    parameters
    -----------------
    feature_group_list: list of the group name for each feature point
    cluster_channels: int, number of the channels(clusters) if feature_group_list is empty
    var_thr: float, default is -1, meaning that feature will be included only if the corresponding variance larger than this value. Since some of the feature has pretty low variances, we can remove them by increasing this threshold
    split_channels: bool, if True, outputs will split into various channels using the types of feature
    fmap_type: {'scatter', 'grid'}, default: 'grid', if 'scatter', will return a scatter mol map without an assignment to a grid
    fmap_shape: None or tuple of two elements, size of molmap. It is passed to the scatter map when fmap_type is 'scatter' (if None there, a square shape is derived from the number of features); after fitting it must be at least as large as the shape produced by the feature-map object
    emb_method: {'tsne', 'umap', 'mds'}, algorithm to embed high-D to 2D
    group_color_dict: dict of the group colors, keys are the group names, values are the colors
    lnk_method: {'complete', 'average', 'single', 'weighted', 'centroid'}, linkage method
    kwargs: the extra parameters for the corresponding embedding method (a caller supplied 'n_components' is dropped, 2 is always used)

    returns
    -----------------
    self

    note: the mutable defaults ([] and {}) are kept for interface
    compatibility; they are only read or rebound here, never mutated.
    """
    # The embedding is always 2D, so any caller value is discarded.
    kwargs.pop('n_components', None)

    ## embedding into a 2d
    assert emb_method in ['tsne', 'umap', 'mds'], 'No Such Method Supported: %s' % emb_method
    assert fmap_type in ['scatter', 'grid'], 'No Such Feature Map Type Supported: %s' % fmap_type

    self.var_thr = var_thr
    self.split_channels = split_channels
    self.fmap_type = fmap_type
    self.fmap_shape = fmap_shape
    self.emb_method = emb_method
    self.lnk_method = lnk_method

    if fmap_shape is not None:
        assert len(fmap_shape) == 2, "fmap_shape must be a tuple with two elements!"

    # flist and distance: keep only features whose variance exceeds var_thr
    flist = self.info_scale[self.info_scale['var'] > self.var_thr].index.tolist()
    dfd = pd.DataFrame(squareform(self.info_distance),
                       index=self.alist,
                       columns=self.alist)
    dist_matrix = dfd.loc[flist][flist]
    self.flist = flist

    self.x_mean = self.info_scale['mean'].values
    self.x_std = self.info_scale['std'].values
    self.x_min = self.info_scale['min'].values
    self.x_max = self.info_scale['max'].values

    # bitsinfo: one row per feature with its group and colour
    dfb = pd.DataFrame(self.alist, columns = ['IDs'])

    # Length based emptiness test: also correct for tuples / arrays, where
    # a comparison against [] is not a reliable "is empty" check.
    has_groups = feature_group_list is not None and len(feature_group_list) > 0

    if has_groups:
        self.cluster_flag = False
        assert len(feature_group_list) == len(self.alist), "the length of the input group list is not equal to length of the feature list"
        self.cluster_channels = len(set(feature_group_list))
        self.feature_group_list = feature_group_list
        dfb['Subtypes'] = feature_group_list

        # Use the caller's colours only if they cover every group;
        # otherwise generate a fresh palette.
        if not set(feature_group_list).issubset(set(group_color_dict.keys())):
            unique_types = dfb['Subtypes'].unique()
            color_list = sns.color_palette("hsv", len(unique_types)).as_hex()
            group_color_dict = dict(zip(unique_types, color_list))
        dfb['colors'] = dfb['Subtypes'].map(group_color_dict)
        self.group_color_dict = group_color_dict
    else:
        self.cluster_channels = cluster_channels
        print_info('applying hierarchical clustering to obtain group information ...')
        self.cluster_flag = True

        # Clustering runs on the distances of all features (alist).
        Z = linkage(squareform(dfd.values), lnk_method)
        labels = fcluster(Z, cluster_channels, criterion='maxclust')

        feature_group_list = ['cluster_%s' % str(i).zfill(2) for i in labels]
        dfb['Subtypes'] = feature_group_list
        dfb = dfb.sort_values('Subtypes')
        unique_types = dfb['Subtypes'].unique()

        if not set(unique_types).issubset(set(group_color_dict.keys())):
            color_list = sns.color_palette("hsv", len(unique_types)).as_hex()
            group_color_dict = dict(zip(unique_types, color_list))

        dfb['colors'] = dfb['Subtypes'].map(group_color_dict)
        self.group_color_dict = group_color_dict
        self.Z = Z
        self.feature_group_list = feature_group_list

    self.bitsinfo = dfb
    colormaps = dfb.set_index('Subtypes')['colors'].to_dict()
    colormaps.update({'NaN': '#000000'})
    self.colormaps = colormaps

    if fmap_type == 'grid':
        S = Scatter2Grid()
    else:
        if fmap_shape is None:
            N = len(self.flist)
            # Built-in int: the np.int alias was removed in NumPy 1.24.
            side = int(np.sqrt(N)) * 2
            fmap_shape = (side, side)
        S = Scatter2Array(fmap_shape)

    self._S = S

    ## 2d embedding first
    embedded = self._fit_embedding(dist_matrix,
                                   method = emb_method,
                                   n_neighbors = n_neighbors,
                                   random_state = random_state,
                                   min_dist = min_dist,
                                   verbose = verbose,
                                   n_components = 2,
                                   **kwargs)

    self.embedded = embedded

    df = pd.DataFrame(embedded.embedding_, index = self.flist, columns=['x', 'y'])
    typemap = self.bitsinfo.set_index('IDs')
    df = df.join(typemap)
    df['Channels'] = df['Subtypes']
    self.df_embedding = df

    # Both map types run the same fit call; only the progress message differs.
    if self.fmap_type == 'scatter':
        ## naive scatter algorithm
        print_info('Applying naive scatter feature map...')
    else:
        ## linear assignment algorithm
        print_info('Applying grid feature map(assignment), this may take several minutes(1~30 min)')
    self._S.fit(self.df_embedding, self.split_channels, channel_col = 'Channels')
    print_info('Finished')

    ## fit flag
    self.isfit = True
    if self.fmap_shape is None:
        self.fmap_shape = self._S.fmap_shape
    else:
        # A user supplied shape must be able to hold the fitted map.
        m, n = self.fmap_shape
        p, q = self._S.fmap_shape
        assert (m >= p) and (n >= q), "fmap_shape's width must >= %s, height >= %s " % (p, q)

    return self