def main(opts, args):
    random.seed(0)
    rng = np.random.RandomState(1234567)
    lasagne.random.set_rng(rng)

    from cwi import xvalidate

    dset = utils.get_dset()
    objfunc = partial(xvalidate, dset, args['kfold'])
    hpspace = Space(args['layers_max'], opts)
    logging.critical('""" space """')
    logging.critical(hpspace)
    logging.critical('""" space end """\n')

    def objwrapper(spsample):
        conf = Conf(spsample, args)
        logging.critical(conf)
        f1 = objfunc(conf.params)
        loss = 1 - f1
        logging.critical('f1:{:.2f}\n'.format(f1))
        return {'loss': loss, 'status': STATUS_OK}

    best = fmin(objwrapper,
                space=hpspace.space,
                algo=tpe.suggest,
                max_evals=args['evals'])
    # sample `best` from a previous run:
    # {'opt': 0, 'dr2m3': 0.4161315610584588, 'activation': 2, 'n_batch': 1,
    #  'h2m3': 0, 'dr0m3': 0.5461767196698459, 'dr1m3': 0.6720178139274229,
    #  'h3m3': 0, 'dr3m3': 0.419643470550215, 'h1m3': 1, 'dpart': 2,
    #  'lr': 0.0008630799496044787, 'norm': 8.679706724218939}
    logging.critical(best)
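# A minimal, self-contained sketch of the fmin/objwrapper pattern above, useful for
# checking the hyperopt plumbing without the CWI pipeline. The quadratic objective
# and the name _demo_fmin are illustrative assumptions; fmin, tpe, hp and STATUS_OK
# are the real hyperopt API.
def _demo_fmin():
    from hyperopt import fmin, tpe, hp, STATUS_OK

    def objective(x):
        # hyperopt passes the sampled value of the space directly
        return {'loss': (x - 3.0) ** 2, 'status': STATUS_OK}

    best = fmin(objective,
                space=hp.uniform('x', -10, 10),
                algo=tpe.suggest,
                max_evals=50)
    return best  # e.g. {'x': 2.97...}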
def plot_vars(f_step, projection, load_all=False):
    # The variable name employed for the figure name when exported
    variable_name = 'gph_t_850'
    # Build the names of the output images
    run_string, _ = get_run()
    if load_all:
        f_steps = list(range(0, 79)) + list(range(81, 121, 3))
    else:
        f_steps = [f_step]
    filenames = ['/tmp/' + projection + '_' + variable_name +
                 '_%s_%03d.png' % (run_string, f_step) for f_step in f_steps]
    test_filenames = [os.path.exists(f) for f in filenames]
    if all(test_filenames):  # the files already exist, nothing to do
        return filenames

    # otherwise do the plots
    dset = get_dset(vars_3d=['t@850', 'fi@500'], f_times=f_steps).squeeze()
    # Add a fictitious 1-D time dimension just to avoid problems downstream
    if 'step' not in dset.dims:
        dset = dset.expand_dims('step')
    # dset = subset_arrays(dset, projection)

    time = pd.to_datetime(dset.valid_time.values)
    cum_hour = dset.step.values.astype(int)

    temp_850 = dset['t'] - 273.15
    z_500 = dset['z']
    gph_500 = mpcalc.geopotential_to_height(z_500)
    gph_500 = xr.DataArray(gph_500.magnitude, coords=z_500.coords,
                           attrs={'standard_name': 'geopotential height',
                                  'units': gph_500.units})

    levels_temp = np.arange(-30., 30., 1.)
    levels_gph = np.arange(4700., 6000., 70.)

    lon, lat = get_coordinates(temp_850)
    lon2d, lat2d = np.meshgrid(lon, lat)
    cmap = get_colormap('temp')

    args = dict(filenames=filenames, projection=projection, levels_temp=levels_temp,
                cmap=cmap, lon2d=lon2d, lat2d=lat2d, lon=lon, lat=lat,
                temp_850=temp_850.values, gph_500=gph_500.values,
                levels_gph=levels_gph, time=time, run_string=run_string)

    if load_all:
        single_plot_param = partial(single_plot, **args)
        iterator = range(0, len(f_steps))
        pool = Pool(cpu_count())
        results = pool.map(single_plot_param, iterator)
        pool.close()
        pool.join()
    else:
        results = single_plot(0, **args)

    return results
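# Illustrative sketch (not part of the original module) of the partial + Pool.map
# idiom plot_vars uses to render frames in parallel. _render_frame and its prefix
# argument are hypothetical stand-ins for single_plot and its keyword arguments;
# the worker must be a module-level function so it can be pickled.
from functools import partial
from multiprocessing import Pool, cpu_count


def _render_frame(i, prefix):
    # pretend to render frame i and return the filename it would write
    return '%s_%03d.png' % (prefix, i)


def _demo_parallel_render(n_frames=10):
    render = partial(_render_frame, prefix='/tmp/demo')
    with Pool(cpu_count()) as pool:  # one worker per core, as in plot_vars
        return pool.map(render, range(n_frames))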
def main():
    random.seed(0)
    rng = np.random.RandomState(1234567)
    lasagne.random.set_rng(rng)

    parser = get_arg_parser()
    args = vars(parser.parse_args())
    setup_logger(args)

    dset = utils.get_dset()
    sent_lens = [len(sent['ws']) for sent in dset]
    logging.debug('# of words per sent, min:{} max:{} mean:{:.2f} std:{:.2f}'.format(
        min(sent_lens), max(sent_lens), np.mean(sent_lens), np.std(sent_lens)))
    logging.critical(tabulate([args], headers='keys'))
    xvalidate(dset, 1, args)
def main(args):
    logging.critical(tabulate([args], headers='keys', floatfmt='.2f') + '\n')
    random.seed(0)
    defaults = {'embs': args['embs'],
                'n_fold': 5,
                'e_context': args['e_context'],
                'feats': args['feats'],
                'percentile': 20,
                'unkt': 2,
                'clf': 'svm',
                'kerntype': 'rbf',
                'kerngamma': 1,
                'kerncoef0': 1,
                'kerndegree': 2,
                'cweights': 1}
    assert hasattr(hp, 'loguniform')
    # each CLI spec is a prior name followed by its parameters,
    # e.g. ['loguniform', '-20', '8'] -> hp.loguniform(label, -20, 8)
    space = {
        'kerngamma': getattr(hp, args['kerngamma'][0])('kerngamma', *map(int, args['kerngamma'][1:])),
        'C': getattr(hp, args['C'][0])('C', *map(int, args['C'][1:])),
        'percentile': getattr(hp, args['percentile'][0])('percentile', *map(int, args['percentile'][1:])),
        # alternatives tried earlier:
        # 'kerngamma' : hp.loguniform('kerngamma', -20, 8),
        # 'kerngamma' : hp.uniform('kerngamma', .00001, .1),
        # 'percentile': hp.quniform('percentile', 5, 30, 1),
        # 'percentile': hp.normal('percentile', 20, 5),
        # 'C'         : hp.loguniform('C', -20, 8),
        # 'C'         : hp.lognormal('C', -4, 1),
    }

    from cwi import xvalidate, Emb
    import utils

    dset = utils.get_dset(args['data'])
    emb = Emb(dset)

    def objwrapper(spsample):
        conf = defaults.copy()
        conf.update(spsample)
        logging.critical(tabulate([conf], headers='keys', floatfmt='.2e'))
        f1, f1std = xvalidate(dset, conf, emb)
        loss = 1 - f1
        logging.critical('f1:{:.2f} f1std:{:.2f}\n'.format(f1, f1std))
        return {'loss': loss, 'status': STATUS_OK}

    best = fmin(objwrapper,
                space=space,
                algo=tpe.suggest,
                max_evals=args['evals'])
    logging.critical(best)

    # re-run cross-validation with the best configuration found
    defaults.update(best)
    f1, f1std = xvalidate(dset, defaults, emb)
    logging.critical('f1:{:.2f} f1std:{:.2f}\n'.format(f1, f1std))
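# Hedged sketch of how the getattr(hp, ...) construction above turns a CLI spec
# such as ['loguniform', '-20', '8'] into hp.loguniform('C', -20, 8). The two
# example specs are made up, not the project's defaults.
def _demo_space_from_specs():
    from hyperopt import hp
    specs = {'C': ['loguniform', '-20', '8'],
             'percentile': ['quniform', '5', '30', '1']}
    return {name: getattr(hp, spec[0])(name, *map(int, spec[1:]))
            for name, spec in specs.items()}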
def c_opt(dset, emb, dargs):
    logging.critical(tabulate([dargs], headers='keys'))
    f1s = []
    for C in C_values:  # C_values is defined at module scope elsewhere
        targs = dargs.copy()
        targs['C'] = C
        f1, f1std = cwi.xvalidate(dset, targs, emb)
        f1s.append((f1, f1std, C))
        logging.critical('{}\t{}\t{}'.format(C, f1, f1std))
    logging.critical('\n')
    return max(f1s)


if __name__ == '__main__':
    setup_logger({'log': 'road'})
    dset = utils.get_dset()
    emb = cwi.Emb(dset)
    infolist = []
    infolist.append(opt(dset, emb, ['r50'], 0))
    """
    for ec in range(4):
        infolist.extend(opt(dset, emb, ['{}{}'.format(e, dim)], ec)
                        for e, dim in product(['s', 'g'], [50, 100, 200]))
        # infolist.extend(opt(dset, emb, ename, ec) for ename in [['s50']])
    for ec in range(4):
        infolist.extend(opt(dset, emb, [e1, e2], ec)
                        for e1, e2 in product(['s50', 's100', 's200'],
                                              ['g50', 'g100', 'g200']))
    """
def get_vocab(dset):
    return set(w for sent in dset for w in sent['ws'])


def get_contexts(sent, c):
    # pad with c sentence-boundary markers on each side
    ws = (['<s>'] * c) + sent['ws'] + (['</s>'] * c)
    contexts = []
    for i, w in enumerate(sent['ws']):
        wi = i + c  # index of w in the padded sequence
        if sent['ii'][i]:
            contexts.append(' '.join(ws[wi - c:wi] + ['___'] + ws[wi + 1:wi + c + 1]))
    return contexts


if __name__ == '__main__':
    trn = get_dset()
    tst = get_test()
    print('tagged vocab size trn {} tst {}'.format(*map(len, map(get_tagged_vocab, [trn, tst]))))
    print('all vocab size trn {} tst {}'.format(*map(len, map(get_vocab, [trn, tst]))))
    vtrn, vtst = map(get_tagged_vocab, [trn, tst])
    print('tagged vtst diff: {:.2f}'.format(len(vtst.difference(vtrn)) / len(vtst)))
    vtrn, vtst = map(get_vocab, [trn, tst])
    print('all vtst diff: {:.2f}'.format(len(vtst.difference(vtrn)) / len(vtst)))
    precnt = Counter(w[:j] for sent in trn for w, lbl in zip(sent['ws'], sent['ls'])
                     for j in range(3, 5) if lbl == 1 and len(w) > j)
    sufcnt = Counter(w[-j:] for sent in trn for w, lbl in zip(sent['ws'], sent['ls'])
                     for j in range(3, 5) if lbl == 1 and len(w) > j)
    print('most common prefixes:', precnt.most_common(100))
    print('most common suffixes:', sufcnt.most_common(100))
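# Toy illustration of get_contexts: for window size c=2, each target word flagged
# in 'ii' is replaced by '___' and padded with sentence boundaries. The sentence
# and mask below are made up.
def _demo_get_contexts():
    sent = {'ws': ['a', 'very', 'perilous', 'road'],
            'ii': [0, 0, 1, 0]}
    return get_contexts(sent, 2)  # -> ['a very ___ road </s>']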
    pred_labels = []
    for sent, pred in zip(dset, preds):
        gold_labels.extend([sent['ls'][ii] for ii, interested in enumerate(sent['ii']) if interested])
        # pred_labels.extend([pred[ii] for ii, interested in enumerate(sent['ii']) if interested])
        pred_labels.extend(pred)
    logging.debug(tabulate(confusion_matrix(np.array(gold_labels), np.array(pred_labels)),
                           headers=[0, 1]))
    p, r, f = evaluate_system.evaluateIdentifier(gold_labels, pred_labels)
    return p, r, f


if __name__ == '__main__':
    parser = get_arg_parser()
    args = vars(parser.parse_args())
    setup_logger(args)
    logging.debug(tabulate([OrderedDict(sorted(args.items()))], headers='keys'))
    if args['testf']:
        trn = utils.get_dset(args['data'])
        tst = utils.get_test()
        ytrn, ytst = fit_predict(trn, tst, args, Emb(trn + tst))
        with open(args['testf'], 'w') as out:
            out.write('\n'.join(str(y) for y in ytst))
    else:
        dset = utils.get_dset(args['data'])
        if args['sample'] > 0:
            random.seed(0)
            dset = random.sample(dset, args['sample'])
        xvalidate(dset, args, Emb(dset))
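# Tiny sketch of the gold/pred flattening and confusion matrix above, on made-up
# two-sentence data; only tokens flagged in 'ii' contribute a gold label, and each
# per-sentence pred list holds one prediction per flagged token.
def _demo_flatten_and_confuse():
    from sklearn.metrics import confusion_matrix
    dset = [{'ls': [0, 1, 0], 'ii': [1, 1, 0]},
            {'ls': [1, 0], 'ii': [0, 1]}]
    preds = [[0, 1], [0]]
    gold = [sent['ls'][i] for sent in dset
            for i, interested in enumerate(sent['ii']) if interested]
    pred = [y for p in preds for y in p]
    return confusion_matrix(gold, pred)  # rows: gold 0/1, cols: pred 0/1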
        f1s.append((f1, f1std, C))
        logging.critical('{}\t{}\t{}'.format(C, f1, f1std))
    logging.critical('\n')
    return max(f1s)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--log', default='dont')
    parser.add_argument('--feats', default='')
    parser.add_argument('--percentile', default=20, type=int)
    parser.add_argument('--e_context', default=0, type=int)
    parser.add_argument('--fast', default=False, action='store_true')
    args = parser.parse_args()
    setup_logger({'log': args.log})

    dset0 = utils.get_dset('training')
    dset = utils.get_dset('testing_annotated')
    emb = cwi.Emb(dset0 + dset)
    dargs = {'embs': ['s50', 'g50'],
             'e_context': args.e_context,
             'feats': args.feats,
             'percentile': args.percentile,
             'n_fold': 5,
             'cweights': 1,
             'clf': 'svm',
             'kerntype': 'lin',
             'C': 1,
             'kerngamma': 1,
             'kerncoef0': 1,
             'kerndegree': 1}
    # logging.critical(tabulate([dargs], headers='keys'))
    if args.fast:
        add_data_sizes = [0, 200, 400]
        C_powers = [-2, +1]
        seeds = [7, 5, 9]
    else:
        C_powers = [-20, +8]
        add_data_sizes = [0] + [2 ** x * 100 for x in range(1, 7)]
def plot_var(f_step, projection):
    # NOTE: if we are inside this function, the picture does not exist yet.
    # The variable name employed for the figure name when exported
    variable_name = 'gph_t_850'
    # Build the name of the output image
    run_string, _ = get_run()
    filename = '/tmp/' + projection + '_' + \
        variable_name + '_%s_%03d.png' % (run_string, f_step)

    # Read the files and prepare the variables to be plotted. This is not
    # included in utils.py as it can change from case to case.
    dset = get_dset(vars_3d=['t@850', 'fi@500'], f_times=f_step).squeeze()
    dset = subset_arrays(dset, projection)

    time = pd.to_datetime(dset.valid_time.values)
    cum_hour = dset.step.values.astype(int)

    temp_850 = dset['t'] - 273.15
    z_500 = dset['z']
    gph_500 = mpcalc.geopotential_to_height(z_500)
    gph_500 = xr.DataArray(gph_500.magnitude, coords=z_500.coords,
                           attrs={'standard_name': 'geopotential height',
                                  'units': gph_500.units})

    levels_temp = np.arange(-30., 30., 1.)
    levels_gph = np.arange(4700., 6000., 70.)

    cmap = get_colormap('temp')

    fig = plt.figure(figsize=(figsize_x, figsize_y))
    lon, lat = get_coordinates(temp_850)
    lon2d, lat2d = np.meshgrid(lon, lat)
    ax = get_projection_cartopy(plt, projection, compute_projection=True)

    if projection == 'euratl':
        norm = BoundaryNorm(levels_temp, ncolors=cmap.N)
        cs = ax.pcolormesh(lon2d, lat2d, temp_850, cmap=cmap, norm=norm)
    else:
        cs = ax.contourf(lon2d, lat2d, temp_850, extend='both',
                         cmap=cmap, levels=levels_temp)

    c = ax.contour(lon2d, lat2d, gph_500, levels=levels_gph,
                   colors='white', linewidths=1.)
    labels = ax.clabel(c, c.levels, inline=True, fmt='%4.0f', fontsize=6)
    maxlabels = plot_maxmin_points(ax, lon, lat, gph_500, 'max', 80,
                                   symbol='H', color='royalblue', random=True)
    minlabels = plot_maxmin_points(ax, lon, lat, gph_500, 'min', 80,
                                   symbol='L', color='coral', random=True)

    an_fc = annotation_forecast(ax, time)
    an_var = annotation(ax, 'Geopotential height @500hPa [m] and temperature @850hPa [C]',
                        loc='lower left', fontsize=6)
    an_run = annotation_run(ax, time)

    plt.colorbar(cs, orientation='horizontal', label='Temperature',
                 pad=0.03, fraction=0.04)

    plt.savefig(filename, **options_savefig)
    plt.clf()

    return filename
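# Minimal, runnable sketch of the overlay used in plot_var: a filled temperature
# field with labelled geopotential contours on top, here on synthetic data so it
# works without model files or cartopy. All names below are illustrative.
def _demo_contour_overlay():
    import numpy as np
    import matplotlib.pyplot as plt
    x = y = np.linspace(-3., 3., 100)
    X, Y = np.meshgrid(x, y)
    temp = 20. * np.exp(-(X ** 2 + Y ** 2) / 4.)   # stand-in for temp_850
    gph = 5500. + 200. * np.cos(X) * np.sin(Y)     # stand-in for gph_500
    fig, ax = plt.subplots()
    cs = ax.contourf(X, Y, temp, levels=np.arange(0., 21., 1.),
                     extend='both', cmap='RdBu_r')
    c = ax.contour(X, Y, gph, levels=np.arange(5300., 5700., 70.),
                   colors='black', linewidths=1.)
    ax.clabel(c, c.levels, inline=True, fmt='%4.0f', fontsize=6)
    fig.colorbar(cs, orientation='horizontal', label='Temperature',
                 pad=0.03, fraction=0.04)
    return fig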