def set_cache_dir(cachedir, bytes_limit=10 * 2 ** 30):
    cache = joblib.Memory(cachedir=cachedir, bytes_limit=bytes_limit,
                          verbose=0).cache
    cache_data.cache = cache
    cache_data.cachedir = cachedir
    fsc_cached = cache(fourierseries._base_fourier_component)
    fourierseries._fourier_component = fsc_cached
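
# Hypothetical usage sketch (the cache path is an assumption, not from the
# source): after this call, fourierseries._fourier_component transparently
# memoizes its results to disk, with the cache capped at the given byte budget.
set_cache_dir("/tmp/fourier_cache", bytes_limit=2 ** 30)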
def generate_tsdiffana_thumbnail(image_files, sessions, subject_id,
                                 output_dir, results_gallery=None,
                                 tooltips=None):
    """Generate tsdiffana thumbnails

    Parameters
    ----------
    image_files: list of strings (4D case) or list of lists of paths
        (3D case)
        images under inspection

    output_dir: string
        dir to which all output will be written

    subject_id: string
        id of subject under inspection

    sessions: list
        list of session ids, one per element of image_files

    results_gallery: ResultsGallery instance (optional)
        gallery to which thumbnails will be committed

    """
    # plot figures
    qa_cache_dir = os.path.join(output_dir, "QA")
    if not os.path.exists(qa_cache_dir):
        os.makedirs(qa_cache_dir)
    qa_mem = joblib.Memory(cachedir=qa_cache_dir, verbose=5)
    results = qa_mem.cache(multi_session_time_slice_diffs)(image_files)
    axes = plot_tsdiffs(results, use_same_figure=False)
    figures = [ax.get_figure() for ax in axes]
    output_filename_template = os.path.join(
        output_dir, "tsdiffana_plot_{0}.png")
    output_filenames = [output_filename_template.format(i)
                        for i in range(len(figures))]
    for fig, output_filename in zip(figures, output_filenames):
        fig.savefig(output_filename, bbox_inches="tight", dpi=200)
        pl.close(fig)
    if tooltips is None:
        tooltips = [None] * len(output_filenames)

    # create thumbnails
    thumbnails = []
    for output_filename, tooltip in zip(output_filenames, tooltips):
        thumbnail = Thumbnail(tooltip=tooltip)
        thumbnail.a = a(href=os.path.basename(output_filename))
        thumbnail.img = img(src=os.path.basename(output_filename),
                            height="250px", width="600px")
        thumbnail.description = "tsdiffana ({0} sessions)".format(
            len(sessions))
        thumbnails.append(thumbnail)
    if results_gallery:
        results_gallery.commit_thumbnails(thumbnails)
    return thumbnails
def resample_img(input_img_filename, new_vox_dims, output_filename=None):
    """Resamples an image to a new resolution.

    Parameters
    ----------
    input_img_filename: string
        path to image to be resampled

    new_vox_dims: list or tuple of positive floats
        new voxel dimensions to which the image is to be resampled

    output_filename: string (optional)
        where output image will be written

    Returns
    -------
    output_filename: string
        where the resampled img has been written

    """
    try:
        from nilearn.image import resample_img as ni_resample_img
    except ImportError:
        raise RuntimeError(
            "nilearn not found on your system; can't do resampling!")

    # sanity
    if output_filename is None:
        output_filename = os.path.join(
            os.path.dirname(input_img_filename),
            "resample_" + os.path.basename(input_img_filename))

    # prepare for smart-caching
    output_dir = os.path.dirname(output_filename)
    cache_dir = os.path.join(output_dir, "resample_img_cache")
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    mem = joblib.Memory(cachedir=cache_dir, verbose=5)

    # resample input img to new resolution
    resampled_img = mem.cache(ni_resample_img)(
        input_img_filename, target_affine=np.diag(new_vox_dims))

    # save resampled img
    nibabel.save(resampled_img, output_filename)

    return output_filename
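
# Hypothetical call (the filename and voxel sizes are assumptions): resample
# to 3 mm isotropic; repeated calls with the same arguments hit the joblib
# cache under resample_img_cache/ instead of recomputing.
resampled_path = resample_img("anat.nii", new_vox_dims=(3., 3., 3.))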
def transform(self, X, y=None):
    """Extract features from the array X.

    Parameters
    ----------
    X : ndarray, shape (n_epochs, n_channels, n_times)

    y : None
        Only for compatibility with :class:`~sklearn.pipeline.Pipeline`.

    Returns
    -------
    Xnew : ndarray, shape (n_epochs, n_features)
        Extracted features.
    """
    mem = joblib.Memory(cachedir=self.memory)
    _extractor = mem.cache(extract_features)
    return _extractor(X, self.sfreq, self.selected_funcs,
                      funcs_params=self.params, n_jobs=self.n_jobs)
def transform(self, X):
    """Extract features from the array X.

    Parameters
    ----------
    X : ndarray, shape (n_epochs, n_channels, n_times)

    Returns
    -------
    Xnew : ndarray, shape (n_epochs, n_features)
        Extracted features.
    """
    mem = joblib.Memory(location=self.memory)
    _extractor = mem.cache(extract_features)
    return _extractor(X, self.sfreq, self.selected_funcs,
                      funcs_params=self.params, n_jobs=self.n_jobs)
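
# The two transform() variants above differ only in the joblib.Memory keyword:
# `cachedir` (joblib < 0.12, since deprecated) vs `location` (joblib >= 0.12).
# A minimal, self-contained sketch of the pattern they share; the function and
# data below are illustrative assumptions, not from the source:
import numpy as np
import joblib

mem = joblib.Memory(location="/tmp/feature_cache", verbose=0)


@mem.cache
def extract_mean_feature(X):
    # stand-in for an expensive feature extraction
    return X.mean(axis=-1)


X = np.random.rand(5, 4, 100)    # (n_epochs, n_channels, n_times)
feats = extract_mean_feature(X)  # computed once
feats = extract_mean_feature(X)  # served from the on-disk cache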
def do_3Dto4D_merge(threeD_img_filenames, output_dir=None,
                    output_filename=None):
    """Produces a single 4D nifti image from several 3D images.

    threeD_img_filenames: list of strings
        paths to images to be merged

    Returns
    -------
    returns nifti image object

    """
    if isinstance(threeD_img_filenames, _basestring):
        return nibabel.load(threeD_img_filenames)

    if output_dir is None:
        output_dir = tempfile.mkdtemp()

    # prepare for smart caching
    merge_cache_dir = os.path.join(output_dir, "merge")
    if not os.path.exists(merge_cache_dir):
        os.makedirs(merge_cache_dir)
    merge_mem = joblib.Memory(cachedir=merge_cache_dir, verbose=5)

    # merging proper
    fourD_img = merge_mem.cache(nibabel.concat_images)(
        threeD_img_filenames, check_affines=False)

    # sanity: drop the singleton 4th axis, if any
    if len(fourD_img.shape) == 5:
        fourD_img = nibabel.Nifti1Image(
            fourD_img.get_data()[..., 0, :], fourD_img.get_affine())

    # save image to disk
    if output_filename is not None:
        merge_mem.cache(nibabel.save)(fourD_img, output_filename)

    return fourD_img
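
# Hypothetical usage (the filenames are assumptions): merge per-volume 3D
# NIfTIs into one 4D image, with the expensive concatenation step cached.
fourD = do_3Dto4D_merge(["vol_000.nii", "vol_001.nii", "vol_002.nii"],
                        output_filename="bold_4d.nii")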
# display figures in the notebook
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_digits
from sklearn.externals import joblib
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD
from keras.utils.np_utils import to_categorical

m = joblib.Memory(cachedir='/tmp')


@m.cache()
def make_curves(random_state=42):
    digits = load_digits()
    rng = np.random.RandomState(random_state)
    data = np.asarray(digits.data, dtype='float32')
    target = np.asarray(digits.target, dtype='int32')
    # Add noise in the labels to cause more overfitting
    target[:200] = rng.randint(0, 10, size=200)
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, test_size=0.15, random_state=random_state)
    # mean = 0 ; standard deviation = 1.0
def generate_segmentation_thumbnails(
        normalized_files, output_dir, subject_gm_file=None,
        subject_wm_file=None, subject_csf_file=None, only_native=False,
        brain='func', comments="", execution_log_html_filename=None,
        cmap=None, tooltip=None, results_gallery=None):
    """Generates thumbnails after indirect normalization
    (segmentation + normalization)

    Parameters
    ----------
    normalized_files: list
        paths to normalized images (3Ds or 4Ds)

    output_dir: string
        dir to which all output will be written

    subject_gm_file: string (optional)
        path to subject GM file

    subject_wm_file: string (optional)
        path to subject WM file

    subject_csf_file: string (optional)
        path to subject CSF file

    brain: string (optional)
        a short comment/tag like 'epi', or 'anat'

    cmap: optional
        cmap (color map) to use for plots

    results_gallery: ResultsGallery instance (optional)
        gallery to which thumbnails will be committed

    """
    if isinstance(normalized_files, _basestring):
        normalized_file = normalized_files
    else:
        mean_normalized_file = os.path.join(output_dir, "%s.nii" % brain)
        compute_mean_3D_image(normalized_files,
                              output_filename=mean_normalized_file)
        normalized_file = mean_normalized_file
    output = {}

    # prepare for smart caching
    qa_cache_dir = os.path.join(output_dir, "QA")
    if not os.path.exists(qa_cache_dir):
        os.makedirs(qa_cache_dir)
    qa_mem = joblib.Memory(cachedir=qa_cache_dir, verbose=5)

    thumb_desc = "Segmentation of %s " % brain
    if execution_log_html_filename:
        thumb_desc += (" (<a href=%s>see execution "
                       "log</a>)") % (
            os.path.basename(execution_log_html_filename))
    _brain = "(%s) %s" % (comments, brain) if comments else brain

    # plot contours of template compartments on subject's brain
    if not only_native:
        template_compartments_contours = os.path.join(
            output_dir, "template_tpms_contours_on_%s.png" % _brain)
        template_compartments_contours_axial = os.path.join(
            output_dir,
            "template_compartments_contours_on_%s_axial.png" % _brain)
        qa_mem.cache(plot_segmentation)(
            normalized_file, GM_TEMPLATE, wm_filename=WM_TEMPLATE,
            csf_filename=CSF_TEMPLATE, display_mode='z', cmap=cmap,
            output_filename=template_compartments_contours_axial,
            title="template TPMs", close=True)
        qa_mem.cache(plot_segmentation)(
            normalized_file, gm_filename=GM_TEMPLATE,
            wm_filename=WM_TEMPLATE, csf_filename=CSF_TEMPLATE,
            output_filename=template_compartments_contours, cmap=cmap,
            close=True,
            title=("Template GM, WM, and CSF TPM contours on "
                   "subject's %s") % _brain)

        # create thumbnail
        if results_gallery:
            thumbnail = Thumbnail(tooltip=tooltip)
            thumbnail.a = a(
                href=os.path.basename(template_compartments_contours))
            thumbnail.img = img(
                src=os.path.basename(template_compartments_contours),
                height="250px")
            thumbnail.description = thumb_desc
            results_gallery.commit_thumbnails(thumbnail)
        output['axial'] = template_compartments_contours_axial

    # plot contours of subject's compartments on subject's brain
    if subject_gm_file:
        subject_compartments_contours = os.path.join(
            output_dir, "subject_tpms_contours_on_subject_%s.png" % _brain)
        subject_compartments_contours_axial = os.path.join(
            output_dir,
            "subject_tpms_contours_on_subject_%s_axial.png" % _brain)
        qa_mem.cache(plot_segmentation)(
            normalized_file, subject_gm_file, wm_filename=subject_wm_file,
            csf_filename=subject_csf_file, display_mode='z', cmap=cmap,
            output_filename=subject_compartments_contours_axial,
            close=True, title="subject TPMs")
        title_prefix = "Subject's GM"
        if subject_wm_file:
            title_prefix += ", WM"
        if subject_csf_file:
            title_prefix += ", and CSF"
        qa_mem.cache(plot_segmentation)(
            normalized_file, subject_gm_file, wm_filename=subject_wm_file,
            csf_filename=subject_csf_file, cmap=cmap, close=True,
            output_filename=subject_compartments_contours,
            title=("%s TPM contours on "
                   "subject's %s") % (title_prefix, _brain))

        # create thumbnail
        if results_gallery:
            thumbnail = Thumbnail(tooltip=tooltip)
            thumbnail.a = a(
                href=os.path.basename(subject_compartments_contours))
            thumbnail.img = img(
                src=os.path.basename(subject_compartments_contours),
                height="250px")
            thumbnail.description = thumb_desc
            results_gallery.commit_thumbnails(thumbnail)
        if only_native:
            output['axial'] = subject_compartments_contours_axial

    return output
def generate_registration_thumbnails(
        target, source, procedure_name, output_dir, tooltip=None,
        execution_log_html_filename=None, results_gallery=None):
    """Generates QA thumbnails post-registration.

    Parameters
    ----------
    target: tuple of length 2
        target[0]: string
            path to reference image used in the registration
        target[1]: string
            short name (e.g 'anat', 'epi', 'MNI', etc.) for the
            reference image

    source: tuple of length 2
        source[0]: string
            path to source image
        source[1]: string
            short name (e.g 'anat', 'epi', 'MNI', etc.) for the
            source image

    procedure_name: string
        name of, or short comments on, the registration procedure used
        (e.g 'anat ==> func', etc.)

    """
    output = {}

    # prepare for smart caching
    qa_cache_dir = os.path.join(output_dir, "QA")
    if not os.path.exists(qa_cache_dir):
        os.makedirs(qa_cache_dir)
    qa_mem = joblib.Memory(cachedir=qa_cache_dir, verbose=5)

    thumb_desc = procedure_name
    if execution_log_html_filename:
        thumb_desc += " (<a href=%s>see execution log</a>)" % (
            os.path.basename(execution_log_html_filename))

    # plot outline (edge map) of template on the normalized image
    outline = os.path.join(
        output_dir, "%s_on_%s_outline.png" % (target[1], source[1]))
    qa_mem.cache(plot_registration)(
        target[0], source[0], output_filename=outline, close=True,
        title="Outline of %s on %s" % (target[1], source[1]))

    # create thumbnail
    if results_gallery:
        thumbnail = Thumbnail(tooltip=tooltip)
        thumbnail.a = a(href=os.path.basename(outline))
        thumbnail.img = img(src=os.path.basename(outline), height="250px")
        thumbnail.description = thumb_desc
        results_gallery.commit_thumbnails(thumbnail)

    # plot outline (edge map) of the normalized image
    # on the SPM MNI template
    source, target = (target, source)
    outline = os.path.join(
        output_dir, "%s_on_%s_outline.png" % (target[1], source[1]))
    outline_axial = os.path.join(
        output_dir, "%s_on_%s_outline_axial.png" % (target[1], source[1]))
    qa_mem.cache(plot_registration)(
        target[0], source[0], output_filename=outline_axial, close=True,
        display_mode='z',
        title="Outline of %s on %s" % (target[1], source[1]))
    output['axial'] = outline_axial
    qa_mem.cache(plot_registration)(
        target[0], source[0], output_filename=outline, close=True,
        title="Outline of %s on %s" % (target[1], source[1]))

    # create thumbnail
    if results_gallery:
        thumbnail = Thumbnail(tooltip=tooltip)
        thumbnail.a = a(href=os.path.basename(outline))
        thumbnail.img = img(src=os.path.basename(outline), height="250px")
        thumbnail.description = thumb_desc
        results_gallery.commit_thumbnails(thumbnail)

    return output
from sklearn.model_selection import (cross_val_predict, cross_val_score,
                                     train_test_split, GridSearchCV)
from sklearn.datasets import load_svmlight_file
from sklearn.decomposition import PCA, LatentDirichletAllocation
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, \
    BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
import joblib

from easyml.utils import util

memory = joblib.Memory("./mycache")


@memory.cache
def get_data(file_name):
    data = load_svmlight_file(file_name)
    return data[0], data[1]


def arff2svm(arff_files):
    svm_files = []
    for arff_file in arff_files:
        name = arff_file[0:arff_file.rindex('.')]
        tpe = arff_file[arff_file.rindex('.') + 1:]
        svm_file = name + ".libsvm"
        svm_files.append(svm_file)
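
# Hypothetical usage of the cached loader above (the file path is an
# assumption): the parsed sparse matrix and labels are memoized under
# ./mycache, so re-runs skip the libsvm parsing entirely.
X, y = get_data("data/train.libsvm")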
import gzip
import os
from urllib import request

import lightgbm as lgbm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection
from sklearn import tree
from sklearn.externals import joblib
import starboost as sb

HERE = './'
URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz'

m = joblib.Memory(location='/tmp', mmap_mode='r')

MAX_DEPTH = 3
N_ESTIMATORS = 100
LEARNING_RATE = 0.1
SUBSAMPLE = 50000


@m.cache
def load_data():
    filename = os.path.join(HERE, URL.rsplit('/', 1)[-1])

    if not os.path.exists(filename):
        print(f'Downloading {URL} to {filename}...')
        request.urlretrieve(URL, filename)

    print(f'Parsing {filename}...')
    with gzip.GzipFile(filename) as f:
# We can combine our feature extraction, selection and final SVC in one step
svc = LinearSVC()
pipeline = Pipeline([('vectorize', vectorizer), ('select', selector),
                     ('svc', svc)])
cross_val_score(pipeline, X, y, verbose=3)
# [CV] no parameters to be set .........................................
# [CV] ................ no parameters to be set, score=0.888212 -   4.2s
# [Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    4.2s
# [CV] no parameters to be set .........................................
# [CV] ................ no parameters to be set, score=0.891068 -   4.2s
# [CV] no parameters to be set .........................................
# [CV] ................ no parameters to be set, score=0.888741 -   4.4s
# [Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   12.8s finished

# Parameter selection - the Pipeline object exposes the parameters of the
# estimators it wraps with the following convention:
# name of the estimator, __, name of the parameter
pipeline.set_params(svc__C=10)  # set SVC's C parameter

# Choosing parameters by cross-validation may imply running transformers
# many times on the same data with the same parameters. This overhead can be
# avoided by using joblib's memory
memory = joblib.Memory(cachedir='.')
memory.clear()
selector.score_func = memory.cache(selector.score_func)

# GridSearchCV - use grid search to choose the best C among 3 values
grid = GridSearchCV(estimator=pipeline,
                    param_grid=dict(svc__C=[1e-2, 1, 1e2]))
grid.fit(X, y)
print(grid.best_estimator_.named_steps['svc'])
import sys

import scipy
import numpy as np
from model import fit
from sklearn.externals import joblib

mem = joblib.Memory('mem')


def load_image(fn, xshift=0, yshift=0, downsample=8, transpose=False):
    x = scipy.misc.imread(fn).astype('float32') / 255.
    m = int(min(x.shape[:2]) / 2)
    xc, yc = [int(s / 2) for s in x.shape[:2]]
    xc += xshift
    yc += yshift
    y = x[xc - m:xc + m, yc - m:yc + m, :]
    if downsample:
        y = y[::downsample, ::downsample, :]
    if transpose:
        y = y.transpose((2, 0, 1))
    assert y.shape[0] == y.shape[1]
    # grey = y.mean(axis=0)
    # flatten = np.ravel(grey)
    return y


@mem.cache
def load_all(fns):
    X = np.array([load_image(fn, yshift=-150) for fn in fns])
    return X
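
# Hypothetical usage (the filenames are assumptions): joblib hashes the
# filename list, so a repeated call with the same list is read back from the
# 'mem' cache directory instead of re-decoding every image.
X = load_all(["frame_000.png", "frame_001.png"])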
PROJECT_DIR = os.path.normpath(os.path.join(os.path.dirname(__file__),
                                            os.pardir, os.pardir))
RAW_DATA_DIR = os.path.join(PROJECT_DIR, 'data', 'raw')
RAW_DATA_DIR_DUMMY = os.path.join(PROJECT_DIR, 'data', 'raw_dummy')
DATA_FILE_NAMES = dict(results='parameters_and_results.h5',
                       fields='field_data_{}_{}.h5')

# Helper to switch between real and dummy data
_USE_DUMMIES = {'do': False}

# Helper to toggle cache usage
_USE_CACHE = {'do': False}

# A global joblib cache to persist output of functions.
MEMORY = joblib.Memory(cachedir=os.path.abspath('cache'), verbose=0)


def format_time(t):
    """Returns a well formatted time string."""
    return str(timedelta(seconds=t))


def set_dummy_mode(use_dummies, verbose=True):
    """Convenience function to toggle dummy mode.

    Parameters
    ----------
    use_dummies : bool
        Whether to use dummy data.
import logging
from pathlib import Path

import sklearn.datasets
from sklearn.externals import joblib

location = Path(__file__).resolve().parent.parent / '.cache'
location = str(location)
mem = joblib.Memory(location=location, verbose=logging.DEBUG)


@mem.cache
def load_svmlight_file(*args, **kwargs):
    return sklearn.datasets.load_svmlight_file(*args, **kwargs)
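
# Hypothetical call (the path is an assumption): behaves exactly like
# sklearn.datasets.load_svmlight_file, but memoized under ../.cache.
X, y = load_svmlight_file("train.svmlight")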
def main(inputs, infile_estimator, infile1, infile2, outfile_result,
         outfile_object=None, groups=None):
    """
    Parameters
    ----------
    inputs : str
        File path to galaxy tool parameter

    infile_estimator : str
        File path to estimator

    infile1 : str
        File path to dataset containing features

    infile2 : str
        File path to dataset containing target values

    outfile_result : str
        File path to save the results, either cv_results or test result

    outfile_object : str, optional
        File path to save searchCV object

    groups : str
        File path to dataset containing groups labels
    """
    warnings.simplefilter('ignore')

    with open(inputs, 'r') as param_handler:
        params = json.load(param_handler)
    if groups:
        (params['search_schemes']['options']['cv_selector']
         ['groups_selector']['infile_g']) = groups

    params_builder = params['search_schemes']['search_params_builder']

    input_type = params['input_options']['selected_input']
    if input_type == 'tabular':
        header = 'infer' if params['input_options']['header1'] else None
        column_option = (params['input_options']['column_selector_options_1']
                         ['selected_column_selector_option'])
        if column_option in ['by_index_number', 'all_but_by_index_number',
                             'by_header_name', 'all_but_by_header_name']:
            c = params['input_options']['column_selector_options_1']['col1']
        else:
            c = None
        X = read_columns(infile1, c=c, c_option=column_option, sep='\t',
                         header=header, parse_dates=True).astype(float)
    else:
        X = mmread(open(infile1, 'r'))

    header = 'infer' if params['input_options']['header2'] else None
    column_option = (params['input_options']['column_selector_options_2']
                     ['selected_column_selector_option2'])
    if column_option in ['by_index_number', 'all_but_by_index_number',
                         'by_header_name', 'all_but_by_header_name']:
        c = params['input_options']['column_selector_options_2']['col2']
    else:
        c = None
    y = read_columns(infile2, c=c, c_option=column_option, sep='\t',
                     header=header, parse_dates=True)
    y = y.ravel()

    optimizer = params['search_schemes']['selected_search_scheme']
    optimizer = getattr(model_selection, optimizer)

    options = params['search_schemes']['options']

    splitter, groups = get_cv(options.pop('cv_selector'))
    options['cv'] = splitter
    options['n_jobs'] = N_JOBS
    primary_scoring = options['scoring']['primary_scoring']
    options['scoring'] = get_scoring(options['scoring'])
    if options['error_score']:
        options['error_score'] = 'raise'
    else:
        options['error_score'] = np.NaN
    if options['refit'] and isinstance(options['scoring'], dict):
        options['refit'] = primary_scoring
    if 'pre_dispatch' in options and options['pre_dispatch'] == '':
        options['pre_dispatch'] = None

    with open(infile_estimator, 'rb') as estimator_handler:
        estimator = load_model(estimator_handler)

    memory = joblib.Memory(location=CACHE_DIR, verbose=0)
    # cache iraps_core fits could increase search speed significantly
    if estimator.__class__.__name__ == 'IRAPSClassifier':
        estimator.set_params(memory=memory)
    else:
        for p, v in estimator.get_params().items():
            if p.endswith('memory'):
                if len(p) > 8 and p[:-8].endswith('irapsclassifier'):
                    # cache iraps_core fits could increase search
                    # speed significantly
                    new_params = {p: memory}
                    estimator.set_params(**new_params)
                elif v:
                    new_params = {p: None}
                    estimator.set_params(**new_params)
            elif p.endswith('n_jobs'):
                new_params = {p: 1}
                estimator.set_params(**new_params)

    param_grid = _eval_search_params(params_builder)
    searcher = optimizer(estimator, param_grid, **options)

    # do train_test_split
    do_train_test_split = params['train_test_split'].pop('do_split')
    if do_train_test_split == 'yes':
        # make sure refit is chosen
        if not options['refit']:
            raise ValueError("Refit must be `True` for shuffle splitting!")

        split_options = params['train_test_split']

        # splits
        if split_options['shuffle'] == 'stratified':
            split_options['labels'] = y
            X, X_test, y, y_test = train_test_split(X, y, **split_options)
        elif split_options['shuffle'] == 'group':
            if not groups:
                raise ValueError("No group based CV option was "
                                 "chosen for group shuffle!")
            split_options['labels'] = groups
            X, X_test, y, y_test, groups, _ = \
                train_test_split(X, y, **split_options)
        else:
            if split_options['shuffle'] == 'None':
                split_options['shuffle'] = None
            X, X_test, y, y_test = \
                train_test_split(X, y, **split_options)
    # end train_test_split

    if options['error_score'] == 'raise':
        searcher.fit(X, y, groups=groups)
    else:
        warnings.simplefilter('always', FitFailedWarning)
        with warnings.catch_warnings(record=True) as w:
            try:
                searcher.fit(X, y, groups=groups)
            except ValueError:
                pass
            for warning in w:
                print(repr(warning.message))

    if do_train_test_split == 'no':
        # save results
        cv_results = pandas.DataFrame(searcher.cv_results_)
        cv_results = cv_results[sorted(cv_results.columns)]
        cv_results.to_csv(path_or_buf=outfile_result, sep='\t',
                          header=True, index=False)
    # output test result using best_estimator_
    else:
        best_estimator_ = searcher.best_estimator_
        if isinstance(options['scoring'], collections.Mapping):
            is_multimetric = True
        else:
            is_multimetric = False
        test_score = _score(best_estimator_, X_test, y_test,
                            options['scoring'],
                            is_multimetric=is_multimetric)
        if not is_multimetric:
            test_score = {primary_scoring: test_score}
        for key, value in test_score.items():
            test_score[key] = [value]
        result_df = pandas.DataFrame(test_score)
        result_df.to_csv(path_or_buf=outfile_result, sep='\t',
                         header=True, index=False)

    memory.clear(warn=False)

    if outfile_object:
        with open(outfile_object, 'wb') as output_handler:
            pickle.dump(searcher, output_handler, pickle.HIGHEST_PROTOCOL)
def make_lens_catalog(args):
    """
    NAME
        make_lens_catalog

    PURPOSE
        Given location of collection pickle, this script produces a set of
        annotated images of lenses (heatmaps for lens locations, markers
        for where clicks were, etc).

    COMMENTS
        You have to download the file, so choose wherever your output
        directory is to also download the raw images. This should be
        pretty customizable.

    FLAGS
        -h              Print this message
        --skill         Weight by skill

    INPUTS
        collection.pickle

    OUTPUTS
        lens.dat
            Assumed format:
            ID   kind   x   y   Prob   N0   Skill   Dist

            Here:
            ID = Space Warps subject ID
            kind = Space Warps subject type (sim, dud, test)
            x, y = object (cluster) centroid, in pixels
            P = Space Warps subject probability
            N0 = number of markers in the cluster
            S = total skill per cluster, summed over markers
            D = biggest distance within cluster

    EXAMPLE

    BUGS

    AUTHORS
        This file is part of the Space Warps project, and is distributed
        under the GPL v2 by the Space Warps Science Team.
        http://spacewarps.org/

    HISTORY
        2013-07-16  started Davis (KIPAC)
    """
    # ------------------------------------------------------------------
    # Some defaults:
    flags = {'skill': False,
             'output_directory': './',
             'output_name': 'catalog.dat',
             'image_y_size': 440,
             'catalog_path': '',
             'update_collection': ''}

    # ------------------------------------------------------------------
    # Read in options:
    # this has to be easier to do...
    for arg in args:
        if arg in flags:
            flags[arg] = args[arg]
        elif arg == 'collection_path':
            collection_path = args[arg]
        else:
            print("make_lens_atlas: unrecognized flag", arg)

    print("make_lens_catalog: illustrating behaviour captured in "
          "collection file:")
    print("make_lens_catalog:", collection_path)

    memory = joblib.Memory(cachedir=flags['output_directory'])
    memory.clear()

    catalog_path = flags['output_directory'] + flags['output_name']
    if len(flags['output_name']) > 0:
        F = open(catalog_path, 'w')
        F.write('id,kind,x,y,prob,n0,skill,dist\n')

    # ------------------------------------------------------------------
    # Read in files:

    collection = swap.read_pickle(collection_path, 'collection')
    ID_list = collection.list()
    print("make_lens_catalog: collection numbers", len(ID_list))

    if flags['catalog_path'] != '':
        print("make_lens_catalog: filtering from catalog",
              flags['catalog_path'])
        catalog_in = csv2rec(flags['catalog_path'])
        ID_list = np.unique(catalog_in['id'])

    # ------------------------------------------------------------------
    # Run through data:

    catalog = {}
    for ID in ID_list:
        subject = collection.member[ID]
        kind = subject.kind
        P = subject.mean_probability

        itwas = subject.annotationhistory['ItWas']
        x_all = subject.annotationhistory['At_X']
        y_all = subject.annotationhistory['At_Y']

        x_markers = np.array([xi for xj in x_all for xi in xj])
        y_markers = np.array([yi for yj in y_all for yi in yj])

        catalog.update({ID: {'agents_reject': [],
                             'x': x_markers,
                             'y': y_markers}})

        PL_all = subject.annotationhistory['PL']
        PD_all = subject.annotationhistory['PD']

        # filter out the empty clicks
        PL_list = []
        PL_nots = []
        for i, xj in enumerate(x_all):
            # len(xj) of empty = 0
            PL_list.append([PL_all[i]] * len(xj))
            if len(xj) == 0:
                PL_nots.append(PL_all[i])
        PL = np.array([PLi for PLj in PL_list for PLi in PLj])
        PL_nots = np.array(PL_nots)

        # filter out the empty clicks
        PD_list = []
        PD_nots = []
        for i, xj in enumerate(x_all):
            PD_list.append([PD_all[i]] * len(xj))
            if len(xj) == 0:
                PD_nots.append(PD_all[i])
                catalog[ID]['agents_reject'].append(i)
        PD = np.array([PDi for PDj in PD_list for PDi in PDj])
        PD_nots = np.array(PD_nots)

        skill = swap.expectedInformationGain(0.5, PL, PD)  # skill

        # it is only fair to write out the NOTs, too
        # do the empty guys
        skill_nots = swap.expectedInformationGain(0.5, PL_nots,
                                                  PD_nots)  # skill
        x, y = -1, -1
        N0 = len(skill_nots)
        S = np.sum(skill_nots)
        D = 0

        ## catalog.append((ID, kind, x, y, P, N0, S, D))
        if len(catalog) % 500 == 0:
            print(len(catalog))
        if len(flags['output_name']) > 0:
            F.write('{0},{1},{2},{3},{4},{5},{6},{7}\n'.format(
                ID, kind, x, y, P, N0, S, D))

        if len(x_markers) == 0:
            # apparently everyone was a not...
            catalog[ID]['agents_labels'] = np.array([])
            continue

        # ------------------------------------------------------------------
        # cluster
        print('make_lens_catalog: subject ID =', ID)
        if flags['skill']:
            cluster_centers, cluster_center_labels, cluster_labels, \
                n_clusters, dist_within = outlier_clusters(
                    x_markers, y_markers, skill, memory=memory)
        else:
            cluster_centers, cluster_center_labels, cluster_labels, \
                n_clusters, dist_within = outlier_clusters(
                    x_markers, y_markers, None, memory=memory)
        # need to get: x, y, N0, S
        catalog[ID]['agents_labels'] = cluster_labels

        for cluster_center_label in cluster_center_labels:
            cluster_center = cluster_centers[cluster_center_label]
            members = (cluster_labels == cluster_center_label)

            x, y = cluster_center
            # convert y to catalog convention
            y = flags['image_y_size'] - y
            N0 = np.sum(members)
            S = np.sum(skill[members])
            D = dist_within[cluster_center_label]
            if cluster_center_label == -1:
                # outlier cluster
                # so really every point is its own cluster...
                D = 0

            ## catalog.append((ID, kind, x, y, P, N0, S, D))
            ## if len(catalog) % 500 == 0:
            ##     print(len(catalog))
            # TODO: make some requirement to be included (exclude outliers)
            if len(flags['output_name']) > 0:
                F.write('{0},{1},{2},{3},{4},{5},{6},{7}\n'.format(
                    ID, kind, x, y, P, N0, S, D))

    print('make_lens_catalog: Clearing memory')
    # clear memory
    memory.clear()

    if len(flags['output_name']) > 0:
        print('make_lens_catalog: closing file!')
        F.close()

    if len(flags['update_collection']) > 0:
        print('make_lens_catalog: writing updated collection to',
              flags['update_collection'])
        # TODO: get the other params correct!!!!
        collection_fat = swap.collection.Collection()
        for ID in catalog:
            subject = collection.member[ID]
            atx = subject.annotationhistory['At_X']
            labels_in = list(catalog[ID]['agents_labels'])
            labels_fat = []
            for atx_i in atx:
                labels_fat.append([])
                for atx_ij in atx_i:
                    labels_fat[-1].append(labels_in.pop(0))
            subject.annotationhistory.update({'labels': labels_fat})
            collection_fat.member.update({ID: subject})
        swap.write_pickle(collection_fat, flags['update_collection'])

    print('make_lens_catalog: All done!')
    return catalog
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import os

import numpy as np
import pandas as pd
import seaborn as sns
import time
from sklearn.externals import joblib
from sklearn.pipeline import make_pipeline, make_union
from sklearn.svm import OneClassSVM

from ..base import BASE_PATH
from .. import data_sets, feature_extraction

_file_memory = joblib.Memory(cachedir=os.path.join(BASE_PATH, 'cache'))


@_file_memory.cache
def _build_and_fit(ds_url: str) -> pd.DataFrame:
    result_list = []
    normal_list, anomalous_list = data_sets.get(ds_url)
    for i in range(1, 5):
        n = 10 ** i
        print('ds_url {} | n {:9,d}'.format(ds_url, n))
        train_list = []
        while len(train_list) < n:
            train_list += normal_list
def fit(self, X, y):
    super(LogExpPipeline, self).fit(X, np.log1p(y))

def predict(self, X):
    return np.expm1(super(LogExpPipeline, self).predict(X))


import backtest

bt = backtest.Backtest()
bt.init_candle()
bt.resetbacktest()
bt.index = bt.size
bt.updateIndicators()
truth = bt.df

memory = joblib.Memory(cachedir=".")

n = truth.shape[1]

#
# XGBoost model
#
xgb_params = {}
xgb_params['objective'] = 'reg:linear'
xgb_params['learning_rate'] = 0.001
xgb_params['max_depth'] = int(6.0002117448743721)
xgb_params['max_depth'] = 9
xgb_params['subsample'] = 0.72476106045336319
xgb_params['min_child_weight'] = int(4.998433055249718)
# xgb_params['colsample_bytree'] = 0.97058965304691203
# xgb_params['colsample_bylevel'] = 0.69302144647951536
xgb_params['reg_alpha'] = 0.59125639278096453
xgb_params['gamma'] = 0.11900602913417056
        if encoding == 'plain':
            pass
        elif encoding == 'gzip':
            data = StringIO(data)
            data = gzip.GzipFile(fileobj=data).read()
        else:
            raise RuntimeError('unknown encoding')
    else:
        with open(url, 'r') as fid:
            data = fid.read()

    return data


mem = joblib.Memory(cachedir='_build')
get_data = mem.cache(_get_data)


def parse_sphinx_searchindex(searchindex):
    """Parse a Sphinx search index

    Parameters
    ----------
    searchindex : str
        The Sphinx search index (contents of searchindex.js)

    Returns
    -------
    filenames : list of str
        The file names parsed from the search index.
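
# Hypothetical usage of the memoized wrapper above (the URL is an assumption):
# the fetched index is cached under _build, avoiding a re-download on
# repeated builds.
searchindex = get_data('https://example.org/searchindex.js')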
# You'll also need to install spaCy & run:
# python -m spacy download en_core_web_sm
import spacy
import os.path
import numpy as np
import pandas as pd
import string
import tqdm
import random
from sklearn.externals import joblib

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = spacy.load('en_core_web_sm')
translator = str.maketrans('', '', string.punctuation)
mem = joblib.Memory('cache')


@mem.cache
def textify(fn):
    docs = []
    with open(fn, 'r') as fh:
        for j, line in enumerate(fh):
            # Skip first fields
            splits = line.split(',')
            word = splits[1]
            definition = ','.join(splits[5:])
            definition = definition.replace('"', '').replace('\n', '')
            docs.append(definition + ' ' + word)
    return docs
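
# Hypothetical usage (the CSV path and column layout are assumptions): builds
# the list of "definition word" strings once, then reuses the cached result
# on subsequent runs.
docs = textify("data/definitions.csv")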