Example No. 1
def extract_group_components(subject_components, variances,
                ccs_threshold=None, n_group_components=None, 
                cachedir=None):
    # Use asarray to cast to a non memmapped array
    subject_components = np.asarray(subject_components)
    if len(subject_components) == 1:
        # We are in a single subject case
        return subject_components[0, :n_group_components].T, \
                variances[0][:n_group_components]

    # The group components (concatenated subject components)
    group_components = subject_components.T
    group_components = np.reshape(group_components,
                                    (group_components.shape[0], -1))
    # Save memory
    del subject_components

    # Inter-subject CCA
    memory = Memory(cachedir=cachedir, mmap_mode='r')
    svd = memory.cache(linalg.svd)
    cca_maps, ccs, _ = svd(group_components, full_matrices=False)
    # Save memory
    del group_components
    if n_group_components is None:
        n_group_components = np.argmin(ccs > ccs_threshold)
    cca_maps = cca_maps[:, :n_group_components]
    ccs = ccs[:n_group_components]
    return cca_maps, ccs
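A minimal sketch of the caching pattern the example above relies on, with a made-up array and cache directory: joblib's Memory wraps SciPy's SVD so a repeated call with the same input is read back from disk instead of recomputed.

import numpy as np
from scipy import linalg
from joblib import Memory

memory = Memory('./joblib_cache', verbose=0)  # older joblib versions take cachedir= instead of a positional location
cached_svd = memory.cache(linalg.svd)

X = np.random.RandomState(0).rand(100, 50)
U, s, Vt = cached_svd(X, full_matrices=False)     # computed and written to the cache
U2, s2, Vt2 = cached_svd(X, full_matrices=False)  # identical call, loaded from the cache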
Example No. 2
    def __init__(
        self,
        hmm=None,
        n_components=16, covariance_type='diag',
        min_duration=0.250,
        feature=None, cache=False
    ):

        super(SpeechActivityDetection, self).__init__()

        self.hmm = hmm
        self.hmm.min_duration = min_duration

        # default features for speech activity detection
        # are MFCC (12 coefficients + delta coefficient + delta energy)
        if feature is None:
            from pyannote.feature.yaafe import YaafeMFCC
            feature = YaafeMFCC(e=False, coefs=12, De=True, D=True)
        self.feature = feature

        if cache:

            # initialize cache
            from joblib import Memory
            from tempfile import mkdtemp
            memory = Memory(cachedir=mkdtemp(), verbose=0)

            # cache feature extraction method
            self.get_features = memory.cache(self.get_features)
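Several of these snippets rebind a bound method to its cached wrapper; here is a stripped-down, hypothetical sketch of the trick (FeatureExtractor and its get_features body are illustrative, not from pyannote):

from tempfile import mkdtemp
from joblib import Memory

class FeatureExtractor(object):

    def __init__(self, cache=False):
        if cache:
            memory = Memory(mkdtemp(), verbose=0)
            # replace the bound method with its memoized wrapper
            self.get_features = memory.cache(self.get_features)

    def get_features(self, path):
        # stand-in for an expensive feature extraction step
        return {'path': path, 'n_coefs': 12}

extractor = FeatureExtractor(cache=True)
features = extractor.get_features('audio.wav')  # computed once, then served from the cache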
Example No. 3
    def __init__(
        self,
        gmm_ubm,
        feature=None, cache=False
    ):

        super(SpeakerIdentification, self).__init__()

        self.gmm_ubm = gmm_ubm

        # default features for speaker identification are MFCC
        # 13 coefs + delta coefs  + delta delta coefs
        #          + delta energy + delta delta energy
        if feature is None:
            from pyannote.feature.yaafe import YaafeMFCC
            feature = YaafeMFCC(
                e=False, De=True, DDe=True,
                coefs=13, D=True, DD=True
            )
        self.feature = feature

        if cache:

            # initialize cache
            from joblib import Memory
            from tempfile import mkdtemp
            memory = Memory(cachedir=mkdtemp(), verbose=0)

            # cache feature extraction method
            self.get_features = memory.cache(self.get_features)
Example No. 4
def load_adni_longitudinal_rs_fmri(dirname='ADNI_longitudinal_rs_fmri',
                                   prefix='wr*.nii'):
    """ Returns paths of ADNI rs-fMRI
    """

    # get file paths and description
    images, subject_paths, description = _get_subjects_and_description(
        base_dir=dirname, prefix='I[0-9]*')
    images = np.array(images)
    # get func files
    func_files = list(map(lambda x: _glob_subject_img(
        x, suffix='func/' + prefix, first_img=True),
                     subject_paths))
    func_files = np.array(func_files)

    # get motion files
    # motions = None
    motions = list(map(lambda x: _glob_subject_img(
        x, suffix='func/' + 'rp_*.txt', first_img=True), subject_paths))

    # get phenotype from csv
    dx = pd.read_csv(os.path.join(_get_data_base_dir('ADNI_csv'),
                                  'DXSUM_PDXCONV_ADNIALL.csv'))
    roster = pd.read_csv(os.path.join(_get_data_base_dir('ADNI_csv'),
                                      'ROSTER.csv'))
    df = description[description['Image_ID'].isin(images)]
    df = df.sort_values(by='Image_ID')
    dx_group = np.array(df['DX_Group'])
    subjects = np.array(df['Subject_ID'])
    exams = np.array(df['EXAM_DATE'])
    exams = [date(int(e[:4]), int(e[5:7]), int(e[8:])) for e in exams]

    # caching dataframe extraction functions
    CACHE_DIR = _get_cache_base_dir()
    cache_dir = os.path.join(CACHE_DIR, 'joblib', 'load_data_cache')
    if not os.path.isdir(cache_dir):
        os.makedirs(cache_dir)
    memory = Memory(cachedir=cache_dir, verbose=0)

    def _get_ridsfmri(subjects):
        return [_ptid_to_rid(s, roster) for s in subjects]
    rids = np.array(memory.cache(_get_ridsfmri)(subjects))

    def _get_examdatesfmri(rids):
        return [_get_dx(rids[i], dx, exams[i], viscode=None, return_code=True)
                for i in range(len(rids))]

    exam_dates = np.array(memory.cache(_get_examdatesfmri)(rids))

    def _get_viscodesfmri(rids):
        return [_get_vcodes(rids[i], str(exam_dates[i]), dx)
                for i in range(len(rids))]
    viscodes = np.array(memory.cache(_get_viscodesfmri)(rids))
    vcodes, vcodes2 = viscodes[:, 0], viscodes[:, 1]

    return Bunch(func=func_files, dx_group=dx_group, exam_codes=vcodes,
                 exam_dates=exam_dates, exam_codes2=vcodes2,
                 motion=motions,
                 subjects=subjects, images=images)
Example No. 5
 def construct_and_attach_filename_data(self):
     synsets = self.synset_list
     num_per_synset = self.data['num_per_synset']
     seed = self.data['seed']
     folder = self.local_home('PrecomputedDicts')
     mem = Memory(folder)
     compute_filename_dict = mem.cache(self.compute_filename_dict)
     filenames, filenames_dict = compute_filename_dict(synsets, num_per_synset, seed)
     self.filenames_dict = filenames_dict
Example No. 6
def add_caching_to_funcs(obj, funcNames):
	mem = Memory('../.add_caching_to_funcs', verbose=11)
	if obj is None or funcNames is None:
		return
	if isScalar(funcNames):
		funcNames = [funcNames]
	for name in funcNames:
		func = getattr(obj, name, None)
		if func is not None:
			setattr(obj, name, mem.cache(func))
Example No. 7
    def __init__(self):

        self.name = self.__class__.__name__

        try:
            from joblib import Memory
            mem = Memory(cachedir=self.home('cache'), verbose=False)
            self._get_meta = mem.cache(self._get_meta)
        except ImportError:
            pass
Example No. 8
    def _run_suject_level1_glm(subject_data_dir, subject_output_dir,
                               **kwargs):
        """
        Just another wrapper.

        """

        mem = Memory(os.path.join(subject_output_dir, "cache_dir"))
        return mem.cache(run_suject_level1_glm)(subject_data_dir,
                                                subject_output_dir,
                                                **kwargs)
Example No. 9
    def __init__(self, meta=None):
        if meta is not None:
            self._meta = meta

        self.name = self.__class__.__name__

        try:
            from joblib import Memory
            mem = Memory(cachedir=self.home('cache'))
            self._get_meta = mem.cache(self._get_meta)
        except ImportError:
            pass
Example No. 10
def main():
##    subsdir=r'E:\elan projects\L2\submissions\extracted'
##    dstdir=os.path.join(subsdir,r'passed')
##    copypassedfiles(dstdir,subsdir)
    dstdir=r'E:\elan projects\L2\resubmission\full'
    import glob
    jsonflist=glob.glob(dstdir+'\\'+r'*.379.json')

    mem = Memory(cachedir=dstdir)
    json2agreementmatrix_cached=mem.cache(json2agreementmatrix)

    c=json2agreementmatrix_cached(jsonflist,task_type='all')
    print(c)
Example No. 11
File: solver.py Project: amoliu/lfd
 def __init__(self, use_cache=True, cachedir=None):
     """Inits TpsSolverFactory
     
     Args:
         use_cache: whether to cache solver matrices in file
          cachedir: cache directory. If not specified, the .cache directory in the parent directory of the top-level package is used.
     """
     if use_cache:
         if cachedir is None:
             # .cache directory in parent directory of top-level package
             cachedir = os.path.join(__import__(__name__.split('.')[0]).__path__[0], os.path.pardir, ".cache")
         memory = Memory(cachedir=cachedir, verbose=0)
         self.get_solver_mats = memory.cache(self.get_solver_mats)
Example No. 12
    def _niigz2nii(self):
        """
        Convert .nii.gz to .nii (crucial for SPM).

        """

        cache_dir = os.path.join(self.output_dir, 'cache_dir')
        mem = Memory(cache_dir, verbose=100)

        self.func = mem.cache(do_niigz2nii)(self.func,
                                            output_dir=self.output_dir)
        if self.anat is not None:
            self.anat = mem.cache(do_niigz2nii)(self.anat,
                                                output_dir=self.output_dir)
Example No. 13
def load_adni_longitudinal_hippocampus_volume():
    """ Returns longitudinal hippocampus measures
    """

    BASE_DIR = _get_data_base_dir('ADNI_csv')

    roster = pd.read_csv(os.path.join(BASE_DIR, 'ROSTER.csv'))
    dx = pd.read_csv(os.path.join(BASE_DIR, 'DXSUM_PDXCONV_ADNIALL.csv'))
    fs = pd.read_csv(os.path.join(BASE_DIR, 'UCSFFSX51_05_20_15.csv'))

    # extract hippocampus numerical values
    column_idx = np.arange(131, 147)
    cols = ['ST' + str(c) + 'HS' for c in column_idx]
    hipp = fs[cols].values
    idx_num = np.array([~np.isnan(h).all() for h in hipp])
    hipp = hipp[idx_num, :]

    # extract roster id
    rids = fs['RID'].values[idx_num]

    # caching dataframe extraction functions
    CACHE_DIR = _get_cache_base_dir()
    cache_dir = os.path.join(CACHE_DIR, 'joblib', 'load_data_cache')
    if not os.path.isdir(cache_dir):
        os.makedirs(cache_dir)
    memory = Memory(cachedir=cache_dir, verbose=0)

    # get subject id
    def _getptidshippo(rids):
        return [_rid_to_ptid(rid, roster) for rid in rids]
    ptids = memory.cache(_getptidshippo)(rids)

    # extract exam date
    exams = fs['EXAMDATE'].values[idx_num]
    vcodes = fs['VISCODE'].values[idx_num]
    vcodes2 = fs['VISCODE2'].values[idx_num]
    exams = list(map(
        lambda e: date(int(e[:4]), int(e[5:7]), int(e[8:])), exams))
    exams = np.array(exams)

    # extract diagnosis
    def _getdxhippo(rids, exams):
        return np.array(list(map(_get_dx, rids, [dx]*len(rids), exams)))
    dx_ind = memory.cache(_getdxhippo)(rids, exams)
    dx_group = DX_LIST[dx_ind]

    return Bunch(dx_group=np.array(dx_group), subjects=np.array(ptids),
                 hipp=np.array(hipp), exam_dates=np.array(exams),
                 exam_codes=np.array(vcodes), exam_codes2=np.array(vcodes2))
Example No. 14
def _load_data(root_dir="/",
               data_set="ds107",
               cache_dir="/volatile/storage/workspace/parietal_retreat/" +
               "covariance_learn/cache/",
               n_jobs=1):
    from joblib import Memory
    mem = Memory(cachedir=cache_dir)
    load_data_ = mem.cache(setup_data_paths.run)

    df = setup_data_paths.get_all_paths(root_dir=root_dir, data_set=data_set)
    # region_signals = joblib.load(os.path.join(root_dir, dump_file))
    region_signals = load_data_(root_dir=root_dir, data_set=data_set,
                                n_jobs=n_jobs,
                                dump_dir=os.path.join(cache_dir, data_set))
    return df, region_signals
Example No. 15
    def __init__(self, data_same, normalize=True, min_max_scale=False,
            scale_f1=None, scale_f2=None,
            nframes=1, batch_size=1, marginf=0, only_same=False,
            cache_to_disk=False):
        self.print_mean_DTW_costs(data_same)
        self.ratio_same = 0.5  # init
        self.ratio_same = self.compute_ratio_speakers(data_same)
        self._nframes = nframes
        print "nframes:", self._nframes

        (self._x1, self._x2, self._y_word, self._y_spkr,
                self._scale_f1, self._scale_f2) = self.prep_data(data_same,
                        normalize, min_max_scale, scale_f1, scale_f2)

        self._y1 = [numpy.zeros(x.shape[0], dtype='int8') for x in self._x1]
        self._y2 = [numpy.zeros(x.shape[0], dtype='int8') for x in self._x1]
        # self._y1 says if frames in x1 and x2 belong to the same (1) word or not (0)
        # self._y2 says if frames in x1 and x2 were said by the same (1) speaker or not(0)
        for ii, yy in enumerate(self._y_word):
            self._y1[ii][:] = yy
        for ii, yy in enumerate(self._y_spkr):
            self._y2[ii][:] = yy
        self._nwords = batch_size
        self._margin = marginf
        # marginf says if we pad taking a number of frames as margin
        self._x1_mem = []
        self._x2_mem = []
        self._y1_mem = []
        self._y2_mem = []
        self.cache_to_disk = cache_to_disk
        if self.cache_to_disk:
            from joblib import Memory
            self.mem = Memory(cachedir='joblib_cache', verbose=0)
Example No. 16
def ica_step(group_maps, group_variance, cachedir=None):
    memory = Memory(cachedir=cachedir, mmap_mode='r')
    # We do a spatial ICA: the arrays are transposed in the following,
    # axis1 = component, and axis2 is voxel number.

    _, ica_maps = memory.cache(fastica)(group_maps.T, whiten=False)

    # Project the ICAs on the group maps to give a 'cross-subject
    # reproducibility' score.
    proj = np.dot(ica_maps, group_maps)
    reproducibility_score = (np.abs(proj)*group_variance).sum(axis=-1)

    order = np.argsort(reproducibility_score)[::-1]

    ica_maps = ica_maps[order, :]

    return ica_maps.T
Example No. 17
  def __init__(self, caching=False):
    """Create a new CompatIdFetcher object.

    Args:
      caching: Whether to cache setup from run to run. See
        PrebuiltCompatibilityTest.CACHING for details.
    """
    self.compat_ids = None
    if caching:
      # This import occurs here rather than at the top of the file because we
      # don't want to force developers to install joblib. The caching argument
      # is only set to True if PrebuiltCompatibilityTest.CACHING is hand-edited
      # (for testing purposes).
      # pylint: disable=import-error
      from joblib import Memory
      memory = Memory(cachedir=tempfile.gettempdir(), verbose=0)
      self.FetchCompatIds = memory.cache(self.FetchCompatIds)
Example No. 18
def load_adni_longitudinal_csf_biomarker():
    """ Returns longitudinal csf measures
    """
    BASE_DIR = _get_data_base_dir('ADNI_csv')
    roster = pd.read_csv(os.path.join(BASE_DIR, 'ROSTER.csv'))
    dx = pd.read_csv(os.path.join(BASE_DIR, 'DXSUM_PDXCONV_ADNIALL.csv'))
    csf_files = ['UPENNBIOMK.csv', 'UPENNBIOMK2.csv', 'UPENNBIOMK3.csv',
                 'UPENNBIOMK4_09_06_12.csv', 'UPENNBIOMK5_10_31_13.csv',
                 'UPENNBIOMK6_07_02_13.csv', 'UPENNBIOMK7.csv',
                 'UPENNBIOMK8.csv']
    cols = ['RID', 'VISCODE', 'ABETA', 'PTAU', 'TAU']
    # 3,4,5,7,8
    csf = pd.DataFrame()
    for csf_file in csf_files[2:]:
        fs = pd.read_csv(os.path.join(BASE_DIR, csf_file))
        csf = csf.append(fs[cols])

    # remove nans from csf values
    biom = csf[cols[2:]].values
    idx = np.array([~np.isnan(v).any() for v in biom])
    biom = biom[idx]
    # get phenotype
    vcodes = csf['VISCODE'].values[idx]
    rids = csf['RID'].values[idx]

    # caching dataframe extraction functions
    CACHE_DIR = _get_cache_base_dir()
    cache_dir = os.path.join(CACHE_DIR, 'joblib', 'load_data_cache')
    if not os.path.isdir(cache_dir):
        os.makedirs(cache_dir)
    memory = Memory(cachedir=cache_dir, verbose=0)

    def _getptidscsf(rids):
        return list(map(lambda x: _rid_to_ptid(x, roster), rids))
    ptids = memory.cache(_getptidscsf)(rids)

    # get diagnosis
    def _getdxcsf(rids, vcodes):
        return list(map(lambda x, y: DX_LIST[_get_dx(x, dx, viscode=y)],
                   rids, vcodes))
    dx_group = memory.cache(_getdxcsf)(rids, vcodes)

    return Bunch(dx_group=np.array(dx_group), subjects=np.array(ptids),
                 csf=np.array(biom), exam_codes=np.array(vcodes),
                 exam_codes2=np.array(vcodes))
Example No. 19
    def __init__(self, meta=None, seed=0, ntrain=15, ntest=15, num_splits=10):

        self.seed = seed
        self.ntrain = ntrain
        self.ntest = ntest
        self.num_splits = num_splits

        if meta is not None:
            self._meta = meta

        self.name = self.__class__.__name__

        try:
            from joblib import Memory
            mem = Memory(cachedir=self.home('cache'))
            self._get_meta = mem.cache(self._get_meta)
        except ImportError:
            pass
Example No. 20
    def __init__(self, *args, **kwargs):

        level = kwargs.pop('level', 10)

        # Initialize the memory object
        self.memory = Memory(*args, **kwargs)
        # The level parameter controls which data we cache
        # smaller numbers mean less caching
        self.level = level
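A hedged guess at how such a level-gated wrapper could be used; only the constructor appears in the snippet above, so the cache method and its level argument below are assumptions for illustration.

from joblib import Memory

class LevelledMemory(object):
    """Hypothetical wrapper: memoize only functions within the caching budget."""

    def __init__(self, *args, **kwargs):
        self.level = kwargs.pop('level', 10)
        self.memory = Memory(*args, **kwargs)

    def cache(self, func, level=10):
        # with a smaller self.level, fewer functions qualify for caching
        if level <= self.level:
            return self.memory.cache(func)
        return func

def expensive_transform(values):
    return [v * v for v in values]

mem = LevelledMemory('./level_cache', verbose=0, level=5)
cached_transform = mem.cache(expensive_transform, level=1)    # memoized
uncached_transform = mem.cache(expensive_transform, level=9)  # left uncached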
Example No. 21
    def __init__(self, root,
                 filter_species_ids=None,
                 required_attributes=None,
                 transform=None,
                 is_training=False,
                 cachedir=CACHE_DIR):
        super(GogglesDataset, self).__init__()

        mem = Memory(cachedir)
        metadata_loader = mem.cache(self._load_metadata)

        self.is_training = is_training
        self._data_dir = root

        required_species, \
            self.attributes, \
            self._image_data = metadata_loader(root)  # _load_metadata(root) cached

        if filter_species_ids is not None:
            assert type(filter_species_ids) is list
            filter_species_ids = set(filter_species_ids)
            required_species = list(filter(lambda s: s.id in filter_species_ids, required_species))
            self._image_data = list(filter(lambda d: d.species.id in filter_species_ids, self._image_data))
        self._species_labels = {species: label for label, species in enumerate(required_species)}

        if is_training is not None:
            self._image_data = list(filter(
                lambda d: d.is_for_training == is_training,
                self._image_data))

        if required_attributes is not None:
            assert type(required_attributes) is list
            self.attributes = required_attributes
        elif filter_species_ids is not None:
            attributes = set()
            for species in required_species:
                attributes = attributes.union(species.attributes)
            self.attributes = list(sorted(attributes, key=lambda a: a.id))
        self.num_attributes = len(self.attributes)

        if transform is not None:
            self._transform = transform
        else:
            self._transform = transforms.Compose([transforms.ToTensor()])
Example No. 22
    def __init__(self, systemConfig, **kwargs):

        if systemConfig.get('cache', False):
            try:
                from tempfile import mkdtemp
                from joblib import Memory
            except ImportError:
                pass
            else:
                if 'cacheDir' in systemConfig:
                    cacheDir = systemConfig['cacheDir']
                    try:
                        os.makedirs(cacheDir)
                    except OSError as e:
                        if e.errno == errno.EEXIST and os.path.isdir(cacheDir):
                            pass
                        else:
                            raise
                else:
                    cacheDir = mkdtemp()

                self._mem = Memory(cachedir=cacheDir, verbose=0)

                # Cache outputs of these methods
                self.forward = self._mem.cache(self.forward)
                self.backprop = self._mem.cache(self.backprop)

        hx = [(systemConfig['dx'], systemConfig['nx']-1)]
        hz = [(systemConfig['dz'], systemConfig['nz']-1)]
        self.mesh = SimPEG.Mesh.TensorMesh([hx, hz], '00')

        self.mesh.ireg = systemConfig.get('ireg', DEFAULT_IREG)
        self.mesh.freeSurf = systemConfig.get('freeSurf', DEFAULT_FREESURF_BOUNDS)

        initMap = {
        #   Argument        Rename to Property
            'c':            'cR',
            'Q':            None,
            'rho':          None,
            'nPML':         None,
            'freeSurf':     None,
            'freq':         None,
            'ky':           None,
            'kyweight':     None,
            'Solver':       None,
            'dx':           None,
            'dz':           None,
            'dtype':        None,
        }

        for key in initMap.keys():
            if key in systemConfig:
                if initMap[key] is None:
                    setattr(self, key, systemConfig[key])
                else:
                    setattr(self, initMap[key], systemConfig[key])
Example No. 23
def load_adni_longitudinal_mmse_score():
    """ Returns longitudinal mmse scores
    """
    BASE_DIR = _get_data_base_dir('ADNI_csv')
    roster = pd.read_csv(os.path.join(BASE_DIR, 'ROSTER.csv'))
    dx = pd.read_csv(os.path.join(BASE_DIR, 'DXSUM_PDXCONV_ADNIALL.csv'))
    fs = pd.read_csv(os.path.join(BASE_DIR, 'MMSE.csv'))

    # extract nans free mmse
    mmse = fs['MMSCORE'].values
    idx_num = fs['MMSCORE'].notnull().values
    mmse = mmse[idx_num]

    # extract roster id
    rids = fs['RID'].values[idx_num]

    # caching dataframe extraction functions
    CACHE_DIR = _get_cache_base_dir()
    cache_dir = os.path.join(CACHE_DIR, 'joblib', 'load_data_cache')
    if not os.path.isdir(cache_dir):
        os.makedirs(cache_dir)
    memory = Memory(cachedir=cache_dir, verbose=0)

    def _getptidsmmse(rids):
        return [_rid_to_ptid(rid, roster) for rid in rids]

    # get subject id
    ptids = memory.cache(_getptidsmmse)(rids)
    # extract visit code (don't use EXAMDATE ; null for GO/2)
    vcodes = fs['VISCODE'].values
    vcodes = vcodes[idx_num]
    vcodes2 = fs['VISCODE2'].values
    vcodes2 = vcodes2[idx_num]

    def _getdxmmse(rids, vcodes2):
        return list(map(
            lambda x, y: DX_LIST[_get_dx(x, dx, viscode=y)], rids, vcodes2))

    # get diagnosis
    dx_group = memory.cache(_getdxmmse)(rids, vcodes2)

    return Bunch(dx_group=np.array(dx_group), subjects=np.array(ptids),
                 mmse=mmse, exam_codes=vcodes, exam_codes2=vcodes2)
Example No. 24
def compute_confidence_par(allLearners, dada):

    lab_confidence = np.zeros([dada.shape[0], len(allLearners)])
    tic = time.time()
    #import ipdb;ipdb.set_trace()
    print('producing weighted outputs IN PARALLEL')
    
    mem = Memory(cachedir='tmp')
    classif_RBF2 = mem.cache(confidence_par)
    
    c = l_c[0]
    r = Parallel(n_jobs=N_JOBS)(delayed(confidence_par)(allLearners,ii,dada) for ii in enumerate(allLearners))
    res, iis = zip(*r)
    
    for t,y in enumerate(iis):
        lab_confidence[:,y] = res[t]
    
    print "time taken to produce confidence:", round(time.time() - tic,2), "seconds"
    #import ipdb;ipdb.set_trace()
    return lab_confidence
Example No. 25
    def test_cached(self):
        try:
            from joblib import Memory
            mem = Memory(self.cache_dir)
            dep_tree = {
                'a': 5,
                'b': 6,
                'c': mem.cache(slow_func),
            }
            data = Pipeline(dep_tree)
            t0 = time.time()
            data.resolve()
            delta = time.time() - t0

            t0 = time.time()
            data.resolve()
            delta = time.time() - t0
            assert delta < .1
        except:
            pass
Example No. 26
    def __init__(self, meta=None, seed=0, ntrain=10, ntest=10, num_splits=5):

        self.seed = seed
        self.ntrain = ntrain
        self.ntest = ntest
        self.num_splits = num_splits
        self.names = ["Face", "Body", "Object"]

        if meta is not None:
            self._meta = meta

        self.name = self.__class__.__name__

        try:
            from joblib import Memory

            mem = Memory(cachedir=self.home("cache"))
            self._get_meta = mem.cache(self._get_meta)
        except ImportError:
            pass
Example No. 27
def getagreement(tpl,datadir,task_type='all'):
    """Get agreement values for annotators in the :data:'tpl' list

    Args:
       tpl (list):  combination group of annotators
       datadir (str): Cache data directory used by joblib

    Returns:
       namedtuple defined as ``Agree = collections.namedtuple('Agree', ['kappa', 'alpha','avg_ao'], verbose=True)``
    """

    mem = Memory(cachedir=datadir)
    readjson=mem.cache(json2taskdata.readjson,mmap_mode='r')
    create_task_data= mem.cache(json2taskdata.create_task_data)
    count_occurrances=mem.cache(json2taskdata.count_occurrances)
    count_labels=mem.cache(json2taskdata.count_labels)

    annotators=set()
    lectask=[]
    #-------------------------------------------------------------------------------
    # for each annotator in group tpl
    #-------------------------------------------------------------------------------

    for stditem in tpl:
        aname=stditem.split('.')[0][3:][-2:]
        annotators.add(aname)
        lecdict=readjson(stditem)
        newlectask= create_task_data(lecdict,task_type=task_type,annotator=aname)
        label_data=json2taskdata.create_labels_list(newlectask)
        abscount=count_occurrances(str(label_data))
        yaml.dump(abscount,open(os.path.join( datadir,'abscount-'+aname+'.yaml'),'w'))

        setcount=count_labels(newlectask)
        yaml.dump(setcount,open(os.path.join( datadir,'setcount-'+aname+'.yaml'),'w'))

        lectask=lectask+newlectask

    task=AnnotationTask(data=lectask,distance=nltk.metrics.distance.masi_distance_mod)

    return  {frozenset(annotators): Agree(task.kappa(),task.alpha(),task.avg_Ao())}
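Memory.cache also accepts the mmap_mode argument used for readjson above; a small sketch with a made-up cache directory shows what it buys: cached numpy results are memory-mapped on reload rather than read fully into RAM.

import numpy as np
from joblib import Memory

memory = Memory('./agreement_cache', verbose=0)

def big_zeros(n):
    return np.zeros((n, n))

big_zeros_cached = memory.cache(big_zeros, mmap_mode='r')
a = big_zeros_cached(2000)  # computed and stored on disk
b = big_zeros_cached(2000)  # reloaded as a read-only numpy memmap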
Example No. 28
        def __init__(self, *args, **kwargs):
            from tempfile import mkdtemp
            from joblib import Memory
            self.cachedir = cachedir or mkdtemp()

            self.memory = Memory(cachedir=self.cachedir)
            for method in self.cached_methods:
                setattr(self, method, self.memory.cache(getattr(self, method)))

            if not os.path.isdir(self.cachedir):
                raise OSError("Non-existent directory: ", self.cachedir)

            super(_DiskCache, self).__init__(*args, **kwargs)
Example No. 29
    def __init__(
        self,
        segmentation=None,
        duration=1., step=0.1, gap=0., threshold=0.,
        feature=None, cache=False
    ):

        super(SpeechTurnSegmentation, self).__init__()

        if segmentation is None:

            self.segmentation = SegmentationGaussianDivergence(
                duration=duration, step=step, gap=gap,
                threshold=threshold
            )

        else:

            self.segmentation = segmentation

        # default features for segmentation
        # are MFCC (energy + 12 coefficients)
        if feature is None:
            from pyannote.feature.yaafe import YaafeMFCC
            feature = YaafeMFCC(
                e=True, De=False, DDe=False,
                coefs=12, D=False, DD=False
            )
        self.feature = feature

        if cache:

            # initialize cache
            from joblib import Memory
            from tempfile import mkdtemp
            memory = Memory(cachedir=mkdtemp(), verbose=0)

            # cache feature extraction method
            self.get_features = memory.cache(self.get_features)
Example No. 30
 def _set_memory(self, cache_dir):
     # Try importing joblib.
     try:
         from joblib import Memory
         self._memory = Memory(cachedir=self.cache_dir,
                               mmap_mode=None,
                               verbose=self.verbose,
                               )
         logger.debug("Initialize joblib cache dir at `%s`.",
                      self.cache_dir)
     except ImportError:  # pragma: no cover
         logger.warn("Joblib is not installed. "
                     "Install it with `conda install joblib`.")
         self._memory = None
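The try/except above is a common optional-dependency pattern; a compact, hypothetical variant (make_loader and parse are illustrative names): wrap the expensive call when joblib is importable, otherwise return it unchanged.

def make_loader(load_func, cache_dir, verbose=0):
    # use joblib caching when available, fall back to the plain function otherwise
    try:
        from joblib import Memory
    except ImportError:
        return load_func
    return Memory(cache_dir, verbose=verbose).cache(load_func)

def parse(path):
    with open(path) as fh:
        return fh.read().splitlines()

parse_cached = make_loader(parse, './loader_cache')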
Example No. 31
import os
import glob
import re
import numpy as np
import pandas as pd
import logging
from joblib import Memory
import hdbscan
from sklearn.preprocessing import StandardScaler

memory = Memory(os.path.join(os.path.dirname(__file__), ".cache"))


def load(paths):
    """
    Load the CSV file at the provided path, and return a pandas DataFrame.
    """
    if not isinstance(paths, list):
        paths = [paths]

    df = pd.DataFrame()
    for path in paths:
        new_df = pd.read_csv(path, delimiter='|')
        df = pd.concat([df, new_df])

    df = df.reset_index()

    return df


def get_number_columns(df):
Example No. 32
import json
from ambiverse_apikey import client_id, client_secret
from ambiverse_token import get_token
from joblib import Memory
import os
import logging
import sys
import time
import datetime
from get_category import query_category
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

cachedir = "./temp"
if not os.path.exists(cachedir):
    os.mkdir(cachedir)
memory = Memory(cachedir=cachedir, verbose=0)


@memory.cache
def ambiverse(item, tool_name):
    text = item["text"]  #.encode('utf-8')
    dpaId = item["dpaId"]
    ambiverse_token = get_token(client_id, client_secret)
    ambiverse_request_url = "https://api.ambiverse.com/v1/entitylinking/analyze"
    text_string = json.dumps({"text": text})
    payload = text_string
    headers = {
        'content-type': "application/json",
        'accept': "application/json",
        'authorization': ambiverse_token
    }
Example No. 33
import autograd.numpy as np
from autograd import grad
import math
from matplotlib import pyplot as plt
from joblib import Memory
from generate_trajectory import generate_trajectory, generate_observations
memory = Memory(cachedir='joblib_cache', verbose=0)
#import seaborn as sns
'''
@changing: to change this for a new experiment, modify the following:
    simulator()
    generate_variational()
    data()
    inputs to iterate()
'''
T = 50
Gamma = 0.1
C = 1
Sigma = 0.005
startState = 1e-3


def iterate(params, sim_variables, u1, u2, u3, m, v):
    '''
    @param params: variational distribution parameters
    @param prior_params: prior parameters
    @param sim_variables: simulator variables
    @param u1: for reparametrizing variational distribution
    @param u2: for reparametrizing simulator
    @param u3: for reparametrizing KL divergence
    @param m, v: for Adam
Example No. 34
# Finding studies with similar activations
# ========================================

######################################################################
# Transform the coordinates into brain maps
# -----------------------------------------
# Here we generate brain maps for all the studies in the NeuroQuery dataset,
# using the activation coordinates. This takes a long time (around 15 min), so we
# cache the result.

corpus_metadata = encoder.corpus_info["metadata"].set_index("pmid")
coordinates = pd.read_csv(datasets.fetch_peak_coordinates())

# We cache the `coordinates_to_maps` function with joblib to avoid recomputing
# this if we train a new model.
coord_to_maps = Memory(str(cache_directory)).cache(coordinates_to_maps)

# You can set target_affine to a different value to increase image resolution
# or reduce computation time. The model on neuroquery.saclay.inria.fr uses 4 mm
# resolution i.e. target_affine=(4, 4, 4)
# You can also adjust the smoothing by setting `fwhm` (Full Width at Half
# Maximum).
brain_maps, masker = coord_to_maps(coordinates,
                                   target_affine=(6, 6, 6),
                                   fwhm=9.0)
brain_maps = brain_maps[(brain_maps.values != 0).any(axis=1)]
brain_maps /= np.sum(brain_maps.values, axis=1, keepdims=True)

######################################################################
# Find studies with activations similar to the input maps
# -------------------------------------------------------
Example No. 35
import mne
import numpy as np
import matplotlib
import pylab as plt
from glob import glob

from conf_analysis.behavior import metadata
from conf_analysis.meg import preprocessing, localizer, lcmv, srplots

from conf_analysis.meg import source_recon as sr
from joblib import Memory
from functools import reduce

memory = Memory(cachedir=metadata.cachedir)


def make_overview_figures(subjects, bem='three_layer', prefix=''):
    from conf_analysis.meg import srplots
    for sub in subjects:
        avg, idx, F = srplots.single_sub_contrast_indices(sub)
        print('Subject:', sub, 'F:', F)
        gamma_overview(sub, F=F, bem=bem, prefix=prefix + 'F%f' % F)
        stats_overview(sub, F=F, prefix=prefix + 'F%f' % F)


def gamma_overview(subject, F=45, bem='three_layer', prefix=''):
    '''
    Prepare data for an overview figure that shows source recon'ed activity.
    '''
    plt.figure(figsize=(15, 15))
    gs = matplotlib.gridspec.GridSpec(2 * 4, 6)
Example No. 36
# coding: utf-8
from xgboost.compat import SKLEARN_INSTALLED, PANDAS_INSTALLED
from xgboost.compat import DASK_INSTALLED
from hypothesis import strategies
from hypothesis.extra.numpy import arrays
from joblib import Memory
from sklearn import datasets
import xgboost as xgb
import numpy as np

try:
    import cupy as cp
except ImportError:
    cp = None

memory = Memory('./cachedir', verbose=0)


def no_sklearn():
    return {
        'condition': not SKLEARN_INSTALLED,
        'reason': 'Scikit-Learn is not installed'
    }


def no_dask():
    return {'condition': not DASK_INSTALLED, 'reason': 'Dask is not installed'}


def no_pandas():
    return {
Example No. 37
import os
import numpy as np
import matplotlib.pyplot as plt
from joblib import Memory

import paths
from ..utils.files import listFilesInDir, ensure_dir_exists
from pamap_common import *  # noqa

memory = Memory('./')
join = os.path.join

# ================================================================
# consts

MISSING_DATA_VALUE = np.nan

OPTIONAL_DIR = join(paths.PAMAP2, 'Optional')
PROTOCOL_DIR = join(paths.PAMAP2, 'Protocol')
FIG_SAVE_DIR = join('figs', 'pamap2')
SAVE_DIR_LINE_GRAPH = join(FIG_SAVE_DIR, 'line')
SAVE_DIR_IMG = join(FIG_SAVE_DIR, 'img')

ACTIVITY_IDS_2_NAMES = {
    0: NAME_OTHER,
    1: NAME_LYING,
    2: NAME_SITTING,
    3: NAME_STANDING,
    4: NAME_WALK,
    5: NAME_RUN,
    6: NAME_CYCLE,
Example No. 38
def test_pipeline_memory_sampler():
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0,
    )
    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir, verbose=10)
        # Test with Sampler + SVC
        clf = SVC(gamma="scale", probability=True, random_state=0)
        transf = DummySampler()
        pipe = Pipeline([("transf", clone(transf)), ("svc", clf)])
        cached_pipe = Pipeline([("transf", transf), ("svc", clf)],
                               memory=memory)

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        expected_ts = cached_pipe.named_steps["transf"].timestamp_
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(
            pipe.named_steps["transf"].means_,
            cached_pipe.named_steps["transf"].means_,
        )
        assert not hasattr(transf, "means_")
        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(
            pipe.named_steps["transf"].means_,
            cached_pipe.named_steps["transf"].means_,
        )
        assert cached_pipe.named_steps["transf"].timestamp_ == expected_ts
        # Create a new pipeline with cloned estimators
        # Check that even changing the step name does not affect the cache hit
        clf_2 = SVC(gamma="scale", probability=True, random_state=0)
        transf_2 = DummySampler()
        cached_pipe_2 = Pipeline([("transf_2", transf_2), ("svc", clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X))
        assert_array_equal(pipe.predict_proba(X),
                           cached_pipe_2.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe_2.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y))
        assert_array_equal(
            pipe.named_steps["transf"].means_,
            cached_pipe_2.named_steps["transf_2"].means_,
        )
        assert cached_pipe_2.named_steps["transf_2"].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)
Example No. 39
def test_pipeline_memory_transformer():
    iris = load_iris()
    X = iris.data
    y = iris.target
    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir, verbose=10)
        # Test with Transformer + SVC
        clf = SVC(gamma="scale", probability=True, random_state=0)
        transf = DummyTransf()
        pipe = Pipeline([("transf", clone(transf)), ("svc", clf)])
        cached_pipe = Pipeline([("transf", transf), ("svc", clf)],
                               memory=memory)

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        expected_ts = cached_pipe.named_steps["transf"].timestamp_
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(
            pipe.named_steps["transf"].means_,
            cached_pipe.named_steps["transf"].means_,
        )
        assert not hasattr(transf, "means_")
        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(
            pipe.named_steps["transf"].means_,
            cached_pipe.named_steps["transf"].means_,
        )
        assert cached_pipe.named_steps["transf"].timestamp_ == expected_ts
        # Create a new pipeline with cloned estimators
        # Check that even changing the step name does not affect the cache hit
        clf_2 = SVC(gamma="scale", probability=True, random_state=0)
        transf_2 = DummyTransf()
        cached_pipe_2 = Pipeline([("transf_2", transf_2), ("svc", clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X))
        assert_array_equal(pipe.predict_proba(X),
                           cached_pipe_2.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe_2.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y))
        assert_array_equal(
            pipe.named_steps["transf"].means_,
            cached_pipe_2.named_steps["transf_2"].means_,
        )
        assert cached_pipe_2.named_steps["transf_2"].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)
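Outside the test suite, the same Pipeline caching is enabled by passing a joblib Memory (or a cache directory string) to the memory parameter; a minimal sketch assuming scikit-learn is installed, with an arbitrary cache directory:

from joblib import Memory
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
pipe = Pipeline(
    [('scale', StandardScaler()), ('svc', SVC(gamma='scale'))],
    memory=Memory('./sklearn_cache', verbose=0),
)
pipe.fit(X, y)  # the StandardScaler fit is computed and cached
pipe.fit(X, y)  # an identical fit reuses the cached transformer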
Example No. 40
from nilearn import datasets
msdl_atlas_dataset = datasets.fetch_atlas_msdl()
rest_dataset = datasets.fetch_development_fmri(n_subjects=n_subjects)

# print basic information on the dataset
print('First subject functional nifti image (4D) is at: %s' %
      rest_dataset.func[0])  # 4D data

##############################################################################
# Extracting region signals
# --------------------------
from nilearn import input_data

# A "memory" to avoid recomputation
from joblib import Memory
mem = Memory('nilearn_cache')

masker = input_data.NiftiMapsMasker(msdl_atlas_dataset.maps,
                                    resampling_target="maps",
                                    detrend=True,
                                    high_variance_confounds=True,
                                    low_pass=None,
                                    high_pass=0.01,
                                    t_r=2,
                                    standardize=True,
                                    memory='nilearn_cache',
                                    memory_level=1,
                                    verbose=2)
masker.fit()

subject_time_series = []
Example No. 41
import logging
import pprint
import os
from tempfile import gettempdir
from tabulate import tabulate
from copy import copy, deepcopy
from urllib.parse import urlparse, parse_qs
from nidm.experiment import Navigate
from nidm.experiment.Utils import validate_uuid

from numpy import std, mean, median
import functools
import operator

from joblib import Memory
memory = Memory(gettempdir(), verbose=0)
USE_JOBLIB_CACHE = False

import simplejson


def convertListtoDict(lst):
    '''
    This function converts a list to a dictionary
    :param lst: list to convert
    :return: dictionary
    '''
    res_dct = {lst[i]: lst[i + 1] for i in range(0, len(lst), 2)}
    return res_dct

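A quick usage note for the convertListtoDict helper defined above, with made-up values: consecutive list items become key/value pairs.

print(convertListtoDict(['kappa', 0.81, 'alpha', 0.79]))  # {'kappa': 0.81, 'alpha': 0.79}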
Example No. 42
def hdbscan(X,
            min_cluster_size=5,
            min_samples=None,
            alpha=1.0,
            cluster_selection_epsilon=0.0,
            metric='minkowski',
            p=2,
            leaf_size=40,
            algorithm='best',
            memory=Memory(cachedir=None, verbose=0),
            approx_min_span_tree=True,
            gen_min_span_tree=False,
            core_dist_n_jobs=4,
            cluster_selection_method='eom',
            allow_single_cluster=False,
            match_reference_implementation=False,
            **kwargs):
    """Perform HDBSCAN clustering from a vector array or distance matrix.

    Parameters
    ----------
    X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
            array of shape (n_samples, n_samples)
        A feature array, or array of distances between samples if
        ``metric='precomputed'``.

    min_cluster_size : int, optional (default=5)
        The minimum number of samples in a group for that group to be
        considered a cluster; groupings smaller than this size will be left
        as noise.

    min_samples : int, optional (default=None)
        The number of samples in a neighborhood for a point
        to be considered as a core point. This includes the point itself.
        Defaults to min_cluster_size.

    cluster_selection_epsilon : float, optional (default=0.0)
        A distance threshold. Clusters below this value will be merged.
        See [3]_ for more information.

    alpha : float, optional (default=1.0)
        A distance scaling parameter as used in robust single linkage.
        See [2]_ for more information.

    metric : string or callable, optional (default='minkowski')
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by metrics.pairwise.pairwise_distances for its
        metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square.

    p : int, optional (default=2)
        p value to use if using the minkowski metric.

    leaf_size : int, optional (default=40)
        Leaf size for trees responsible for fast nearest
        neighbour queries.

    algorithm : string, optional (default='best')
        Exactly which algorithm to use; hdbscan has variants specialised
        for different characteristics of the data. By default this is set
        to ``best`` which chooses the "best" algorithm given the nature of
        the data. You can force other options if you believe you know
        better. Options are:
            * ``best``
            * ``generic``
            * ``prims_kdtree``
            * ``prims_balltree``
            * ``boruvka_kdtree``
            * ``boruvka_balltree``

    memory : instance of joblib.Memory or string, optional
        Used to cache the output of the computation of the tree.
        By default, no caching is done. If a string is given, it is the
        path to the caching directory.

    approx_min_span_tree : bool, optional (default=True)
        Whether to accept an only approximate minimum spanning tree.
        For some algorithms this can provide a significant speedup, but
        the resulting clustering may be of marginally lower quality.
        If you are willing to sacrifice speed for correctness you may want
        to explore this; in general this should be left at the default True.

    gen_min_span_tree : bool, optional (default=False)
        Whether to generate the minimum spanning tree for later analysis.

    core_dist_n_jobs : int, optional (default=4)
        Number of parallel jobs to run in core distance computations (if
        supported by the specific algorithm). For ``core_dist_n_jobs``
        below -1, (n_cpus + 1 + core_dist_n_jobs) are used.

    cluster_selection_method : string, optional (default='eom')
        The method used to select clusters from the condensed tree. The
        standard approach for HDBSCAN* is to use an Excess of Mass algorithm
        to find the most persistent clusters. Alternatively you can instead
        select the clusters at the leaves of the tree -- this provides the
        most fine grained and homogeneous clusters. Options are:
            * ``eom``
            * ``leaf``

    allow_single_cluster : bool, optional (default=False)
        By default HDBSCAN* will not produce a single cluster; setting this
        to True will override this and allow single-cluster results in
        the case that you feel this is a valid result for your dataset.

    match_reference_implementation : bool, optional (default=False)
        There exist some interpretational differences between this
        HDBSCAN* implementation and the original authors reference
        implementation in Java. This can result in very minor differences
        in clustering results. Setting this flag to True will, at a some
        performance cost, ensure that the clustering results match the
        reference implementation.

    **kwargs : optional
        Arguments passed to the distance metric

    Returns
    -------
    labels : ndarray, shape (n_samples, )
        Cluster labels for each point.  Noisy samples are given the label -1.

    probabilities : ndarray, shape (n_samples, )
        Cluster membership strengths for each point. Noisy samples are assigned
        0.

    cluster_persistence : array, shape  (n_clusters, )
        A score of how persistent each cluster is. A score of 1.0 represents
        a perfectly stable cluster that persists over all distance scales,
        while a score of 0.0 represents a perfectly ephemeral cluster. These
        scores can be used to gauge the relative coherence of the clusters
        output by the algorithm.

    condensed_tree : record array
        The condensed cluster hierarchy used to generate clusters.

    single_linkage_tree : ndarray, shape (n_samples - 1, 4)
        The single linkage tree produced during clustering in scipy
        hierarchical clustering format
        (see http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html).

    min_spanning_tree : ndarray, shape (n_samples - 1, 3)
        The minimum spanning as an edgelist. If gen_min_span_tree was False
        this will be None.

    References
    ----------

    .. [1] Campello, R. J., Moulavi, D., & Sander, J. (2013, April).
       Density-based clustering based on hierarchical density estimates.
       In Pacific-Asia Conference on Knowledge Discovery and Data Mining
       (pp. 160-172). Springer Berlin Heidelberg.

    .. [2] Chaudhuri, K., & Dasgupta, S. (2010). Rates of convergence for the
       cluster tree. In Advances in Neural Information Processing Systems
       (pp. 343-351).

    .. [3] Malzer, C., & Baum, M. (2019). A Hybrid Approach To Hierarchical
       Density-based Cluster Selection. arXiv preprint arXiv:1911.02282.
    """
    if min_samples is None:
        min_samples = min_cluster_size

    if type(min_samples) is not int or type(min_cluster_size) is not int:
        raise ValueError('Min samples and min cluster size must be integers!')

    if min_samples <= 0 or min_cluster_size <= 0:
        raise ValueError('Min samples and Min cluster size must be positive'
                         ' integers')

#     if min_cluster_size == 1:
#         raise ValueError('Min cluster size must be greater than one')

    if type(cluster_selection_epsilon) is int:
        cluster_selection_epsilon = float(cluster_selection_epsilon)

    if type(cluster_selection_epsilon
            ) is not float or cluster_selection_epsilon < 0.0:
        raise ValueError(
            'Epsilon must be a float value greater than or equal to 0!')

    if not isinstance(alpha, float) or alpha <= 0.0:
        raise ValueError('Alpha must be a positive float value greater than'
                         ' 0!')

    if leaf_size < 1:
        raise ValueError('Leaf size must be greater than 0!')

    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not'
                             ' defined!')

    if match_reference_implementation:
        min_samples = min_samples - 1
        min_cluster_size = min_cluster_size + 1
        approx_min_span_tree = False

    if cluster_selection_method not in ('eom', 'leaf'):
        raise ValueError('Invalid Cluster Selection Method: %s\n'
                         'Should be one of: "eom", "leaf"\n'
                         % cluster_selection_method)

    # Checks input and converts to an nd-array where possible
    if metric != 'precomputed' or issparse(X):
        X = check_array(X, accept_sparse='csr')
    else:
        # Only non-sparse, precomputed distance matrices are handled here
        #   and thereby allowed to contain numpy.inf for missing distances
        check_precomputed_distance_matrix(X)

    # Python 2 and 3 compliant string_type checking
    if isinstance(memory, six.string_types):
        memory = Memory(cachedir=memory, verbose=0)

    size = X.shape[0]
    min_samples = min(size - 1, min_samples)
    if min_samples == 0:
        min_samples = 1

    if algorithm != 'best':
        if metric != 'precomputed' and issparse(X) and metric != 'generic':
            raise ValueError(
                "Sparse data matrices only support algorithm 'generic'.")

        if algorithm == 'generic':
            (single_linkage_tree,
             result_min_span_tree) = memory.cache(_hdbscan_generic)(
                 X, min_samples, alpha, metric, p, leaf_size,
                 gen_min_span_tree, **kwargs)
        elif algorithm == 'prims_kdtree':
            if metric not in KDTree.valid_metrics:
                raise ValueError("Cannot use Prim's with KDTree for this"
                                 " metric!")
            (single_linkage_tree,
             result_min_span_tree) = memory.cache(_hdbscan_prims_kdtree)(
                 X, min_samples, alpha, metric, p, leaf_size,
                 gen_min_span_tree, **kwargs)
        elif algorithm == 'prims_balltree':
            if metric not in BallTree.valid_metrics:
                raise ValueError("Cannot use Prim's with BallTree for this"
                                 " metric!")
            (single_linkage_tree,
             result_min_span_tree) = memory.cache(_hdbscan_prims_balltree)(
                 X, min_samples, alpha, metric, p, leaf_size,
                 gen_min_span_tree, **kwargs)
        elif algorithm == 'boruvka_kdtree':
            if metric not in BallTree.valid_metrics:
                raise ValueError("Cannot use Boruvka with KDTree for this"
                                 " metric!")
            (single_linkage_tree,
             result_min_span_tree) = memory.cache(_hdbscan_boruvka_kdtree)(
                 X, min_samples, alpha, metric, p, leaf_size,
                 approx_min_span_tree, gen_min_span_tree, core_dist_n_jobs,
                 **kwargs)
        elif algorithm == 'boruvka_balltree':
            if metric not in BallTree.valid_metrics:
                raise ValueError("Cannot use Boruvka with BallTree for this"
                                 " metric!")
            (single_linkage_tree,
             result_min_span_tree) = memory.cache(_hdbscan_boruvka_balltree)(
                 X, min_samples, alpha, metric, p, leaf_size,
                 approx_min_span_tree, gen_min_span_tree, core_dist_n_jobs,
                 **kwargs)
        else:
            raise TypeError('Unknown algorithm type %s specified' % algorithm)
    else:

        if issparse(X) or metric not in FAST_METRICS:
            # We can't do much with sparse matrices ...
            (single_linkage_tree,
             result_min_span_tree) = memory.cache(_hdbscan_generic)(
                 X, min_samples, alpha, metric, p, leaf_size,
                 gen_min_span_tree, **kwargs)
        elif metric in KDTree.valid_metrics:
            # TO DO: Need heuristic to decide when to go to boruvka;
            # still debugging for now
            if X.shape[1] > 60:
                (single_linkage_tree,
                 result_min_span_tree) = memory.cache(_hdbscan_prims_kdtree)(
                     X, min_samples, alpha, metric, p, leaf_size,
                     gen_min_span_tree, **kwargs)
            else:
                (single_linkage_tree,
                 result_min_span_tree) = memory.cache(_hdbscan_boruvka_kdtree)(
                     X, min_samples, alpha, metric, p, leaf_size,
                     approx_min_span_tree, gen_min_span_tree, core_dist_n_jobs,
                     **kwargs)
        else:  # Metric is a valid BallTree metric
            # TO DO: Need heuristic to decide when to go to boruvka;
            # still debugging for now
            if X.shape[1] > 60:
                (single_linkage_tree,
                 result_min_span_tree) = memory.cache(_hdbscan_prims_balltree)(
                     X, min_samples, alpha, metric, p, leaf_size,
                     gen_min_span_tree, **kwargs)
            else:
                (single_linkage_tree, result_min_span_tree
                 ) = memory.cache(_hdbscan_boruvka_balltree)(
                     X, min_samples, alpha, metric, p, leaf_size,
                     approx_min_span_tree, gen_min_span_tree, core_dist_n_jobs,
                     **kwargs)

    return _tree_to_labels(X,
                           single_linkage_tree,
                           min_cluster_size,
                           cluster_selection_method,
                           allow_single_cluster,
                           match_reference_implementation,
                           cluster_selection_epsilon) + \
        (result_min_span_tree,)
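
The dispatch above chooses between Prim's and Boruvka's minimum-spanning-tree backends based on which tree type supports the metric and on the data dimensionality (the X.shape[1] > 60 heuristic), and routes every backend through memory.cache. A minimal, hedged sketch of how that caching hook is wired up when calling HDBSCAN directly; the toy data and the cache directory name are illustrative only:

import numpy as np
import hdbscan
from joblib import Memory

X = np.random.RandomState(0).normal(size=(200, 2))
clusterer = hdbscan.HDBSCAN(algorithm='best', metric='euclidean',
                            memory=Memory('hdbscan_cache', verbose=0))
labels = clusterer.fit_predict(X)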
Exemplo n.º 43
0
import matplotlib.pyplot as plt
import numpy as np
from joblib import Memory
import pandas

from sklearn.utils.testing import ignore_warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition.nmf import NMF
from sklearn.decomposition.nmf import _initialize_nmf
from sklearn.decomposition.nmf import _beta_divergence
from sklearn.decomposition.nmf import _check_init
from sklearn.exceptions import ConvergenceWarning
from sklearn.utils.extmath import safe_sparse_dot, squared_norm
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted, check_non_negative

mem = Memory(cachedir='.', verbose=0)

###################
# Start of _PGNMF #
###################
# This class implements a projected gradient solver for the NMF.
# The projected gradient solver was removed from scikit-learn in version 0.19,
# and a simplified copy is used here for comparison purpose only.
# It is not tested, and it may change or disappear without notice.


def _norm(x):
    """Dot product-based Euclidean norm implementation
    See: http://fseoane.net/blog/2011/computing-the-vector-norm/
    """
    return np.sqrt(squared_norm(x))
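
A quick, hedged sanity check (not part of the original benchmark) that the dot-product-based norm above agrees with NumPy's reference implementation:

x = np.random.RandomState(0).randn(1000)
assert np.isclose(_norm(x), np.linalg.norm(x))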
Exemplo n.º 44
0
def pipeline(name_threshold: int,
             email_threshold: int,
             data_loc: str,
             lower_names: bool = True,
             lower_emails: bool = True,
             use_committer: bool = False,
             cache_loc: str = None,
             use_precalculated_popular: bool = True,
             debug_output=None):
    if cache_loc:
        memory = Memory(cache_loc, verbose=0)
        read_names_emails_gids_cache = memory.cache(read_names_emails_gids)
    else:
        read_names_emails_gids_cache = read_names_emails_gids
    names, emails, github_ids, repositories = read_names_emails_gids_cache(
        data_loc=data_loc, use_committer=use_committer)
    print("Input is ready! Number of samples", len(names))
    # prepare preprocessing function for names and emails
    preproces_emails = get_preprocessing(lower_emails)
    preproces_names = get_preprocessing(lower_names)

    names = list(map(preproces_names, names))
    emails = list(map(preproces_emails, emails))

    if use_precalculated_popular:
        from idmatching.blacklist import POPULAR_NAMES, POPULAR_EMAILS
        popular_names = POPULAR_NAMES
        popular_emails = POPULAR_EMAILS
    else:
        # collect popular names and emails
        popular_names = CooccurrenceFiltering(
            threshold=name_threshold,
            threshold_comp=">=",
            is_ignored_key=is_ignored_name,
            is_ignored_value=is_ignored_email).fit(names, emails).popular_keys

        popular_emails = CooccurrenceFiltering(
            threshold=email_threshold,
            threshold_comp=">=",
            is_ignored_key=is_ignored_email,
            is_ignored_value=is_ignored_name).fit(emails, names).popular_keys
    print("Number of popular names to ignore", len(popular_names))
    print("Number of popular emails to ignore", len(popular_emails))

    # prepare filtering functions
    is_ignored_popular_name = prepare_is_blacklisted_function(
        black_list=popular_names, preprocess_value=preproces_names)
    is_ignored_popular_email = prepare_is_blacklisted_function(
        black_list=popular_emails, preprocess_value=preproces_emails)

    # replace popular names with (name, repository) pair
    for i, (name, repository) in enumerate(zip(names, repositories)):
        if is_ignored_popular_name(name):
            names[i] = "(%s, %s)" % (name, repository)

    popular_names = CooccurrenceFiltering(
        threshold=name_threshold,
        threshold_comp=">=",
        is_ignored_key=is_ignored_name,
        is_ignored_value=is_ignored_email).fit(names, emails).popular_keys

    # print("Number of popular names to ignore after replacement", len(popular_names))
    # is_ignored_popular_name = prepare_is_blacklisted_function(black_list=popular_names,
    #                                                          preprocess_value=preproces_names)

    def is_ignored_popular_name(*args):
        return False

    # identity matching
    raw_persons = []
    for name, email in tqdm(zip(names, emails), total=len(names)):
        raw_persons.append(
            RawPerson(name=preproces_names(name),
                      email=preproces_emails(email)))

    identity2person = identity_matching_pipeline(
        raw_persons=raw_persons,
        is_ignored_name=is_ignored_name,
        is_ignored_email=is_ignored_email,
        is_popular_name=is_ignored_popular_name,
        is_popular_email=is_ignored_popular_email)

    # save result
    identity2person_to_save = sorted(
        "%s||%s\n" %
        ("|".join(sorted(person.names)), "|".join(sorted(person.emails)))
        for person in identity2person.values())
    if debug_output:
        with open(debug_output, "w") as f:
            f.writelines(identity2person_to_save)
    # evaluation
    # predicted
    name_emails2id = {}
    for k, v in tqdm(identity2person.items(), total=len(identity2person)):
        for em in v.emails:
            name_emails2id[em] = k
        for n in v.names:
            name_emails2id[n] = k
    # ground truth
    email_name2gid = {}
    gid2email_name = defaultdict(set)
    for name, email, gid in tqdm(zip(names, emails, github_ids),
                                 total=len(names)):
        name, email = preproces_names(name), preproces_emails(email)
        if not is_ignored_name(name) and not is_ignored_email(email):
            email_name2gid[name] = gid
            email_name2gid[email] = gid
            gid2email_name[gid].add(name)
            gid2email_name[gid].add(email)

    # measure quality per sample
    prec = []
    rec = []
    f1 = []
    cc_size = []

    for person_names_emails in tqdm(gid2email_name.values(),
                                    total=len(gid2email_name)):
        pred_id = set()
        for ent in person_names_emails:
            pred_id.add(name_emails2id[ent])
        for pid in pred_id:
            intersection = 0
            for ent in person_names_emails:
                if (ent in identity2person[pid].emails
                        or ent in identity2person[pid].names):
                    intersection += 1
            rec.append(intersection / len(person_names_emails))
            prec.append(intersection / (len(identity2person[pid].emails) +
                                        len(identity2person[pid].names)))

            if prec[-1] == 0 and rec[-1] == 0:
                f1.append(0)
            else:
                f1.append(2 * prec[-1] * rec[-1] / (prec[-1] + rec[-1]))
            cc_size.append(
                len(identity2person[pid].emails) +
                len(identity2person[pid].names))

    def avr(x):
        return sum(x) / len(x)

    avr_prec, avr_rec, avr_f1 = avr(prec), avr(rec), avr(f1)
    print("Precision %s, recall %s, f1 %s" % (avr_prec, avr_rec, avr_f1))

    def wavr(x, w):
        return sum(x_ * w_ for x_, w_ in zip(x, w)) / sum(w)

    wavr_prec, wavr_rec, wavr_f1 = (wavr(prec, cc_size), wavr(rec, cc_size),
                                    wavr(f1, cc_size))
    print("Precision %s, recall %s, f1 %s" % (wavr_prec, wavr_rec, wavr_f1))

    return avr_prec, avr_rec, avr_f1, wavr_prec, wavr_rec, wavr_f1, identity2person, \
        gid2email_name, raw_persons
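
A quick hedged check of the two averages used in the evaluation above: the plain mean weighs every ground-truth identity equally, while the weighted mean weighs each one by the size of its predicted cluster.

prec = [1.0, 0.5]
cc_size = [3, 1]
print(sum(prec) / len(prec))                                     # 0.75
print(sum(p * w for p, w in zip(prec, cc_size)) / sum(cc_size))  # 0.875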
Exemplo n.º 45
0
import os

from joblib import Memory

from lint_analysis.bin_counts.models import BinCount

cache_dir = os.path.join(os.path.dirname(__file__), 'cache')
memory = Memory(cache_dir)

token_counts = memory.cache(BinCount.token_counts)
token_pos_counts = memory.cache(BinCount.token_pos_counts)
pos_series = memory.cache(BinCount.pos_series)
token_series = memory.cache(BinCount.token_series)
Exemplo n.º 46
0
# DBSCAN - density based algorithm
# Performance: near optimal
f7 = plt.figure(7)
plot_clusters(data, cluster.DBSCAN, (), {'eps': 0.25})

# HDBSCAN - density based algorithm that allows for varying density
# Performance: optimal
# Add desaturation for points with lower probability of belonging
# to a cluster.
f8 = plt.figure(8)
clusterer = hdbscan.HDBSCAN(algorithm='best',
                            alpha=1.0,
                            approx_min_span_tree=True,
                            gen_min_span_tree=True,
                            leaf_size=40,
                            memory=Memory(cachedir=None),
                            metric='euclidean',
                            min_cluster_size=6,
                            min_samples=None,
                            p=None,
                            cluster_selection_method='eom')
start_time = time.time()
clusterer = clusterer.fit(data)
end_time = time.time()
palette = sns.color_palette('deep')
cluster_colors = [
    sns.desaturate(palette[col], sat) if col >= 0 else (0.5, 0.5, 0.5)
    for col, sat in zip(clusterer.labels_, clusterer.probabilities_)
]
plt.scatter(data.T[0], data.T[1], c=cluster_colors, **plot_kwds)
Exemplo n.º 47
0
'''
Created on March 28, 2018

@author: Alejandro Molina
'''
import numpy as np
from joblib import Memory

from spn.algorithms.Inference import likelihood, histogram_likelihood
from spn.algorithms.StructureLearning import learn_structure
from spn.algorithms.splitting.Clustering import get_split_rows_KMeans
from spn.algorithms.splitting.RDC import get_split_cols_RDC
from spn.gpu.TensorFlow import eval_tf
from spn.structure.Base import Context
from spn.structure.leaves.Histograms import create_histogram_leaf, add_domains

memory = Memory(cachedir="cache", verbose=0, compress=9)


@memory.cache
def learn(data, ds_context):
    spn = learn_structure(data, ds_context, get_split_rows_KMeans(),
                          get_split_cols_RDC(), create_histogram_leaf)

    return spn


if __name__ == '__main__':
    data = np.loadtxt("test_data.txt", delimiter=";", dtype=np.int32)

    ds_context = Context(meta_types=["discrete"] * data.shape[1])
    add_domains(data, ds_context)
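    # The snippet stops before invoking the cached learner; a hedged
    # continuation under the same setup would simply call it, with repeat
    # runs served from the joblib "cache" directory configured above.
    spn = learn(data, ds_context)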
Exemplo n.º 48
0
    def __init__(self):
        self.cachedir = os.path.dirname(os.path.realpath(__file__))
        self.memory = Memory(cachedir=self.cachedir, verbose=0)
        self.client = slack.WebClient(token=SLACK_API_BOT_TOKEN)
Exemplo n.º 49
0
def run(dataset, word2vec, epoch, frequency, gpu, out, model, batchsize, lr,
        fix_embedding, resume):
    """
    Train multi-domain user review classification using Blitzer et al.'s dataset
    (https://www.cs.jhu.edu/~mdredze/datasets/sentiment/)

    Please refer to README.md for details.
    """
    memory = Memory(cachedir=out, verbose=1)
    w2v, vocab, train_dataset, dev_dataset, _, label_dict, domain_dict = \
        memory.cache(prepare_blitzer_data)(dataset, word2vec)
    if model == 'rnn':
        model = multidomain_sentiment.models.create_rnn_predictor(
            len(domain_dict),
            w2v.shape[0],
            w2v.shape[1],
            300,
            len(label_dict),
            2,
            300,
            dropout_rnn=0.1,
            initialEmb=w2v,
            dropout_emb=0.1,
            fix_embedding=fix_embedding)
    elif model == 'cnn':
        model = multidomain_sentiment.models.create_cnn_predictor(
            len(domain_dict),
            w2v.shape[0],
            w2v.shape[1],
            300,
            len(label_dict),
            300,
            dropout_fc=0.1,
            initialEmb=w2v,
            dropout_emb=0.1,
            fix_embedding=fix_embedding)
    else:
        assert not "should not get here"

    classifier = multidomain_sentiment.models.MultiDomainClassifier(
        model, domain_dict=domain_dict)

    if gpu >= 0:
        # Make a specified GPU current
        chainer.cuda.get_device_from_id(gpu).use()
        classifier.to_gpu()  # Copy the model to the GPU

    # Setup an optimizer
    optimizer = chainer.optimizers.Adam(alpha=lr)
    optimizer.setup(classifier)

    train_iter = chainer.iterators.SerialIterator(train_dataset, batchsize)

    # Set up a trainer
    updater = training.StandardUpdater(
        train_iter,
        optimizer,
        device=gpu,
        converter=multidomain_sentiment.training.convert)

    if dev_dataset is not None:
        stop_trigger = EarlyStoppingTrigger(monitor='validation/main/loss',
                                            max_trigger=(epoch, 'epoch'))
        trainer = training.Trainer(updater, stop_trigger, out=out)

        logger.info("train: {},  dev: {}".format(len(train_dataset),
                                                 len(dev_dataset)))
        # Evaluate the model with the development dataset for each epoch
        dev_iter = chainer.iterators.SerialIterator(dev_dataset,
                                                    batchsize,
                                                    repeat=False,
                                                    shuffle=False)

        evaluator = extensions.Evaluator(
            dev_iter,
            classifier,
            device=gpu,
            converter=multidomain_sentiment.training.convert)
        trainer.extend(evaluator, trigger=frequency)
        # This works together with EarlyStoppingTrigger to provide more reliable
        # early stopping
        trainer.extend(SaveRestore(),
                       trigger=chainer.training.triggers.MinValueTrigger(
                           'validation/main/loss'))
    else:
        trainer = training.Trainer(updater, (epoch, 'epoch'), out=out)
        logger.info("train: {}".format(len(train_dataset)))
        # SaveRestore will save the snapshot when dev_dataset is available
        trainer.extend(extensions.snapshot(), trigger=frequency)

    logger.info("With labels: %s" % json.dumps(label_dict))
    # Take a snapshot for each specified epoch
    if gpu < 0:
        # ParameterStatistics does not work with GPU as of chainer 2.x
        # https://github.com/chainer/chainer/issues/3027
        trainer.extend(extensions.ParameterStatistics(model,
                                                      trigger=(100,
                                                               'iteration')),
                       priority=99)

    # Write a log of evaluation statistics for each iteration
    trainer.extend(extensions.LogReport(trigger=(1, 'iteration')), priority=98)
    trainer.extend(extensions.PrintReport([
        'epoch', 'main/loss', 'validation/main/loss', 'main/accuracy',
        'validation/main/accuracy'
    ]),
                   trigger=frequency,
                   priority=97)

    if resume:
        # Resume from a snapshot
        chainer.serializers.load_npz(resume, trainer)

    logger.info("Started training")
    trainer.run()

    # Save final model (without trainer)
    chainer.serializers.save_npz(os.path.join(out, 'trained_model'), model)
    with open(os.path.join(out, 'vocab.json'), 'w') as fout:
        json.dump(vocab, fout)
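
A hedged sketch of a direct call to run(); every argument value is illustrative (the dataset and word2vec paths in particular are invented), and in the original project the function is presumably driven by a command-line interface:

run(dataset='sorted_data.pkl', word2vec='GoogleNews-vectors-negative300.bin',
    epoch=10, frequency=(1, 'epoch'), gpu=-1, out='result', model='cnn',
    batchsize=32, lr=0.001, fix_embedding=False, resume=None)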
Exemplo n.º 50
0
def fetch_lfw_pairs(subset='train',
                    data_home=None,
                    funneled=True,
                    resize=0.5,
                    color=False,
                    slice_=(slice(70, 195), slice(78, 172)),
                    download_if_missing=True):
    """Load the Labeled Faces in the Wild (LFW) pairs dataset (classification).

    Download it if necessary.

    =================   =======================
    Classes                                5749
    Samples total                         13233
    Dimensionality                         5828
    Features            real, between 0 and 255
    =================   =======================

    In the official `README.txt`_ this task is described as the
    "Restricted" task. Since it is unclear how to implement the
    "Unrestricted" variant correctly, it is left unsupported for now.

      .. _`README.txt`: http://vis-www.cs.umass.edu/lfw/README.txt

    The original images are 250 x 250 pixels, but the default slice and resize
    arguments reduce them to 62 x 47.

    Read more in the :ref:`User Guide <labeled_faces_in_the_wild_dataset>`.

    Parameters
    ----------
    subset : optional, default: 'train'
        Select the dataset to load: 'train' for the development training
        set, 'test' for the development test set, and '10_folds' for the
        official evaluation set that is meant to be used with a 10-folds
        cross validation.

    data_home : optional, default: None
        Specify another download and cache folder for the datasets. By
        default all scikit-learn data is stored in '~/scikit_learn_data'
        subfolders.

    funneled : boolean, optional, default: True
        Download and use the funneled variant of the dataset.

    resize : float, optional, default 0.5
        Ratio used to resize each face picture.

    color : boolean, optional, default False
        Keep the 3 RGB channels instead of averaging them to a single
        gray level channel. If color is True the shape of the data has
        one more dimension than the shape with color = False.

    slice_ : optional
        Provide a custom 2D slice (height, width) to extract the
        'interesting' part of the jpeg files and avoid using statistical
        correlation from the background

    download_if_missing : optional, True by default
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    Returns
    -------
    The data is returned as a Bunch object with the following attributes:

    data : numpy array of shape (2200, 5828). Shape depends on ``subset``.
        Each row corresponds to 2 ravel'd face images of original size 62 x 47
        pixels. Changing the ``slice_``, ``resize`` or ``subset`` parameters
        will change the shape of the output.

    pairs : numpy array of shape (2200, 2, 62, 47). Shape depends on ``subset``
        Each row has 2 face images corresponding to same or different person
        from the dataset containing 5749 people. Changing the ``slice_``,
        ``resize`` or ``subset`` parameters will change the shape of the
        output.

    target : numpy array of shape (2200,). Shape depends on ``subset``.
        Labels associated to each pair of images. The two label values being
        different persons or the same person.

    DESCR : string
        Description of the Labeled Faces in the Wild (LFW) dataset.

    """
    lfw_home, data_folder_path = _check_fetch_lfw(
        data_home=data_home,
        funneled=funneled,
        download_if_missing=download_if_missing)
    logger.debug('Loading %s LFW pairs from %s', subset, lfw_home)

    # wrap the loader in a memoizing function that will return memmaped data
    # arrays for optimal memory usage
    if LooseVersion(joblib.__version__) < LooseVersion('0.12'):
        # Deal with change of API in joblib
        m = Memory(cachedir=lfw_home, compress=6, verbose=0)
    else:
        m = Memory(location=lfw_home, compress=6, verbose=0)
    load_func = m.cache(_fetch_lfw_pairs)

    # select the right metadata file according to the requested subset
    label_filenames = {
        'train': 'pairsDevTrain.txt',
        'test': 'pairsDevTest.txt',
        '10_folds': 'pairs.txt',
    }
    if subset not in label_filenames:
        raise ValueError("subset='%s' is invalid: should be one of %r" %
                         (subset, list(sorted(label_filenames.keys()))))
    index_file_path = join(lfw_home, label_filenames[subset])

    # load and memoize the pairs as np arrays
    pairs, target, target_names = load_func(index_file_path,
                                            data_folder_path,
                                            resize=resize,
                                            color=color,
                                            slice_=slice_)

    module_path = dirname(__file__)
    with open(join(module_path, 'descr', 'lfw.rst')) as rst_file:
        fdescr = rst_file.read()

    # pack the results as a Bunch instance
    return Bunch(data=pairs.reshape(len(pairs), -1),
                 pairs=pairs,
                 target=target,
                 target_names=target_names,
                 DESCR=fdescr)
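
A hedged usage sketch for the loader above; the first call downloads and caches the data under data_home, and the shapes below assume the default slice_ and resize values quoted in the docstring:

lfw_pairs = fetch_lfw_pairs(subset='train')
print(lfw_pairs.pairs.shape)   # (2200, 2, 62, 47)
print(lfw_pairs.data.shape)    # (2200, 5828)
print(lfw_pairs.target.shape)  # (2200,)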
Exemplo n.º 51
0
def fetch_lfw_people(data_home=None,
                     funneled=True,
                     resize=0.5,
                     min_faces_per_person=0,
                     color=False,
                     slice_=(slice(70, 195), slice(78, 172)),
                     download_if_missing=True,
                     return_X_y=False):
    """Load the Labeled Faces in the Wild (LFW) people dataset \
(classification).

    Download it if necessary.

    =================   =======================
    Classes                                5749
    Samples total                         13233
    Dimensionality                         5828
    Features            real, between 0 and 255
    =================   =======================

    Read more in the :ref:`User Guide <labeled_faces_in_the_wild_dataset>`.

    Parameters
    ----------
    data_home : optional, default: None
        Specify another download and cache folder for the datasets. By default
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.

    funneled : boolean, optional, default: True
        Download and use the funneled variant of the dataset.

    resize : float, optional, default 0.5
        Ratio used to resize each face picture.

    min_faces_per_person : int, optional, default 0
        The extracted dataset will only retain pictures of people that have at
        least `min_faces_per_person` different pictures.

    color : boolean, optional, default False
        Keep the 3 RGB channels instead of averaging them to a single
        gray level channel. If color is True the shape of the data has
        one more dimension than the shape with color = False.

    slice_ : optional
        Provide a custom 2D slice (height, width) to extract the
        'interesting' part of the jpeg files and avoid using statistical
        correlation from the background

    download_if_missing : optional, True by default
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    return_X_y : boolean, default=False.
        If True, returns ``(dataset.data, dataset.target)`` instead of a Bunch
        object. See below for more information about the `dataset.data` and
        `dataset.target` object.

        .. versionadded:: 0.20

    Returns
    -------
    dataset : dict-like object with the following attributes:

    dataset.data : numpy array of shape (13233, 2914)
        Each row corresponds to a ravelled face image of original size 62 x 47
        pixels. Changing the ``slice_`` or resize parameters will change the
        shape of the output.

    dataset.images : numpy array of shape (13233, 62, 47)
        Each row is a face image corresponding to one of the 5749 people in
        the dataset. Changing the ``slice_`` or resize parameters will change
        the shape of the output.

    dataset.target : numpy array of shape (13233,)
        Labels associated to each face image. Those labels range from 0-5748
        and correspond to the person IDs.

    dataset.DESCR : string
        Description of the Labeled Faces in the Wild (LFW) dataset.

    (data, target) : tuple if ``return_X_y`` is True

        .. versionadded:: 0.20

    """
    lfw_home, data_folder_path = _check_fetch_lfw(
        data_home=data_home,
        funneled=funneled,
        download_if_missing=download_if_missing)
    logger.debug('Loading LFW people faces from %s', lfw_home)

    # wrap the loader in a memoizing function that will return memmaped data
    # arrays for optimal memory usage
    if LooseVersion(joblib.__version__) < LooseVersion('0.12'):
        # Deal with change of API in joblib
        m = Memory(cachedir=lfw_home, compress=6, verbose=0)
    else:
        m = Memory(location=lfw_home, compress=6, verbose=0)
    load_func = m.cache(_fetch_lfw_people)

    # load and memoize the pairs as np arrays
    faces, target, target_names = load_func(
        data_folder_path,
        resize=resize,
        min_faces_per_person=min_faces_per_person,
        color=color,
        slice_=slice_)

    X = faces.reshape(len(faces), -1)

    module_path = dirname(__file__)
    with open(join(module_path, 'descr', 'lfw.rst')) as rst_file:
        fdescr = rst_file.read()

    if return_X_y:
        return X, target

    # pack the results as a Bunch instance
    return Bunch(data=X,
                 images=faces,
                 target=target,
                 target_names=target_names,
                 DESCR=fdescr)
Exemplo n.º 52
0
import numpy as np
from joblib import Memory
from sklearn.preprocessing import OneHotEncoder

cache = Memory('cache').cache


@cache
def get_embedding_dim(embedding_path):
    with open(embedding_path, 'rb') as f:
        return len(f.readline().split()) - 1


@cache
def get_embedding_matrix(vocab, embedding_path):
    word2ind = {w: i for i, w in enumerate(vocab)}
    embedding_dim = get_embedding_dim(embedding_path)
    embeddings = np.random.normal(size=(len(vocab), embedding_dim))

    with open(embedding_path, 'rb') as f:
        for line in f:
            parts = line.split()
            word = parts[0]
            if word in word2ind:
                i = word2ind[word]
                vec = np.array([float(x) for x in parts[1:]])
                embeddings[i] = vec
    return embeddings
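
A minimal, hedged usage sketch; the vocabulary and the GloVe-style embedding path are invented for illustration. Because the file is opened in binary mode above, the vocabulary entries are given as bytes so the word lookup can match:

vocab = [b'the', b'cat', b'sat']
embeddings = get_embedding_matrix(vocab, 'glove.6B.50d.txt')  # hypothetical path
print(embeddings.shape)  # (3, 50) for 50-dimensional vectors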
Exemplo n.º 53
0
# pylint: disable=unused-argument, broad-except

from typing import (
    List,
    Optional,
    Union,
)

from pathlib import Path
from joblib import Memory
from polyglot.text import Detector
from logzero import logger

from .seg_text import seg_text

memory = Memory(location=Path("joblib_cache"), verbose=0)


# fmt: off
# @memory.cache(ignore=['debug'])
def _sent_tokenizer(
        text: Union[str, List[str]],
        lang: Optional[str] = None,
        debug: bool = False,  # when True, disable joblib.Memory.cache
) -> List[str]:
    # fmt: on
    """Tokenize str|List[str] to sents."""
    if isinstance(text, str):
        text = [text]

    if lang is None:
Exemplo n.º 54
0
# http://www.njtransit.com/sf/sf_servlet.srv?hdnPageAction=TripPlannerTo
# http://www.njtransit.com/sf/sf_servlet.srv?hdnPageAction=TripPlannerServiceNearTo

import lxml.html
import requests
from joblib import Memory
from geolocate import geocode

from utils import find_nearest_weekday

# http://www.njtransit.com/rg/rg_servlet.srv?hdnPageAction=StationParkRideTo

memory = Memory(cachedir='.cache', verbose=0)


@memory.cache
def plan_trip_inner(source, destination):

    source_geocoded = geocode(source)
    destination_geocoded = geocode(destination)

    departure_time = find_nearest_weekday().replace(
        hour=6, minute=0)  # 6 am nearest weekday

    response = requests.post(
        "http://www.njtransit.com/sf/sf_servlet.srv?hdnPageAction=TripPlannerItineraryFrom",
        data={
            "starting_street_address":
            source,
            "dest_street_address":
            destination,
Exemplo n.º 55
0
                dim=dim,
                noise_corr=noise_corr,
                sep=sep,
                score_error=(np.mean(this_scores) - validation_score),
                score_sem=(np.std(this_scores) / np.sqrt(len(this_scores))),
            ))

    return scores


###############################################################################
# Run the simulations

N_JOBS = -1
N_DRAWS = 1000
mem = Memory(cachedir='cache')

results = pandas.DataFrame(columns=[
    'cv_name', 'validation_score', 'train_size', 'dim', 'noise_corr', 'sep',
    'score_error', 'score_sem'
])

for dim, sep in [
    (300, 5.),
    (10000, 60.),
    (10, .5),
    (1, .13),
]:
    if dim > 1000:
        # Avoid memory problems
        n_jobs = 20
Exemplo n.º 56
0
        def inner():
            with concurrent.futures.ThreadPoolExecutor(
                    max_workers=max_workers) as executor:
                yield from executor.map(f, it)

        return inner()
    else:
        return map(f, it)


def interruptible(fn, *args, **kwargs):
    """Run fn in another thread. This enables to keep processing signals (hence
    KeyboardInterrupt) when fn is a long-running non-python function
    that releases the GIL.
    """
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    future = executor.submit(fn, *args, **kwargs)
    try:
        return future.result()
    except:
        future.cancel()
        executor.shutdown(wait=True)
        raise
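
A hedged usage sketch relying only on the helper above: time.sleep releases the GIL, so Ctrl+C in the main thread still interrupts promptly while the call runs in the worker thread.

import time

interruptible(time.sleep, 0.5)  # returns None after roughly 0.5 s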


cache_dir = os.getenv("STRAPS_CACHE_DIR")
if cache_dir:
    os.makedirs(cache_dir, exist_ok=True)
# If cache_dir is None, Memory acts as a transparent wrapper.
pdt_cache = Memory(cache_dir, verbose=0)
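
A small hedged sketch of the transparent-wrapper behaviour noted in the comment above: with the cache directory unset, Memory(None).cache still works as a decorator, the function simply runs uncached.

@pdt_cache.cache
def slow_square(x):
    return x * x

print(slow_square(4))  # 16; results are persisted only when STRAPS_CACHE_DIR is set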
Exemplo n.º 57
0
import os

import numpy as np
from joblib import Memory

from sklearn.datasets import fetch_openml, get_data_home
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.kernel_approximation import RBFSampler
from sklearn.metrics import zero_one_loss
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import check_array
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# Memoize the data extraction and memory map the resulting
# train / test splits in readonly mode
memory = Memory(os.path.join(get_data_home(), 'mnist_benchmark_data'),
                mmap_mode='r')


@memory.cache
def load_data(dtype=np.float32, order='F'):
    """Load the data, then cache and memmap the train/test split"""
    ######################################################################
    # Load dataset
    print("Loading dataset...")
    data = fetch_openml('mnist_784')
    X = check_array(data['data'], dtype=dtype, order=order)
    y = data["target"]

    # Normalize features
    X = X / 255
Exemplo n.º 58
0
#!/bin/env python

import os
import numpy as np
from sklearn.datasets.samples_generator import make_blobs

from joblib import Memory
_memory = Memory('.', verbose=1)

DATA_DIR = os.path.expanduser('~/Desktop/datasets/nn-search')
join = os.path.join


class Random:
    UNIFORM = 'uniform'
    GAUSS = 'gauss'
    WALK = 'walk'
    BLOBS = 'blobs'


class Gist:
    DIR = join(DATA_DIR, 'gist')
    TRAIN = join(DIR, 'gist_train.npy')  # noqa
    TEST = join(DIR, 'gist.npy')  # noqa
    TEST_100 = join(DIR, 'gist_100k.npy')  # noqa
    TEST_200 = join(DIR, 'gist_200k.npy')  # noqa
    QUERIES = join(DIR, 'gist_queries.npy')  # noqa
    TRUTH = join(DIR, 'gist_truth.npy')  # noqa


class Sift1M:
Exemplo n.º 59
0
from time import time
from typing import Callable, Tuple
from warnings import warn

import cupy as cp
import numpy as np
from cupyx.scipy import linalg
from joblib import Memory
from numpy import linalg as np_linalg

from GeneralEstimator import EstimatorDiscretize
from Operator import Operator
from decorators import timer

location = './cachedir'
memory = Memory(location, verbose=0, bytes_limit=1024 * 1024 * 1024)


@memory.cache
def numpy_svd(A_cpu: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    return np_linalg.svd(A_cpu, full_matrices=True, compute_uv=True, hermitian=True)
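
A hedged sketch of calling the cached SVD above: repeated calls with the same array are served from ./cachedir, and the input is symmetrized because hermitian=True is passed to numpy's SVD.

A = np.arange(9, dtype=float).reshape(3, 3)
A = (A + A.T) / 2        # symmetric input, as required by hermitian=True
U, s, Vh = numpy_svd(A)  # a second call with the same A hits the cache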


class Landweber(EstimatorDiscretize, Operator):
    def __init__(self, kernel: Callable, lower: float, upper: float, grid_size: int,
                 observations: np.ndarray, sample_size: int, adjoint: bool = False, quadrature: str = 'rectangle',
                 **kwargs):
        """
        Instance of the Landweber solver for an inverse problem with Poisson noise and an integral operator.
        :param kernel: Kernel of the integral operator.
        :type kernel: Callable
Exemplo n.º 60
0
import numpy as np
from joblib import Memory

location = './cachedir'
memory = Memory(location, verbose=0)


@memory.cache()
def main(points_interest, T_space, axes):
    ''' Calculates the length and draws the lines for length
    of the butterfly wings.

    Parameters
    ----------
    points_interest: array
        the array containing the four points of interest,
        each of which is a coordinate specifying the start/end
        point of the left/right wing.
    T_space: float
        number of pixels between 2 ticks.
    axes: array
        the array containing the 3 intermediary Axes.

    Returns
    -------
    ax: ax
        an ax object
    dst_pix: tuple
        the tuple contains the distance of the left/right wing
        distance in pixels
    dst_mm: tuple