Example No. 1
def main():

    #match.objects.all().delete()
    s_docs_count = scopus_doc.objects.filter(DO__exists=True,shingle__exists=True).count()
    print(s_docs_count)

    model = Doc2Vec.load("/queries/title_model")

    s_docs_count = 10025

    chunk_size= 10000

    for i in range(s_docs_count//chunk_size+1):
        f = i*chunk_size
        l = (i+1)*chunk_size-1
        if l > s_docs_count:
            l = s_docs_count-1
        s_docs = scopus_doc.objects.filter(DO__exists=True,shingle__exists=True)[f:l]

        matches = []
        sims = []
        pool = Pool(processes=16)
        #matches.append(pool.map(partial(find_match,),s_docs))
        sims.append(pool.map(partial(find_sim,),s_docs))
        pool.terminate()
        #matches = [x for x in matches[0] if x is not None]
        #matches = list(filter(None.__ne__, matches[0]))
        sims = list(filter(None.__ne__, sims[0]))
        #match.objects.insert(matches)
        similarity.objects.insert(sims)
Example No. 2
    def run():
        def init_pool():
            logging.info('Init pool')
            signal.signal(signal.SIGINT, signal.SIG_IGN)

        pool = Pool(args.process, init_pool)
        try:
            task_inputs = gen_inputs()

            if args.process == 1:
                results = itertools.imap(task, task_inputs)
            else:
                results = pool.imap(task, task_inputs, 1)
                #results = imap_wrap(pool, task, task_inputs, 100)

            data_header = ['chrom', 'start', 'end', 'region_id', 'nsamples']
            param_header = [
                'status', 'ia', 'ib', 'ic', 'a', 'b', 'c', 'top_cn',
                'top_theta', 'theta', 'll', 'step'
            ]
            out_header = data_header + param_header
            print(*out_header, sep='\t')
            for data, params in results:
                row = [data[c] for c in data_header] \
                    + [params[c] for c in param_header]
                print(*row, sep='\t')
            pool.close()
        except Exception as e:
            pool.terminate()
            raise e
        finally:
            pool.join()
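
The worker initializer above makes the child processes ignore SIGINT, so Ctrl-C only interrupts the parent, which can then tear the pool down cleanly. A minimal standalone sketch of that pattern (the function names here are illustrative, not taken from the original code):

import signal
from multiprocessing import Pool

def _ignore_sigint():
    # Workers ignore SIGINT; only the parent reacts to Ctrl-C.
    signal.signal(signal.SIGINT, signal.SIG_IGN)

def square(x):
    return x * x

if __name__ == "__main__":
    pool = Pool(4, _ignore_sigint)
    try:
        print(pool.map(square, range(10)))
        pool.close()
    except KeyboardInterrupt:
        pool.terminate()
    finally:
        pool.join()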
Example No. 3
    def run_parallel_fep(self, mutant_params, system_idx, mutant_idx, n_steps,
                         n_iterations, windows):
        logger.debug('Computing FEP for {}...'.format(self.name))
        if not self.opt:
            mutant_systems = mutant_params.build_fep_systems(
                system_idx, mutant_idx, windows)
        else:
            mutant_systems = mutant_params

        nstates = len(mutant_systems)
        chunk = math.ceil(nstates / self.num_gpu)
        groups = grouper(range(nstates), chunk)
        pool = Pool(processes=self.num_gpu)

        system = copy.deepcopy(self.wt_system)
        box_vectors = self.input_pdb.topology.getPeriodicBoxVectors()
        system.setDefaultPeriodicBoxVectors(*box_vectors)
        system.addForce(
            mm.MonteCarloBarostat(1 * unit.atmospheres,
                                  self.temperature * unit.kelvin, 25))  ###

        fep = partial(run_fep,
                      sim=self,
                      system=system,
                      pdb=self.extended_pdb,
                      n_steps=n_steps,
                      n_iterations=n_iterations,
                      all_mutants=mutant_systems)
        u_kln = pool.map(fep, groups)
        pool.close()
        pool.join()
        pool.terminate()
        ddg = FSim.gather_dg(self, u_kln, nstates)

        return ddg
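
This example (and the later energy and dynamics examples) pass the output of a grouper helper to pool.map without showing its definition. A minimal sketch, assuming it simply splits an iterable into fixed-size chunks:

def grouper(iterable, n):
    # Assumed behaviour: grouper(range(5), 2) -> [[0, 1], [2, 3], [4]]
    items = list(iterable)
    return [items[i:i + n] for i in range(0, len(items), n)]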
Example No. 4
def main():
    model = Doc2Vec.load("/usr/local/apsis/queries/title_model")
    do_shingle = True
    print(scopus_doc.objects.filter(TI__isnull=True).count())
    if do_shingle:
        unshingled_s_docs = scopus_doc.objects.filter(shingle__exists=False)
        print(unshingled_s_docs.count())
        for sd in unshingled_s_docs:
            if not hasattr(sd, 'TI'):
                print(sd)
                sd.delete()
            else:
                #try:
                sd.shingle = list(shingle(sd.TI, 2))
                sd.save()
            #except:
            #    pass

    scopus_docs_all = scopus_doc.objects.filter(shingle__exists=True,
                                                DO__exists=True,
                                                doc2vec_checked=False)
    s_docs_i = scopus_docs_all.count()

    #s_docs_i = 10

    chunk_size = 3

    #similarity.objects.all().delete()

    for i in range(s_docs_i // chunk_size + 1):
        print(i)
        #t0 = time.time()
        f = i * chunk_size
        l = (i + 1) * chunk_size
        if l > s_docs_i:
            l = s_docs_i - 1
        s_docs = scopus_docs_all[f:l]

        print(s_docs)

        # initialise an empty list, and append sim items to it in parallel
        sims = []
        pool = Pool(processes=chunk_size)
        sims.append(pool.map(partial(compare, model=model), s_docs))
        pool.terminate()

        #sims = [item for sublist in sims for item in sublist]
        #try:
        #    sims = [item for sublist in sims for item in sublist]
        #except:
        #    pass

        # Flatten and remove nones
        #print(sims)
        sims = flatten(sims)
        sims = list(filter(None.__ne__, sims))

        similarity.objects.insert(sims)

        s_docs.update(doc2vec_checked=True)
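
The shingle and flatten helpers used above are not shown. Hypothetical stand-ins, assuming shingle yields the word w-shingles of a title and flatten collapses the nested list returned by pool.map:

def shingle(text, w=2):
    # Assumed: return the set of word w-grams ("shingles") of a title.
    words = text.lower().split()
    return set(" ".join(words[i:i + w]) for i in range(len(words) - w + 1))

def flatten(nested):
    # Assumed: collapse one level of nesting, e.g. [[a, b], [c]] -> [a, b, c]
    return [item for sublist in nested for item in sublist]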
Example No. 5
def pcall_mp(fun, args, cores=cores):
    """Calls a function for every input in args"""
    mainpool = Pool(cores)  # create pool
    out = mainpool.map(fun, args)  # return list
    mainpool.terminate()
    del mainpool  # delete pool
    return out
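
A short usage sketch for pcall_mp, assuming the definition above sits in the same module (the worker function here is illustrative):

def square(x):
    return x * x

if __name__ == "__main__":
    print(pcall_mp(square, range(8), cores=4))  # -> [0, 1, 4, 9, 16, 25, 36, 49]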
Example No. 6
    def handle(self, *args, **options):
        qid = options['qid']

        q = Query.objects.get(pk=qid)
        docs = Doc.objects.filter(query=q,
                                  wosarticle__cr__isnull=False,
                                  cdo__citation__isnull=True)

        docs = docs

        ndocs = docs.count()

        print(ndocs)

        # Chunk size, so as to prevent overuse of memory
        chunk_size = 1000

        for i in range(ndocs // chunk_size + 1):
            cdos = []
            f = i * chunk_size
            print(f)
            l = (i + 1) * chunk_size
            if l > ndocs:
                l = ndocs - 1
            chunk_docs = docs[f:l]
            pool = Pool(processes=4)
            cdos.append(pool.map(doc_cites, chunk_docs))
            pool.terminate()
            gc.collect()

            django.db.connections.close_all()
            cdos = flatten(cdos)

            CDO.objects.bulk_create(cdos)
Example No. 7
 def msolve(A, Y, init=None):
     if use_cuda:
         Z = np.asarray(np.hstack(list(
             map(lambda y: cg(A, y, x0=None, tol=tol, atol=atol, use_cuda=True), np.split(Y, n_jobs, axis=1)))))
     else:
         if n_jobs <= 1:
             Z = np.asarray(np.hstack([scipy.sparse.linalg.cg(A, Y[:, i],
                                                              x0=init[:, i] if init is not None else None,
                                                              tol=tol, atol=atol)[0][:, np.newaxis] for i in
                                       range(Y.shape[1])]))
         else:
             p = Pool(n_jobs)
             try:
                 if init is None:
                     Z = np.asarray(np.hstack(list(
                         p.map(lambda y: cg(A, y, x0=None, tol=tol, atol=atol, use_cuda=False),
                               np.split(Y, Y.shape[1], axis=1)))))
                 else:
                     Z = np.asarray(np.hstack(list(
                         p.map(lambda y, init_: cg(A, y, x0=init_, tol=tol, atol=atol, use_cuda=False),
                               zip(np.split(Y, Y.shape[1], axis=1), np.split(init, init.shape[1], axis=1))))))
                 p.close()
                 p.join()
             except KeyboardInterrupt:
                 print("Caught KeyboardInterrupt, terminating workers")
                 p.terminate()
                 p.join()
                 raise
     return Z
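
The cg wrapper called above is not shown. A hypothetical stand-in, assuming it solves A z = y with SciPy's conjugate gradient (mirroring the tol/atol arguments the example passes to scipy.sparse.linalg.cg) and returns the solution as a column vector; the use_cuda flag is ignored in this sketch:

import numpy as np
import scipy.sparse.linalg

def cg(A, y, x0=None, tol=1e-5, atol=0.0, use_cuda=False):
    # Assumed behaviour: ignore use_cuda and fall back to SciPy's CG solver.
    z, _ = scipy.sparse.linalg.cg(A, y.ravel(), x0=x0, tol=tol, atol=atol)
    return z[:, np.newaxis]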
Example No. 8
def pcall_mp(fun,args,cores=cores):
    """Calls a function for every input in args"""
    mainpool = Pool(cores) # create pool
#    print("Using",cores,"cores")
    out = mainpool.map(fun,args) # return list
    mainpool.terminate()
    del mainpool # delete pool
    return out
Example No. 9
def pcall_mp(fun, args, cores=cores):
    """Calls a function for every input in args"""
    mainpool = Pool(cores)  # create pool
    #    print("Using",cores,"cores")
    out = mainpool.map(fun, args)  # return list
    mainpool.terminate()  # clear the pool
    del mainpool  # delete pool
    return out
Example No. 10
    def convertpool(self):

        if len(self.todo) > 0:

            if self.type in [".h264", ".mp4", ".avi"]:

                pool = Pool(min(self.pools, len(self.todo)))
                try:
                    pool.map(self.conv_single, self.todo)
                    pool.close()
                    lineprint("Done converting all videofiles!",
                              label="pirecorder")
                except KeyboardInterrupt:
                    lineprint("User terminated converting pool..",
                              label="pirecorder")
                    self.terminated = True
                    pool.terminate()
                    return
                except Exception as e:
                    excep = "Got exception: %r, terminating pool" % (e, )
                    lineprint(excep, label="pirecorder")
                    pool.terminate()
                finally:
                    pool.join()

                if self.delete:
                    for filein in self.todo:
                        os.remove(filein)
                    lineprint("Deleted all original videofiles..",
                              label="pirecorder")

            elif self.type in [".jpg", ".jpeg", ".png"]:

                vidname = commonpref(self.todo)
                lineprint("Start converting " + str(len(self.todo)) +
                          " images",
                          label="pirecorder")

                frame_array = []
                for filename in self.todo:
                    frame = cv2.imread(filename)
                    frame_array.append(frame)
                    #os.rename(filename, self.outdir+"/"+filename)
                h, w, _ = frame_array[0].shape
                if self.outdir != "":
                    vidname = self.outdir + "/" + os.path.basename(vidname)
                vidout = videowriter(vidname, w, h, self.imgfps,
                                     self.resizeval)
                for i in range(len(frame_array)):
                    vidout.write(frame_array[i])
                vidout.release()
                lineprint("Finished converting " + os.path.basename(vidname),
                          label="pirecorder")

            else:
                lineprint("No video or image files found..",
                          label="pirecorder")
Example No. 11
def get_gameplays():

    PlayTypeDict = {}

    PlayTypeStrings = {
        'Pass': ['pass incomplete', 'pass complete', 'sacked'],
        'Admin': ['spiked the ball', 'Timeout', 'Penalty', 'aborted'],
        'Kneel': ['knee', 'knelt'],
        'Punt': ['Punts'],
        'Field Goal': ['field goal', 'no good'],
        'Special Teams':
        ['kicks off', 'kicks onside', 'extra point', 'two point'],
        'Run': [
            'left end', 'right end', ' for ', 'up the middle', 'middle for',
            'left tackle', 'left guard', 'right guard', 'right tackle'
        ],
    }

    YearStart = 1998
    YearsToGo = 20
    for Year in range(YearStart, YearStart + YearsToGo):

        PlayTypeCounts = {
            'Pass': 0,
            'Run': 0,
            'Punt': 0,
            'Field Goal': 0,
            'Admin': 0,
            'Kneel': 0,
            'Special Teams': 0
        }
        for GameNumber in range(1, 17):
            print('Game', GameNumber, 'in', Year, 'Time: ', datetime.now())

            PlayTypeDict = {}
            PathList = []
            for Team in TeamLookup:
                for GameLocation in ['H', 'A']:
                    path = 'https://widgets.sports-reference.com/wg.fcgi?css=1&site=pfr&url=%2Fplay-index%2Fplay_finder.cgi%3Frequest%3D1%26match%3Dall%26year_min%3D{YEAR}%26year_max%3D{YEAR}%26game_type%3DR%26game_num_min%3D{GameNumber}%26game_num_max%3D{GameNumber}%26week_num_min%3D0%26week_num_max%3D99%26game_location%3D{GameLocation}%26minutes_max%3D15%26seconds_max%3D0%26minutes_min%3D0%26seconds_min%3D0%26team_id%3D{TEAM}%26field_pos_min_field%3Dteam%26field_pos_max_field%3Dteam%26end_field_pos_min_field%3Dteam%26end_field_pos_max_field%3Dteam%26type%255B%255D%3DPASS%26type%255B%255D%3DRUSH%26type%255B%255D%3DPUNT%26type%255B%255D%3DKOFF%26type%255B%255D%3DONSD%26type%255B%255D%3DFG%26type%255B%255D%3DXP%26type%255B%255D%3D2PC%26no_play%3DN%26turnover_type%255B%255D%3Dinterception%26turnover_type%255B%255D%3Dfumble%26score_type%255B%255D%3Dtouchdown%26score_type%255B%255D%3Dfield_goal%26score_type%255B%255D%3Dsafety%26order_by%3Dyds_to_go&div=div_all_plays&del_col=1,11,12,13,14'.format(
                        YEAR=Year,
                        GameNumber=GameNumber,
                        TEAM=Team,
                        GameLocation=GameLocation)

                    PathList.append(path)
                    #req = get(path)
            p = Pool(8)  # Pool tells how many at a time
            records = p.map(GetAndParsePath, PathList)
            p.terminate()
            p.join()

            with open(
                    'output/PlayTypeCounts-Year-' + str(Year) + '-Game-' +
                    str(GameNumber) + '.json', 'w') as outfile:
                json.dump(PlayTypeDict, outfile)
Example No. 12
def fmultiprocess(log,
                  function,
                  inputArray,
                  poolSize=False,
                  timeout=3600,
                  **kwargs):
    """multiprocess pool

    **Key Arguments:**
        - ``log`` -- logger
        - ``function`` -- the function to multiprocess
        - ``inputArray`` -- the array to be iterated over
        - ``poolSize`` -- limit the number of CPU that are used in multiprocess job
        - ``timeout`` -- time in sec after which to raise a timeout error if the processes have not completed

    **Return:**
        - ``resultArray`` -- the array of results

    **Usage:**

        .. code-block:: python 

            from fundamentals import multiprocess
            # DEFINE AN INPUT ARRAY
            inputArray = range(10000)
            results = multiprocess(log=log, function=functionName, poolSize=10, timeout=300,
                                  inputArray=inputArray, otherFunctionKeyword="cheese")
    """
    log.debug('starting the ``multiprocess`` function')

    # DEFINE POOL SIZE - NUMBER OF CPU CORES TO USE (BEST = ALL - 1)
    if not poolSize:
        poolSize = psutil.cpu_count()

    if poolSize:
        p = Pool(processes=poolSize)
    else:
        p = Pool()

    # MAP-REDUCE THE WORK OVER MULTIPLE CPU CORES
    if "log" in inspect.getargspec(function)[0]:
        mapfunc = partial(function, log=log, **kwargs)
        resultArray = p.map_async(mapfunc, inputArray)
    else:
        mapfunc = partial(function, **kwargs)
        resultArray = p.map_async(mapfunc, inputArray)

    resultArray = resultArray.get(timeout=timeout)

    p.close()
    p.terminate()

    log.debug('completed the ``multiprocess`` function')
    return resultArray
Example No. 13
def fmultiprocess(log, function, inputArray, poolSize=False, **kwargs):
    """multiprocess pool

    **Key Arguments:**
        - ``log`` -- logger
        - ``function`` -- the function to multiprocess
        - ``inputArray`` -- the array to be iterated over

    **Return:**
        - ``resultArray`` -- the array of results

    **Usage:**

        .. code-block:: python 

            from fundamentals import multiprocess
            # DEFINE AN INPUT ARRAY
            inputArray = range(10000)
            results = multiprocess(log=log, function=functionName,
                                  inputArray=inputArray, otherFunctionKeyword="cheese")
    """
    log.info('starting the ``multiprocess`` function')

    # DEFINE POOL SIZE - NUMBER OF CPU CORES TO USE (BEST = ALL - 1)
    # if cpu_count() > 1:
    #     poolSize = cpu_count() - 1
    # else:
    #     poolSize = 1

    # if len(inputArray) < poolSize:
    #     poolSize = len(inputArray)
    if poolSize:
        p = Pool(processes=poolSize)
    else:
        p = Pool()

    # MAP-REDUCE THE WORK OVER MULTIPLE CPU CORES
    try:
        mapfunc = partial(function, log=log, **kwargs)
        resultArray = p.map(mapfunc, inputArray)
    except:
        try:
            mapfunc = partial(function, **kwargs)
            resultArray = p.map(mapfunc, inputArray)
        except:
            mapfunc = partial(function, log=log, **kwargs)
            resultArray = p.map(mapfunc, inputArray)

    p.close()
    p.terminate()
    p.join()

    log.info('completed the ``multiprocess`` function')
    return resultArray
Example No. 14
 def inner(*args):
     pool = Pool(processes=1)
     res = pool.apply_async(f,args)
     try:
         v = res.get(timeout=sec)
     except Exception as inst:
         print(inst)
         v = None
     finally:
         pool.terminate()
         return v
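
The inner wrapper above reads f and sec from an enclosing scope, so it is presumably the body of a timeout decorator. A plausible (assumed) enclosing definition, using the dill-based multiprocess Pool as several other examples do:

from multiprocess import Pool

def timeout(sec):
    # Assumed enclosing decorator: run f in a one-worker pool, give up after sec seconds.
    def decorator(f):
        def inner(*args):
            pool = Pool(processes=1)
            res = pool.apply_async(f, args)
            try:
                v = res.get(timeout=sec)
            except Exception as inst:
                print(inst)
                v = None
            finally:
                pool.terminate()
                return v
        return inner
    return decorator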
Example No. 15
def loop_fourier(snaps,
                 modes,
                 settings,
                 Rd=1.47492,
                 folder='./',
                 offsets=None,
                 max_amp=False,
                 parttype='stars'):
    """
    Measure the Fourier modes for a range of snapshots
    """

    pool = Pool()
    snapbase = 'snap_'

    settings_array = []
    for i, s in enumerate(snaps):
        settings_array.append(settings.copy())
        settings_array[i]['filename'] = s

    nsnaps = len(snaps)

    if offsets is None:
        use_offset = False
    else:
        use_offset = True
        argd = utils.check_args(snaps, offsets=offsets)
        offsets = argd['offsets']

        for i, offset in enumerate(offsets):
            settings_array[i]['offset'] = offset

    amps = None  # ensure a defined return value if the pool map below fails
    try:
        amps = pool.map(fourier_mode_helper, settings_array, [Rd] * nsnaps,
                        [modes] * nsnaps, [max_amp] * nsnaps,
                        [use_offset] * nsnaps)
    except KeyboardInterrupt:
        print('got ^C while pool mapping, terminating the pool')
        pool.terminate()
        print('pool is terminated')
    except Exception as e:
        print('got exception: %r, terminating the pool' % (e, ))
        pool.terminate()
        print('pool is terminated')

    return amps
Example No. 16
    def apply_function(self, function, *args):
        """
        Map a user supplied function over the snapshots.
        Uses pathos.multiprocessing (https://github.com/uqfoundation/pathos.git).
        """
        pool = Pool()

        try:
            val = pool.map(function, self.snaps)
            return val
        except KeyboardInterrupt:
            print('got ^C while pool mapping, terminating the pool')
            pool.terminate()
            print('pool is terminated')
        except Exception as e:
            print('got exception: %r, terminating the pool' % (e, ))
            pool.terminate()
            print('pool is terminated')
Example No. 17
 def get_mutant_energy(self, parameters, dcd, top, num_frames):
     chunk = math.ceil(len(parameters) / self.num_gpu)
     groups = grouper(range(len(parameters)), chunk)
     pool = Pool(processes=self.num_gpu)
     mutant_eng = partial(mutant_energy,
                          sim=self,
                          dcd=dcd,
                          top=top,
                          num_frames=num_frames,
                          all_mutants=parameters)
     mutants_systems_energies = pool.map(mutant_eng, groups)
     pool.close()
     pool.join()
     pool.terminate()
     mutants_systems_energies = [
         x for y in mutants_systems_energies for x in y
     ]
     return mutants_systems_energies
Example No. 18
    def eval_EFG(self,x,num_procs=None,info=False):

        from multiprocess import Pool,cpu_count

        if not num_procs:
            num_procs = cpu_count()
        num_samples = self.parameters['num_samples']
        pool = Pool(num_procs)
        num = int(np.ceil(float(num_samples)/float(num_procs)))
        results = list(zip(*pool.map(lambda i: self.eval_EFG_sequential(x,num,i,info),range(num_procs),chunksize=1)))
        pool.terminate()
        pool.join()
        if not info:
            assert(len(results) == 4)
        else:
            assert(len(results) == 5)
        assert(all([len(vals) == num_procs for vals in results]))
        return [sum(vals)/float(num_procs) for vals in results]
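
Note that pool.map is given a lambda here; this only works because multiprocess (the dill-based fork of multiprocessing imported above) can serialize lambdas and closures, which the standard-library pickler cannot. A tiny check, assuming multiprocess is installed:

from multiprocess import Pool

if __name__ == "__main__":
    pool = Pool(2)
    # A lambda like this would fail to pickle under the stdlib multiprocessing Pool.
    print(pool.map(lambda i: i ** 2, range(4), chunksize=1))  # -> [0, 1, 4, 9]
    pool.terminate()
    pool.join()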
Example No. 19
class ProcessPoolExecutor(Executor):
    """Process Pool Executor"""
    def __init__(self):
        super(ProcessPoolExecutor, self).__init__()
        import os
        from multiprocess import Pool
        self.pool = Pool(os.cpu_count() or 1)

    def submit(self, func, *args, **kwargs):
        from concurrent.futures import Future
        fut = Future()
        self.tasks[fut] = self.pool.apply_async(func, args, kwargs,
                                                fut.set_result,
                                                fut.set_exception)
        fut.add_done_callback(self.tasks.pop)
        return fut

    def shutdown(self, wait=True):
        super(ProcessPoolExecutor, self).shutdown(wait)
        self.pool.terminate()
        self.pool.join()
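
A short usage sketch, assuming the Executor base class provides the tasks mapping that submit() and shutdown() rely on:

def cube(x):
    return x ** 3

if __name__ == "__main__":
    executor = ProcessPoolExecutor()
    fut = executor.submit(cube, 3)
    print(fut.result())  # -> 27
    executor.shutdown()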
Example No. 20
class ProcessPoolExecutor(Executor):
    """Process Pool Executor"""
    def __init__(self):
        super(ProcessPoolExecutor, self).__init__()
        import os
        from multiprocess import Pool
        self.pool = Pool(os.cpu_count() or 1)

    def submit(self, func, *args, **kwargs):
        from concurrent.futures import Future
        fut = Future()
        self.tasks[fut] = self.pool.apply_async(
            func, args, kwargs, fut.set_result, fut.set_exception
        )
        fut.add_done_callback(self.tasks.pop)
        return fut

    def shutdown(self, wait=True):
        super(ProcessPoolExecutor, self).shutdown(wait)
        self.pool.terminate()
        self.pool.join()
Example No. 21
    def _fi_multi_process(fi_func, arg_list, progressbar=True, n_jobs=None):
        executor_instance = Pool(n_jobs)
        mapper = executor_instance.imap if progressbar else executor_instance.map
        if progressbar:
            warn("Progress bars slow down runs by 10-20%. For slightly \n"
                 "faster runs, do progress_bar=False")
            n_iter = len(arg_list)
            p = ProgressBar(n_iter, units='features')
        try:
            importance_dicts = []
            for importance in mapper(fi_func, arg_list):
                importance_dicts.append(importance)
                if progressbar:
                    p.animate()
        except:
            warn("Multiprocessing failed, going single process")
            importance_dicts = FeatureImportance._fi_single_process(
                fi_func, arg_list, progressbar=progressbar)
        finally:
            executor_instance.close()
            executor_instance.join()
            executor_instance.terminate()

        return importance_dicts
Example No. 22
    def run_parallel_dynamics(self, output_folder, name, n_steps, equi,
                              mutant_parameters):
        system = copy.deepcopy(self.wt_system)
        n_steps = math.ceil(n_steps / self.num_gpu)
        n_steps = n_steps * 2500

        if mutant_parameters is not None:
            non_bonded_force = system.getForce(self.nonbonded_index)
            self.apply_nonbonded_parameters(non_bonded_force,
                                            mutant_parameters[0],
                                            mutant_parameters[1],
                                            mutant_parameters[2],
                                            mutant_parameters[3])

        box_vectors = self.input_pdb.topology.getPeriodicBoxVectors()
        system.setDefaultPeriodicBoxVectors(*box_vectors)
        system.addForce(
            mm.MonteCarloBarostat(1 * unit.atmospheres,
                                  self.temperature * unit.kelvin, 25))  ###

        dcd_names = [
            output_folder + name + '_gpu' + str(i) + '.dcd'
            for i in range(self.num_gpu)
        ]
        groups = grouper(dcd_names, 1)
        pool = Pool(processes=self.num_gpu)
        run = partial(run_dynamics,
                      system=system,
                      sim=self,
                      equi=equi,
                      n_steps=n_steps)
        pool.map(run, groups)
        pool.close()
        pool.join()
        pool.terminate()
        return dcd_names
Example No. 23
    def get_missing_ids(self):
        ## First argument: Make function that inputs a enterez id and spits out a hugo id
        ## Second argument: Make a list of all of the missing
        missing = []
        values = self.df.values
        for i in range(len(values)):  #
            if type(values[i][0]) is type(3.6) or values[i][0] == "nan":
                enterez = values[i][1]
                missing.append({'index': i, 'enterez': enterez})

        p = Pool(self.num_workers)
        hugo_ids = p.map(get_id, missing)
        p.terminate()
        p.join()

        ## Change Hugo Ids to new ids
        number_found = 0
        for i in range(len(hugo_ids)):
            self.df.iloc[missing[i]['index'], 0] = hugo_ids[i]
            if hugo_ids[i] != "nan":
                number_found += 1
        print("Found ", number_found, "/", len(missing),
              "of the missing values")
        return self.df
Example No. 24
    def eval_EQ(self,p,num_procs=None,quiet=True):
        """
        Evaluates E[Q(p,r)] and its gradient in parallel. 

        Parameters
        ----------
        p : generator powers
        num_procs : number of parallel processes
        quiet : flag
        """
       
        from multiprocess import Pool,cpu_count
 
        if not num_procs:
            num_procs = cpu_count()
        num_samples = self.parameters['num_samples']
        pool = Pool(num_procs)
        num = int(np.ceil(float(num_samples)/float(num_procs)))
        results = list(zip(*pool.map(lambda i: self.eval_EQ_sequential(p,num,i,quiet),range(num_procs),chunksize=1)))
        pool.terminate()
        pool.join()
        assert(len(results) == 2)
        assert(all([len(vals) == num_procs for vals in results]))
        return [sum(vals)/float(num_procs) for vals in results]
Example No. 25
def test():
    print('cpuCount() = %d\n' % cpuCount())
    
    #
    # Create pool
    #
    
    PROCESSES = 4
    print('Creating pool with %d processes\n' % PROCESSES)
    pool = Pool(PROCESSES)    

    #
    # Tests
    #

    TASKS = [(mul, (i, 7)) for i in range(10)] + \
            [(plus, (i, 8)) for i in range(10)]

    results = [pool.apply_async(calculate, t) for t in TASKS]
    imap_it = pool.imap(calculatestar, TASKS)
    imap_unordered_it = pool.imap_unordered(calculatestar, TASKS)

    print('Ordered results using pool.apply_async():')
    for r in results:
        print('\t', r.get())
    print()

    print('Ordered results using pool.imap():')
    for x in imap_it:
        print('\t', x)
    print()

    print('Unordered results using pool.imap_unordered():')
    for x in imap_unordered_it:
        print('\t', x)
    print()

    print('Ordered results using pool.map() --- will block till complete:')
    for x in pool.map(calculatestar, TASKS):
        print('\t', x)
    print()

    #
    # Simple benchmarks
    #

    N = 100000
    print('def pow3(x): return x**3')
    
    t = time.time()
    A = list(map(pow3, xrange(N)))
    print('\tmap(pow3, xrange(%d)):\n\t\t%s seconds' % \
          (N, time.time() - t))
    
    t = time.time()
    B = pool.map(pow3, xrange(N))
    print('\tpool.map(pow3, xrange(%d)):\n\t\t%s seconds' % \
          (N, time.time() - t))

    t = time.time()
    C = list(pool.imap(pow3, xrange(N), chunksize=N//8))
    print('\tlist(pool.imap(pow3, xrange(%d), chunksize=%d)):\n\t\t%s' \
          ' seconds' % (N, N//8, time.time() - t))
    
    assert A == B == C, (len(A), len(B), len(C))
    print()
    
    L = [None] * 1000000
    print('def noop(x): pass')
    print('L = [None] * 1000000')
    
    t = time.time()
    A = list(map(noop, L))
    print('\tmap(noop, L):\n\t\t%s seconds' % \
          (time.time() - t))
    
    t = time.time()
    B = pool.map(noop, L)
    print('\tpool.map(noop, L):\n\t\t%s seconds' % \
          (time.time() - t))

    t = time.time()
    C = list(pool.imap(noop, L, chunksize=len(L)//8))
    print('\tlist(pool.imap(noop, L, chunksize=%d)):\n\t\t%s seconds' % \
          (len(L)//8, time.time() - t))

    assert A == B == C, (len(A), len(B), len(C))
    print()

    del A, B, C, L

    #
    # Test error handling
    #

    print('Testing error handling:')

    try:
        print(pool.apply(f, (5,)))
    except ZeroDivisionError:
        print('\tGot ZeroDivisionError as expected from pool.apply()')
    else:
        raise AssertionError('expected ZeroDivisionError')

    try:
        print(pool.map(f, range(10)))
    except ZeroDivisionError:
        print('\tGot ZeroDivisionError as expected from pool.map()')
    else:
        raise AssertionError('expected ZeroDivisionError')
            
    try:
        print(list(pool.imap(f, range(10))))
    except ZeroDivisionError:
        print('\tGot ZeroDivisionError as expected from list(pool.imap())')
    else:
        raise AssertionError('expected ZeroDivisionError')

    it = pool.imap(f, range(10))
    for i in range(10):
        try:
            x = it.next()
        except ZeroDivisionError:
            if i == 5:
                pass
        except StopIteration:
            break
        else:
            if i == 5:
                raise AssertionError('expected ZeroDivisionError')
            
    assert i == 9
    print('\tGot ZeroDivisionError as expected from IMapIterator.next()')
    print()
    
    #
    # Testing timeouts
    #
    
    print('Testing ApplyResult.get() with timeout:', end='')
    res = pool.apply_async(calculate, TASKS[0])
    while 1:
        sys.stdout.flush()
        try:
            sys.stdout.write('\n\t%s' % res.get(0.02))
            break
        except TimeoutError:
            sys.stdout.write('.')
    print()
    print()

    print('Testing IMapIterator.next() with timeout:', end='')
    it = pool.imap(calculatestar, TASKS)
    while 1:
        sys.stdout.flush()
        try:
            sys.stdout.write('\n\t%s' % it.next(0.02))
        except StopIteration:
            break
        except TimeoutError:
            sys.stdout.write('.')
    print()
    print()
            
    #
    # Testing callback
    #

    print('Testing callback:')
    
    A = []
    B = [56, 0, 1, 8, 27, 64, 125, 216, 343, 512, 729]
        
    r = pool.apply_async(mul, (7, 8), callback=A.append)
    r.wait()

    r = pool.map_async(pow3, range(10), callback=A.extend)
    r.wait()

    if A == B:
        print('\tcallbacks succeeded\n')
    else:
        print('\t*** callbacks failed\n\t\t%s != %s\n' % (A, B))
    
    #
    # Check there are no outstanding tasks
    #
    
    assert not pool._cache, 'cache = %r' % pool._cache

    #
    # Check close() methods
    #

    print('Testing close():')

    for worker in pool._pool:
        assert worker.is_alive()

    result = pool.apply_async(time.sleep, [0.5])
    pool.close()
    pool.join()

    assert result.get() is None

    for worker in pool._pool:
        assert not worker.is_alive()

    print('\tclose() succeeded\n')

    #
    # Check terminate() method
    #

    print('Testing terminate():')

    pool = Pool(2)
    ignore = pool.apply(pow3, [2])
    results = [pool.apply_async(time.sleep, [10]) for i in range(10)]
    pool.terminate()
    pool.join()

    for worker in pool._pool:
        assert not worker.is_alive()

    print('\tterminate() succeeded\n')

    #
    # Check garbage collection
    #

    print('Testing garbage collection:')

    pool = Pool(2)
    processes = pool._pool
    
    ignore = pool.apply(pow3, [2])
    results = [pool.apply_async(time.sleep, [10]) for i in range(10)]

    del results, pool

    time.sleep(0.2)
    
    for worker in processes:
        assert not worker.is_alive()

    print('\tgarbage collection succeeded\n')
Example No. 26
    def partial_dependence(self,
                           feature_ids,
                           modelinstance,
                           filter_classes=None,
                           grid=None,
                           grid_resolution=30,
                           n_jobs=-1,
                           grid_range=None,
                           sample=True,
                           sampling_strategy='random-choice',
                           n_samples=1000,
                           bin_count=50,
                           return_metadata=False,
                           progressbar=True,
                           variance_type='estimate'):
        """
        Approximates the partial dependence of the predict_fn with respect to the
        variables passed.

        Parameters:
        -----------
        feature_ids: list
            the names/ids of the features for which partial dependence is to be computed.
            Note that the algorithm's complexity scales exponentially with additional
            features, so generally one should only look at one or two features at a
            time. These feature ids must be available in the class's associated DataSet.
            As of now, we only support looking at 1 or 2 features at a time.
        modelinstance: skater.model.model.Model subtype
            an estimator function of a fitted model used to derive predictions.
            Supports classification (binary, multi-class) and regression.
            predictions = predict_fn(data)

            Can either by a skater.model.remote.DeployedModel or a
            skater.model.local.InMemoryModel
        filter_classes: array type
            The classes to run partial dependence on. Default None invokes all classes.
            Only used in classification models.
        grid: numpy.ndarray
            2 dimensional array on which we fix values of features. Note this is
            determined automatically if not given based on the percentiles of the
            dataset.
        grid_resolution: int
            how many unique values to include in the grid. If the percentile range
            is 5% to 95%, then that range will be cut into <grid_resolution>
            equally size bins. Defaults to 30.
        n_jobs: int
            The number of CPUs to use to compute the PDs. -1 means 'all CPUs'.
            Defaults to using all cores(-1).
        grid_range: tuple
            the percentile extrema to consider. 2 element tuple, increasing, bounded
            between 0 and 1.
        sample: boolean
            Whether to sample from the original dataset.
        sampling_strategy: string
            If sampling, which approach to take. See DataSet.generate_sample for
            details.
        n_samples: int
            The number of samples to use from the original dataset. Note this is
            only active if sample = True and sampling strategy = 'uniform'. If
            using 'uniform-over-similarity-ranks', use samples per bin
        bin_count: int
            The number of bins to use when using the similarity based sampler. Note
            this is only active if sample = True and
            sampling_strategy = 'uniform-over-similarity-ranks'.
            total samples = bin_count * samples per bin.
        samples_per_bin: int
            The number of samples to collect for each bin within the sampler. Note
            this is only active if sample = True and
            sampling_strategy = 'uniform-over-similarity-ranks'. If using
            sampling_strategy = 'uniform', use n_samples.
            total samples = bin_count * samples per bin.
        variance_type: string

        return_metadata: boolean

        :Example:
        >>> from skater.model import InMemoryModel
        >>> from skater.core.explanations import Interpretation
        >>> from sklearn.ensemble import RandomForestClassifier
        >>> from sklearn.datasets import load_boston
        >>> boston = load_boston()
        >>> X = boston.data
        >>> y = boston.target
        >>> features = boston.feature_names

        >>> rf = RandomForestClassifier()
        >>> rf.fit(X,y)


        >>> model = InMemoryModel(rf, examples = X)
        >>> interpreter = Interpretation()
        >>> interpreter.load_data(X)
        >>> feature_ids = ['ZN','CRIM']
        >>> interpreter.partial_dependence.partial_dependence(features,model)
        """

        if self.data_set is None:
            load_data_not_called_err_msg = "self.interpreter.data_set not found. \n" \
                                           "Please call Interpretation.load_data \n" \
                                           "before running this method."
            raise (
                exceptions.DataSetNotLoadedError(load_data_not_called_err_msg))

        feature_ids = self._check_features(feature_ids)

        if filter_classes:
            err_msg = "members of filter classes must be \n" \
                      "members of modelinstance.classes. \n" \
                      "Expected members of: \n" \
                      "{0}\n" \
                      "got: \n" \
                      "{1}".format(modelinstance.target_names,
                                   filter_classes)
            filter_classes = list(filter_classes)
            assert all([
                i in modelinstance.target_names for i in filter_classes
            ]), err_msg

        # TODO: There might be a better place to do this check
        if not isinstance(modelinstance, ModelType):
            raise (exceptions.ModelError(
                "Incorrect estimator function used for computing partial dependence, try one \n"
                "creating one with skater.model.local.InMemoryModel or \n"
                "skater.model.remote.DeployedModel"))

        if modelinstance.model_type == 'classifier' and modelinstance.probability is False:

            if modelinstance.unique_values is None:
                raise (exceptions.ModelError(
                    'If using classifier without probability scores, unique_values cannot \n'
                    'be None'))
            self.interpreter.logger.warn(
                "Classifiers with probability scores can be explained \n"
                "more granularly than those without scores. If a prediction method with \n"
                "scores is available, use that instead.")

        # TODO: This we can change easily to functional style
        missing_feature_ids = []
        for feature_id in feature_ids:
            if feature_id not in self.data_set.feature_ids:
                missing_feature_ids.append(feature_id)

        if missing_feature_ids:
            missing_feature_id_err_msg = "Features {0} not found in \n" \
                                         "Interpretation.data_set.feature_ids \n" \
                                         "{1}".format(missing_feature_ids, self.data_set.feature_ids)
            raise (KeyError(missing_feature_id_err_msg))

        if grid_range is None:
            grid_range = (.05, 0.95)
        else:
            if not hasattr(grid_range, "__iter__"):
                err_msg = "Grid range {} needs to be an iterable".format(
                    grid_range)
                raise (exceptions.MalformedGridRangeError(err_msg))

        self._check_grid_range(grid_range)

        if not modelinstance.has_metadata:
            examples = self.data_set.generate_sample(strategy='random-choice',
                                                     sample=True,
                                                     n_samples=10)

            examples = DataManager(examples,
                                   feature_names=self.data_set.feature_ids)
            modelinstance._build_model_metadata(examples)

        # if you dont pass a grid, build one.
        grid = np.array(grid)
        if not grid.any():
            # Currently, if a given feature has fewer unique values than the value
            # of grid resolution, then the grid will be set to those unique values.
            # Otherwise it will take the percentile
            # range according with grid_resolution bins.
            grid = self.data_set.generate_grid(feature_ids,
                                               grid_resolution=grid_resolution,
                                               grid_range=grid_range)
        else:
            # want to ensure all grids have 2 axes
            if len(grid.shape) == 1 and \
                    (StaticTypes.data_types.is_string(grid[0]) or StaticTypes.data_types.is_numeric(grid[0])):
                grid = grid[:, np.newaxis].T
                grid_resolution = grid.shape[1]

        self.interpreter.logger.debug("Grid shape used for pdp: {}".format(
            grid.shape))
        self.interpreter.logger.debug(
            "Grid resolution for pdp: {}".format(grid_resolution))

        # make sure data_set module is giving us correct data structure
        self._check_grid(grid, feature_ids)

        # generate data
        data_sample = self.data_set.generate_sample(strategy=sampling_strategy,
                                                    sample=sample,
                                                    n_samples=n_samples,
                                                    bin_count=bin_count)

        assert type(data_sample) == self.data_set.data_type, "Something went wrong\n" \
                                                             "Theres a type mismatch between\n" \
                                                             "the sampled data and the origina\nl" \
                                                             "training set. Check Skater.models\n"

        _pdp_metadata = self._build_metadata_dict(modelinstance, feature_ids,
                                                  self.data_set.feature_ids,
                                                  filter_classes,
                                                  variance_type)

        self.interpreter.logger.debug("Shape of sampled data: {}".format(
            data_sample.shape))
        self.interpreter.logger.debug("Feature Ids: {}".format(feature_ids))
        self.interpreter.logger.debug("PD metadata: {}".format(_pdp_metadata))

        # cartesian product of grid
        grid_expanded = pd.DataFrame(list(product(*grid))).values

        if grid_expanded.shape[0] <= 0:
            empty_grid_expanded_err_msg = "Must have at least 1 pdp value" \
                                          "grid shape: {}".format(grid_expanded.shape)
            raise (exceptions.MalformedGridError(empty_grid_expanded_err_msg))

        predict_fn = modelinstance._get_static_predictor()

        n_jobs = None if n_jobs < 0 else n_jobs
        pd_func = functools.partial(_compute_pd,
                                    estimator_fn=predict_fn,
                                    grid_expanded=grid_expanded,
                                    pd_metadata=_pdp_metadata,
                                    input_data=data_sample,
                                    filter_classes=filter_classes)
        arg_list = [i for i in range(grid_expanded.shape[0])]
        executor_instance = Pool(n_jobs)

        if progressbar:
            self.interpreter.logger.warn(
                "Progress bars slow down runs by 10-20%. For slightly \n"
                "faster runs, do progress_bar=False")
            mapper = executor_instance.imap
            p = ProgressBar(len(arg_list), units='grid cells')
        else:
            mapper = executor_instance.map

        pd_list = []
        try:
            if n_jobs == 1:
                raise ValueError("Skipping to single processing")
            for pd_row in mapper(pd_func, arg_list):
                if progressbar:
                    p.animate()
                pd_list.append(pd_row)
        except:
            self.interpreter.logger.warn(
                "Multiprocessing failed, going single process")
            for pd_row in map(pd_func, arg_list):
                if progressbar:
                    p.animate()
                pd_list.append(pd_row)
        finally:
            executor_instance.close()
            executor_instance.join()
            executor_instance.terminate()
        if return_metadata:
            return pd.DataFrame(list(pd_list)), _pdp_metadata
        else:
            return pd.DataFrame(list(pd_list))
Example No. 27
                    "--image_dir",
                    required=True,
                    help="path to the input dir")
    ap.add_argument("-p",
                    "--plot",
                    type=bool,
                    default=False,
                    required=False,
                    help="plot results")
    ap.add_argument("-ps",
                    "--pool_size",
                    type=int,
                    default=1,
                    required=False,
                    help="pool size for multiprocessing")
    args = vars(ap.parse_args())

    images = glob.glob(args["image_dir"] + '*')
    plot = bool(args["plot"])
    pool_size = int(args["pool_size"])
    print(args["image_dir"], plot)

    pool = Pool(pool_size)
    pool_outputs = pool.map(partial(get_boxes, config=config, plot=plot),
                            images[:])
    pool.close()
    pool.join()
    pool.terminate()
    for output in pool_outputs:
        cv2.imwrite(output[2].replace('\\in', '\\out'), output[3])
Example No. 28
def main():
    qid = sys.argv[1]

    very_par = True
    ## Get query
    global q
    q = Query.objects.get(pk=qid)

    i = 0
    n_records = 0
    records = []
    if very_par == True:
        record = []
    else:
        record = {}

    max_chunk_size = 2000
    chunk_size = 0

    print(q.title)

    title = str(q.id)

    with open(settings.QUERY_DIR + title + "/s_results.txt",
              encoding="utf-8") as res:
        for line in res:
            if '\ufeff' in line:  # BOM on first line
                continue
            if 'ER  -' in line:

                # end of record - save it and start a new one
                n_records += 1
                records.append(record)
                if very_par:
                    record = []
                else:
                    record = {}
                chunk_size += 1
                if chunk_size == max_chunk_size:
                    #print("doing chunk starting from record:")
                    #print(records[0])
                    # parallely add docs
                    pool = Pool(processes=32)
                    if very_par:
                        pool.map(add_doc_text, records)
                    else:
                        pool.map(add_doc, records)
                    pool.terminate()
                    records = []
                    chunk_size = 0
                continue
            if re.match("^EF", line):  #end of file
                #done!
                continue
            record.append(line)

    print(chunk_size)

    if chunk_size < max_chunk_size:
        # parallely add docs
        pool = Pool(processes=32)
        if very_par:
            pool.map(add_doc_text, records)
        else:
            pool.map(add_doc, records)
        pool.terminate()

    django.db.connections.close_all()
    q.r_count = len(Doc.objects.filter(query=q))
    q.save()
Example No. 29
def fmultiprocess(
        log,
        function,
        inputArray,
        poolSize=False,
        timeout=3600,
        **kwargs):
    """multiprocess pool

    **Key Arguments:**
        - ``log`` -- logger
        - ``function`` -- the function to multiprocess
        - ``inputArray`` -- the array to be iterated over
        - ``poolSize`` -- limit the number of CPU that are used in multiprocess job
        - ``timeout`` -- time in sec after which to raise a timeout error if the processes have not completed

    **Return:**
        - ``resultArray`` -- the array of results

    **Usage:**

        .. code-block:: python 

            from fundamentals import multiprocess
            # DEFINE AN INPUT ARRAY
            inputArray = range(10000)
            results = multiprocess(log=log, function=functionName, poolSize=10, timeout=300,
                                  inputArray=inputArray, otherFunctionKeyword="cheese")
    """
    log.debug('starting the ``multiprocess`` function')

    # DEFINE POOL SIZE - NUMBER OF CPU CORES TO USE (BEST = ALL - 1)
    if not poolSize:
        poolSize = psutil.cpu_count()

    if poolSize:
        p = Pool(processes=poolSize)
    else:
        p = Pool()

    cpuCount = psutil.cpu_count()
    chunksize = int((len(inputArray) + 1) / (cpuCount * 3))

    if chunksize == 0:
        chunksize = 1

    # MAP-REDUCE THE WORK OVER MULTIPLE CPU CORES
    if "log" in inspect.getargspec(function)[0]:
        mapfunc = partial(function, log=log, **kwargs)
        resultArray = p.map_async(mapfunc, inputArray, chunksize=chunksize)
    else:
        mapfunc = partial(function, **kwargs)
        resultArray = p.map_async(mapfunc, inputArray, chunksize=chunksize)

    resultArray = resultArray.get(timeout=timeout)

    p.close()
    p.terminate()

    log.debug('completed the ``multiprocess`` function')
    return resultArray
Example No. 30
def read_wos(res, q, update, deduplicate=False):
    from django.db import connection
    connection.close()
    if deduplicate:
        print("nonstandard WoS, searching for duplicates")
    i = 0
    n_records = 0
    records = []
    record = {}
    mfields = ['AU', 'AF', 'CR', 'C1', 'WC']

    max_chunk_size = 2000
    chunk_size = 0

    q.doc_set.clear()

    p = 4

    for line in res:
        if '\ufeff' in line:  # BOM on first line
            continue
        if line == 'ER\n':
            # end of record - save it and start a new one
            n_records += 1
            records.append(record)
            record = {}
            chunk_size += 1
            if chunk_size == max_chunk_size:
                # parallely add docs
                if deduplicate:
                    print("adding as if scopus")  #
                    pool = Pool(processes=1)
                    pool.map(partial(add_scopus_doc, q=q, update=update),
                             records)
                else:
                    pool = Pool(processes=p)
                    pool.map(partial(add_doc, q=q, update=update), records)
                pool.terminate()
                records = []
                chunk_size = 0
            continue
        if re.match("^EF", line):  #end of file
            if chunk_size < max_chunk_size:
                # parallely add doc
                #pool.map(update_doc, records)
                if deduplicate:
                    print("adding as if scopus")
                    from django.db import connection
                    connection.close()
                    pool = Pool(processes=1)
                    pool.map(partial(add_scopus_doc, q=q, update=update),
                             records)
                else:
                    pool = Pool(processes=p)
                    pool.map(partial(add_doc, q=q, update=update), records)
                pool.terminate()
            #done!
            break
        if re.match("(^[A-Z][A-Z1-9]) (.*)", line):
            s = re.search("(^[A-Z][A-Z1-9]) (.*)", line)
            key = s.group(1).strip()
            value = s.group(2).strip()
            if key in mfields:
                record[key] = [value]
            else:
                record[key] = value
        elif len(line) > 1:
            if key in mfields:
                record[key].append(line.strip())
            else:
                try:
                    record[key] += " " + line.strip()
                except:
                    print(line)
                    print(record)

    return n_records
Example No. 31
def main():
    qid = sys.argv[1]

    ## Get query
    global q
    q = Query.objects.get(pk=qid)

    #Doc.objects.filter(query=qid).delete() # doesn't seem like a good idea

    i = 0
    n_records = 0
    records = []
    record = {}
    mfields = ['AU', 'AF', 'CR', 'C1', 'WC']

    max_chunk_size = 2000
    chunk_size = 0

    print(q.title)

    title = str(q.id)

    with open(settings.QUERY_DIR + title + "/results.txt",
              encoding="utf-8") as res:
        for line in res:
            if '\ufeff' in line:  # BOM on first line
                continue
            if line == 'ER\n':
                # end of record - save it and start a new one
                n_records += 1
                records.append(record)
                record = {}
                chunk_size += 1
                if chunk_size == max_chunk_size:
                    # parallely add docs
                    pool = Pool(processes=50)
                    pool.map(update_doc, records)
                    #pool.map(partial(add_doc, q=q),records)
                    pool.terminate()

                    records = []
                    chunk_size = 0
                continue
            if re.match("^EF", line):  #end of file
                if chunk_size < max_chunk_size:
                    # parallely add docs
                    pool = Pool(processes=50)
                    pool.map(update_doc, records)
                    #pool.map(partial(add_doc, q=q),records)
                    pool.terminate()
                #done!
                break
            if re.match("(^[A-Z][A-Z1-9])", line):
                s = re.search("(^[A-Z][A-Z1-9]) (.*)", line)
                key = s.group(1).strip()
                value = s.group(2).strip()
                if key in mfields:
                    record[key] = [value]
                else:
                    record[key] = value
            elif len(line) > 1:
                if key in mfields:
                    record[key].append(line.strip())
                else:
                    record[key] += " " + line.strip()

    django.db.connections.close_all()
    q.r_count = n_records
    q.save()
Example No. 32
    def recursive_function(self, remotepath, *args, **kwargs):
        """The recursive wrapper around the original function."""
        recursive = kwargs.pop('recursive', False)
        recursive_se = kwargs.pop('recursivese', None)
        if recursive_se is not None and recursive == False:
            recursive = True
        list_file = kwargs.pop('list', None)
        parallel = kwargs.pop('parallel', 1)
        if 'verbose' in kwargs:
            verbose = kwargs['verbose']
        else:
            verbose = False

        if isinstance(recursive, str):
            regex = re.compile(recursive)
            recursive = True
        else:
            regex = None

        if list_file is not None:
            list_file = open(list_file, 'wt')

        good = 0
        bad = 0

        def is_good(path):
            if verbose:
                print_(self.iterating + " " + path)
            try:
                ret = self.function(path, *args, **kwargs)
            except Exception as e:
                print_(self.iterating + " " + path + " failed.")
                print_(e)
                return False, path
            else:
                if ret == 0:
                    return True, path
                else:
                    return False, path

        if recursive is True:
            if parallel > 1:
                # Deal with signal weirdness when using a Pool
                # Otherwise we won't be able to kill things with CTRL-C
                def abort(*args, **kwargs):
                    print_("%d Aborting!" % (os.getpid(), ))
                    raise Exception("%d Aborting!" % (os.getpid(), ))

                orig_sigint = signal.getsignal(signal.SIGINT)
                orig_sigterm = signal.getsignal(signal.SIGTERM)
                signal.signal(signal.SIGINT, abort)
                signal.signal(signal.SIGTERM, abort)
                p = Pool(parallel)
                mapper = p.imap
            else:
                mapper = map

            try:
                for g, path in mapper(
                        is_good,
                        utils.remote_iter_recursively(remotepath,
                                                      regex,
                                                      se=recursive_se,
                                                      ignore_exceptions=True)):
                    if g:
                        good += 1
                    else:
                        bad += 1
                        if list_file is not None:
                            list_file.write(path + '\n')
            except Exception:
                if parallel > 1:
                    # Kill all child processes
                    for proc in p._pool:
                        os.kill(proc.pid, signal.SIGTERM)
                raise
            finally:
                if parallel > 1:
                    # Clean up processing pool
                    p.terminate()
                    p.join()
                    del p
                    # Resetting signal handlers
                    signal.signal(signal.SIGINT, orig_sigint)
                    signal.signal(signal.SIGTERM, orig_sigterm)

            if verbose:
                print_("%s %d files. %d files failed." %
                       (self.iterated, good, bad))
            if list_file is not None:
                list_file.close()
            if bad == 0:
                return 0
            else:
                return 1
        else:
            g, path = is_good(remotepath)
            if list_file is not None:
                if not g:
                    list_file.write(remotepath + '\n')
                list_file.close()
            if g:
                return 0
            else:
                return 1
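
# A minimal sketch of an alternative to the signal juggling above: workers
# ignore SIGINT via a pool initializer, so only the parent reacts to CTRL-C
# and can terminate the pool cleanly. slow_task is a hypothetical stand-in
# for the wrapped function; this is not the wrapper's actual behaviour.
import signal
import time
from multiprocessing import Pool


def init_worker():
    # Workers ignore SIGINT; the parent keeps the default handler.
    signal.signal(signal.SIGINT, signal.SIG_IGN)


def slow_task(n):
    time.sleep(0.1)
    return n * n


if __name__ == '__main__':
    pool = Pool(processes=4, initializer=init_worker)
    try:
        # map_async + get(timeout) keeps the parent responsive to CTRL-C.
        results = pool.map_async(slow_task, range(100)).get(timeout=60)
        print(sum(results))
    except KeyboardInterrupt:
        pool.terminate()
    else:
        pool.close()
    finally:
        pool.join()
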
Exemplo n.º 33
0
def main():

    print("starting topic model - about to read WoS results")
    #df = scrapeWoS.readWoS('../query/results.txt')
    df = scrapeWoS.readWoS('results.txt')
    # filter articles with missing abstracts, and only include articles and reviews
    df = df[(pd.notnull(df.AB)) & (pd.notnull(df.UT)) &
            (df.DT.isin(["Article", "Review"]))].reset_index()

    # Randomly reorder (or take a sample) the docs
    df = df.sample(frac=0.1).reset_index(drop=True)

    print("found " + str(len(df)) + " articles")
    sys.stdout.flush()

    df.IN = df.index.tolist()

    # The number of documents to analyze each iteration
    batchsize = 100
    # The total number of documents in the corpus
    D = len(df)
    documentstoanalyze = int(D)
    iterations = D // batchsize
    # The number of topics
    try:
        K = int(sys.argv[1])
        print(K)
    except (IndexError, ValueError):
        K = 100

    docset = list(df.AB)

    #	####### making a corpus
    stoplist = set(nltk.corpus.stopwords.words("english"))
    stoplist.add('Elsevier')

    mycorpus = MyCorpus(docset, stoplist)

    dictionary = mycorpus.dictionary  #.filter_extremes(no_below=5, no_above=0.9)

    dictionary.filter_extremes(no_below=5, no_above=0.9)

    x = list(dictionary.values())
    y = list(dictionary)

    vocab = sorted(zip(y, x))

    W = len(vocab)

    print("found " + str(W) + " terms")

    sys.stdout.flush()

    run_id = db.init()

    # add terms to db

    db.add_terms(vocab)

    # add empty topics to db
    db.add_topics(K)

    #	for k in range(K):
    #		db.add_topic(k)

    # add all docs
    def f_doc(d):
        django.db.connections.close_all()
        db.add_doc(d[0], d[1], d[2], d[3], d[4], d[5])
        django.db.connections.close_all()

    def f_gamma(d):
        django.db.connections.close_all()
        doc_size = len(docset[d])
        doc_id = iteration * 100 + d + doc_diff
        for k in range(len(gamma[d])):
            db.add_doc_topic(doc_id, k, gamma[d][k], gamma[d][k] / doc_size)
        django.db.connections.close_all()

    def f_lambda(topic_no):
        django.db.connections.close_all()
        lambda_sum = sum(ldalambda[topic_no])
        db.clear_topic_terms(topic_no)
        for term_no in range(len(ldalambda[topic_no])):
            db.add_topic_term(topic_no, term_no,
                              ldalambda[topic_no][term_no] / lambda_sum)
        django.db.connections.close_all()

    # Take the information we need from the doc list then delete to free up memory
    docs = zip(df.TI, df.AB, df.IN, df.UT, df.PY, df.AU)
    all_docs = list(df.AB)

    # Add the documents to the database
    print("Adding docs to database")
    pool = Pool(processes=8)
    pool.map(f_doc, docs)
    pool.terminate()

    global doc_diff
    doc_diff = db.docdiff(df.IN[-1])
    del (df)

    del (docs)
    gc.collect()

    #	docs = zip(df.TI,df.AB,df.IN,df.UT,df.PY,df.AU)

    #	pool = Pool(processes=8)
    #	pool.map(f_auth,docs)
    #	pool.terminate()

    print("All docs added, initialising topic model")
    olda = gensim.models.LdaMulticore(num_topics=K,
                                      id2word=dictionary,
                                      workers=8)

    # Run until we've seen D documents. (Feel free to interrupt *much*
    # sooner than this.)
    for iteration in range(0, iterations):
        t0 = time.time()
        firstDoc = batchsize * iteration
        lastDoc = batchsize * (iteration + 1)
        if lastDoc > D:
            lastDoc = D

        docset = all_docs[firstDoc:lastDoc]

        mycorpus = MiniCorpus(docset, stoplist, dictionary)

        olda_t0 = time.time()
        olda.update(mycorpus)
        elapsed = time.time() - t0

        print("updated with docs " + str(firstDoc) + " to " + str(lastDoc) +
              ": took " + str(elapsed))

        # Get the gamma (doc-topic matrix) and add to database
        gamma = olda.inference(mycorpus)

        gamma = gamma[0]

        docs = range(len(gamma))

        pool = Pool(processes=8)
        pool.map(f_gamma, docs)
        pool.terminate()

        # Every 20th iteration, get the lambda (topic-term matrix) and add to database
        if (iteration % 20 == 0):

            ldalambda = olda.inference(mycorpus, collect_sstats=True)[1]
            numpy.savetxt('lambda-%d.dat' % iteration, ldalambda)
            numpy.savetxt('gamma-%d.dat' % iteration, gamma)

            topics = range(len(ldalambda))

            pool = Pool(processes=8)
            pool.map(f_lambda, topics)
            pool.terminate()

        gc.collect()
        sys.stdout.flush()
        elapsed = time.time() - t0
        print(elapsed)
        db.increment_batch_count()

    olda.save("olda" + str(run_id))
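
# A compact illustration of the update/inference loop above, assuming gensim
# is installed: LdaMulticore is fed a toy corpus in two mini-batches and the
# document-topic matrix (gamma) is pulled with inference(). The toy texts are
# hypothetical; only the gensim calls mirror the ones used in this example.
import gensim
from gensim.corpora import Dictionary

texts = [["topic", "model", "lda"], ["online", "update", "lda"],
         ["gamma", "matrix", "topic"], ["batch", "update", "model"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]

if __name__ == '__main__':
    lda = gensim.models.LdaMulticore(num_topics=2, id2word=dictionary, workers=2)
    # Feed the corpus in mini-batches, as the main loop above does.
    for batch in (corpus[:2], corpus[2:]):
        lda.update(batch)
        gamma, _ = lda.inference(batch)  # gamma: docs x topics
        print(gamma.shape)
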
Exemplo n.º 34
0
    def partial_dependence(self, feature_ids, modelinstance, filter_classes=None, grid=None,
                           grid_resolution=30, n_jobs=-1, grid_range=None, sample=True,
                           sampling_strategy='random-choice', n_samples=1000,
                           bin_count=50, return_metadata=False,
                           progressbar=True, variance_type='estimate'):

        """
        Approximates the partial dependence of the predict_fn with respect to the
        variables passed.

        Parameters:
        -----------
        feature_ids: list
            the names/ids of the features for which partial dependence is to be computed.
            Note that the algorithm's complexity scales exponentially with additional
            features, so generally one should only look at one or two features at a
            time. These feature ids must be available in the class's associated DataSet.
            As of now, we only support looking at 1 or 2 features at a time.
        modelinstance: skater.model.model.Model subtype
            an estimator function of a fitted model used to derive predictions. Supports
            classification (binary and multi-class) and regression.
            predictions = predict_fn(data)

            Can either be a skater.model.remote.DeployedModel or a
            skater.model.local.InMemoryModel
        filter_classes: array type
            The classes to run partial dependence on. Default None invokes all classes.
            Only used in classification models.
        grid: numpy.ndarray
            2 dimensional array on which we fix values of features. Note this is
            determined automatically if not given based on the percentiles of the
            dataset.
        grid_resolution: int
            how many unique values to include in the grid. If the percentile range
            is 5% to 95%, then that range will be cut into <grid_resolution>
            equally sized bins. Defaults to 30.
        n_jobs: int
            The number of CPUs to use to compute the PDs. -1 means 'all CPUs'.
            Defaults to using all cores(-1).
        grid_range: tuple
            the percentile extrema to consider: a 2-element tuple, increasing, bounded
            between 0 and 1.
        sample: boolean
            Whether to sample from the original dataset.
        sampling_strategy: string
            If sampling, which approach to take. See DataSet.generate_sample for
            details.
        n_samples: int
            The number of samples to use from the original dataset. Note this is
            only active if sample = True and sampling strategy = 'uniform'. If
            using 'uniform-over-similarity-ranks', use samples per bin
        bin_count: int
            The number of bins to use when using the similarity based sampler. Note
            this is only active if sample = True and
            sampling_strategy = 'uniform-over-similarity-ranks'.
            total samples = bin_count * samples per bin.
        samples_per_bin: int
            The number of samples to collect for each bin within the sampler. Note
            this is only active if sample = True and
            sampling_strategy = 'uniform-over-similarity-ranks'. If using
            sampling_strategy = 'uniform', use n_samples.
            total samples = bin_count * samples per bin.
        variance_type: string

        return_metadata: boolean
            whether to also return the partial dependence metadata dict.

        :Examples:
        >>> from skater.model import InMemoryModel
        >>> from skater.core.explanations import Interpretation
        >>> from sklearn.ensemble import RandomForestClassifier
        >>> from sklearn.datasets import load_boston
        >>> boston = load_boston()
        >>> X = boston.data
        >>> y = boston.target
        >>> features = boston.feature_names

        >>> rf = RandomForestClassifier()
        >>> rf.fit(X,y)


        >>> model = InMemoryModel(rf.predict_proba, examples = X)
        >>> interpreter = Interpretation()
        >>> interpreter.load_data(X)
        >>> feature_ids = ['ZN','CRIM']
        >>> interpreter.partial_dependence.partial_dependence(feature_ids, model)
        """

        if self.data_set is None:
            load_data_not_called_err_msg = "self.interpreter.data_set not found. \n" \
                                           "Please call Interpretation.load_data \n" \
                                           "before running this method."
            raise(exceptions.DataSetNotLoadedError(load_data_not_called_err_msg))

        feature_ids = self._check_features(feature_ids)

        if filter_classes:
            err_msg = "members of filter classes must be \n" \
                      "members of modelinstance.classes. \n" \
                      "Expected members of: \n" \
                      "{0}\n" \
                      "got: \n" \
                      "{1}".format(modelinstance.target_names,
                                   filter_classes)
            filter_classes = list(filter_classes)
            assert all([i in modelinstance.target_names for i in filter_classes]), err_msg

        # TODO: There might be a better place to do this check
        if not isinstance(modelinstance, ModelType):
            raise(exceptions.ModelError("Incorrect estimator function used for computing partial dependence, try \n"
                                        "creating one with skater.model.local.InMemoryModel or \n"
                                        "skater.model.remote.DeployedModel"))

        if modelinstance.model_type == 'classifier' and modelinstance.probability is False:

            if modelinstance.unique_values is None:
                raise(exceptions.ModelError('If using classifier without probability scores, unique_values cannot \n'
                                            'be None'))
            self.interpreter.logger.warn("Classifiers with probability scores can be explained \n"
                                         "more granularly than those without scores. If a prediction method with \n"
                                         "scores is available, use that instead.")

        # TODO: This we can change easily to functional style
        missing_feature_ids = []
        for feature_id in feature_ids:
            if feature_id not in self.data_set.feature_ids:
                missing_feature_ids.append(feature_id)

        if missing_feature_ids:
            missing_feature_id_err_msg = "Features {0} not found in \n" \
                                         "Interpretation.data_set.feature_ids \n" \
                                         "{1}".format(missing_feature_ids, self.data_set.feature_ids)
            raise(KeyError(missing_feature_id_err_msg))

        if grid_range is None:
            grid_range = (.05, 0.95)
        else:
            if not hasattr(grid_range, "__iter__"):
                err_msg = "Grid range {} needs to be an iterable".format(grid_range)
                raise(exceptions.MalformedGridRangeError(err_msg))

        self._check_grid_range(grid_range)

        if not modelinstance.has_metadata:
            examples = self.data_set.generate_sample(strategy='random-choice',
                                                     sample=True,
                                                     n_samples=10)

            examples = DataManager(examples, feature_names=self.data_set.feature_ids)
            modelinstance._build_model_metadata(examples)

        # if you don't pass a grid, build one.
        grid = np.array(grid)
        if not grid.any():
            # Currently, if a given feature has fewer unique values than the value
            # of grid resolution, then the grid will be set to those unique values.
            # Otherwise it will take the percentile
            # range according with grid_resolution bins.
            grid = self.data_set.generate_grid(feature_ids,
                                               grid_resolution=grid_resolution,
                                               grid_range=grid_range)
        else:
            # want to ensure all grids have 2 axes
            if len(grid.shape) == 1 and \
                    (StaticTypes.data_types.is_string(grid[0]) or StaticTypes.data_types.is_numeric(grid[0])):
                grid = grid[:, np.newaxis].T
                grid_resolution = grid.shape[1]

        self.interpreter.logger.debug("Grid shape used for pdp: {}".format(grid.shape))
        self.interpreter.logger.debug("Grid resolution for pdp: {}".format(grid_resolution))

        # make sure data_set module is giving us correct data structure
        self._check_grid(grid, feature_ids)

        # generate data
        data_sample = self.data_set.generate_sample(strategy=sampling_strategy,
                                                    sample=sample,
                                                    n_samples=n_samples,
                                                    bin_count=bin_count)

        assert type(data_sample) == self.data_set.data_type, "Something went wrong\n" \
                                                             "There's a type mismatch between\n" \
                                                             "the sampled data and the original\n" \
                                                             "training set. Check Skater.models\n"

        _pdp_metadata = self._build_metadata_dict(modelinstance,
                                                  feature_ids,
                                                  self.data_set.feature_ids,
                                                  filter_classes,
                                                  variance_type)

        self.interpreter.logger.debug("Shape of sampled data: {}".format(data_sample.shape))
        self.interpreter.logger.debug("Feature Ids: {}".format(feature_ids))
        self.interpreter.logger.debug("PD metadata: {}".format(_pdp_metadata))

        # cartesian product of grid
        grid_expanded = pd.DataFrame(list(product(*grid))).values

        if grid_expanded.shape[0] <= 0:
            empty_grid_expanded_err_msg = "Must have at least 1 pdp value; " \
                                          "grid shape: {}".format(grid_expanded.shape)
            raise(exceptions.MalformedGridError(empty_grid_expanded_err_msg))

        predict_fn = modelinstance._get_static_predictor()

        n_jobs = None if n_jobs < 0 else n_jobs
        pd_func = functools.partial(_compute_pd,
                                    estimator_fn=predict_fn,
                                    grid_expanded=grid_expanded,
                                    pd_metadata=_pdp_metadata,
                                    input_data=data_sample,
                                    filter_classes=filter_classes)
        arg_list = [i for i in range(grid_expanded.shape[0])]
        executor_instance = Pool(n_jobs)

        if progressbar:
            self.interpreter.logger.warn("Progress bars slow down runs by 10-20%. For slightly \n"
                                         "faster runs, do progress_bar=False")
            mapper = executor_instance.imap
            p = ProgressBar(len(arg_list), units='grid cells')
        else:
            mapper = executor_instance.map

        pd_list = []
        try:
            if n_jobs == 1:
                raise ValueError("Skipping to single processing")
            for pd_row in mapper(pd_func, arg_list):
                if progressbar:
                    p.animate()
                pd_list.append(pd_row)
        except Exception:
            self.interpreter.logger.warn("Multiprocessing failed, going single process")
            for pd_row in map(pd_func, arg_list):
                if progressbar:
                    p.animate()
                pd_list.append(pd_row)
        finally:
            executor_instance.close()
            executor_instance.join()
            executor_instance.terminate()
        if return_metadata:
            return pd.DataFrame(list(pd_list)), _pdp_metadata
        else:
            return pd.DataFrame(list(pd_list))
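
# The execution strategy above (a Pool mapper with a single-process fallback)
# distilled into a small, self-contained sketch. work() and its inputs are
# hypothetical; this only illustrates the control flow, not the skater internals.
from functools import partial
from multiprocessing import Pool


def work(i, scale=1.0):
    return i * scale


def run(inputs, n_jobs=4):
    func = partial(work, scale=2.0)
    pool = Pool(n_jobs)
    results = []
    try:
        if n_jobs == 1:
            raise ValueError("Skipping to single processing")
        for out in pool.imap(func, inputs):
            results.append(out)
    except Exception:
        # Fall back to a plain map in the current process.
        results = list(map(func, inputs))
    finally:
        pool.close()
        pool.join()
        pool.terminate()
    return results


if __name__ == '__main__':
    print(run(range(10)))
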
Exemplo n.º 35
0
def test():
    print('cpuCount() = %d\n' % cpuCount())

    #
    # Create pool
    #

    PROCESSES = 4
    print('Creating pool with %d processes\n' % PROCESSES)
    pool = Pool(PROCESSES)

    #
    # Tests
    #

    TASKS = [(mul, (i, 7)) for i in range(10)] + \
            [(plus, (i, 8)) for i in range(10)]

    results = [pool.apply_async(calculate, t) for t in TASKS]
    imap_it = pool.imap(calculatestar, TASKS)
    imap_unordered_it = pool.imap_unordered(calculatestar, TASKS)

    print('Ordered results using pool.apply_async():')
    for r in results:
        print('\t', r.get())
    print()

    print('Ordered results using pool.imap():')
    for x in imap_it:
        print('\t', x)
    print()

    print('Unordered results using pool.imap_unordered():')
    for x in imap_unordered_it:
        print('\t', x)
    print()

    print('Ordered results using pool.map() --- will block till complete:')
    for x in pool.map(calculatestar, TASKS):
        print('\t', x)
    print()

    #
    # Simple benchmarks
    #

    N = 100000
    print('def pow3(x): return x**3')

    t = time.time()
    A = list(map(pow3, range(N)))
    print('\tmap(pow3, range(%d)):\n\t\t%s seconds' % \
          (N, time.time() - t))

    t = time.time()
    B = pool.map(pow3, range(N))
    print('\tpool.map(pow3, range(%d)):\n\t\t%s seconds' % \
          (N, time.time() - t))

    t = time.time()
    C = list(pool.imap(pow3, range(N), chunksize=N // 8))
    print('\tlist(pool.imap(pow3, range(%d), chunksize=%d)):\n\t\t%s' \
          ' seconds' % (N, N//8, time.time() - t))

    assert A == B == C, (len(A), len(B), len(C))
    print()

    L = [None] * 1000000
    print('def noop(x): pass')
    print('L = [None] * 1000000')

    t = time.time()
    A = list(map(noop, L))
    print('\tmap(noop, L):\n\t\t%s seconds' % \
          (time.time() - t))

    t = time.time()
    B = pool.map(noop, L)
    print('\tpool.map(noop, L):\n\t\t%s seconds' % \
          (time.time() - t))

    t = time.time()
    C = list(pool.imap(noop, L, chunksize=len(L) // 8))
    print('\tlist(pool.imap(noop, L, chunksize=%d)):\n\t\t%s seconds' % \
          (len(L)//8, time.time() - t))

    assert A == B == C, (len(A), len(B), len(C))
    print()

    del A, B, C, L

    #
    # Test error handling
    #

    print('Testing error handling:')

    try:
        print(pool.apply(f, (5, )))
    except ZeroDivisionError:
        print('\tGot ZeroDivisionError as expected from pool.apply()')
    else:
        raise AssertionError('expected ZeroDivisionError')

    try:
        print(pool.map(f, range(10)))
    except ZeroDivisionError:
        print('\tGot ZeroDivisionError as expected from pool.map()')
    else:
        raise AssertionError('expected ZeroDivisionError')

    try:
        print(list(pool.imap(f, range(10))))
    except ZeroDivisionError:
        print('\tGot ZeroDivisionError as expected from list(pool.imap())')
    else:
        raise AssertionError('expected ZeroDivisionError')

    it = pool.imap(f, range(10))
    for i in range(10):
        try:
            x = it.next()
        except ZeroDivisionError:
            if i == 5:
                pass
        except StopIteration:
            break
        else:
            if i == 5:
                raise AssertionError('expected ZeroDivisionError')

    assert i == 9
    print('\tGot ZeroDivisionError as expected from IMapIterator.next()')
    print()

    #
    # Testing timeouts
    #

    print('Testing ApplyResult.get() with timeout:', end='')
    res = pool.apply_async(calculate, TASKS[0])
    while 1:
        sys.stdout.flush()
        try:
            sys.stdout.write('\n\t%s' % res.get(0.02))
            break
        except TimeoutError:
            sys.stdout.write('.')
    print()
    print()

    print('Testing IMapIterator.next() with timeout:', end='')
    it = pool.imap(calculatestar, TASKS)
    while 1:
        sys.stdout.flush()
        try:
            sys.stdout.write('\n\t%s' % it.next(0.02))
        except StopIteration:
            break
        except TimeoutError:
            sys.stdout.write('.')
    print()
    print()

    #
    # Testing callback
    #

    print('Testing callback:')

    A = []
    B = [56, 0, 1, 8, 27, 64, 125, 216, 343, 512, 729]

    r = pool.apply_async(mul, (7, 8), callback=A.append)
    r.wait()

    r = pool.map_async(pow3, range(10), callback=A.extend)
    r.wait()

    if A == B:
        print('\tcallbacks succeeded\n')
    else:
        print('\t*** callbacks failed\n\t\t%s != %s\n' % (A, B))

    #
    # Check there are no outstanding tasks
    #

    assert not pool._cache, 'cache = %r' % pool._cache

    #
    # Check close() methods
    #

    print('Testing close():')

    for worker in pool._pool:
        assert worker.is_alive()

    result = pool.apply_async(time.sleep, [0.5])
    pool.close()
    pool.join()

    assert result.get() is None

    for worker in pool._pool:
        assert not worker.is_alive()

    print('\tclose() succeeded\n')

    #
    # Check terminate() method
    #

    print('Testing terminate():')

    pool = Pool(2)
    ignore = pool.apply(pow3, [2])
    results = [pool.apply_async(time.sleep, [10]) for i in range(10)]
    pool.terminate()
    pool.join()

    for worker in pool._pool:
        assert not worker.is_alive()

    print('\tterminate() succeeded\n')

    #
    # Check garbage collection
    #

    print('Testing garbage collection:')

    pool = Pool(2)
    processes = pool._pool

    ignore = pool.apply(pow3, [2])
    results = [pool.apply_async(time.sleep, [10]) for i in range(10)]

    del results, pool

    time.sleep(0.2)

    for worker in processes:
        assert not worker.is_alive()

    print('\tgarbage collection succeeded\n')
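
# A shorter, modern variant of the pool setup exercised above: the context
# manager form terminates the pool automatically once the block exits. This
# is a minimal sketch with its own toy function, not part of the original test.
from multiprocessing import Pool


def pow3_example(x):
    return x ** 3


if __name__ == '__main__':
    with Pool(processes=4) as pool:
        ordered = pool.map(pow3_example, range(10))
        unordered = sorted(pool.imap_unordered(pow3_example, range(10)))
    assert ordered == unordered
    print(ordered)
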
Exemplo n.º 36
0
def upload_dtm(run_id, output_path):
    stat = RunStats.objects.get(pk=run_id)
    print("upload dtm results to db")

    info = readInfo(os.path.join(output_path, "lda-seq/info.dat"))

    topic_ids = db.add_topics(stat.K, stat.run_id)

    vocab_ids = []
    input_path = output_path.replace("-output-", "-input-")
    with open(os.path.join(input_path, 'foo-vocab.dat'), 'r') as f:
        for l in f:
            try:
                vocab_ids.append(int(l.split(':')[0].strip()))
            except ValueError:
                pass

    ids = []
    docsizes = []
    with open(os.path.join(input_path, 'foo-docids.dat'), 'r') as f:
        for l in f:
            try:
                id, s = [int(x.strip()) for x in l.split(':')]
                ids.append(id)
                docsizes.append(s)
            except ValueError:
                pass

    time_range = sorted([tp.n for tp in stat.periods.all().order_by('n')])

    #################################
    # TopicTerms

    print("writing topic terms")
    topics = range(info['NUM_TOPICS'])
    pool = Pool(processes=8)
    pool.map(
        partial(dtm_topic,
                info=info,
                topic_ids=topic_ids,
                vocab_ids=vocab_ids,
                ys=time_range,
                run_id=run_id,
                output_path=output_path), topics)
    pool.terminate()
    gc.collect()

    ######################################
    # Doctopics
    print("writing doctopics")
    gamma = np.fromfile(os.path.join(output_path, 'lda-seq/gam.dat'),
                        dtype=float,
                        sep=" ")
    gamma = gamma.reshape((int(len(gamma) / stat.K), stat.K))

    gamma = find(csr_matrix(gamma))
    glength = len(gamma[0])
    chunk_size = 100000
    ps = 16
    parallel_add = True

    all_dts = []

    make_t = 0
    add_t = 0

    for i in range(glength // chunk_size + 1):
        dts = []
        values_list = []
        f = i * chunk_size
        l = (i + 1) * chunk_size
        if l > glength:
            l = glength
        docs = range(f, l)
        doc_batches = []
        for p in range(ps):
            doc_batches.append([x for x in docs if x % ps == p])
        pool = Pool(processes=ps)
        make_t0 = time()
        values_list.append(
            pool.map(
                partial(db.f_gamma_batch,
                        gamma=gamma,
                        docsizes=docsizes,
                        docUTset=ids,
                        topic_ids=topic_ids,
                        run_id=run_id), doc_batches))
        pool.terminate()
        make_t += time() - make_t0
        django.db.connections.close_all()

        add_t0 = time()
        values_list = [item for sublist in values_list for item in sublist]

        pool = Pool(processes=ps)
        pool.map(db.insert_many, values_list)
        pool.terminate()

        add_t += time() - add_t0
        gc.collect()
        sys.stdout.flush()

    stat = RunStats.objects.get(run_id=run_id)
    stat.last_update = timezone.now()
    stat.status = 3  # 3 = finished
    stat.save()
    management.call_command('update_run', run_id)
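
# The doc_batches construction above splits a range of document indices
# round-robin across worker processes (index modulo number of workers).
# A minimal, self-contained sketch of that split; the numbers are hypothetical.
def round_robin_batches(doc_indices, n_workers):
    """Assign documents to workers by index modulo n_workers."""
    return [[d for d in doc_indices if d % n_workers == w]
            for w in range(n_workers)]


if __name__ == '__main__':
    print(round_robin_batches(range(10), 3))
    # [[0, 3, 6, 9], [1, 4, 7], [2, 5, 8]]
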
Exemplo n.º 37
0
def run_blei_dtm(stat,
                 call_to_blei_algorithm=True,
                 dtm_path="/home/galm/software/dtm/dtm/main",
                 archiving=True):
    """
    Run Blei's dynamic topic model (DTM) on utterances (speeches) or paragraphs from the parliament data

    :param stat:  RunStats object with the parameters to run the model with
    :param call_to_blei_algorithm: boolean whether to call the blei algorithm or not
    :param dtm_path: path to the dtm binary
    :return: 0 if successful, 1 otherwise
    """

    start_datetime = timezone.now()

    print("starting topic model for runstat with settings:")
    for field in stat._meta.fields:
        field_value = getattr(stat, field.name)
        if field_value:
            print("{}: {}".format(field.name, field_value))

    run_id = stat.run_id
    s = Search.objects.get(pk=stat.psearch.id)

    ##########################
    ## create input and output folder

    input_path = './dtm-input-sid{}_{}'.format(stat.psearch.id, stat.pk)
    output_path = './dtm-output-sid{}_{}'.format(stat.psearch.id, stat.pk)

    if os.path.isdir(input_path):
        if call_to_blei_algorithm:
            shutil.rmtree(input_path)
            os.mkdir(input_path)
    else:
        os.mkdir(input_path)

    if os.path.isdir(output_path):
        if call_to_blei_algorithm:
            shutil.rmtree(output_path)
            os.mkdir(output_path)
    else:
        os.mkdir(output_path)

    # load text from database

    if s.search_object_type == 1:
        ps = Paragraph.objects.filter(search_matches=s)
        docs = ps.filter(
            text__iregex=r'\w').order_by('utterance__document__parlperiod__n')
        texts, docsizes, ids = process_texts(docs)
        tc = ps.order_by().values(
            'utterance__document__parlperiod__n').annotate(
                count=models.Count('utterance__document__parlperiod__n'))
        time_counts = {
            item['utterance__document__parlperiod__n']: item['count']
            for item in tc
        }
        pps = ParlPeriod.objects.filter(
            document__utterance__paragraph__in=ps).distinct()

    elif s.search_object_type == 2:
        uts = Utterance.objects.filter(
            search_matches=s).order_by('document__parlperiod__n')
        texts, docsizes, ids = merge_utterance_paragraphs(uts)
        tc = uts.order_by().values('document__parlperiod__n').annotate(
            count=models.Count('document__parlperiod__n'))
        time_counts = {
            item['document__parlperiod__n']: item['count']
            for item in tc
        }
        pps = ParlPeriod.objects.filter(document__utterance__in=uts).distinct()
    else:
        print("search object type invalid")
        return 1

    for i, pp in enumerate(pps.order_by('n')):
        try:
            tp, created = TimePeriod.objects.get_or_create(parlperiod=pp,
                                                           n=i,
                                                           ys=pp.years,
                                                           title=str(pp))
        except MultipleObjectsReturned:
            tp = TimePeriod.objects.filter(
                parlperiod=pp, n=i, ys=pp.years,
                title=str(pp)).order_by('id').first()
        stat.periods.add(tp)

    time_range = [tp.n for tp in stat.periods.all().order_by('n')]

    #########################
    ## Get the features now
    print("Extracting word features...")

    if stat.language == "german":
        stemmer = SnowballStemmer("german")
        tokenizer = german_stemmer()
        stopword_list = [stemmer.stem(t) for t in stopwords.words("german")]

    elif stat.language == "english":
        stemmer = SnowballStemmer("english")
        stopword_list = [stemmer.stem(t) for t in stopwords.words("english")]
        tokenizer = snowball_stemmer()
    else:
        print("Language not recognized: {}".format(stat.language))
        return 1

    if stat.extra_stopwords:
        stopword_list = list(set(stopword_list) | set(stat.extra_stopwords))

    if stat.max_features == 0:
        n_features = 100000000
    else:
        n_features = stat.max_features

    vectorizer = CountVectorizer(max_df=stat.max_df,
                                 min_df=stat.min_freq,
                                 max_features=n_features,
                                 ngram_range=(1, stat.ngram),
                                 tokenizer=tokenizer,
                                 stop_words=stopword_list)

    t0 = time()
    dtm = vectorizer.fit_transform(texts)

    print("done in %0.3fs." % (time() - t0))

    with open(os.path.join(input_path, 'foo-doctexts.dat'), 'w') as f:
        for i, text in enumerate(texts):
            f.write("D#{}: ".format(i) + text + "\n")
        f.write('\n')

    del texts

    gc.collect()

    print("Save terms to DB")

    # Get the vocab, add it to db
    vocab = vectorizer.get_feature_names()
    vocab_ids = []
    pool = Pool(processes=8)
    vocab_ids.append(pool.map(partial(db.add_features, run_id=run_id), vocab))
    pool.terminate()

    vocab_ids = vocab_ids[0]
    with open(os.path.join(input_path, 'foo-vocab.dat'), 'w') as f:
        for i, w in enumerate(vocab):
            f.write(str(vocab_ids[i]) + ": " + w + "\n")
        f.write('\n')

    del vocab

    django.db.connections.close_all()

    print("write input files for Blei algorithm")

    with open(os.path.join(input_path, 'foo-mult.dat'), 'w') as mult:
        for d in range(dtm.shape[0]):
            words = find(dtm[d])
            uwords = len(words[0])
            mult.write(str(uwords) + " ")
            for w in range(uwords):
                index = words[1][w]
                count = words[2][w]
                mult.write(str(index) + ": " + str(count) + " ")
            mult.write('\n')

    ##########################
    ##put counts per time step in the seq file

    with open(os.path.join(input_path, 'foo-seq.dat'), 'w') as seq:
        seq.write(str(len(time_range)))

        for key, value in time_counts.items():
            seq.write('\n')
            seq.write(str(value))

    ##########################
    # Run the dtm

    if call_to_blei_algorithm:
        print("Calling Blei algorithm")

        process_output = open(
            os.path.join(output_path, 'blei_dtm_algorithm.log'), 'w')
        subprocess.Popen([
            dtm_path, "--ntopics={}".format(
                stat.K), "--mode=fit", "--rng_seed={}".format(stat.rng_seed),
            "--initialize_lda=true", "--corpus_prefix={}".format(
                os.path.join(os.path.abspath(input_path), 'foo')),
            "--outname={}".format(
                os.path.abspath(output_path)), "--top_chain_var={}".format(
                    stat.top_chain_var), "--alpha={}".format(stat.alpha),
            "--lda_sequence_min_iter=10", "--lda_sequence_max_iter={}".format(
                stat.max_iter), "--lda_max_em_iter=20"
        ],
                         stdout=process_output,
                         stderr=process_output).wait()
        print("Blei algorithm done")

    ##########################
    ## Upload the dtm results to the db

    print("upload dtm results to db")

    info = readInfo(os.path.join(output_path, "lda-seq/info.dat"))

    topic_ids = db.add_topics(stat.K, stat.run_id)

    #################################
    # TopicTerms

    print("writing topic terms")
    topics = range(info['NUM_TOPICS'])
    pool = Pool(processes=8)
    pool.map(
        partial(dtm_topic,
                info=info,
                topic_ids=topic_ids,
                vocab_ids=vocab_ids,
                ys=time_range,
                run_id=run_id,
                output_path=output_path), topics)
    pool.terminate()
    gc.collect()

    ######################################
    # Doctopics
    print("writing doctopics")
    gamma = np.fromfile(os.path.join(output_path, 'lda-seq/gam.dat'),
                        dtype=float,
                        sep=" ")
    gamma = gamma.reshape((int(len(gamma) / stat.K), stat.K))

    gamma = find(csr_matrix(gamma))
    glength = len(gamma[0])
    chunk_size = 100000
    ps = 16
    parallel_add = True

    all_dts = []

    make_t = 0
    add_t = 0

    for i in range(glength // chunk_size + 1):
        dts = []
        values_list = []
        f = i * chunk_size
        l = (i + 1) * chunk_size
        if l > glength:
            l = glength
        docs = range(f, l)
        doc_batches = []
        for p in range(ps):
            doc_batches.append([x for x in docs if x % ps == p])
        pool = Pool(processes=ps)
        make_t0 = time()
        values_list.append(
            pool.map(
                partial(db.f_gamma_batch,
                        gamma=gamma,
                        docsizes=docsizes,
                        docUTset=ids,
                        topic_ids=topic_ids,
                        run_id=run_id), doc_batches))
        pool.terminate()
        make_t += time() - make_t0
        django.db.connections.close_all()

        add_t0 = time()
        values_list = [item for sublist in values_list for item in sublist]
        pool = Pool(processes=ps)

        if s.search_object_type == 1:
            pool.map(db.insert_many_pars, values_list)
        elif s.search_object_type == 2:
            pool.map(db.insert_many_utterances, values_list)

        pool.terminate()
        add_t += time() - add_t0
        gc.collect()
        sys.stdout.flush()

    stat = RunStats.objects.get(run_id=run_id)
    stat.last_update = timezone.now()
    stat.runtime = timezone.now() - start_datetime
    stat.status = 3  # 3 = finished
    stat.save()
    management.call_command('update_run', run_id)

    totalTime = time() - t0

    tm = int(totalTime // 60)
    ts = int(totalTime - (tm * 60))

    print("done! total time: " + str(tm) + " minutes and " + str(ts) +
          " seconds")
    print("a maximum of " +
          str(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1000) +
          " MB was used")

    if archiving:
        subprocess.Popen([
            "zip",
            "-r",
            output_path + ".zip",
            output_path + "/",
        ],
                         stdout=process_output,
                         stderr=process_output).wait()
        subprocess.Popen([
            "zip",
            "-r",
            input_path + ".zip",
            input_path + "/",
        ],
                         stdout=process_output,
                         stderr=process_output).wait()
        subprocess.Popen(["rm", "-r", output_path],
                         stdout=process_output,
                         stderr=process_output).wait()
        subprocess.Popen(["rm", "-r", input_path],
                         stdout=process_output,
                         stderr=process_output).wait()

    return 0
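
# The foo-mult.dat writer above emits one line per document in the form
# "<n_unique_terms> <term_index>: <count> ...". The sketch below reproduces
# that format for a toy document-term matrix; the file name and the matrix
# are hypothetical, only the scipy.sparse.find usage mirrors the code above.
from scipy.sparse import csr_matrix, find

# Toy document-term matrix: 2 documents, 4 terms.
toy_dtm = csr_matrix([[1, 0, 2, 0],
                      [0, 3, 0, 1]])

with open('toy-mult.dat', 'w') as mult:
    for d in range(toy_dtm.shape[0]):
        _, term_indices, counts = find(toy_dtm[d])
        mult.write(str(len(term_indices)) + " ")
        for index, count in zip(term_indices, counts):
            mult.write(str(index) + ": " + str(count) + " ")
        mult.write('\n')
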
Exemplo n.º 38
0
    def feature_importance(self,
                           model_instance,
                           ascending=True,
                           filter_classes=None,
                           n_jobs=-1,
                           progressbar=True,
                           n_samples=5000,
                           method='prediction-variance',
                           scorer_type='default',
                           use_scaling=False):
        """
        Computes feature importance of all features related to a model instance.
        Supports classification, multi-class classification, and regression.

        Wei, Pengfei, Zhenzhou Lu, and Jingwen Song.
        "Variable Importance Analysis: A Comprehensive Review".
        Reliability Engineering & System Safety 142 (2015): 399-432.


        Parameters
        ----------
        model_instance: skater.model.model.Model subtype
            the machine learning model "prediction" function to explain, such that
            predictions = predict_fn(data).
        ascending: boolean, default True
            Helps with ordering Ascending vs Descending
        filter_classes: array type
            The classes to run partial dependence on. Default None invokes all classes.
            Only used in classification models.
        n_jobs: int
            How many concurrent processes to use. Defaults to -1, which grabs as many as are available.
            Use 1 to avoid multiprocessing altogether.
        progressbar: bool
            Whether to display progress. This affects which function we use to operate on the pool
            of processes, where including the progress bar results in 10-20% slowdowns.
        n_samples: int
            How many samples to use when computing importance.
        method: string (default 'prediction-variance'; 'model-scoring' uses an estimator-specific scoring metric)
            How to compute feature importance. 'model-scoring' requires Interpretation.training_labels.
            Note this choice only rarely makes any significant difference.
            prediction-variance: mean absolute value of changes in predictions, given perturbations.
            model-scoring: difference in log_loss or MAE of training_labels given perturbations.
        scorer_type: string
            only used when method='model-scoring', and in this case defines which scoring function to use.
            Default value is 'default', which evaluates to:
                regressors: mean absolute error
                classifiers with probabilities: cross entropy
                classifiers without probabilities: f1 score
            See Skater.model.scorers for details.
        use_scaling: bool
            Whether to weight the importance values by the strength of the perturbations.
            Generally doesn't affect results unless n_samples is very small.

        Returns
        -------
        importances : Sorted Series


        Examples
        --------
            >>> from skater.model import InMemoryModel
            >>> from skater.core.explanations import Interpretation
            >>> from sklearn.ensemble import RandomForestClassifier
            >>> rf = RandomForestClassifier()
            >>> rf.fit(X,y)
            >>> model = InMemoryModel(rf.predict_proba, examples = X)
            >>> interpreter = Interpretation()
            >>> interpreter.load_data(X)
            >>> interpreter.feature_importance.feature_importance(model)
        """

        if filter_classes:
            err_msg = "members of filter classes must be" \
                      "members of model_instance.classes." \
                      "Expected members of: {0}\n" \
                      "got: {1}".format(model_instance.target_names,
                                        filter_classes)
            filter_classes = list(filter_classes)
            assert all([
                i in model_instance.target_names for i in filter_classes
            ]), err_msg

        if n_samples <= self.data_set.n_rows:
            inputs, labels = self.data_set.generate_sample(
                strategy='random-choice',
                include_y=True,
                sample=True,
                n_samples=n_samples)
        else:
            inputs, labels = self.data_set.X, self.data_set.y

        if method == 'model-scoring' and labels is None:
            raise FeatureImportanceError(
                "If labels are not set, you"
                "can only use feature importance methods that do "
                "not require ground truth labels")

        original_predictions = model_instance.predict(inputs)
        n_jobs = None if n_jobs < 0 else n_jobs
        predict_fn = model_instance._get_static_predictor()
        executor_instance = Pool(n_jobs)
        arg_list = self.data_set.feature_ids
        scorer = model_instance.scorers.get_scorer_function(
            scorer_type=scorer_type)

        if progressbar:
            self.interpreter.logger.warn(
                "Progress bars slow down runs by 10-20%. For slightly \n"
                "faster runs, do progress_bar=False")
            n_iter = len(self.data_set.feature_ids)
            p = ProgressBar(n_iter, units='features')
            mapper = executor_instance.imap
        else:
            mapper = executor_instance.map

        fi_func = partial(compute_feature_importance,
                          input_data=inputs,
                          estimator_fn=predict_fn,
                          original_predictions=original_predictions,
                          feature_info=self.data_set.feature_info,
                          feature_names=self.data_set.feature_ids,
                          training_labels=labels,
                          method=method,
                          scaled=use_scaling,
                          scorer=scorer)

        importances = {}
        try:
            if n_jobs == 1:
                raise ValueError("Skipping to single processing")
            importance_dicts = []
            for importance in mapper(fi_func, arg_list):
                importance_dicts.append(importance)
                if progressbar:
                    p.animate()
        except Exception:
            self.interpreter.logger.warn(
                "Multiprocessing failed, going single process")
            importance_dicts = []
            for importance in map(fi_func, arg_list):
                importance_dicts.append(importance)
                if progressbar:
                    p.animate()
        finally:
            executor_instance.close()
            executor_instance.join()
            executor_instance.terminate()

        for i in importance_dicts:
            importances.update(i)

        importances = pd.Series(importances).sort_values(ascending=ascending)

        if not importances.sum() > 0:
            self.interpreter.logger.debug(
                "Importances that caused a bug: {}".format(importances))
            raise (FeatureImportanceError(
                "Something went wrong. Importances do not sum to a positive value\n"
                "This could be due to:\n"
                "1) 0 or infinite divisions\n"
                "2) perturbed values == original values\n"
                "3) feature is a constant\n"
                "Please submit an issue here:\n"
                "https://github.com/datascienceinc/Skater/issues"))

        importances = divide_zerosafe(
            importances, (np.ones(importances.shape[0]) * importances.sum()))
        return importances
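
# To make the 'prediction-variance' idea above concrete: perturb one feature
# at a time (here by permuting its column) and record the mean absolute change
# in the model's predictions. This is an illustrative approximation of the
# approach described in the docstring, not skater's exact implementation; the
# data and the LinearRegression model are hypothetical.
import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(0)
X = rng.normal(size=(200, 3))
y = 3 * X[:, 0] + 0.5 * X[:, 1] + rng.normal(scale=0.1, size=200)

model = LinearRegression().fit(X, y)
baseline = model.predict(X)

importances = {}
for j in range(X.shape[1]):
    X_perturbed = X.copy()
    perm = rng.permutation(X.shape[0])
    X_perturbed[:, j] = X[perm, j]           # perturb a single feature
    delta = np.abs(model.predict(X_perturbed) - baseline)
    importances['x%d' % j] = delta.mean()    # mean absolute prediction change

print(sorted(importances.items(), key=lambda kv: kv[1], reverse=True))
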