def testingStepFinished(self, testingStep, executionSessions):
        pool = multiprocessing.Pool(
            self.config['testing_video_generation_processes'],
            maxtasksperchild=1)

        futures = []
        for session in executionSessions:
            future = pool.apply_async(func=createDebugVideoSubProcess,
                                      args=(self.config.serialize(),
                                            str(session.id), "", False, False,
                                            None, None, "annotated_videos"))
            futures.append((session, future))

        for session, future in futures:
            localFuture = future
            # for retry in range(5):
            # try:
            value = localFuture.get(
                timeout=self.config['debug_video_generation_timeout'])
            if value:
                getLogger().error(value)
            # break
            # except billiard.exceptions.WorkerLostError:
            #     if retry == 4:
            #         raise
            #     localFuture = pool.apply_async(func=createDebugVideoSubProcess,
            #                               args=(self.config.serialize(), str(session.id), "", False, False, None, None, "annotated_videos"))
            # except BrokenPipeError:
            #     if retry == 4:
            #         raise
            #     localFuture = pool.apply_async(func=createDebugVideoSubProcess,
            #                               args=(self.config.serialize(), str(session.id), "", False, False, None, None, "annotated_videos"))

        pool.close()
        pool.join()
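The commented-out blocks above sketch a retry around future.get() for worker crashes. Below is a minimal standalone sketch of that pattern, assuming billiard is installed (the commented code references billiard.exceptions.WorkerLostError) and that resubmitArgs mirrors the original apply_async arguments; it is an illustration, not part of the project code.

import billiard.exceptions


def getResultWithRetries(pool, future, func, resubmitArgs, timeout, retries=5):
    # Fetch a pool result, resubmitting the task when a worker dies
    # or the pipe to it breaks, up to `retries` attempts in total.
    localFuture = future
    for retry in range(retries):
        try:
            return localFuture.get(timeout=timeout)
        except (billiard.exceptions.WorkerLostError, BrokenPipeError):
            if retry == retries - 1:
                raise
            localFuture = pool.apply_async(func=func, args=resubmitArgs)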
Example #2
def getMovies():
    ''' Fetch information pertaining to all released movies
    '''
    r = requests.get(settings.MOVIES_LINK)
    movies_objects = []
    if (r.status_code == 200):
        try:
            soup = BeautifulSoup(r.text, 'html5lib')
            movies_html = soup.find('ul', {'class': 'drop-list masterBorderColor'})
            movies = [settings.NOS_CINEMAS_URL + movie['href']
                  for movie in movies_html.find_all('a', {'class': 'list-item'})]
            # partially apply getMovie
            getMoviePart = partial(getMovie, released=True)
        
            with multiprocessing.Pool(15) as proc_pool:
                movies_objects = proc_pool.map(getMoviePart, movies)
        except Exception as e:
            print(f"[getMovies] Error while parsing HTML {str(e)}")
            raise Exception(f"[getMovies] {str(e)}")

    else:
        print("[getMovies] Não foi possível obter a lista de filmes")
        raise Exception(f"GET {settings.MOVIES_LINK} returned unexpected response code: {r.status_code}")

    return list(filter(lambda x: x is not None, movies_objects))
Example #3
def generateAllCharts(config, applicationId=None, enableCumulativeCoverage=False):
    getLogger().info(f"Generating charts based on results.")

    pool = multiprocessing.Pool(config['chart_generation_workers'], initializer=setupLocalLogging)

    futures = []

    futures.append(pool.apply_async(generateRewardChart, [config.serialize(), applicationId]))
    futures.append(pool.apply_async(generateFitnessChart, [config.serialize(), applicationId]))
    futures.append(pool.apply_async(generateTracesWithNewBranchesChart, [config.serialize(), applicationId]))
    if enableCumulativeCoverage:
        futures.append(pool.apply_async(generateCoverageChart, [config.serialize(), applicationId]))

    futures.append(pool.apply_async(generateLossChart, [config.serialize(), applicationId, 'totalLosses', "Total Loss", 'total_loss_chart.png']))
    futures.append(pool.apply_async(generateLossChart, [config.serialize(), applicationId, 'presentRewardLosses', "Present Reward Loss", 'present_reward_loss_chart.png']))
    futures.append(pool.apply_async(generateLossChart, [config.serialize(), applicationId, 'discountedFutureRewardLosses', "Discounted Future Reward Loss", 'discounted_future_reward_loss_chart.png']))
    futures.append(pool.apply_async(generateLossChart, [config.serialize(), applicationId, 'stateValueLosses', "State Value Loss", 'state_value_loss_chart.png']))
    futures.append(pool.apply_async(generateLossChart, [config.serialize(), applicationId, 'advantageLosses', "Advantage Loss", 'advantage_loss_chart.png']))
    futures.append(pool.apply_async(generateLossChart, [config.serialize(), applicationId, 'actionProbabilityLosses', "Action Probability Loss", 'action_probability_loss_chart.png']))

    if config['chart_enable_cumulative_coverage_chart'] and enableCumulativeCoverage:
        futures.append(pool.apply_async(generateCumulativeCoverageChart, [config.serialize(), applicationId, 100]))
        futures.append(pool.apply_async(generateCumulativeCoverageChart, [config.serialize(), applicationId, 25]))
        futures.append(pool.apply_async(generateCumulativeCoverageChart, [config.serialize(), applicationId, 10]))
        futures.append(pool.apply_async(generateCumulativeCoverageChart, [config.serialize(), applicationId, 5]))

    if config['chart_enable_cumulative_errors_chart']:
        futures.append(pool.apply_async(generateCumulativeErrorsFoundChart, [config.serialize(), applicationId]))

    for future in futures:
        future.get()

    pool.close()
    pool.join()
    getLogger().info(f"Completed generating all the charts.")
Example #4
def get_intersection_buffers(roads, road_bounds, intersection_buffer_units,
                             tile_max_units):
    """Buffers all intersections
    :param roads: List of shapely geometries representing road segments
    :param road_bounds: Bounding box of the roads shapefile
    :param intersection_buffer_units: Number of units to use for buffer radius
    :param tile_max_units: Maximum number of units for each side of a tile
    """

    # As an optimization, the road network is divided up into a grid of tiles,
    # and intersections are calculated within each tile.
    def roads_per_tile_iter():
        """Generator which yields a set of roads for each tile"""
        min_x, min_y, max_x, max_y = road_bounds
        bounds_width = max_x - min_x
        bounds_height = max_y - min_y
        x_divisions = ceil(bounds_width / tile_max_units)
        y_divisions = ceil(bounds_height / tile_max_units)
        tile_width = bounds_width / x_divisions
        tile_height = bounds_height / y_divisions

        # Create a spatial index for roads to efficiently match up roads to tiles
        logger.info('Generating spatial index for intersections')
        roads_index = rtree.index.Index()
        for idx, road in enumerate(roads):
            roads_index.insert(idx, road.bounds)

        logger.info('Number of tiles: {}'.format(int(x_divisions *
                                                     y_divisions)))
        for x_offset in range(0, int(x_divisions)):
            for y_offset in range(0, int(y_divisions)):
                road_ids_in_tile = roads_index.intersection([
                    min_x + x_offset * tile_width,
                    min_y + y_offset * tile_height,
                    min_x + (1 + x_offset) * tile_width,
                    min_y + (1 + y_offset) * tile_height
                ])
                roads_in_tile = [
                    roads[road_id] for road_id in road_ids_in_tile
                ]
                if len(roads_in_tile) > 1:
                    yield roads_in_tile

    # Allocate one worker per core, and parallelize the discovery of intersections
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    tile_intersections = pool.imap(get_intersections, roads_per_tile_iter())
    pool.close()
    pool.join()

    logger.info('Buffering intersections')
    # Note: tile_intersections is a list of multipoints (which is a list of points).
    # itertools.chain.from_iterable flattens the list into a list of single points.
    buffered_intersections = [
        intersection.buffer(intersection_buffer_units)
        for intersection in itertools.chain.from_iterable(tile_intersections)
    ]

    # If intersection buffers overlap, union them to treat them as one
    logger.info('Performing unary union on buffered intersections')
    return unary_union(buffered_intersections)
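A hedged usage sketch for the function above, assuming shapely LineString road segments and that get_intersections and logger are defined elsewhere in the same module; the coordinates and unit values are illustrative only.

from shapely.geometry import LineString

if __name__ == '__main__':
    # Two road segments crossing near (5, 5); the crossing gets buffered.
    roads = [LineString([(0, 0), (10, 10)]), LineString([(0, 10), (10, 0)])]
    road_bounds = (0.0, 0.0, 10.0, 10.0)

    buffered = get_intersection_buffers(roads, road_bounds,
                                        intersection_buffer_units=1,
                                        tile_max_units=100)
    print(buffered.area)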
Example #5
File: test.py Project: Sanyam07/ml
    def _grid_optimize_pool(self, X, Y, regressor_class, grid):
        pool = billiard.Pool(processes=self.cores, soft_timeout=self.timeout)
        pool_res = []
        results = []

        for params in self._grid_iterator(grid):
            regressor = regressor_class(**params)
            r = pool.apply_async(_pickle_bypass,
                                 (self, "_cross_validation", X, Y, regressor))
            pool_res.append((r, params))

        for r, params in pool_res:
            start_time = time()
            err_msg = False
            try:
                score, start_time = r.get(self.timeout + 5)
                if self.verbose > 1:
                    print "* %.5f *" % score, regressor_class.__name__, params
                results.append((score, params))
            except billiard.SoftTimeLimitExceeded:
                print "TIME LIM"
                err_msg = True
            except billiard.TimeoutError:
                print "* TIMEOUT *"
                err_msg = True
            if self.verbose > 1 or err_msg:
                print "time: %.2f s" % (time() - start_time)
        pool.terminate()  # TODO still not sure
        pool.join()
        if self.verbose > 1:
            print "Terminated"

        results.sort(reverse=self.maximise)
        return results[0] if results else None
Example #6
def computeCumulativeCoverageForTestingSteps(testingStepIds, config):
    futures = []

    pool = multiprocessing.Pool(config['chart_generation_dataload_workers'])

    for stepId in testingStepIds:
        futures.append(pool.apply_async(computeCumulativeBranchTraceForTestingSteps, [stepId, config]))

    cumulativeBranchTrace = {}

    for future in futures:
        branchTrace = future.get()
        for fileName in branchTrace:
            if fileName not in cumulativeBranchTrace:
                cumulativeBranchTrace[fileName] = branchTrace[fileName]
            else:
                cumulativeBranchTrace[fileName] = cumulativeBranchTrace[fileName].maximum(branchTrace[fileName])

    total = 0
    executedAtleastOnce = 0
    for fileName in cumulativeBranchTrace:
        total += cumulativeBranchTrace[fileName].shape[0]
        executedAtleastOnce += len(numpy.nonzero(cumulativeBranchTrace[fileName])[0])

    # Just an extra check here to cover our ass in case of division by zero
    if total == 0:
        total += 1

    pool.close()
    pool.join()

    return float(executedAtleastOnce) / float(total), executedAtleastOnce, total
Example #7
def multiprocess(tasks, pool_size=get_num_cpus()):
    """
    Executes several tasks concurrently via Python ``multiprocessing``
    processes, puts the results into a queue, and generates these back to the
    caller.
    """

    pool = billiard.Pool(pool_size)

    result_q = Queue()

    def build_results(result):
        if type(result) in (types.GeneratorType, list, tuple, set):
            for r in result:
                result_q.put(r)
        else:
            result_q.put(result)

    for task in tasks:
        run = pool.apply_async(task.func,
                               args=task.args,
                               callback=build_results)
        run.get()
    pool.close()
    pool.join()

    while not result_q.empty():
        result = result_q.get_nowait()
        yield result
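A minimal usage sketch for the generator above, assuming its own imports (billiard, Queue, types) are in scope; a task here is any object exposing func and args attributes, and the Task namedtuple below is illustrative, not part of the snippet.

from collections import namedtuple

Task = namedtuple('Task', ['func', 'args'])


def square(n):
    return n * n


if __name__ == '__main__':
    tasks = [Task(func=square, args=(i,)) for i in range(4)]
    # Results are collected via the callback queue and yielded lazily.
    for result in multiprocess(tasks, pool_size=2):
        print(result)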
Example #8
File: test.py Project: Sanyam07/ml
    def _cross_validation_pool(self, X, Y, regressor):
        pool = billiard.Pool(processes=self.cores, soft_timeout=self.timeout)
        kf = cv.KFold(X.shape[0],
                      n_folds=self.folds,
                      shuffle=self.shuffle,
                      random_state=self.random_state)
        Y_prime = np.zeros(Y.shape)
        cv_res = []

        for train_ind, test_ind in kf:
            X_train, X_test, Y_train = X[train_ind], X[test_ind], Y[train_ind]
            r = pool.apply_async(regressor, (X_train, Y_train, X_test))
            cv_res.append((r, test_ind))
        try:
            for r, test_ind in cv_res:
                if len(Y_prime.shape) == 1:
                    Y_prime[test_ind] = r.get(self.timeout + 5)
                else:
                    Y_prime[test_ind, :] = r.get(self.timeout + 5)
        except (billiard.SoftTimeLimitExceeded, billiard.TimeoutError):
            pool.terminate()
            if self.verbose > 2:
                print "TERMINATED"
            return None

        pool.close()
        pool.join()
        if self.verbose > 2:
            print "CLOSED"
        return Y_prime
Example #9
def concurrent_twitter_query_wad(self, username, tweets_num=10, threads=2):
    if current_cred >= len(all_creds):
        raise Exception('API keys exhausted.')
    try:
        all_tweets = twitterdata.get_all_tweets(username, tweets_num)
    except:
        raise
    try:
        tweets_output = getOutput(all_tweets)
    except:
        raise
    pool = mp.Pool(threads)
    try:
        pool_results = pool.starmap(wad_helper, [(tweets_output, 'weapon'),
                                                 (tweets_output, 'drugs'),
                                                 (tweets_output, 'alcohol')])
        self.update_state(state="PROGRESS")
    except:
        raise
    finally:
        pool.terminate()
    weapon_vals = pool_results[0]
    drug_vals = pool_results[1]
    alcohol_vals = pool_results[2]
    print('Data acquired concurrently for user: ' + username)
    result = {
        'weapons': weapon_vals,
        'alcohol': alcohol_vals,
        'drugs': drug_vals
    }
    result = {username: result}
    print(result)
    self.update_state("FINISHED")
    return {"result": result}
Example #10
def do_prediction(self, intbl, selections, gene_names,
                  filteropt=1, filterval=1, spec_ecutoff=0.4, nonspec_ecutoff=0.35):
    '''
    intbl: preprocessed table
    filteropt: 1 for highest t-val, 2 for p-val cutoff
    filterval: # TFs for opt 1 and p-val cutoff for opt 2
    '''

    if type(intbl) is str: # got an error in the pipeline from inittbl
        return {'current': 1, 'total': 1, 'error': intbl}

    # intbl: #rowidx,seq,val,diff,t,pbmname,escore_seq
    start_time = time.time()

    #while not inittask.ready():
    #    time.sleep(1)
    #intbl = inittask.get()
    predfiles = [app.config['PREDDIR'] + "/" + s for s in selections] # os.listdir(preddir)
    preds = [l for l in utils.chunkify(predfiles,app.config['PCOUNT']) if len(l) != 0] # chunks the predfiles for each process

    # collect the short2long_map -- shared, so only one i/o
    emap = pd.read_csv("%s/index_short_to_long.csv" % (app.config["ESCORE_DIR"]), header=0, index_col=0, sep=',', dtype='Int32') # pd.DataFrame
    emap = np.array(emap[emap.columns[0]]) - 1 #emap[emap.columns[0]].to_numpy() - 1

    # ---- MULTIPROCESSING PART ----
    pool = mp.Pool(processes=app.config['PCOUNT'])
    # need to use manager here
    shared_ready_sum = mp.Manager().Value('i', 0)

    predict_partial = ft.partial(predict, **{'dataset':intbl, 'ready_count':shared_ready_sum, 'emap':emap,
            'filteropt':filteropt, 'filterval':filterval, 'spec_ecutoff':spec_ecutoff, 'nonspec_ecutoff':nonspec_ecutoff})
    async_pools = [pool.apply_async(predict_partial, (preds[i], )) for i in range(0,len(preds))]

    # run the job, update progress bar
    total = len(predfiles)
    while not all([p.ready() for p in async_pools]):
        time.sleep(2) # super important to avoid checking every loop
        self.update_state(state='PROGRESS',
                          meta={'current': shared_ready_sum.value, 'total': total, 'status': 'Processing input data...'})

    res = [p.get() for p in async_pools]

    self.update_state(state='PROGRESS',
                          meta={'current': shared_ready_sum.value, 'total': total, 'status': 'post-processing'})
    print("Terminate all children process..")
    pool.terminate() # terminate to kill all child processes !!! Like.. super important,
                     # to avoid memory leak, seriously...
    datavalues = postprocess(res,predfiles,gene_names,filteropt,filterval)

    ''' SET the values in redis '''
    #print("marktesting",colnames,datavalues)
    savetomongo(self.request.id, datavalues.to_dict('records') ,app.config['USER_DATA_EXPIRY'])
    # significance_score can be z-score or p-value depending on the out_type

    #db.expire("%s:vals:*" % self.request.id, app.config['USER_DATA_EXPIRY'])

    return {'current': shared_ready_sum.value, 'total': len(predfiles), 'status': 'Task completed!',
            'result': 'done', 'taskid': self.request.id,
            'time':(time.time()-start_time)} # -- somehow cannot do jsonify(postproc)
Example #11
    def fast_calculate_features(self, words):
        import billiard
        pool = billiard.Pool(4)
        feats = pool.map(bio_chunk_features, words)
        pool.close()
        #result = job.apply_async()
        #feats = result.get()
        return feats
Example #12
def generateCoverageChart(config, applicationId):
    getLogger().info(f"Generating the coverage chart")

    config = KwolaCoreConfiguration(config)

    testingSteps = sorted(
        [step for step in TrainingManager.loadAllTestingSteps(config, applicationId=applicationId) if step.status == "completed"],
        key=lambda step: step.startTime, reverse=False)

    coverageValueFutures = []

    pool = multiprocessing.Pool(config['chart_generation_dataload_workers'])

    for step in testingSteps:
        coverageValueFutures.append(pool.apply_async(computeCumulativeCoverageForTestingSteps, [[step.id], config]))

    coverageValues = [future.get()[0] for future in coverageValueFutures]
    executedLinesValues = [future.get()[1] for future in coverageValueFutures]
    totalLinesValues = [future.get()[2] for future in coverageValueFutures]

    coverageValues = scipy.signal.medfilt(coverageValues, kernel_size=9)
    executedLinesValues = scipy.signal.medfilt(executedLinesValues, kernel_size=9)
    totalLinesValues = scipy.signal.medfilt(totalLinesValues, kernel_size=9)

    fig, ax = plt.subplots()
    ax.plot(range(len(coverageValues)), coverageValues, color='green')
    ax.set(xlabel='Testing Step #', ylabel='Coverage',
           title='Code Coverage')
    ax.grid()

    _, localFilePath = tempfile.mkstemp(suffix=".png")
    fig.savefig(localFilePath)
    with open(localFilePath, 'rb') as f:
        config.saveKwolaFileData("charts", "coverage_chart.png", f.read())
    os.unlink(localFilePath)

    fig, ax = plt.subplots()
    ax.plot(range(len(executedLinesValues)), executedLinesValues, color='green')
    ax2 = ax.twinx()
    ax2.plot(range(len(totalLinesValues)), totalLinesValues, color='red')
    ax.set(xlabel='Testing Step #', ylabel='Lines Executed (green)',
           title='Lines Available / Lines Triggered')
    ax2.set(ylabel="Lines Available (red)")
    ax.grid()
    ax2.grid()

    _, localFilePath = tempfile.mkstemp(suffix=".png")
    fig.savefig(localFilePath)
    with open(localFilePath, 'rb') as f:
        config.saveKwolaFileData("charts", "lines_triggered.png", f.read())
    os.unlink(localFilePath)

    pool.close()
    pool.join()
Example #13
File: test.py Project: Sanyam07/ml
    def _multiple_pool(self, X, Y, regressor_classes, grids):
        # only valid solution
        pool = billiard.Pool(processes=self.cores, soft_timeout=self.timeout)
        pool_res = []
        results = []
        timer = time()

        if type(grids) is dict:
            grids = [grids] * len(regressor_classes)
        else:
            assert len(regressor_classes) == len(grids)

        for reg_c, grid in izip(regressor_classes, grids):
            for params in self._grid_iterator(grid):
                regressor = reg_c(**params)

                r = pool.apply_async(
                    _pickle_bypass,
                    (self, "_cross_validation", X, Y, regressor))
                pool_res.append((r, reg_c, params))

        for r, reg_c, params in pool_res:
            start_time = time()
            try:
                score, start_time = r.get(self.timeout + 5)
                if self.verbose > 1:
                    print "* %.5f *" % score, reg_c, params
                results.append((score, reg_c, params))
            except billiard.SoftTimeLimitExceeded:
                print "TIME LIM"
            except billiard.TimeoutError:
                print "* TIMEOUT *"
            if self.verbose > 1:
                print "time: %.2f s" % (time() - start_time)
        pool.terminate()  # TODO still not sure
        pool.join()

        best_reg = None
        best_score = -np.inf if self.maximise else np.inf
        best_params = None

        for score, reg_c, prms in results:
            if (self.maximise
                    and score > best_score) or (not self.maximise
                                                and score < best_score):
                best_score = score
                best_reg = reg_c
                best_params = prms

        if self.verbose > 1:
            print "multiple done in timer"
            print best_score, best_reg, best_params

        return best_score, best_reg, best_params
Example #14
def updateSessionsAvailability(date):
    ''' Fetch availability for sessions after the given date
    :param: lower limit for session date
    '''
    print('[updateSessionsAvailability] Updating session information...')
    sessions = Session.objects \
                            .filter(start_date__gte=date) \
                            .all()
    
    p = multiprocessing.Pool(processes=15)
    sessions_updated = p.map(getSessionAvailability, sessions)
    Session.objects.bulk_update(sessions_updated, ['availability'])
    print('[updateSessionsAvailability] Sessions updated!')
Example #15
    def testingStepFinished(self, testingStep, executionSessions):
        pool = multiprocessing.Pool(self.config['video_generation_processes'], maxtasksperchild=1)

        futures = []
        for session in executionSessions:
            future = pool.apply_async(func=createDebugVideoSubProcess, args=(self.config.configurationDirectory, str(session.id), "", False, False, None, None, "annotated_videos"))
            futures.append(future)

        for future in futures:
            future.get()

        pool.close()
        pool.join()
Example #16
def load_data():
    pool = mp.Pool(8)
    jobs = []
    f = open('sample.csv', 'r', buffering=(2 << 16))
    lines = f.readlines(100000)
    while lines:
        jobs.append(pool.apply_async(post_objects, [lines]))
        lines = f.readlines(100000)
    for job in jobs:
        job.get()

    # clean up
    f.close()
    pool.close()
    pool.join()
Example #17
def generateCumulativeErrorsFoundChart(configDir, applicationId):
    getLogger().info(f"Generating the cumulative errors chart")

    config = KwolaCoreConfiguration(configDir)

    testingSteps = sorted([
        step for step in TrainingManager.loadAllTestingSteps(
            config, applicationId=applicationId) if step.status == "completed"
    ],
                          key=lambda step: step.startTime,
                          reverse=False)

    bugsByTestingStepId = {step.id: 0 for step in testingSteps}

    for bug in loadAllBugs(config, applicationId):
        if bug.testingStepId in bugsByTestingStepId:
            bugsByTestingStepId[bug.testingStepId] += 1

    cumulativeErrorsFound = []

    pool = multiprocessing.Pool(config['chart_generation_dataload_workers'])

    currentTotal = 0
    for step in testingSteps:
        currentTotal += bugsByTestingStepId[step.id]
        cumulativeErrorsFound.append(currentTotal)

    fig, ax = plt.subplots()

    ax.plot(range(len(cumulativeErrorsFound)),
            cumulativeErrorsFound,
            color='green')

    ax.set(xlabel='Testing Step #',
           ylabel='Total Errors Found',
           title='Cumulative Errors Found')
    ax.grid()

    _, localFilePath = tempfile.mkstemp(suffix=".png")
    fig.savefig(localFilePath)
    with open(localFilePath, 'rb') as f:
        filePath = f"{config.getKwolaUserDataDirectory('charts')}/errors_found.png"
        saveKwolaFileData(filePath, f.read(), config)
    os.unlink(localFilePath)

    pool.close()
    pool.join()
Example #18
def generateRewardChart(configDir, applicationId):
    getLogger().info(f"Generating the reward chart")

    config = KwolaCoreConfiguration(configDir)

    testingSteps = sorted([
        step for step in TrainingManager.loadAllTestingSteps(
            config, applicationId=applicationId) if step.status == "completed"
    ],
                          key=lambda step: step.startTime,
                          reverse=False)

    rewardValueFutures = []

    pool = multiprocessing.Pool(config['chart_generation_dataload_workers'])

    for step in testingSteps:
        rewardValueFutures.append(
            pool.apply_async(averageRewardForTestingStep, [config, step.id]))

    rewardValues = [
        future.get() for future in rewardValueFutures
        if future.get() is not None
    ]

    fig, ax = plt.subplots()

    rewardValues = scipy.signal.medfilt(rewardValues, kernel_size=9)

    ax.plot(range(len(rewardValues)), rewardValues, color='green')

    ax.set_ylim(0, 15)

    ax.set(xlabel='Testing Step #',
           ylabel='Reward',
           title='Reward per session')
    ax.grid()

    _, localFilePath = tempfile.mkstemp(suffix=".png")
    fig.savefig(localFilePath)
    with open(localFilePath, 'rb') as f:
        filePath = f"{config.getKwolaUserDataDirectory('charts')}/reward_chart.png"
        saveKwolaFileData(filePath, f.read(), config)
    os.unlink(localFilePath)

    pool.close()
    pool.join()
Example #19
    def generateVideoFilesForBugs(self, testingStep, bugObjects):
        pool = multiprocessing.Pool(self.config['video_generation_processes'],
                                    maxtasksperchild=1)
        futures = []
        for bugIndex, bug in enumerate(bugObjects):
            future = pool.apply_async(
                func=createDebugVideoSubProcess,
                args=(self.config.configurationDirectory,
                      str(bug.executionSessionId), f"{bug.id}_bug", False,
                      False, bug.stepNumber, bug.stepNumber + 3, "bugs"))
            futures.append(future)

        for future in futures:
            future.get()

        pool.close()
        pool.join()
Example #20
def generateLossChart(configDir, applicationId, attribute, title, fileName):
    getLogger().info(f"Generating the loss chart for {attribute}")

    config = KwolaCoreConfiguration(configDir)

    trainingStepIds = findAllTrainingStepIds(config,
                                             applicationId=applicationId)

    pool = multiprocessing.Pool(config['chart_generation_dataload_workers'])

    lossValueFutures = []
    for id in trainingStepIds:
        lossValueFutures.append(
            pool.apply_async(loadTrainingStepLossData,
                             [config, id, attribute]))

    lossValuesSorted = sorted([
        future.get()
        for future in lossValueFutures if future.get()[2] == "completed"
    ],
                              key=lambda result: result[1],
                              reverse=False)

    lossValues = [result[0] for result in lossValuesSorted]

    fig, ax = plt.subplots()

    lossValues = scipy.signal.medfilt(lossValues, kernel_size=9)

    ax.plot(range(len(lossValues)), lossValues, color='green')

    ax.set_ylim(0, numpy.percentile(lossValues, 99))

    ax.set(xlabel='Training Step #', ylabel='Loss', title=title)
    ax.grid()

    _, localFilePath = tempfile.mkstemp(suffix=".png")
    fig.savefig(localFilePath)
    with open(localFilePath, 'rb') as f:
        filePath = os.path.join(config.getKwolaUserDataDirectory('charts'),
                                fileName)
        saveKwolaFileData(filePath, f.read(), config)
    os.unlink(localFilePath)

    pool.close()
    pool.join()
Example #21
def twitter_bulk_query_wad(self, user_list, tweets_num=10, threads=2):
    if current_cred >= len(all_creds):
        raise Exception('You have reached the daily limit of 1500 requests!')
    print(user_list)
    results = {}
    pool = mp.Pool(threads)
    try:
        pool_results = pool.starmap(twitter_query_wad,
                                    [(user, tweets_num) for user in user_list])
        self.update_state(state="PROGRESS")
    except:
        raise
    finally:
        pool.terminate()
    results = [{user_list[i]: pool_results[i]} for i in range(len(user_list))]
    print('twitter_bulk_query_wad() completed.')
    self.update_state(state="FINISHED")
    print(results)
    return {"result": results}
Example #22
def generateFitnessChart(config, applicationId):
    getLogger().info(f"Generating the fitness chart")

    config = KwolaCoreConfiguration(config)

    testingSteps = sorted(
        [step for step in TrainingManager.loadAllTestingSteps(config, applicationId=applicationId) if step.status == "completed"],
        key=lambda step: step.startTime, reverse=False)

    fitnessValueFutures = []

    pool = multiprocessing.Pool(config['chart_generation_dataload_workers'])

    for step in testingSteps:
        fitnessValueFutures.append(pool.apply_async(averageFitnessForTestingStep, [config, step.id]))

    fitnessValues = [future.get() for future in fitnessValueFutures if future.get() is not None]

    if len(fitnessValues) > 0:
        bestFitness = numpy.max(fitnessValues)

        fig, ax = plt.subplots()

        fitnessValues = scipy.signal.medfilt(fitnessValues, kernel_size=9)

        ax.plot(range(len(fitnessValues)), fitnessValues, color='green')

        ax.set_ylim(0, 100)

        ax.set(xlabel='Testing Step #', ylabel='Fitness',
               title='Fitness per session')
        ax.grid()

        _, localFilePath = tempfile.mkstemp(suffix=".png")
        fig.savefig(localFilePath)
        with open(localFilePath, 'rb') as f:
            config.saveKwolaFileData("charts", "fitness_chart.png", f.read())
        os.unlink(localFilePath)

        getLogger().info(f"Best Fitness Value: {bestFitness}")

        pool.close()
        pool.join()
Example #23
def getDiagonalOfInverse(matrix, stop_at=0):

    global p_solver
    global p_num_variables

    if __name__ == '__main__' or __name__ == 'gmrf.pool_inverse':
        p_solver = scipy.sparse.linalg.factorized(matrix)
        p_num_variables = matrix.shape[0]

        if stop_at == 0:
            stop_at = p_num_variables

        with billiard.Pool() as pool:
            diagonal = pool.map(invOfColumn, [i for i in range(0, stop_at)])
            pool.close()
            pool.terminate()
            return np.array(diagonal)
    else:
        return 0
Example #24
def get_dataset(features: list, dataset_id: str, nthread: int = -1):
    all_features = features
    dataframe = pd.DataFrame()
    if nthread > 0:
        for features in np.array_split(all_features,
                                       int(len(all_features) / 32) + 1):
            with mp.Pool(nthread) as p:
                partial_create_features = functools.partial(
                    get_feature, dataset_id=dataset_id)
                # dataframe = pd.concat(p.map(partial_create_features, features), axis=1)
                dataframes = p.map(partial_create_features, features)
                if len(dataframe) > 0:
                    dataframes.append(dataframe)
                dataframe = pd.concat(dataframes, axis=1)
                for df in dataframes:
                    del df
                del dataframes
    else:
        for feature_name in tqdm(features):
            if (feature_name, dataset_id) in FEATURES.keys():
                f = FEATURES[(feature_name, dataset_id)]
                df = f.load_or_create()
                if len(df.columns) == 1:
                    dataframe[feature_name] = df[f.feature_name]
                else:
                    if len(dataframe) > 0:
                        dataframe = pd.concat([dataframe, df], axis=1)
                    else:
                        dataframe = df
            else:
                raise Exception(f"Feature {feature_name} not found ")
    # dataframe = pd.concat([get_feature(feature_name, dataset_id) for feature_name in features], axis=1)
    # Some columns are not in the format XGB expects, so the following block of code will cast them to the right format
    for column in dataframe.columns:
        if str(dataframe[column].dtype).lower()[:3] == "int":
            dataframe[column] = dataframe[column].fillna(0).astype(np.int64,
                                                                   copy=False)
        elif str(dataframe[column].dtype).lower() == "boolean":
            dataframe[column] = dataframe[column].fillna(False).astype(
                np.bool, copy=False)
    return dataframe
Example #25
def get_dataset_batch(features: list, dataset_id: str, total_n_split: int,
                      split_n: int, sample: float):
    assert split_n < total_n_split, "split_n parameter should be less than total_n_split parameter"

    if sample < 1:
        with mp.Pool(16) as p:
            partial_create_features = functools.partial(
                get_feature_batch,
                dataset_id=dataset_id,
                total_n_split=total_n_split,
                split_n=split_n,
                sample=sample)
            dataframe = pd.concat(p.map(partial_create_features, features),
                                  axis=1)
    else:
        dataframe = pd.DataFrame()
        for feature_name in tqdm(features):
            if (feature_name, dataset_id) in FEATURES.keys():
                f = FEATURES[(feature_name, dataset_id)]
                df = np.array_split(f.load_or_create(), total_n_split)[split_n]
                if len(df.columns) == 1:
                    dataframe[feature_name] = df[f.feature_name]
                else:
                    if len(dataframe) > 0:
                        dataframe = pd.concat([dataframe, df], axis=1)
                    else:
                        dataframe = df
            else:
                raise Exception(f"Feature {feature_name} not found ")
        # dataframe = pd.concat([np.array_split(get_feature(feature_name, dataset_id),
        #                                       total_n_split)[split_n] for feature_name in features], axis=1)
    # Some columns are not in the format XGB expects, so the following block of code will cast them to the right format
    for column in dataframe.columns:
        if str(dataframe[column].dtype).lower()[:3] == "int":
            dataframe[column] = dataframe[column].fillna(0).astype(np.int64,
                                                                   copy=False)
        elif str(dataframe[column].dtype).lower() == "boolean":
            dataframe[column] = dataframe[column].fillna(False).astype(
                np.bool, copy=False)
    return dataframe
Example #26
def generateTracesWithNewBranchesChart(config, applicationId):
    getLogger().info(f"Generating the traces with new branches chart")

    config = KwolaCoreConfiguration(config)

    testingSteps = sorted(
        [step for step in TrainingManager.loadAllTestingSteps(config, applicationId=applicationId) if step.status == "completed"],
        key=lambda step: step.startTime, reverse=False)

    countTracesWithNewBranchesFutures = []

    pool = multiprocessing.Pool(config['chart_generation_dataload_workers'])

    for step in testingSteps:
        countTracesWithNewBranchesFutures.append(pool.apply_async(averageTracesWithNewBranchesForTestingStep, [config, step.id]))

    countTracesWithNewBranchesValues = [future.get() for future in countTracesWithNewBranchesFutures if future.get() is not None]

    if len(countTracesWithNewBranchesValues) > 0:
        fig, ax = plt.subplots()

        countTracesWithNewBranchesValues = scipy.signal.medfilt(countTracesWithNewBranchesValues, kernel_size=9)

        ax.plot(range(len(countTracesWithNewBranchesValues)), countTracesWithNewBranchesValues, color='green')

        ax.set_ylim(0, config['testing_sequence_length'])

        ax.set(xlabel='Testing Step #', ylabel='Traces with new branches',
               title='# of testing traces that have new branches')
        ax.grid()

        _, localFilePath = tempfile.mkstemp(suffix=".png")
        fig.savefig(localFilePath)
        with open(localFilePath, 'rb') as f:
            config.saveKwolaFileData("charts", "traces_with_new_branches.png", f.read())
        os.unlink(localFilePath)

        pool.close()
        pool.join()
Example #27
def poolHandle(zip, nid):
    if DEBUG_LEVEL == 0:
        p = Pool(80)
        for sub in zip.namelist():
            fobj = getSubFobj(zip, sub)
            if fobj is not None: p.apply_async(handleSub, args=(fobj, nid))
        p.close()
        p.join()
    elif DEBUG_LEVEL == 1:
        p = billiard.Pool()
        _finalizers.append(Finalize(p, p.terminate))
        try:
            # handleSub takes (fobj, nid), so unpack the tuples with starmap_async
            p.starmap_async(handleSub, [(getSubFobj(zip, sub), nid) for sub in zip.namelist()])
            p.close()
            p.join()
        finally:
            p.terminate()
    else:
        for sub in zip.namelist():
            fobj = getSubFobj(zip, sub)
            if fobj is not None: handleSub(fobj, nid)
    zip.close()
Example #28
    def generateVideoFilesForBugs(self, testingStep, bugObjects):
        pool = multiprocessing.Pool(
            self.config['testing_video_generation_processes'],
            maxtasksperchild=1)
        futures = []
        for bugIndex, bug in enumerate(bugObjects):
            future = pool.apply_async(func=createDebugVideoSubProcess,
                                      args=(self.config.serialize(),
                                            str(bug.executionSessionId),
                                            f"{bug.id}_bug", False, False,
                                            bug.stepNumber, bug.stepNumber + 3,
                                            "bugs"))
            futures.append((bugIndex, bug, future))

        for bugIndex, bug, future in futures:
            localFuture = future
            # for retry in range(5):
            # try:
            value = localFuture.get(
                timeout=self.config['debug_video_generation_timeout'])
            if value:
                getLogger().error(value)
            # break
            # except billiard.exceptions.WorkerLostError:
            #     if retry == 4:
            #         raise
            #     localFuture = pool.apply_async(func=createDebugVideoSubProcess, args=(
            #         self.config.serialize(), str(bug.executionSessionId), f"{bug.id}_bug", False, False, bug.stepNumber,
            #         bug.stepNumber + 3, "bugs"))
            # except BrokenPipeError:
            #     if retry == 4:
            #         raise
            #     localFuture = pool.apply_async(func=createDebugVideoSubProcess, args=(
            #         self.config.serialize(), str(bug.executionSessionId), f"{bug.id}_bug", False, False, bug.stepNumber,
            #         bug.stepNumber + 3, "bugs"))

        pool.close()
        pool.join()
Example #29
def deepspeech_run(video, ds_wav_path):
    msg = "\nTRANSCRIBING FILE : %s" % time.ctime()
    if DEBUG:
        start_timer = timer()
    lang = video.main_lang
    vad = VoiceActivityDetector(ds_wav_path)
    seg_list, sample_rate = vad.vad_segment_generator()
    res = []
    if seg_list:
        msg += "\n- Start Transcript Process : %s" % time.ctime()
        p = billiard.Pool(processes=NB_WORKERS_POOL,
                          initializer=initfunc,
                          initargs=(lang, ),
                          threads=True)
        res = p.map_async(deepspeech_aux, seg_list).get()
        p.close()
        p.join()
        msg += "\n- End Transcript Process : %s" % time.ctime()
    if DEBUG:
        end_timer = timer()
        print('Transcription duration : %f s' % (end_timer - start_timer))
    msg2, webvtt = createVTT(res, vad.sample_rate)
    msg += msg2
    msg += saveVTT(video, webvtt)
    return msg
Example #30
def describe(df, bins=10, check_correlation=True, correlation_threshold=0.9, correlation_overrides=None, check_recoded=False, pool_size=multiprocessing.cpu_count(), **kwargs):
    """Generates a dict containing summary statistics for a given dataset stored as a pandas `DataFrame`.

    Used as is, it will output its content as an HTML report in a Jupyter notebook.

    Parameters
    ----------
    df : DataFrame
        Data to be analyzed
    bins : int
        Number of bins in histogram.
        The default is 10.
    check_correlation : boolean
        Whether or not to check correlation.
        It's `True` by default.
    correlation_threshold: float
        Threshold to determine if the variable pair is correlated.
        The default is 0.9.
    correlation_overrides : list
        Variable names not to be rejected because they are correlated.
        There is no variable in the list (`None`) by default.
    check_recoded : boolean
        Whether or not to check recoded correlation (memory heavy feature).
        Since it's an expensive computation it can be activated for small datasets.
        `check_correlation` must be true for this check to run.
        It's `False` by default.
    pool_size : int
        Number of workers in thread pool
        The default is equal to the number of CPU.

    Returns
    -------
    dict
        Containing the following keys:
            * table: general statistics on the dataset
            * variables: summary statistics for each variable
            * freq: frequency table

    Notes:
    ------
        * The section dedicated to check the correlation should be externalized
    """

    if not isinstance(df, pd.DataFrame):
        raise TypeError("df must be of type pandas.DataFrame")
    if df.empty:
        raise ValueError("df can not be empty")

    try:
        # reset matplotlib style before use
        # Fails in matplotlib 1.4.x so plot might look bad
        matplotlib.style.use("default")
    except:
        pass

    matplotlib.style.use(resource_filename(__name__, "pandas_profiling.mplstyle"))

    # Clearing the cache before computing stats
    base.clear_cache()

    if not pd.Index(np.arange(0, len(df))).equals(df.index):
        # Treat index as any other column
        df = df.reset_index()

    kwargs.update({'bins': bins})
    # Describe all variables in a univariate way
    if pool_size == 1:
        local_multiprocess_func = partial(multiprocess_func, **kwargs)
        ldesc = {col: s for col, s in map(local_multiprocess_func, df.iteritems())}
    else:
        pool = multiprocessing.Pool(pool_size)
        local_multiprocess_func = partial(multiprocess_func, **kwargs)
        ldesc = {col: s for col, s in pool.map(local_multiprocess_func, df.iteritems())}
        pool.close()

    # Get correlations
    dfcorrPear = df.corr(method="pearson")
    dfcorrSpear = df.corr(method="spearman")

    # Check correlations between variable
    if check_correlation is True:
        ''' TODO: corr(x,y) > 0.9 and corr(y,z) > 0.9 does not imply corr(x,z) > 0.9
        If x~y and y~z but not x~z, it would be better to delete only y
        Better way would be to find out which variable causes the highest increase in multicollinearity.
        '''
        corr = dfcorrPear.copy()
        for x, corr_x in corr.iterrows():
            if correlation_overrides and x in correlation_overrides:
                continue

            for y, corr in corr_x.iteritems():
                if x == y: break

                if corr > correlation_threshold:
                    ldesc[x] = pd.Series(['CORR', y, corr], index=['type', 'correlation_var', 'correlation'])

        if check_recoded:
            categorical_variables = [(name, data) for (name, data) in df.iteritems() if base.get_vartype(data)=='CAT']
            for (name1, data1), (name2, data2) in itertools.combinations(categorical_variables, 2):
                if correlation_overrides and name1 in correlation_overrides:
                    continue

                confusion_matrix=pd.crosstab(data1,data2)
                if confusion_matrix.values.diagonal().sum() == len(df):
                    ldesc[name1] = pd.Series(['RECODED', name2], index=['type', 'correlation_var'])

    # Convert ldesc to a DataFrame
    names = []
    ldesc_indexes = sorted([x.index for x in ldesc.values()], key=len)
    for idxnames in ldesc_indexes:
        for name in idxnames:
            if name not in names:
                names.append(name)
    variable_stats = pd.concat(ldesc, join_axes=pd.Index([names]), axis=1)
    variable_stats.columns.names = df.columns.names

    # General statistics
    table_stats = {}

    table_stats['n'] = len(df)
    table_stats['nvar'] = len(df.columns)
    table_stats['total_missing'] = variable_stats.loc['n_missing'].sum() / (table_stats['n'] * table_stats['nvar'])
    unsupported_columns = variable_stats.transpose()[variable_stats.transpose().type != base.S_TYPE_UNSUPPORTED].index.tolist()
    table_stats['n_duplicates'] = sum(df.duplicated(subset=unsupported_columns)) if len(unsupported_columns) > 0 else 0

    memsize = df.memory_usage(index=True).sum()
    table_stats['memsize'] = formatters.fmt_bytesize(memsize)
    table_stats['recordsize'] = formatters.fmt_bytesize(memsize / table_stats['n'])

    table_stats.update({k: 0 for k in ("NUM", "DATE", "CONST", "CAT", "UNIQUE", "CORR", "RECODED", "BOOL", "UNSUPPORTED")})
    table_stats.update(dict(variable_stats.loc['type'].value_counts()))
    table_stats['REJECTED'] = table_stats['CONST'] + table_stats['CORR'] + table_stats['RECODED']

    return {
        'table': table_stats,
        'variables': variable_stats.T,
        'freq': {k: (base.get_groupby_statistic(df[k])[0] if variable_stats[k].type != base.S_TYPE_UNSUPPORTED else None) for k in df.columns},
        'correlations': {'pearson': dfcorrPear, 'spearman': dfcorrSpear}
    }
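A short usage sketch for the function above, assuming its pandas_profiling-style helpers (base, formatters, multiprocess_func) are importable alongside it; the data is illustrative only.

import pandas as pd

df = pd.DataFrame({
    'x': [1, 2, 3, 4, 5],
    'y': [2.0, 4.1, 5.9, 8.2, 9.8],
})

# pool_size=1 keeps everything in-process, which is easier to debug.
report = describe(df, bins=5, check_correlation=True, pool_size=1)

print(report['table']['n'])              # number of rows
print(report['variables'].head())        # per-variable summary statistics
print(report['correlations']['pearson'])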