def _execute_batch(self, batch: List[UpdateFromGitHubAPI]):
    """Dispatch one batch of update commands to Celery and route the results.

    Each command in *batch* is queued as a ``run_command`` task; the group
    result is then split by outcome type and handed to the matching handler.
    """
    signatures = [run_command.s(cmd) for cmd in batch]
    outcomes = group(signatures).apply_async().get(interval=1)

    updated = []
    missing = []
    for outcome in outcomes:
        if isinstance(outcome, RepositoryUpdated):
            updated.append(outcome)
        if isinstance(outcome, RepositoryNotFound):
            missing.append(outcome)

    # This should eventually be moved to an event listener
    log.info('Updating: {}, Missing: {}'.format(len(updated), len(missing)))
    self._handle_updates(updated)
    self._handle_missing(missing)
def handle(self, cmd: CalculateImageComplexityScores) \
        -> ImageComplexityScoresCalculated:
    """Fan image-complexity jobs out to Celery in fixed-size batches.

    Filenames matching ``cmd.pattern`` under ``cmd.path`` are scored 100 at
    a time; all results are collected and written to ``cmd.destination``.
    """
    filenames = self._get_filenames(cmd.path, cmd.pattern)
    batch_size = 100
    log.info("Processing {} images".format(len(filenames)))

    scores = []
    for chunk in batched(filenames, batch_size):
        # One CalculateImageComplexityScore command per filename in the chunk.
        jobs = (run_command.s(CalculateImageComplexityScore(name))
                for name in chunk)
        pending = group(jobs).apply_async()
        scores.extend(pending.get(interval=1))

    self._to_csv(cmd.destination, scores)
    return ImageComplexityScoresCalculated(cmd.destination)
def handle(self, cmd: ExecuteMahoutRecommender):
    """Run the Mahout recommender for a model and reload its results.

    Invokes the external Maven/Mahout job, then replaces the model's rows
    in the ``Recommendation`` table with the freshly exported CSV.

    :raises RuntimeError: if the Mahout process exits with a non-zero
        status — stale results must never be imported, and existing rows
        must never be deleted after a failed run.
    """
    model = MODELS.get(cmd.model)
    source = abspath(join(RATINGS_PATH, model.source))
    destination = abspath(join(EXPORT_PATH, model.destination))

    log.info('Running Mahout')
    run = ["mvn", "exec:java",
           "-DbatchSize=100",
           "-DmodelID={}".format(model.id),
           "-Dsrc=" + source,
           "-Dout=" + destination]
    # BUG FIX: the exit status was previously ignored, so a failed run
    # would still wipe this model's recommendations below and import a
    # stale (or missing) export. Abort before touching the database.
    status = subprocess.call(run, cwd="../growser-mahout/")
    if status != 0:
        raise RuntimeError("Mahout exited with status {}".format(status))

    # Replace this model's recommendations wholesale.
    Recommendation.query.filter(
        Recommendation.model_id == model.id).delete()

    columns = ['model_id', 'repo_id', 'recommended_repo_id', 'score']
    batch = from_sqlalchemy_table(
        Recommendation.__table__, from_csv(destination), columns)
    for rows in batch.batch_execute(db.engine.raw_connection):
        log.info("Batch complete: {}".format(rows))
    return RecommendationsUpdated(model.id, batch)
def run_recommendations(ratings: str, output: str, num_repos: int):
    """Build the repo co-occurrence matrix and run both similarity scorers.

    Loads the ratings matrix from *ratings*, forms A'A once, then generates
    log-likelihood and Jaccard recommendations from the shared matrix.
    """
    matrix = fetch_ratings(ratings, num_repos)

    log.info("Creating co-occurrence matrix (A'A)")
    cooccurrence = matrix.dot(matrix.T)

    # (model id, scorer, log label, export name) for each similarity model.
    scorers = (
        (4, score_llr, "Log-likelihood similarity",
         'co-occurrence.log-likelihood'),
        (6, score_jaccard, "Jaccard similarity",
         'co-occurrence.jaccard'),
    )
    for model_id, scorer, message, name in scorers:
        log.info(message)
        _recommendations(model_id, matrix.shape[1], matrix.index,
                         cooccurrence, scorer, name)
def fetch_ratings(filename: str, num_repos: int):
    """Load a ratings CSV and pivot it into a binary repo-by-user matrix.

    Keeps a random sample of ``MAX_LOGINS`` users and the *num_repos* most
    rated repositories among them, then returns a DataFrame indexed by
    ``repo_id`` with one column per ``login_id`` (1 = rated, 0 = not).
    """
    log.info("Loading %s", filename)
    column_names = ['login_id', 'repo_id', 'rating', 'date']
    ratings = pd.read_csv(filename, header=None, names=column_names)
    ratings['value'] = 1

    log.info("Filtering ratings")
    # NOTE(review): `.sample` draws uniformly, so the preceding sort does
    # not bias which users are kept — presumably intentional; confirm.
    counts_per_user = ratings.groupby('login_id')['repo_id'].count()
    top_users = counts_per_user.sort_values(ascending=False) \
        .sample(MAX_LOGINS)

    rated_by_top_users = ratings[ratings['login_id'].isin(top_users.index)]
    top_repos = rated_by_top_users.groupby('repo_id')['login_id'].count() \
        .sort_values(ascending=False)[:num_repos]

    keep = (ratings['login_id'].isin(top_users.index) &
            ratings['repo_id'].isin(top_repos.index))
    filtered = ratings[keep]

    log.info("Creating user/repo matrix")
    matrix = filtered.pivot(index='repo_id', columns='login_id',
                            values='value').fillna(0)
    return matrix
def run_command(command):
    """Execute *command* on the application's command bus and return the result."""
    log.info("Executing command: {}".format(command))
    bus = commands(app)
    result = bus.execute(command)
    return result