def get_samples():
    """Return repo names grouped into a 'high' and a 'low' scoring class.

    Loads every repo via ``Repo.load()``, prints progress counts to stdout
    and narrows the set in three passes:

    1. noise filter: size strictly between 0 and 30720, more than one star,
       not a fork, and no 'dotfile' / 'sublime' in the lower-cased name;
    2. age filter: created more than 30 days before the latest fetch date;
    3. popularity filter: more than 5 stars and ``classes.score(repo)``
       above 1/30.

    The survivors are sorted by ascending score.

    Returns:
        A dict ``{'high': [names], 'low': [names]}`` holding the names of
        the (up to) 1000 highest-scoring and (up to) 1000 lowest-scoring
        repos, each list in ascending score order.  When fewer than 2000
        repos survive the filters the two lists overlap.

    Raises:
        ValueError: if ``Repo.load()`` yields no repos, because ``max()``
            is then applied to an empty list.
    """
    max_size = 30720  # upper bound on r.size; not foolproof to avoid big repos
    min_age = datetime.timedelta(30)  # 30 days
    # Float literal on purpose: under Python 2 integer division ``1 / 30``
    # evaluates to 0, which silently reduced the threshold to "score > 0".
    min_score = 1.0 / 30
    sample_size = 1000  # top 1k, bottom 1k

    repos = Repo.load()
    fetch_dates = [datetime.datetime(*r.fetch_ymd) for r in repos]

    # Single-argument parenthesised prints emit the same text under the
    # Python 2 print statement and the Python 3 print function.
    print('number of repos: %s' % len(repos))
    latest_fetch = max(fetch_dates)
    print('fetched between %s and %s' % (min(fetch_dates), latest_fetch))
    print('')

    filtered = [
        r for r in repos
        if 0 < r.size < max_size
        and r.stars > 1
        and not r.fork
        and 'dotfile' not in r.name.lower()
        and 'sublime' not in r.name.lower()  # avoid SublimeText config
    ]
    print('after noise filter: %s' % len(filtered))

    filtered = [
        r for r in filtered
        if (latest_fetch - r.creation_date) > min_age
    ]
    print('exluding very new: %s' % len(filtered))

    # Score each sufficiently-starred repo exactly once and keep the score
    # next to the repo, so it is not recomputed for the sort below.
    score_pairs = [(classes.score(r), r) for r in filtered if r.stars > 5]
    score_pairs = [pair for pair in score_pairs if pair[0] > min_score]
    print('exluding very unpopular: %s' % len(score_pairs))

    # Sort on the score only; the sort is stable, so ties keep their
    # original relative order and repo objects are never compared.
    score_pairs.sort(key=lambda pair: pair[0])

    return {
        'high': [r.name for (_, r) in score_pairs[-sample_size:]],
        'low': [r.name for (_, r) in score_pairs[:sample_size]],
    }