def test() -> None:
    from my.coding.commits import commits

    all_commits = list(commits())
    assert len(all_commits) > 100

    buckets = bucket(all_commits, key=lambda c: c.repo)
    by_repo = {k: list(buckets[k]) for k in buckets}
def create_partition_buffers(stream):
    bucketed_stream = more_itertools.bucket(stream, key=attrgetter("partition"))
    partition_buffers: Dict[int, Iterator[StreamEvent]] = {
        p: more_itertools.peekable(iter(bucketed_stream[p]))
        for p in range(partition_count)
    }
    global_event_buffer = bucketed_stream[StreamEvent.ALL_PARTITIONS]
    return partition_buffers, global_event_buffer
def test() -> None:
    from my.youtube.takeout import watched, Watched

    videos = [w for w in watched() if not isinstance(w, Exception)]
    assert len(videos) > 1000

    # results in nicer errors, otherwise annoying to check against thousands of videos
    grouped = bucket(videos, key=lambda w: (w.url, w.title))

    w1 = Watched(
        url='https://www.youtube.com/watch?v=hTGJfRPLe08',
        title='Jamie xx - Gosh',
        when=pytz.timezone('Europe/London').localize(
            datetime(year=2018, month=6, day=21, hour=6, minute=48, second=34)),
    )
    assert w1 in list(grouped[(w1.url, w1.title)])

    w2 = Watched(
        url='https://www.youtube.com/watch?v=IZ_8b_Ydsv0',
        title='Why LESS Sensitive Tests Might Be Better',
        when=pytz.utc.localize(
            datetime(year=2021, month=1, day=15, hour=17, minute=54, second=12)),
    )
    assert w2 in list(grouped[(w2.url, w2.title)])
def select_results(version, url_list):
    """Prepare the processing of test results for documents whose tests have all been completed.

    Fetches the results from the results collection and the required values
    from the documents produced by the scraping. The final test results are
    selected from the testers' majority choices.
    """
    dbfinder = mongo.MongoLoad(
        {'img_url': {'$in': url_list}, 'search_version': version},
        {'img_url': 1, 'locations_selected': 1, 'sufficient': 1, '_id': 0})
    group_results = bucket(dbfinder.retrieve('Resultats_Test_Expert_1'),
                           key=lambda x: x['img_url'])

    dbfinder.reinit(
        {'img_url': {'$in': url_list}, 'search_version': version},
        {'search_version': 1, 'country': 1, 'img_url': 1,
         'tag_list': 1, 'location_list': 1, '_id': 0})

    final_results = []
    for doc in dbfinder.retrieve('Resultats_RGN'):
        # Materialize the iterator since it is traversed twice below.
        result = list(group_results[doc['img_url']])
        doc['locations_selected'] = [
            comp.count(True) > len(comp) / 2
            for comp in zip(*[res['locations_selected'] for res in result])
        ]
        doc['sufficient'] = sum(1 if b else -1 for b in [res['sufficient'] for res in result]) > 0
        doc['processed'] = False
        final_results.append(doc)

    return final_results
def radix_sort(A, max_digits):
    """More elegant, but far too slow due to the call to bucket()."""
    radix_keys = list(map(str, range(10)))
    B = [str(a_i).zfill(max_digits) for a_i in A]
    for digit_index in range(-1, -max_digits - 1, -1):
        buckets = bucket(B, key=lambda b_i: b_i[digit_index])
        B = []
        for j in radix_keys:
            B.extend(buckets[j])
    return [int(b_i) for b_i in B]
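
# A quick sanity check for radix_sort above -- hypothetical input, assuming
# `bucket` is imported from more_itertools and all values are non-negative
# integers with at most `max_digits` digits:
from more_itertools import bucket

values = [170, 45, 75, 90, 802, 24, 2, 66]
assert radix_sort(values, max_digits=3) == sorted(values)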
def research():
    flask.current_app.logger.info('Serving RESEARCH')
    page = pages.get_or_404('research')
    articles = mitt.bucket(sorted(topics, key=lambda t: t.path),
                           key=lambda a: a.meta['group'])
    refs = get_refs()
    return flask.render_template('research.html',
                                 active='research',
                                 page=page,
                                 groups=groups,
                                 articles=articles,
                                 refs=refs)
def by_night() -> Dict[date, Emfit]:
    res: Dict[date, Emfit] = {}
    # TODO shit. I need some sort of interrupted sleep detection?
    grouped = bucket(get_datas(), key=lambda s: s.date)
    for dd in grouped:
        sleeps = list(grouped[dd])
        if len(sleeps) > 1:
            logger.warning("multiple sleeps per night, not handled yet: %s", sleeps)
            continue
        [s] = sleeps
        res[s.date] = s
    return res
def bucket_merge(
    iterable: Iterable[T],
    sort_key: Callable[[T], Any],
    bucket_key: Callable[[T], U],
    buckets: Iterable[U],
) -> Iterator[T]:
    """Sort a partially sorted iterable lazily.

    If the iterable can be split into individually sorted buckets
    then this function will sort it.
    """
    buckets_ = set(buckets)
    iterables = more_itertools.bucket(iterable, bucket_key, lambda x: x in buckets_)
    yield from imerge((iterables[bucket] for bucket in buckets_), key=sort_key)
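
# A usage sketch for bucket_merge above. `imerge` is not shown in the snippet;
# this stand-in assumes it is a k-way merge of individually sorted iterators,
# which heapq.merge provides.
import heapq

def imerge(iterables, key):
    return heapq.merge(*iterables, key=key)

# Evens and odds are each internally sorted, so the stream sorts lazily.
data = [1, 0, 3, 2, 5, 4]
merged = bucket_merge(data, sort_key=lambda x: x, bucket_key=lambda x: x % 2, buckets=[0, 1])
assert list(merged) == [0, 1, 2, 3, 4, 5]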
def CompleteSystem(S, context):
    """
    Algorithm C1, p. 385

    >>> tvars = var("x y z")
    >>> w = function("w")(*tvars)
    >>> # these DPs are constructed from C1, pp 384
    >>> h1 = diff(w, x,x,x, y,y,z,z)
    >>> h2 = diff(w, x,x,x, z,z,z)
    >>> h3 = diff(w, x, y, z,z,z)
    >>> h4 = diff(w, x, y)
    >>> ctx = Context((w,), (x,y,z), Mgrlex)
    >>> dps = [_Differential_Polynomial(_, ctx) for _ in [h1, h2, h3, h4]]
    >>> cs = CompleteSystem(dps, ctx)
    >>> # things are sorted up
    >>> for _ in cs: _.show()
    diff(w(x, y, z), x, y)
    diff(w(x, y, z), x, y, z)
    diff(w(x, y, z), x, x, y)
    diff(w(x, y, z), x, y, z, z)
    diff(w(x, y, z), x, x, y, z)
    diff(w(x, y, z), x, x, x, y)
    diff(w(x, y, z), x, y, z, z, z)
    diff(w(x, y, z), x, x, y, z, z)
    diff(w(x, y, z), x, x, x, y, z)
    diff(w(x, y, z), x, x, x, y, y)
    diff(w(x, y, z), x, x, y, z, z, z)
    diff(w(x, y, z), x, x, x, z, z, z)
    diff(w(x, y, z), x, x, x, y, z, z)
    diff(w(x, y, z), x, x, x, y, y, z)
    diff(w(x, y, z), x, x, x, y, z, z, z)
    diff(w(x, y, z), x, x, x, y, y, z, z)

    >>> # example from Schwarz, pp 54
    >>> w = function("w")(x,y)
    >>> z = function("z")(x,y)
    >>> g1 = diff(z,y,y) + diff(z,y)/(2*y)
    >>> g5 = diff(z,x,x,x) + diff(w,y,y)*8*y**2 + diff(w,x,x)/y - diff(z,x,y)*4*y**2 - diff(z,x)*32*y - 16*w
    >>> g6 = diff(z,x,x,y) - diff(z,y,y)*4*y**2 - diff(z,y)*8*y
    >>> ctx = Context((w,z), (x,y), Mgrlex)
    >>> dps = [_Differential_Polynomial(_, ctx) for _ in [g1, g5, g6]]
    >>> cs = CompleteSystem(dps, ctx)
    >>> for _ in cs: print(_)
    diff(z(x, y), y, y) + (1/2/y) * diff(z(x, y), y)
    diff(z(x, y), x, y, y) + (1/2/y) * diff(z(x, y), x, y)
    diff(z(x, y), x, x, y) + (-4*y^2) * diff(z(x, y), y, y) + (-8*y) * diff(z(x, y), y)
    diff(z(x, y), x, x, x) + (1/y) * diff(w(x, y), x, x) + (8*y^2) * diff(w(x, y), y, y) + (-4*y^2) * diff(z(x, y), x, y) + (-32*y) * diff(z(x, y), x) + (-16) * w(x, y)
    """
    s = bucket(S, key=lambda d: d.Lfunc())
    res = flatten([complete(s[k], context) for k in s])
    return Reorder(res, context, ascending=True)
def get_spot(self, t: float) -> float:
    if not self.spots:
        raise RuntimeError("Cannot interpolate without spot rates")
    elif t in self.spots:
        return self.spots[t]
    else:
        groups = more_itertools.bucket(self.spots.keys(), key=lambda x: x > t)
        L = max(groups[False], default=None)  # closest key on the left
        R = min(groups[True], default=None)   # closest key on the right
        if L is None:  # if none are smaller
            return self.spots[R]  # flat interpolation
        elif R is None:  # if none are bigger
            return self.spots[L]  # flat interpolation
        else:
            m = (self.spots[R] - self.spots[L]) / (R - L)  # get slope
            return m * (t - L) + self.spots[L]  # linear interpolation
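
# The host class isn't shown in the snippet above; this hypothetical `Curve`
# just carries the `spots` dict (maturity -> rate) the method expects.
import more_itertools

class Curve:
    def __init__(self, spots):
        self.spots = spots

    get_spot = get_spot  # bind the standalone function above as a method

curve = Curve({1.0: 0.02, 2.0: 0.04})
assert abs(curve.get_spot(1.5) - 0.03) < 1e-12  # linear interpolation
assert curve.get_spot(0.5) == 0.02              # flat extrapolation on the left
assert curve.get_spot(3.0) == 0.04              # flat extrapolation on the right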
async def handle_initial(self, filename):
    """Handles reading the special 'initial' file.

    The file contains org unit data, as well as data on the associated details.
    The initial org unit file contains historic data, so a minimal set of
    create/edit payloads are created accordingly.
    """
    org_units = los_files.read_csv(filename, OrgUnit)
    await self.handle_addresses(org_units, filename)

    unit_payloads = self.create_unit_payloads(org_units)
    detail_payloads = await self.create_detail_payloads(org_units)
    payloads = list(unit_payloads) + list(detail_payloads)

    # Bucket all payloads referring to the same object
    uuid_buckets = bucket(payloads, key=lambda payload: payload["uuid"])
    sorted_buckets = map(
        lambda uuid_key: sorted(uuid_buckets[uuid_key], key=lambda x: x["validity"]["from"]),
        uuid_buckets,
    )

    consolidated_buckets = list(map(self.consolidate_payloads, sorted_buckets))
    split_lists = map(lambda x: (x[0], x[1:]), consolidated_buckets)
    heads, tails = unzip(split_lists)

    # OS2mo reads an object before performing an edit to it, so we need to ensure
    # that we don't perform multiple edits to an object in parallel, which could
    # cause one edit to be overwritten by another.
    # We create layers containing at most one edit request for each org unit UUID,
    # and execute the layers sequentially, while allowing the importer to submit
    # the individual requests in a layer in parallel.
    edit_payloads = map(partial(map, mo_payloads.convert_create_to_edit), tails)
    edit_layers = zip_longest(*edit_payloads)
    edit_layers_filtered = map(partial(filter, None.__ne__), edit_layers)

    async with util.get_client_session() as session:
        await util.create_details(session, heads)
        for edit_layer in edit_layers_filtered:
            await util.edit_details(session, edit_layer)
def get_kle(org_unit_uuid: str, mh: MoraHelper) -> Tuple[List[str], List[str]]:
    present = mh._mo_lookup(org_unit_uuid, "ou/{}/details/kle?validity=present")
    future = mh._mo_lookup(org_unit_uuid, "ou/{}/details/kle?validity=future")
    kles = present + future

    def get_kle_tuples(
        kles: List[dict],
    ) -> Generator[Tuple[str, str], None, None]:
        for kle in kles:
            number = kle["kle_number"]["user_key"]
            for aspect in kle["kle_aspect"]:
                yield number, aspect["scope"]

    kle_tuples = get_kle_tuples(kles)
    buckets = bucket(kle_tuples, key=itemgetter(1))

    interest = map(itemgetter(0), buckets["INDSIGT"])
    performing = map(itemgetter(0), buckets["UDFOERENDE"])

    return list(interest), list(performing)
def _get_forms(
    self,
) -> (forms.AccountUpdateForm, forms.PasswordChangeForm, forms.TOTPCheckForm):
    """Bind forms appropriately for method."""
    request = self.request
    # TODO: switch to normal attribute access after this is fixed
    # https://youtrack.jetbrains.com/issue/PY-37457
    post_data: QueryDict = getattr(request, "POST")

    # Bucket into new QueryDicts based on prefix. Must use MultiValueDict.update
    # to enforce list containers for values.
    buckets = bucket(post_data.items(), lambda pair: pair[0].partition("-")[0])
    account_update = QueryDict(mutable=True)
    account_update.update(dict(buckets[self.ACCOUNT_FORM_PREFIX]))
    password_change = QueryDict(mutable=True)
    password_change.update(dict(buckets[self.PASSWORD_FORM_PREFIX]))
    otp_check = QueryDict(mutable=True)
    otp_check.update(dict(buckets[self.OTP_FORM_PREFIX]))

    # When data is set to None, the form will not bind.
    return (
        self.update_account_form_class(
            instance=request.user,
            data=account_update or None,
            prefix=self.ACCOUNT_FORM_PREFIX,
        ),
        self.change_password_form_class(
            request.user,
            data=password_change or None,
            prefix=self.PASSWORD_FORM_PREFIX,
        ),
        self.otp_check_form_class(
            request=request,
            data=otp_check or None,
            prefix=self.OTP_FORM_PREFIX,
        ),
    )
def create_employee_payloads(self, persons):
    cpr_buckets = bucket(persons, key=lambda x: x.cpr)
    # Every person row contains the same info, so we just pick one
    unique_persons = map(lambda key: first(cpr_buckets[key]), cpr_buckets)
    return map(self.generate_employee_payload, unique_persons)
def get_items(self) -> Mirror.Results:
    from my import zotero

    errors = []
    good = []
    for a in zotero.annotations():
        if isinstance(a, Exception):
            errors.append(a)
        else:
            good.append(a)
    for e in errors:
        yield error(e)

    groups = bucket(good, key=lambda a: a.item)
    for item in groups:
        file_annotations = groups[item]

        def chit():
            for a in file_annotations:
                parts = []
                text = a.text
                if text is not None:
                    # todo not sure about it here... maybe should rely on softwrap in emacs instead?
                    text = '\n'.join(wrap(text, width=config.MAX_LINE_WIDTH))
                    text = literal(text)
                    parts.append(text)
                comment = a.comment
                if comment is not None:
                    parts.append(comment)

                page1 = a.page + 1  # NOTE: zotero uses 0-indexing, pdfview uses 1-indexing
                body = '\n'.join(parts)
                color = a.color_human
                tags = list(a.tags)
                # todo not sure which is best?
                tags.append(color)
                properties = {
                    'ZOTERO_COLOR': color,
                }
                if len(a.tags) > 0:
                    # zotero tags can be multi-word? guess worth adding just in case
                    properties['ZOTERO_TAGS'] = ', '.join(a.tags)  # not sure what's the best separator...
                # todo not sure about it...
                mtodo: Optional[str] = None
                if 'todo' in {t.lower() for t in tags}:
                    mtodo = 'TODO'
                heading = docview_link(path=item.file, title=f'page {page1}', page1=page1)
                if comment is not None:
                    # try to display first few words?
                    cline = wrap(comment, width=config.MAX_LINE_WIDTH)[0]
                    heading = heading + ' ' + cline
                # todo would be nice to align tags, maybe...
                yield node(
                    todo=mtodo,
                    heading=dt_heading(
                        a.added,
                        heading,
                    ),
                    tags=tags,
                    properties=properties,
                    body=body,
                )

        body = ''
        if url := item.url:
            body = url
        yield node(
            heading=docview_link(path=item.file, title=item.title),
            tags=item.tags,
            body=body,
            children=list(chit()),
        )
def compute_hal_or_glove_co_occurrences(
    stream: Iterable[Tuple[str, Iterable[str]]],
    *,
    document_index: DocumentIndex,
    token2id: Mapping[str, int],
    window_size: int,
    distance_metric: int,  # 0, 1, 2
    normalize: str = 'size',
    method: str = 'HAL',
    zero_diagonal: bool = True,
    direction_sensitive: bool = False,
    partition_column: str = 'year',
):
    """Computes co-occurrence as specified by either `Glove` or `Hyperspace Analogue to Language` (HAL)

    NOTE:
    - The passed document index MUST be in the same sequence as the passed sequence of tokens

    Parameters
    ----------
    stream : Iterable[Tuple[str, Iterable[str]]]
        Sequence of (filename, tokens) tuples
    document_index : DocumentIndex
        Document catalogue
    window_size : int
        [description]
    distance_metric : int
        [description]
    normalize : str, optional
        [description], by default 'size'
    method : str, optional
        [description], by default 'HAL'
    zero_diagonal : bool, optional
        [description], by default True
    direction_sensitive : bool, optional
        [description], by default False

    Returns
    -------
    [type]
        [description]
    """
    # if issubclass(type(corpus), CorpusABC):
    #     doc_terms = [[t.lower().strip('_') for t in terms if len(t) > 2] for terms in corpus.get_texts()]

    if document_index is None:
        raise CoOccurrenceError("expected document index found None")

    if partition_column not in document_index.columns:
        raise CoOccurrenceError(f"expected `{partition_column}` not found in document index")

    if token2id is None:
        raise CoOccurrenceError("expected `token2id` found None")

    # token2id = generate_token2id(doc_terms)

    def get_bucket_key(item: Tuple[str, Iterable[str]]) -> int:
        if not isinstance(item, tuple):
            raise CoOccurrenceError(f"expected stream of (name, tokens) tuples found {type(item)}")

        filename = item[0]
        if not isinstance(filename, str):
            raise CoOccurrenceError(f"expected filename (str) found {type(filename)}")

        return int(document_index.loc[filename][partition_column])

    total_results = []
    key_streams = more_itertools.bucket(stream, key=get_bucket_key, validator=None)
    keys = sorted(list(key_streams))

    metadata = []
    for i, key in tqdm(enumerate(keys), position=0, leave=True):
        key_stream: FilenameTokensTuple = key_streams[key]
        keyed_document_index = document_index[document_index[partition_column] == key]

        metadata.append(
            dict(
                document_id=i,
                filename='year_{year}.txt',
                document_name='year_{year}',
                year=key,
                n_docs=len(keyed_document_index),
            ))

        logger.info(f'Processing {key}...')

        tokens_stream = (tokens for _, tokens in key_stream)

        vectorizer = (
            HyperspaceAnalogueToLanguageVectorizer(token2id=token2id).fit(
                tokens_stream, size=window_size, distance_metric=distance_metric)
            if method == "HAL"
            else GloveVectorizer(token2id=token2id).fit(tokens_stream, size=window_size))

        co_occurrence = vectorizer.to_dataframe(
            normalize=normalize,
            zero_diagonal=zero_diagonal,
            direction_sensitive=direction_sensitive)

        co_occurrence[partition_column] = key

        total_results.append(
            co_occurrence[['year', 'x_term', 'y_term', 'nw_xy', 'nw_x', 'nw_y', 'cwr']],
        )

        # if i == 5: break

    co_occurrences = pd.concat(total_results, ignore_index=True)
    co_occurrences['cwr'] = co_occurrences.cwr / np.max(co_occurrences.cwr, axis=0)

    return co_occurrences
        [[3, 64, 96, 92], [64, 96, 19, 128, 0, 64, 0, 0, 0, 71, 38, 2, 250, 0, 160, 0, 27, 242, 118, 0, 0]],
        [[3, 64, 97, 91], [64, 97, 18, 128, 0, 78, 1, 10, 1, 11, 1, 17, 1, 18, 2, 4, 1, 0, 0, 59]],
        [[3, 64, 98, 90], [64, 98, 19, 128, 0, 128, 94, 1, 210, 0, 0, 1, 253, 255, 0, 100, 0, 0, 0, 0, 184]],
        [[3, 64, 99, 89], [64, 99, 10, 128, 0, 1, 112, 100, 50, 21, 1, 181]],
        [[3, 64, 100, 88], [64, 100, 14, 128, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 203]],
        [[3, 64, 97, 91], [64, 97, 18, 128, 0, 79, 1, 10, 1, 11, 1, 17, 1, 18, 2, 4, 1, 0, 0, 58]],
    ]
]

three_dollars = b'$$$'

# Some commands have multiple responses, in which case we might receive any one of them.
command_to_responses = bucket(
    simulated_command_responses,
    key=lambda command_and_response: command_and_response[0])
command_to_responses = {
    k: [list(response[1]) for response in command_to_responses[k]]
    for k in command_to_responses
}


class DaikinSimulator:
    """
    Simulates a Daikin Altherma's responses to serial commands.

    Used by tests.
    """
    response_buffer = b''

    def write(self, command):
        """
        Write a simulated command.
        """
async def bulk_ensure(
    ctx,
    dry_run: bool,
    filename: str,
):
    """Ensure the entries in the json file exist in MOX.

    Currently only bulk loads classes.
    """
    mox_helper = await create_mox_helper(ctx.obj["mox.base"])

    # Load file and fetch
    with open(filename) as json_file:
        data = json.load(json_file)

    # Construct classes by applying __apply_to_all__ to all elements within a
    # single block of classes and flattening the structure to be a simple
    # list of classes
    def construct_entry(bvn, item, apply_to_all):
        return {**item, **apply_to_all, "bvn": bvn}

    def construct_block(block):
        apply_to_all = block.pop("__apply_to_all__", {})
        classes = map(lambda entry: construct_entry(*entry, apply_to_all), block.items())
        return classes

    facets = []
    if "facets" in data:
        facets = flatten(map(construct_block, data["facets"]))
        facets = list(facets)

    classes = []
    if "classes" in data:
        classes = flatten(map(construct_block, data["classes"]))
        classes = list(classes)

    # Fetch default organisation
    org_uuid = None
    org_uuid = org_uuid or await mox_helper.read_element_organisation_organisation(bvn="%")

    def enrich_with_org_unit(entry):
        entry["org_uuid"] = org_uuid
        return entry

    # Enrich facets with default organisation
    facets = map(enrich_with_org_unit, facets)
    # Translate facet json to lora_facet
    facets = map(lambda facet: lora_facet(**facet), facets)
    # Prepare to output
    facets = list(facets)

    # Print for dry run
    if dry_run:
        for facet in facets:
            mox_helper.validate_klassifikation_facet(facet)
            message = json.dumps(facet, indent=4, sort_keys=True)
            click.secho(message, fg="green")
        return

    # POST for non-dry runs
    tasks = list(map(mox_helper.get_or_create_klassifikation_facet, facets))
    results = await asyncio.gather(*tasks)
    for uuid, created in results:
        print_created(uuid, created)

    # Find all unique facet bvns used by the classes, and translate to UUIDs
    required_facets = set(map(itemgetter("facet"), classes))

    async def construct_facet_bvn_to_uuid_map(facet_bvns):
        async def create_bvn_to_uuid_tuple(facet_bvn):
            return (
                facet_bvn,
                await mox_helper.read_element_klassifikation_facet(bvn=facet_bvn),
            )

        tasks = list(map(create_bvn_to_uuid_tuple, facet_bvns))
        return dict(await asyncio.gather(*tasks))

    facet_map = await construct_facet_bvn_to_uuid_map(required_facets)

    async def enrich_classes(classes):
        # Find all unique parent bvns used by the classes, and translate to UUIDs
        required_parents = set(
            {clazz["parent"] for clazz in classes if "parent" in clazz})

        async def construct_parent_bvn_to_uuid_map(parent_bvns):
            async def create_bvn_to_uuid_tuple(parent_bvn):
                return (
                    parent_bvn,
                    await mox_helper.read_element_klassifikation_klasse(bvn=parent_bvn),
                )

            tasks = list(map(create_bvn_to_uuid_tuple, parent_bvns))
            return dict(await asyncio.gather(*tasks))

        parent_map = await construct_parent_bvn_to_uuid_map(required_parents)

        # Enrich classes with default organisation
        classes = map(enrich_with_org_unit, classes)

        # Translate class facet to facet_uuid
        def class_facet_to_facet_uuid(clazz):
            facet_bvn = clazz.pop("facet")
            clazz["facet_uuid"] = facet_map[facet_bvn]
            return clazz

        classes = map(class_facet_to_facet_uuid, classes)

        # Translate class parent to parent_uuid
        def class_parent_to_parent_uuid(clazz):
            parent_bvn = clazz.pop("parent", None)
            if parent_bvn:
                clazz["parent_uuid"] = parent_map[parent_bvn]
            return clazz

        classes = map(class_parent_to_parent_uuid, classes)
        return classes

    # Partition into buckets by layer
    def set_layer(clazz):
        if "__layer__" not in clazz:
            clazz["__layer__"] = 1
        return clazz

    classes = map(set_layer, classes)
    buckets = bucket(classes, key=itemgetter("__layer__"))
    layers = sorted(list(buckets))
    for layer in layers:
        classes = list(buckets[layer])
        classes = await enrich_classes(classes)

        # Remove the layer key
        def remove_key(key):
            def worker(clazz):
                del clazz[key]
                return clazz

            return worker

        classes = map(remove_key("__layer__"), classes)
        # Translate class json to lora_klasse
        classes = map(lambda clazz: lora_klasse(**clazz), classes)
        # Prepare to output
        classes = list(classes)

        # Print for dry run
        if dry_run:
            for clazz in classes:
                mox_helper.validate_klassifikation_klasse(clazz)
                message = json.dumps(clazz, indent=4, sort_keys=True)
                click.secho(message, fg="green")
            return

        # POST for non-dry runs
        tasks = list(map(mox_helper.get_or_create_klassifikation_klasse, classes))
        results = await asyncio.gather(*tasks)
        for uuid, created in results:
            print_created(uuid, created)
def bucket(container, bucket_key=lambda x: x, sort_key=None):
    b = more_itertools.bucket(container, bucket_key)
    return {k: sorted(b[k], key=sort_key) for k in b}
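
# For example (hypothetical data): group words by first letter, with each
# bucket coming back sorted.
words = ['pear', 'apple', 'plum', 'apricot']
assert bucket(words, bucket_key=lambda w: w[0]) == {
    'p': ['pear', 'plum'],
    'a': ['apple', 'apricot'],
}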
""" import docopt import shutil from tempfile import mkdtemp from pathlib import Path from more_itertools import bucket ROOT_DIR = Path(__file__).parent CASE_DIR = ROOT_DIR / 'test_cases' import sys; sys.path.append(str(ROOT_DIR / 'api')) import nestedtext_official_tests as official if __name__ == '__main__': args = docopt.docopt(__doc__) tmp_dir = Path(mkdtemp(prefix='renumber_test_cases_')) cases = official.load_test_cases(args['<cases>']) families = bucket(cases, key=lambda x: x.family) for key in families: sorted_cases = sorted(families[key], key=lambda x: x.num) d = len(str(len(sorted_cases))) for i, case in enumerate(sorted_cases, 1): shutil.move(case.dir, tmp_dir / f'{case.family}_{i:0{d}}') for dir in tmp_dir.iterdir(): shutil.move(dir, CASE_DIR / dir.name) tmp_dir.rmdir()
# Split based on Object Type
import more_itertools

class Cube: pass
class Circle: pass
class Triangle: pass

shapes = [Circle(), Cube(), Circle(), Circle(), Cube(), Triangle(), Triangle()]
s = more_itertools.bucket(shapes, key=lambda x: type(x))
# s -> <more_itertools.more.bucket object at 0x7fa65323f210>

list(s[Cube])
# [<__main__.Cube object at 0x7f394a0633c8>, <__main__.Cube object at 0x7f394a063278>]
list(s[Circle])
# [<__main__.Circle object at 0x7f394a063160>, <__main__.Circle object at 0x7f394a063198>, <__main__.Circle object at 0x7f394a063320>]
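
# One caveat worth noting: iterating the bucket object itself yields the
# distinct keys, which consumes the rest of the source iterable; items for
# keys not yet requested are cached, so they can still be retrieved afterwards.
set(s)
# {<class '__main__.Cube'>, <class '__main__.Circle'>, <class '__main__.Triangle'>}
list(s[Triangle])
# [<__main__.Triangle object at ...>, <__main__.Triangle object at ...>]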
def offer_uw_testing(*, at: str, log_offers: bool, db: DatabaseSession,
                     action: DatabaseSessionAction):
    LOG.debug(f"Offering UW Husky Coronavirus Testing @ {at}")

    dry_run = action is DatabaseSessionAction.DRY_RUN

    # This uses a mutable quota to track available vs. used testing capacity
    # for given time periods. An alternate approach would be to use a
    # log/ledger (like we keep in receiving.* tables) which records credits
    # (test capacity scheduled for release at a certain time) and debits
    # (tests offered at a certain time). While this requires recalculating the
    # balance every run, we would be able to query when tests were released and
    # keep more metadata about that. These same benefits could be realized by
    # turning our normal logging output into structured event logs. I think
    # that's preferable, so decided not to implement as a ledger right now.
    #   -trs, 17 Sept & 13 Oct 2020

    # Look up the quota for the current time, locking it for update at the end
    # after we make offers.
    #
    # XXX TODO: As a future improvement, automatically pick up any remaining
    # quota left from _past_ timespans in the current day.
    #   -trs, 17 Sept 2020
    quota = db.fetch_row(
        """
        select name, timespan, max, used, max - used as remaining
          from operations.test_quota
         where name = 'uw'
           and timespan @> timestamp with time zone %s
           for update
        """, (at,))

    if not quota:
        LOG.info("No quota row found, aborting")
        return

    if not quota.remaining > 0:
        LOG.info(f"No quota remaining for {quota.name} during {quota.timespan}, aborting")
        return

    LOG.info(
        f"Quota for {quota.name} during {quota.timespan} "
        f"is now {quota.remaining:,} = {quota.max:,} - {quota.used:,} (remaining = max - used)")

    # Offer testing to the top entries in our priority queue.
    next_in_queue = db.fetch_all(
        """
        select redcap_url, redcap_project_id, redcap_record_id,
               redcap_event_name, redcap_repeat_instance,
               priority, priority_reason
          from shipping.uw_priority_queue_v1
         limit %s
        """, (quota.remaining,))

    if not next_in_queue:
        LOG.info("Nothing in the queue")
        return

    LOG.info(f"Fetched {len(next_in_queue):,} entries from the head of the queue")

    # Use the REDCap URL and project id from the queue rather than hardcoding.
    buckets = bucket(next_in_queue, lambda q: (q.redcap_url, q.redcap_project_id))
    queued_by_project = {key: list(buckets[key]) for key in buckets}

    offer_count = 0

    for (url, project_id), queued in queued_by_project.items():
        offers = [offer(q) for q in queued]

        LOG.info(
            f"Making {len(offers):,} offers for {url} project {project_id} "
            f"{'(dry run)' if dry_run else ''}")

        if log_offers:
            dump_ndjson(offers)

        # Token will automatically come from the environment. If we're doing a
        # dry run, then Project will make sure update_records() doesn't
        # actually update records.
        project = Project(url, project_id, dry_run=dry_run)

        batches = list(chunked(offers, REDCAP_BATCH_SIZE))

        for i, batch in enumerate(batches, 1):
            LOG.info(f"Updating REDCap record batch {i:,}/{len(batches):,} of size {len(batch):,}")
            offer_count += project.update_records(batch)

        # Insert synthetic DETs into our receiving table to trigger a new
        # import. This helps complete the roundtrip data update for the REDCap
        # records we just updated since API imports don't trigger natural DETs.
        insert_dets(db, project, offers)

    # XXX TODO: Maybe also update an internal testing_offered flag (in
    # encounter.details?) to avoid the delay of a roundtrip thru REDCap? If we
    # don't do this, then worst case we try to offer testing to the same
    # records more than once? This is probably more complicated than we want
    # to deal with on the first iteration and involves cooperation between
    # this command and the priority queue definition. I think timing will work
    # out most of the time and the worst case is we offer less testing than we
    # can handle (better than offering more!). If it happens commonly, we can
    # address later.
    #   -trs, 17 Sept & 13 Oct 2020

    updated_quota = db.fetch_row(
        """
        update operations.test_quota
           set used = used + %s
         where (name, timespan) = (%s, %s)
        returning name, timespan, max, used, max - used as remaining
        """, (offer_count, quota.name, quota.timespan))

    LOG.info(
        f"Quota for {updated_quota.name} during {updated_quota.timespan} "
        f"is now {updated_quota.remaining:,} = {updated_quota.max:,} - {updated_quota.used:,} (remaining = max - used)")
def split_by_function(S, context):
    s = bucket(S, key=lambda d: d.Lfunc())
    return flatten([FindIntegrableConditions(s[k], context) for k in s])
def _entities() -> Iterator[Res[Union[User, _Message]]]:
    from ..core.kompress import ZipPath
    last = ZipPath(max(inputs()))
    # TODO make sure it works both with plain directory
    # ideally get_files should return the right thing, and we won't have to force ZipPath/match_structure here
    # e.g. possible options are:
    # - if packed things are detected, just return ZipPath
    # - if packed things are detected, possibly return match_structure_wrapper
    #   it might be a bit tricky because it's a context manager -- who will recycle it?
    # - if unpacked things are detected, just return the dir as it is
    #   (possibly detect them via match_structure? e.g. what if we have a bunch of unpacked dirs)
    #
    # I guess the goal for core.structure module was to pass it to other functions that expect unpacked structure
    # https://github.com/karlicoss/HPI/pull/175
    # whereas here I don't need it..
    # so for now will just implement this adhoc thing and think about properly fixing later

    j = json.loads((last / 'account_information/personal_information.json').read_text())
    [profile] = j['profile_user']
    pdata = profile['string_map_data']
    username = pdata['Username']['value']
    full_name = _decode(pdata['Name']['value'])

    # just make up something :shrug:
    self_id = username
    self_user = User(
        id=self_id,
        username=username,
        full_name=full_name,
    )
    yield self_user

    files = list(last.rglob('messages/inbox/*/message_*.json'))
    assert len(files) > 0, last

    buckets = bucket(files, key=lambda p: p.parts[-2])
    file_map = {k: list(buckets[k]) for k in buckets}

    for fname, ffiles in file_map.items():
        for ffile in sorted(ffiles, key=lambda p: int(p.stem.split('_')[-1])):
            j = json.loads(ffile.read_text())

            id_len = 10
            # NOTE: no match in android db/api responses?
            other_id = fname[-id_len:]
            # NOTE: no match in android db?
            other_username = fname[:-id_len - 1]
            other_full_name = _decode(j['title'])
            yield User(
                id=other_id,
                username=other_username,
                full_name=other_full_name,
            )

            # todo "thread_type": "Regular" ?
            for jm in j['messages']:
                # todo defensive?
                try:
                    mtype = jm['type']  # Generic/Share?
                    content = None
                    if 'content' in jm:
                        content = _decode(jm['content'])
                    else:
                        share = jm.get('share')
                        photos = jm.get('photos')
                        videos = jm.get('videos')
                        cc = share or photos or videos
                        if cc is not None:
                            content = str(cc)
                    assert content is not None, jm
                    timestamp_ms = jm['timestamp_ms']
                    sender_name = _decode(jm['sender_name'])
                    user_id = other_id if sender_name == other_full_name else self_id
                    yield _Message(
                        created=datetime.fromtimestamp(timestamp_ms / 1000),
                        text=content,
                        user_id=user_id,
                        thread_id=fname,  # meh.. but no better way?
                    )
                except Exception as e:
                    # TODO sometimes messages are just missing content?? even with Generic type
                    yield e