def count_lines_and_files(
    paths_lines=None, paths_files=None, line_glob=None, file_glob=None
) -> dict:
    """Counts lines and files in the given paths."""
    result = {}

    for path in arg_to_iter(paths_lines):
        path = Path(path).resolve()
        if path.is_dir():
            files = path.glob(line_glob) if line_glob else path.iterdir()
        elif path.is_file():
            files = (path,)
        else:
            files = ()
        for file in files:
            LOGGER.info("Counting lines in <%s>...", file)
            name = os.path.splitext(file.name)[0]
            result[f"lc_{name}"] = count_lines(file)

    for path in arg_to_iter(paths_files):
        path = Path(path).resolve()
        if not path.is_dir():
            continue
        for subdir in path.glob("**"):
            LOGGER.info("Counting files in <%s>...", subdir)
            if path == subdir:
                name = path.name
            else:
                relative = subdir.relative_to(path)
                name = "_".join(relative.parts)
            result[f"fc_{name}"] = count_files(subdir, glob=file_glob)

    return result
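# Illustrative call only (not from the original code base): the paths and glob
# below are placeholders, and count_lines / count_files are assumed to be the
# helpers imported alongside count_lines_and_files.
stats = count_lines_and_files(
    paths_lines="data/scraped",  # hypothetical directory with line-based files
    paths_files="feeds",         # hypothetical directory tree whose files get counted
    line_glob="*.jl",
)
# Result keys follow the f-strings above, e.g. {"lc_games": ..., "fc_feeds": ...}.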
def get_previous_link(self):
    url = super().get_previous_link()
    if url is None:
        return None
    for key, parser in zip(arg_to_iter(self.keys), arg_to_iter(self.parsers)):
        params = ",".join(
            map(str, sorted(_extract_params(self.request, key, parser)))
        )
        url = (
            replace_query_param(url, key, params)
            if params
            else remove_query_param(url, key)
        )
    return url
def handle(self, *args, **kwargs):
    logging.basicConfig(
        stream=sys.stderr,
        level=logging.DEBUG if kwargs["verbosity"] > 1 else logging.INFO,
        format="%(asctime)s %(levelname)-8.8s [%(name)s:%(lineno)s] %(message)s",
    )

    LOGGER.info(kwargs)

    if kwargs["delete"]:
        LOGGER.info("deleting destination dir <%s>", kwargs["destination"])
        rmtree(kwargs["destination"], ignore_errors=True)

    exclude = tuple(arg_to_iter(kwargs["exclude"]))
    exclude = (
        exclude + (re.compile(r"^\."),) if kwargs["exclude_dot"] else exclude
    )
    LOGGER.info("excluding files: %s", exclude)

    minify(
        src=kwargs["source"],
        dst=kwargs["destination"],
        exclude_files=exclude,
        file_processors=DEFAULT_PROCESSORS,
    )
def _exclude(user=None, ids=None):
    if ids is None:
        return None

    try:
        import turicreate as tc
    except ImportError:
        LOGGER.exception("unable to import <turicreate>")
        return None

    ids = (
        ids
        if isinstance(ids, tc.SArray)
        else tc.SArray(tuple(arg_to_iter(ids)), dtype=int)
    )

    # pylint: disable=len-as-condition
    if ids is None or not len(ids):
        return None

    sframe = tc.SFrame({"bgg_id": ids})
    sframe["bgg_user_name"] = user

    del tc, ids

    return sframe
def _walk_files(path, exclude_files=None):
    exclude_files = tuple(arg_to_iter(exclude_files))
    filter_file = (
        partial(_filter_file, exclude_files=exclude_files) if exclude_files else None
    )
    for curr_dir, _, files in os.walk(path):
        for file in filter(filter_file, files):
            yield os.path.join(curr_dir, file)
def _process_repo(
    self,
    repo,
    directories,
    game_item,
    rating_item,
    game_csv,
    rating_csv,
    recommender_cls=BGGRecommender,
    recommender_dir=None,
    ranking_dir=None,
    max_iterations=100,
    date_str=DATE_TEMPLATE,
    overwrite=False,
    dry_run=False,
):
    if isinstance(repo, (str, os.PathLike)):
        repo = Repo(repo)
    LOGGER.info("Processing repository %s...", repo)

    recommender_dir = Path(recommender_dir) if recommender_dir else None
    ranking_dir = Path(ranking_dir) if recommender_dir else None

    if ranking_dir:
        ranking_fac_dir = ranking_dir / self.ranking_types[Ranking.FACTOR]
        ranking_sim_dir = ranking_dir / self.ranking_types[Ranking.SIMILARITY]
        if not dry_run:
            ranking_fac_dir.mkdir(parents=True, exist_ok=True)
            ranking_sim_dir.mkdir(parents=True, exist_ok=True)
    else:
        ranking_fac_dir = None
        ranking_sim_dir = None

    for directory in arg_to_iter(directories):
        LOGGER.info("Looking for all versions of <%s>...", directory)
        for commit in repo.iter_commits(paths=directory):
            try:
                _process_commit(
                    commit=commit,
                    directory=directory,
                    recommender_cls=recommender_cls,
                    recommender_dir=recommender_dir,
                    ranking_fac_dir=ranking_fac_dir,
                    ranking_sim_dir=ranking_sim_dir,
                    game_item=game_item,
                    rating_item=rating_item,
                    game_csv=game_csv,
                    rating_csv=rating_csv,
                    max_iterations=max_iterations,
                    date_str=date_str,
                    overwrite=overwrite,
                    dry_run=dry_run,
                )
            except Exception:
                LOGGER.warning(
                    "There was an error processing commit <%s>, skipping...", commit
                )
def _filter_file(file, exclude_files=None):
    for exclude in arg_to_iter(exclude_files):
        if isinstance(exclude, str):
            if file == exclude:
                return False
        elif exclude.match(file):
            return False
    return True
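# Sketch of how _walk_files and _filter_file might be combined; the directory
# and exclusion patterns here are examples, not values from the original code.
import re

for file_path in _walk_files("src", exclude_files=("__init__.py", re.compile(r".*\.pyc$"))):
    print(file_path)  # only paths whose file names pass _filter_file are yielded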
def _light_games(bgg_ids=None):
    # pylint: disable=no-member
    games = (
        Game.objects.all()
        if bgg_ids is None
        else Game.objects.filter(bgg_id__in=arg_to_iter(bgg_ids))
    )
    return games.values("bgg_id", "name", "year", "image_url")
def _extract_params(request, key, parser=None):
    data_values = (
        arg_to_iter(request.data.get(key))
        if isinstance(request.data, dict)
        else arg_to_iter(request.data)
    )
    query_values = arg_to_iter(request.query_params.getlist(key))
    values = _parse_parts(chain(data_values, query_values))

    if not callable(parser):
        yield from values
        return

    values = map(parser, values)
    for value in values:
        if value is not None:
            yield value
def _load_add_data(files, id_field, *fields, in_format=None):
    objs = _load(*arg_to_iter(files), in_format=in_format)
    result = {
        o.get(id_field): {field: o[field] for field in fields if field in o}
        for o in objs
    }
    LOGGER.info("loaded %d data items", len(result))
    return result
def parse_url(
    url: Union[str, ParseResult, None],
    hostnames: Optional[Iterable[Union[str, Pattern]]] = None,
) -> Optional[ParseResult]:
    """Parse URL and optionally filter for hosts."""
    url = urlparse(url) if isinstance(url, str) else url
    hostnames = tuple(arg_to_iter(hostnames))
    return (
        url
        if url
        and url.hostname
        and url.path
        and (
            not hostnames
            or any(_match(url.hostname, hostname) for hostname in hostnames)
        )
        else None
    )
def validate_url(
    url: Union[str, ParseResult, None],
    hostnames: Optional[Iterable[Union[str, Pattern]]] = None,
    schemes: Optional[Iterable[Union[str, Pattern]]] = None,
) -> Optional[str]:
    """Returns cleaned up URL iff valid with scheme, hostname, and path."""
    url = parse_url(url=url, hostnames=hostnames)
    schemes = frozenset(arg_to_iter(schemes))
    return (
        url.geturl()
        if url is not None and url.scheme and (not schemes or url.scheme in schemes)
        else None
    )
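# Minimal sketches for parse_url / validate_url; the URL, hostnames and schemes
# below are made up for illustration.
parse_url("https://example.com/some/path", hostnames=("example.com",))
# -> ParseResult(...), since hostname and path are present and the host matches

validate_url("ftp://example.com/some/path", hostnames=("example.com",), schemes=("https",))
# -> None, since the scheme is not in the allowed set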
def dfs_from_repo(repo, directories, files):
    """Load data from a Git repo."""
    LOGGER.info("Loading data from %s...", repo)

    for directory, file in product(arg_to_iter(directories), arg_to_iter(files)):
        path = os.path.join(directory, file)
        LOGGER.info("Looking for all versions of <%s>...", path)

        for commit in repo.iter_commits(paths=path):
            try:
                blob = commit.tree / directory / file
            except Exception:
                LOGGER.exception("Path <%s> not found in commit <%s>...", path, commit)
                continue

            LOGGER.info(
                'Found <%s> from commit <%s>: "%s" (%s)',
                blob,
                commit,
                commit.message.strip(),
                commit.authored_datetime,
            )

            file_format = format_from_path(blob.name)
            try:
                data_frame = (
                    pd.read_csv(blob.data_stream)
                    if file_format == "csv"
                    else _df_from_jl(blob.data_stream.read().splitlines())
                    if file_format in ("jl", "jsonl")
                    else None
                )
            except Exception:
                LOGGER.exception("There was a problem loading <%s>...", blob)
                data_frame = None

            if data_frame is not None and not data_frame.empty:
                yield {
                    "data_frame": data_frame,
                    "commit": commit,
                    "blob": blob,
                    "date": commit.authored_datetime,
                }
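# Hedged example: dfs_from_repo expects a GitPython Repo; the repository path,
# directory and file name below are hypothetical placeholders.
from git import Repo

repo = Repo("path/to/data-repo")
for entry in dfs_from_repo(repo, directories="scraped", files="games.csv"):
    print(entry["date"], len(entry["data_frame"]))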
def _parse_parts(args):
    for arg in arg_to_iter(args):
        if isinstance(arg, str):
            for parsed in arg.split(","):
                parsed = parsed.strip()
                if parsed:
                    yield parsed
        elif isinstance(arg, (list, tuple)):
            yield from _parse_parts(arg)
        else:
            yield arg
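# Behaviour sketch for _parse_parts with made-up input: comma-separated strings
# are split and stripped, nested lists/tuples are flattened, empty parts dropped.
list(_parse_parts(["1, 2", ("3", 4), " "]))
# -> ["1", "2", "3", 4]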
def _parse_link_ids(data, regex=LINK_ID_REGEX):
    result = defaultdict(lambda: defaultdict(list))
    for origin, links in data.items():
        _, id_orig = _parse_link_id(origin, regex)
        if id_orig is None:
            continue
        for site, id_dest in map(_parse_link_id, arg_to_iter(links)):
            if site and id_dest is not None:
                result[id_orig][site].append(id_dest)
    LOGGER.info("found links for %d items", len(result))
    return result
def _cp_any_files(dst, tree, files):
    dst_files = []
    for file in arg_to_iter(files):
        blob = tree / file
        dst_file = dst / file
        with dst_file.open("wb") as dst_fp:
            shutil.copyfileobj(blob.data_stream, dst_fp)
        dst_files.append(dst_file)
    return tuple(dst_files)
def games_in_articles(paths):
    seen = set()
    for path in arg_to_iter(paths):
        path = Path(path).resolve()
        with open(path) as file:
            for line in file:
                for match in regex.finditer(line):
                    bgg_id = parse_int(match.group(1))
                    name = match.group(2)
                    if bgg_id and bgg_id not in seen:
                        seen.add(bgg_id)
                        yield bgg_id, name, path
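# Illustrative usage of games_in_articles; the article path is a placeholder,
# and the module-level `regex` plus parse_int are assumed to exist as in the
# original module.
for bgg_id, name, article_path in games_in_articles(["articles/2023-review.md"]):
    print(bgg_id, name, article_path)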
def _process_df(data_frame, columns=None, required_columns=None, target_column=None):
    if data_frame is None or data_frame.empty:
        LOGGER.error("DataFrame is empty")
        return None

    columns = clear_list(arg_to_iter(columns))
    required_columns = clear_list(arg_to_iter(required_columns)) or columns
    columns = clear_list(columns + required_columns)

    if not columns:
        LOGGER.error("No columns given")
        return None

    missing_columns = [
        column for column in required_columns if column not in data_frame
    ]
    if missing_columns:
        LOGGER.error(
            "DataFrame does not contain the expected columns %s", missing_columns
        )
        return None

    for column in columns:
        if column not in data_frame:
            data_frame[column] = None

    target_column = target_column or columns[0]

    return (
        data_frame[columns][data_frame[target_column].notna()]
        .sort_values(target_column)
        .rename(columns={"bayes_rating": "score"})
        .astype({"rank": int, "bgg_id": int})
    )
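# Hedged example for _process_df on a tiny pandas frame; the column names mirror
# the ones hard-coded in the function ("rank", "bgg_id", "bayes_rating" -> "score"),
# and clear_list is assumed to deduplicate while preserving order.
import pandas as pd

frame = pd.DataFrame({"rank": [2, 1], "bgg_id": [13, 822], "bayes_rating": [7.1, 7.9]})
_process_df(frame, columns=("rank", "bgg_id", "bayes_rating"), target_column="rank")
# -> frame sorted by rank, with bayes_rating renamed to "score"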
def _create_secondary_instances(
    model,
    secondary,
    items,
    models_order=(),
    batch_size=None,
    dry_run=False,
    **kwargs,
):
    instances = _make_secondary_instances(
        model=model,
        secondary=secondary,
        items=items,
        **kwargs,
    )
    del items

    batches = batchify(instances, batch_size) if batch_size else (instances,)
    del instances

    models_order = tuple(arg_to_iter(models_order))

    for count, batch in enumerate(batches):
        LOGGER.info("processing batch #%d...", count + 1)

        models = defaultdict(list)
        for instance in batch:
            models[type(instance)].append(instance)
        order = models_order or tuple(models.keys())
        del batch

        for mdl in order:
            instances = models.pop(mdl, ())
            if not dry_run and instances:
                LOGGER.info("creating %d instances of %r", len(instances), mdl)
                mdl.objects.bulk_create(instances)

        if any(models.values()):
            LOGGER.warning(
                "some models have not been processed properly: %r",
                tuple(models.keys()),
            )
        del models

    del batches
    LOGGER.info("done processing")
def _excluded_games(self, user, params, include=None, exclude=None):
    params = params or {}
    params.setdefault("exclude_known", True)

    exclude = frozenset(arg_to_iter(exclude)) | frozenset(
        _parse_ints(params.get("exclude"))
    )

    exclude_known = parse_bool(take_first(params.get("exclude_known")))
    exclude_fields = [
        field
        for field in self.collection_fields
        if parse_bool(take_first(params.get(f"exclude_{field}")))
    ]
    exclude_wishlist = parse_int(take_first(params.get("exclude_wishlist")))
    exclude_play_count = parse_int(take_first(params.get("exclude_play_count")))
    exclude_clusters = parse_bool(take_first(params.get("exclude_clusters")))

    try:
        queries = [Q(**{field: True}) for field in exclude_fields]
        if exclude_known and exclude_clusters:
            queries.append(Q(rating__isnull=False))
        if exclude_wishlist:
            queries.append(Q(wishlist__lte=exclude_wishlist))
        if exclude_play_count:
            queries.append(Q(play_count__gte=exclude_play_count))
        if queries:
            query = reduce(or_, queries)
            exclude |= frozenset(
                User.objects.get(name=user)
                .collection_set.order_by()
                .filter(query)
                .values_list("game_id", flat=True)
            )
    except Exception:
        pass

    return tuple(exclude) if not include else tuple(exclude - include)
def _create_all_instances(self, path, filter_ids=None, week_day="SUN", types=None):
    types = frozenset(arg_to_iter(types))
    for ranking_type, (
        sub_dir,
        method,
        min_date,
        min_score,
    ) in self.ranking_types.items():
        if not types or ranking_type in types:
            yield from _create_instances(
                path_dir=os.path.join(path, sub_dir),
                ranking_type=ranking_type,
                filter_ids=filter_ids,
                method=method,
                week_day=week_day,
                min_date=min_date,
                min_score=min_score,
            )
def jl_to_csv(in_path, out_path, columns=None, joiner=","):
    """Convert a JSON lines file into CSV."""
    columns = tuple(arg_to_iter(columns))
    LOGGER.info(
        "Reading JSON lines from <%s> and writing CSV to <%s>...", in_path, out_path
    )

    with open(in_path) as in_file, open(out_path, "w") as out_file:
        if not columns:
            row = next(in_file, None)
            row = _process_row(row, joiner=joiner) if row else {}
            columns = tuple(row.keys())
        else:
            row = None

        rows = map(partial(_process_row, columns=columns, joiner=joiner), in_file)

        writer = DictWriter(out_file, fieldnames=columns)
        writer.writeheader()
        if row:
            writer.writerow(row)
        writer.writerows(rows)
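# Example invocation of jl_to_csv; the file names and columns are hypothetical
# and would need to match the actual JSON lines data.
jl_to_csv("games.jl", "games.csv", columns=("bgg_id", "name", "year"))
# Without `columns`, the header is inferred from the keys of the first row.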
def _parse_ids(values: Any) -> Generator[Tuple[int, str], None, None]:
    for value in arg_to_iter(values):
        id_, name = _parse_id(value)
        if id_ and name:
            yield id_, name
def _create_references(
    model,
    items,
    foreign=None,
    recursive=None,
    batch_size=None,
    dry_run=False,
):
    foreign = foreign or {}
    foreign = {k: tuple(arg_to_iter(v)) for k, v in foreign.items()}
    foreign = {k: v for k, v in foreign.items() if len(v) == 2}
    recursive = (
        {r: r for r in arg_to_iter(recursive)}
        if not isinstance(recursive, dict)
        else recursive
    )

    if not foreign and not recursive:
        LOGGER.warning(
            "neither foreign nor recursive references given, got nothing to do..."
        )
        return

    LOGGER.info("creating foreign references: %r", foreign)
    LOGGER.info("creating recursive references: %r", recursive)

    count = -1
    foreign_values = {f[0]: defaultdict(set) for f in foreign.values()}
    updates = {}

    for count, item in enumerate(items):
        update = defaultdict(list)

        for field, (fmodel, _) in foreign.items():
            for value in filter(
                None, map(_parse_value_id, arg_to_iter(item.get(field)))
            ):
                id_ = value.get("id")
                value = value.get("value")
                if id_ and value:
                    foreign_values[fmodel][id_].add(value)
                    update[field].append(id_)

        for rec_from, rec_to in recursive.items():
            rec = {parse_int(r) for r in arg_to_iter(item.get(rec_from)) if r}
            rec = (
                sorted(
                    model.objects.filter(pk__in=rec)
                    .values_list("pk", flat=True)
                    .distinct()
                )
                if rec
                else None
            )
            if rec:
                update[rec_to] = rec

        pkey = parse_int(item.get(model._meta.pk.name))
        if pkey and any(update.values()):
            updates[pkey] = update

        if (count + 1) % 1000 == 0:
            LOGGER.info("processed %d items so far", count + 1)

    del items, recursive

    LOGGER.info("processed %d items in total", count + 1)

    for fmodel, value_field in frozenset(foreign.values()):
        id_field = fmodel._meta.pk.name
        LOGGER.info(
            "found %d items for model %r to create",
            len(foreign_values[fmodel]),
            fmodel,
        )
        values = (
            {id_field: k, value_field: take_first(v)}
            for k, v in foreign_values[fmodel].items()
            if k and v
        )
        _create_from_items(
            model=fmodel,
            items=values,
            batch_size=batch_size,
            dry_run=dry_run,
        )

    del foreign, foreign_values

    LOGGER.info("found %d items for model %r to update", len(updates), model)
    batches = (
        batchify(updates.items(), batch_size) if batch_size else (updates.items(),)
    )

    for count, batch in enumerate(batches):
        LOGGER.info("processing batch #%d...", count + 1)
        if not dry_run:
            with atomic():
                for pkey, update in batch:
                    try:
                        instance = model.objects.get(pk=pkey)
                        for field, values in update.items():
                            getattr(instance, field).set(values)
                        instance.save()
                    except Exception:
                        LOGGER.exception(
                            "an error occurred when updating <%s> with %r",
                            pkey,
                            update,
                        )

    del batches, updates
    LOGGER.info("done updating")