def load_data(
    input_dir=None,
    split='train',
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load the entities (X) and targets (y) tables.

    When ``input_dir`` is omitted, the directory is built from the
    configured S3 bucket and the configured path of ``split``. When
    ``input_dir`` is given, both tables are read from that directory under
    their configured table names and ``split`` is ignored.

    For feature development, only the train split should be used.

    Returns:
        A ``(X, y)`` pair as produced by ``load_table_from_config`` for the
        entities table and the targets table respectively.
    """
    data_config = load_config().data
    tables = data_config.tables

    # Resolve both table configurations up front, entities first.
    entities_config = some(
        where(tables, name=data_config.entities_table_name))
    targets_config = some(
        where(tables, name=data_config.targets_table_name))

    if input_dir is None:
        # No explicit directory: fall back to the split's location on S3.
        bucket = data_config.s3_bucket
        split_path = data_config.splits.get(split)
        input_dir = f's3://{bucket}/{split_path}'

    X, y = (
        load_table_from_config(input_dir, entities_config),
        load_table_from_config(input_dir, targets_config),
    )
    return X, y
def buffer_logic(state):
    '''
    Buffer create/replace/reuse logic. The function name is not very good :(

    Decides whether ``state`` gets the buffer of a previously recorded state
    (reuse or replace) or a freshly made one, and fills in the ``uid``,
    ``buffer`` and ``sources`` entries of ``state`` accordingly. ``state`` is
    mutated in place; a reused/replaced state is removed from
    ``variables.states``.

    new_state       | old_state     | same sources | action
    ----------------|---------------|--------------|-----------------
    replace         | replace       | True         | reuse buffer
    replace         | replace       | False        | replace buffer
    replace         | no-replace    | True         | create buffer (copy candidates)
    replace         | no-replace    | False        | create buffer
    no-replace      | replace       | True         | create buffer (copy candidates)
    no-replace      | replace       | False        | create buffer
    no-replace      | no-replace    | True         | reuse buffer
    no-replace      | no-replace    | False        | create buffer

    A reusable buffer will be looked for, then a replacement buffer and as a
    last resort a new one will be created.

    Returns:
        old_state (dict): In case a state was reused/replaced it is returned
        because it will be needed later on to compare it with the current
        state and determine whether the window should be resized/moved,
        etc... ``None`` when a new buffer was created.
    '''
    # We are only interested in buffers which are in the same container.
    # That's where the interesting reuse/replace logic is at.
    states = fn.where(variables.states, container=state['container'])
    # Predicate: does a candidate state have the same sources as ``state``?
    with_same_sources = partial(same_sources, state)
    # Reusable: same sources AND the same ``replace`` flag as the new state.
    reusable_state = fn.first(fn.where(
        ifilter(with_same_sources, states),
        replace = state['replace']
    ))
    # Replaceable: different sources AND the old state allows replacement.
    # NOTE(review): this lookup does not consult state['replace'], while the
    # table above says a no-replace new state should create a buffer rather
    # than replace one - confirm which is intended.
    replaceable_state = fn.first(fn.where(
        ifilter(lambda x: not with_same_sources(x), states),
        replace = True
    ))
    old_state = None
    if reusable_state:
        # Take over identity, buffer and sources of the old state as they are.
        state.update(fn.project(reusable_state, ['uid', 'buffer', 'sources']))
        old_state = reusable_state
        variables.states.remove(reusable_state)
    elif replaceable_state:
        # Take over identity and buffer only; sources are recomputed and the
        # buffer contents are rewritten for the new state.
        state.update(fn.project(replaceable_state, ['uid', 'buffer']))
        state['sources'] = populated_candidates(state)
        set_buffer_contents(state['buffer'], aggregate_candidates(state))
        old_state = replaceable_state
        variables.states.remove(replaceable_state)
    else:
        # New buffer. If some state in this container has the same sources,
        # its (truthy) ``sources`` value is used instead of repopulating -
        # the "copy candidates" rows of the table above.
        same = find(with_same_sources, states)
        state['sources'] = (same and same['sources']) or populated_candidates(state)
        state['buffer'] = make_pyunite_buffer(state)
    return old_state
def chose_ware_ids_with_requirement(wares, categories, moderation, stock):
    """
    Intersect the requirement conditions over the existing wares.

    For every requested category, moderation state and stock state the ids
    of the matching wares are plucked; all those id collections are then
    handed to ``intersection_lists``.

    :param wares: list of wares
    :param categories: list of categories (matched against ``managed_category``)
    :param moderation: moderation states (matched against ``moderation_state``)
    :param stock: stock states (matched against ``stock_state``)
    :return: whatever ``intersection_lists`` yields for the collected id lists
    """
    # (field of the ware, requested values) in the order they are combined
    requirements = (
        ('managed_category', categories),
        ('moderation_state', moderation),
        ('stock_state', stock),
    )
    id_lists = [
        funky.pluck(funcy.where(wares, **{field: value}), 'ware_id')
        for field, values in requirements
        for value in values
    ]
    return intersection_lists(id_lists)
def get_static_user(user_id):
    """
    Get the data of a static production user by identifier.

    :param user_id: identifier of the user
    :return: the entries of ``MainClass.STATIC_USERS`` whose ``user_id``
        equals the given one, as returned by ``funcy.where``
    """
    from funcy import where

    matching_users = where(MainClass.STATIC_USERS, user_id=user_id)
    return matching_users
def validate_semantics_of_table(table_design):
    """
    Check for semantics that apply to tables in source schemas.

    On top of the checks shared with CTAS (``validate_semantics_of_table_or_ctas``)
    an upstream table must not
      * list dependencies (``depends_on``),
      * declare a ``natural_key`` or ``surrogate_key`` constraint,
    and, if ``extract_settings.split_by`` names a column, that column must
    exist in the design, must not be skipped, must be declared not-null and
    must be of type int, long, date or timestamp.

    :param table_design: table design as a dictionary
    :raises TableDesignSemanticError: if any of the rules above is violated
    """
    validate_semantics_of_table_or_ctas(table_design)

    if "depends_on" in table_design:
        raise TableDesignSemanticError(
            "upstream table '%s' has dependencies listed" % table_design["name"])

    # Each constraint is a mapping keyed by its constraint type.
    for constraint in table_design.get("constraints", []):
        for constraint_type in constraint:
            if constraint_type in ("natural_key", "surrogate_key"):
                raise TableDesignSemanticError(
                    "upstream table '{}' has unexpected {} constraint".format(
                        table_design["name"], constraint_type))

    # "split_by" is expected to be a single-element list when present.
    [split_by_name] = table_design.get("extract_settings", {}).get("split_by", [None])
    if not split_by_name:
        return

    split_by_column = fy.first(
        fy.where(table_design["columns"], name=split_by_name))
    if split_by_column is None:
        # Without this check a typo in split_by surfaces as an AttributeError
        # on None instead of a semantic error about the design.
        raise TableDesignSemanticError(
            "split-by column '{}' is not a column of table '{}'".format(
                split_by_name, table_design["name"]))
    if split_by_column.get("skipped", False):
        raise TableDesignSemanticError(
            "split-by column must not be skipped")
    if not split_by_column.get("not_null", False):
        raise TableDesignSemanticError(
            "split-by column must have not-null constraint")
    if split_by_column["type"] not in ("int", "long", "date", "timestamp"):
        raise TableDesignSemanticError(
            "type of split-by column must be int, long, date or timestamp, not '{}'"
            .format(split_by_column["type"]))
def get_tests(self, file_name, source_data):
    """
    Collect the tests of one file, grouped by test class.

    :param file_name: value of ``code_name_file`` to select the tests by
    :param source_data: list of dictionaries describing all tests
    :return: ``{file_name: [class_entry, ...]}`` where every class entry has
        the keys ``name`` (the story, decoded from UTF-8), ``code_name`` and
        ``tests`` (name, code_name, description, skip and priority per test)
    """
    tests_array = funcy.where(source_data, code_name_file=file_name)

    # Distinct (class code name, story) pairs found in this file.
    class_keys = {(test['code_name_class'], str(test['story']))
                  for test in tests_array}

    classes_array = []
    for class_code_name, story in class_keys:
        class_tests = [
            {"name": test['name'],
             "code_name": test["code_name_class"],
             "description": test["description"],
             "skip": test["skip"],
             "priority": test["priority"]}
            for test in tests_array
            if test['code_name_class'] == class_code_name
        ]
        classes_array.append({"name": story.decode('utf-8'),
                              "code_name": class_code_name,
                              "tests": class_tests})
    return {file_name: classes_array}
def get_static_user_by_role(role):
    """
    Get the data of a static production user by role.

    :param role: role of the user
    :return: the entries of ``MainClass.STATIC_USERS`` whose ``role`` equals
        the given one, as returned by ``funcy.where``
    """
    from funcy import where

    users_with_role = where(MainClass.STATIC_USERS, role=role)
    return users_with_role
def build_sqoop_partition_options(self,
                                  relation: RelationDescription,
                                  partition_key: Optional[str],
                                  table_size: int) -> List[str]:
    """
    Build the partitioning-related arguments for Sqoop.

    :param relation: relation whose table design and ``num_partitions`` are consulted
    :param partition_key: name of the column to split by, or a falsy value for none
    :param table_size: size passed on to ``maximize_partitions`` when the
        design does not fix the number of partitions
    :return: ``--split-by``/``--num-mappers`` arguments, or a single-mapper
        argument list when no split is possible
    """
    # Use 1 mapper if either there is no partition key, or if the partitioner
    # returns only one partition.
    single_mapper_args = ["--num-mappers", "1"]
    if not partition_key:
        return single_mapper_args

    column = fy.first(
        fy.where(relation.table_design["columns"], name=partition_key))
    is_timestamp = column["sql_type"] in ("timestamp", "timestamp without time zone")
    # Timestamps are turned into a number so that they can be split on.
    key_template = (
        """CAST(TO_CHAR("{}", 'YYYYMMDDHH24MISS') AS BIGINT)"""
        if is_timestamp
        else '"{}"'
    )
    quoted_key_arg = key_template.format(partition_key)

    # num_partitions explicitly set in the design file overrides the dynamic
    # determination.
    num_mappers = (
        min(relation.num_partitions, self.max_partitions)
        if relation.num_partitions
        else self.maximize_partitions(table_size)
    )

    if num_mappers <= 1:
        return single_mapper_args
    return ["--split-by", quoted_key_arg, "--num-mappers", str(num_mappers)]
def load_data(input_dir=None):
    """Load the entities and targets tables from ``input_dir``.

    Loading without an ``input_dir`` is not implemented and raises
    ``NotImplementedError``.
    """
    if input_dir is None:
        raise NotImplementedError

    tables = config.get('data.tables')
    frames = []
    # Entities first, then targets.
    for name_key in ('data.entities_table_name', 'data.targets_table_name'):
        table_name = config.get(name_key)
        table_config = some(where(tables, name=table_name))
        frames.append(load_table_from_config(input_dir, table_config))

    X, y = frames
    return X, y
def load_data(input_dir=None):
    """Load the entities and targets tables.

    Without ``input_dir`` the train split is read from the hosted census
    CSV files; otherwise both tables are loaded from ``input_dir`` using
    their configured table names.
    """
    if input_dir is None:
        root = 'https://mit-dai-ballet.s3.amazonaws.com/census'
        X = pd.read_csv(root + '/train/entities.csv.gz')
        y = pd.read_csv(root + '/train/targets.csv.gz')
        return X, y

    tables = config.get('data.tables')
    frames = []
    # Entities first, then targets.
    for name_key in ('data.entities_table_name', 'data.targets_table_name'):
        table_name = config.get(name_key)
        table_config = some(where(tables, name=table_name))
        frames.append(load_table_from_config(input_dir, table_config))

    X, y = frames
    return X, y
def load_data(input_dir=None):
    """Load the feature frame and the target.

    Without ``input_dir`` the hosted Ames housing file is downloaded and
    split into the features (everything but ``SalePrice``) and the target
    (``SalePrice``); otherwise both tables are loaded from ``input_dir``
    using their configured table names.
    """
    if input_dir is None:
        source = "https://s3.amazonaws.com/mit-dai-ballet/ames/AmesHousing.txt"
        df = pd.read_csv(source, sep="\t")
        return df.drop("SalePrice", axis=1), df["SalePrice"]

    tables = conf.get("tables")
    frames = []
    # Entities first, then targets.
    for name_key in ("entities_table_name", "targets_table_name"):
        table_name = conf.get("data", name_key)
        table_config = some(where(tables, name=table_name))
        frames.append(load_table_from_config(input_dir, table_config))

    X_df, y_df = frames
    return X_df, y_df
def load_data(split='train', input_dir=None):
    """Load data

    If ``input_dir`` is given, load whatever dataset appears in
    ``input_dir`` (``split`` is not consulted). Loading by ``split`` alone
    is not implemented and raises ``NotImplementedError``.
    """
    if input_dir is None:
        raise NotImplementedError

    data_config = load_config().data
    tables = data_config.tables

    def read_table(table_name):
        # Look up the table's configuration by name and load it.
        table_config = some(where(tables, name=table_name))
        return load_table_from_config(input_dir, table_config)

    X = read_table(data_config.entities_table_name)
    y = read_table(data_config.targets_table_name)
    return X, y
def find_partition_key(self) -> Union[str, None]:
    """
    Return valid partition key for a relation.

    The partition key will fulfill these conditions:
    (1) the column is marked as a primary key
    (2) the table's primary key is a single column
    (3) the column has a numeric type or can be cast into one (which
        currently only works for timestamps).

    If the table design provides extract_settings with a split_by column
    setting, provide that instead. The column will be numeric (int or long)
    or a timestamp in this case.

    If no partition key can be found, returns None. This includes the case
    where the candidate column is not listed among the table's columns.
    """
    constraints = self.table_design.get("constraints", [])
    extract_settings = self.table_design.get("extract_settings", {})
    [partition_key] = extract_settings.get("split_by", [None])

    if not partition_key:
        try:
            # Unpacking will fail here if the list of primary keys hasn't
            # exactly one element.
            [primary_key] = [
                col
                for constraint in constraints
                for col in constraint.get("primary_key", [])
            ]
            partition_key = primary_key
        except ValueError:
            logger.debug(
                "Found no single-column primary key for table '%s'",
                self.identifier)

    if not partition_key:
        logger.debug("Found no partition key for table '%s'", self.identifier)
        return None

    column = fy.first(
        fy.where(self.table_design["columns"], name=partition_key))
    if column is None:
        # The key names a column that the design does not define; without
        # this guard the type lookup below fails with a TypeError on None.
        logger.warning(
            "Column '%s' is not defined so is not usable as a partition key for '%s'",
            partition_key,
            self.identifier,
        )
        return None

    # We check here the "generic" type which abstracts the SQL types like
    # smallint, int4, etc.
    if column["type"] in ("int", "long", "date", "timestamp"):
        logger.debug("Partition key for table '%s' is '%s'",
                     self.identifier, partition_key)
        return partition_key

    logger.warning(
        "Column '%s' is not int, long, date or timestamp so is not usable as a partition key for '%s'",
        partition_key,
        self.identifier,
    )
    return None
def check_fav_ware_from_db(self, before_fav_wares, after_fav_wares):
    """
    Check that the favourite wares are identical before and after some action.

    Both lists must have the same length and every entry of the "before"
    list must have exactly one counterpart (same ``fav_ware_id``) in the
    "after" list with equal ``user_id`` and ``creation_timestamp``.

    :param before_fav_wares: list of dictionaries taken before the action
    :param after_fav_wares: list of dictionaries taken after the action
    """
    self.assertEqual(len(before_fav_wares), len(after_fav_wares),
                     "Changed the number of favorite user.")
    for user in before_fav_wares:
        # Materialise the matches so that they can be counted and indexed.
        fav_data = list(funcy.where(after_fav_wares, fav_ware_id=user["fav_ware_id"]))
        # Count first: indexing an empty result would abort the test with an
        # IndexError instead of reporting a failed assertion.
        self.assertEqual(len(fav_data), 1)
        fav_elem = fav_data[0]
        self.assertEqual(fav_elem["fav_ware_id"], user["fav_ware_id"])
        self.assertEqual(fav_elem["user_id"], user["user_id"])
        self.assertEqual(fav_elem["creation_timestamp"], user["creation_timestamp"])
def check_wares(self, wares_cassandra, wares_worker):
    """
    Compare the ware lists obtained from Cassandra and from Warehouse.

    The total number of wares is compared first, then every ware is
    compared individually via ``check_ware``.

    :param wares_cassandra: list of mappings with a ``ware_id`` key
        (presumably the rows read from Cassandra - verify against caller)
    :param wares_worker: list of objects with a ``wareId`` attribute
        (presumably the Warehouse worker response - verify against caller)
    """
    service_log.put("Check lists from BD and Warehouse.")
    self.assertEqual(len(wares_worker), len(wares_cassandra),
                     "The quantity of the wares does not match.")
    for ware_worker in wares_worker:
        service_log.put("Get ware in list: %s" % str(ware_worker))
        # Materialise the matches so that they can be counted and indexed.
        ware_cassandra = list(funcy.where(wares_cassandra, ware_id=ware_worker.wareId))
        # "Not found" has to be checked before "exactly one", otherwise an
        # empty result is reported with the misleading "several" message.
        self.assertNotEqual(len(ware_cassandra), 0,
                            "Not found ware in data from worker Warehouse.")
        self.assertEqual(len(ware_cassandra), 1,
                         "Found several ware with one id.")
        self.check_ware(ware_worker, ware_cassandra[0])
def test_findUserDetails_for_exist_part_phone_for_all_users(self, iteration=None):
    """
    Test the findUserDetails method on an existing user.

    Take an existing user and a part of their phone number, select all the
    users whose phone number matches that part and compare them with what
    the service returns. The part of the phone is chosen randomly, which is
    why the test is run in several iterations.

    :param iteration: not used by the body (presumably supplied by the
        test parametrisation - verify against caller)
    """
    service_log.run(self)
    phone = self.user["phone"]
    # Validate the precondition before slicing: random.randint(1, 0) raises
    # ValueError for an empty phone, which would hide this assertion.
    self.assertNotEqual(len(phone), 0, "Find user without phone!!!")
    part_phone = phone[:random.randint(1, len(phone))]
    service_log.put("Get part phone user's: %s" % part_phone)

    users_with_part_phone = databases.db1.accounting.get_users_by_part_phone(part_phone)
    result = services.accounting.root.tframed.findUserDetails(self.get_FindUserRequestDto(part_phone))
    service_log.put("Method findUserDetails returned result: %s" % result)

    self.assertEqual(len(result), len(users_with_part_phone),
                     "Does not match number of detected users.")
    for index in result:
        matched_users = list(funcy.where(users_with_part_phone, id=index.userId))
        # Fail with an assertion (not an IndexError) when the service
        # returned a user the database selection does not contain.
        self.assertTrue(matched_users,
                        "User %s returned by the service was not found in the database selection." % index.userId)
        self.check_user(index, matched_users[0])
def test_updateWares(self):
    """
    Import of a pre-moderated ware.

    The ware passes the basic validation, is moved to the specified state
    and is marked as moderated.
    """
    service_log.run(self)

    # Save the initial data of the wares.
    self.save_wares_data(self.list_wares)
    service_log.put("Save the data in a several product.")

    # Pick the wares No. 2 at random.
    self.list_wares2 = []
    for _ in range(self.count_wares):
        self.list_wares2.append(self.get_random_ware(self.wares))
    service_log.put("Get list2 with Ware: %s" % self.list_wares2)

    # Build the request that updates every ware No. 1 with the values of the
    # ware No. 2 at the same position, and send it to the service.
    wares_req = [
        self.req_update_ware(self.list_wares[position]["ware_id"],
                             source_ware['managed_category_id'],
                             source_ware["content"])
        for position, source_ware in enumerate(self.list_wares2)
    ]
    wares_warehouse = services.warehouse.root.tframed.updateWares(wares_req)
    service_log.put("Updated wares: %s" % str(wares_warehouse))

    for ware in wares_warehouse:
        # Read the just updated ware No. 1 from the database by its identifier.
        db_rows = databases.db1.warehouse.get_wares_by_ware_id(ware.wareId)
        service_log.put("Ware from BD: %s" % db_rows)

        # Exactly one ware must have been returned.
        self.assertEqual(len(db_rows), 1, "Found more than one item.")
        db_ware = db_rows[0]

        # Deserialize the content and update the database record with it.
        deserialized_content = self.deserialize_content(db_ware['content'])
        self.update_data_content(db_ware, deserialized_content)

        # The identifier of the ware must not have changed.
        original_ware = funcy.where(self.list_wares, ware_id=ware.wareId)[0]
        self.assertEqual(ware.wareId, original_ware["ware_id"],
                         "Do not match the identifiers of the ware.")

        # Compare the value returned by the service with the database values.
        self.check_ware(ware_worker=ware, ware_dbase=db_ware)
def get_suits(source_data):
    """
    Group the tests by suites.

    :param source_data: list of dictionaries of all the tests.
    :return: list of folders, each with its ``name`` (folder title),
        ``code_name`` and ``pages`` (code name and feature name of every
        distinct file of the folder).
    """
    # Distinct folders present in the data.
    folder_codes = {record['code_name_folder'] for record in source_data}

    folders_array = []
    for folder in folder_codes:
        # Tests that belong to this particular folder.
        sorting_files = funcy.where(source_data, code_name_folder=folder)
        file_keys = {(record['code_name_file'], record['feature'])
                     for record in sorting_files}
        pages = [
            {"code_name": file_code.decode('utf-8'),
             "name": str(feature).decode('utf-8')}
            for file_code, feature in file_keys
        ]
        folders_array.append({"name": sorting_files[0]['folder_title'],
                              "code_name": folder,
                              "pages": pages})
    return folders_array
def test_add_fav_users(self, limit_users=5):
    """
    Check adding users to Favourites through the addFavorites method.

    :param limit_users: number of users that are added to the favourites
    """
    service_log.run(self)
    # Precondition snapshot of the favourite *users*. The original queried
    # the favourite wares here, which says nothing about the users that are
    # checked after the call.
    before_fav_users = databases.db1.favorites.get_fav_users_by_user_id(self.user_id)

    users = databases.db3.accounting.get_users(limit=limit_users)
    user_ids = [index["id"] for index in users]
    dto_list = self.generate_dto_list_equal_fav_user(self.user_id, user_ids)
    param = self.get_FavoritesAddRequest(dto_list)
    result = services.favorites.root.tframed.addFavorites(param)

    after_fav_users = databases.db1.favorites.get_fav_users_by_user_id(self.user_id)

    # NOTE(review): assumes get_fav_users_by_user_id returns None when the
    # user has no favourites, like get_fav_wares_by_user_id - confirm.
    self.assertIsNone(before_fav_users)
    self.assertEqual(len(result.dtoList), limit_users, "Does not match the number of elements.")
    for index in result.dtoList:
        fav_data = funcy.where(after_fav_users, fav_usr_id=index.content.favUserId)
        self.check_fav_user(user_id=self.user_id, data=index, fav_data=fav_data,
                            fav_type=self.fav_type_user)
def test_find_fav_users_for_user(self, limit_users=5):
    """
    Check the selection of the favourite users of a user.

    :param limit_users: number of users that are added for the check
    """
    service_log.run(self)

    # Add some users to the favourites first.
    candidates = databases.db3.accounting.get_users(limit=limit_users)
    candidate_ids = [candidate["id"] for candidate in candidates]
    add_fav_param = self.get_FavoritesAddRequest(
        self.generate_dto_list_equal_fav_user(self.user_id, candidate_ids))
    services.favorites.root.tframed.addFavorites(add_fav_param)

    # Ask the service for the favourites and read the same from the database.
    param = self.get_UsersFavoritesRequest(user_id=self.user_id, fav_type=self.fav_type_user)
    result = services.favorites.root.tframed.findUsersFavoritesByParams(param)
    stored_favorites = databases.db1.favorites.get_fav_users_by_user_id(self.user_id)

    stored_count = len(stored_favorites)
    self.assertEqual(stored_count, len(result.dtoList), "Different length of lists.")
    self.assertEqual(len(stored_favorites), result.totalCount, "Wrong value totalCount.")

    for favorite in result.dtoList:
        fav_data = funcy.where(stored_favorites, fav_usr_id=favorite.content.favUserId)
        self.check_fav_user(user_id=self.user_id, data=favorite, fav_data=fav_data,
                            fav_type=self.fav_type_user)
def test_add_fav_wares(self, limit_wares=5):
    """
    Check adding wares to Favourites through the addFavorites method.

    :param limit_wares: number of wares that are added to the favourites
    """
    service_log.run(self)
    favorites_before = databases.db1.favorites.get_fav_wares_by_user_id(self.user_id)

    # Build and send the request that adds the wares.
    candidates = databases.db2.warehouse.get_wares_with_limit(limit=limit_wares)
    candidate_ids = [candidate["ware_id"] for candidate in candidates]
    param = self.get_FavoritesAddRequest(
        self.generate_dto_list_equal_fav_ware(self.user_id, candidate_ids))
    result = services.favorites.root.tframed.addFavorites(param)

    favorites_after = databases.db1.favorites.get_fav_wares_by_user_id(self.user_id)

    self.assertIsNone(favorites_before)
    self.assertEqual(len(result.dtoList), limit_wares, "Does not match the number of elements.")
    for added in result.dtoList:
        fav_data = funcy.where(favorites_after, fav_ware_id=added.content.favWareId)
        self.check_fav_ware(user_id=self.user_id, data=added, fav_data=fav_data,
                            fav_type=self.fav_type_ware)
        # TODO: 0 in one case, None in the other - bug: https://jira.oorraa.net/browse/RT-786
        self.assertEqual(added.content.favUserId, 0, "Is not None value favorite users.")
def test_find_fav_wares_for_user(self, limit_wares=5):
    """
    Check the selection of the favourite wares of a user.

    :param limit_wares: number of wares that are added for the check
    """
    service_log.run(self)

    # Add some wares to the favourites first.
    candidates = databases.db2.warehouse.get_wares_with_limit(limit=limit_wares)
    candidate_ids = [candidate["ware_id"] for candidate in candidates]
    add_fav_param = self.get_FavoritesAddRequest(
        self.generate_dto_list_equal_fav_ware(self.user_id, candidate_ids))
    services.favorites.root.tframed.addFavorites(add_fav_param)

    # Ask the service for the favourites and read the same from the database.
    param = self.get_UsersFavoritesRequest(user_id=self.user_id, fav_type=self.fav_type_ware)
    result = services.favorites.root.tframed.findUsersFavoritesByParams(param)
    stored_favorites = databases.db1.favorites.get_fav_wares_by_user_id(self.user_id)

    stored_count = len(stored_favorites)
    self.assertEqual(stored_count, len(result.dtoList), "Different length of lists.")
    self.assertEqual(len(stored_favorites), result.totalCount, "Wrong value totalCount.")

    for favorite in result.dtoList:
        fav_data = funcy.where(stored_favorites, fav_ware_id=favorite.content.favWareId)
        self.check_fav_ware(user_id=self.user_id, data=favorite, fav_data=fav_data,
                            fav_type=self.fav_type_ware)
        self.assertIsNone(favorite.content.favUserId, "Is not None value favorite users.")
def extract_errors(transcoder_job: dict):
    """Return the ``StatusDetail`` of every failed output or playlist.

    The ``Outputs`` and ``Playlists`` of ``transcoder_job['Job']`` are
    combined and the entries whose ``Status`` is ``'Error'`` are kept.

    :param transcoder_job: job description with a ``Job`` entry that holds
        ``Outputs`` and ``Playlists``
    :return: list of the ``StatusDetail`` values of the failed entries
    """
    from funcy import merge, where, lpluck

    job = transcoder_job['Job']
    failed_entries = where(merge(job['Outputs'], job['Playlists']), Status='Error')
    return lpluck('StatusDetail', failed_entries)
def snapshot_file(request, snap_id, format):
    """Redirect to the file of a snapshot that has the requested format.

    :param request: the incoming request (not used by the body)
    :param snap_id: primary key of the ``Snapshot``
    :param format: value the ``format`` of the file has to equal
    :return: a redirect to the ``url`` of the first matching file
    :raises Http404: if the snapshot does not exist or has no file of the
        requested format
    """
    from django.http import Http404

    snap = get_object_or_404(Snapshot, pk=snap_id)
    f = first(where(snap.files, format=format))
    if f is None:
        # An unknown format is a client error, not a server error: without
        # this guard ``f.url`` raises AttributeError and yields a 500.
        raise Http404("Snapshot %s has no file in format %r" % (snap_id, format))
    return redirect(f.url)