def from_row(cls, row, imageable: ImageableType, alias=None):
    table = cls.Options.db_table if alias is None else alias
    return cls(
        id=t.Int().check(row[table.c.id]),
        title=t.String().check(row[table.c.title]),
        uri=str(t.URL.check(row[table.c.uri])),
        imageable=t.Or(t.Type(Author), t.Type(Book), t.Type(Series)).check(imageable),
        created_at=DateTime().check(row[table.c.created_at]),
        updated_at=t.Or(DateTime, t.Null).check(row[table.c.updated_at]),
        is_populated=True
    )
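The `imageable` argument above is checked polymorphically: the related object must be an instance of one of several ORM model classes. A minimal standalone sketch of that pattern, with hypothetical stand-in classes in place of the real Author/Book/Series models:

import trafaret as t

class Author: pass   # hypothetical stand-ins for the ORM models referenced above
class Book: pass
class Series: pass

imageable_check = t.Or(t.Type(Author), t.Type(Book), t.Type(Series))

imageable_check.check(Book())       # an instance of any of the three classes passes
try:
    imageable_check.check('book')   # anything else raises t.DataError
except t.DataError as exc:
    print(exc)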
def from_row(cls, row, books: List['Book'], photos: List['Photo'], alias=None):
    table = cls.Options.db_table if alias is None else alias
    return cls(
        id=t.Int().check(row[table.c.id]),
        name=t.String().check(row[table.c.name]),
        date_of_birth=Date().check(row[table.c.date_of_birth]),
        date_of_death=t.Or(Date, t.Null).check(row[table.c.date_of_death]),
        books=t.List(t.Type(Book)).check(books),
        photos=t.List(t.Type(Photo)).check(photos),
        created_at=DateTime().check(row[table.c.created_at]),
        updated_at=t.Or(DateTime, t.Null).check(row[table.c.updated_at]),
        is_populated=True
    )
def from_row(cls, row, author: 'Author', photos: List['Photo'],
             chapters: List['Chapter'], series: 'Series' = None, alias=None):
    table = cls.Options.db_table if alias is None else alias
    return cls(
        id=t.Int().check(row[table.c.id]),
        title=t.String().check(row[table.c.title]),
        date_published=Date().check(row[table.c.date_published]),
        author=t.Type(Author).check(author),
        photos=t.List(t.Type(Photo)).check(photos),
        chapters=t.List(t.Type(Chapter)).check(chapters),
        series=t.Or(t.Type(Series), t.Null).check(series),
        created_at=DateTime().check(row[table.c.created_at]),
        updated_at=t.Or(DateTime, t.Null).check(row[table.c.updated_at]),
        is_populated=True
    )
class MsgBrokerClient:
    msg_schema_validator = t.Dict({
        t.Key('user'): t.Int(gte=0),
        t.Key('cmd'): t.String(),
        t.Key('data'): t.Type(dict),
        t.Key('priority', default=100): t.Int(gte=0)
    })

    def __init__(self, server_addr: str, service_name: str):
        self.server_addr = server_addr
        self.service_name = service_name
        self.client = Client()
        self.subscriber_queue = asyncio.Queue()

    async def run_client(self) -> None:
        await self.client.connect(servers=[f'nats://{self.server_addr}'],
                                  max_reconnect_attempts=-1)

    async def subscribe(self) -> int:
        return await self.client.subscribe_async(self.service_name, cb=self._handler)

    async def publish(self, receiver, msg: Dict[str, Any], reply=None) -> None:
        try:
            msg = self.pack_msg(msg)
        except t.DataError:
            logger.exception(f'Bad message: {msg}')
        else:
            if reply:
                # TODO: add a timeout check
                return await self.client.publish_request(receiver, reply, msg)
            else:
                asyncio.ensure_future(self.client.publish(receiver, msg))

    @classmethod
    def pack_msg(cls, data: Dict[str, Any]) -> bytes:
        data = cls.msg_schema_validator(data)
        return msgpack.packb(data)

    @classmethod
    def unpack_msg(cls, data: bytes) -> Dict[str, Any]:
        data = msgpack.unpackb(data, encoding='utf-8')
        logger.debug(f'Received message: {data}')
        return cls.msg_schema_validator(data)

    async def _handler(self, msg: 'Msg') -> None:
        try:
            subject = msg.subject
            reply = msg.reply
            data = self.unpack_msg(msg.data)
            msg = Message(subject, data, reply)
        except t.DataError:
            logger.exception(f'Bad message: {msg}')
        else:
            asyncio.ensure_future(self.subscriber_queue.put(msg))

    @property
    def is_connected(self) -> bool:
        return self.client.is_connected
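A brief standalone sketch of how the `msg_schema_validator` above behaves on its own, outside the NATS client (only `trafaret` and `msgpack` are assumed; the schema is copied from the class):

import msgpack
import trafaret as t

msg_schema = t.Dict({
    t.Key('user'): t.Int(gte=0),
    t.Key('cmd'): t.String(),
    t.Key('data'): t.Type(dict),                 # the payload must literally be a dict
    t.Key('priority', default=100): t.Int(gte=0),
})

checked = msg_schema.check({'user': 1, 'cmd': 'ping', 'data': {}})
print(checked['priority'])                       # 100 -- the default is filled in
packed = msgpack.packb(checked)                  # bytes, as in pack_msg()

try:
    msg_schema.check({'user': 1, 'cmd': 'ping', 'data': ['not', 'a', 'dict']})
except t.DataError as exc:
    print(exc.as_dict())                         # the error points at the 'data' key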
def test_type(self):
    res = t.Type(int)
    self.assertEqual(repr(res), '<Type(int)>')
    c = t.Type[int]
    res = c.check(1)
    self.assertEqual(res, 1)
    res = extract_error(c, "foo")
    self.assertEqual(res, 'value is not int')
class User(EmbeddedDocument, UserMixin):
    structure = t.Dict({
        'email': t.Email,
        'password': t.String,
        'first_name': t.String,
        'last_name': t.String,
        'roles': t.List[t.Type(Role)],
        t.Key('active', default=True): t.Bool,
    })
    required_fields = ['email', 'password', 'active']
def from_row(cls, row, book: 'Book', alias=None):
    table = cls.Options.db_table if alias is None else alias
    return cls(
        id=t.Int().check(row[table.c.id]),
        title=t.String().check(row[table.c.title]),
        ordering=t.Int().check(row[table.c.ordering]),
        book=t.Type(Book).check(book),
        created_at=DateTime().check(row[table.c.created_at]),
        updated_at=t.Or(DateTime, t.Null).check(row[table.c.updated_at]),
        is_populated=True
    )
def construct(arg):
    '''
    Shortcut syntax to define trafarets.

    - int, str, float and bool will return t.Int, t.String, t.Float and t.Bool
    - a one-element list will return t.List
    - a tuple or a list with several args will return t.Tuple
    - a dict will return t.Dict. If a key ends with '?' it will be optional and the '?' will be removed
    - any callable will be t.Call
    - otherwise the argument is returned as is

    construct is recursive and will try to construct all lists, tuples and dicts in its args
    '''
    if isinstance(arg, t.Trafaret):
        return arg
    elif isinstance(arg, tuple) or (isinstance(arg, list) and len(arg) > 1):
        return t.Tuple(*(construct(a) for a in arg))
    elif isinstance(arg, list):
        # if len(arg) == 1
        return t.List(construct(arg[0]))
    elif isinstance(arg, dict):
        return t.Dict({
            construct_key(key): construct(value)
            for key, value in arg.items()
        })
    elif isinstance(arg, str):
        return t.Atom(arg)
    elif isinstance(arg, type):
        if arg is int:
            return t.Int()
        elif arg is float:
            return t.Float()
        elif arg is str:
            return t.String()
        elif arg is bool:
            return t.Bool()
        else:
            return t.Type(arg)
    elif callable(arg):
        return t.Call(arg)
    else:
        return arg
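A short usage sketch of `construct` (assuming it is importable together with the `construct_key` helper it relies on; the key ending in '?' illustrates the optional-key shorthand described in the docstring):

import datetime

schema = construct({
    'id': int,                          # -> t.Int()
    'tags': [str],                      # one-element list -> t.List(t.String())
    'point': (float, float),            # tuple -> t.Tuple(t.Float(), t.Float())
    'created?': datetime.datetime,      # arbitrary class -> t.Type(datetime.datetime), optional key
})

schema.check({
    'id': 1,
    'tags': ['a', 'b'],
    'point': (1.0, 2.0),
})                                      # 'created' may be omitted because of the '?' suffix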
def test_repr(self):
    res = t.Type(int)
    assert repr(res) == '<Type(int)>'
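For reference, `t.Type` simply passes a value through when it is an instance of the wrapped class and raises `t.DataError` otherwise; a minimal sketch:

import trafaret as t

checker = t.Type(int)
assert checker.check(1) == 1        # instances pass through unchanged

try:
    checker.check('foo')
except t.DataError as exc:
    print(exc)                      # reports that the value is not int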
class Visitor(EmbeddedDocument):
    confirm_fields = {
        "cid": t.String,
        "sent": t.Bool,
        "letter_id": t.String,
        "confirmed_at": t.Float,
        "confirmed": t.Bool
    }

    structure = t.Dict({
        'name': t.String,
        'email': t.Email,
        'position': t.String,
        'company': t.String,
        'created_at': t.Type(datetime),
        'confirms': t.List(t.Dict(confirm_fields)),
        'tshirt_size': t.String(allow_blank=False),
        t.Key('is_approved', default=False): t.Bool,
        t.Key('is_declined', default=False): t.Bool,
        t.Key('is_confirmed', default=False): t.Bool,
    })

    required_fields = [
        'name', 'email', 'is_approved', 'is_declined', 'confirms'
    ]

    def confirmations(self):
        if not hasattr(self, "confirms"):
            self.confirms = []
        return zip(range(1, len(self.confirms) + 1), self.confirms)
        # map(lambda o: {key: val for key, val in o.as_dict().items()
        #                if key != "id"}, self.confirms))

    @classmethod
    def confirmations_stats(cls):
        confirms = {}
        for visitor in cls.query.find({'confirms.confirmed': True}):
            for n, confirm in visitor.confirmations():
                if n not in confirms:
                    confirms[n] = 0
                if confirm["confirmed"] is True:
                    confirms[n] += 1
        return confirms

    @classmethod
    def tshirt_matrix(cls):
        return {}

    def one_confirm(self, letter_id):
        for n, confirm in self.confirmations():
            if confirm["letter_id"] == str(letter_id):
                return confirm
        return None

    def save(self, *args, **kwargs):
        self.save_confirmation(None, commit=True)

    def save_confirmation(self, confirm, index=None, commit=True):
        to_save = []
        for n, c in self.confirmations():
            if n == index:
                to_save.append(confirm)
            else:
                to_save.append(c)
        if index is None and confirm is not None:
            to_save.append(confirm)
        self.confirms = to_save
        if commit is True:
            super(Visitor, self).save()

    def send_confirmation(self, letter, id=None):
        if id is None:
            id = str(uuid.uuid1())
        for n, conf in self.confirmations():
            if "letter_id" in conf and \
                    conf["letter_id"] == str(letter.id):
                return False
        tasks.send_email(
            self.email, letter.subject, None,
            {
                'visitor': self,
                'id': id,
                'letter_id': letter.id,
                'link': "{}/confirm/{}/{}/".format(DOMAIN, self.id, id)
            },
            template_text=letter.content)
        self.save_confirmation({
            "cid": id,
            "letter_id": str(letter.id),
            "sent": True,
            "confirmed": False,
            "confirmed_at": time.time()
        })
        return True

    def save_registered(self):
        if self.query.find_one({'email': self.email}) is None:
            self.created_at = datetime.utcnow()
            return self.save()
        else:
            return None
It allows users to featurize the data with model.predict. It also lets the featurizer
write the featurized data to the csv containing the images, appending the features to
additional columns in-line with each image row. It also adds an "image_missing" column
automatically for each image_column, containing binary values that indicate whether the
image in that row is missing.
"""

import logging

import trafaret as t
import numpy as np
import pandas as pd
from keras.models import Model


@t.guard(model=t.Type(Model), array=t.Type(np.ndarray))
def featurize_data(model, array):
    """
    Given a model and an array, perform error checking and return the
    prediction of the full feature array.

    Parameters:
    ----------
        model : keras.models.Model
            The featurizer model performing predictions

        array : np.ndarray
            The vectorized array of images being converted into features

    Returns:
    --------
    # if model_str == 'squeezenet':
    #     # Special case for squeezenet - we already have weights for it
    #     this_dir, this_filename = os.path.split(__file__)
    #     model_path = os.path.join(this_dir,
    #                               'saved_models',
    #                               'squeezenet_weights_tf_dim_ordering_tf_kernels.h5')
    #     if not os.path.isfile(model_path):
    #         raise ValueError('Could not find the weights. Download another model'
    #                          ' or replace the SqueezeNet weights in the model folder.')
    #     model.load_weights(model_path)

    logging.info('Model successfully initialized.')
    return model


@t.guard(model=t.Type(Model), depth=t.Int(gte=1))
def _decapitate_model(model, depth):
    """
    Cut off end layers of a model equal to the depth of the desired outputs,
    and then remove the links connecting the new outer layer to the old ones.

    Parameters:
    ----------
        model: keras.models.Model
            The model being decapitated. Note: the original model is not changed,
            the method returns a new model.

        depth: int
            The number of layers to pop off the top of the network

    Returns:
    -------
        model: keras.models.Model
            Decapitated model.
    """
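The `@t.guard` decorator used on `featurize_data` and `_decapitate_model` validates a function's arguments before the body runs and raises a `DataError` subclass on mismatch. A minimal sketch under that assumption, with a hypothetical `scale_array` function standing in for the Keras-based ones above:

import numpy as np
import trafaret as t

@t.guard(array=t.Type(np.ndarray), scale=t.Int(gte=1))
def scale_array(array, scale=1):
    # Arguments are checked by t.guard before this body executes.
    return array * scale

scale_array(np.ones(3), scale=2)     # passes: array([2., 2., 2.])

try:
    scale_array([1, 2, 3], scale=2)  # a plain list is rejected by t.Type(np.ndarray)
except t.DataError as exc:
    print(exc.as_dict())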
class ImageFeaturizer:
    """
    This object can load images, rescale, crop, and vectorize them into a
    uniform batch, and then featurize the images for use with custom classifiers.

    Methods
    ------------------
    __init__(depth, autosample, downsample_size):
    --------------------------------
        Initialize the ImageFeaturizer. Build the featurizer model with the
        depth and feature downsampling specified by the inputs.

    featurize_data(image_columns, image_path, csv_path, new_csv_path, scaled_size, grayscale):
    --------------------------------
        Loads image directory and/or csv into the model, and featurizes the images

    load_data(image_columns, image_path, csv_path, scaled_size, grayscale):
    --------------------------------
        Loads image directory and/or csv into the model, and vectorizes the images
        for input into the featurizer

    featurize_preloaded_data():
    --------------------------------
        Featurize the loaded data, append the features to the csv, and return
        the full dataframe
    """

    @t.guard(depth=t.Int(gte=1, lte=4),
             autosample=t.Bool,
             downsample_size=t.Int(gte=0),
             model=t.Enum(*supported_model_types.keys()))
    def __init__(self,
                 depth=1,
                 autosample=False,
                 downsample_size=0,
                 model='squeezenet'
                 ):
        """
        Initializer.

        Loads an initial InceptionV3 pretrained network, decapitates it and
        downsamples according to user specifications.

        Parameters:
        ----------
            depth : int
                How deep to decapitate the model. Deeper means less specific but
                also less complex

            autosample : bool
                If True, the feature layer is automatically downsampled to the right size.

            downsample_size: int
                The number of features to downsample the featurizer to

        Returns:
        --------
        None. Initializes and saves the featurizer object attributes.
        """
        # BUILDING THE MODEL #
        logging.info("Building the featurizer.")

        featurizer = build_featurizer(depth, autosample,
                                      downsample_size, model_str=model.lower())

        # Saving initializations of model
        self.depth = depth
        self.autosample = autosample
        self.downsample_size = downsample_size
        self.num_features = featurizer.layers[-1].output_shape[-1]

        # Save the model
        self.model_name = model.lower()
        self.featurizer = featurizer
        self.visualize = featurizer.summary

        # Initializing preprocessing variables for after we load and featurize the images
        self.data = np.zeros((1))
        self.features = pd.DataFrame()
        self.df_original = pd.DataFrame()
        self.full_dataframe = pd.DataFrame()
        self.df_features = pd.DataFrame()
        self.csv_path = ''
        self.image_dict = {}
        self.image_columns = ''
        self.image_path = ''

        # Image scaling and cropping
        self.scaled_size = (0, 0)
        self.crop_size = (0, 0)
        self.number_crops = 0
        self.isotropic_scaling = False

    def load_data(self,
                  image_columns,
                  image_path='',
                  image_dict='',
                  csv_path='',
                  grayscale=False,
                  save_data=True,
                  # crop_size = (299, 299),
                  # number_crops = 0,
                  # random_crop = False,
                  # isotropic_scaling = True
                  ):
        """
        Load image directory and/or csv, and vectorize the images for input
        into the featurizer.

        Parameters:
        ----------
            image_columns : str
                the name of the column holding the image data, if a csv exists,
                or what the name of the column will be, if generating the csv
                from a directory

            image_path : str
                the path to the folder containing the images. If using URLs, leave blank

            csv_path : str
                the path to the csv. If just using a directory, leave blank.
                If the csv exists, this is the path where the featurized csv will be generated.

        # These features haven't been implemented yet.
        # grayscale : bool
        #     Flags the image as grayscale
        #
        # isotropic_scaling : bool
        #     If True, images are scaled keeping proportions and then cropped
        #
        # crop_size: tuple
        #     If the image gets cropped, decides the size of the crop
        #
        # random_crop: bool
        #     If False, only take the center crop. If True, take random crop
        #
        """
        # Fix column headers and image path if they haven't been done, build path for new csv
        image_columns, image_path = _input_fixer(image_columns, image_path)

        # If there's no dataframe, build it!
        if csv_path == '':
            if len(image_columns) > 1:
                raise ValueError('If building the dataframe from an image directory, the featurizer '
                                 'can only create a single image column. If two image columns are '
                                 'needed, please create a csv to pass in.')

        # If the image_dict hasn't been passed in (which only happens in batch processing),
        # build the full image dict and save the original dataframe
        if not image_dict:
            image_dict, df = _build_image_dict(image_path, csv_path, image_columns)
            self.df_original = df
            self.full_dataframe = df
            self.image_columns = image_columns
            self.image_dict = image_dict

        scaled_size, full_image_data = \
            self._load_data_helper(self.model_name, image_columns,
                                   image_path, image_dict, csv_path, grayscale)

        # Save all of the necessary data to the featurizer
        if save_data:
            self.data = full_image_data
            self.csv_path = csv_path
            self.image_path = image_path
            self.scaled_size = scaled_size

        return full_image_data

    @t.guard(batch_data=t.Type(np.ndarray),
             image_columns=t.List(t.String(allow_blank=True)) | t.String(allow_blank=True),
             batch_processing=t.Bool,
             features_only=t.Bool,
             save_features=t.Bool,
             save_csv=t.Bool,
             new_csv_path=t.String(allow_blank=True),
             omit_model=t.Bool,
             omit_depth=t.Bool,
             omit_output=t.Bool,
             omit_time=t.Bool,
             )
    def featurize_preloaded_data(self, batch_data=np.zeros((1)), image_columns='',
                                 batch_processing=False, features_only=False,
                                 save_features=False, save_csv=False, new_csv_path='',
                                 omit_model=False, omit_depth=False, omit_output=False,
                                 omit_time=False):
        """
        Featurize the loaded data, returning the dataframe and writing the
        features and the full combined data to csv.

        Parameters
        ----------

        Returns
        -------
            full_dataframe or df_features: pandas.DataFrame
                If features_only, this returns a DataFrame containing the features.
                Otherwise, it returns a DataFrame containing the features appended
                to the original csv. If save_csv is set to True, it also writes
                csvs to the same path as the csv containing the list of names.
        """
        # If the batch data isn't passed in, then load the full data from the attributes
        if np.array_equal(batch_data, np.zeros((1))):
            batch_data = self.data
        if image_columns == '':
            image_columns = self.image_columns
        if isinstance(image_columns, str):
            image_columns = [image_columns]

        # Check data has been loaded, and that the data was vectorized correctly
        if np.array_equal(batch_data, np.zeros((1))):
            raise IOError('Must load data into the model first. Call load_data.')

        # If batch processing, make sure we're only doing a single column at a time.
        # Otherwise, make sure the number of columns matches the first dimension of the data
        if batch_processing:
            assert len(image_columns) == 1 or isinstance(image_columns, str)
        else:
            assert len(image_columns) == batch_data.shape[0]

        logging.info("Trying to featurize data.")

        # Initialize featurized data vector with appropriate size
        features = np.zeros((batch_data.shape[1], self.num_features * len(image_columns)))

        # Get the image features
        df_features = self._featurize_helper(features, image_columns, batch_data)

        # Save features if boolean set to True
        if save_features:
            self.features = df_features

        # If called with features_only, returns only the features
        if features_only:
            return df_features

        # Save the image features with the original dataframe
        full_dataframe = pd.concat([self.df_original, df_features], axis=1)

        # If batch processing, this is only the batch dataframe. Otherwise, this is the actual
        # full dataframe.
        if not batch_processing:
            self.full_dataframe = full_dataframe

        # Save csv if called
        if save_csv:
            self.save_csv(new_csv_path=new_csv_path, omit_model=omit_model,
                          omit_depth=omit_depth, omit_output=omit_output,
                          omit_time=omit_time, save_features=save_features)

        return full_dataframe

    @t.guard(image_columns=t.List(t.String(allow_blank=True)) | t.String(allow_blank=True),
             image_path=t.String(allow_blank=True),
             csv_path=t.String(allow_blank=True),
             new_csv_path=t.String(allow_blank=True),
             batch_processing=t.Bool,
             batch_size=t.Int,
             save_data=t.Bool,
             save_features=t.Bool,
             save_csv=t.Bool,
             omit_time=t.Bool,
             omit_model=t.Bool,
             omit_depth=t.Bool,
             omit_output=t.Bool,
             verbose=t.Bool,
             grayscale=t.Bool
             )
    def featurize(self,
                  image_columns,
                  image_path='',
                  csv_path='',
                  new_csv_path='',
                  batch_processing=True,
                  batch_size=1000,
                  save_data=False,
                  save_features=False,
                  save_csv=False,
                  omit_time=False,
                  omit_model=False,
                  omit_depth=False,
                  omit_output=False,
                  verbose=True,
                  grayscale=False
                  # crop_size = (299, 299),
                  # number_crops = 0,
                  # random_crop = False,
                  # isotropic_scaling = True
                  ):
        """
        Load image directory and/or csv, and vectorize the images for input into the
        featurizer. Then, featurize the data.

        Parameters:
        ----------
            image_columns : list of str
                list of the names of the columns holding the image data, if a csv exists,
                or what the name of the column will be, if generating the csv from a directory

            image_path : str
                the path to the folder containing the images. If using URLs, leave blank

            csv_path : str
                the path to the csv. If just using a directory, leave blank, and specify the
                path for the generated csv in new_csv_path. If the csv exists, this is the path
                where the featurized csv will be generated.

            new_csv_path : str
                the path to the new csv, if one is being generated from a directory. If no csv
                exists, this is the path where the featurized csv will be generated

            grayscale : bool
                Decides if image is grayscale or not. May get deprecated. Don't think it works
                on the InceptionV3 model due to input size.

            # These features haven't been implemented yet.
            # isotropic_scaling : bool
            #     if True, images are scaled keeping proportions and then cropped
            #
            # crop_size: tuple
            #     if the image gets cropped, decides the size of the crop
            #
            # random_crop: bool
            #     If False, only take the center crop. If True, take random crop

        Returns:
        --------
            full_dataframe : Dataframe containing the features appended to the original csv.
            Also writes csvs containing the features only and the full dataframe to the
            same path as the csv containing the list of names
        """
        if not image_path and not csv_path:
            raise ValueError("Must specify either image_path or csv_path as input.")

        # Set logging level
        if verbose:
            logger.setLevel(logging.INFO)

        # Fix column headers and image path if necessary
        image_columns, image_path = _input_fixer(image_columns, image_path)

        # Find the full image dict and save the original dataframe. This is required early to know
        # how many images exist in total, to control batch processing.
        full_image_dict, df_original = _build_image_dict(image_path, csv_path, image_columns)

        # Save the fixed inputs and full image dict
        self.df_original = df_original
        self.image_columns = image_columns
        self.image_dict = full_image_dict

        # Users can turn off batch processing by either setting batch_processing to false, or
        # setting batch_size to 0
        if batch_processing and batch_size:
            # Perform batch processing, and save the full dataframe and the full features dataframe
            features_df = self._batch_processing(full_image_dict, image_columns, image_path,
                                                 csv_path, batch_size, grayscale)

        # If batch processing is turned off, load the images in one big batch and featurize them all
        else:
            logger.info("Loading full data tensor without batch processing. If you "
                        "experience a memory error, make sure batch processing is enabled.")
            full_data = self.load_data(image_columns, image_path, full_image_dict, csv_path,
                                       grayscale, save_data)
            features_df = \
                self.featurize_preloaded_data(full_data, image_columns=image_columns,
                                              features_only=True)

        # Save the full dataframe with the features
        full_df = pd.concat([df_original, features_df], axis=1)
        self.full_dataframe = full_df

        # Save features and csv if flags are enabled
        if save_features:
            self.features = features_df
        if save_csv:
            self.save_csv(new_csv_path=new_csv_path, omit_model=omit_model,
                          omit_depth=omit_depth, omit_output=omit_output,
                          omit_time=omit_time, save_features=save_features)

        # Return the full featurized dataframe
        return full_df

    def save_csv(self, new_csv_path='', omit_model=False, omit_depth=False,
                 omit_output=False, omit_time=False, save_features=False):
        """
        """
        if self.full_dataframe.empty:
            raise AttributeError('No dataframe has been featurized.')

        # Save the name and extension separately, for robust naming
        if not new_csv_path:
            new_csv_path = self.csv_path or DEFAULT_NEW_CSV_PATH
            csv_name, ext = os.path.splitext(new_csv_path)
            name_path = _named_path_finder("{}_featurized".format(csv_name), self.model_name,
                                           self.depth, self.num_features, omit_model,
                                           omit_depth, omit_output, omit_time)
        else:
            name_path, ext = os.path.splitext(new_csv_path)

        _create_csv_path(name_path)

        logger.warning("Saving full dataframe to csv as {}{}".format(name_path, ext))
        self.full_dataframe.to_csv("{}{}".format(name_path, ext), index=False)

        if save_features:
            logger.warning("Saving features to csv as {}_features_only{}".format(name_path, ext))
            self.df_features.to_csv("{}_features_only{}".format(name_path, ext), index=False)

    @t.guard(confirm=t.Bool)
    def clear_input(self, confirm=False):
        """
        Clear all input for the model. Requires the user to confirm with an additional
        "confirm" argument in order to run.

        Parameters:
        ----------
            confirm : bool
                Users are required to set this to True in order to clear all attributes
                from the featurizer
        """
        if not confirm:
            raise ValueError('If you\'re sure you would like to clear the inputs of this model, '
                             'rerun the function with the following argument: '
                             'clear_input(confirm=True). '
                             'This operation cannot be reversed.')

        self.data = np.zeros((1))
        self.features = pd.DataFrame()
        self.full_dataframe = pd.DataFrame()
        self.csv_path = ''
        self.image_list = ''
        self.image_columns = ''
        self.image_path = ''

    # ###################
    # Helper Functions! #
    # ###################

    def _load_data_helper(self, model_name, image_columns, image_path,
                          image_dict, csv_path, grayscale):
        """
        This function helps load the image data from the image directory and/or csv.

        It can be called by either batch processing, where each column is handled separately
        in the parent function and the data is loaded in batches, or it can be called without
        batch processing, where the columns must each be loaded and concatenated here.

        Parameters:
        ----------
            model_name : str
                The name of the model type, which determines scaling size

            image_columns : list
                A list of the image column headers

            image_path : str
                Path to the image directory

            image_dict : dict
                This is a dictionary containing the names of each image column as a key,
                along with all of the image paths for that column.

            csv_path : str
                Path to the csv

            grayscale : bool
                Whether the images are grayscale or not
        """
        # Save size that model scales to
        scaled_size = SIZE_DICT[model_name]

        # Save the full image tensor, the path to the csv, and the list of image paths
        image_data, list_of_image_paths = \
            preprocess_data(image_columns[0], model_name, image_dict[image_columns[0]],
                            image_path, csv_path, scaled_size, grayscale)

        image_data_list = [np.expand_dims(image_data, axis=0)]

        # If there is more than one image column, repeat this process for each
        if len(image_columns) > 1:
            for column in image_columns[1:]:
                image_data, list_of_image_paths = \
                    preprocess_data(column, model_name, image_dict[column],
                                    image_path, csv_path, scaled_size, grayscale)
                image_data_list.append(np.expand_dims(image_data, axis=0))

        full_image_data = np.concatenate(image_data_list)

        return scaled_size, full_image_data

    def _featurize_helper(self, features, image_columns, batch_data):
        """
        This function featurizes the data for each image column, and creates the
        features array from all of the featurized columns.

        Parameters:
        ----------
            features : array
                Array of features already computed

            image_columns : list
                A list of the image column headers

            batch_data : array
                The batch-loaded image data (which may be the full array if not
                running with batches)
        """
        # Save the initial features list
        features_list = []

        # For each image column, perform the full featurization and add the features to the df
        for column in range(batch_data.shape[0]):

            # Featurize the data, and save it to the appropriate columns
            partial_features = featurize_data(self.featurizer, batch_data[column])

            features[:, self.num_features * column:
                     self.num_features * column + self.num_features] = partial_features

            # Save the full dataframe
            df_features = \
                create_features(batch_data[column], partial_features, image_columns[column])

            features_list.append(df_features)

        df_features = pd.concat(features_list, axis=1)
        return df_features

    def _batch_processing(self, full_image_dict, image_columns, image_path='',
                          csv_path='', batch_size=1000, grayscale=False):
        """
        This function handles batch processing. It takes the full list of images
        that need to be processed and loads/featurizes the images in batches.

        Parameters:
        ----------
            full_image_dict : dict
                This is a dictionary containing the names of each image column as a key,
                along with all of the image paths for that column.
            image_columns : list
                A list of the image column headers

            df_original : pandas.DataFrame
                The original dataframe (not containing the image features)

            image_path : str
                Path to the image directory

            csv_path : str
                Path to the csv

            batch_size : int
                The number of images processed per batch

            grayscale : bool
                Whether the images are grayscale or not
        """
        features_df = pd.DataFrame()
        features_df_columns_list = []

        # Iterate through each image column
        for column_index in range(len(image_columns)):

            # Initialize the batch index and save the column name
            index = 0
            batch_number = 0
            column = image_columns[column_index]
            batch_features_df = pd.DataFrame()

            # Get the list of image paths and the number of images in this column
            list_of_image_paths = full_image_dict[column]
            num_images = len(list_of_image_paths)

            batch_features_list = []

            # Loop through the images, featurizing each batch
            if len(image_columns) > 1:
                logger.info("Featurizing column #{}".format(column_index + 1))

            while index < num_images:
                tic = time.clock()

                # Cap the batch size against the total number of images left to prevent overflow
                if index + batch_size > num_images:
                    batch_size = num_images - index

                # Create a dictionary for just the batch of images
                batch_image_dict = {column: full_image_dict[column][index:index + batch_size]}

                # Load the images
                logger.info("Loading image batch.")
                batch_data = self.load_data(column, image_path, batch_image_dict, csv_path,
                                            grayscale, save_data=False)

                logger.info("\nFeaturizing image batch.")

                # If this is the first batch, the batch features will be saved alone.
                # Otherwise, they are concatenated to the last batch
                batch_features_list.append(self.featurize_preloaded_data(batch_data, column,
                                                                         features_only=True,
                                                                         batch_processing=True))

                # Increment index by batch size
                index += batch_size
                batch_number += 1

                # Give update on time and number of images left in column
                remaining_batches = int(math.ceil(num_images - index) / batch_size)
                logger.info("Featurized batch #{}. Number of images left: {}\n"
                            "Estimated total time left: {} seconds\n".format(
                                batch_number, num_images - index,
                                int((time.clock() - tic) * remaining_batches)))

            # After the full column's features are calculated, concatenate them all and append
            # them to the full DataFrame list
            batch_features_df = pd.concat(batch_features_list, ignore_index=True)
            features_df_columns_list.append(batch_features_df)

        # Once all the features are created for each column, concatenate them together for both
        # the features dataframe and the full dataframe
        features_df = pd.concat(features_df_columns_list, axis=1)

        # Return the full dataframe and features dataframe
        return features_df
    unique_strings_list,
    ensure_list,
)
from .decimal import Decimal
from .format import format_trafaret

__VERSION__ = (0, 2, 1)

check_number = t.OnError(t.Float() | Decimal(), 'Not a number')

json_schema_type = (
    t.Atom('null') & just(t.Null())
    | t.Atom('boolean') & just(t.Bool())
    | t.Atom('object') & just(t.Type(dict))
    | t.Atom('array') & just(t.Type(list))
    | t.Atom('number') & just(check_number)
    | t.Atom('integer') & just(t.Int())
    | t.Atom('string') & just(t.String())
)


def multipleOf(multiplier):
    def check(value):
        if value % multiplier != 0:
            return t.DataError('%s is not divisible by %s' % (value, multiplier))
        return value
    return check
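A quick sketch of how `json_schema_type` is meant to be used: the left side of each `&` matches a JSON Schema type name, and `just(...)` (a helper from this package, not shown here) replaces the matched string with a ready-made trafaret for that type. Assuming `just` behaves that way:

int_trafaret = json_schema_type.check('integer')   # yields a t.Int()-style trafaret
int_trafaret.check(42)

obj_trafaret = json_schema_type.check('object')    # yields t.Type(dict)
obj_trafaret.check({'a': 1})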
        'data': t.Or(t.List(t.Any), t.String),  # check if data is optional, and possible values
        'meta': t.Dict(
            {
                'rc': t.Enum('ok', 'error'),  # check if other rc's could exist
                t.Key('msg', optional=True): t.String,
            },
            ignore_extra='*'),
    },
    ignore_extra='*')

_base_time_params = t.Dict({
    'start': t.Or(t.Float, t.Type(datetime), t.Atom(None)),
    'end': t.Or(t.Float, t.Type(datetime), t.Atom(None)),
})

_base_time_site_params = _base_time_params.merge({'site': SiteName})

_base_time_op_site_params = _base_time_params.merge(
    {'site': t.Or(SiteName, t.Atom(None))})

_inner_stats_extras = t.Dict({
    'gran': t.Enum('5minutes', 'hourly', 'daily'),
    'def_range': t.Int,
})

# will not check for port in base_url for now
init_params = t.Dict({
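The `start`/`end` keys above accept a Unix timestamp, a `datetime` instance, or `None` (via `t.Atom(None)`). A small sketch reusing just that piece:

from datetime import datetime
import trafaret as t

time_params = t.Dict({
    'start': t.Or(t.Float, t.Type(datetime), t.Atom(None)),
    'end': t.Or(t.Float, t.Type(datetime), t.Atom(None)),
})

time_params.check({'start': 1514764800.0, 'end': datetime(2018, 1, 2)})  # floats and datetimes pass
time_params.check({'start': None, 'end': None})                          # None means "no bound"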
import pydash as _
import trafaret as t
import datetime
from functools import partial
from jinja2.utils import import_string
from trafaret.contrib.object_id import MongoId
from trafaret.contrib.rfc_3339 import DateTime

Optional = partial(t.Key, optional=True)

SimpleType = t.IntRaw | t.Bool | t.String | t.FloatRaw
DateTimeType = DateTime | t.Type(datetime.datetime)
NumericType = t.Float | t.Int >> (lambda val: float(val))
URLType = t.Regexp(r'^([a-z]{2,5}:)?(\/\/?)?[a-z][a-z0-9\.\-\/]+$')
OptionValue = t.String(allow_blank=True) | t.Bool | t.Float | t.Int | t.Type(dict)

Optional = partial(t.Key, optional=True)

SimpleDoc = t.Dict({
    t.Key('id', optional=True) >> '_id': MongoId,
    Optional('_id'): MongoId
})

TimestampDoc = SimpleDoc + t.Dict({
    Optional('created', default=datetime.datetime.now): DateTimeType | t.Null,
    Optional('modified'): DateTimeType
})
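For illustration, a minimal sketch of how the `OptionValue` union above behaves, reusing the definitions from this snippet:

OptionValue.check(True)        # booleans pass via t.Bool
OptionValue.check({'a': 1})    # plain dicts pass via t.Type(dict)
OptionValue.check('')          # blank strings are allowed because of allow_blank=True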