def _prepare_tables(self, df_1, df_2, feature_1=None, feature_2=None): """ Prepares tables for matching. Left match on df_1 and df_2 Parameters ---------- df_1 : pd.DataFrame Dataframe that will be used for matching. df_2 : pd.DataFrame Dataframe that will be used for matching. feature_1 : str, by default None List of features of df_1 that will be used for matching. None indicates all features will be used. feature_2 : str, by default None List of features of df_2 that will be used for matching. None indicates all features will be used. Returns ------- df_1 : pd.DataFrame df_1 with only those columns that will be used for matching. df_2 : pd.DataFrame df_2 with only those columns that will be used for matching. Raises ------ SunpyUserWarning If number of features for df_1 is not equal to number of features for df_2. SunpyUserWarning if key from feature_1 is not present in df_1 SunpyUserWarning if key from feature_2 is not present in df_2 """ if feature_1 is None: feature_1 = df_1.columns.values if feature_2 is None: feature_2 = df_2.columns.values if len(feature_1) != len(feature_2): raise SunpyUserWarning( "The number of columns to match the rows on must be the same.") try: df_1 = df_1[feature_1] except KeyError: raise SunpyUserWarning("The features specified for table 1 do not " "correspond to any columns in table 1.") try: df_2 = df_2[feature_2] except KeyError: raise SunpyUserWarning("The features specified for table 2 do not " "correspond to any columns in table 2.") return df_1, df_2
def get_nearest_observation(self, obsdate: str):
    """
    Returns the observation time and date in the Timesfits that is
    closest to the given observation time and date.

    Parameters
    ----------
    obsdate : str
        The observation time and date. Anything ``pandas.Timestamp``
        accepts (including a ``pandas.Timestamp``) is supported.

    Returns
    -------
    closest_observation : str
        Observation time and date in the Timesfits that is
        closest to the given observation time and date.

    Warns
    -----
    SunpyUserWarning
        If the nearest observation is not exactly ``obsdate``.

    Examples
    --------
    >>> from pythia.seo import Sunspotter
    >>> sunspotter = Sunspotter()
    >>> obsdate = '2000-01-01 22:47:02'
    >>> sunspotter.get_nearest_observation(obsdate)
    '2000-01-01 12:47:02'
    """
    # Local import keeps this method usable regardless of module aliases.
    import pandas as pd

    requested = pd.Timestamp(obsdate)

    # A nearest-neighbour lookup needs a unique, monotonic index, hence the
    # explicit de-duplication and sort.
    unique_dates = self.timesfits.index.unique().sort_values()

    # `Index.get_loc(..., method='nearest')` was removed in pandas 2.0;
    # `get_indexer` is the supported way to do the same lookup.
    position = unique_dates.get_indexer([requested], method='nearest')[0]
    nearest = unique_dates[position]

    # Compare timestamps rather than strings so that an equal instant given
    # in another textual format does not trigger a spurious warning.
    if nearest != requested:
        warnings.warn(
            SunpyUserWarning(
                "The given observation date isn't in the Timesfits file.\n"
                "Using the observation nearest to the given obsdate instead."
            ))

    return str(nearest)
def match_cosine(self, df_1, df_2):
    """
    Match rows of two tables by their cosine similarity.

    Parameters
    ----------
    df_1: `pd.DataFrame`
        Table whose rows are to be matched.
    df_2: `pd.DataFrame`
        Table providing the candidate rows.

    Returns
    -------
    result: `numpy.ndarray`
        For every row of ``df_1``, the index of the row of ``df_2`` with
        the highest cosine similarity.
    match_score: `numpy.ndarray`
        For every row of ``df_1``, the similarity of that best match.

    Raises
    ------
    SunpyUserWarning
        If scikit-learn cannot be imported.
    """
    # scikit-learn is imported lazily so that it stays an optional dependency.
    try:
        from sklearn.metrics.pairwise import cosine_similarity
    except ImportError:
        raise SunpyUserWarning(
            "Table Matcher requires Scikit Learn to be installed")

    # similarity[i, j] compares row i of df_1 with row j of df_2.
    similarity = cosine_similarity(X=df_1, Y=df_2)

    best_rows = np.argmax(similarity, axis=1)
    best_scores = np.max(similarity, axis=1)
    return best_rows, best_scores
def prepare_data(self):
    """
    Validates the configured data and builds the dataset splitters.

    ``self.data`` is loaded with ``pd.read_csv`` when it is a string. When
    ``self.X_col`` is ``None`` every column that is not a target column is
    used as a feature column (in the order of the DataFrame columns).
    Finally ``self.train_test_splitter`` and ``self.train_val_splitter``
    are created.

    Raises
    ------
    TypeError
        If the data argument is neither a string nor a DataFrame.
    ValueError
        If train test split is not a float strictly between 0 and 1.
    ValueError
        If train val split is not a float strictly between 0 and 1.

    # TODO : Add support for K fold cross validataion. only 1 split supported as of now.
    """
    if isinstance(self.data, str):
        self.data = pd.read_csv(self.data)
    elif not isinstance(self.data, pd.DataFrame):
        raise TypeError(
            "Explicitely passed data must be a pandas Dataframe")

    if self.X_col is None:
        warnings.warn(
            SunpyUserWarning(
                "No Feature Columns specified." +
                "Assuming all columns except target columns to be feature columns."
            ))
        # A bare string names exactly one column; calling ``set`` on it would
        # split it into characters and exclude the wrong columns.
        if isinstance(self.y_col, str):
            target_columns = [self.y_col]
        else:
            target_columns = list(self.y_col)
        # Keep a list: it preserves column order and, unlike a set, is a
        # valid DataFrame indexer.
        self.X_col = [
            column for column in self.data.columns
            if column not in target_columns
        ]

    if not isinstance(
            self.train_test_split, float
    ) or self.train_test_split >= 1 or self.train_test_split <= 0:
        raise ValueError(
            "train test split must be a fraction between 0 and 1")

    if not isinstance(
            self.train_val_split, float
    ) or self.train_val_split >= 1 or self.train_val_split <= 0:
        raise ValueError(
            "train val split must be a fraction between 0 and 1")

    if self.is_regression is True and self.stratified_shuffle is True:
        warnings.warn(
            "Cannot use Stratified Shuffling with Regression tasks. Defaulting to Random Shuffling."
        )
        self.stratified_shuffle = False

    if self.stratified_shuffle is True:
        splitter = StratifiedShuffleSplit
    else:
        splitter = ShuffleSplit

    self.train_test_splitter = splitter(n_splits=self.num_splits,
                                        test_size=self.train_test_split)
    self.train_val_splitter = splitter(n_splits=self.num_splits,
                                       test_size=self.train_val_split)
def pytest_runtest_setup(item):
    """
    pytest hook run before each test item.

    Skips test functions carrying the 'remote_data' keyword when the
    pytest_remotedata plugin is not installed, and refuses to start a test
    while pyplot figures from an earlier test are still open.
    """
    wants_remote_data = (isinstance(item, pytest.Function)
                         and 'remote_data' in item.keywords)
    if wants_remote_data and not HAVE_REMOTEDATA:
        pytest.skip("skipping remotedata tests as pytest-remotedata is not installed")

    # Confirm that the pyplot figure stack is empty before the test
    if HAVE_MATPLOTLIB and plt.get_fignums():
        raise SunpyUserWarning(f"There are stale pyplot figures prior to running {item.name}")
def __init__(self, match_type='cosine'): """ Parameters ---------- match_type : str, optional The row matching algorithm, by default 'cosine' Raises ------ SunpyUserWarning If unrecognized match type is passed. """ self.match_type = match_type if self.match_type not in ['cosine', 'euclidean']: raise SunpyUserWarning('Incorrect matching algorithm specified.')
def __init__(self, *, data, X_col, y_col, root_dir='data/all_clear/mdi/MDI/fits/', transform=None, is_fits=True, is_tabular=False): """ Parameters ---------- data : pd.DataFrame The Dataframe with the FITS data information. X_col : list or str Data Columns y_col : list or str Label Column root_dir : str, optional Path to the FITS files, by default 'data/all_clear/mdi/MDI/fits/' transform : torchvision.transforms, optional Data transforms, by default None is_fits : bool, optional Is the input Data in FITS files. is_tabular : bool, optional Is the input Data in Tabular. """ if not isinstance(y_col, (str, list)): raise TypeError( "y_col must be a list or string denoting the label column") if is_tabular is True and is_fits is True: warnings.warn( SunpyUserWarning( "`is_tabular` and `is_fits` flags both cannot be simultaneously True " "Using tabular data for analysis")) self.data = data self.X_col = X_col self.y_col = y_col self.root_dir = root_dir self.transform = transform self.is_fits = is_fits self.is_tabular = is_tabular self.X = self.data[self.X_col] self.y = self.data[self.y_col]
def train_dataloader(self):
    """
    Returns the Training Dataloader.

    When ``self.weighted_sampling`` is True, every training sample is
    drawn with a weight equal to the inverse frequency of its class, using
    a ``WeightedRandomSampler``. Weighted sampling is switched off (with a
    warning) for regression tasks and when more than one label column is
    configured.

    Returns
    -------
    Dataloader : torch.DataLoader
        The Training Dataloader.

    Warns
    -----
    UserWarning
        If weighted sampling was requested for a regression task.
    SunpyUserWarning
        If weighted sampling was requested with more than one label column.
    """
    if self.is_regression is True and self.weighted_sampling is True:
        warnings.warn(
            "Cannot use Weighted Sampling with Regression tasks. Defaulting to Random Shuffling."
        )
        self.weighted_sampling = False

    # Only complain about multiple label columns when weighted sampling was
    # actually requested, and fall back as the message promises instead of
    # aborting.
    has_multiple_targets = isinstance(self.y_col, list) and len(self.y_col) > 1
    if has_multiple_targets and self.weighted_sampling is True:
        warnings.warn(
            SunpyUserWarning(
                "Weighted Sampling does not work with multiclass classification." +
                " Defaulting to random sampling."))
        self.weighted_sampling = False

    if self.weighted_sampling is not True:
        # NOTE(review): no ``shuffle=True`` is passed here although the
        # warnings talk about random shuffling - confirm this is intended.
        return DataLoader(self.train_dataset, batch_size=self.batch_size)

    y_col = self.y_col[0] if isinstance(self.y_col, list) else self.y_col
    labels = np.asarray(self.train[y_col])

    # Key the weights by the class label itself rather than by its position
    # in ``classes``, so labels need not be exactly 0..k-1.
    classes, class_counts = np.unique(labels, return_counts=True)
    inverse_frequency = (1.0 / class_counts).tolist()
    class_weights = dict(zip(classes.tolist(), inverse_frequency))

    weight_list = [class_weights[label] for label in labels.tolist()]

    sampler = torch.utils.data.sampler.WeightedRandomSampler(
        weight_list, len(weight_list))

    return DataLoader(self.train_dataset,
                      batch_size=self.batch_size,
                      sampler=sampler)
def verify(self, match_score, threshold):
    """
    Check the quality of every match and warn about doubtful ones.

    For 'euclidean' matching a score above the threshold is flagged, for
    'cosine' matching a score below the threshold is flagged.

    Parameters
    ----------
    match_score: `numpy.ndarray`
        Array of size `(n,)` where n is the number of rows in df_1.
        Contains match score for corresponding best matches.
    threshold: `float`
        Score separating proper matches from doubtful ones.

    Warns
    -----
    SunpyUserWarning
        Once for every index whose score fails the check.
    """
    # Per algorithm: predicate telling whether a score is on the wrong side
    # of the threshold.
    is_doubtful = {
        'euclidean': lambda score, limit: score > limit,
        'cosine': lambda score, limit: score < limit,
    }

    for index, score_value in enumerate(match_score):
        check = is_doubtful[self.match_type]
        if check(score_value, threshold):
            warnings.warn(SunpyUserWarning(f"\nMatch at Index {index} is likely to be incorrect\n"))
def _get_data(self, delimiter: str):
    """
    Load the Timesfits, Properties and (optionally) Classifications CSVs.

    Each of ``self.timesfits``, ``self.properties`` and
    ``self.classifications`` is passed to ``pd.read_csv`` and then replaced
    by the resulting DataFrame. Timesfits is indexed by ``obs_date``
    (parsed with ``self.datetime_fmt``) and Properties by ``id_filename``
    when those columns are present. Classifications is only read when
    ``self.classifications`` is not None.

    Parameters
    ----------
    delimiter : str
        Delimiter handed to ``pd.read_csv`` for every file.

    Raises
    ------
    SunpyUserWarning
        If reading a file raises a ``ValueError``.
    SunpyUserWarning
        If a loaded file lacks one of the required columns.
    SunpyUserWarning
        If Classifications is to be loaded but
        ``self.classifications_columns`` is None.
    """
    # Reading the Timesfits file
    try:
        timesfits_options = {'delimiter': delimiter}
        if not self.get_all_timesfits_columns:
            timesfits_options['usecols'] = self.timesfits_columns
        self.timesfits = pd.read_csv(self.timesfits, **timesfits_options)
    except ValueError:
        raise SunpyUserWarning(
            "Sunspotter Object cannot be created."
            " Either the Timesfits columns do not match, or the file is corrupted"
        )

    if not self.timesfits_columns.issubset(self.timesfits.columns):
        present = self.timesfits_columns.intersection(self.timesfits.columns)
        missing_columns = ", ".join(self.timesfits_columns - present)
        raise SunpyUserWarning(
            "Sunspotter Object cannot be created."
            " The Timesfits CSV is missing the following columns: " +
            missing_columns)

    if 'obs_date' in self.timesfits.columns:
        self.timesfits['obs_date'] = pd.to_datetime(
            self.timesfits.obs_date, format=self.datetime_fmt)
        self.timesfits.set_index("obs_date", inplace=True)

    # Reading the Properties file
    try:
        properties_options = {'delimiter': delimiter}
        if not self.get_all_properties_columns:
            properties_options['usecols'] = self.properties_columns
        self.properties = pd.read_csv(self.properties, **properties_options)
    except ValueError:
        raise SunpyUserWarning(
            "Sunspotter Object cannot be created."
            " Either the Properties columns do not match, or the file is corrupted"
        )

    if not self.properties_columns.issubset(self.properties.columns):
        present = self.properties_columns.intersection(self.properties.columns)
        missing_columns = ", ".join(self.properties_columns - present)
        raise SunpyUserWarning(
            "Sunspotter Object cannot be created."
            " The Properties CSV is missing the following columns: " +
            missing_columns)

    if 'id_filename' in self.properties.columns:
        self.properties.set_index("id_filename", inplace=True)

    # Reading the Classification file (optional)
    if self.classifications is None:
        return

    if self.classifications_columns is None:
        raise SunpyUserWarning(
            "Classifications columns cannot be None"
            " when classifications.csv is to be loaded.")

    try:
        self.classifications = pd.read_csv(
            self.classifications,
            delimiter=delimiter,
            usecols=self.classifications_columns)
    except ValueError:
        raise SunpyUserWarning(
            "Sunspotter Object cannot be created."
            " Either the Classifications columns do not match, or the file is corrupted"
        )

    # Converted to a set only after reading, so the subset test below works.
    self.classifications_columns = set(self.classifications_columns)

    if not self.classifications_columns.issubset(self.classifications.columns):
        present = self.classifications_columns.intersection(
            self.classifications.columns)
        missing_columns = ", ".join(self.classifications_columns - present)
        raise SunpyUserWarning(
            "Sunspotter Object cannot be created."
            " The Classifications CSV is missing the following columns: " +
            missing_columns)
def __init__( self, score_board: pd.DataFrame, *, k_value=32, default_score=1400, max_comparisons=50, max_score_change=32, min_score_change=16, score_memory=10, delimiter=';', column_map={ "player 0": "image_id_0", "player 1": "image_id_1", "score for player 0": "image0_more_complex_image1" }): """ Parameters ---------- score_board : pandas.DataFrame DataFrame holding the scores of individual matches. k_value : int, optional Initial K Value to be used for calculating new ratings, by default 32 default_score : int, optional Initial rating, by default 1400 max_comparisons : int, optional Max comparisions for any player, by default 50 max_score_change : int, optional Upper limit on K Value updation, by default 32 min_score_change : int, optional Lower limit on K Value updation, by default 16 score_memory : int, optional Number of previous scores to consider while calculating standard deviation and new K value, by default 10 column_map : dict, optional Dictionary, for mapping the column names of the score_board dataframe to variable names used in the ELO ranking system. by default {"player 0": "image_id_0", "player 1": "image_id_1", "score for player 0": "image0_more_complex_image1"} """ self.score_board = score_board self.k_value = k_value self.default_score = default_score self.score_change = {'min': min_score_change, 'max': max_score_change} self.max_comparisions = max_comparisons self.score_memory = score_memory self.column_map = column_map if not set(self.column_map.values()).issubset( self.score_board.columns): missing_columns = set(self.column_map.values()) - set( self.column_map.values()).intersection( self.score_board.columns) missing_columns = ", ".join(missing_columns) raise SunpyUserWarning( "The following columns mentioned in the column map" f" are not present in the score board: {missing_columns}") self._create_ranking()
def setup(self, stage=None):
    """
    Dataset Generation function.

    Splits ``self.data`` into train / test and then train / val frames
    using the configured splitters (the last split each splitter yields
    wins), truncates the training frame according to ``self.train_size``
    and wraps the frames in ``BaseDataset`` objects.

    ``self.train_size`` greater than 1 is taken as a number of rows;
    otherwise it is taken as the fraction of the training frame to keep.

    Parameters
    ----------
    stage : str, optional
        Training or Testing stage, by default None. 'fit' builds the train
        and validation datasets, 'test' builds the test dataset and None
        builds all of them.

    Warns
    -----
    SunpyUserWarning
        For every dataset whose configuration is not a dict, in which case
        the default ``BaseDataset`` configuration is used.
    """

    def build_dataset(frame, conf, kind):
        # Wrap one split, falling back to the defaults without a config.
        if isinstance(conf, dict):
            return BaseDataset(data=frame,
                               X_col=self.X_col,
                               y_col=self.y_col,
                               **conf)
        warnings.warn(
            SunpyUserWarning(
                f"No {kind} configurations specified, using default configuration."
            ))
        return BaseDataset(data=frame, X_col=self.X_col, y_col=self.y_col)

    for train_index, test_index in self.train_test_splitter.split(
            X=self.data[self.X_col], y=self.data[self.y_col]):
        self.train, self.test = self.data.iloc[
            train_index], self.data.iloc[test_index]

    # The indices yielded below refer to the frame handed to ``split``, so
    # always index that frame and not an already reduced ``self.train``.
    train_pool = self.train
    for train_index, val_index in self.train_val_splitter.split(
            X=train_pool[self.X_col], y=train_pool[self.y_col]):
        self.train, self.val = train_pool.iloc[
            train_index], train_pool.iloc[val_index]

    # Assign train/val datasets for use in dataloaders
    if stage == 'fit' or stage is None:
        if self.train_size > 1:
            # Absolute number of rows. The slice bound must be the size,
            # not the training frame itself.
            row_limit = int(self.train_size)
        else:
            row_limit = int(len(self.train) * self.train_size)
        self.train = self.train.iloc[:row_limit]

        self.train_dataset = build_dataset(self.train, self.train_conf,
                                           "training")
        self.val_dataset = build_dataset(self.val, self.val_conf,
                                         "validation")

    # Assign test dataset for use in dataloader(s)
    if stage == 'test' or stage is None:
        self.test_dataset = build_dataset(self.test, self.test_conf,
                                          "testing")
def __init__(self, *, data, X_col, y_col, sequence_length, root_dir=None, transform=None, is_tabular=True): """ Parameters ---------- data : pd.DataFrame The Dataframe with the data information. X_col : list or str Feature Columns y_col : list or str Label Column sequence_length : int Length of the Sequence in the Time Series. root_dir : str, optional Path to the data files if any. transform : torchvision.transforms, optional Data transforms, by default None is_tabular : bool, optional Is the input Data in Tabular. """ if not isinstance(y_col, (str, list)): raise TypeError( "y_col must be a string or list denoting the label column(s)") if not set(X_col).isdisjoint(set(y_col)): raise ValueError( "Feature Columns and Label columns must be dijoint") self.data = data self.X_col = X_col self.y_col = y_col self.sequence_length = sequence_length self.root_dir = root_dir self.transform = transform self.is_tabular = is_tabular if len(data) < self.sequence_length: raise ValueError( "Length of dataset cannot be smaller than sequence length.") if self.sequence_length > len(data) // 2: raise ValueError( "Length of sequence cannot be greater half of length of data.") residual_data_indices = len(data) % self.sequence_length if residual_data_indices > 0: warning_message = "The following indices cannot be loaded as a sequence : " leftover_indices = ", ".join([ str(index) for index in range( len(data) - residual_data_indices, len(data)) ]) warnings.warn(SunpyUserWarning(warning_message + leftover_indices)) self.data = self.data[:-residual_data_indices] self.X = self.data[self.X_col] self.y = self.data[self.y_col]