def form_population_data():
    '''
    Specialized function for turning state-wise population information into features
    Returns:
        Saves a dictionary with key = state name and value = (population, population density, population over 65 in %)
    '''
    path1 = '../dataset/population_density_usa.csv'
    path2 = '../dataset/population_old_usa.csv'
    df1 = pd.read_csv(path1)
    df2 = pd.read_csv(path2)
    column1 = ['State', 'Population', 'Density']
    column2 = ['State', 'Population65+%']
    pop_info = dict()
    for values in df1[column1].values:
        state, pop, density = values
        pop_info[state] = [float(pop.replace(',', '')),
                           float(density.replace(',', ''))]
    for values in df2[column2].values:
        state, pop65 = values
        pop_info[state].append(float(pop65))
    save_pickle(pop_info, '../dataset/generated/usa/pop_info')
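Most of the snippets in this collection call save_pickle and load_pickle helpers that are never defined here, and the argument order varies between projects (some pass the object first, others the path first, and one uses keywords). A minimal sketch of the (obj, path) form used by this snippet, assuming the helper is just a thin wrapper around the standard pickle module that creates missing parent directories:

import os
import pickle


def save_pickle(obj, path):
    """Serialize obj to path with pickle, creating parent directories as needed."""
    directory = os.path.dirname(str(path))
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)


def load_pickle(path):
    """Deserialize and return the object stored at path."""
    with open(path, 'rb') as f:
        return pickle.load(f)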
def main():
    # Load a dictionary of Michael's quotes to their season and episode
    print("Attempting to load quotes from file")
    quotes = load_quotes()
    if quotes is None:
        print("Scraping the web for new quotes")
        quotes = scrape()
    print("Creating sentence encoder")
    encoder = Encoder()
    print("Attempting to load quote embeddings from file")
    quote_embeddings = load_quote_embeddings()
    if quote_embeddings is None:
        print("Generating new quote embeddings")
        quote_embeddings = generate_quote_embeddings(encoder, quotes)
        print("Saving new quote embeddings to {0}".format(embeddings_file))
        save_pickle(quote_embeddings, embeddings_file)
    print("Creating predictor")
    predictor = Predictor(encoder, quote_embeddings)
    while True:
        input_sentence = query_input()
        prediction = predictor.predict_output(input_sentence)
        output_quote = prediction[0]
        output_season = prediction[1]['season']
        output_episode = prediction[1]['episode']
        print("Michael says: \"{0}\" in season {1}, episode {2}".format(
            output_quote, output_season, output_episode))
def write_stats_pickle(self, base_path: Union[str, Path]):
    """
    Write the stats dictionary as a pickle
    :return:
    """
    filename = os.path.join(base_path, 'graph_stats', self.dataset, self.model,
                            f'gs_{self.trial}_{self.iteration}.pkl.gz')
    CP.print_blue(f'Stats pickle stored at {filename}')
    save_pickle(self.stats, filename)
    return
def load_preprocessing():
    # cfg.preprocessing = proc.ComposeProcessColumn([
    #     prep.Resize(224, apply_to_target=False),
    #     prep.MinMaxNorm(background=-1),
    #     prep.LocalMedian(background=-1),
    #     prep.EqualizeHist(background=-1),
    # ])
    cfg.preprocessing = proc.Processor()
    u.save_pickle(cfg.preprocessing, join(cfg.tensorboard_path, 'preprocessing.pkl'))
def _do_masking(self):
    try:
        # get key and dir's abs_path
        key_str = self._get_key()
        file_path = self._get_path()
        # print(key_str, file_path)
        wb_read = load_workbook(file_path, read_only=True)
        wb_write = Workbook(write_only=True)
        hash_bytes = load_pickle('mapping.pkl')
        # get the count of all data rows
        row_count = 0
        current_count = 1
        for sheetname in wb_read.sheetnames:
            sheet_read = wb_read[sheetname]
            row_count += sheet_read.max_row
        # read data, mask it, and save the masked rows
        for sheetname in wb_read.sheetnames:
            print('processing sheet {}:'.format(sheetname))
            sheet_read = wb_read[sheetname]
            # sheet_row_count = sheet_read.max_row
            # print(sheet_row_count)
            sheet_write = wb_write.create_sheet(title=sheetname)
            rows_read = sheet_read.rows
            for row in rows_read:
                row_values = []
                for cell in row:
                    row_values.append(cell.value)
                # mask every row except the header row
                if current_count > 1:
                    masked_row, hash_bytes_added = mask_row(key_str, sheetname, row_values)
                    hash_bytes.update(hash_bytes_added)
                    sheet_write.append(masked_row)
                else:
                    sheet_write.append(row_values)
                current_count += 1
                if current_count % 100 == 0 or current_count == row_count:
                    self._set_processBar(current_count / row_count * 100)
                    print('Completed {}%'.format(current_count / row_count * 100))
        save_pickle('mapping.pkl', hash_bytes)
        QMessageBox.information(QWidget(), "Information",
                                "Data masking succeeded; please save the file after clicking OK")
        write_path = QFileDialog.getSaveFileName(caption="Save as .xlsx document",
                                                 directory="./")[0]
        # write_path = './masked_data.xlsx'
        # print(write_path)
        wb_write.save(write_path)
        QMessageBox.information(QWidget(), "Information", "Save complete")
    except Exception as e:
        QMessageBox.warning(QWidget(), "warning", str(e))
        print(e)
def extract_features(self, model, model_path, model_tag, used_set, loaders_dic):
    """
    inputs:
        model : The loaded model containing the feature extractor
        loaders_dic : Dictionary containing training and testing loaders
        model_path : Where the model was loaded from
        model_tag : Which model ('final' or 'best') to load
        used_set : Set used, either 'test' or 'val'
        n_ways : Number of ways for the task

    returns:
        extracted_features_dic : Dictionary containing all extracted features and labels
    """
    # Load features from memory if previously saved ...
    save_dir = os.path.join(model_path, model_tag, used_set)
    filepath = os.path.join(save_dir, 'output.plk')
    if os.path.isfile(filepath):
        extracted_features_dic = load_pickle(filepath)
        print(" ==> Features loaded from {}".format(filepath))
        return extracted_features_dic
    # ... otherwise just extract them
    else:
        print(" ==> Beginning feature extraction")
        if not os.path.isdir(save_dir):
            os.makedirs(save_dir)
        model.eval()
        with torch.no_grad():
            all_features = []
            all_labels = []
            for i, (inputs, labels, _) in enumerate(warp_tqdm(loaders_dic['test'], False)):
                inputs = inputs.to(self.device)
                outputs, _ = model(inputs, True)
                all_features.append(outputs.cpu())
                all_labels.append(labels)
            all_features = torch.cat(all_features, 0)
            all_labels = torch.cat(all_labels, 0)
            extracted_features_dic = {'concat_features': all_features,
                                      'concat_labels': all_labels}
        print(" ==> Saving features to {}".format(filepath))
        save_pickle(filepath, extracted_features_dic)
        return extracted_features_dic
def train(args, device_id):
    init_logger(args.log_file)
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d' % device_id)
    logger.info('Device %s' % device)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    if device_id >= 0:
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    def train_iter_fct():
        return data_loader.Dataloader(args, load_dataset(args, 'train', shuffle=True),
                                      args.batch_size, device, shuffle=True, is_test=False)

    model = Summarizer(args, device, load_pretrained_bert=True)
    if args.train_from != '':
        logger.info('Loading checkpoint from %s' % args.train_from)
        checkpoint = torch.load(args.train_from,
                                map_location=lambda storage, loc: storage)
        opt = vars(checkpoint['opt'])
        for k in opt.keys():
            if (k in model_flags):
                setattr(args, k, opt[k])
        model.load_cp(checkpoint)
        optim = model_builder.build_optim(args, model, checkpoint)
    else:
        optim = model_builder.build_optim(args, model, None)
    logger.info(model)

    trainer = build_trainer(args, device_id, model, optim)
    losses, n_docs = trainer.train(train_iter_fct, args.train_steps)
    save_pickle(losses, 'losses_classifier')
    save_pickle(n_docs, 'docs_classifier')
def valid(_cfg, model, all_exam=False):
    cfg = copy.deepcopy(_cfg)
    if all_exam:
        cfg["dataset"]["param"]["posexam_only"] = False  # validation for all slices
    assert cfg["output"]
    assert not os.path.exists(cfg["output"])
    criterion = factory.get_criterion(cfg)
    path = os.path.join(output_dir, 'fold%d_ep0.pt' % (cfg['fold']))
    print(f'best path: {str(path)}')
    utils.load_model(str(path), model)
    loader_valid = factory.get_loader_valid(cfg)
    with torch.no_grad():
        results = run_nn(cfg, 'valid', model, loader_valid, criterion=criterion)
    utils.save_pickle(results, cfg["output"])
    log('saved to %s' % cfg["output"])
def create_flight_data(df):
    '''
    Args:
        df (Pandas dataframe): Travel data
    Returns:
        Saves a dictionary where key = (state1, state2) and value = count of flights
    '''
    columns_of_interest = ['ORIGIN_STATE_NM', 'DEST_STATE_NM']
    # Loop through the columns and increase the count
    flight_dict = dict()
    for s1, s2 in df[columns_of_interest].values:
        # Assuming an undirected graph, hence using a frozen set
        if s1 != s2:
            pair = frozenset([s1, s2])
            if pair in flight_dict:
                flight_dict[pair] += 1
            else:
                flight_dict[pair] = 1
    save_path = '../dataset/generated/flightdict'
    save_pickle(flight_dict, save_path)
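Because the keys are frozensets, a later lookup is order-independent: (state1, state2) and (state2, state1) hit the same entry. A small usage sketch, assuming the load_pickle helper from earlier and purely illustrative state names:

# reads the dictionary written by create_flight_data
flight_dict = load_pickle('../dataset/generated/flightdict')
# frozenset keys make the lookup symmetric; missing pairs default to 0 flights
count = flight_dict.get(frozenset(['California', 'New York']), 0)
print(count)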
def main():
    with Timer('Loading config'):
        cfg = load_config()
    with Timer('Loading tweets'):
        tweets = load_raw_data(cfg['RAW_DATA_PATH'])
    with Timer('Cleaning sentences'):
        tweet_text = cleanse_sentences(list(tweets['text']))
    with Timer('Mapping characters to integers'):
        tweet_enc, map_char_to_int, map_int_to_char = map_tweets_to_int(tweet_text)
    with Timer('Producing dataset'):
        tweet_train, tweet_label = produce_dataset(tweet_enc)
    with Timer('Save dataset and mapping tables'):
        save_pickle(tweet_train, cfg['PROCESSED_DATA_DIR'] + '/train.pkl')
        save_pickle(tweet_label, cfg['PROCESSED_DATA_DIR'] + '/label.pkl')
        save_pickle(map_char_to_int, cfg['PROCESSED_DATA_DIR'] + '/map_char_to_int.pkl')
        save_pickle(map_int_to_char, cfg['PROCESSED_DATA_DIR'] + '/map_int_to_char.pkl')
def main():
    """
    Load raw ECG data from disc and transform it to cleansed training data.
    """
    cfg = load_config()
    with Timer('Getting label list'):
        labels, file_list = get_labels(cfg['RAW_DATA_PATH'] + '/Diagnostics.xlsx')
    with Timer('Loading & Downsampling files'):
        ecg_data = get_ecg_data(cfg['RAW_DATA_PATH'] + '/ECGDataDenoised', file_list,
                                cfg['DOWNSAMPLE_THRESHOLD'], cfg['DATA_SLICE'],
                                cfg['NUM_WORKERS'])
    with Timer('Imputing missing values'):
        ecg_data = impute_nans(ecg_data)
    with Timer('Splitting into Train & Test Set'):
        x_train, x_test, y_train, y_test = train_test_split(ecg_data, labels,
                                                            test_size=0.2, shuffle=True,
                                                            stratify=labels, random_state=42)
        print('Final Training set has {} samples'.format(len(x_train)))
        print('Final Test set has {} samples'.format(len(x_test)))
        print('Distribution of labels in Training: {}'.format(Counter(y_train)))
        print('Distribution of labels in Testing: {}'.format(Counter(y_test)))
    with Timer('Normalizing data'):
        x_train, x_test = normalize_data(x_train, x_test)
    with Timer('Saving generated arrays'):
        save_pickle(x_train, cfg['PROCESSED_DATA_DIR'] + '/train_data.pkl')
        save_pickle(y_train, cfg['PROCESSED_DATA_DIR'] + '/train_label.pkl')
        save_pickle(x_test, cfg['PROCESSED_DATA_DIR'] + '/test_data.pkl')
        save_pickle(y_test, cfg['PROCESSED_DATA_DIR'] + '/test_label.pkl')
    # Excerpt from a menu-driven script: the preceding `if reply == ...:` training branch is not shown.
        for data in train_loader:
            data = data.to(device)
            # print(data.x.size())
            optimizer.zero_grad()
            output = model(data)
            label = data.y.to(device).reshape(-1, 1)
            loss = cost(output, label)
            loss.backward()
            total_loss = loss
            optimizer.step()
            # for actual, predicted in zip(label, output):
            #     print(actual, predicted)
            # print("actual = ", label, "predicted = ", output, "loss =", total_loss)
        save_pickle(output.cpu().detach().numpy().reshape(-1, ),
                    SOURCE_PATH / 'dataset/timeseries/data/outputs.pkl')
    elif reply == 3:
        epochs = 12
        lr = 0.01
        cases = load_pickle(SOURCE_PATH / 'dataset/timeseries/data/features.pkl')
        distances = load_pickle(SOURCE_PATH / 'dataset/timeseries/data/dist_matrix.pkl')
        n_flights = load_pickle(SOURCE_PATH / 'dataset/timeseries/data/travel_matrix.pkl')
        distances = 1 - distances
        norm = np.max(cases)
        cases = cases / np.max(cases)
        edges = distances / np.max(distances) + n_flights / np.max(n_flights)
        edges /= np.max(edges)
        labels = torch.FloatTensor(cases[:, 0])
def main(args):
    cfg.args = args
    cfg.BATCH_SIZE = args['BATCH_SIZE']
    cfg.EPOCHS = args['EPOCHS'] if not cfg.DEBUG else 2
    cfg.TRAIN_TEST_SPLIT = args['TRAIN_TEST_SPLIT']
    cfg.MODEL_NAME = args['MODEL_NAME']
    cfg.MODEL_ARGS = args['MODEL_ARGS']
    assert type(cfg.BATCH_SIZE) == int, "Batch size must be int"
    assert type(cfg.EPOCHS) == int, "Epochs must be int"
    cfg.res = pd.DataFrame([args])
    cfg.background = cfg.MODEL_ARGS.get('background', None)

    now = datetime.now()
    day, hour = now.strftime("%d/%m/%Y %H:%M:%S").split(' ')
    cfg.res['day'] = [day]
    cfg.res['hour'] = [hour]

    # Get tensorboard path to save results
    if not cfg.DEBUG:
        cfg.tensorboard_path = du.get_save_path(
            path_dirs=all_paths['tensorboard_classification'],
            path_to_manager=all_paths['manager_classification'],
            args=args,
        )
        cfg.res['tensorboard_path'] = cfg.tensorboard_path
        u.save_pickle(args, join(cfg.tensorboard_path, 'cur_args.pkl'))
        u.save_yaml(args, join(cfg.tensorboard_path, 'cur_args.yaml'))
        do_save_code.save_in_final_file()
    else:
        cfg.tensorboard_path = None

    # Load data oriented
    # load_preprocessing.main()
    load_data.main_train()

    # Model creation
    load_model.main()

    # Training
    print('==================')
    print('Training ...')
    tr.train(
        cfg.model,
        cfg.optimizer,
        cfg.loss,
        cfg.observables,
        defreezer=cfg.defreezer,
        number_of_epochs=cfg.EPOCHS,
        trainloader=cfg.trainloader,
        valloader=cfg.testloader,
        grad_input=True,
        retain_graph=True,
        grad_in_eval=True,
        # interval=1,
        output_dir_tensorboard=cfg.tensorboard_path,
        device=cfg.device,
        verbose=VERBOSE_TRAIN,
    )
    print('Done.')
    cfg.res['Batch_Epoch_weights'] = [cfg.observables[0].best_weights_batch_epoch]
    if not cfg.DEBUG:
        cfg.model.load_state_dict(
            torch.load(join(cfg.tensorboard_path, 'best_weights.pt')))
        print('weights loaded from', join(cfg.tensorboard_path, 'best_weights.pt'),
              'Epoch Batch: ', cfg.res['Batch_Epoch_weights'])

    # Evaluation on train set
    print('==================')
    print('Evaluating on train ...')
    loss_train, metric_train = te.evaluate_model(
        cfg.model,
        cfg.trainloader_for_test,
        cfg.criterion,
        cfg.metrics,
        # 20,
        device=cfg.device,
    )
    print('Done.')
    title_train = {'loss': loss_train.item(),
                   'metric': u.round_dict_array(metric_train)}
    cfg.res['loss_train'] = [title_train['loss']]
    for key, metric in metric_train.items():
        cfg.res['metric_train_{}'.format(key)] = [metric.cpu().numpy()]

    # Evaluation on test set
    print('==================')
    print('Evaluating on test ...')
    loss_test, metric_test = te.evaluate_model(
        cfg.model,
        cfg.testloader,
        cfg.criterion,
        cfg.metrics,
        # 20,
        device=cfg.device,
    )
    print('Loss Test: {}'.format(loss_test))
    print('Metric Test: {}'.format(metric_test))
    print('Done.')
    title_test = {'loss': loss_test.item(),
                  'metric': u.round_dict_array(metric_test)}
    cfg.res['loss_test'] = [title_test['loss']]
    for key, metric in metric_test.items():
        cur_metric = metric.cpu().numpy()
        if cur_metric.shape == ():
            cfg.res['metric_test_{}'.format(key)] = [cur_metric]
        else:
            for idx_met, met in enumerate(cur_metric):
                cfg.res['metric_test_{}_{}'.format(key, idx_met)] = [met]

    return cfg.res
def save(self):
    save_pickle(reduce_mem_usage(self.train), self.train_path)
    save_pickle(reduce_mem_usage(self.test), self.test_path)
# ===============================
# === Make submission
# ===============================
sample_submission = pd.read_csv(input_dir / "sample_submission.csv")
submission_df = make_submission(test_preds, sample_submission)

# ===============================
# === Save
# ===============================
config["eval_results"] = dict()
for k, v in evals_results.items():
    config["eval_results"][k] = v
save_path = output_dir / "output.json"
save_json(config, save_path)

plot_feature_importance(feature_importance, output_dir / "feature_importance.png")

np.save(output_dir / "oof_preds.npy", oof_preds)
np.save(output_dir / "test_preds.npy", test_preds)
submission_df.to_csv(output_dir / "submission.csv", index=False)
save_pickle(models, output_dir / "model.pkl")

slack_notify(config_name + " is finished\n" + str(config))
def scrape():
    episode_dict = build_episode_dict()
    quote_dict = build_quote_dict(episode_dict)
    print("Saving new quotes")
    save_pickle(quote_dict, quotes_file)
    return quote_dict
def index_words(corpus_path, output_directory, min_count):
    regex = re.compile(r'<s>\s?|\s?</s>|\r\n|\n', re.MULTILINE)
    word2index = {PAD_STR: PAD, BOS_STR: BOS, EOS_STR: EOS, UNK_STR: UNK}
    index2word = {PAD: PAD_STR, BOS: BOS_STR, EOS: EOS_STR, UNK: UNK_STR}
    index2count = Counter()
    word2count = Counter()
    data = []

    with open(corpus_path, 'r') as f:
        total = sum(1 for _ in f)
    with open(corpus_path, 'r') as f:
        for sentence in tqdm(f, total=total, desc='Reading corpus file'):
            sentence = re.sub(regex, '', sentence)
            words = sentence.split()
            for word in words:
                word2count[word] += 1
            data.append(words)

    unk_cnt = 0
    for word, count in tqdm(word2count.most_common(), desc='Filtering words using min_count'):
        if count >= min_count:
            ind = len(word2index)
            word2index[word] = ind
            index2word[ind] = word
            index2count[ind] = count
        else:
            unk_cnt += 1
    index2count[PAD] = 0
    index2count[UNK] = unk_cnt
    index2count[BOS] = 1  # Laplace
    index2count[EOS] = 1  # Laplace
    del word2count

    # make sure the output directory exists before writing anything into it
    if not os.path.exists(output_directory):
        print("{} doesn't exist, creating".format(output_directory))
        os.mkdir(output_directory)

    with open(os.path.join(output_directory, VOCAB_SIZE_FNAME), 'w') as f:
        f.write(str(len(word2index)))
    print("Saving word2index...")
    save_pickle(os.path.join(output_directory, WORD2INDEX_FNAME), word2index)
    print("Saving index2word...")
    save_pickle(os.path.join(output_directory, INDEX2WORD_FNAME), index2word)
    del index2word
    print("Saving index2count...")
    save_pickle(os.path.join(output_directory, INDEX2COUNT_FNAME), index2count)
    del index2count

    def pad(l, pad_token, length):
        return l + [pad_token] * (length - len(l))

    dataset = []
    for sentence in tqdm(data, desc="Creating dataset"):
        seq = [word2index[w] if w in word2index else UNK for w in sentence]
        seq = [BOS] + seq + [EOS]
        if len(seq) < MAX_SENTENCE_LENGTH:
            dataset.append(pad(seq, PAD, MAX_SENTENCE_LENGTH))

    print("Freeing memory...")
    del data
    del word2index
    gc.collect()

    print("Creating pandas dataframe with dataset")
    df = pd.DataFrame(dataset, dtype=np.int32)
    df = df.sample(frac=1).reset_index(drop=True)
    train, test = train_test_split(df, test_size=0.2)
    train = train.reset_index(drop=True)
    test = test.reset_index(drop=True)
    print("Saving train dataset")
    train.to_csv(os.path.join(output_directory, TRAIN_DATASET_FNAME), index=False, header=False)
    print("Saving test dataset")
    test.to_csv(os.path.join(output_directory, TEST_DATASET_FNAME), index=False, header=False)
def save(self, filename):
    save_dict = {'log_pattern_scores': self.log_pattern_scores,
                 'pattern_count': self.pattern_count}
    save_pickle(filename, save_dict)
def run(self, use_pickle: bool) -> None:
    """
    New runner - uses a list of graphs
    :param use_pickle:
    :return:
    """
    pickle_ext = '.pkl.gz'
    self.graphs = []

    if use_pickle:
        if check_file_exists(self.graphs_pickle_path + pickle_ext):  # the whole pickle exists
            graphs = load_pickle(self.graphs_pickle_path + pickle_ext)
            assert len(graphs) == self.num_generations + 1, \
                f'Expected {self.num_generations + 1} graphs, found {len(graphs)}'
            CP.print_green(f'Using completed pickle at {self.graphs_pickle_path + pickle_ext!r}. '
                           f'Loaded {len(graphs)} graphs')
            return
        else:
            temp_file_pattern = re.compile(rf'list_(\d+)_{self.trial}_temp_(\d+)\.pkl\.gz')
            dir_name = '/'.join(self.graphs_pickle_path.split('/')[:-1])
            input_files = [f for f in os.listdir(dir_name) if re.match(temp_file_pattern, f)]
            if len(input_files) > 0:
                assert len(input_files) == 1, f'More than one match found: {input_files}'
                input_file = input_files[0]
                total_generations, progress = map(int, temp_file_pattern.fullmatch(input_file).groups())
                graphs = load_pickle(join(dir_name, input_file))
                assert len(graphs) == progress + 1, f'Found {len(graphs)}, expected: {progress + 1}'
                CP.print_blue(f'Partial pickle found at {input_file!r} trial: {self.trial} '
                              f'progress: {progress}/{total_generations}')
                self.graphs = graphs

    remaining_generations = self.num_generations - len(self.graphs)
    tqdm.write(f'Running Infinity Mirror on {self.initial_graph.name!r} '
               f'{self.initial_graph.order(), self.initial_graph.size()} '
               f'{self.model.model_name!r} {remaining_generations} generations')
    pbar = tqdm(total=remaining_generations,
                bar_format='{l_bar}{bar}|[{elapsed}<{remaining}]', ncols=50)

    if len(self.graphs) == 0:
        self.initial_graph.level = 0
        self.graphs = [self.initial_graph]
        self.features = [None]

    completed_trial = False
    for i in range(len(self.graphs) - 1, self.num_generations):
        if i == len(self.graphs) - 1:
            curr_graph = self.graphs[-1]  # use the last graph
        level = i + 1

        try:
            fit_time_start = time.perf_counter()
            self.model.update(new_input_graph=curr_graph)  # update the model
            fit_time = time.perf_counter() - fit_time_start
        except Exception as e:
            fit_time = np.nan
            print(f'Model fit failed {e}')
            break

        try:
            gen_time_start = time.perf_counter()
            generated_graphs = self.model.generate(num_graphs=self.num_graphs,
                                                   gen_id=level)  # generate a new set of graphs
            gen_time = time.perf_counter() - gen_time_start
        except Exception as e:
            gen_time = np.nan
            print(f'Generation failed {e}')
            break

        if self.features:
            self.features.append(self.model.params)

        curr_graph = generated_graphs[0]  # we are only generating one graph
        curr_graph.name = f'{self.initial_graph.name}_{level}_{self.trial}'
        curr_graph.gen = level
        self.graphs.append(curr_graph)

        temp_pickle_path = self.graphs_pickle_path + f'_temp_{level}{pickle_ext}'
        prev_temp_pickle_path = self.graphs_pickle_path + f'_temp_{level-1}{pickle_ext}'
        temp_features_path = self.graphs_features_path + f'_temp_{level}{pickle_ext}'
        prev_temp_features_path = self.graphs_features_path + f'_temp_{level-1}{pickle_ext}'
        save_pickle(obj=self.graphs, path=temp_pickle_path)
        save_pickle(obj=self.features, path=temp_features_path)
        delete_files(prev_temp_pickle_path)
        delete_files(prev_temp_features_path)

        self.write_timing_csv(iter_=level, fit_time=fit_time, gen_time=gen_time)
        if level == self.num_generations:
            completed_trial = True
        pbar.update(1)
    pbar.close()

    if completed_trial:  # only delete the temp pickles if the trial finishes successfully
        delete_files(temp_pickle_path)  # delete the temp file if the loop finishes normally
        delete_files(temp_features_path)  # delete the temp file if the loop finishes normally

    CP.print_green(f'List of {len(self.graphs)} Graphs is pickled at '
                   f'"{self.graphs_pickle_path + pickle_ext}"')
    save_pickle(obj=self.graphs, path=self.graphs_pickle_path + pickle_ext)
    save_pickle(obj=self.features, path=self.graphs_features_path + pickle_ext)
    return
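This runner calls save_pickle with keyword arguments and .pkl.gz paths, which suggests a gzip-aware helper with an (obj=..., path=...) signature; that helper is not shown anywhere in these snippets, so the following is only a sketch under that assumption. It differs from the plain wrapper sketched earlier only in choosing gzip.open when the target name ends in .gz:

import gzip
import os
import pickle


def save_pickle(obj, path):
    """Pickle obj to path, gzip-compressing the stream when the filename ends in .gz."""
    directory = os.path.dirname(str(path))
    if directory:
        os.makedirs(directory, exist_ok=True)
    opener = gzip.open if str(path).endswith('.gz') else open
    with opener(path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)


def load_pickle(path):
    """Load a pickle written by save_pickle, transparently handling gzip."""
    opener = gzip.open if str(path).endswith('.gz') else open
    with opener(path, 'rb') as f:
        return pickle.load(f)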
def fit(self, input_df: XDataFrame) -> None:
    """Fit to data frame

    Args:
        input_df (XDataFrame): Input data frame.

    Returns:
        None. The selected columns are stored in ``self._selected_cols``.
    """
    org_cols = input_df.columns.tolist()
    input_df = (input_df.to_pandas()
                if isinstance(input_df, cudf.DataFrame) else input_df)
    seen_cols_pairs = (load_pickle(self.save_path / "seen_feats_pairs.pkl")
                       if (self.save_path / "seen_feats_pairs.pkl").exists()
                       else defaultdict(list))
    removed_cols_pairs = (load_pickle(self.save_path / "removed_feats_pairs.pkl")
                          if (self.save_path / "removed_feats_pairs.pkl").exists()
                          else defaultdict(list))
    removed_cols = sum(removed_cols_pairs.values(), [])

    if self.dry_run:
        self._selected_cols = [col for col in org_cols if col not in set(removed_cols)]
        return

    org_cols = [col for col in org_cols if col not in removed_cols]
    counter = 0
    for i in tqdm(range(len(org_cols) - 1)):
        feat_a_name = org_cols[i]
        if feat_a_name in removed_cols:
            continue
        feat_a = input_df[feat_a_name]
        for j in range(i + 1, len(org_cols)):
            feat_b_name = org_cols[j]
            if self._has_seen(feat_a_name, feat_b_name, seen_cols_pairs):
                continue
            else:
                seen_cols_pairs[feat_a_name].append(feat_b_name)
                seen_cols_pairs[feat_b_name].append(feat_a_name)
            if self._has_removed(feat_a_name, feat_b_name, removed_cols):
                continue
            feat_b = input_df[feat_b_name]
            c = np.corrcoef(feat_a, feat_b)[0][1]
            if abs(c) > self._threshold:
                counter += 1
                removed_cols.append(feat_b_name)
                removed_cols_pairs[feat_a_name].append(feat_b_name)
                print("{}: FEAT_A: {} FEAT_B: {} - Correlation: {}".format(
                    counter, feat_a_name, feat_b_name, c))

    save_pickle(removed_cols_pairs, self.save_path / "removed_feats_pairs.pkl")
    save_pickle(seen_cols_pairs, self.save_path / "seen_feats_pairs.pkl")
    self._selected_cols = [col for col in org_cols if col not in set(removed_cols)]
    # Tail of a helper (likely get_us_pop_data) that merges population and age data;
    # its signature and earlier lines are not shown.
    df2 = df2.set_index('State')
    df1.sort_index(inplace=True)
    df2.sort_index(inplace=True)
    df1['Population65+%'] = df2['Population65+%']
    print(df1[['Population', 'Density', 'Population65+%']])
    return df1[['Population', 'Density', 'Population65+%']].to_numpy()


if __name__ == "__main__":
    df = pd.read_csv(SOURCE_PATH / 'data/COVID19.csv')
    travel_df = pd.read_csv(SOURCE_PATH / 'data/travel_data.csv')
    selected_cols = ['Province', 'Country', 'Lat', 'Long', 'Date', 'Value']
    list_of_states = load_pickle(SOURCE_PATH / 'data/us_states_list.pkl')
    df = df[selected_cols]
    features, df = get_US_states_data(list_of_states, df)
    dist_matrix = create_dist_matrix(df)
    travel_matrix = create_flight_matrix(list_of_states, travel_df)
    age_df = pd.read_csv(SOURCE_PATH / 'data/population_old_usa.csv')
    pop_df = pd.read_csv(SOURCE_PATH / 'data/population_density_usa.csv')
    pop_age_df = get_us_pop_data(pop_df, age_df)
    features = np.append(features, pop_age_df, axis=1)
    print(travel_matrix)
    save_pickle(features, SOURCE_PATH / 'data/features.pkl')
    save_pickle(dist_matrix, SOURCE_PATH / 'data/dist_matrix.pkl')
    save_pickle(travel_matrix, SOURCE_PATH / 'data/travel_matrix.pkl')
    a = load_pickle('data/pop_info')
    print(a)