def __init__(self, path_sim_word, path_score, path_id2word, path_kneighbor,
             threshold_ns=0.005, threshold_ps=0.2):
    self.path_sim_word = path_sim_word
    self.path_score = path_score
    self.path_id2word = path_id2word
    self.path_kneighbor = path_kneighbor
    self.threshold_ps = threshold_ps
    self.threshold_ns = threshold_ns
    # load the reference synsets, precomputed scores, k-nearest-neighbor
    # table, and the id-to-word mapping
    synset = self.get_synset(self.path_sim_word)
    score = load_from_pkl(self.path_score)
    kneighbor = load_from_pkl(self.path_kneighbor)
    id2word = self.get_id2word(self.path_id2word)
    # split candidates into positive samples (ps), a middle band (md), and
    # negative samples (ns) using the two thresholds
    ps, md, ns = self.get_psns(score, id2word, kneighbor,
                               self.threshold_ps, self.threshold_ns)
    dump_to_pkl(ps, 'data/test/ps_0.01_test.pkl')
    dump_to_pkl(ns, 'data/test/ns_0.01_test.pkl')
    dump_to_pkl(md, 'data/sample/md_3_0.2_0.01.pkl')
    # evaluate the positive samples against the reference synsets
    f1, precision, recall = self.ps_stat(synset, ps)
    print('precision: %f' % precision)
    print('recall: %f' % recall)
    print('f1: %f' % f1)
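# load_from_pkl / dump_to_pkl are used above (and throughout these snippets)
# but are not defined in this section. A minimal sketch of what they
# presumably look like, assuming they are thin wrappers around the standard
# pickle module:
import pickle

def load_from_pkl(pkl_path):
    # Deserialize a single object from a pickle file.
    with open(pkl_path, 'rb') as f:
        return pickle.load(f)

def dump_to_pkl(obj, pkl_path):
    # Serialize a single object to a pickle file.
    with open(pkl_path, 'wb') as f:
        pickle.dump(obj, f)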
def __init__(self, trn_file, wav_file, mfcc_file, args,
             vocab_create_mode='BUILD', mfcc_create='Y'):
    '''
    Args:
        trn_file: transcription (label) file path
        wav_file: wav file path
        mfcc_file: mfcc feature file path
        vocab_create_mode:
            BUILD: create the vocab dict from raw label data
            LOAD:  read the vocab dict from file directly
        mfcc_create: 'Y' to extract MFCC features from the wav files and save them
    '''
    self.args = args
    # trn file path
    self.trn_file = trn_file
    # wav file path
    self.wav_file = wav_file
    # mfcc file path
    self.mfcc_file = mfcc_file
    # <EOS>: end-of-sentence tag
    # <SOS>: start-of-sentence tag
    # <PAD>: padding tag
    self.special_signs = ['<EOS>', '<SOS>', '<PAD>', '<BIAS>']
    # label-to-index dict
    self.vocab = {}
    # index-to-label dict
    self.inverse_vocab = {}
    if vocab_create_mode == 'BUILD':
        self.label_process()
    elif vocab_create_mode == 'LOAD':
        self.vocab = utils.load_from_pkl('vocab.pkl')
        self.inverse_vocab = utils.invert_dict(self.vocab)
    if mfcc_create == 'Y':
        for i in range(len(self.wav_file)):
            wavlist = os.listdir(self.wav_file[i])
            for j in range(len(wavlist)):
                wav_path = os.path.join(self.wav_file[i], wavlist[j])
                # convert the audio to MFCC features
                mfcc = self.read_wav_file(wav_path, 26, 9)
                mfcc = np.transpose(mfcc)
                np.save(os.path.join(self.mfcc_file[i],
                                     os.path.splitext(wavlist[j])[0]),
                        mfcc)
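# utils.invert_dict is called above but not defined in this section. A
# minimal sketch, assuming it simply swaps keys and values (safe here
# because the vocab maps each label to a unique index):

def invert_dict(d):
    # Build the index-to-label dict from the label-to-index dict.
    return {v: k for k, v in d.items()}

# Usage: invert_dict({'<PAD>': 0, '<SOS>': 1}) -> {0: '<PAD>', 1: '<SOS>'}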
def __init__(self, input_wvectors, input_word2id, input_id2word,
             input_vocabulary, pair_file_path, kn_file_name,
             output_file_name, topn=20):
    # read the word-to-id mapping
    word2id = dict()
    with codecs.open(input_word2id, 'r', encoding='utf-8') as f:
        for line in f:
            word2id[line.strip().split()[0]] = int(line.strip().split()[1])
    # read the id-to-word mapping
    id2word = dict()
    with codecs.open(input_id2word, 'r', encoding='utf-8') as f:
        for line in f:
            id2word[int(line.strip().split()[0])] = line.strip().split()[1]
    # read the vocabulary (one word id per line)
    vocabulary = []
    with codecs.open(input_vocabulary, 'r', encoding='utf-8') as f:
        for line in f:
            vocabulary.append(int(line.strip()))
    self.topn = topn
    # build and cache the k-nearest-neighbor table
    kneighbor = KNeighbor(input_wvectors, vocabulary, word2id, id2word)
    dump_to_pkl(kneighbor, kn_file_name)
    logging_set('NSselect.log')
    # merge the per-file pair counts into one dict
    files = os.listdir(pair_file_path)
    pairs = dict()
    for file in tqdm(files):
        if not os.path.isdir(os.path.join(pair_file_path, file)):
            path = os.path.join(pair_file_path, file)
            pair = load_from_pkl(path)
            logging.info("pair size: %d" % (len(pair)))
            if len(pairs) == 0:
                pairs = pair
            else:
                for key in pair.keys():
                    if key in pairs:
                        pairs[key] += pair[key]
                    else:
                        pairs[key] = pair[key]
            logging.info("current total pair size: %d" % (len(pairs)))
    logging.info("start calculating score")
    score = self.select_new(pairs, kneighbor, self.topn)
    #score1 = self.select(pairs, kneighbor)
    logging.info("start saving")
    dump_to_pkl(score, output_file_name)
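# KNeighbor is constructed above but not defined in this section. A hedged
# sketch of the kind of object it appears to be: a precomputed
# nearest-neighbor table over the pretrained word vectors, keyed by word id.
# The use of gensim here is an assumption; the real class may compute
# neighbors differently.
from gensim.models import KeyedVectors

class KNeighborSketch:
    def __init__(self, input_wvectors, vocabulary, word2id, id2word, topn=20):
        kv = KeyedVectors.load_word2vec_format(input_wvectors)
        self.neighbors = dict()
        for wid in vocabulary:
            word = id2word[wid]
            if word in kv:
                # store the ids of the topn most similar in-vocabulary words
                self.neighbors[wid] = [word2id[w]
                                       for w, _ in kv.most_similar(word, topn=topn)
                                       if w in word2id]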
        pd.read_feather(f'{FEATURE_DIR}/{feat}_train.feather')
        ], axis=1)
        X_test = pd.concat([
            X_test,
            pd.read_feather(f'{FEATURE_DIR}/{feat}_test.feather')
        ], axis=1)
        cat_features_index.append(fidx)
        fidx += 1

    X_train = reduce_mem_usage(X_train)
    X_test = reduce_mem_usage(X_test)
    save_as_pkl(X_train, f'X_train_{EXP_NAME}_.pkl')
    save_as_pkl(X_test, f'X_test_{EXP_NAME}_.pkl')
elif skip_fr is True:
    X_train = load_from_pkl('X_train_task1_lgbm_fs100.pkl')
    X_test = load_from_pkl('X_test_task1_lgbm_fs100.pkl')
    cat_features_index = []

mlflow.set_experiment(EXP_NAME)
mlflow.start_run()
run_id = mlflow.active_run().info.run_id

with t.timer(f'load folds: {FOLD_NAME}-{FOLD_NUM}'):
    folds = pd.read_csv(
        f'{FOLD_DIR}/train_folds_{FOLD_NAME}{FOLD_NUM}_RS{RANDOM_STATE}.csv'
    )

with t.timer('train XGB'):
    logging.info(f'Num. of Samples: {len(X_train)}')
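# reduce_mem_usage is not defined in this section. A minimal sketch of the
# common Kaggle-style utility it appears to be, which downcasts numeric
# columns to the smallest dtype that holds their values; the exact
# implementation used here is an assumption:
import numpy as np
import pandas as pd

def reduce_mem_usage(df: pd.DataFrame) -> pd.DataFrame:
    # Downcast each numeric column to save memory.
    for col in df.select_dtypes(include=[np.number]).columns:
        if pd.api.types.is_integer_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], downcast='integer')
        else:
            df[col] = pd.to_numeric(df[col], downcast='float')
    return df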
        for seq in [t[0] for t in train_feat] + [t[0] for t in test_feat]:
            tmp.extend(seq)
        # +1 reserves an extra index (e.g. for unseen values)
        unique_num = len(set(tmp)) + 1
        unique_num_dic[feat] = unique_num

    print('Unique Num', unique_num_dic)
    print('Feature index', feature_index)
    save_as_pkl(X_train, f'X_train_{EXP_NAME}.pkl')
    save_as_pkl(X_test, f'X_test_{EXP_NAME}.pkl')
    save_as_pkl(unique_num_dic, f'unique_num_dic_{EXP_NAME}.pkl')
    save_as_pkl(feature_index, f'feature_index_{EXP_NAME}.pkl')
elif skip_fr is True:
    X_train = load_from_pkl('X_train_task1_mlp_fs100.pkl')
    X_test = load_from_pkl('X_test_task1_mlp_fs100.pkl')
    unique_num_dic = load_from_pkl('unique_num_dic_task1_mlp_fs100.pkl')
    feature_index = load_from_pkl('feature_index_task1_mlp_fs100.pkl')

X_train = X_train.fillna(0.0)
X_test = X_test.fillna(0.0)

with t.timer(f'load folds: {FOLD_NAME}-{FOLD_NUM}'):
    folds = pd.read_csv(
        f'{FOLD_DIR}/train_folds_{FOLD_NAME}{FOLD_NUM}_RS{RANDOM_STATE}.csv'
    )

mlflow.set_experiment(EXP_NAME)
mlflow.start_run()
run_id = mlflow.active_run().info.run_id
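# A hedged guess at how unique_num_dic is consumed downstream: each count
# looks like the vocabulary size of an embedding table for one categorical
# feature, with the +1 reserving an index for unseen or padded values. A
# minimal PyTorch sketch (the framework and embedding_dim are assumptions):
import torch.nn as nn

embeddings = nn.ModuleDict({
    feat: nn.Embedding(num_embeddings=n, embedding_dim=16)  # dim is illustrative
    for feat, n in unique_num_dic.items()
})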
def __init__(self, input_wvectors, input_word2id, input_id2word,
             input_vocabulary, pair_file_path, kn_file_name,
             output_file_name, topn=20):
    # read the word-to-id mapping
    word2id = dict()
    with codecs.open(input_word2id, 'r', encoding='utf-8') as f:
        for line in f:
            word2id[line.strip().split()[0]] = int(line.strip().split()[1])
    # read the id-to-word mapping
    id2word = dict()
    with codecs.open(input_id2word, 'r', encoding='utf-8') as f:
        for line in f:
            id2word[int(line.strip().split()[0])] = line.strip().split()[1]
    # read the vocabulary (one word id per line)
    vocabulary = []
    with codecs.open(input_vocabulary, 'r', encoding='utf-8') as f:
        for line in f:
            vocabulary.append(int(line.strip()))
    self.topn = topn

    logging.info("get kneighbors...")
    #kneighbor = load_from_pkl(kn_file_name)
    kneighbor = KNeighbor(input_wvectors, vocabulary, word2id, id2word)
    dump_to_pkl(kneighbor, kn_file_name)
    logging.info("kneighbors got.")

    logging.info("get pairs...")
    files = os.listdir(pair_file_path)
    pairs = dict()
    for file in tqdm(files):
        # skip the macOS directory-metadata file
        if file == '.DS_Store':
            continue
        if not os.path.isdir(os.path.join(pair_file_path, file)):
            path = os.path.join(pair_file_path, file)
            pair = load_from_pkl(path)
            logging.info("pair size: %d" % (len(pair)))
            if len(pairs) == 0:
                pairs = pair
            else:
                for key in pair.keys():
                    if key in pairs:
                        pairs[key] += pair[key]
                    else:
                        pairs[key] = pair[key]
            logging.info("current total pair size: %d" % (len(pairs)))
    logging.info("pairs got")

    # re-shard the merged pairs for later processing
    resplit_pairs(pairs, './data/pairs_large_resplit', 1000)

    # sanity check: report vocabulary ids that never occur as the first
    # element of any pair
    logging.info("len(word2id): %d" % len(word2id))
    keys_before_sort_set = set([key[0] for key in pairs.keys()])
    logging.info("length of pair.keys[0]: %d" % len(keys_before_sort_set))
    id_missing_in_pairs = set(word2id.values()) - keys_before_sort_set
    logging.info("len(id_missing_in_pairs): %d" % (len(id_missing_in_pairs)))
    if len(id_missing_in_pairs) > 0:
        logging.info("missing words in pairs: %s" % str(id_missing_in_pairs))
    #dump_to_pkl(id_missing_in_pairs, './data/id_missing_in_pairs.pkl')

    logging.info("start calculating score")
    score = self.select_new(pairs, kneighbor, self.topn)
    logging.info("len(score): %d" % len(score))
    #score1 = self.select(pairs, kneighbor)
    logging.info("start saving")
    dump_to_pkl(score, output_file_name)
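# resplit_pairs is called above but not defined in this section. Judging by
# the call site, it re-shards the merged pairs dict into smaller pickle files
# under the given directory. The scheme below (hashing keys into a fixed
# number of output shards, with the third argument read as the shard count)
# is an assumption, as is the reuse of the utils dump_to_pkl helper:
import os
from utils import dump_to_pkl

def resplit_pairs(pairs, out_dir, num_shards):
    # Split the (pair -> count) dict across num_shards pickle files.
    os.makedirs(out_dir, exist_ok=True)
    shards = [dict() for _ in range(num_shards)]
    for key, count in pairs.items():
        shards[hash(key) % num_shards][key] = count
    for i, shard in enumerate(shards):
        dump_to_pkl(shard, os.path.join(out_dir, 'pairs_%d.pkl' % i))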
import os

from utils import load_from_pkl, dump_to_pkl

# merge all per-file pair counts under data/pair into a single dict
path = 'data/pair'
files = os.listdir(path)
pairs = dict()
for file in files:
    if not os.path.isdir(os.path.join(path, file)):
        pair_file_path = os.path.join(path, file)
        pair = load_from_pkl(pair_file_path)
        if len(pairs) == 0:
            pairs = pair
        else:
            for key in pair.keys():
                if key in pairs:
                    pairs[key] += pair[key]
                else:
                    pairs[key] = pair[key]

output_file_name = 'data/pairs'
dump_to_pkl(pairs, output_file_name)
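# The merge loop above amounts to summing per-file counts key by key. A more
# compact alternative sketch using collections.Counter; behavior is identical
# as long as the pair values are numeric counts:
import os
from collections import Counter

from utils import load_from_pkl, dump_to_pkl

path = 'data/pair'
pairs = Counter()
for file in os.listdir(path):
    full = os.path.join(path, file)
    if not os.path.isdir(full):
        pairs.update(load_from_pkl(full))  # adds counts for shared keys
dump_to_pkl(dict(pairs), 'data/pairs')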
def __init__(
    self,
    input_file_name,
    input_wvectors,
    input_cvectors,
    input_ps,
    input_ns,
    output_file_name,
    emb_dimension=100,
    batch_size=50,
    window_size=5,
    kn=20,
    iteration=1,
    initial_lr=0.001,
    clip=1.0,
    min_count=30,
    batch_num_to_valid=100000,
):
    """Initialize class parameters.

    Args:
        input_file_name: Name of a text data file. Each line is a sentence split on spaces.
        input_wvectors: Pretrained word vectors.
        input_cvectors: Pretrained context vectors.
        input_ps: Pretrained positive samples.
        input_ns: Pretrained negative samples.
        output_file_name: Name of the final embedding file.
        emb_dimension: Embedding dimension, typically from 50 to 500.
        batch_size: The count of word pairs for one forward pass.
        window_size: Max skip length between words.
        kn: Number of k nearest neighbors.
        iteration: Number of training iterations.
        initial_lr: Initial learning rate.
        clip: Gradient clipping threshold.
        min_count: The minimal word frequency; words with lower frequency are filtered out.

    Returns:
        None.
    """
    self.data = InputData(input_file_name, min_count)
    self.pre_wvectors = InputVector(input_wvectors)
    self.pre_cvectors = InputVector(input_cvectors)
    self.ps_w = load_from_pkl(input_ps)
    self.ns_w = load_from_pkl(input_ns)
    # map the word-keyed sample dicts onto vocabulary ids
    self.ps = convert_word_to_id(self.ps_w, self.data.word2id)
    self.ns = convert_word_to_id(self.ns_w, self.data.word2id)
    self.output_file_name = output_file_name
    self.emb_size = len(self.data.word2id)
    self.emb_dimension = emb_dimension
    self.batch_size = batch_size
    self.window_size = window_size
    self.kn = kn
    self.iteration = iteration
    self.initial_lr = initial_lr
    self.clip = clip
    self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension,
                                         self.pre_wvectors, self.pre_cvectors)
    self.use_cuda = torch.cuda.is_available()
    if self.use_cuda:
        self.skip_gram_model.cuda()
    self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                               lr=self.initial_lr)
    self.batch_num_to_valid = batch_num_to_valid
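# convert_word_to_id is used above but not defined in this section. A
# minimal sketch, assuming ps_w / ns_w map each word to a list of sample
# words and that out-of-vocabulary words should simply be dropped:

def convert_word_to_id(word_dict, word2id):
    # Map a {word: [words]} dict to a {id: [ids]} dict via word2id.
    result = dict()
    for word, samples in word_dict.items():
        if word not in word2id:
            continue
        result[word2id[word]] = [word2id[s] for s in samples if s in word2id]
    return result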
def output_plots(
    df_list_path: str,
    var_to_viz: str = "temp_f",
    save_path: str = "./data/images/",
    figsize: Tuple[float, float] = (5., 7.5),
    color_range: Tuple[int, int] = (0, 100),
    cmap: str = "RdYlGn_r",
) -> None:
    """Makes a plot for each timestep in the input data.

    df_list_path: Path to saved data from pull_data.py
    var_to_viz: Which variable to visualize
    save_path: Directory to save plots
    figsize: Size of output plots
    color_range: Min and max values for the colorbar
    cmap: Color mapping for plots and colorbar. "RdYlGn_r" and "plasma" work best
    """
    save_path += f"{var_to_viz}/"
    df_list = load_from_pkl(df_list_path)
    print(f"Saving images for {len(df_list)} dfs")
    for now in df_list:
        df = df_list[now]
        if df.empty:
            print(f'No data for {now}')
            continue
        fl_name = f"{now}_map.png"
        # skip this timestep if its image already exists
        already_saved = False
        for (_, _, fls) in walk(save_path):
            if fl_name in fls:
                already_saved = True
        if already_saved:
            continue
        # Create the figure and the axes
        fig = plt.figure(figsize=figsize)
        ax = fig.add_subplot(1, 1, 1, projection=ccrs.PlateCarree())
        # lat and lon coords to plot
        # extent = [-128, -65, 23, 50.5]  # whole USA
        extent = [-123, -121.75, 37, 38.75]  # Bay Area
        # extent = [-122.65, -122.15, 37.6, 38.]  # SF city area
        df = df[df["lon"].between(extent[0], extent[1])]
        df = df[df["lat"].between(extent[2], extent[3])]
        lat = df["lat"].values
        lon = df["lon"].values
        # Variable from which to generate the color gradient
        if var_to_viz in ["pm_2.5", "PM2.5_CF_ATM_ug/m3"]:
            colors = df[var_to_viz].fillna(-1).apply(aqi_from_pm).values
        elif var_to_viz == "temp_f":
            # apply a fixed -8 degree F calibration offset
            colors = df[var_to_viz].values - 8
        else:
            colors = df[var_to_viz]
        # Display some map info
        ax.set_extent(extent)
        land10m = cfeature.NaturalEarthFeature(
            "physical",
            "land",
            "10m",
            edgecolor="black",
            facecolor="lightgray",
            linewidth=0.5,
        )
        ax.add_feature(land10m)
        # Add scatter points for each coordinate pair
        c_min, c_max = color_range[0], color_range[1]
        scatter = ax.scatter(
            lon,
            lat,
            marker="o",
            c=colors,
            cmap=cmap,
            zorder=5,
            s=5,
            vmin=c_min,
            vmax=c_max,
        )
        # Add the colorbar scale
        plt.colorbar(scatter, fraction=0.06542, pad=0).set_label(
            f"{title_dict[var_to_viz]}", rotation=90
        )
        ax.set_facecolor("lightblue")
        plt.title(f"{title_dict[var_to_viz]} at {now}")
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        plt.savefig(save_path + fl_name, dpi=300, bbox_inches="tight")
        plt.close()
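# Example invocation of output_plots, assuming pull_data.py has already
# written a {timestamp: DataFrame} dict to the pickle below (the path and
# the color range are illustrative):
output_plots(
    df_list_path="./data/df_list.pkl",
    var_to_viz="temp_f",
    save_path="./data/images/",
    color_range=(30, 110),  # a plausible temperature range; tune per variable
)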
        X_train = pd.concat([
            X_train,
            pd.read_feather(f'{FEATURE_DIR}/{feat}_train.feather')
        ], axis=1)
        X_test = pd.concat([
            X_test,
            pd.read_feather(f'{FEATURE_DIR}/{feat}_test.feather')
        ], axis=1)

    X_train = reduce_mem_usage(X_train)
    X_test = reduce_mem_usage(X_test)
    save_as_pkl(X_train, f'X_train_{EXP_NAME}_.pkl')
    save_as_pkl(X_test, f'X_test_{EXP_NAME}_.pkl')
elif skip_fr is True:
    X_train = load_from_pkl('X_train_task1_xgb_fs100_meta_.pkl')
    X_test = load_from_pkl('X_test_task1_xgb_fs100_meta_.pkl')
    cat_features_index = []

mlflow.set_experiment(EXP_NAME)
mlflow.start_run()
run_id = mlflow.active_run().info.run_id

with t.timer(f'load folds: {FOLD_NAME}-{FOLD_NUM}'):
    folds = pd.read_csv(
        f'{FOLD_DIR}/train_folds_{FOLD_NAME}{FOLD_NUM}_RS{RANDOM_STATE}.csv'
    )

with t.timer('train XGB'):
    logging.info(f'Num. of Samples: {len(X_train)}')
        for seq in [t[0] for t in train_feat] + [t[0] for t in test_feat]:
            tmp.extend(seq)
        unique_num = len(set(tmp)) + 1
        unique_num_dic[feat] = unique_num

    print('Unique Num', unique_num_dic)
    print('Feature index', feature_index)
    save_as_pkl(X_train, f'X_train_{EXP_NAME}.pkl')
    save_as_pkl(X_test, f'X_test_{EXP_NAME}.pkl')
    save_as_pkl(unique_num_dic, f'unique_num_dic_{EXP_NAME}.pkl')
    save_as_pkl(feature_index, f'feature_index_{EXP_NAME}.pkl')
elif skip_fr is True:
    X_train = load_from_pkl(f'X_train_{EXP_NAME}.pkl')
    X_test = load_from_pkl(f'X_test_{EXP_NAME}.pkl')
    unique_num_dic = load_from_pkl(f'unique_num_dic_{EXP_NAME}.pkl')
    feature_index = load_from_pkl(f'feature_index_{EXP_NAME}.pkl')

X_train = X_train.fillna(0.0)
X_test = X_test.fillna(0.0)

with t.timer(f'load folds: {FOLD_NAME}-{FOLD_NUM}'):
    folds = pd.read_csv(
        f'{FOLD_DIR}/train_folds_{FOLD_NAME}{FOLD_NUM}_RS{RANDOM_STATE}.csv'
    )

mlflow.set_experiment(EXP_NAME)
mlflow.start_run()
run_id = mlflow.active_run().info.run_id
If all procedures are within the scope of the same pregnancy, treat them as a
single row and find matches from control once.

Data validation:
1. Rows in experiment should be removed from control.
2. Rows in experiment without parity should be removed.

Other:
* Rows colored purple are manually created couplings; they can be used to
  test the final flow. The comparison should be "contains", not exact.

The 'experiment' sheet in experiment.xlsx lists the selected rows.
"""

# load the files, caching the parsed frames in a pickle to speed up reruns
pkl_pth = 'temp_12_2018.pkl'
if path.exists(pkl_pth):
    data = load_from_pkl(pkl_pth)
    control = data['control']
    experiment = data['experiment']
else:
    control = pd.read_excel('control_2.xlsx').dropna(how='all')
    # control = pd.read_excel('control_2.xlsx',
    #     parse_dates=['maternal_birth_date', 'neonatal_birth_date']).dropna(how='all')
    # experiment = pd.read_excel('experiment_2.xlsx', sheet_name='experiment',
    #     parse_dates=['BirthDate']).dropna(how='all')
    experiment = pd.read_excel('experiment_2.xlsx').dropna(how='all')
    save_to_pkl(pkl_pth, control=control, experiment=experiment)


def create_new_index(df):
    """
    Removes rows without parity, and creates a full name_id_parity index as a
    'new_index' column.
    :param df:
    :return: