def preprocess():
    dataset = read_dataset('segment')
    data = dataset['data']
    df = pd.DataFrame(data)

    # Keep the class labels for supervised evaluation, then drop them together
    # with 'region-pixel-count' from the feature matrix.
    y = df['class'].copy()
    df = df.drop(columns=['class', 'region-pixel-count'])

    # Scale all features to [0, 1].
    scaler = MinMaxScaler()
    X_num = scaler.fit_transform(df)
    df_X_num = pd.DataFrame(X_num, columns=df.columns)

    # Discretize every feature into (at most) 5 quantile-based bins.
    df_X_cat = df_X_num.copy()
    for col in list(df.columns):
        df_X_cat[col] = pd.qcut(x=df_X_num[col], q=5, duplicates='drop')

    df_X_num.to_csv(os.path.join('datasets', 'segment_clean_num.csv'), index=False)
    df_X_cat.to_csv(os.path.join('datasets', 'segment_clean_cat.csv'), index=False)
    y.to_csv(os.path.join('datasets', 'segment_clean_y.csv'), index=False, header=False)

    return 'segment_clean_num.csv', 'segment_clean_cat.csv', 'segment_clean_y.csv'
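# The preprocess() above relies on a read_dataset() helper that is not shown in
# this snippet. A minimal sketch of what it might look like, assuming the
# 'segment' data lives as an ARFF file under datasets/ -- the path layout and
# ARFF format are assumptions for illustration, not the project's confirmed API.
import os
from scipy.io import arff

def read_dataset(name, dataset_path='datasets'):
    # arff.loadarff returns (data, meta); the dict mirrors the dataset['data']
    # access used in preprocess() above.
    data, meta = arff.loadarff(os.path.join(dataset_path, name + '.arff'))
    return {'data': data, 'metadata': meta}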
def read_data(name: str) -> List[dict]:
    folds = []
    preprocess = preprocess_hypothyroid if name == 'hypothyroid' else preprocess_penn
    for i in tqdm(range(10), desc=f'Reading {name} dataset', ncols=150):
        train_data = read_dataset(name=f'{name}.fold.00000{i}.train',
                                  dataset_path=os.path.join('datasets', name))
        validation_data = read_dataset(name=f'{name}.fold.00000{i}.test',
                                       dataset_path=os.path.join('datasets', name))
        (X_train, y_train), (X_val, y_val) = preprocess(train_data, validation_data)
        folds.append({
            'X_train': X_train,
            'y_train': y_train,
            'X_val': X_val,
            'y_val': y_val
        })
    return folds
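# A hedged usage sketch of consuming the folds returned by read_data() for a
# simple cross-validated evaluation. The classifier choice (scikit-learn
# KNeighborsClassifier) and the helper name are illustrative only, not the
# project's actual model or API.
from sklearn.neighbors import KNeighborsClassifier

def cross_validate(name='hypothyroid'):
    scores = []
    for fold in read_data(name):
        clf = KNeighborsClassifier(n_neighbors=5)
        clf.fit(fold['X_train'], fold['y_train'])
        scores.append(clf.score(fold['X_val'], fold['y_val']))
    return sum(scores) / len(scores)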
def preprocess():
    DATASET = 'connect-4'
    data = read_dataset(DATASET)
    df = pd.DataFrame(data['data'])

    # Subset needed for executing some of the algorithms on our computers!
    df = df.sample(n=5000, replace=False, random_state=1).reset_index(drop=True)

    # Since we are still doing unsupervised methods (clustering), we will ignore
    # the labels y... except for supervised evaluation.
    df = df.applymap(lambda x: x.decode('utf-8'))  # decode values to unicode strings instead of bytes

    X = df.loc[:, df.columns != 'class']
    y = df['class'].copy()  # for supervised evaluation of clustering

    # For all variables in X, the domain is ['b', 'o', 'x']. However, we will
    # check it programmatically. Also, even if the dataset is supposed to have
    # no missing values, we will check that as well, just in case.
    X_categories = set()
    for index, row in X.iterrows():
        for col_val in row:
            X_categories.add(col_val)
    # {'b', 'o', 'x'}, so the domain is confirmed. Also, no missing values,
    # because otherwise we would see None or other values.

    # Recall that 'x' means the cell holds a disk belonging to player 'x',
    # 'o' means the cell holds a disk belonging to player 'o', and 'b' means
    # the cell is empty (blank).

    # Instead of one-hot encoding, we apply a label encoding with [0, 0.5, 1].
    # The reason is that 'x' and 'o' are antagonists and 'b' is the neutral
    # value, so there is some kind of natural order. This way we avoid one-hot
    # encoding, which would increase the number of columns. Since all variables
    # share the same domain, we should be consistent with the encoding: 'x' is
    # always encoded as 0 and 'o' is always encoded as 1.
    # X_encoded = X.apply(LabelEncoder().fit_transform)
    # LabelEncoder works alphabetically with range [0, n_classes - 1], so 'b'
    # would be encoded as 0, 'o' as 1, and 'x' as 2, which is not the intended
    # outcome for us. It has no additional parameters, so we apply our own encoder:
    def recode(x):
        recode_map = {'x': 0, 'b': 0.5, 'o': 1}
        return recode_map[x]

    X_encoded = X.applymap(recode)

    # Save the cleaned/encoded X as CSVs for later. y is needed for supervised evaluation.
    X.to_csv(os.path.join('datasets', 'connect_4_clean.csv'), index=False)
    X_encoded.to_csv(os.path.join('datasets', 'connect_4_clean_num.csv'), index=False)
    y.to_csv(os.path.join('datasets', 'connect_4_clean_y.csv'), index=False, header=False)

    return 'connect_4_clean.csv', 'connect_4_clean_num.csv', 'connect_4_clean_y.csv'
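# A small illustrative check of the recode mapping above (toy data, not part of
# the original pipeline): the antagonists 'x' and 'o' map to the extremes and
# the blank cell 'b' to the midpoint, which an alphabetical LabelEncoder
# (b=0, o=1, x=2) would not give.
import pandas as pd

toy = pd.DataFrame({'a1': ['x', 'o', 'b'], 'a2': ['b', 'b', 'o']})
print(toy.applymap(lambda v: {'x': 0, 'b': 0.5, 'o': 1}[v]))
#     a1   a2
# 0  0.0  0.5
# 1  1.0  0.5
# 2  0.5  1.0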
logger.info("Use GPU with index %s as target slu device" % (opt.deviceIds[0])
            if opt.deviceIds[0] >= 0 else "Use CPU as target slu torch device")
logger.info("Use GPU with index %s as target nlg device" % (opt.deviceIds[1])
            if opt.deviceIds[1] >= 0 else "Use CPU as target nlg torch device")

##### Vocab and Dataset Reader #####
slu_vocab, nlg_vocab = Vocab(dataset=opt.dataset, task='slu'), Vocab(dataset=opt.dataset, task='nlg')
lm_vocab = Vocab(dataset=opt.dataset, task='lm')
slu_evaluator, nlg_evaluator = Evaluator.get_evaluator_from_task(task='slu', vocab=slu_vocab), \
    Evaluator.get_evaluator_from_task(task='nlg', vocab=nlg_vocab)

if not opt.testing:
    train_dataset, dev_dataset = read_dataset(opt.dataset, choice='train'), read_dataset(opt.dataset, choice='valid')
    labeled_dataset, unlabeled_dataset = split_dataset(train_dataset, opt.labeled)
    logger.info("Labeled/Unlabeled train and dev dataset size is: %s/%s and %s"
                % (len(labeled_dataset), len(unlabeled_dataset), len(dev_dataset)))
    unlabeled_dataset = labeled_dataset + unlabeled_dataset
test_dataset = read_dataset(opt.dataset, choice='test')
logger.info("Test dataset size is: %s" % (len(test_dataset)))

##### Model Construction and Init #####
if not opt.testing:
    params = vars(opt)
    json.dump(params, open(os.path.join(exp_path, 'params.json'), 'w'), indent=4)
from classifier import train_classifier
from features import extract_features, flatten_features
from utils.dataset import read_dataset
from utils.preprocessing import preprocess_dataframe, extract_labels, decipher_labels, oversample_minority_classes
from utils.scoring import print_cv_score, evaluate_submission
from utils.splits import generate_hold_out_split

if __name__ == "__main__":
    print('Reading data...')
    raw_data = read_dataset('data')
    data = preprocess_dataframe(raw_data, 'raw_data')
    labels = extract_labels(data)

    print('Extracting features...')
    features = extract_features(data, raw_data)

    print('Flattening features...')
    flattened_features = flatten_features(features)

    print('Generating hold-out split...')
    training_data, testing_data, unused_data = generate_hold_out_split(raw_data)
    training_features, testing_features = flattened_features.iloc[training_data.index], flattened_features.iloc[testing_data.index]
    training_labels, testing_labels = labels.iloc[training_data.index], labels.iloc[testing_data.index]

    print('Oversampling minority classes...')
    oversampled_training_features, oversampled_training_labels = oversample_minority_classes(training_features, training_labels)

    print('Training classifier...')
    classifier = train_classifier(oversampled_training_features, oversampled_training_labels)

    print('Cross-validating...')
def preprocess():
    dataset = read_dataset('adult')
    data = dataset['data']
    df = pd.DataFrame(data)
    df = df.sample(n=5000, replace=False, random_state=1).reset_index(drop=True)
    df = df.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)

    # Real y labels
    y = df['class'].copy()
    df = df.drop(columns=['class'])

    categorical_features = ['workclass', 'education', 'marital-status', 'occupation',
                            'relationship', 'race', 'sex', 'native-country']
    numerical_features = ['age', 'fnlwgt', 'education-num', 'capital-gain',
                          'capital-loss', 'hours-per-week']

    # Encode categorical values into numerical with one-hot encoding
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
    X_categorical = ohe.fit_transform(df[categorical_features])
    columns = ohe.get_feature_names(input_features=categorical_features)
    X_categorical = pd.DataFrame(data=X_categorical, columns=columns)

    # Scale numerical values
    sc = MinMaxScaler()
    X_numerical = sc.fit_transform(df[numerical_features])
    X_numerical = pd.DataFrame(data=X_numerical, columns=numerical_features)

    # All numerical features to categorical (quantile binning)
    X_numerical_as_categorical = X_numerical.copy()
    for feat in numerical_features:
        X_numerical_as_categorical[feat] = pd.qcut(x=X_numerical[feat], q=5, duplicates='drop')

    # Mixed data
    X_df = pd.concat((df[categorical_features], X_numerical), axis=1)
    # Numerical-only data
    X_df_num = pd.concat((X_categorical, X_numerical), axis=1)
    # Categorical-only data
    X_df_cat = pd.concat((df[categorical_features], X_numerical_as_categorical), axis=1)

    X_df.to_csv(os.path.join('datasets', 'adult_clean.csv'), index=False)
    X_df_num.to_csv(os.path.join('datasets', 'adult_clean_num.csv'), index=False)
    X_df_cat.to_csv(os.path.join('datasets', 'adult_clean_cat.csv'), index=False)
    y.to_csv(os.path.join('datasets', 'adult_clean_y.csv'), index=False, header=False)

    return 'adult_clean_num.csv', 'adult_clean_cat.csv', 'adult_clean.csv', 'adult_clean_y.csv'
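# Note: the adult preprocess() above targets an older scikit-learn API. On
# releases >= 1.2, OneHotEncoder's `sparse=` argument is `sparse_output=` and
# `get_feature_names()` has been replaced by `get_feature_names_out()`. A small
# self-contained sketch of the newer calls on toy data (illustrative only):
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

toy = pd.DataFrame({'sex': ['Male', 'Female', 'Female'],
                    'race': ['White', 'Black', 'White']})
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded = ohe.fit_transform(toy)
print(pd.DataFrame(encoded, columns=ohe.get_feature_names_out()))
#    sex_Female  sex_Male  race_Black  race_White
# 0         0.0       1.0         0.0         1.0
# 1         1.0       0.0         1.0         0.0
# 2         1.0       0.0         0.0         1.0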
def Name_Replace(namelist, sen):
    for name in namelist.strip().split(' '):
        index = 0
        if len(name) >= 3:
            while index < len(sen):
                if index < len(sen) - 2 and sen[index] == name[0] and sen[index + 1] == name[1] \
                        and sen[index + 2] == name[2]:
                    tmp = name[1] + name[2]
                    sen = sen[:index + 1] + [tmp] + sen[index + 3:]
                    index += 1
                index += 1
    return sen


if __name__ == '__main__':
    print(time.strftime('%Y-%m-%d %H:%M:%S'))
    train_dataset = read_dataset()
    test_dataset = read_dataset(TEST_FILE2)
    hmm = HMM()
    hmm.fit(train_dataset)
    Pner = PlaceRec()
    Numner = NumRec()
    cname = CNNAME()
    cname.fit()
    separator = ' '
    f = open(OUT_PUT, 'wb')
    # Python 2 source: the ur"..." raw-unicode string literals below are only valid in Python 2.
    re_han = re.compile(ur"([\u4E00-\u9FA5\u25cb]+)")
    re_skip = re.compile(ur"^[\uff0d\-{0,1}a-zA-Z0-9\uff10-\uff19\u2014\uff21-\uff3a\uff41-\uff5a\u2026\u25cb\\.]$")
    print(time.strftime('%Y-%m-%d %H:%M:%S'))
    print('Start seg...')
def create_transforms(additional):
    res = list(additional)
    # add necessary transformations
    res.extend([
        A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
        ChannelTranspose()
    ])
    res = A.Compose(res)
    return res


if __name__ == '__main__':
    df = read_dataset(
        '/mnt/HDD/home/druzhinin/kaggle/kaggle_severstal/dataset/train.csv',
        '/mnt/HDD/home/druzhinin/kaggle/kaggle_severstal/dataset/train_images')

    # Different transforms for TTA wrapper
    transforms = [[], [A.HorizontalFlip(p=1)], [A.VerticalFlip(p=1)],
                  [A.HorizontalFlip(p=1), A.VerticalFlip(p=1)]]
    transforms = [create_transforms(t) for t in transforms]

    device = 'cuda'

    print('resnet34-class01')
    model = torch.jit.load(
        '/mnt/HDD/home/druzhinin/kaggle/kaggle_severstal/download/resnet34-class01/torchscript.pth',
        map_location=device)
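# A hedged sketch of how the TTA transform list above might be consumed: run the
# TorchScript classifier on every augmented view of an image and average the
# sigmoid outputs. The helper name and the HWC numpy input convention are
# assumptions for illustration; the original TTA wrapper is not shown here.
import numpy as np
import torch

def tta_predict(model, image, tta_transforms, device='cuda'):
    preds = []
    with torch.no_grad():
        for t in tta_transforms:
            # each Compose normalizes and (via ChannelTranspose) moves channels first
            x = t(image=image)['image']
            x = torch.from_numpy(x).unsqueeze(0).float().to(device)
            preds.append(torch.sigmoid(model(x)).cpu().numpy())
    return np.mean(preds, axis=0)

# usage (illustrative): probs = tta_predict(model, img, transforms)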
progress = 0
with open(EMBEDDING(dataset), 'w') as out_file:
    for word in vocab:
        progress += 1
        vector = word_embed.emb(word) + char_embed.emb(word)
        string = ' '.join([str(v) for v in vector])
        out_file.write(word + ' ' + string + '\n')
        if progress % 1000 == 0:
            print("Retrieve 400-dim GK Embedding for the", progress, "-th word ...")
print('In total, process %d words in %s' % (len(vocab), dataset))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', required=True, nargs='+')
    parser.add_argument('--mwf', type=int, default=1, help='minimum word frequency')
    args = parser.parse_args(sys.argv[1:])

    for d in args.dataset:
        print('\nStart processing domain %s ...' % (d))
        ex_list = read_dataset(d, 'train') + read_dataset(d, 'valid') + read_dataset(d, 'test')
        words, bios, slots, intents = construct_vocab(d, ex_list, args.mwf)
        construct_database_and_com(d, ex_list)
        get_pretrained_embeddings(d, words, slots, intents)
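# A hedged sketch of reading the embedding file written above back into memory:
# one word per line followed by its space-separated float components. The loader
# mirrors the writer loop; the helper name is illustrative, not part of the project.
import numpy as np

def load_embeddings(path):
    emb = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip('\n').split(' ')
            emb[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    return emb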
def Name_Replace(namelist, sen):
    for name in namelist.strip().split(' '):
        index = 0
        if len(name) >= 3:
            while index < len(sen):
                if index < len(sen) - 2 and sen[index] == name[0] and sen[index + 1] == name[1] \
                        and sen[index + 2] == name[2]:
                    tmp = name[1] + name[2]
                    sen = sen[:index + 1] + [tmp] + sen[index + 3:]
                    index += 1
                index += 1
    return sen


if __name__ == '__main__':
    print(time.strftime('%Y-%m-%d %H:%M:%S'))
    train_dataset = read_dataset()
    hmm = HMM()
    hmm.fit(train_dataset)
    Pner = PlaceRec()
    Numner = NumRec()
    cname = CNNAME()
    cname.fit()
    separator = ' '
    test_sentence = open(TEST_FILE, 'rb')
    # Python 2 source: the ur"..." raw-unicode string literals below are only valid in Python 2.
    re_han = re.compile(ur"([\u4E00-\u9FA5\u25cb]+)")
    re_skip = re.compile(ur"^[\uff0d\-{0,1}a-zA-Z0-9\uff10-\uff19\u2014\uff21-\uff3a\uff41-\uff5a\u2026\u25cb\\.]$")
    print(time.strftime('%Y-%m-%d %H:%M:%S'))
    print('Start seg...')
    for line in test_sentence:
#                             catalyst=False,
#                             pin_memory=False,
#                             binary=True,
#                             multi=False)
#
# model = mobilenetv3(1).cuda().eval()
#
# state = torch.load('/home/druzhinin/HDD/kaggle/kaggle_severstal/logdir/1.6.mobilenet_multi/binary/checkpoints/best.pth')
#
# model.load_state_dict(state['model_state_dict'])
#
# del state
# model = model.eval()
# find_best_threshold_binary(np.arange(0.05, 1, 0.05), model, dataloader)
# ------------------------------------------------------------------------------------------------------------------------------------------

df = read_dataset(
    '../dataset/train.csv',
    '../dataset/train_images',
)
# df = df.dropna(subset=[1, 2, 3, 4], how='all')

dataloader = get_dataloader(df,
                            transforms,
                            batch_size=2,
                            shuffle=False,
                            num_workers=6,
                            phase='valid',
                            catalyst=False,
                            pin_memory=False,
                            binary=False,
                            multi=False)

from stage_experiments.transforms_1_7.model import Model

# Load model
def Name_Replace(namelist, sen):
    for name in namelist.strip().split(' '):
        index = 0
        if len(name) >= 3:
            while index < len(sen):
                if index < len(sen) - 2 and sen[index] == name[0] and sen[index + 1] == name[1] \
                        and sen[index + 2] == name[2]:
                    tmp = name[1] + name[2]
                    sen = sen[:index + 1] + [tmp] + sen[index + 3:]
                    index += 1
                index += 1
    return sen


if __name__ == '__main__':
    print(time.strftime('%Y-%m-%d %H:%M:%S'))
    train_dataset = read_dataset()
    test_dataset = read_dataset(TEST_FILE2)
    hmm = HMM()
    hmm.fit(train_dataset)
    Pner = PlaceRec()
    Numner = NumRec()
    cname = CNNAME()
    cname.fit()
    separator = ' '
    f = open(OUT_PUT, 'wb')
    # Python 2 source: the ur"..." raw-unicode string literals below are only valid in Python 2.
    re_han = re.compile(ur"([\u4E00-\u9FA5\u25cb]+)")
    re_skip = re.compile(ur"^[\uff0d\-{0,1}a-zA-Z0-9\uff10-\uff19\u2014\uff21-\uff3a\uff41-\uff5a\u2026\u25cb\\.]$")
    print(time.strftime('%Y-%m-%d %H:%M:%S'))
    print('Start seg...')
    for line in test_dataset:
        res = ''
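# A small illustrative run of Name_Replace() above (toy data, Python 2 style
# unicode literals): a recognized three-character name from the whitespace-
# separated name list gets its last two characters merged back into one token.
sen = [u'张', u'三', u'丰', u'来', u'了']
print(Name_Replace(u'张三丰', sen))
# tokens become ['张', '三丰', '来', '了']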