Example #1
def infer(text: str, model: CharCNN) -> str:
    """
    Predict the sentiment of a string with an existing model and
    return a human-readable label.
    """
    out_classes = {0: "Poor", 1: "Average", 2: "Good"}
    # Apply the same preprocessing and quantization used at training time
    text_prep = preprocessing(text, infer=True)
    text_quant = TextDataset.quant_text(text_prep)
    text_quant = Tensor(text_quant.to_numpy()).unsqueeze(0)  # add a batch dimension
    prediction = predict(model(text_quant)).item()
    output = out_classes[prediction]
    return f"Predicted Sentiment: {output}\n"
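
A minimal usage sketch for this helper; the checkpoint filename, the bare CharCNN() constructor call, and the example sentence are assumptions for illustration, not taken from the snippet:

import torch

model = CharCNN()                                # assumed default constructor
model.load_state_dict(torch.load("charcnn.pt"))  # hypothetical checkpoint path
model.eval()

with torch.no_grad():
    print(infer("The product exceeded my expectations", model))
# Example output (not guaranteed): Predicted Sentiment: Good
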
Example #2
    work_dir = config['work_dir']
    mismatch = config['mismatch']
    primer_file = config['primer_file']
    vector_file = config['vector_file']
    logger_level = config['logger_level']
    # Create the working directory, initialize logging, and load the samples' barcodes and identifiers
    make_output_dir(work_dir)
    init_logger(work_dir, logger_level)
    barcodes, idents = load_barcode(bc_file)

    logging.debug('Loaded barcodes: %s', barcodes)
    try:
        for step in step_list:
            if step == 1:
                # Demultiplex samples by barcode
                preprocessing(fw_file, rc_file, work_dir, barcodes, mismatch)
                idents = check_pre_file(work_dir, idents)
                # Filter samples by read count and convert FASTQ to FASTA
                fastq2fasta(work_dir, idents, is_gzip, maxsize)
                logging.info('STEP 1: PREPROCESSING COMPLETED')
            elif step == 2:
                # Remove primer sequences
                suffix = 'fasta.gz' if is_gzip else 'fasta'
                idents = check_output_files(work_dir, idents, step, suffix)
                remove_primer(work_dir, primer_file, idents, is_gzip)
                logging.info('STEP 2: PRIMER SEQUENCES REMOVED')
            elif step == 3:
                # Remove vector sequences
                suffix = 'clean.fasta.gz' if is_gzip else 'clean.fasta'
                idents = check_output_files(work_dir, idents, step, suffix)
                remove_vector(work_dir, vector_file, idents, is_gzip)
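
This fragment reads from a `config` mapping and uses names such as `bc_file`, `fw_file`, `rc_file`, `step_list`, `is_gzip`, and `maxsize` that are defined elsewhere. A sketch of what that configuration might look like; every key and value below is an assumption for illustration:

# Hypothetical configuration for the pipeline above; all values are illustrative.
config = {
    'work_dir': './run01',          # working directory created by make_output_dir
    'mismatch': 1,                  # barcode mismatches tolerated during binning
    'primer_file': 'primers.fa',    # primers stripped in step 2
    'vector_file': 'vectors.fa',    # vector sequences stripped in step 3
    'logger_level': 'DEBUG',
}
step_list = [1, 2, 3]               # run all three steps in order
is_gzip = True                      # gzip-compress intermediate FASTA files
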
Example #3
    ([
        'top_' + str(p) + '_building'
        for p in [1, 2, 5, 10, 15, 20, 25, 30, 50]
    ], None),  # Leak on CV
    ('time_stamp', None)  # Magic feature
]

# LightGBM currently core-dumps on categorical data, so it is deactivated in the transformer

################ Preprocessing #####################
cache_file = './cache.db'
cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=1337)
folds = list(cv.split(X, y))

# Sorry for the spaghetti code
x_trn, x_val, y_trn, y_val, labelencoder, X_train, X_test, y_train = preprocessing(
    X, X_test, y, tr_pipeline, select_feat, folds, cache_file)

############ Train and Validate ####################
print("############ Final Classifier ######################")
clf, metric, n_stop = training_step(x_trn, x_val, y_trn, y_val, X_train,
                                    y_train, folds)

################## Predict #########################
output(X_test, idx_test, clf, labelencoder, n_stop, metric)

with open(
        './out/' + time.strftime("%Y-%m-%d_%H%M") + '-valid' + str(metric) +
        '-features.txt', "w") as text_file:
    for item in select_feat:
        text_file.write("{}\n".format(item))
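
For reference, the `folds = list(cv.split(X, y))` call above yields one `(train_idx, valid_idx)` pair of index arrays per fold. A self-contained toy illustration; the demo data here is invented, only the StratifiedKFold settings mirror the snippet:

import numpy as np
from sklearn.model_selection import StratifiedKFold

X_demo = np.arange(28).reshape(14, 2)          # 14 toy samples, 2 features
y_demo = np.array([0] * 7 + [1] * 7)           # two balanced classes

cv_demo = StratifiedKFold(n_splits=7, shuffle=True, random_state=1337)
for train_idx, valid_idx in cv_demo.split(X_demo, y_demo):
    print(train_idx, valid_idx)                # each fold keeps the 0/1 ratio
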
Example #4
if __name__ == "__main__":

    (data_paths, prep_params, quant_params, data_params, model_params,
     pipe_params) = get_default_params()

    # Set up TensorBoard; `config` is assumed to be defined elsewhere in the script
    writer = SummaryWriter(config['log_path'])

    print("Preparing Data")
    train_df, test_df = preprocessing(data=None, **data_paths, **prep_params)

    # Stratified-shuffle the test set so that each validation batch contains
    # every class in roughly the same proportion as the original test set
    test_df = stratified_shuffle(test_df, model_params['batch_size'],
                                 prep_params['label_col'], 10)

    # Data loading
    train_dl, test_dl = data_loading(train_df, test_df,
                                     prep_params['text_col'],
                                     prep_params['label_col'], **quant_params,
                                     **data_params)

    print("Start Training")
    # Model instantiation
    device = get_default_device()
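
The snippet ends before the model is instantiated, and `get_default_device` is not defined in any of these examples. A plausible implementation, assuming it simply prefers the GPU when one is available:

# Assumed implementation of get_default_device; the original is not shown.
import torch

def get_default_device() -> torch.device:
    """Return the CUDA device if available, otherwise the CPU."""
    return torch.device('cuda' if torch.cuda.is_available() else 'cpu')
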
Example #5
METADATA_DIR = 'data/csv/'
METADATA = 'df_crop_' + OBJECT_NAME

df_metadata = pd.read_csv(path.join(sys.path[1], METADATA_DIR, METADATA))

image_names = df_metadata['filename'].tolist()

result = []

with ProgressBar(max_value=len(image_names)) as bar:
    bar.update(0)

    for i, image_name in enumerate(image_names):

        image = cv2.imread(path.join(sys.path[1], IMAGES_DIR, image_name))
        image = preprocessing(image=image)

        predicted_text = recognize_text(image=image)
        target_text = df_metadata[df_metadata['filename'] ==
                                  image_name]['attribute'].tolist()[0]

        score = text_recognition_score(target_text=target_text,
                                       predicted_text=predicted_text)

        result.append(
            [image_name, target_text, predicted_text, score[0], score[1]])

        bar.update(i + 1)  # i is zero-based, so the bar completes on the last image

df_result = pd.DataFrame(data=result,
                         columns=[