Example #1
def cv_1fold(df, df_store, with_pca=True):
    train = df.copy().iloc[::-1]
    train.Date = pd.to_datetime(train.Date)
    train_set = train[train.Date < '2015-06-19']
    test_set = train[train.Date >= '2015-06-19']
    reg_model = Regressor()
    X_train, y_train, X_PCA_train = pp.Preprocessor().transform(
        df_store, train_set)
    X_test, y_test, X_PCA_test = pp.Preprocessor().transform(
        df_store, test_set)
    if with_pca:
        X_train = X_PCA_train.copy()
        X_test = X_PCA_test.copy()
    # Dummy variables can induce differences in the schemas
    missing_test = set(X_train.columns) - set(X_test.columns)
    missing_train = set(X_test.columns) - set(X_train.columns)
    for c in missing_test:
        X_test[c] = 0
    for c in missing_train:
        X_train[c] = 0
    # Reorder to match columns order in train and test
    X_test = X_test[X_train.columns]
    # Model fitting on training set
    train_model(reg_model, X_train, y_train)
    # Scoring on test set
    y_pred = reg_model.predict(X_test)
    rmse_scores = rmse(y_test, y_pred)
    r2_scores = r2(y_test, y_pred)
    print("RSME = " + str(rmse_scores) + " | R² = " + str(r2_scores))
    results = {}
    results['RMSE'] = rmse_scores
    results['R2'] = r2_scores
    return results
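
A minimal, self-contained sketch (toy frames, not the project's data) of the column-alignment step used above: one-hot encoding can produce different columns in train and test, so missing columns are zero-filled and the test frame is reindexed to the training column order.

import pandas as pd

toy_train = pd.get_dummies(pd.DataFrame({'shop': ['a', 'b', 'a']}))
toy_test = pd.get_dummies(pd.DataFrame({'shop': ['a', 'c']}))
for col in set(toy_train.columns) - set(toy_test.columns):
    toy_test[col] = 0
for col in set(toy_test.columns) - set(toy_train.columns):
    toy_train[col] = 0
toy_test = toy_test[toy_train.columns]  # same columns, same order as training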
Example #2
def get_pipeline_steps(config):
    """
    Determine whether the full pipeline / a section of it is to be run
    Return a list of parallel steps to run, and boolean variables
    denoting whether the serial steps (batching, relation extraction)
    should be run.
    """
    steps = []
    partial_execution = config.getboolean('General', 'partial_execution')
    if partial_execution:
        start_step = config.getint('General', 'start_step')
        end_step = config.getint('General', 'end_step')
    else:
        start_step = 1
        end_step = 6
    # Run batching and relation extraction steps?
    batching = start_step == 1
    rel_extraction = end_step == 6
    #nel = True if start_step<=4 and end_step>4 else False
    # Parallel pipeline steps (parsing.UnstParser(configmap) was removed from this list)
    parallel_step_list = [
        pre.Preprocessor(configmap),
        ner.Ner(configmap),
        nel.Nel(configmap)
    ]
    parallel_steps = parallel_step_list[max(0, start_step - 2):end_step - 1]
    return parallel_steps, batching, rel_extraction
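
A minimal sketch of the configuration this function reads; the section and option names are inferred from the getboolean/getint calls above, and the values are assumptions for illustration.

import configparser

config = configparser.ConfigParser()
config.read_dict({'General': {'partial_execution': 'yes',
                              'start_step': '2',
                              'end_step': '4'}})
# With start_step=2 and end_step=4, the slice [max(0, 0):3] keeps all three
# parallel stages (Preprocessor, Ner, Nel), while batching (step 1) and
# relation extraction (step 6) are skipped.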
Example #3
    def test_serialization(self):
        """Test serialization and de-serialization code"""

        pre_orig = pp.Preprocessor()
        pre_test = pp.Preprocessor()

        # Populate orig from input file, create new object
        # from serialized JSON
        pre_orig.preprocess("input/bible_characters.txt", 5)
        txt = pre_orig.to_json()
        pre_test.from_json(txt)

        # Check non-numeric attributes
        self.assertEqual(pre_orig.get_max_length(), pre_test.get_max_length())
        self.assertEqual(pre_orig.filename, pre_test.filename)
        self.assertEqual(pre_orig.window, pre_test.window)
        self.assertEqual(pre_orig.get_targets(), pre_test.get_targets())

        #  Check numeric
        self.assertTrue(np.all(np.isclose(pre_orig.x_train, pre_test.x_train)))
        self.assertTrue(np.all(np.isclose(pre_orig.y_train, pre_test.y_train)))
        self.assertTrue(np.all(np.isclose(pre_orig.x_test, pre_test.x_test)))
        self.assertTrue(np.all(np.isclose(pre_orig.y_test, pre_test.y_test)))

        # Check statistics - order
        self.assertTrue(
            np.all(pre_test.statistics.get_second_df().index ==
                   pre_orig.statistics.get_second_df().index))
        self.assertTrue(
            np.all(pre_test.statistics.get_second_df().columns ==
                   pre_orig.statistics.get_second_df().columns))

        # Check statistics - values
        self.assertTrue(
            np.all(
                np.isclose(pre_orig.statistics.get_first_prob(),
                           pre_test.statistics.get_first_prob())))
        for test_letter in pp.LETTERS:
            self.assertTrue(
                np.all(
                    np.isclose(
                        pre_orig.statistics.get_second_prob(test_letter),
                        pre_test.statistics.get_second_prob(test_letter))))
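
A minimal sketch of the round-trip pattern this test exercises, using a hypothetical toy class rather than the project's Preprocessor: numeric arrays are converted to lists for JSON and restored with numpy on load, which is why the comparisons above use np.isclose.

import json
import numpy as np

class ToyPreprocessor:
    def __init__(self, x_train=None):
        self.x_train = x_train

    def to_json(self):
        return json.dumps({'x_train': self.x_train.tolist()})

    def from_json(self, txt):
        self.x_train = np.array(json.loads(txt)['x_train'])

orig = ToyPreprocessor(np.arange(6, dtype=float).reshape(2, 3))
copy = ToyPreprocessor()
copy.from_json(orig.to_json())
assert np.all(np.isclose(orig.x_train, copy.x_train))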
Example #4
File: model.py  Project: megan0529/test
 def __init__(self):
     self.EMBED_HIDDEN_SIZE = 50
     self.SENT_HIDDEN_SIZE = 100
     self.QUERY_HIDDEN_SIZE = 100
     self.BATCH_SIZE = 32
     self.EPOCHS = 5
     
     prep = pp.Preprocessor()    
     (self.x, self.tx, self.xq,
      self.txq, self.y, self.ty) = prep.prepare_cmu_data(
          "../../Data/Question_Answer_Dataset_v1.2/S08")
     self.embedding_matrix = prep.generate_embedding_matrix(prep.word_idx, r"C:\Users\Anand Natu\Desktop\glove.6B")
     self.story_maxlen = prep.story_maxlen
     self.query_maxlen = prep.query_maxlen 
Example #5
def build_index():
    corpus_path = util.get_corpus_dir_path_from_args()
    preprocessor = preprocessing.Preprocessor(corpus_path)
    doc_to_terms: list[preprocessing.DocToTerms] = preprocessor.parse()

    indexer_ob = indexer.Indexer(doc_to_terms)
    inverted_index: dict[str, indexer.Posting] = indexer_ob.inverter_index()
    doc_id_name_index: dict[int, str] = indexer_ob.doc_id_to_doc_name_index()

    tf_idf_ranker = ranker.Ranker(inverted_index, doc_id_name_index)
    _tfidf = tf_idf_ranker.tfidf()

    print('Indexing completed. Saving...')
    util.save_obj(doc_id_name_index, DOC_ID_NAME_INDEX_NAME)
    util.save_obj(inverted_index, INVERTED_INDEX_FILE_NAME)
    util.save_pandas_df_as_pickle(_tfidf, TFIDF_NAME_INDEX_NAME)
    print('Saved index for quick results for future queries')
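
A minimal sketch (toy corpus, plain dicts) of the inverted index the Indexer above produces: each term maps to the ids of the documents containing it, which the ranker then weights with tf-idf.

toy_docs = {0: "to be or not to be", 1: "an index maps terms to documents"}
inverted = {}
for doc_id, text in toy_docs.items():
    for term in text.split():
        inverted.setdefault(term, set()).add(doc_id)
print(inverted["to"])  # {0, 1}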
Example #6
 def __init__(self):
     self.feature_list = []
     self.train_range = None
     self.test_range = None
     self.x_train = None
     self.y_train = None
     self.x_test = None
     self.y_test = None
     self.y_test_pred_prob = None
     self.y_test_pred = None
     self.y_var = "abandoned_y"
     self.has_prob = True
     self.preprocessor = preprocessing.Preprocessor()
     self.subsampled = False
     self.subsample_ratio = 0.0
     self.html_file = None
     self.hdf_file = '/mnt/data/infonavit/master_loan_features/master_loan_features_v4.h5'
Example #7
import numpy as np
import std_models.vgg as vgg
# The original excerpt uses pp and learn without importing them; the module
# names below are assumptions based on how the aliases are used.
import preprocessing as pp
import learn

IMG_SIZE = 150  #224
CATEGORIES = ["Human", "NoHuman"]
DATADIR = "/mnt/HDD/Masterthesis/DB"
PARTIAL_LOAD = 0.1
PARTIAL_LOAD_STR = "01"
COLOR_MODE = pp.Colormode.GRAYSCALE  #pp.Colormode.RGB
PROCESSED_IMG_DIR = "../res/data/p{0}_s{1}_{2}".format(PARTIAL_LOAD_STR,
                                                       str(IMG_SIZE),
                                                       COLOR_MODE.value)

p = pp.Preprocessor(img_size=IMG_SIZE,
                    categories=CATEGORIES,
                    colormode=COLOR_MODE,
                    datadir=DATADIR,
                    data_pattern=pp.DataPattern.X_X_Y_Y)
#x_train, x_test, y_train, y_test = p.run(partial_load=PARTIAL_LOAD)
#p.save(PROCESSED_IMG_DIR, (x_train, x_test, y_train, y_test))
x_train, x_test, y_train, y_test = p.load(PROCESSED_IMG_DIR)

#learner = vgg.VGGAdapter(version=vgg.VGGVersion.VGG_19, input_shape=x_train.shape[1:], output_shape=[0, 1])
#learner.model = learner.load("../res/models/model_002_001.model")
#model.train(x_train, y_train, x_val=None, y_val=None, validation_split=0.2, batch_size=2, epochs=2)
#model.evaluate(x_test, y_test)
#model.save()
model = learn.ImageClassifier(input_shape=x_train.shape[1:],
                              model_path="../res/models/model_001_006.model")
#model.train(x_train, y_train, x_val=None, y_val=None, validation_split=0.2, batch_size=132, epochs=5)
#model.evaluate(x_test, y_test)
Example #8
 def setUp(self):
     """Load bible dataset for testing"""
     self.pre = pp.Preprocessor()
     self.pre.preprocess("input/testing.txt", window=3, shuffle=False)
Example #9
 sys.path.insert(0, unstableparserpath)
 import parsing
 # Get pipeline steps for full / partial execution as specified in config
 steps = get_pipeline_steps(configmap)
 parallel_steps = steps[0]
 batching = steps[1]
 rel_extraction = steps[2]
 # Determine number of cores to use (based on config setting and availability)
 cores = compute_cores(configmap)
 # Batching and sentence segmentation
 homedir = configmap.get('General', 'home')
 batchgroupsfile = homedir + '/' + configmap.get('General',
                                                 'batch_groups_file')
 logging.info('started batching: ' + str(datetime.now()))
 if batching:
     preprocessor = pre.Preprocessor(configmap)
     preprocessor.batch_and_segment()
     # Split batches into groups according to number of cores available for parallelisation
     batchnamesfile = homedir + '/' + configmap.get('General',
                                                    'batches_file')
     batch_groups_list = hf.group_batches_for_parallel_processing(
         batchnamesfile, batchgroupsfile, cores)
 else:
     # Read batch groups from file
     batch_groups_list = hf.read_group_batches(batchgroupsfile)
     print(batch_groups_list)
 # Implement pipeline steps for which parallelisation makes sense
 for step in parallel_steps:
     # Set up a pool of workers
     pool = mp.Pool(processes=cores)
     process_batch_group_with_instance = partial(process_batch_group,
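
The excerpt is cut off while building the worker function. A minimal, self-contained sketch (toy function and data, not the pipeline's) of the multiprocessing pattern it is setting up: bind the fixed arguments with functools.partial, then map the worker over the batch groups with a Pool.

import multiprocessing as mp
from functools import partial

def process_batch_group(step_name, batch_group):
    # Stand-in for the real per-batch-group work.
    return (step_name, len(batch_group))

if __name__ == '__main__':
    worker = partial(process_batch_group, 'ner')
    with mp.Pool(processes=2) as pool:
        print(pool.map(worker, [['b1', 'b2'], ['b3']]))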
Example #10
def cv_kfold(df, df_store, n_splits=10, test_size=42, with_pca=True):
    train = df.copy().iloc[::-1]
    train.Date = pd.to_datetime(train.Date)

    tscv = TimeSeriesSplit(n_splits=n_splits)

    reg_model = Regressor()
    rmse_scores = []
    r2_scores = []

    date_grouping = train.groupby(train.Date)['Store']
    date_list = [g[0] for g in date_grouping]
    for train_index, test_index in tscv.split(date_grouping):
        # Fix the test window to test_size days: extend the training indices
        # up to (last test index - test_size), then drop the overlapping head
        # of the test indices
        train_index = np.append(
            train_index,
            list(range(len(train_index), 1 + int(test_index[-1] - test_size))))
        test_index = test_index[(1 + int(train_index[-1] - test_index[0])):]

        train_dates = [date_list[train_index[0]], date_list[train_index[-1]]]
        test_dates = [date_list[test_index[0]], date_list[test_index[-1]]]
        train_mask = (train.Date >= train_dates[0]) & (train.Date <=
                                                       train_dates[1])
        test_mask = (train.Date >= test_dates[0]) & (train.Date <=
                                                     test_dates[1])

        # Train and test sets
        X_train, y_train, X_PCA_train = pp.Preprocessor().transform(
            df_store, train.loc[train_mask])
        X_test, y_test, X_PCA_test = pp.Preprocessor().transform(
            df_store, train.loc[test_mask])
        if with_pca:
            X_train = X_PCA_train.copy()
            X_test = X_PCA_test.copy()
        # Dummy variables can induce differences in the schemas
        missing_test = set(X_train.columns) - set(X_test.columns)
        missing_train = set(X_test.columns) - set(X_train.columns)
        for c in missing_test:
            X_test[c] = 0
        for c in missing_train:
            X_train[c] = 0
        # Reorder to match columns order in train and test
        X_test = X_test[X_train.columns]

        # Model fitting on training set
        train_model(reg_model, X_train, y_train)

        # Scoring on test set
        y_pred = reg_model.predict(X_test)
        rmse_scores.append(rmse(y_test, y_pred))
        r2_scores.append(r2(y_test, y_pred))

    # Final display
    for i in range(n_splits):
        print("FOLD " + str(i + 1) + ": " + "RSME = " + str(rmse_scores[i]) +
              " | R² = " + str(r2_scores[i]))
    results = {}
    results['RMSE'] = rmse_scores
    results['R2'] = r2_scores
    # Overall scores
    w = [1 + 0.5 * i for i in range(1, n_splits + 1)]
    print("--- OVERALL ---")
    print("RSME = " + '{0:.2f}'.format(np.average(rmse_scores, weights=w)) +
          " | R² = " + '{0:.2f}'.format(np.average(r2_scores, weights=w)))
    return results
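
A minimal sketch of how TimeSeriesSplit produces the expanding train/test index pairs consumed above (toy data standing in for the grouped dates): the training window grows with each fold while the test window always follows it.

import numpy as np
from sklearn.model_selection import TimeSeriesSplit

days = np.arange(10)  # stand-in for the list of distinct dates
for tr_idx, te_idx in TimeSeriesSplit(n_splits=3).split(days):
    print(tr_idx, te_idx)  # e.g. [0 1 2 3] [4 5], then [0..5] [6 7], ...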
Example #11
# Train the model.
import models
import preprocessing
import numpy as np
import joblib


# Preprocess and split the data
pre = preprocessing.Preprocessor()

try:
    mfcc_x, mfcc_y, ft_x, ft_y = joblib.load('dataset/data.joblib')
except FileNotFoundError:
    raise Exception('Please run embedding.py first.')

y = ft_y.reshape((-1))
x = np.array(list(zip(mfcc_x, ft_x)))
xtrain, xtest, ytrain, ytest = pre.preprocess(xydata=[x, y])  # preprocessing

mfcc_xtrain, ft_xtrain = xtrain[:,0], xtrain[:,1]
mfcc_xtest, ft_xtest = xtest[:,0], xtest[:,1]
mfcc_xtrain = mfcc_xtrain.reshape((-1, 30, 100))
mfcc_xtest = mfcc_xtest.reshape((-1, 30, 100))
ft_xtrain = ft_xtrain.reshape((-1, 30, 100))
ft_xtest = ft_xtest.reshape((-1, 30, 100))

# Use class_weight to compensate for the class imbalance in the data
class_weight = [(1 / pre.get_count(ytrain, [1, 0]))*len(ytrain)/2.0,
                (1 / pre.get_count(ytrain, [0, 1]))*len(ytrain)/2.0]
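
A minimal sketch (toy integer labels, not the one-hot targets above) of the same "balanced" weighting idea, weight_k = n_samples / (n_classes * count_k), using scikit-learn's helper for comparison.

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

toy_y = np.array([0, 0, 0, 1])
weights = compute_class_weight(class_weight='balanced',
                               classes=np.unique(toy_y), y=toy_y)
print(weights)  # approx [0.667, 2.0]: the minority class gets the larger weight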

# Define the model
Example #12
def setup_processor():
    macros = load_pokered_macros()
    processor = preprocessing.Preprocessor(config, macros)
    return processor