Example #1
def main():
    experiment_set = final_experiment
    print("There are {} experiments to run".format(len(experiment_set)))
    train_data_path = "data/training.dat"
    dev_data_path = "data/full/dev.dat"
    tst_data_path = "data/full/evaluation.dat"
    feats_path = "data/model.features"
    with open(feats_path) as feats_file:
        num_feats = sum(1 for _ in feats_file)
    batch_size = 80
    runs_per_experiment = 5

    for experiment_name in experiment_set.keys():
        logger.info("Running experiment {}".format(experiment_name))
        exp_features = experiment_set[experiment_name]
        out_path = 'output/experiments_v3/{}'.format(experiment_name)
        makedirs(out_path, exist_ok=True)
        train_instances = load_data(train_data_path, num_feats, exp_features)
        dev_instances = load_data(dev_data_path, num_feats, exp_features)
        dev_eval_instances = load_eval_data(dev_data_path, num_feats,
                                            exp_features)
        tst_instances = load_eval_data(tst_data_path, num_feats, exp_features)
        logger.info("Loaded {} training instances with {} features".format(
            len(train_instances), num_feats))
        for i in range(runs_per_experiment):
            iter_path = out_path + '/v{}'.format(i)
            makedirs(iter_path, exist_ok=True)
            ranker = Ranker(num_feats, 256)
            trainer = RankerTrainer(ranker, batch_size, iter_path)
            trainer.train(train_instances, dev_instances, None,
                          dev_eval_instances, tst_instances)
Example #2
def main(args):
    torch.manual_seed(333)
    if use_cuda:
        torch.cuda.manual_seed(333)
    random.seed(333)
    train_data_path = "data/training.dat"
    train_eval_data_path = "data/train-eval.dat"
    dev_data_path = "data/full/dev.dat"
    eval_data_path = "data/full/evaluation.dat"
    feats_path = "data/model.features"
    with open(feats_path) as feats_file:
        num_feats = sum(1 for _ in feats_file)
    batch_size = 80
    ranker = Ranker(num_feats, 256)
    ## Instances for training - loaded as pairs
    feat_indices = set([i for i in range(num_feats)])
    train_instances = load_data(train_data_path, num_feats, feat_indices)
    train_eval_instances = load_eval_data(train_data_path, num_feats,
                                          feat_indices)
    dev_instances = load_data(dev_data_path, num_feats, feat_indices)
    dev_eval_instances = load_eval_data(dev_data_path, num_feats, feat_indices)
    tst_instances = load_eval_data(eval_data_path, num_feats, feat_indices)
    logger.info("Loaded {} training instances with {} features".format(
        len(train_instances), num_feats))
    trainer = RankerTrainer(ranker, batch_size, 'output/')
    trainer.train(train_instances, dev_instances, train_eval_instances,
                  dev_eval_instances, tst_instances)
    ranker.save('output/ranker.model')
Example #3
def main():
    """
    The main function
    """

    es = Elasticsearch()
    ic = IndicesClient(es)
    dl.create_wikipedia_index(ic)
    dl.load_data(es)

    print("The top ranked title without synonym:", search_and_rank(es))
    add_synonyms_to_index(ic)
    print("The top ranked title with synonym:", search_and_rank(es))
Example #4
def main():
    """
    The main function
    """

    es = Elasticsearch()
    ic = IndicesClient(es)
    dl.create_wikipedia_index(ic)
    dl.load_data(es)
    print(
        f"There are {filter(es)['hits']['total']['value']} documents containing 'lake' or 'tour'"
    )
    print(
        f"There are {search_without_improvement(es)['hits']['total']['value']} documents containing"
        " 'lake' or 'tour', but without the 'improvement required' sentence.")
Example #5
def main(models, dataset_paths, best_grids, model_fitting_parameters):

    trained_model_list = list()
    for model, dset_path, grid, fitting_parameters in zip(
            models, dataset_paths, best_grids, model_fitting_parameters):

        # Load data set.
        dataframe = data_loading.load_data(dset_path,
                                           constants.OUTPUT_DATA_PROC_PATH)
        # Divide into training and test data set.
        x_train, x_test, y_train, y_test = data_loading.train_test_split(
            dataframe)

        # Load and train models.
        trained_model = load_trained_model(model, constants.OUTPUT_MODEL_PATH)
        if trained_model is None:  # if no saved model was loaded, train and save one.
            trained_model = model
            grid.pop('scores')
            trained_model.set_params(**grid)
            trained_model.fit(x_train, y_train, **fitting_parameters)
            save_trained_model(trained_model, constants.OUTPUT_MODEL_PATH)
        trained_model_list.append(trained_model)

        # Report model results.
        report_models_results(x_train, x_test, y_train, y_test, model)
Example #6
def get_trained_model():
    ''' Return trained model (if no pre-trained model, then also train it) '''

    model = tflearn.DNN(build_resnet(),
                        tensorboard_verbose=0,
                        tensorboard_dir='tensorboard')
    if os.path.exists(MODEL_FILE_PATH):
        print('-' * 80)
        print('Pretrained model was found.')
        model.load(os.path.splitext(MODEL_FILE_PATH)[0])
    else:
        print('-' * 80)
        print('Pretrained model was not found. Starting training:')
        images_train, labels_train, images_test, labels_test = (
            data_loading.load_data())

        model.fit(images_train,
                  labels_train,
                  n_epoch=10,
                  validation_set=(images_test, labels_test),
                  snapshot_step=100,
                  show_metric=True,
                  run_id='convnet_hand_recognition')
        model.save(os.path.splitext(MODEL_FILE_PATH)[0])
    return model
Example #7
def main():
    main_params = load_main_params()
    loop_features, loop_targets, loop_info, feature_names = load_data(main_params["read_data_prefixes"], False)

    # pre-processing data analysis
    analyze_data(loop_features.copy(), loop_targets.copy(), feature_names.copy(), main_params["data_analysis"],
                 True, None)

    if main_params["create_models"]["regression"]["enabled"]:
        create_regression_models(loop_features.copy(), loop_targets.copy(), feature_names.copy(), main_params)
    if main_params["create_models"]["classification"]["enabled"]:
        create_classification_models(loop_features.copy(), loop_targets.copy(), feature_names.copy(), main_params)

    if main_params["make_prediction"]["enabled"]:
        loop_features, loop_targets, loop_info, feature_names = load_data(main_params["read_data_prefixes"], True)
        make_prediction_from_model(loop_features.copy(), loop_targets.copy(), main_params["make_prediction"],
                                   loop_info.copy())

    exit(0)
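
For orientation, here is a hypothetical shape for main_params, inferred only from the keys the function accesses above (the real load_main_params() source is not part of this snippet, and the placeholder values are illustrative):

main_params = {
    "read_data_prefixes": [...],           # prefixes handed to load_data
    "data_analysis": {...},                # options forwarded to analyze_data
    "create_models": {
        "regression": {"enabled": True},
        "classification": {"enabled": True},
    },
    "make_prediction": {"enabled": True},  # plus whatever make_prediction_from_model reads
}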
Example #8
def get_model_and_df_grid_combinations(models, grids):
    all_grids_results = list()
    # Iterate over data sets.
    for path in data_processing.PROCESSED_DATASETS_PATH:
        df = data_loading.load_data(path, constants.OUTPUT_DATA_PROC_PATH)
        pgo = sku.grid_search.PersistentGrid.load_from_path(
            persistent_grid_path=constants.PERSITENT_GRID_PATH,
            dataset_path=path)
        # Iterate over models.
        for grid, model in zip(grids, models):
            best_grid = get_best_grid(df, model, grid, pgo).copy()
            best_grid['model'] = sku.get_estimator_name(model)
            best_grid['path'] = path
            all_grids_results.append(best_grid.copy())
    return all_grids_results
Example #9
def search_results(user_dep,
                   user_dest,
                   user_time_dep,
                   user_passengers,
                   postgres=False,
                   redis=False):
    redis_db = None
    pg_conn = None

    if redis:
        redis_db = StrictRedis(socket_connect_timeout=3, **redis_config)

    elif postgres:
        pg_conn = psycopg2.connect(**pg_config)

    return load_data(user_dep, user_dest, user_time_dep, user_passengers,
                     redis_db, pg_conn)
Example #10
    def __init__(self, dataset, supergroups, size, epochs, learning_rate):
        self.dataset = dataset
        self.data_name = dataset
        self.supergroups = supergroups
        self.size = size
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        print(self.device)
        self.model = Model(self.dataset,
                           self.size,
                           supergroups=self.supergroups)
        self.model.to(self.device)
        #self.model.load_state_dict(torch.load("./agnews_40,0.0001.pt", map_location=torch.device('cpu')))

        if self.supergroups:
            trans = 'supergroups'
        else:
            trans = None
        self.train_loader, self.test_loader = load_data(dataset=self.dataset,
                                                        transformation=trans)
Example #11
def main():
    # Set the random seed for the entire project.
    du.set_random_seed(0)
    # Rationale: ensure reproducibility of the results.

    # Flush previous runs.
    constants.flush_project_results(constants.TMP_PATH, constants.OUTPUT_PATH)
    # Rationale: provide a clean state for the project to run and enforce
    # reproducibility of the results.

    # Load, save and split data.
    dataframe = data_loading.load_data(constants.DATA_PATH)
    data_loading.save_data(dataframe, constants.TMP_PATH)
    x_train, x_test, y_train, y_test = data_loading.train_test_split(dataframe)
    # Rationale: *Loading*: load data in the main module and pass it as the
    # first argument to every other defined function (that relates to the data
    # set), thus saving precious time on data loading. *Saving*: for big data
    # sets, saving the dataset in a fast-read format (such as HDF5) saves time.

    # Load and combine data processing pipelines.
    # TODO:
    data_processing_pipelines = None

    # Perform exploratory data analyses.
    data_exploration.main(dataframe)
    # Rationale: conduct exploratory data analyses.

    # Perform grid search.
    persistent_grid_object = sku.grid_search.PersistentGrid.load_from_path(
        persistent_grid_path=constants.PERSITENT_GRID_PATH,
        dataset_path=constants.DATA_PATH)
    # Iteration over processed data sets may occur here since they are model
    # dependent.
    grid_search.main(dataframe, constants.MODELS, data_processing_pipelines,
                     constants.GRIDS, persistent_grid_object)
    best_grids = grid_search.get_best_grids(  # noqa
        constants.MODELS, data_processing_pipelines, persistent_grid_object)
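
The comment above about saving big data sets in a fast-read format such as HDF5 can be illustrated with a minimal pandas sketch (the file name and key are made up, and pandas' HDF5 support needs the optional PyTables package):

import pandas as pd

# A tiny throwaway frame, just to show the round trip.
df = pd.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})

# Save once in HDF5 so later runs can reload it quickly.
df.to_hdf("dataset.h5", key="df", mode="w")

# Subsequent runs read the binary file instead of re-parsing the raw data.
df_cached = pd.read_hdf("dataset.h5", key="df")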
Example #12

DATA_PATH = './szwagropol_data/transactions.txt'  # variable holding the path to the data file. Writing the name
                                                  # entirely in capital letters is a widely used convention for
                                                  # marking constants (variables whose value should not change
                                                  # while the program is running)
TRANSACTION_TIME_INDEX = 0
CUSTOMER_INDEX = 1
PRODUCT_NAME_INDEX = 2
CATEGORY_NAME_INDEX = 3
QUANTITY_INDEX = 4
UNIT_PRICE_INDEX = 5
TOTAL_VALUE_INDEX = 6  # constants recording the index at which each piece of information is stored
                       # in the data rows - which also makes the code easier to read

columns, rows = load_data(DATA_PATH)  # load the list of columns and the list of rows from the file, using the
                                      # previously written and imported load_data function
columns.append('total_transaction_values')  # add an extra column that will hold the total value of each row.
                                            # a row initially contains the number of units bought and the price
                                            # per unit; storing their product in an extra column means we avoid
                                            # multiplying the same values over and over again
for row in rows:  # for every loaded row
    total = row[QUANTITY_INDEX] * row[UNIT_PRICE_INDEX]  # compute the total value of the transaction: units * price per unit...
    row.append(total)  # ...and append the computed result at the end of the row


def calculate_total_revenue(rows):  # function computing the total revenue over the given transaction rows
    total_revenue = 0  # initialize the variable that will hold the total revenue
    for row in rows:  # iterate over every row that was passed in
        total_revenue += row[TOTAL_VALUE_INDEX]  # add the value of the transaction we are currently
                                                 # iterating over to the running total
    return total_revenue  # return the accumulated total revenue
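
A minimal self-contained sketch of the same pattern, with two made-up transaction rows standing in for the file read by load_data:

QUANTITY_INDEX = 0
UNIT_PRICE_INDEX = 1
TOTAL_VALUE_INDEX = 2

rows = [[3, 2.50], [1, 10.00]]  # illustrative rows: [quantity, unit price]

# Precompute and append the total so it is never recomputed later.
for row in rows:
    row.append(row[QUANTITY_INDEX] * row[UNIT_PRICE_INDEX])

# Summing the per-row totals gives the overall revenue, exactly as
# calculate_total_revenue does.
print(sum(row[TOTAL_VALUE_INDEX] for row in rows))  # 17.5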
Example #13
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.backend import clear_session
from sklearn.model_selection import KFold

import numpy as np

from model import create_model, create_encoder
from data_loading import load_data, clean_sentence
from plot_results import HistoriesStorage, plot_model_histories
from hyperparameters import *

tweets_data = load_data('Data_tweets.csv')
tweets_data = tweets_data[[1, 6]]
tweets_data.columns = ['Class', 'Tweet']


# Remap raw classes to contiguous indexes (one-hot encoding happens below)
def substitute_classes(x):
    """Maps classes (0,2,4) to indexes (0,1,2)"""
    sub_dict = {0: 0, 2: 1, 4: 2}
    return sub_dict[x]


# Clean tweets
inputs = np.array(tweets_data['Tweet'].apply(lambda x: clean_sentence(x)))
# One-hot-encode classes
targets = tweets_data["Class"].apply(lambda x: substitute_classes(x))
targets = to_categorical(targets, num_classes=3)

# Fit encoder on input data - create vocabulary of NUM_WORDS most frequent words
encoder = create_encoder(inputs)
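
For reference, a short illustration of the label handling above; the raw class values 0, 2 and 4 come from the substitute_classes mapping in the snippet:

from tensorflow.keras.utils import to_categorical

raw_labels = [0, 2, 4, 4]                               # original class values
indexed = [{0: 0, 2: 1, 4: 2}[x] for x in raw_labels]   # -> [0, 1, 2, 2]
one_hot = to_categorical(indexed, num_classes=3)
# one_hot is a (4, 3) array: [[1,0,0], [0,1,0], [0,0,1], [0,0,1]]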
Example #14
def main():
    # Filter warnings that pollute the project stdout.
    filter_warnings()
    # Rationale: produce cleaner results.

    # Set the random seed for the entire project.
    du.set_random_seed(0)
    # Rationale: ensure reproducibility of the results.

    # Flush previous runs.
    # constants.flush_project_results(constants.TMP_PATH,
    #                                 constants.OUTPUT_PATH)
    # Rationale: provide a clean state for the project to run and enforce
    # reproducibility of the results.

    # Download, load and save data.
    data_loading.main()
    dataframe = data_loading.load_data(constants.DATASET_PATH,
                                       constants.TMP_PATH)
    data_loading.save_data(dataframe, constants.TMP_PATH,
                           constants.DATASET_PATH)
    # Rationale: *Loading*: load data in the main module and pass it as the
    # first argument to every other defined function (that relates to the data
    # set), thus saving precious time on data loading. *Saving*: for big data
    # sets, saving the dataset in a fast-read format (such as HDF5) saves time.

    # Load and combine data processing pipelines.
    data_processing.main(dataframe, nan_strategy='drop')
    # Rationale: prepare data to be fed into the models.
    # Different algorithms make use of different data structures. For instance
    # XGBoost allows for nans. Data transformations usually don't.

    # Perform exploratory data analyses.
    data_exploration.main(dataframe)
    # Rationale: conduct exploratory data analyses.

    # Data split.
    # Removed.
    # Rationale: module 'models' should execute this.

    # Perform grid search.
    # Iteration over processed data sets may occur here since they are model
    # dependent.
    grid_search.main(constants.MODELS, constants.GRIDS)
    best_combination_of_datasets_and_grids = (
        grid_search.dict_of_best_datasets_and_grids(constants.MODELS,
                                                    constants.GRIDS))
    best_datasets = best_combination_of_datasets_and_grids['best_datasets']
    best_grids = best_combination_of_datasets_and_grids['best_grids']
    # Rationale: perform grid search as part of machine learning best
    # practices.

    # Summary of what was executed so far:
    # 1) Setting of the random seed for reproducibility.
    # 2) Flushing of intermediate results for a clean run.
    # 3) Data loading and data saving.
    # 4) Conduction of exploratory data analyses.
    # 5) Grid search of best model hyperparameters.
    # To conclude our project we need the grand finale: model selection and
    # evaluation/comparison.
    models.main(constants.MODELS, best_datasets, best_grids,
                constants.MODEL_FITTING_PARAMETERS)
Example #15
#All file strings corresponding to BOLD data for subject 4 

files = ['task001_run001.bold_dico.nii', 'task001_run002.bold_dico.nii', 
         'task001_run003.bold_dico.nii', 'task001_run004.bold_dico.nii', 
         'task001_run005.bold_dico.nii', 'task001_run006.bold.nii',
         'task001_run007.bold.nii', 'task001_run008.bold.nii']

#
# Load the images as an image object
# Load all the image data from the images
# Drop the first four volumes, as we know these are outliers
#

all_data = []
for index, filename in enumerate(files):
    new_data = dl.load_data(filename)  # load_data drops the first 4 volumes for us
    num_vols = new_data.shape[-1]
    if index != 0 and index != 7:
        new_num_vols = num_vols - 4
        new_data = new_data[:, :, :, :new_num_vols]  # drop the last 4 volumes for the middle runs
    all_data.append(new_data)


# * Get indices of outlier volumes for each dataset. 
# * Write each as its own file and save in 'vol_std_outliers' folder 
# * Takes 15 min to run

all_bands_outliers = []
all_sdevs = []
all_iqr_outliers = []
for data in all_data:
Example #16
import sys
import logging
from collections import defaultdict

from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.impute import SimpleImputer

from anoflows.hpo import find_best_flows

from data_loading import load_data

logging.getLogger().setLevel(logging.INFO)

if len(sys.argv) == 1:
    logging.error("YAML data specification missing from the command line arguments")
    exit(1)

spec_file = sys.argv[1]
df, spec = load_data(spec_file)
max_rows = min(len(df), spec.get("max_rows", 40000))
novelty_detection = spec.get("novelty", True)
normal_classes = spec["normal_classes"]

precision = defaultdict(list)

for rounds in range(spec.get("rounds", 1)):
    # random sampling
    df = df.sample(n=max_rows, replace=False)
    label_col = spec["label_column"]
    y = df[label_col].values
    other = df.drop(label_col, inplace=False, axis=1)
    X = other.values

    # imputing
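
The snippet breaks off at the imputation step; the SimpleImputer imported above is typically used along these lines (a self-contained sketch with dummy data, not the original code):

import numpy as np
from sklearn.impute import SimpleImputer

X_demo = np.array([[1.0, np.nan], [2.0, 3.0], [np.nan, 5.0]])  # dummy data with gaps
X_demo = SimpleImputer(strategy="mean").fit_transform(X_demo)  # NaNs -> column means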
Example #17
import torch
import torch.nn as nn
import torch.optim as optim
from model import Model
from data_loading import load_data

data = input('Type Dataset Choice, AGNews or 20Newsground:  ')
size = input('Pick model size, big or small:  ')

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#device=torch.device("cpu")
print('device used: ', device)

model = Model(data, size)
model.to(device)

#define train/test here
train_loader, test_loader = load_data(dataset=data)

loss_crit = nn.CrossEntropyLoss()
#optimizer=optim.SGD(model.parameters(),lr=0.001,momentum=0.9)
optimizer = optim.Adam(model.parameters(), lr=0.0001)

print("Starting training...")
for epoch in range(5):

    running_loss = 0.0

    for i, data in enumerate(train_loader, 0):
        inputs, labels = data
        inputs, labels = torch.tensor(inputs), torch.tensor(labels)
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
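
The loop is truncated right after optimizer.zero_grad(); the usual remainder of a PyTorch training step looks roughly as follows (a self-contained sketch with a dummy model and data, not the original code):

import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(4, 3)                      # stand-in for the Model above
loss_crit = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

inputs = torch.randn(8, 4)                   # dummy batch of features
labels = torch.randint(0, 3, (8,))           # dummy class labels

optimizer.zero_grad()
outputs = model(inputs)                      # forward pass
loss = loss_crit(outputs, labels)            # cross-entropy loss
loss.backward()                              # backpropagation
optimizer.step()                             # parameter update
running_loss = loss.item()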
Example #18
    tsne_df = _tsne(norm_df, 3)
    joined_df = pd.concat((norm_df, tsne_df, y),
                          axis=1)
    assert norm_df.shape[0] == tsne_df.shape[0] == joined_df.shape[0]
    data_loading.save_data(joined_df, constants.OUTPUT_DATA_PROC_PATH,
                           DATA_TSNE3)


def no_transform(dataframe):
    if data_loading.dataframe_already_exists(constants.OUTPUT_DATA_PROC_PATH,
                                             DATA_VANILLA):
        return None
    data_loading.save_data(dataframe, constants.OUTPUT_DATA_PROC_PATH,
                           DATA_VANILLA)


def main(dataframe, nan_strategy='drop'):
    df = _process_nan(dataframe, how=nan_strategy)
    x = df[data_loading.get_x_columns(df)]
    y = df[constants.Y_COLUMN]
    norm_pca2(x, y)
    norm_pca3(x, y)
    norm_tsne2(x, y)
    norm_tsne3(x, y)
    no_transform(df)


if __name__ == '__main__':
    dataframe = data_loading.load_data(constants.DATASET_PATH)
    main(dataframe)
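
The _tsne helper is not shown in this snippet; a plausible scikit-learn-based version might look like the following (an assumption-heavy sketch, not the project's actual implementation):

import pandas as pd
from sklearn.manifold import TSNE

def _tsne_sketch(norm_df, n_components):
    # Embed the normalized features into n_components dimensions.
    embedding = TSNE(n_components=n_components).fit_transform(norm_df.values)
    columns = ['tsne{}'.format(i) for i in range(n_components)]
    return pd.DataFrame(embedding, columns=columns, index=norm_df.index)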