Example #1
    def test_read_text(self):
        """
        Test to check that the read_text function
        returns a list of words given a .txt file.
        """
        dr1 = DataReader()
        dr2 = DataReader(punctuation=True)
        words1 = dr1.read_text()
        words2 = dr2.read_text()
        print("\nReading time = {}\n".format(get_time(dr1.read_text)))

        self.assertTrue(len(words1) > 0)
        self.assertTrue(len(words2) > 0)
        self.assertEqual(words1[22], "System")
        self.assertEqual(words2[22], "System.")
Example #2
    def __init__(self, debug_mode=0):
        Tk.__init__(self)
        self.engine = None
        self.language = None
        self.width = 0
        self.height = 0
        self.resolution_code = None
        self.is_full_screen = IntVar()
        self.screen_ratio = None
        self.resolution_list = []
        self.debug_mode = debug_mode
        if self.debug_mode:
            basicConfig(level=DEBUG)
            pil_logger = getLogger("PIL.PngImagePlugin")
            pil_logger.level = WARNING
        self.data_reader = DataReader(self)
        self._process_config()
        self.card_texts = {}
        self.ui_text_variables = {}
        self._load_text_variables()
        self.save_handler = SaveHandler(self)
        self.is_game_setup_in_progress = IntVar(value=0)
        self.is_game_in_progress = IntVar(value=0)
        self.is_turn_in_progress = IntVar(value=1)
        self._render_panes()
        self.is_game_in_progress.trace('w', self._follow_game_progress_change)
        self.is_turn_in_progress.trace('w', self._follow_turn_progress_change)
        self.players = {}
        self._text_placer()
        self.protocol("WM_DELETE_WINDOW", self.shutdown_ttk_repeat_fix)
        self.exit_in_progress = False
Example #3
    def test(self, test_info, path_to_model):
        """Test given model with task, path to the model and model datareder names."""

        # 1. Load trained model and set it to eval mode
        Model = ModelCT()
        Model.load_state_dict(torch.load(path_to_model))
        Model.eval()
        Model.cpu()

        # 2. Create dataloader
        test_datareader = DataReader(self.main_path_to_data, test_info)
        test_generator = DataLoader(test_datareader,
                                    batch_size=10,
                                    shuffle=False,
                                    pin_memory=True,
                                    num_workers=2)

        # 3. Calculate metrics
        predictions = []
        trues = []

        for item_test in test_generator:
            prediction = Model.predict(item_test, is_prob=True)
            predictions.append(np.mean(prediction.cpu().numpy()))
            trues.append(item_test[1].numpy()[0])

        auc = roc_auc_score(trues, predictions)
        fpr, tpr, thresholds = roc_curve(trues, predictions, pos_label=1)
        return auc, fpr, tpr, thresholds, trues, predictions
Example #4
class MyWindow(Gtk.ApplicationWindow):

    datareader = DataReader()

    def __init__(self, app):
        Gtk.Window.__init__(self, application=app)
        self.set_default_size(800, 600)

        self.builder = Gtk.Builder()
        self.builder.add_from_file("main.glade")

        tesla = self.datareader.get_stock_data("TSLA")
        chart1 = PriceChart(tesla)

        priceChartBox = self.builder.get_object("PriceChart")
        priceChartBox.add(chart1.canvas)

        apple = self.datareader.get_stock_data("AAPL")
        chart2 = PriceChart(apple)

        priceChartBox = self.builder.get_object("PriceChart2")
        priceChartBox.add(chart2.canvas)

        window = self.builder.get_object("MainWindow")
        window.show_all()
Example #5
def read_input_data(file_name):
    dr = DataReader(file_name)
    texts, scores = dr.read_data()
    tk = Tokenizer()
    tk.fit_on_texts(texts)
    x = tk.texts_to_matrix(texts, mode='tfidf')
    x = utils.matrix_to_input(x)
    y = utils.scores_to_categorical(scores)
    return x, y
Example #6
def process_text_data(file_path, vocab_size):
    """
    This function is responsible for preprocessing the text data we will use to
    train our model. It will perform the following steps:

    * Create a word array for the file we have received. For example, if our
      text is:

        'I want to learn wordvec to do cool stuff'

      It will produce the following array:

        ['I', 'want', 'to', 'learn', 'wordvec', 'to', 'do', 'cool', 'stuff']

    * Create the frequency count for every word in our array:

       [('I', 1), ('want', 1), ('to', 2), ('learn', 1), ('wordvec', 1),
        ('do', 1), ('cool', 1), ('stuff', 1)]

    * With the count array, we choose as our vocabulary the words with the
      highest count. The number of words will be decided by the variable
      vocab_size.

    * After that we will create a dictionary to map a word to an index and an
      index to a word:

      index2word: {0: 'I', 1: 'want', 2: 'to', 3: 'learn', 4: 'wordvec',
                   5: 'do', 6: 'cool', 7: 'stuff'}
      word2index: {'I': 0, 'want': 1, 'to': 2, 'learn': 3, 'wordvec': 4,
                   'do': 5, 'cool': 6, 'stuff': 7}

      Both of these dictionaries are based on the words provided by the count
      array.

    * Finally, we will transform the words array to a number array, using the
      word2index dictionary.

      Therefore, our words array:

      ['I', 'want', 'to', 'learn', 'wordvec', 'to', 'do', 'cool', 'stuff']

      Will be translated to:

      [0, 1, 2, 3, 4, 2, 5, 6, 7]

      If a word is not present in the word2index dictionary, it will be considered an
      unknown word. Every unknown word will be mapped to the same index.
    """
    my_data = DataReader(file_path)
    my_data.process_data(vocab_size)
    return my_data
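The body of DataReader.process_data is not shown on this page, so the following is only a minimal, self-contained sketch of the steps the docstring describes: build the word array, count frequencies, keep the most frequent words up to vocab_size, build the word2index/index2word dictionaries, and translate the words into indices with a shared unknown-word index. The helper name build_vocab and the UNK token are illustrative, not part of the library.

from collections import Counter

def build_vocab(words, vocab_size, unk_token="UNK"):
    # Keep the (vocab_size - 1) most frequent words; one index is reserved
    # for the shared unknown-word token.
    most_common = Counter(words).most_common(vocab_size - 1)
    index2word = {0: unk_token}
    word2index = {unk_token: 0}
    for index, (word, _count) in enumerate(most_common, start=1):
        index2word[index] = word
        word2index[word] = index
    # Translate the word array into an index array; every word that is not
    # in word2index is mapped to the same UNK index.
    data = [word2index.get(word, word2index[unk_token]) for word in words]
    return data, word2index, index2word

words = "I want to learn wordvec to do cool stuff".split()
data, word2index, index2word = build_vocab(words, vocab_size=50)
print(data)  # index assignment follows word frequency, so the exact numbers
             # may differ from the illustration in the docstring above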
Example #7
def show_measurement():
    if request.method == 'POST':
        request_json = request.get_json()
        user_id = request_json['userId']
        type = request_json['type']
        start_date = request_json['beginDate']
        end_date = request_json['endDate']
        r = DataReader()
        start = datetime.strptime(start_date, '%d.%m.%Y')
        end = datetime.strptime(end_date, '%d.%m.%Y')
        response = make_response(
            json.dumps(r.heart_rate_special(user_id, start, end)))
        response.headers['Content-Type'] = 'application/json; charset=utf-8'
        return response
Example #8
def correlations():
    if request.method == 'POST':
        request_json = request.get_json()
        user_id = request_json['userId']
        x_label = request_json['xAxis']
        y_label = request_json['yAxis']
        next_day = request_json['nextDay']
        cr = DataReader()
        response = make_response(
            json.dumps(
                cr.read_correlation_data(user_id, x_label, y_label,
                                         bool(next_day))))
        response.headers['Content-Type'] = 'application/json'
        return response
Example #9
def sleep_data():
    if request.method == 'POST':
        request_json = request.get_json()
        user_id = request_json['userId']
        start_date = request_json['beginDate']
        end_date = request_json['endDate']
        gaussian_settings = request_json['gaussianSettings']
        r = DataReader()
        start = datetime.strptime(start_date, '%d.%m.%Y')
        end = datetime.strptime(end_date, '%d.%m.%Y')
        if gaussian_settings:
            sleep_data = r.read_sleep_data(user_id, start, end)
            average_list = []
            var_list = []
            for data in sleep_data:
                average_list.append(data['x'])
                var_list.append(data['y'])
            if len(average_list) > 1 and len(var_list) > 1:
                mean_duration = mean(average_list)
                variance_duration = variance(average_list)
                response = make_response(
                    json.dumps([{
                        'user_id': user_id,
                        'avg': mean_duration,
                        'std': math.sqrt(variance_duration)
                    }]))
                response.headers[
                    'Content-Type'] = 'application/json; charset=utf-8'
                return response
            else:
                response = make_response(
                    json.dumps([{
                        'user_id': user_id,
                        'avg': -1000,
                        'std': 1
                    }]))
                response.headers[
                    'Content-Type'] = 'application/json; charset=utf-8'
                return response
        else:
            response = make_response(
                json.dumps(r.read_sleep_data(user_id, start, end)))
            response.headers[
                'Content-Type'] = 'application/json; charset=utf-8'
            return response
Example #10
def read_data(data_path,
              word_word2index,
              char_word2index,
              label_word2index,
              label_type,
              label_bucket,
              max_size=None,
              normalize_digits=True,
              use_lm=False,
              use_elmo=False):
    _buckets = label_bucket[label_type]
    max_length = 0
    data = [[] for _ in _buckets]
    max_char_length = [0 for _ in _buckets]
    print('Reading data from %s' % data_path)
    counter = 0
    reader = DataReader(data_path, word_word2index, char_word2index,
                        label_word2index, use_elmo)
    inst = reader.get_next(normalize_digits)
    while inst is not None and (not max_size or counter < max_size):
        max_length = max(max_length, inst[6])
        counter += 1
        if counter % 10000 == 0:
            print("reading data: %d" % counter)
        inst_size = len(inst[0])
        for bucket_id, bucket_size in enumerate(_buckets):
            if inst_size < bucket_size:
                if use_elmo:
                    words = inst[0]
                else:
                    words = inst[1]
                if use_lm:
                    data[bucket_id].append(
                        [words, inst[3], inst[5], inst[7], inst[8]])
                else:
                    data[bucket_id].append([words, inst[3], inst[5]])
                max_len = max([len(char_seq) for char_seq in inst[2]])
                if max_char_length[bucket_id] < max_len:
                    max_char_length[bucket_id] = max_len
                break
        inst = reader.get_next(normalize_digits)
    reader.close()
    print("Total number of data: %d" % counter)
    print("Max length: %d" % max_length)
    return data, max_char_length
Example #11
def main(start):
    file_count = len(CATEGORY[start:]) * len(YEAR) * len(OUTLET)
    pbar = ProgressBar(max_value=file_count, redirect_stdout=True)
    progress = INITIAL_PROGRESS

    for cat in CATEGORY[start:]:

        # instance of DataReader that will retain all necessary data for categorical manipulation
        by_category = DataReader()

        for time in YEAR:
            by_category.create_reference(cat, time)
            by_category.create_zeroes(cat, time)

            for store in OUTLET:

                # import relevant data for the given category, year, and store
                by_category.store_data(cat, time, store)

                # append individual occ_data to by_category.occ_list
                occurrence(by_category)

                # update by_category.sales_data with sales data
                sales(by_category)

                # update by_category.unitp_data with units data
                units(by_category)

                # append total purchase data (panel) to by_category.panel_list
                panels(by_category)

                # at the very end, update progress bar
                pbar, progress = update_pbar(pbar, progress)

            # append completed sales data to by_category.sales_list
            by_category.sales_list.append(by_category.sales_data)

            # append completed units data to by_category.units_list
            by_category.units_list.append(by_category.units_data)

        # concat all DataFrames and export the final product
        final_product(by_category, cat)

    return SUCCESS_CODE
Example #12
    def test_run_training(self):
        """
        Test to check that run_training trains a SkipGramModel
        on a small corpus within the expected duration and loss bounds.
        """
        my_data = DataReader(get_path_basic_corpus())
        my_vocab_size = 500
        my_data.process_data(my_vocab_size)
        my_config = wv.Config(num_steps=200,
                              vocab_size=my_vocab_size,
                              show_step=2)

        my_model = wv.SkipGramModel(my_config)
        duration, loss = wv.run_training(my_model,
                                         my_data,
                                         verbose=False,
                                         visualization=False,
                                         debug=True)
        self.assertTrue(duration <= 1.7)
        self.assertTrue(loss < 7)
Example #13
def main():
    config = Config(CONFIG_FILE)

    # Retrieve data
    data = DataReader(config)

    train_data = data.getVOC07TrainData(shuffle=config.shuffle)
    test_data = data.getVOC07TestData()

    # TODO: Preprocess/format data for training
    X_train = train_data
    y_train = train_data

    X_test = test_data

    # Train model
    model = SsdModel(config.n_classes)
    model.train(X_train, y_train)
    model.test(X_test)
    showSampleData(train_data)
Example #14
    def __init__(self,
                 dataset_name: str,
                 field: str,  # TODO
                 rewards: Dict[str, int],
               ):
        print(f'<SimpleHiEnv>: data set:{dataset_name},field:{field}.')
        # TODO!
        self.field = field  # necessary?

        # init DataReader, and then init sets
        self.reader = DataReader(dataset_name, field)
        self.seeds = self.reader.get_original_seeds()
        self.gt = self.reader.get_gt_set()
        self.current_entity_set = self.seeds.copy()
        self.if_continue = True
        self.candidate_list = []

        # init CGExpan
        device = torch.device("cuda:0")
        self.cgexpan = CGExpan(device, self.reader)  # TODO

        self.rewards = rewards
        print('<SimpleHiEnv>: Env is ready!')
Example #15
    # Parameters of the model
    # You can change any of the parameters and expect the network to run without error
    num_layers = 1
    bidirectional = False
    hidden_dim = 512  # 512 worked best so far
    batch_size = 128
    learning_rate = 0.001  # 0.05 results in nan for GRU

    # X, Y = generate_data(batch_size * 8, 1)
    # X, Y = generate_multi_attr_data(batch_size * 64, 1)

    # Worst convergence when using Plain encoding.
    # Sine encoding: losses converge only up to a certain context.
    fname = "data/EKPC_daily.csv"
    datareader = DataReader(fname,
                            encoding='Plain',
                            batch_size=batch_size,
                            sample_size=-1)
    X, Y = datareader.get_data()

    input_dim = len(X[0])
    print("Input dim : ", input_dim)
    trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.25)
    del X, Y

    # For auto-encoder, the input is also the output.
    train_dataset = RegressionDataset(inputs=trainY, labels=trainY)
    test_dataset = RegressionDataset(inputs=testY, labels=testY)

    train_iter = torch.utils.data.DataLoader(train_dataset,
                                             batch_size=batch_size,
                                             shuffle=False,
Example #16
    def setUpClass(cls):
        cls.dr = DataReader()
        cls.words = cls.dr.read_text()
Example #17
def Eval(data, MLP):
    num_correct = 0
    total = 0
    LL = []
    PL = []
    for i, batch in enumerate(data.dev_b_iter):
        (feature, batch_length), label = batch.sentence, batch.label
        Probs = MLP.forward(feature, batch_length, train=False)
        Probs = Probs.view(label.shape[0], -1)
        # print(Probs.shape)
        _, prediction = torch.max(Probs, 1)
        LL = LL + label.tolist()
        PL = PL + prediction.tolist()
        num_correct += (prediction == label).sum()
        total += len(label)
    # F1Score = metrics.f1_score(LL, PL, average='weighted')
    # print(LL)
    # print(PL)
    F1Score = metrics.f1_score(LL, PL)
    print("Accuracy: %s" % (float(num_correct) / float(total)))
    print('F1 score: %s' % F1Score)


if __name__ == "__main__":
    data = DataReader(data_path=data_path,
                      batch_size=batch_size,
                      device=device,
                      embedding=embedding)
    MLP = Train(data, Train_iter, batch_size)
    Eval(data, MLP)
Example #18
currentdir = os.path.dirname(
    os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)

from datareader import DataReader
import word2vec as wv
import util

file_path = os.path.join(parentdir, "data")
file_path = os.path.join(file_path, "Wiki.txt")
eval_path = os.path.join(parentdir, "evaluation")
eval_path = os.path.join(eval_path, "questions-words-ptbr.txt")

my_data = DataReader(file_path)
my_data.get_data()
word2index = my_data.word2index
index2word = my_data.index2word

BATCH_SIZE = np.array(range(1, 17)) * 10
number_of_exp = len(BATCH_SIZE)
results = []
info = []

for i, bs in enumerate(BATCH_SIZE):
    print("\n ({0} of {1})".format(i + 1, number_of_exp))
    config = wv.Config(batch_size=bs)
    attrs = vars(config)
    config_info = ["%s: %s" % item for item in attrs.items()]
    info.append(config_info)
Example #19
    def train(self, train_info, valid_info, hyperparameters):
        """ Train the model. """

        # 1. Create folders to save the model
        timedate_info = str(datetime.datetime.now()).split(' ')[0] + '_' + str(
            datetime.datetime.now().strftime("%H:%M:%S")).replace(':', '-')
        path_to_model = 'trained_models/' + self.unique_name + '_' + timedate_info
        os.mkdir(path_to_model)

        # 2. Use cudnn as backend for speed up
        # torch.backends.cudnn.benchmark = True  # if running on CUDA

        learning_rate = hyperparameters['learning_rate']
        weight_decay = hyperparameters['weight_decay']
        total_epoch = hyperparameters['total_epoch']
        multiplicator = hyperparameters['multiplicator']

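        # 3. Compute the positive-class weight for the loss from the training label distribution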
        negative, positive = 0, 0
        for _, label in train_info:
            if label == 0:
                negative += 1
            elif label == 1:
                positive += 1

        pos_weight = torch.Tensor([(negative / positive)]).cpu()

        # 4. Create train and validation generators; use a mask and batch_size = 10 for the validation generator
        train_datareader = DataReader(self.main_path_to_data, train_info)
        train_generator = DataLoader(train_datareader,
                                     batch_size=16,
                                     shuffle=True,
                                     pin_memory=True,
                                     num_workers=2)

        valid_datareader = DataReader(self.main_path_to_data, valid_info)
        valid_generator = DataLoader(valid_datareader,
                                     batch_size=10,
                                     shuffle=False,
                                     pin_memory=True,
                                     num_workers=2)

        # 5. Prepare model
        Model = ModelCT()
        Model.cpu()

        # 6. Define criterion function, optimizer and scheduler
        criterion_clf = torch.nn.BCEWithLogitsLoss(pos_weight)
        optimizer = torch.optim.Adam(Model.parameters(),
                                     lr=learning_rate,
                                     weight_decay=weight_decay)
        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,
                                                           multiplicator,
                                                           last_epoch=-1)

        # 7. Run training
        aucs = []
        losses = []
        best_auc = -np.inf

        nb_batches = len(train_generator)

        for epoch in range(total_epoch):
            start = time.time()
            print('Epoch: %d/%d' % (epoch + 1, total_epoch))
            running_loss = 0
            Model.train()

            for item_train in train_generator:
                # Forward pass
                optimizer.zero_grad()
                loss = Model.train_update(item_train, criterion_clf)
                # Backward pass
                loss.backward()
                optimizer.step()

                # Track loss change
                running_loss += loss.item()

            # Calculate AUC on validation set
            predictions = []
            trues = []
            for item_valid in valid_generator:
                prediction = Model.predict(item_valid, is_prob=True)
                predictions.append(np.mean(prediction.cpu().numpy()))
                trues.append(item_valid[1].numpy()[0])

            auc = roc_auc_score(trues, predictions)
            aucs.append(auc)

            print("AUC: ", auc, ", Running loss: ", running_loss / nb_batches,
                  ", Time: ",
                  time.time() - start)

            # If over 1/2 of epochs, and auc is the best auc so far, save this model as best model.
            if (epoch >= total_epoch // 2) and (auc > best_auc):
                torch.save(Model.state_dict(),
                           path_to_model + '/BEST_model.pth')
                best_auc = auc
            else:
                pass

            # Update losses
            losses.append(running_loss / nb_batches)
            scheduler.step()

        np.save(path_to_model + '/AUCS.npy', np.array(aucs))
        np.save(path_to_model + '/LOSSES.npy', np.array(losses))
        torch.save(Model.state_dict(), path_to_model + '/LAST_model.pth')

        return aucs, losses, path_to_model
Example #20
        plt.title("Actul vs Predicted Load ({}) - {}  Window size {}".format(
            location, "SVR_" + self.kernel_type, window_size))
        plt.plot(list(range(len(preds))), testY, label='Actual Load')
        plt.plot(list(range(len(preds))), preds, label='Predicted Load')
        plt.xlabel('Time')
        plt.ylabel('Power Consumption (MW)')
        plt.legend()
        plt.show()

        return error


if __name__ == '__main__':
    fname = "data/household.csv"  # Works for household. Boosting does not.
    location = os.path.split(fname)[1].split(".")[0]
    datareader = DataReader(fname, sample_size=10000, encoding='Cosine')
    features, Y = datareader.get_data()

    window_size = 7
    # 7 does not predict higher extreme values
    # 28 does not predict lower extreme values

    features = features[:-window_size]
    X, Y = window(Y, window_size)
    X = np.concatenate((X, features), axis=1)

    svm_poly = SVRRegression(kernel_type='poly')
    loss = svm_poly.fit_predict(X, Y, location)

    print("Loss : ", "%.2f" % loss)
Example #21
def read_dataset(dataset_file):
    dr = DataReader(dataset_file)
    return dr.read_data()
Example #22
class RegressionDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, id):
        sample = self.inputs[id], self.labels[id]
        return sample


if __name__ == '__main__':
    fname = "data/AEP_hourly.csv"
    datareader = DataReader(fname)
    X, Y = datareader.get_data()

    dataset = RegressionDataset(inputs=X, labels=Y)
    dataset_loader = DataLoader(dataset,
                                batch_size=8,
                                shuffle=False,
                                num_workers=2)

    for i, [input, label] in enumerate(dataset_loader):
        print(input)
        print(label)
        print()
        if i == 2: break
Example #23
    print('Client connected')


@socket_.on('disconnect_request', namespace='/biometrics')
def disconnect_request():
    @copy_current_request_context
    def can_disconnect():
        disconnect()

    print('Client disconnected')
    emit('my_response', {'data': 'Disconnected!'}, callback=can_disconnect)


if __name__ == '__main__':

    reader = DataReader()

    def send_data():
        while True:
            red, ir, hr, hr_v, spo2, spo2_v = reader.get_values()
            payload = {
                't': round(time()),
                'red': red,
                'ir': ir,
                'hr': hr,
                'hr_v': hr_v,
                'spo2': spo2,
                'spo2_v': spo2_v
            }
            socket_.emit('data', payload, namespace="/biometrics")
            socket_.sleep(0.1)
Example #24
# training parameters
learning_rate = 1e-6  # learning rate
num_epochs = 10  # number of epochs

input_channels = 1  # input channel
input_height = 28  # height
input_width = 28  # width
num_classes = 6  # number of image classes
# image size, image class
one_layer_net = OneLayerNet(input_height * input_width, num_classes)

# path to the directories with images
train_dir = "data/train"
test_dir = "data/test"

train_generator = DataReader(train_dir, [input_height, input_width], True,
                             input_channels, num_classes).get_generator()
# use DataReader to load the training and test image sets
test_generator = DataReader(test_dir, [input_height, input_width], False,
                            input_channels, num_classes).get_generator()

print('Size of training set: {}'.format(
    train_generator.get_data_size()))  # number of training images
print('Size of testing set: {}'.format(
    test_generator.get_data_size()))  # number of test images

print("{} Start training...".format(datetime.now()))  # время начала обучения

# loop that computes the recognition error for each training epoch
for epoch in range(num_epochs):
    print("{} Epoch number: {}".format(datetime.now(), epoch + 1))
    loss = 0
Example #25
# CONSTANTS
DIR = 'challenges/spooky_author_identification'
TARGET_COLUMNS = ["EAP", "HPL", "MWS"]

# VARIABLES
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
process_data = False
num_hyper_samples = 10
cv_splits = 5
cv_repeats = 1

# PROCESS/LOAD DATA
if process_data:
    #   Load raw data
    data_reader = DataReader(config_fp='{}/data/config.yml'.format(DIR))
    df_train = data_reader.load_from_csv(fp='{}/data/train.csv'.format(DIR),
                                         validate_col_names=True,
                                         is_test=False,
                                         append_vartype=True)
    df_test = data_reader.load_from_csv(fp='{}/data/test.csv'.format(DIR),
                                        validate_col_names=True,
                                        is_test=True,
                                        append_vartype=True)

    # EMBEDDED EXPERIMENTS
    #   Load embedding vectors
    filename = '/Users/gregwalsh/Downloads/GoogleNews-vectors-negative300.bin'
    keyed_vectors = KeyedVectors.load_word2vec_format(fname=filename,
                                                      binary=True)