def __getitem__(self, index, mus_win=512, mus_hop=256, eeg_win=32, eeg_hop=2):
    """Generates one sample of data"""
    # Select sample
    index_rand = round(np.random.uniform(0, len(self.eeg) - 1))
    X = chunker(self.eeg[index_rand], self.music[index_rand],
                self.sample_len, self.eeg_sr, self.sr)
    X_m = stft(X[1], self.sr, mus_win, mus_hop)
    X_e = stft_eeg(X[0], self.sr, eeg_win, eeg_hop)
    if self.use_noise:
        X_m = add_rand_noise(abs(X_m))
        X_e = add_rand_noise(abs(X_e))
    X_m = to_log(abs(X_m) + 1e-6)
    X_e = z_norm(X_e)
    X_e = to_log(abs(X_e) + 1e-6)
    X_e = z_norm(X_e)
    X_e = torch.tensor(X_e).float()
    X_m = torch.tensor(X_m).float()
    X_m = (X_m - X_m.mean(dim=0, keepdim=True)) / (
        X_m.std(dim=0, keepdim=True) + 1e-6)
    for i in np.arange(X_e.size(0)):
        X_e[i] = (X_e[i] - X_e[i].mean(dim=0, keepdim=True)) / (
            X_e[i].std(dim=0, keepdim=True) + 1e-6)
    return X_e, X_m
def update_TLEs():
    global TLEs
    url = TLE_SETTINGS['url']
    req = urllib.request.Request(url, method='GET')
    retrieved_lines = []
    with urllib.request.urlopen(req) as f:
        if f.status == 200:
            retrieved_lines = [line.decode().replace('\r\n', '').strip()
                               for line in f.readlines()]
        else:
            raise Exception("Error downloading TLE file")
    new_TLEs = []
    for group in chunker(retrieved_lines, 3):
        sat = {
            'name': group[0],
            'id': int(group[2].split()[1]),
            'line1': group[1],
            'line2': group[2]
        }
        new_TLEs.append(sat)
    TLEs = new_TLEs
    prep_data()
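# Most of the snippets collected here call a `chunker` helper whose definition
# is not shown. The usual pattern is a generator that slices a sequence into
# fixed-size pieces; a minimal sketch under that assumption (not the exact
# helper used by each project -- e.g. the __getitem__ above passes a chunker
# with a different, project-specific signature):
def chunker(seq, size):
    """Yield successive `size`-sized slices of `seq`; the last one may be shorter."""
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

# Example: list(chunker([1, 2, 3, 4, 5, 6, 7], 3)) -> [[1, 2, 3], [4, 5, 6], [7]]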
def update_TLEs(localOnly=False):
    global TLEs
    # print('updating TLEs')
    if localOnly and Path(TLE_SETTINGS['localFile']).is_file():
        with open(TLE_SETTINGS['localFile']) as local:
            TLEs = json.load(local)
        prep_data()
        return
    url = TLE_SETTINGS['url']
    logging.info(f"Retrieving TLE information from {url}")
    # print('downloading TLEs')
    req = urllib.request.Request(url, method='GET')
    retrieved_lines = []
    with urllib.request.urlopen(req) as f:
        if f.status == 200:
            retrieved_lines = [
                line.decode().replace('\r\n', '').strip()
                for line in f.readlines()
            ]
        else:
            logging.error(f'Cannot retrieve TLE file: {f.status}: {f.reason}')
    # if you get an error page from ISP with a stupid 200 code
    if len(retrieved_lines) < 3 or retrieved_lines[0].startswith('<!'):
        logging.error(f'Cannot retrieve TLE file: {retrieved_lines[0]}')
        # if no TLEs in memory fall back to last local temp file
        if len(TLEs) < 3:
            with open(TLE_SETTINGS['localFile']) as local:
                TLEs = json.load(local)
            prep_data()
        return
    # filter retrieved lines to TLEs array and write to local temp file
    pattern = re.compile(TLE_SETTINGS['filter'])
    new_TLEs = []
    for group in chunker(retrieved_lines, 3):
        if len(group) == 3 and pattern.match(group[0]):
            sat = {
                'name': group[0],
                'id': int(group[2].split()[1]),
                'line1': group[1],
                'line2': group[2]
            }
            new_TLEs.append(sat)
    TLEs = new_TLEs
    prep_data()
    # write to local file
    with open(TLE_SETTINGS['localFile'], 'w') as local:
        json.dump(TLEs, local, ensure_ascii=False, indent=4)
    logging.info(
        f'Retrieved {len(retrieved_lines)//3} TLEs from {url}, filtered {len(TLEs)} TLEs'
    )
def build_vocab(lines, result_queue):
    """Build a word count dictionary given an iterable of raw lines."""
    vocab = collections.Counter()
    chunks = utils.chunker(lines, 10000, '')
    for chunk in chunks:
        tokenizer = Tokenizer(chunk)
        vocab.update(tok for tok in tokenizer)
    result_queue.put(vocab)
    return vocab
def clean_corpus(corpus):
    with open(corpus, 'r', encoding='iso-8859-1') as in_file:
        fname, _, ext = corpus.rpartition('.')
        with open(f'{fname}_clean.{ext}', 'w', encoding='iso-8859-1') as out_file:
            chunks = utils.chunker(in_file, 10000, '')
            for chunk in chunks:
                tokenizer = Tokenizer(chunk)
                for sent in tokenizer.yield_sentences():
                    out_file.write(sent + '\n')
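# build_vocab and clean_corpus above pass an open file plus a pad value ('')
# to utils.chunker, which suggests an itertools-style grouper over an arbitrary
# iterable rather than the slicing version sketched earlier. A possible sketch
# under that assumption (the name and exact padding behaviour are guesses):
import itertools

def chunker_iterable(iterable, size, fillvalue=None):
    """Group any iterable into fixed-size tuples, padding the last group."""
    args = [iter(iterable)] * size
    return itertools.zip_longest(*args, fillvalue=fillvalue)

# Example: list(chunker_iterable('abcde', 2, fillvalue=''))
#          -> [('a', 'b'), ('c', 'd'), ('e', '')]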
def predict(self, x):
    if self.standardize:
        x = self.scaler.transform(x)
    # Predict in chunks to save memory
    preds = np.array([])
    for x_ch in utils.chunker(x, 2**19):
        preds = np.append(preds, self.model.predict(x_ch))
    return pd.Series(preds, index=x.index).clip(0., 20.)
def main():
    sentence_batches = utils.chunker(CSV_DATA['RedditNews.csv'].News, CHUNK_SIZE)
    character_ids = process_sent_batches(sentence_batches)
    # pair the options path with OPTIONS_FILE and the weights path with MODEL_FILE
    options = config.MODELS + config.OPTIONS_FILE
    weights = config.MODELS + config.MODEL_FILE
    model = load_elmo(options, weights)
    embeddings = get_sent_embeddings(character_ids, model)
def encode():
    """Encode all of the sentences to vector form"""
    train, dev, test = loader.getData()
    sentences = []
    tokens = []
    # Load the vocab
    en_vocab = get_english_vocab(DATA_DIR, VOCAB_SIZE)
    # Collect all the training sentences
    for i, row in pd.concat((train, test)).iterrows():
        if isinstance(row["sentence1"], basestring) and isinstance(row["sentence2"], basestring):
            sentences.append(row["sentence1"])
            sentences.append(row["sentence2"])
    # Allocate the sentences to buckets
    bucketed = {}
    for sentence in sentences:
        bucket_id = get_bucket(en_vocab, sentence)
        bucketed.setdefault(bucket_id, [])
        bucketed[bucket_id].append(sentence)
    mapped = {}
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True, train_dir=TRAIN_DIR)
        model.batch_size = BATCH_SIZE  # We decode 64 sentence at a time.
        # Iterate over each bucket
        for bucket_id, sentences in bucketed.iteritems():
            for batch in chunker(sentences, BATCH_SIZE):
                data = []
                for sentence in batch:
                    token_ids = data_utils.sentence_to_token_ids(
                        tf.compat.as_bytes(sentence), en_vocab)
                    expected_output = []
                    data.append((token_ids, expected_output))
                encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                    {bucket_id: data}, bucket_id)
                contexts = model.step_context(sess, encoder_inputs, decoder_inputs,
                                              target_weights, bucket_id)
                features = np.hstack(contexts)
                print 'Extracted another set of features with shape:', features.shape
                # Now we align sentences with their contexts
                for i, sentence in enumerate(batch):
                    mapped[sentence] = features[i, :].tolist()
                    print sentence
                    print mapped[sentence]
    print "Saving sentences to %s" % JSON_NAME
    with open(JSON_NAME, 'w') as file:
        json.dump(mapped, file)
def resample_df_mean(df, f_ratio_of_output_cone=3.65):
    eelabel = "EE_in_" + str(f_ratio_of_output_cone) + "_cone"
    chunks = utils.chunker(df, 4)
    dff = pd.DataFrame()
    for chunk in chunks:
        dff = pd.concat([
            dff,
            pd.DataFrame(chunk[[
                "f_ratio_in", "f_ratio_out", "EE_in_input_cone", eelabel
            ]].median(skipna=True)).T
        ], ignore_index=True)
    return dff
def get_follow_ids(twitter_names):
    ids = []
    if path.exists("twitter_ids.txt"):
        with open("twitter_ids.txt", "r+") as f:
            for line in f:
                c = line[:-1]
                ids.append(c)
        return ids
    else:
        for t in chunker(twitter_names, 100):
            follow_ids = []
            for user in api.UsersLookup(screen_name=t):
                id_str = json.loads(user.__str__())['id_str']
                follow_ids.append(id_str)
            write_to_file(follow_ids)
            ids.extend(follow_ids)
        return ids
def _parse_entry_table(self) -> (List[Firmware], List[Directory]):
    entries = chunker(self.firmware_entry_table[4:], 4)
    for index, entry in enumerate(entries):
        firmware_type = self._FIRMWARE_ENTRY_TYPES[index] if index < len(
            self._FIRMWARE_ENTRY_TYPES) else 'unknown'
        address = struct.unpack('<I', entry)[0] & 0x00FFFFFF
        # assumption: offset == 0 is an invalid entry
        if address not in [0x0, 0xfffffe]:
            directory = self[address:address + 16 * 8]
            magic = directory[:4]
            # either this entry points to a PSP directory directly
            if magic in [b'$PSP', b'$BHD']:
                directory = Directory(self, address, firmware_type)
                self.directories.append(directory)
                # if this Directory points to a secondary directory: add it, too
                if directory.secondary_directory_address is not None:
                    secondary_directory = Directory(
                        self, directory.secondary_directory_address, 'secondary')
                    self.directories.append(secondary_directory)
            # or this entry points to a combo-directory (i.e. two directories)
            elif magic == b'2PSP':
                psp_dir_one_addr = struct.unpack(
                    '<I', directory[10 * 4:10 * 4 + 4])[0] & 0x00FFFFFF
                psp_dir_two_addr = struct.unpack(
                    '<I', directory[14 * 4:14 * 4 + 4])[0] & 0x00FFFFFF
                for address in [psp_dir_one_addr, psp_dir_two_addr]:
                    directory = Directory(self, address, firmware_type)
                    self.directories.append(directory)
                    # if this Directory points to a secondary directory: add it, too
                    if directory.secondary_directory_address is not None:
                        secondary_directory = Directory(
                            self, directory.secondary_directory_address, 'secondary')
                        self.directories.append(secondary_directory)
            # or this entry is unparsable and thus a firmware
            else:
                firmware = Firmware(self, address, firmware_type, magic)
                self.firmwares.append(firmware)
def ascii_discrepancies(data, window, local_diff, feature_names=[]):
    vectors = []
    data_length = len(data)
    isascii = lambda s: len(s) == len(s.encode())
    for i, entry in enumerate(data):
        entry = entry.lower()
        window_chars = round(len(entry) * window)
        local = []
        for chunk in chunker(entry, window_chars):
            non_ascii_count = 0
            for char in chunk:
                if not char:
                    continue
                if not isascii(char):
                    non_ascii_count += 1
            local.append([non_ascii_count / len(chunk)])
        if local_diff:
            local_len = len(local)
            for local_index in range(local_len):
                if local_index == local_len - 1:
                    break
                local[local_index] = [abs(a - b) for a, b in
                                      zip(local[local_index], local[local_index + 1])]
        min_v = np.amin(local, axis=0).tolist()
        max_v = np.amax(local, axis=0).tolist()
        diff = np.subtract(max_v, min_v).tolist()
        vectors.append(diff)
        print_progress_bar(i + 1, data_length, description='ascii_chars_discrepancies')
    feature_names.extend(['ascii_chars_discrepancies'])
    return vectors
def get_twitter_name_from_ids(cmc_ids):
    info_url = 'https://pro-api.coinmarketcap.com/v1/cryptocurrency/info'
    tw_ids = []
    for id_chunk in chunker(cmc_ids, 500):
        try:
            info_parameters = {'id': ','.join(map(str, id_chunk))}
            response = session.get(info_url, params=info_parameters)
            data = json.loads(response.text)
            for i in data['data'].keys():
                print(data['data'][i])
                twitter = data['data'][i]['urls']['twitter']
                if twitter:
                    d = twitter[0].split('/')
                    tw_id = d[len(d) - 1]
                    tw_ids.append(str(tw_id))
        except (ConnectionError, Timeout, TooManyRedirects, Exception) as e:
            print(e)
    return tw_ids
def get_sentence_to_context_map(sentences):
    """
    Process all of the sentences with the model
    Return a map between sentence text and the context vectors
    The order of the map is undefined due to the bucketing process
    """
    # Load the vocab
    en_vocab = get_english_vocab(DATA_DIR, VOCAB_SIZE)
    # Allocate the sentences to buckets
    bucketed = {}
    for sentence in sentences:
        bucket_id = get_bucket(en_vocab, sentence)
        bucketed.setdefault(bucket_id, [])
        bucketed[bucket_id].append(sentence)
    mapped = {}
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True, train_dir=TRAIN_DIR)
        model.batch_size = BATCH_SIZE  # We decode 64 sentence at a time.
        # Iterate over each bucket
        for bucket_id, sentences in bucketed.iteritems():
            for batch in chunker(sentences, BATCH_SIZE):
                data = []
                # Tokenize each sentence
                for sentence in batch:
                    token_ids = data_utils.sentence_to_token_ids(
                        tf.compat.as_bytes(sentence), en_vocab)
                    expected_output = []
                    data.append((token_ids, expected_output))
                # Use the model to obtain contexts for each sentence in the batch
                encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                    {bucket_id: data}, bucket_id)
                contexts = model.step_context(sess, encoder_inputs, decoder_inputs,
                                              target_weights, bucket_id)
                features = np.hstack(contexts)
                print 'Encoded {0} sentences into {1} dimensional vectors'.format(*features.shape)
                # Now we align sentences with their contexts
                for i, sentence in enumerate(batch):
                    mapped[sentence] = features[i, :].tolist()
    return mapped
def pred_alt(X):
    preds = []
    gt = []
    for record in test_dict:
        all_rows = test_dict[record]['x']
        record_y_gt = []
        record_y_pred = []
        for batch_hyp in chunker(range(all_rows.shape[0])):
            X = all_rows[min(batch_hyp):max(batch_hyp) + 1, ...]
            Y = test_dict[record]['y'][min(batch_hyp):max(batch_hyp) + 1]
            X = np.expand_dims(X, 0)
            X = rescale_array(X)
            Y_pred = model.predict(X)
            Y_pred = Y_pred.argmax(axis=-1).ravel().tolist()
            gt += Y.ravel().tolist()
            preds += Y_pred
            record_y_gt += Y.ravel().tolist()
            record_y_pred += Y_pred
        # fig_1 = plt.figure(figsize=(12, 6))
        # plt.plot(record_y_gt)
        # plt.title("Sleep Stages")
        # plt.ylabel("Classes")
        # plt.xlabel("Time")
        # plt.show()
        #
        # fig_2 = plt.figure(figsize=(12, 6))
        # plt.plot(record_y_pred)
        # plt.title("Predicted Sleep Stages")
        # plt.ylabel("Classes")
        # plt.xlabel("Time")
        # plt.show()
    return preds
def _parse_entries(self):
    for entry_bytes in self.body.get_chunks(self._entry_size):
        entry_fields = {}
        for key, word in zip(self.ENTRY_FIELDS, chunker(entry_bytes, 4)):
            entry_fields[key] = struct.unpack('<I', word)[0]
        # addresses are all starting at 0xff000000, but we just want everything from there
        entry_fields['offset'] &= 0x00FFFFFF
        entry = Entry.from_fields(self, self.parent_buffer, entry_fields['type'],
                                  entry_fields['size'], entry_fields['offset'])
        for existing_entry in self.blob.unique_entries:
            if entry == existing_entry:
                existing_entry.references.append(self)
        if isinstance(entry, PubkeyEntry):
            self.blob.pubkeys[entry.key_id] = entry
        self.blob.unique_entries.add(entry)
        self.entries.append(entry)
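# The firmware-parsing code above (_parse_entry_table and _parse_entries) uses
# chunker to split raw bytes into 4-byte words before struct.unpack. A small,
# self-contained illustration of that pattern with made-up input bytes:
import struct

def _words(buf, n=4):
    # same idea as chunker(buf, 4): fixed-size byte slices
    return (buf[i:i + n] for i in range(0, len(buf), n))

raw = bytes.fromhex('01000000' '02000000' 'ff000000')
values = [struct.unpack('<I', w)[0] for w in _words(raw)]
# values == [1, 2, 255]  (little-endian 32-bit words)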
def apply_license(self, path, verbose=False):
    """
    Applies a given license (class instance) to a given path.

    lic License : License class instance, should have been initialized
                  with one of your choice (apache, bsd, ...)
    path String : File or dir the License should apply to.
    """
    files_in_path = self._get_path_elements(path)
    for chunk in utils.chunker(files_in_path, self.PATH_CHUNKS_SIZE):
        paths = [x[1] for x in chunk]  # Extract file paths
        self.buffer_file_descriptors(paths, mode='r+')
        for elem in chunk:
            # apply license using file extension
            file_license = self.license.get_license_as(elem[0])
            # retrieve the buffered file descriptor from path
            self.write_header_to_file(self.fd_buffer[elem[1]], file_license)
            if verbose:
                print "Stamping: %s" % elem[1]
        self._clear_fd_buffers()
    return
def make_mask(array, labels, rule):
    '''
    Generates an np.array mask corresponding to the given rule.
    Supported rules: 'close', 'far_lb1', 'far_lb2', 'v_far'
    Rules represent the distance between the labels - lb1 & lb2
    '''
    indices = []
    stack = [0]
    last = array[0]
    for i, item in enumerate(array[1:], 1):
        if item == last:
            stack.append(i)
        else:
            if stack:
                indices.append(stack[0])   # min index
                indices.append(stack[-1])  # max or repeat index
                stack = []
            stack.append(i)
            last = item
    if stack:
        indices.append(stack[0])
        indices.append(stack[-1])
    # eliminate first pair if not the starting label
    if labels[0] != array[0]:
        indices = indices[2:]
    # eliminate the last pair to comply with lb1, lb2 pairs
    if len(indices) / 2 % 2:
        indices = indices[:-2]
    result = []
    for ind in chunker(indices, 4):
        result.extend(dist_ind(ind, rule))
    mask = np.zeros(len(array), dtype=int)
    mask[result] = 1
    return mask
for c in tqdm(cubes):
    flat_cubes, rewards = get_all_possible_actions_cube_small(c)
    cube_next_reward.append(rewards)
    flat_next_states.extend(flat_cubes)
    cube_flat.append(flatten_1d_b(c))

for _ in range(20):
    cube_target_value = []
    cube_target_policy = []

    next_state_value, _ = model.predict(np.array(flat_next_states), batch_size=1024)
    next_state_value = next_state_value.ravel().tolist()
    next_state_value = list(chunker(next_state_value, size=len(action_map_small)))

    for c, rewards, values in tqdm(zip(cubes, cube_next_reward, next_state_value)):
        r_plus_v = 0.4 * np.array(rewards) + np.array(values)
        target_v = np.max(r_plus_v)
        target_p = np.argmax(r_plus_v)
        cube_target_value.append(target_v)
        cube_target_policy.append(target_p)

    cube_target_value = (cube_target_value - np.mean(cube_target_value)) / (
        np.std(cube_target_value) + 0.01)

    print(cube_target_policy[-30:])
    print(cube_target_value[-30:])
def run(name, dataset, config, all_users, all_movies, tests, initial_v, sep): config_name = config["name"] number_hidden = config["number_hidden"] epochs = config["epochs"] ks = config["ks"] momentums = config["momentums"] l_w = config["l_w"] l_v = config["l_v"] l_h = config["l_h"] decay = config["decay"] config_result = config.copy() config_result["results"] = [] vis = T.matrix() vmasks = T.matrix() rbm = CFRBM(len(all_users) * 5, number_hidden) profiles = defaultdict(list) with open(dataset, "rt") as data: for i, line in enumerate(data): uid, mid, rat, timstamp = line.strip().split(sep) profiles[mid].append((uid, float(rat))) print("Users and ratings loaded") for j in range(epochs): def get_index(col): if j / (epochs / len(col)) < len(col): return j / (epochs / len(col)) else: return -1 index = get_index(ks) mindex = get_index(momentums) icurrent_l_w = get_index(l_w) icurrent_l_v = get_index(l_v) icurrent_l_h = get_index(l_h) k = ks[index] momentum = momentums[mindex] current_l_w = l_w[icurrent_l_w] current_l_v = l_v[icurrent_l_v] current_l_h = l_h[icurrent_l_h] train = rbm.cdk_fun( vis, vmasks, k=k, w_lr=current_l_w, v_lr=current_l_v, h_lr=current_l_h, decay=decay, momentum=momentum ) predict = rbm.predict(vis) batch_size = 10 for batch_i, batch in enumerate(utils.chunker(profiles.keys(), batch_size)): size = min(len(batch), batch_size) # create needed binary vectors bin_profiles = {} masks = {} for movieid in batch: movie_profile = [0.0] * len(all_users) mask = [0] * (len(all_users) * 5) for user_id, rat in profiles[movieid]: movie_profile[all_users.index(user_id)] = rat for _i in range(5): mask[5 * all_users.index(user_id) + _i] = 1 example = expand(np.array([movie_profile])).astype("float32") bin_profiles[movieid] = example masks[movieid] = mask movies_batch = [bin_profiles[id] for id in batch] masks_batch = [masks[id] for id in batch] train_batch = np.array(movies_batch).reshape(size, len(all_users) * 5) train_masks = np.array(masks_batch).reshape(size, len(all_users) * 5) train_masks = train_masks.astype("float32") train(train_batch, train_masks) sys.stdout.write(".") sys.stdout.flush() batch_size = 10 ratings = [] predictions = [] for batch in utils.chunker(tests.keys(), batch_size): size = min(len(batch), batch_size) # create needed binary vectors bin_profiles = {} masks = {} for movieid in batch: movie_profile = [0.0] * len(all_users) mask = [0] * (len(all_users) * 5) for userid, rat in profiles[movieid]: movie_profile[all_users.index(userid)] = rat for _i in range(5): mask[5 * all_users.index(userid) + _i] = 1 example = expand(np.array([movie_profile])).astype("float32") bin_profiles[movieid] = example masks[movieid] = mask positions = {movie_id: pos for pos, movie_id in enumerate(batch)} movies_batch = [bin_profiles[el] for el in batch] test_batch = np.array(movies_batch).reshape(size, len(all_users) * 5) movie_predictions = revert_expected_value(predict(test_batch)) for movie_id in batch: test_users = tests[movie_id] try: for user, rating in test_users: current_movie = movie_predictions[positions[movie_id]] predicted = current_movie[all_users.index(user)] rating = float(rating) ratings.append(rating) predictions.append(predicted) except Exception: pass vabs = np.vectorize(abs) distances = np.array(ratings) - np.array(predictions) mae = vabs(distances).mean() rmse = sqrt((distances ** 2).mean()) iteration_result = { "iteration": j, "k": k, "momentum": momentum, "mae": mae, "rmse": rmse, "lrate": current_l_w, } config_result["results"].append(iteration_result) 
print(iteration_str.format(j, k, current_l_w, momentum, mae, rmse)) with open("{}_{}.json".format(config_name, name), "wt") as res_output: res_output.write(json.dumps(config_result, indent=4))
all_users, all_movies, tests = load_dataset(FLAGS.train_path, FLAGS.test_path,
                                            FLAGS.sep, user_based=True)
rbm = RBM(len(all_movies) * 5, FLAGS.num_hidden)
print("model created")
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

profiles = defaultdict(list)
with open(FLAGS.train_path, 'rt') as data:
    for i, line in enumerate(data):
        uid, mid, rat, timestamp = line.strip().split(FLAGS.sep)
        profiles[uid].append((mid, float(rat)))
print("Users and ratings loaded")

for e in range(FLAGS.epochs):
    for batch_i, batch in enumerate(chunker(list(profiles.keys()), FLAGS.batch_size)):
        size = min(len(batch), FLAGS.batch_size)
        # create needed binary vectors
        bin_profiles = {}
        masks = {}
        # only consider the movies that users have interacted with
        for userid in batch:
            user_profile = np.array([0.] * len(all_movies))
            mask = [0] * (len(all_movies) * 5)
            for movie_id, rat in profiles[userid]:
                user_profile[all_movies.index(movie_id)] = rat
                for _i in range(5):
                    mask[5 * all_movies.index(movie_id) + _i] = 1
            example = expand(np.array([user_profile])).astype('float32')
            bin_profiles[userid] = example
def _multi_query( sparql, timeout, graph_pattern, source_target_pairs, batch_size, _vars, _values, _ret_val_mapping, _res_init, _chunk_q, _chunk_res, _res_update=lambda r, u, **___: r.update(u), **kwds): total_time = 0 res = _res_init(source_target_pairs, **kwds) for val_chunk in chunker(_values, batch_size): q = _chunk_q(graph_pattern, _vars, val_chunk, **kwds) chunk_stps = [stp for v in val_chunk for stp in _ret_val_mapping[v]] _start_time = timer() t = None chunk_res = None loop = 1 while loop: loop -= 1 try: t, q_res = _query(sparql, timeout, q, **kwds) chunk_res = _chunk_res( q_res, _vars, _ret_val_mapping, **kwds) except EndPointNotFound: # happens if the endpoint reports a 404... # as virtuoso in rare cases seems to report a 404 let's # retry once after some time but then if not loop: # expected to 0 on first such exception logger.info( 'SPARQL endpoint reports a 404, will retry once in 10s' ) sleep(10) loop += 2 continue else: # expected to be 1 on second such exception loop = 0 logger.warning( 'SPARQL endpoint unreachable even after back-off ' 'and retry\n' 'could not perform query:\n%s for %s\nException:', q, val_chunk, exc_info=1, # appends exception to message ) t, chunk_res = timer() - _start_time, {} except (SPARQLWrapperException, SAXParseException, URLError) as e: if (isinstance(e, SPARQLWrapperException) and re.search( r'The estimated execution time [0-9]+ \(sec\) ' r'exceeds the limit of [0-9]+ \(sec\)\.', repr(e))): t, chunk_res = timeout, {} elif len(val_chunk) > 1: logger.debug('error in batch: {}'.format(val_chunk)) logger.debug('retrying with half size batch: {}...'.format( len(val_chunk) // 2 )) t, chunk_res = _multi_query( sparql, timeout, graph_pattern, chunk_stps, len(val_chunk) // 2, _vars, val_chunk, _ret_val_mapping, _res_init, _chunk_q, _chunk_res, _res_update, **kwds) else: logger.warning( 'could not perform query:\n%s for %s\nException:', q, val_chunk, exc_info=1, # appends exception to message ) t, chunk_res = timer() - _start_time, {} except Exception: # TODO: maybe introduce a max error counter? per process? logger.warning( 'unhandled exception, assuming empty res for multi-query:\n' 'Query:\n%s\nChunk:%r\nException:', q, val_chunk, exc_info=1, # appends exception to message ) t, chunk_res = timer() - _start_time, {} _res_update(res, chunk_res, **kwds) total_time += t if query_time_soft_exceeded(total_time, timeout): logger.debug('early terminating batch query as timeout/2 exceeded') break return total_time, res
def write_to_file(ids):
    with open("twitter_names.txt", "a+") as f:
        for o in ids:
            f.write('%s\n' % o)


def get_twitter_names():
    names = []
    if path.exists("twitter_names.txt"):
        with open("twitter_names.txt", "r+") as f:
            for line in f:
                c = line[:-1]
                names.append(c)
        return names
    else:
        cmc_ids = get_top_n_cmc(1500)
        twitter_names = get_twitter_name_from_ids(cmc_ids)
        write_to_file(twitter_names)
        return twitter_names


if __name__ == "__main__":
    t_names = get_twitter_names()
    for t in chunker(t_names, 100):
        if "aeternity" in t:
            print(t)
client = MongoClient('localhost', 27017, username='******', password='******')
db = client["taxiRides"]
rides_collection = db["rides"]

column_remapping = json.load(open(folder + '/../dataset/column_remapping.json'))

FILES = CONFIG['FILES']
for FILE in FILES:
    print(f"Importing {FILE} Rides")
    with open(folder + '/../dataset/chicago_taxi_trips_' + FILE + '.csv') as csvfile:
        rides = csv.DictReader(csvfile)
        i = 0  # for progress
        CHUNKER_SIZE = 1000
        if CONFIG['IMPORT_LIMIT'] > 0:
            rides = islice(rides, CONFIG['IMPORT_LIMIT'])
        for rides_chunk in chunker(rides, CHUNKER_SIZE):
            rides_chunk = [embed_ride(ride, column_remapping) for ride in rides_chunk]
            rides_collection.insert_many(rides_chunk)
            i += CHUNKER_SIZE
            if i % 10000 == 0:
                print(f"Progress: {i}")
def parse_question(question):
    chunk_dict = {'id': question.id}
    question = utils.chunker(question.question)
    for i in xrange(len(question)):
        chunk_dict[i] = question[i]
    return json.dumps(chunk_dict)
def get_pem_encoded(self):
    return b'-----BEGIN PUBLIC KEY-----\n' + \
        b'\n'.join(chunker(b64encode(self.get_der_encoded()), 64)) + \
        b'\n-----END PUBLIC KEY-----\n'
def run(name, dataset, config, all_users, all_movies, tests, initial_v, sep): config_name = config['name'] number_hidden = config['number_hidden'] epochs = config['epochs'] ks = config['ks'] momentums = config['momentums'] l_w = config['l_w'] l_v = config['l_v'] l_h = config['l_h'] decay = config['decay'] batch_size = config['batch_size'] config_result = config.copy() config_result['results'] = [] vis = T.matrix() vmasks = T.matrix() rbm = CFRBM(len(all_movies) * 5, number_hidden) profiles = defaultdict(list) with open(dataset, 'rt') as data: for i, line in enumerate(data): uid, mid, rat, timstamp = line.strip().split(sep) profiles[uid].append((mid, float(rat))) print("Users and ratings loaded") for j in range(epochs): def get_index(col): if j/(epochs/len(col)) < len(col): return j/(epochs/len(col)) else: return -1 index = get_index(ks) mindex = get_index(momentums) icurrent_l_w = get_index(l_w) icurrent_l_v = get_index(l_v) icurrent_l_h = get_index(l_h) k = ks[index] momentum = momentums[mindex] current_l_w = l_w[icurrent_l_w] current_l_v = l_v[icurrent_l_v] current_l_h = l_h[icurrent_l_h] train = rbm.cdk_fun(vis, vmasks, k=k, w_lr=current_l_w, v_lr=current_l_v, h_lr=current_l_h, decay=decay, momentum=momentum) predict = rbm.predict(vis) for batch_i, batch in enumerate(chunker(profiles.keys(), batch_size)): size = min(len(batch), batch_size) # create needed binary vectors bin_profiles = {} masks = {} for userid in batch: user_profile = [0.] * len(all_movies) mask = [0] * (len(all_movies) * 5) for movie_id, rat in profiles[userid]: user_profile[all_movies.index(movie_id)] = rat for _i in range(5): mask[5 * all_movies.index(movie_id) + _i] = 1 example = expand(np.array([user_profile])).astype('float32') bin_profiles[userid] = example masks[userid] = mask profile_batch = [bin_profiles[id] for id in batch] masks_batch = [masks[id] for id in batch] train_batch = np.array(profile_batch).reshape(size, len(all_movies) * 5) train_masks = np.array(masks_batch).reshape(size, len(all_movies) * 5) train_masks = train_masks.astype('float32') train(train_batch, train_masks) sys.stdout.write('.') sys.stdout.flush() ratings = [] predictions = [] for batch in chunker(tests.keys(), batch_size): size = min(len(batch), batch_size) # create needed binary vectors bin_profiles = {} masks = {} for userid in batch: user_profile = [0.] 
* len(all_movies) mask = [0] * (len(all_movies) * 5) for movie_id, rat in profiles[userid]: user_profile[all_movies.index(movie_id)] = rat for _i in range(5): mask[5 * all_movies.index(movie_id) + _i] = 1 example = expand(np.array([user_profile])).astype('float32') bin_profiles[userid] = example masks[userid] = mask positions = {profile_id: pos for pos, profile_id in enumerate(batch)} profile_batch = [bin_profiles[el] for el in batch] test_batch = np.array(profile_batch).reshape(size, len(all_movies) * 5) user_preds = revert_expected_value(predict(test_batch)) for profile_id in batch: test_movies = tests[profile_id] try: for movie, rating in test_movies: current_profile = user_preds[positions[profile_id]] predicted = current_profile[all_movies.index(movie)] rating = float(rating) ratings.append(rating) predictions.append(predicted) except Exception: pass vabs = np.vectorize(abs) distances = np.array(ratings) - np.array(predictions) mae = vabs(distances).mean() rmse = sqrt((distances ** 2).mean()) iteration_result = { 'iteration': j, 'k': k, 'momentum': momentum, 'mae': mae, 'rmse': rmse, 'lrate': current_l_w } config_result['results'].append(iteration_result) print(iteration_str.format(j, k, current_l_w, momentum, mae, rmse)) with open('{}_{}.json'.format(config_name, name), 'wt') as res_output: res_output.write(json.dumps(config_result, indent=4))
def forecast(self, output_dir, act_st, fcst_st, fcst_model, test_type, test_bck, top_model=3, ens_method='mean', chunk_sz=1, cpu=1): """Forecast and write result by batch Parameters ---------- output_dir : str output directory act_st : datetime actual start date fcst_st : datetime forecast date fcst_model : dict('period', [list of models]) forecast model options for each periods test_type : {'monthly', 'daily} type of testing back error by month or day test_bck : int number of months to test back chunk_sz : int number of item to validate for each chunk cpu : int number of running processors """ # make output directory output_dir = "{}forecast_{}/".format( output_dir, datetime.datetime.now(timezone(self.tz)).strftime("%Y%m%d-%H%M%S")) self.output_dir = output_dir self.fp.mkdir(output_dir) self.lg.logtxt("create output directory: {}".format(output_dir)) self.fp.writecsv(self.df_act, "{}input_actual.csv".format(output_dir)) self.fp.writecsv(self.df_fcstlog, "{}input_forecast.csv".format(output_dir)) # write external features if self.ext is not None: self.fp.writecsv(self.ext, "{}input_external.csv".format(output_dir)) self.fp.writecsv(self.ext_lag, "{}input_externallag.csv".format(output_dir)) self.lg.logtxt( "write input file: {}input_actual.csv | {}input_forecast.csv | {}input_external.csv | {}input_externallag.csv" .format(output_dir, output_dir, output_dir, output_dir)) else: self.lg.logtxt( "write input file: {}input_actual.csv | {}input_forecast.csv". format(output_dir, output_dir)) self.runitem = {} # set parameter items = self.df_act['id'].unique() n_chunk = len([x for x in chunker(items, chunk_sz)]) act_st = datetime.datetime.combine(act_st, datetime.datetime.min.time()) fcst_st = datetime.datetime.combine(fcst_st, datetime.datetime.min.time()) test_st = fcst_st + relativedelta(months=-test_bck) fcst_pr = len(fcst_model.keys()) pr_st = min(fcst_model.keys()) model_list = list(set(b for a in fcst_model.values() for b in a)) self.lg.logtxt( "total items: {} | chunk size: {} | total chunk: {}".format( len(items), chunk_sz, n_chunk)) # rank the models df_rank = self.rank_model(fcst_model, act_st, fcst_st, test_type, test_st) # forecast cpu_count = 1 if cpu <= 1 else multiprocessing.cpu_count( ) if cpu >= multiprocessing.cpu_count() else cpu self.lg.logtxt("run at {} processor(s)".format(cpu_count)) for i, c in enumerate(chunker(items, chunk_sz), 1): df_fcst = pd.DataFrame() if cpu_count == 1: for r in [ self.forecast_byitem(x, act_st, fcst_st, fcst_pr, model_list, pr_st, i) for x in c ]: df_fcst = df_fcst.append(r, ignore_index=True) else: pool = multiprocessing.Pool(processes=cpu_count) for r in pool.starmap( self.forecast_byitem, [[x, act_st, fcst_st, fcst_pr, model_list, pr_st, i] for x in c]): df_fcst = df_fcst.append(r, ignore_index=True) pool.close() pool.join() # ensemble forecast results df_ens = self.ensemble_model(df_fcst, df_rank, top_model, method=ens_method) # write forecast result fcst_path = "{}output_forecast_{:04d}-{:04d}.csv".format( output_dir, i, n_chunk) self.fp.writecsv(df_ens, fcst_path) # write forecast log result fcstlog_path = "{}output_forecastlog_{:04d}-{:04d}.csv".format( output_dir, i, n_chunk) self.fp.writecsv(df_fcst, fcstlog_path) self.lg.logtxt("write output file ({}/{}): {} | {}".format( i, n_chunk, fcst_path, fcstlog_path)) self.lg.logtxt("[END FORECAST]") self.lg.writelog("{}logfile.log".format(output_dir))
def run(name, dataset, config, all_users, all_movies, tests, initial_v, sep): config_name = config['name'] number_hidden = config['number_hidden'] epochs = config['epochs'] ks = config['ks'] momentums = config['momentums'] l_w = config['l_w'] l_v = config['l_v'] l_h = config['l_h'] decay = config['decay'] batch_size = config['batch_size'] config_result = config.copy() config_result['results'] = [] vis = T.matrix() vmasks = T.matrix() rbm = CFRBM(len(all_movies) * 5, number_hidden) profiles = defaultdict(list) #all_ratings = np.zeros((943,1682*5), dtype=np.float32) #all_masks = np.zeros((943,1682*5), dtype=np.float32) with open(dataset, 'rt') as data: for i, line in enumerate(data): uid, mid, rat, timstamp = line.strip().split(sep) profiles[uid].append((mid, float(rat))) #for i in range(1,5): # if i == int(rat): # all_ratings[int(uid)-1][(int(mid)-1)*5+i-1] = 1.0 # all_masks[int(uid)-1][(int(mid)-1)*5+i-1] = 1.0 print("Users and ratings loaded") for j in range(epochs): def get_index(col): if j/(epochs/len(col)) < len(col): return j/(epochs/len(col)) else: return -1 index = get_index(ks) mindex = get_index(momentums) icurrent_l_w = get_index(l_w) icurrent_l_v = get_index(l_v) icurrent_l_h = get_index(l_h) k = ks[index] momentum = momentums[mindex] current_l_w = l_w[icurrent_l_w] current_l_v = l_v[icurrent_l_v] current_l_h = l_h[icurrent_l_h] train = rbm.cdk_fun(vis, vmasks, k=k, w_lr=current_l_w, v_lr=current_l_v, h_lr=current_l_h, decay=decay, momentum=momentum) predict = rbm.predict(vis) #batch_size = 10 start_time = time.time() for batch_i, batch in enumerate(chunker(profiles.keys(), batch_size)): #for batch_i in range(0,943,batch_size): #profile_batch = np.copy(all_ratings[batch_i:batch_i+batch_size]) #masks_batch = np.copy(all_masks[batch_i:batch_i+batch_size]) #print batch_i, len(profile_batch) size = min(len(batch), batch_size) #create needed binary vectors bin_profiles = {} masks = {} for userid in batch: user_profile = [0.] * len(all_movies) mask = [0] * (len(all_movies) * 5) for movie_id, rat in profiles[userid]: user_profile[all_movies.index(movie_id)] = rat for _i in range(5): mask[5 * all_movies.index(movie_id) + _i] = 1 example = expand(np.array([user_profile])).astype('float32') bin_profiles[userid] = example masks[userid] = mask #print example[0].shape,userid,all_ratings[343].shape #print example[0][:20],all_ratings[343][:20],user_profile[:20] profile_batch = [bin_profiles[id] for id in batch] masks_batch = [masks[id] for id in batch] train_batch = np.array(profile_batch).reshape(size, len(all_movies) * 5) train_masks = np.array(masks_batch).reshape(size, len(all_movies) * 5) #print train_batch[0] train_masks = train_masks.astype('float32') train(train_batch, train_masks) #train(movies_batch, masks_batch) sys.stdout.write('.') sys.stdout.flush() end_time = time.time() train_time = end_time - start_time #batch_size = 10 ratings = [] predictions = [] start_time = time.time() for batch in chunker(tests.keys(), batch_size): size = min(len(batch), batch_size) #profile_batch = [] #from_test = [] #for b in batch: # profile_batch.append(all_ratings[int(b)-1]) # users = [0 for x in range(1682)] # for u in tests[b]: # users[int(u[0])-1] = int(u[1]) # from_test.append(users) bin_profiles = {} masks = {} for userid in batch: user_profile = [0.] 
* len(all_movies) mask = [0] * (len(all_movies) * 5) for movie_id, rat in profiles[userid]: user_profile[all_movies.index(movie_id)] = rat for _i in range(5): mask[5 * all_movies.index(movie_id) + _i] = 1 example = expand(np.array([user_profile])).astype('float32') bin_profiles[userid] = example masks[userid] = mask positions = {profile_id: pos for pos, profile_id in enumerate(batch)} profile_batch = [bin_profiles[el] for el in batch] test_batch = np.array(profile_batch).reshape(size, len(all_movies) * 5) user_preds = revert_expected_value(predict(test_batch)) for profile_id in batch: test_movies = tests[profile_id] try: for movie, rating in test_movies: current_profile = user_preds[positions[profile_id]] predicted = current_profile[all_movies.index(movie)] rating = float(rating) ratings.append(rating) predictions.append(predicted) except Exception: pass end_time = time.time() test_time = end_time - start_time vabs = np.vectorize(abs) distances = np.array(ratings) - np.array(predictions) true_rat = np.array(ratings, dtype=np.uint8) pred_rat = np.array(predictions, dtype=np.uint8) #print true_rat < 3, true_rat prec_rec = precision_recall_fscore_support(true_rat < 3,pred_rat < 3, average='binary') print prec_rec mae = vabs(distances).mean() rmse = sqrt((distances ** 2).mean()) iteration_result = { 'iteration': j, 'k': k, 'momentum': momentum, 'mae': mae, 'rmse': rmse, 'lrate': current_l_w, 'train_time': train_time, 'test_time': test_time, 'prec_rec': prec_rec } config_result['results'].append(iteration_result) print(iteration_str.format(j, k, current_l_w, momentum, mae, rmse)) with open('experiments/{}_{}.json'.format(config_name, name), 'wt') as res_output: res_output.write(json.dumps(config_result, indent=4)) W,V,H = rbm.get_weights() print H
def run(name, dataset, config, all_users, all_movies, tests, initial_v, sep): config_name = config['name'] number_hidden = config['number_hidden'] epochs = config['epochs'] ks = config['ks'] momentums = config['momentums'] l_w = config['l_w'] l_v = config['l_v'] l_h = config['l_h'] lr_decay = config['lr_decay'][0] decay = config['decay'] batch_size = config['batch_size'] config_result = config.copy() config_result['results'] = [] vis = T.matrix() vmasks = T.matrix() rbm = CFRBM(len(all_movies) * 20, number_hidden) profiles = defaultdict(list) with open(dataset, 'rt') as data: for i, line in enumerate(data): uid, mid, rat = line.strip().split(sep) profiles[uid].append((mid, float(rat))) current_l_w = l_w[0] current_l_v = l_v[0] current_l_h = l_h[0] print("Users and ratings loaded") for j in range(epochs): print "epochs: ", j def get_index(col): if j / (epochs / len(col)) < len(col): return j / (epochs / len(col)) else: return -1 index = get_index(ks) mindex = get_index(momentums) #icurrent_l_w = get_index(l_w) #icurrent_l_v = get_index(l_v) #icurrent_l_h = get_index(l_h) k = ks[index] momentum = momentums[mindex] current_l_w *= lr_decay current_l_v *= lr_decay current_l_h *= lr_decay train = rbm.cdk_fun(vis, vmasks, k=k, w_lr=current_l_w, v_lr=current_l_v, h_lr=current_l_h, decay=decay, momentum=momentum) predict = rbm.predict(vis) n_batch = 0 users_ids = [] for batch in chunker(tests.keys(), batch_size): n_batch += 1 # print "&*&*" * 20 # print "START OF A BATCH" # print "batch: ", batch users_ids.extend(batch) size = min(len(batch), batch_size) # create needed binary vectors bin_profiles = {} masks = {} for userid in batch: user_profile = [0.] * len(all_movies) mask = [0] * (len(all_movies) * 20) for movie_id, rat in profiles[userid]: user_profile[all_movies.index(movie_id)] = rat for _i in range(20): mask[20 * all_movies.index(movie_id) + _i] = 1 example = expand(np.array([user_profile])).astype('float32') bin_profiles[userid] = example masks[userid] = mask #print np.sum(mask) positions = { profile_id: pos for pos, profile_id in enumerate(batch) } profile_batch = [bin_profiles[el] for el in batch] # print profile_batch[0] # print len(profile_batch[0]) test_batch = np.array(profile_batch).reshape( size, len(all_movies) * 20) # print batch # print "test batch :" # print test_batch # print test_batch.shape #print test_batch[:3,:3] batch_preds = predict(test_batch) user_preds = revert_expected_value(batch_preds, do_round=False) if n_batch == 1: print user_preds[:4, :5] train_batch_i = 0 for batch_i, batch in enumerate(chunker(profiles.keys(), batch_size)): size = min(len(batch), batch_size) train_batch_i += 1 # create needed binary vectors bin_profiles = {} masks = {} for userid in batch: user_profile = [0.] 
* len(all_movies) mask = [0] * (len(all_movies) * 20) for movie_id, rat in profiles[userid]: user_profile[all_movies.index(movie_id)] = rat for _i in range(20): mask[20 * all_movies.index(movie_id) + _i] = 1 example = expand(np.array([user_profile])).astype('float32') bin_profiles[userid] = example masks[userid] = mask # print example # print len(example[0]) profile_batch = [bin_profiles[id] for id in batch] # print profile_batch[0][0] # print len(profile_batch[0][0]) masks_batch = [masks[id] for id in batch] train_batch = np.array(profile_batch).reshape( size, len(all_movies) * 20) train_masks = np.array(masks_batch).reshape( size, len(all_movies) * 20) train_masks = train_masks.astype('float32') train(train_batch, train_masks) if (train_batch_i % 200 == 0): sys.stdout.write('.') sys.stdout.flush() # print "number of train batches: ", train_batch_i ratings = [] predictions = [] # pickle.dump(all_movies, open("item_ids.pickle", "wb")) # print "###############################################" # print "user ids" # print tests.keys()[1:100] # # print len(tests.keys) # # print type(tests.keys) # print "all users" # print all_users[1:100] # print len(all_users) # print type(all_users) # print "beer ids" # print all_movies[1:100] # print len(all_movies) # print type(all_movies) #reconstruct_mat = np.array([]).reshape(0, 1269) n_batch = 0 users_ids = [] for batch in chunker(tests.keys(), batch_size): n_batch += 1 # print "&*&*" * 20 # print "START OF A BATCH" # print "batch: ", batch users_ids.extend(batch) size = min(len(batch), batch_size) # create needed binary vectors bin_profiles = {} masks = {} for userid in batch: user_profile = [0.] * len(all_movies) mask = [0] * (len(all_movies) * 20) for movie_id, rat in profiles[userid]: user_profile[all_movies.index(movie_id)] = rat for _i in range(20): mask[20 * all_movies.index(movie_id) + _i] = 1 example = expand(np.array([user_profile])).astype('float32') bin_profiles[userid] = example masks[userid] = mask #print np.sum(mask) positions = { profile_id: pos for pos, profile_id in enumerate(batch) } profile_batch = [bin_profiles[el] for el in batch] # print profile_batch[0] # print len(profile_batch[0]) test_batch = np.array(profile_batch).reshape( size, len(all_movies) * 20) #print batch # print "test batch :" # print test_batch # print test_batch.shape batch_preds = predict(test_batch) user_preds = revert_expected_value(batch_preds, do_round=False) #if n_batch == 1: # print test_batch[:2,:] # reconstruct_mat = np.concatenate((reconstruct_mat, user_preds)) # print predict(test_batch) # print "user pred: ", user_preds # print user_preds.shape for profile_id in batch: test_movies = tests[profile_id] try: for movie, rating in test_movies: current_profile = user_preds[positions[profile_id]] predicted = current_profile[all_movies.index(movie)] rating = float(rating) ratings.append(rating) predictions.append(predicted) except Exception: pass #print (np.array(predictions))[0:10] # print "number of test batches: ", n_batch # print reconstruct_mat # pickle.dump(users_ids, open("users_ids.pickle", "wb")) # pickle.dump(reconstruct_mat, open("reconstruct_mat.pickle", "wb")) vabs = np.vectorize(abs) distances = np.array(ratings) - np.array(predictions) mae = vabs(distances).mean() rmse = sqrt((distances**2).mean()) iteration_result = { 'iteration': j, 'k': k, 'momentum': momentum, 'mae': mae, 'rmse': rmse, 'lrate': current_l_w } config_result['results'].append(iteration_result) print(iteration_str.format(j, k, current_l_w, momentum, mae, rmse)) with 
open('{}_{}.json'.format(config_name, name), 'wt') as res_output: res_output.write(json.dumps(config_result, indent=4)) w = rbm.weights.eval() np.save('weights', w)
def train(self):
    self.train_hist = {}
    self.train_hist['loc_loss'] = []
    self.train_hist['per_epoch_time'] = []
    self.train_hist['total_time'] = []

    self.loc.train()
    print('training start!!')
    start_time = time.time()
    for epoch in range(self.start_epoch_idx, self.epoch):
        epoch_start_time = time.time()
        data_gen = utils.chunker(self.pair_list, self.batch_size)
        if epoch == self.start_epoch_idx:
            start_iter_idx = self.start_iter_idx
        else:
            start_iter_idx = 0
        for iter in range(start_iter_idx, self.epoch_len):
            if iter == self.epoch_len // self.batch_size:
                break
            # read images
            chunk = data_gen.next()
            images1, images2, labels, gt1, gt2 = utils.get_data_from_chunk(
                self.data_path, chunk, self.input_scale)
            images1 = images1.cuda(self.gpu)
            images2 = images2.cuda(self.gpu)
            # gt masks variable
            gt1_ = torch.squeeze(gt1, dim=1).long()
            gt2_ = torch.squeeze(gt2, dim=1).long()
            gt1_ = gt1_.cuda(self.gpu)
            gt2_ = gt2_.cuda(self.gpu)
            # localization
            output1, output2 = self.loc(images1, images2)
            # localization update
            if (iter + 1) % self.loc_update_stride == 0:
                self.loc_optimizer.zero_grad()
                # localization net update
                log_o1 = self.logsoftmax(output1)
                log_o2 = self.logsoftmax(output2)
                loc_loss_1 = self.ce_criterion(log_o1, gt1_)
                loc_loss_2 = self.ce_criterion(log_o2, gt2_)
                loc_loss = loc_loss_1 + loc_loss_2
                self.train_hist['loc_loss'].append(loc_loss.data)
                loc_loss.backward()
                self.loc_optimizer.step()
            if (iter + 1) % 10 == 0:
                print '********************************************************************************'
                print 'iter = ', iter, ' epoch = ', epoch, 'completed, loc_loss = ', loc_loss.data.cpu().numpy()
                print 'iter = ', iter, ' epoch = ', epoch, 'completed, loc_loss_1 = ', loc_loss_1.data.cpu().numpy()
                print 'iter = ', iter, ' epoch = ', epoch, 'completed, loc_loss_2 = ', loc_loss_2.data.cpu().numpy()
            if (iter + 1) % self.snapshot_stride == 0:
                snapshot(self.loc, self.snapshot_prefix_loc, epoch, iter)
                trainhist_snapshot(self.train_hist['loc_loss'],
                                   self.snapshot_prefix_loc, epoch, iter)
                self.train_hist['loc_loss'] = []
        self.train_hist['per_epoch_time'].append(time.time() - epoch_start_time)
    self.train_hist['total_time'].append(time.time() - start_time)
    print("Avg one epoch time: %.2f, total %d epochs time: %.2f" %
          (np.mean(self.train_hist['per_epoch_time']), self.epoch,
           self.train_hist['total_time'][0]))
    print("Training finish!... save training results")
def run(name, dataset, config, all_users, all_movies, tests, initial_v, sep): config_name = config['name'] number_hidden = config['number_hidden'] epochs = config['epochs'] ks = config['ks'] momentums = config['momentums'] l_w = config['l_w'] l_v = config['l_v'] l_h = config['l_h'] decay = config['decay'] batch_size = config['batch_size'] config_result = config.copy() config_result['results'] = [] vis = T.matrix() vmasks = T.matrix() rbm = CFRBM(len(all_movies) * 5, number_hidden) profiles = defaultdict(list) with open(dataset, 'rt') as data: for i, line in enumerate(data): uid, mid, rat, timstamp = line.strip().split(sep) profiles[uid].append((mid, float(rat))) print("Users and ratings loaded") for j in range(epochs): def get_index(col): if j/(epochs/len(col)) < len(col): return j/(epochs/len(col)) else: return -1 index = get_index(ks) mindex = get_index(momentums) icurrent_l_w = get_index(l_w) icurrent_l_v = get_index(l_v) icurrent_l_h = get_index(l_h) k = ks[index] momentum = momentums[mindex] current_l_w = l_w[icurrent_l_w] current_l_v = l_v[icurrent_l_v] current_l_h = l_h[icurrent_l_h] train = rbm.cdk_fun(vis, vmasks, k=k, w_lr=current_l_w, v_lr=current_l_v, h_lr=current_l_h, decay=decay, momentum=momentum) predict = rbm.predict(vis) for batch_i, batch in enumerate(chunker(profiles.keys(), batch_size)): size = min(len(batch), batch_size) # create needed binary vectors bin_profiles = {} masks = {} for userid in batch: user_profile = [0.] * len(all_movies) mask = [0] * (len(all_movies) * 5) for movie_id, rat in profiles[userid]: user_profile[all_movies.index(movie_id)] = rat for _i in range(5): mask[5 * all_movies.index(movie_id) + _i] = 1 example = expand(np.array([user_profile])).astype('float32') bin_profiles[userid] = example masks[userid] = mask profile_batch = [bin_profiles[id] for id in batch] masks_batch = [masks[id] for id in batch] train_batch = np.array(profile_batch).reshape(size, len(all_movies) * 5) train_masks = np.array(masks_batch).reshape(size, len(all_movies) * 5) train_masks = train_masks.astype('float32') train(train_batch, train_masks) sys.stdout.write('.') sys.stdout.flush() ratings = [] predictions = [] for batch in chunker(tests.keys(), batch_size): size = min(len(batch), batch_size) # create needed binary vectors bin_profiles = {} masks = {} for userid in batch: user_profile = [0.] 
* len(all_movies) mask = [0] * (len(all_movies) * 5) for movie_id, rat in profiles[userid]: user_profile[all_movies.index(movie_id)] = rat for _i in range(5): mask[5 * all_movies.index(movie_id) + _i] = 1 example = expand(np.array([user_profile])).astype('float32') bin_profiles[userid] = example masks[userid] = mask positions = {profile_id: pos for pos, profile_id in enumerate(batch)} profile_batch = [bin_profiles[el] for el in batch] test_batch = np.array(profile_batch).reshape(size, len(all_movies) * 5) user_preds = revert_expected_value(predict(test_batch)) for profile_id in batch: test_movies = tests[profile_id] try: for movie, rating in test_movies: current_profile = user_preds[positions[profile_id]] predicted = current_profile[all_movies.index(movie)] rating = float(rating) ratings.append(rating) predictions.append(predicted) except Exception: pass vabs = np.vectorize(abs) distances = np.array(ratings) - np.array(predictions) mae = vabs(distances).mean() rmse = sqrt((distances ** 2).mean()) iteration_result = { 'iteration': j, 'k': k, 'momentum': momentum, 'mae': mae, 'rmse': rmse, 'lrate': current_l_w } config_result['results'].append(iteration_result) print(iteration_str.format(j, k, current_l_w, momentum, mae, rmse)) with open('experiments/{}_{}.json'.format(config_name, name), 'wt') as res_output: res_output.write(json.dumps(config_result, indent=4)) W,V,H = rbm.get_weights() print H
def run(name, dataset, user_info, config, all_users, all_movies, all_occupations, all_sex, all_ages, tests, initial_v, sep): config_name = config['name'] number_hidden = config['number_hidden'] epochs = config['epochs'] ks = config['ks'] momentums = config['momentums'] l_w = config['l_w'] l_v = config['l_v'] l_h = config['l_h'] decay = config['decay'] batch_size = config['batch_size'] config_result = config.copy() config_result['results'] = [] vis_x = T.matrix() vis_o = T.matrix() vis_s = T.matrix() vis_a = T.matrix() vmasks_x = T.matrix() vmasks_o = T.matrix() vmasks_s = T.matrix() vmasks_a = T.matrix() rbm = CFRBM(len(all_movies) * 5, len(all_occupations), 1, len(all_ages), number_hidden) profiles = defaultdict(list) with open(dataset, 'rt') as data: for i, line in enumerate(data): uid, mid, rat, timstamp = line.strip().split(sep) profiles[uid].append((mid, float(rat))) print("Users and ratings loaded") user_occ = defaultdict(list) user_sex = defaultdict(list) user_age = defaultdict(list) r = csv.reader(open(user_info, 'rb'), delimiter='|') for row in r: user_age[row[0]] = [int(x) for x in row[1:7]] user_sex[row[0]] = [int(row[7])] user_occ[row[0]] = [int(x) for x in row[8:]] print("User info loaded") for j in range(epochs): def get_index(col): if j/(epochs/len(col)) < len(col): return j/(epochs/len(col)) else: return -1 index = get_index(ks) mindex = get_index(momentums) icurrent_l_w = get_index(l_w) icurrent_l_v = get_index(l_v) icurrent_l_h = get_index(l_h) k = ks[index] momentum = momentums[mindex] current_l_w = l_w[icurrent_l_w] current_l_v = l_v[icurrent_l_v] current_l_h = l_h[icurrent_l_h] train = rbm.cdk_fun(vis_x, vis_o, vis_s, vis_a, vmasks_x, vmasks_o, vmasks_s, vmasks_a, k=k, w_lr=current_l_w, v_lr=current_l_v, h_lr=current_l_h, decay=decay, momentum=momentum) predict = rbm.predict(vis_x, vis_o, vis_s, vis_a) start_time = time.time() for batch_i, batch in enumerate(chunker(profiles.keys(), batch_size)): size = min(len(batch), batch_size) # create needed binary vectors bin_profiles = {} occ_profiles = {} sex_profiles = {} age_profiles = {} masks_x = {} masks_o = {} masks_s = {} masks_a = {} for userid in batch: user_profile = [0.] * len(all_movies) occ_profile = [0.] * len(all_occupations) sex_profile = [0.] * 1 age_profile = [0.] 
* len(all_ages) mask_x = [0] * (len(all_movies) * 5) mask_o = [1] * (len(all_occupations)) mask_s = [1] * (1) mask_a = [1] * (len(all_ages)) for movie_id, rat in profiles[userid]: user_profile[all_movies.index(movie_id)] = rat for _i in range(5): mask_x[5 * all_movies.index(movie_id) + _i] = 1 mask_o = [1] * len(all_occupations) mask_s = [1] * 1 mask_a = [1] * len(all_ages) example_x = expand(np.array([user_profile])).astype('float32') example_o = expand(np.array([occ_profile]), k=1).astype('float32') example_s = expand(np.array([sex_profile]), k=1).astype('float32') example_a = expand(np.array([age_profile]), k=1).astype('float32') bin_profiles[userid] = example_x occ_profiles[userid] = example_o sex_profiles[userid] = example_s age_profiles[userid] = example_a masks_x[userid] = mask_x masks_o[userid] = mask_o masks_s[userid] = mask_s masks_a[userid] = mask_a profile_batch = [bin_profiles[id] for id in batch] occ_batch = [occ_profiles[id] for id in batch] sex_batch = [sex_profiles[id] for id in batch] age_batch = [age_profiles[id] for id in batch] masks_x_batch = [masks_x[id] for id in batch] masks_o_batch = [masks_o[id] for id in batch] masks_s_batch = [masks_s[id] for id in batch] masks_a_batch = [masks_a[id] for id in batch] train_batch_x = np.array(profile_batch).reshape(size, len(all_movies) * 5) train_batch_o = np.array(occ_batch).reshape(size, len(all_occupations)) train_batch_s = np.array(sex_batch).reshape(size, 1) train_batch_a = np.array(age_batch).reshape(size, len(all_ages)) train_masks_x = np.array(masks_x_batch).reshape(size, len(all_movies) * 5) train_masks_o = np.array(masks_o_batch).reshape(size, len(all_occupations)) train_masks_s = np.array(masks_s_batch).reshape(size, 1) train_masks_a = np.array(masks_a_batch).reshape(size, len(all_ages)) train_masks_x = train_masks_x.astype('float32') train_masks_o = train_masks_o.astype('float32') train_masks_s = train_masks_s.astype('float32') train_masks_a = train_masks_a.astype('float32') train(train_batch_x, train_batch_o, train_batch_s, train_batch_a, train_masks_x, train_masks_o, train_masks_s, train_masks_a) sys.stdout.write('.') sys.stdout.flush() end_time = time.time() train_time = end_time - start_time ratings = [] predictions = [] start_time = time.time() for batch in chunker(tests.keys(), batch_size): size = min(len(batch), batch_size) # create needed binary vectors bin_profiles = {} occ_profiles = {} sex_profiles = {} age_profiles = {} masks_x = {} masks_o = {} masks_s = {} masks_a = {} for userid in batch: user_profile = [0.] * len(all_movies) occ_profile = [0.] * len(all_occupations) sex_profile = [0.] * 1 age_profile = [0.] 
* len(all_ages) mask_x = [0] * (len(all_movies) * 5) mask_o = [1] * (len(all_occupations)) mask_s = [1] * (1) mask_a = [1] * (len(all_ages)) for movie_id, rat in profiles[userid]: user_profile[all_movies.index(movie_id)] = rat for _i in range(5): mask_x[5 * all_movies.index(movie_id) + _i] = 1 mask_o = [1] * len(all_occupations) mask_s = [1] * 1 mask_a = [1] * len(all_ages) example_x = expand(np.array([user_profile])).astype('float32') example_o = expand(np.array([occ_profile]), k=1).astype('float32') example_s = expand(np.array([sex_profile]), k=1).astype('float32') example_a = expand(np.array([age_profile]), k=1).astype('float32') bin_profiles[userid] = example_x occ_profiles[userid] = example_o sex_profiles[userid] = example_s age_profiles[userid] = example_a masks_x[userid] = mask_x masks_o[userid] = mask_o masks_s[userid] = mask_s masks_a[userid] = mask_a positions = {profile_id: pos for pos, profile_id in enumerate(batch)} profile_batch = [bin_profiles[el] for el in batch] occ_batch = [occ_profiles[el] for el in batch] sex_batch = [sex_profiles[el] for el in batch] age_batch = [age_profiles[el] for el in batch] test_batch_x = np.array(profile_batch).reshape(size, len(all_movies) * 5) test_batch_o = np.array(occ_batch).reshape(size, len(all_occupations)) test_batch_s = np.array(sex_batch).reshape(size, 1) test_batch_a = np.array(age_batch).reshape(size, len(all_ages)) user_preds = revert_expected_value(predict(test_batch_x, test_batch_o, test_batch_s, test_batch_a)) for profile_id in batch: test_movies = tests[profile_id] try: for movie, rating in test_movies: current_profile = user_preds[positions[profile_id]] predicted = current_profile[all_movies.index(movie)] rating = float(rating) ratings.append(rating) predictions.append(predicted) except Exception: pass end_time = time.time() test_time = end_time - start_time true_rat = np.array(ratings, dtype=np.uint8) pred_rat = np.array(predictions, dtype=np.uint8) #print true_rat < 3, true_rat prec_rec = precision_recall_fscore_support(true_rat < 3,pred_rat < 3, average='binary') print prec_rec vabs = np.vectorize(abs) distances = np.array(ratings) - np.array(predictions) mae = vabs(distances).mean() rmse = sqrt((distances ** 2).mean()) iteration_result = { 'iteration': j, 'k': k, 'momentum': momentum, 'mae': mae, 'rmse': rmse, 'lrate': current_l_w, 'train_time': train_time, 'test_time': test_time, 'prec_rec': prec_rec } config_result['results'].append(iteration_result) print(iteration_str.format(j, k, current_l_w, momentum, mae, rmse)) with open('experiments/{}_{}.json'.format(config_name, name), 'wt') as res_output: res_output.write(json.dumps(config_result, indent=4))
callbacks_list = [checkpoint, redonplat]  # early

model.fit_generator(gen(train_dict, aug=False), validation_data=gen(val_dict),
                    epochs=40, verbose=2, steps_per_epoch=1000,
                    validation_steps=300, callbacks=callbacks_list)
model.load_weights(file_path)

for record in tqdm(test_dict):
    all_rows = test_dict[record]['x']
    record_y_gt = []
    record_y_pred = []
    for batch_hyp in chunker(range(all_rows.shape[0])):
        X = all_rows[min(batch_hyp):max(batch_hyp) + 1, ...]
        Y = test_dict[record]['y'][min(batch_hyp):max(batch_hyp) + 1]
        X = np.expand_dims(X, 0)
        X = rescale_array(X)
        Y_pred = model.predict(X)
        Y_pred = Y_pred.argmax(axis=-1).ravel().tolist()
        gt += Y.ravel().tolist()
        preds += Y_pred
        record_y_gt += Y.ravel().tolist()
def validate(self, output_dir, act_st, test_st, test_pr, test_model, fcst_pr, pr_st, chunk_sz, cpu): """Validate forecast model and write result by batch Parameters ---------- output_dir : str output directory act_st : datetime actual start date test_st : datetime test start date test_pr : int number of rolling period to test (months) test_model : list list of model to test fcst_pr : int number of periods to forecast for each rolling pr_st : int starting period for each forecast (default 0/1) chunk_sz : int number of item to validate for each chunk cpu : int number of running processors """ # make output directory output_dir = "{}validate_{}/".format( output_dir, datetime.datetime.now(timezone(self.tz)).strftime("%Y%m%d-%H%M%S")) self.output_dir = output_dir self.fp.mkdir(output_dir) self.lg.logtxt("create output directory: {}".format(output_dir)) self.fp.writecsv(self.df, "{}input_actual.csv".format(output_dir)) # write external features if self.ext is not None: self.fp.writecsv(self.ext, "{}input_external.csv".format(output_dir)) self.fp.writecsv(self.ext_lag, "{}input_externallag.csv".format(output_dir)) self.lg.logtxt( "write input file: {}input_actual.csv | {}input_external.csv | {}input_externallag.csv" .format(output_dir, output_dir, output_dir)) else: self.lg.logtxt( "write input file: {}input_actual.csv".format(output_dir)) # set parameter items = self.df['id'].unique() n_chunk = len([x for x in chunker(items, chunk_sz)]) test_date = [ x.to_pydatetime() + datetime.timedelta(days=+test_st.day - 1) for x in pd.date_range(start=test_st, periods=test_pr, freq='MS') ] self.lg.logtxt( "total items: {} | chunk size: {} | total chunk: {}".format( len(items), chunk_sz, n_chunk)) # loop by chunk cpu_count = 1 if cpu <= 1 else multiprocessing.cpu_count( ) if cpu >= multiprocessing.cpu_count() else cpu self.lg.logtxt("run at {} processor(s)".format(cpu_count)) for i, c in enumerate(chunker(items, chunk_sz), 1): df_fcst = pd.DataFrame() if cpu_count == 1: for r in [ self.validate_byitem(x, act_st, test_date, test_model, fcst_pr, pr_st, i) for x in c ]: df_fcst = df_fcst.append(r, ignore_index=True) else: pool = multiprocessing.Pool(processes=cpu_count) for r in pool.starmap( self.validate_byitem, [[x, act_st, test_date, test_model, fcst_pr, pr_st, i] for x in c]): df_fcst = df_fcst.append(r, ignore_index=True) pool.close() pool.join() # write csv file output_path = "{}output_validate_{:04d}-{:04d}.csv".format( output_dir, i, n_chunk) self.fp.writecsv(df_fcst, output_path) self.lg.logtxt("write output file ({}/{}): {}".format( i, n_chunk, output_path)) self.lg.logtxt("[END VALIDATION]") self.lg.writelog("{}logfile.log".format(output_dir))