def __getitem__(self, index, mus_win=512, mus_hop=256, eeg_win=32, eeg_hop=2):
    """Generates one sample of data"""
    # Select sample
    index_rand = round(np.random.uniform(0, len(self.eeg) - 1))
    X = chunker(self.eeg[index_rand], self.music[index_rand],
                self.sample_len, self.eeg_sr, self.sr)
    X_m = stft(X[1], self.sr, mus_win, mus_hop)
    X_e = stft_eeg(X[0], self.sr, eeg_win, eeg_hop)
    if self.use_noise:
        X_m = add_rand_noise(abs(X_m))
        X_e = add_rand_noise(abs(X_e))
    X_m = to_log(abs(X_m) + 1e-6)
    X_e = z_norm(X_e)
    X_e = to_log(abs(X_e) + 1e-6)
    X_e = z_norm(X_e)
    X_e = torch.tensor(X_e).float()
    X_m = torch.tensor(X_m).float()
    X_m = (X_m - X_m.mean(dim=0, keepdim=True)) / (
        X_m.std(dim=0, keepdim=True) + 1e-6)
    for i in np.arange(X_e.size(0)):
        X_e[i] = (X_e[i] - X_e[i].mean(dim=0, keepdim=True)) / (
            X_e[i].std(dim=0, keepdim=True) + 1e-6)
    return X_e, X_m
def update_TLEs():
    global TLEs
    url = TLE_SETTINGS['url']
    req = urllib.request.Request(url, method='GET')
    retrieved_lines = []
    with urllib.request.urlopen(req) as f:
        if f.status == 200:
            retrieved_lines = [line.decode().replace('\r\n', '').strip()
                               for line in f.readlines()]
        else:
            raise Exception("Error downloading TLE file")
    new_TLEs = []
    for group in chunker(retrieved_lines, 3):
        sat = {
            'name': group[0],
            'id': int(group[2].split()[1]),
            'line1': group[1],
            'line2': group[2]
        }
        new_TLEs.append(sat)
    TLEs = new_TLEs
    prep_data()
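# Most of the snippets collected here call a `chunker` helper whose definition
# is not shown. The usual pattern is a generator that slices a sequence into
# fixed-size pieces; a minimal sketch under that assumption (not the exact
# helper used by each project -- e.g. the __getitem__ above passes a chunker
# with a different, project-specific signature):
def chunker(seq, size):
    """Yield successive `size`-sized slices of `seq`; the last one may be shorter."""
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

# Example: list(chunker([1, 2, 3, 4, 5, 6, 7], 3)) -> [[1, 2, 3], [4, 5, 6], [7]]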
def update_TLEs(localOnly=False):
    global TLEs
    # print('updating TLEs')
    if localOnly and Path(TLE_SETTINGS['localFile']).is_file():
        with open(TLE_SETTINGS['localFile']) as local:
            TLEs = json.load(local)
        prep_data()
        return
    url = TLE_SETTINGS['url']
    logging.info(f"Retrieving TLE information from {url}")
    # print('downloading TLEs')
    req = urllib.request.Request(url, method='GET')
    retrieved_lines = []
    with urllib.request.urlopen(req) as f:
        if f.status == 200:
            retrieved_lines = [
                line.decode().replace('\r\n', '').strip()
                for line in f.readlines()
            ]
        else:
            logging.error(f'Cannot retrieve TLE file: {f.status}: {f.reason}')
    # if you get an error page from ISP with a stupid 200 code
    if len(retrieved_lines) < 3 or retrieved_lines[0].startswith('<!'):
        logging.error(f'Cannot retrieve TLE file: {retrieved_lines[0]}')
        # if no TLEs in memory fall back to last local temp file
        if len(TLEs) < 3:
            with open(TLE_SETTINGS['localFile']) as local:
                TLEs = json.load(local)
            prep_data()
        return
    # filter retrieved lines to TLEs array and write to local temp file
    pattern = re.compile(TLE_SETTINGS['filter'])
    new_TLEs = []
    for group in chunker(retrieved_lines, 3):
        if len(group) == 3 and pattern.match(group[0]):
            sat = {
                'name': group[0],
                'id': int(group[2].split()[1]),
                'line1': group[1],
                'line2': group[2]
            }
            new_TLEs.append(sat)
    TLEs = new_TLEs
    prep_data()
    # write to local file
    with open(TLE_SETTINGS['localFile'], 'w') as local:
        json.dump(TLEs, local, ensure_ascii=False, indent=4)
    logging.info(
        f'Retrieved {len(retrieved_lines)//3} TLEs from {url}, filtered {len(TLEs)} TLEs'
    )
def build_vocab(lines, result_queue):
    """Build a word count dictionary given an iterable of raw lines."""
    vocab = collections.Counter()
    chunks = utils.chunker(lines, 10000, '')
    for chunk in chunks:
        tokenizer = Tokenizer(chunk)
        vocab.update(tok for tok in tokenizer)
    result_queue.put(vocab)
    return vocab
def clean_corpus(corpus):
    with open(corpus, 'r', encoding='iso-8859-1') as in_file:
        fname, _, ext = corpus.rpartition('.')
        with open(f'{fname}_clean.{ext}', 'w', encoding='iso-8859-1') as out_file:
            chunks = utils.chunker(in_file, 10000, '')
            for chunk in chunks:
                tokenizer = Tokenizer(chunk)
                for sent in tokenizer.yield_sentences():
                    out_file.write(sent + '\n')
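# build_vocab and clean_corpus above pass an open file plus a pad value ('')
# to utils.chunker, which suggests an itertools-style grouper over an arbitrary
# iterable rather than the slicing version sketched earlier. A possible sketch
# under that assumption (the name and exact padding behaviour are guesses):
import itertools

def chunker_iterable(iterable, size, fillvalue=None):
    """Group any iterable into fixed-size tuples, padding the last group."""
    args = [iter(iterable)] * size
    return itertools.zip_longest(*args, fillvalue=fillvalue)

# Example: list(chunker_iterable('abcde', 2, fillvalue=''))
#          -> [('a', 'b'), ('c', 'd'), ('e', '')]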
def predict(self, x):
    if self.standardize:
        x = self.scaler.transform(x)
    # Predict in chunks to save memory
    preds = np.array([])
    for x_ch in utils.chunker(x, 2**19):
        preds = np.append(preds, self.model.predict(x_ch))
    return pd.Series(preds, index=x.index).clip(0., 20.)
def main():
    sentence_batches = utils.chunker(CSV_DATA['RedditNews.csv'].News, CHUNK_SIZE)
    character_ids = process_sent_batches(sentence_batches)
    # pair the options path with OPTIONS_FILE and the weights path with MODEL_FILE
    options = config.MODELS + config.OPTIONS_FILE
    weights = config.MODELS + config.MODEL_FILE
    model = load_elmo(options, weights)
    embeddings = get_sent_embeddings(character_ids, model)
def encode():
    """Encode all of the sentences to vector form"""
    train, dev, test = loader.getData()
    sentences = []
    tokens = []
    # Load the vocab
    en_vocab = get_english_vocab(DATA_DIR, VOCAB_SIZE)
    # Collect all the training sentences
    for i, row in pd.concat((train, test)).iterrows():
        if isinstance(row["sentence1"], basestring) and isinstance(row["sentence2"], basestring):
            sentences.append(row["sentence1"])
            sentences.append(row["sentence2"])
    # Allocate the sentences to buckets
    bucketed = {}
    for sentence in sentences:
        bucket_id = get_bucket(en_vocab, sentence)
        bucketed.setdefault(bucket_id, [])
        bucketed[bucket_id].append(sentence)
    mapped = {}
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True, train_dir=TRAIN_DIR)
        model.batch_size = BATCH_SIZE  # We decode 64 sentence at a time.
        # Iterate over each bucket
        for bucket_id, sentences in bucketed.iteritems():
            for batch in chunker(sentences, BATCH_SIZE):
                data = []
                for sentence in batch:
                    token_ids = data_utils.sentence_to_token_ids(
                        tf.compat.as_bytes(sentence), en_vocab)
                    expected_output = []
                    data.append((token_ids, expected_output))
                encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                    {bucket_id: data}, bucket_id)
                contexts = model.step_context(sess, encoder_inputs, decoder_inputs,
                                              target_weights, bucket_id)
                features = np.hstack(contexts)
                print 'Extracted another set of features with shape:', features.shape
                # Now we align sentences with their contexts
                for i, sentence in enumerate(batch):
                    mapped[sentence] = features[i, :].tolist()
                    print sentence
                    print mapped[sentence]
    print "Saving sentences to %s" % JSON_NAME
    with open(JSON_NAME, 'w') as file:
        json.dump(mapped, file)
def resample_df_mean(df, f_ratio_of_output_cone=3.65):
    eelabel = "EE_in_" + str(f_ratio_of_output_cone) + "_cone"
    chunks = utils.chunker(df, 4)
    dff = pd.DataFrame()
    for chunk in chunks:
        dff = pd.concat([
            dff,
            pd.DataFrame(chunk[[
                "f_ratio_in", "f_ratio_out", "EE_in_input_cone", eelabel
            ]].median(skipna=True)).T
        ], ignore_index=True)
    return dff
def get_follow_ids(twitter_names):
    ids = []
    if path.exists("twitter_ids.txt"):
        with open("twitter_ids.txt", "r+") as f:
            for line in f:
                c = line[:-1]
                ids.append(c)
        return ids
    else:
        for t in chunker(twitter_names, 100):
            follow_ids = []
            for user in api.UsersLookup(screen_name=t):
                id_str = json.loads(user.__str__())['id_str']
                follow_ids.append(id_str)
            write_to_file(follow_ids)
            ids.extend(follow_ids)
        return ids
def _parse_entry_table(self) -> (List[Firmware], List[Directory]):
    entries = chunker(self.firmware_entry_table[4:], 4)
    for index, entry in enumerate(entries):
        firmware_type = self._FIRMWARE_ENTRY_TYPES[index] if index < len(
            self._FIRMWARE_ENTRY_TYPES) else 'unknown'
        address = struct.unpack('<I', entry)[0] & 0x00FFFFFF
        # assumption: offset == 0 is an invalid entry
        if address not in [0x0, 0xfffffe]:
            directory = self[address:address + 16 * 8]
            magic = directory[:4]
            # either this entry points to a PSP directory directly
            if magic in [b'$PSP', b'$BHD']:
                directory = Directory(self, address, firmware_type)
                self.directories.append(directory)
                # if this Directory points to a secondary directory: add it, too
                if directory.secondary_directory_address is not None:
                    secondary_directory = Directory(
                        self, directory.secondary_directory_address, 'secondary')
                    self.directories.append(secondary_directory)
            # or this entry points to a combo-directory (i.e. two directories)
            elif magic == b'2PSP':
                psp_dir_one_addr = struct.unpack(
                    '<I', directory[10 * 4:10 * 4 + 4])[0] & 0x00FFFFFF
                psp_dir_two_addr = struct.unpack(
                    '<I', directory[14 * 4:14 * 4 + 4])[0] & 0x00FFFFFF
                for address in [psp_dir_one_addr, psp_dir_two_addr]:
                    directory = Directory(self, address, firmware_type)
                    self.directories.append(directory)
                    # if this Directory points to a secondary directory: add it, too
                    if directory.secondary_directory_address is not None:
                        secondary_directory = Directory(
                            self, directory.secondary_directory_address, 'secondary')
                        self.directories.append(secondary_directory)
            # or this entry is unparsable and thus a firmware
            else:
                firmware = Firmware(self, address, firmware_type, magic)
                self.firmwares.append(firmware)
def ascii_discrepancies(data, window, local_diff, feature_names=[]):
    vectors = []
    data_length = len(data)
    isascii = lambda s: len(s) == len(s.encode())
    for i, entry in enumerate(data):
        entry = entry.lower()
        window_chars = round(len(entry) * window)
        local = []
        for chunk in chunker(entry, window_chars):
            non_ascii_count = 0
            for char in chunk:
                if not char:
                    continue
                if not isascii(char):
                    non_ascii_count += 1
            local.append([non_ascii_count / len(chunk)])
        if local_diff:
            local_len = len(local)
            for local_index in range(local_len):
                if local_index == local_len - 1:
                    break
                local[local_index] = [abs(a - b) for a, b in
                                      zip(local[local_index], local[local_index + 1])]
        min_v = np.amin(local, axis=0).tolist()
        max_v = np.amax(local, axis=0).tolist()
        diff = np.subtract(max_v, min_v).tolist()
        vectors.append(diff)
        print_progress_bar(i + 1, data_length, description='ascii_chars_discrepancies')
    feature_names.extend(['ascii_chars_discrepancies'])
    return vectors
def get_twitter_name_from_ids(cmc_ids):
    info_url = 'https://pro-api.coinmarketcap.com/v1/cryptocurrency/info'
    tw_ids = []
    for id_chunk in chunker(cmc_ids, 500):
        try:
            info_parameters = {'id': ','.join(map(str, id_chunk))}
            response = session.get(info_url, params=info_parameters)
            data = json.loads(response.text)
            for i in data['data'].keys():
                print(data['data'][i])
                twitter = data['data'][i]['urls']['twitter']
                if twitter:
                    d = twitter[0].split('/')
                    tw_id = d[len(d) - 1]
                    tw_ids.append(str(tw_id))
        except (ConnectionError, Timeout, TooManyRedirects, Exception) as e:
            print(e)
    return tw_ids
def get_sentence_to_context_map(sentences):
    """
    Process all of the sentences with the model
    Return a map between sentence text and the context vectors
    The order of the map is undefined due to the bucketing process
    """
    # Load the vocab
    en_vocab = get_english_vocab(DATA_DIR, VOCAB_SIZE)
    # Allocate the sentences to buckets
    bucketed = {}
    for sentence in sentences:
        bucket_id = get_bucket(en_vocab, sentence)
        bucketed.setdefault(bucket_id, [])
        bucketed[bucket_id].append(sentence)
    mapped = {}
    with tf.Session() as sess:
        # Create model and load parameters.
        model = create_model(sess, True, train_dir=TRAIN_DIR)
        model.batch_size = BATCH_SIZE  # We decode 64 sentence at a time.
        # Iterate over each bucket
        for bucket_id, sentences in bucketed.iteritems():
            for batch in chunker(sentences, BATCH_SIZE):
                data = []
                # Tokenize each sentence
                for sentence in batch:
                    token_ids = data_utils.sentence_to_token_ids(
                        tf.compat.as_bytes(sentence), en_vocab)
                    expected_output = []
                    data.append((token_ids, expected_output))
                # Use the model to obtain contexts for each sentence in the batch
                encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                    {bucket_id: data}, bucket_id)
                contexts = model.step_context(sess, encoder_inputs, decoder_inputs,
                                              target_weights, bucket_id)
                features = np.hstack(contexts)
                print 'Encoded {0} sentences into {1} dimensional vectors'.format(*features.shape)
                # Now we align sentences with their contexts
                for i, sentence in enumerate(batch):
                    mapped[sentence] = features[i, :].tolist()
    return mapped
def pred_alt(X):
    preds = []
    gt = []
    for record in test_dict:
        all_rows = test_dict[record]['x']
        record_y_gt = []
        record_y_pred = []
        for batch_hyp in chunker(range(all_rows.shape[0])):
            X = all_rows[min(batch_hyp):max(batch_hyp) + 1, ...]
            Y = test_dict[record]['y'][min(batch_hyp):max(batch_hyp) + 1]
            X = np.expand_dims(X, 0)
            X = rescale_array(X)
            Y_pred = model.predict(X)
            Y_pred = Y_pred.argmax(axis=-1).ravel().tolist()
            gt += Y.ravel().tolist()
            preds += Y_pred
            record_y_gt += Y.ravel().tolist()
            record_y_pred += Y_pred
        # fig_1 = plt.figure(figsize=(12, 6))
        # plt.plot(record_y_gt)
        # plt.title("Sleep Stages")
        # plt.ylabel("Classes")
        # plt.xlabel("Time")
        # plt.show()
        #
        # fig_2 = plt.figure(figsize=(12, 6))
        # plt.plot(record_y_pred)
        # plt.title("Predicted Sleep Stages")
        # plt.ylabel("Classes")
        # plt.xlabel("Time")
        # plt.show()
    return preds
def _parse_entries(self):
    for entry_bytes in self.body.get_chunks(self._entry_size):
        entry_fields = {}
        for key, word in zip(self.ENTRY_FIELDS, chunker(entry_bytes, 4)):
            entry_fields[key] = struct.unpack('<I', word)[0]
        # addresses are all starting at 0xff000000, but we just want everything from there
        entry_fields['offset'] &= 0x00FFFFFF
        entry = Entry.from_fields(self, self.parent_buffer, entry_fields['type'],
                                  entry_fields['size'], entry_fields['offset'])
        for existing_entry in self.blob.unique_entries:
            if entry == existing_entry:
                existing_entry.references.append(self)
        if isinstance(entry, PubkeyEntry):
            self.blob.pubkeys[entry.key_id] = entry
        self.blob.unique_entries.add(entry)
        self.entries.append(entry)
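# The firmware-parsing code above (_parse_entry_table and _parse_entries) uses
# chunker to split raw bytes into 4-byte words before struct.unpack. A small,
# self-contained illustration of that pattern with made-up input bytes:
import struct

def _words(buf, n=4):
    # same idea as chunker(buf, 4): fixed-size byte slices
    return (buf[i:i + n] for i in range(0, len(buf), n))

raw = bytes.fromhex('01000000' '02000000' 'ff000000')
values = [struct.unpack('<I', w)[0] for w in _words(raw)]
# values == [1, 2, 255]  (little-endian 32-bit words)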
def apply_license(self, path, verbose=False):
    """
    Applies a given license (class instance) to a given path.

    lic License : License class instance, should have been initialized
                  with one of your choice (apache, bsd, ...)
    path String : File or dir the License should apply to.
    """
    files_in_path = self._get_path_elements(path)
    for chunk in utils.chunker(files_in_path, self.PATH_CHUNKS_SIZE):
        paths = [x[1] for x in chunk]  # Extract file paths
        self.buffer_file_descriptors(paths, mode='r+')
        for elem in chunk:
            # apply license using file extension
            file_license = self.license.get_license_as(elem[0])
            # retrieve the buffered file descriptor from path
            self.write_header_to_file(self.fd_buffer[elem[1]], file_license)
            if verbose:
                print "Stamping: %s" % elem[1]
        self._clear_fd_buffers()
    return
def make_mask(array, labels, rule):
    '''
    Generates an np.array mask corresponding to the given rule.
    Supported rules: 'close', 'far_lb1', 'far_lb2', 'v_far'
    Rules represent the distance between the labels - lb1 & lb2
    '''
    indices = []
    stack = [0]
    last = array[0]
    for i, item in enumerate(array[1:], 1):
        if item == last:
            stack.append(i)
        else:
            if stack:
                indices.append(stack[0])   # min index
                indices.append(stack[-1])  # max or repeat index
                stack = []
            stack.append(i)
            last = item
    if stack:
        indices.append(stack[0])
        indices.append(stack[-1])
    # eliminate first pair if not the starting label
    if labels[0] != array[0]:
        indices = indices[2:]
    # eliminate the last pair to comply with lb1, lb2 pairs
    if len(indices) / 2 % 2:
        indices = indices[:-2]
    result = []
    for ind in chunker(indices, 4):
        result.extend(dist_ind(ind, rule))
    mask = np.zeros(len(array), dtype=int)
    mask[result] = 1
    return mask
for c in tqdm(cubes):
    flat_cubes, rewards = get_all_possible_actions_cube_small(c)
    cube_next_reward.append(rewards)
    flat_next_states.extend(flat_cubes)
    cube_flat.append(flatten_1d_b(c))

for _ in range(20):
    cube_target_value = []
    cube_target_policy = []

    next_state_value, _ = model.predict(np.array(flat_next_states), batch_size=1024)
    next_state_value = next_state_value.ravel().tolist()
    next_state_value = list(chunker(next_state_value, size=len(action_map_small)))

    for c, rewards, values in tqdm(zip(cubes, cube_next_reward, next_state_value)):
        r_plus_v = 0.4 * np.array(rewards) + np.array(values)
        target_v = np.max(r_plus_v)
        target_p = np.argmax(r_plus_v)
        cube_target_value.append(target_v)
        cube_target_policy.append(target_p)

    cube_target_value = (cube_target_value - np.mean(cube_target_value)) / (
        np.std(cube_target_value) + 0.01)

    print(cube_target_policy[-30:])
    print(cube_target_value[-30:])
def run(name, dataset, config, all_users, all_movies, tests, initial_v, sep): config_name = config["name"] number_hidden = config["number_hidden"] epochs = config["epochs"] ks = config["ks"] momentums = config["momentums"] l_w = config["l_w"] l_v = config["l_v"] l_h = config["l_h"] decay = config["decay"] config_result = config.copy() config_result["results"] = [] vis = T.matrix() vmasks = T.matrix() rbm = CFRBM(len(all_users) * 5, number_hidden) profiles = defaultdict(list) with open(dataset, "rt") as data: for i, line in enumerate(data): uid, mid, rat, timstamp = line.strip().split(sep) profiles[mid].append((uid, float(rat))) print("Users and ratings loaded") for j in range(epochs): def get_index(col): if j / (epochs / len(col)) < len(col): return j / (epochs / len(col)) else: return -1 index = get_index(ks) mindex = get_index(momentums) icurrent_l_w = get_index(l_w) icurrent_l_v = get_index(l_v) icurrent_l_h = get_index(l_h) k = ks[index] momentum = momentums[mindex] current_l_w = l_w[icurrent_l_w] current_l_v = l_v[icurrent_l_v] current_l_h = l_h[icurrent_l_h] train = rbm.cdk_fun( vis, vmasks, k=k, w_lr=current_l_w, v_lr=current_l_v, h_lr=current_l_h, decay=decay, momentum=momentum ) predict = rbm.predict(vis) batch_size = 10 for batch_i, batch in enumerate(utils.chunker(profiles.keys(), batch_size)): size = min(len(batch), batch_size) # create needed binary vectors bin_profiles = {} masks = {} for movieid in batch: movie_profile = [0.0] * len(all_users) mask = [0] * (len(all_users) * 5) for user_id, rat in profiles[movieid]: movie_profile[all_users.index(user_id)] = rat for _i in range(5): mask[5 * all_users.index(user_id) + _i] = 1 example = expand(np.array([movie_profile])).astype("float32") bin_profiles[movieid] = example masks[movieid] = mask movies_batch = [bin_profiles[id] for id in batch] masks_batch = [masks[id] for id in batch] train_batch = np.array(movies_batch).reshape(size, len(all_users) * 5) train_masks = np.array(masks_batch).reshape(size, len(all_users) * 5) train_masks = train_masks.astype("float32") train(train_batch, train_masks) sys.stdout.write(".") sys.stdout.flush() batch_size = 10 ratings = [] predictions = [] for batch in utils.chunker(tests.keys(), batch_size): size = min(len(batch), batch_size) # create needed binary vectors bin_profiles = {} masks = {} for movieid in batch: movie_profile = [0.0] * len(all_users) mask = [0] * (len(all_users) * 5) for userid, rat in profiles[movieid]: movie_profile[all_users.index(userid)] = rat for _i in range(5): mask[5 * all_users.index(userid) + _i] = 1 example = expand(np.array([movie_profile])).astype("float32") bin_profiles[movieid] = example masks[movieid] = mask positions = {movie_id: pos for pos, movie_id in enumerate(batch)} movies_batch = [bin_profiles[el] for el in batch] test_batch = np.array(movies_batch).reshape(size, len(all_users) * 5) movie_predictions = revert_expected_value(predict(test_batch)) for movie_id in batch: test_users = tests[movie_id] try: for user, rating in test_users: current_movie = movie_predictions[positions[movie_id]] predicted = current_movie[all_users.index(user)] rating = float(rating) ratings.append(rating) predictions.append(predicted) except Exception: pass vabs = np.vectorize(abs) distances = np.array(ratings) - np.array(predictions) mae = vabs(distances).mean() rmse = sqrt((distances ** 2).mean()) iteration_result = { "iteration": j, "k": k, "momentum": momentum, "mae": mae, "rmse": rmse, "lrate": current_l_w, } config_result["results"].append(iteration_result) 
print(iteration_str.format(j, k, current_l_w, momentum, mae, rmse)) with open("{}_{}.json".format(config_name, name), "wt") as res_output: res_output.write(json.dumps(config_result, indent=4))
all_users, all_movies, tests = load_dataset(FLAGS.train_path, FLAGS.test_path,
                                            FLAGS.sep, user_based=True)
rbm = RBM(len(all_movies) * 5, FLAGS.num_hidden)
print("model created")
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

profiles = defaultdict(list)
with open(FLAGS.train_path, 'rt') as data:
    for i, line in enumerate(data):
        uid, mid, rat, timestamp = line.strip().split(FLAGS.sep)
        profiles[uid].append((mid, float(rat)))
print("Users and ratings loaded")

for e in range(FLAGS.epochs):
    for batch_i, batch in enumerate(chunker(list(profiles.keys()), FLAGS.batch_size)):
        size = min(len(batch), FLAGS.batch_size)
        # create needed binary vectors
        bin_profiles = {}
        masks = {}
        # only consider the movies that users have interacted with
        for userid in batch:
            user_profile = np.array([0.] * len(all_movies))
            mask = [0] * (len(all_movies) * 5)
            for movie_id, rat in profiles[userid]:
                user_profile[all_movies.index(movie_id)] = rat
                for _i in range(5):
                    mask[5 * all_movies.index(movie_id) + _i] = 1
            example = expand(np.array([user_profile])).astype('float32')
            bin_profiles[userid] = example
def _multi_query( sparql, timeout, graph_pattern, source_target_pairs, batch_size, _vars, _values, _ret_val_mapping, _res_init, _chunk_q, _chunk_res, _res_update=lambda r, u, **___: r.update(u), **kwds): total_time = 0 res = _res_init(source_target_pairs, **kwds) for val_chunk in chunker(_values, batch_size): q = _chunk_q(graph_pattern, _vars, val_chunk, **kwds) chunk_stps = [stp for v in val_chunk for stp in _ret_val_mapping[v]] _start_time = timer() t = None chunk_res = None loop = 1 while loop: loop -= 1 try: t, q_res = _query(sparql, timeout, q, **kwds) chunk_res = _chunk_res( q_res, _vars, _ret_val_mapping, **kwds) except EndPointNotFound: # happens if the endpoint reports a 404... # as virtuoso in rare cases seems to report a 404 let's # retry once after some time but then if not loop: # expected to 0 on first such exception logger.info( 'SPARQL endpoint reports a 404, will retry once in 10s' ) sleep(10) loop += 2 continue else: # expected to be 1 on second such exception loop = 0 logger.warning( 'SPARQL endpoint unreachable even after back-off ' 'and retry\n' 'could not perform query:\n%s for %s\nException:', q, val_chunk, exc_info=1, # appends exception to message ) t, chunk_res = timer() - _start_time, {} except (SPARQLWrapperException, SAXParseException, URLError) as e: if (isinstance(e, SPARQLWrapperException) and re.search( r'The estimated execution time [0-9]+ \(sec\) ' r'exceeds the limit of [0-9]+ \(sec\)\.', repr(e))): t, chunk_res = timeout, {} elif len(val_chunk) > 1: logger.debug('error in batch: {}'.format(val_chunk)) logger.debug('retrying with half size batch: {}...'.format( len(val_chunk) // 2 )) t, chunk_res = _multi_query( sparql, timeout, graph_pattern, chunk_stps, len(val_chunk) // 2, _vars, val_chunk, _ret_val_mapping, _res_init, _chunk_q, _chunk_res, _res_update, **kwds) else: logger.warning( 'could not perform query:\n%s for %s\nException:', q, val_chunk, exc_info=1, # appends exception to message ) t, chunk_res = timer() - _start_time, {} except Exception: # TODO: maybe introduce a max error counter? per process? logger.warning( 'unhandled exception, assuming empty res for multi-query:\n' 'Query:\n%s\nChunk:%r\nException:', q, val_chunk, exc_info=1, # appends exception to message ) t, chunk_res = timer() - _start_time, {} _res_update(res, chunk_res, **kwds) total_time += t if query_time_soft_exceeded(total_time, timeout): logger.debug('early terminating batch query as timeout/2 exceeded') break return total_time, res
def write_to_file(ids):
    with open("twitter_names.txt", "a+") as f:
        for o in ids:
            f.write('%s\n' % o)


def get_twitter_names():
    names = []
    if path.exists("twitter_names.txt"):
        with open("twitter_names.txt", "r+") as f:
            for line in f:
                c = line[:-1]
                names.append(c)
        return names
    else:
        cmc_ids = get_top_n_cmc(1500)
        twitter_names = get_twitter_name_from_ids(cmc_ids)
        write_to_file(twitter_names)
        return twitter_names


if __name__ == "__main__":
    t_names = get_twitter_names()
    for t in chunker(t_names, 100):
        if "aeternity" in t:
            print(t)
client = MongoClient('localhost', 27017, username='******', password='******')
db = client["taxiRides"]
rides_collection = db["rides"]

column_remapping = json.load(open(folder + '/../dataset/column_remapping.json'))

FILES = CONFIG['FILES']
for FILE in FILES:
    print(f"Importing {FILE} Rides")
    with open(folder + '/../dataset/chicago_taxi_trips_' + FILE + '.csv') as csvfile:
        rides = csv.DictReader(csvfile)
        i = 0  # for progress
        CHUNKER_SIZE = 1000
        if CONFIG['IMPORT_LIMIT'] > 0:
            rides = islice(rides, CONFIG['IMPORT_LIMIT'])
        for rides_chunk in chunker(rides, CHUNKER_SIZE):
            rides_chunk = [embed_ride(ride, column_remapping) for ride in rides_chunk]
            rides_collection.insert_many(rides_chunk)
            i += CHUNKER_SIZE
            if i % 10000 == 0:
                print(f"Progress: {i}")
def parse_question(question):
    chunk_dict = {'id': question.id}
    question = utils.chunker(question.question)
    for i in xrange(len(question)):
        chunk_dict[i] = question[i]
    return json.dumps(chunk_dict)
def get_pem_encoded(self):
    return b'-----BEGIN PUBLIC KEY-----\n' + \
        b'\n'.join(chunker(b64encode(self.get_der_encoded()), 64)) + \
        b'\n-----END PUBLIC KEY-----\n'
def run(name, dataset, config, all_users, all_movies, tests, initial_v, sep): config_name = config['name'] number_hidden = config['number_hidden'] epochs = config['epochs'] ks = config['ks'] momentums = config['momentums'] l_w = config['l_w'] l_v = config['l_v'] l_h = config['l_h'] decay = config['decay'] batch_size = config['batch_size'] config_result = config.copy() config_result['results'] = [] vis = T.matrix() vmasks = T.matrix() rbm = CFRBM(len(all_movies) * 5, number_hidden) profiles = defaultdict(list) with open(dataset, 'rt') as data: for i, line in enumerate(data): uid, mid, rat, timstamp = line.strip().split(sep) profiles[uid].append((mid, float(rat))) print("Users and ratings loaded") for j in range(epochs): def get_index(col): if j/(epochs/len(col)) < len(col): return j/(epochs/len(col)) else: return -1 index = get_index(ks) mindex = get_index(momentums) icurrent_l_w = get_index(l_w) icurrent_l_v = get_index(l_v) icurrent_l_h = get_index(l_h) k = ks[index] momentum = momentums[mindex] current_l_w = l_w[icurrent_l_w] current_l_v = l_v[icurrent_l_v] current_l_h = l_h[icurrent_l_h] train = rbm.cdk_fun(vis, vmasks, k=k, w_lr=current_l_w, v_lr=current_l_v, h_lr=current_l_h, decay=decay, momentum=momentum) predict = rbm.predict(vis) for batch_i, batch in enumerate(chunker(profiles.keys(), batch_size)): size = min(len(batch), batch_size) # create needed binary vectors bin_profiles = {} masks = {} for userid in batch: user_profile = [0.] * len(all_movies) mask = [0] * (len(all_movies) * 5) for movie_id, rat in profiles[userid]: user_profile[all_movies.index(movie_id)] = rat for _i in range(5): mask[5 * all_movies.index(movie_id) + _i] = 1 example = expand(np.array([user_profile])).astype('float32') bin_profiles[userid] = example masks[userid] = mask profile_batch = [bin_profiles[id] for id in batch] masks_batch = [masks[id] for id in batch] train_batch = np.array(profile_batch).reshape(size, len(all_movies) * 5) train_masks = np.array(masks_batch).reshape(size, len(all_movies) * 5) train_masks = train_masks.astype('float32') train(train_batch, train_masks) sys.stdout.write('.') sys.stdout.flush() ratings = [] predictions = [] for batch in chunker(tests.keys(), batch_size): size = min(len(batch), batch_size) # create needed binary vectors bin_profiles = {} masks = {} for userid in batch: user_profile = [0.] 
* len(all_movies) mask = [0] * (len(all_movies) * 5) for movie_id, rat in profiles[userid]: user_profile[all_movies.index(movie_id)] = rat for _i in range(5): mask[5 * all_movies.index(movie_id) + _i] = 1 example = expand(np.array([user_profile])).astype('float32') bin_profiles[userid] = example masks[userid] = mask positions = {profile_id: pos for pos, profile_id in enumerate(batch)} profile_batch = [bin_profiles[el] for el in batch] test_batch = np.array(profile_batch).reshape(size, len(all_movies) * 5) user_preds = revert_expected_value(predict(test_batch)) for profile_id in batch: test_movies = tests[profile_id] try: for movie, rating in test_movies: current_profile = user_preds[positions[profile_id]] predicted = current_profile[all_movies.index(movie)] rating = float(rating) ratings.append(rating) predictions.append(predicted) except Exception: pass vabs = np.vectorize(abs) distances = np.array(ratings) - np.array(predictions) mae = vabs(distances).mean() rmse = sqrt((distances ** 2).mean()) iteration_result = { 'iteration': j, 'k': k, 'momentum': momentum, 'mae': mae, 'rmse': rmse, 'lrate': current_l_w } config_result['results'].append(iteration_result) print(iteration_str.format(j, k, current_l_w, momentum, mae, rmse)) with open('{}_{}.json'.format(config_name, name), 'wt') as res_output: res_output.write(json.dumps(config_result, indent=4))
def forecast(self, output_dir, act_st, fcst_st, fcst_model, test_type, test_bck, top_model=3, ens_method='mean', chunk_sz=1, cpu=1): """Forecast and write result by batch Parameters ---------- output_dir : str output directory act_st : datetime actual start date fcst_st : datetime forecast date fcst_model : dict('period', [list of models]) forecast model options for each periods test_type : {'monthly', 'daily} type of testing back error by month or day test_bck : int number of months to test back chunk_sz : int number of item to validate for each chunk cpu : int number of running processors """ # make output directory output_dir = "{}forecast_{}/".format( output_dir, datetime.datetime.now(timezone(self.tz)).strftime("%Y%m%d-%H%M%S")) self.output_dir = output_dir self.fp.mkdir(output_dir) self.lg.logtxt("create output directory: {}".format(output_dir)) self.fp.writecsv(self.df_act, "{}input_actual.csv".format(output_dir)) self.fp.writecsv(self.df_fcstlog, "{}input_forecast.csv".format(output_dir)) # write external features if self.ext is not None: self.fp.writecsv(self.ext, "{}input_external.csv".format(output_dir)) self.fp.writecsv(self.ext_lag, "{}input_externallag.csv".format(output_dir)) self.lg.logtxt( "write input file: {}input_actual.csv | {}input_forecast.csv | {}input_external.csv | {}input_externallag.csv" .format(output_dir, output_dir, output_dir, output_dir)) else: self.lg.logtxt( "write input file: {}input_actual.csv | {}input_forecast.csv". format(output_dir, output_dir)) self.runitem = {} # set parameter items = self.df_act['id'].unique() n_chunk = len([x for x in chunker(items, chunk_sz)]) act_st = datetime.datetime.combine(act_st, datetime.datetime.min.time()) fcst_st = datetime.datetime.combine(fcst_st, datetime.datetime.min.time()) test_st = fcst_st + relativedelta(months=-test_bck) fcst_pr = len(fcst_model.keys()) pr_st = min(fcst_model.keys()) model_list = list(set(b for a in fcst_model.values() for b in a)) self.lg.logtxt( "total items: {} | chunk size: {} | total chunk: {}".format( len(items), chunk_sz, n_chunk)) # rank the models df_rank = self.rank_model(fcst_model, act_st, fcst_st, test_type, test_st) # forecast cpu_count = 1 if cpu <= 1 else multiprocessing.cpu_count( ) if cpu >= multiprocessing.cpu_count() else cpu self.lg.logtxt("run at {} processor(s)".format(cpu_count)) for i, c in enumerate(chunker(items, chunk_sz), 1): df_fcst = pd.DataFrame() if cpu_count == 1: for r in [ self.forecast_byitem(x, act_st, fcst_st, fcst_pr, model_list, pr_st, i) for x in c ]: df_fcst = df_fcst.append(r, ignore_index=True) else: pool = multiprocessing.Pool(processes=cpu_count) for r in pool.starmap( self.forecast_byitem, [[x, act_st, fcst_st, fcst_pr, model_list, pr_st, i] for x in c]): df_fcst = df_fcst.append(r, ignore_index=True) pool.close() pool.join() # ensemble forecast results df_ens = self.ensemble_model(df_fcst, df_rank, top_model, method=ens_method) # write forecast result fcst_path = "{}output_forecast_{:04d}-{:04d}.csv".format( output_dir, i, n_chunk) self.fp.writecsv(df_ens, fcst_path) # write forecast log result fcstlog_path = "{}output_forecastlog_{:04d}-{:04d}.csv".format( output_dir, i, n_chunk) self.fp.writecsv(df_fcst, fcstlog_path) self.lg.logtxt("write output file ({}/{}): {} | {}".format( i, n_chunk, fcst_path, fcstlog_path)) self.lg.logtxt("[END FORECAST]") self.lg.writelog("{}logfile.log".format(output_dir))
def run(name, dataset, config, all_users, all_movies, tests, initial_v, sep): config_name = config['name'] number_hidden = config['number_hidden'] epochs = config['epochs'] ks = config['ks'] momentums = config['momentums'] l_w = config['l_w'] l_v = config['l_v'] l_h = config['l_h'] decay = config['decay'] batch_size = config['batch_size'] config_result = config.copy() config_result['results'] = [] vis = T.matrix() vmasks = T.matrix() rbm = CFRBM(len(all_movies) * 5, number_hidden) profiles = defaultdict(list) #all_ratings = np.zeros((943,1682*5), dtype=np.float32) #all_masks = np.zeros((943,1682*5), dtype=np.float32) with open(dataset, 'rt') as data: for i, line in enumerate(data): uid, mid, rat, timstamp = line.strip().split(sep) profiles[uid].append((mid, float(rat))) #for i in range(1,5): # if i == int(rat): # all_ratings[int(uid)-1][(int(mid)-1)*5+i-1] = 1.0 # all_masks[int(uid)-1][(int(mid)-1)*5+i-1] = 1.0 print("Users and ratings loaded") for j in range(epochs): def get_index(col): if j/(epochs/len(col)) < len(col): return j/(epochs/len(col)) else: return -1 index = get_index(ks) mindex = get_index(momentums) icurrent_l_w = get_index(l_w) icurrent_l_v = get_index(l_v) icurrent_l_h = get_index(l_h) k = ks[index] momentum = momentums[mindex] current_l_w = l_w[icurrent_l_w] current_l_v = l_v[icurrent_l_v] current_l_h = l_h[icurrent_l_h] train = rbm.cdk_fun(vis, vmasks, k=k, w_lr=current_l_w, v_lr=current_l_v, h_lr=current_l_h, decay=decay, momentum=momentum) predict = rbm.predict(vis) #batch_size = 10 start_time = time.time() for batch_i, batch in enumerate(chunker(profiles.keys(), batch_size)): #for batch_i in range(0,943,batch_size): #profile_batch = np.copy(all_ratings[batch_i:batch_i+batch_size]) #masks_batch = np.copy(all_masks[batch_i:batch_i+batch_size]) #print batch_i, len(profile_batch) size = min(len(batch), batch_size) #create needed binary vectors bin_profiles = {} masks = {} for userid in batch: user_profile = [0.] * len(all_movies) mask = [0] * (len(all_movies) * 5) for movie_id, rat in profiles[userid]: user_profile[all_movies.index(movie_id)] = rat for _i in range(5): mask[5 * all_movies.index(movie_id) + _i] = 1 example = expand(np.array([user_profile])).astype('float32') bin_profiles[userid] = example masks[userid] = mask #print example[0].shape,userid,all_ratings[343].shape #print example[0][:20],all_ratings[343][:20],user_profile[:20] profile_batch = [bin_profiles[id] for id in batch] masks_batch = [masks[id] for id in batch] train_batch = np.array(profile_batch).reshape(size, len(all_movies) * 5) train_masks = np.array(masks_batch).reshape(size, len(all_movies) * 5) #print train_batch[0] train_masks = train_masks.astype('float32') train(train_batch, train_masks) #train(movies_batch, masks_batch) sys.stdout.write('.') sys.stdout.flush() end_time = time.time() train_time = end_time - start_time #batch_size = 10 ratings = [] predictions = [] start_time = time.time() for batch in chunker(tests.keys(), batch_size): size = min(len(batch), batch_size) #profile_batch = [] #from_test = [] #for b in batch: # profile_batch.append(all_ratings[int(b)-1]) # users = [0 for x in range(1682)] # for u in tests[b]: # users[int(u[0])-1] = int(u[1]) # from_test.append(users) bin_profiles = {} masks = {} for userid in batch: user_profile = [0.] 
* len(all_movies) mask = [0] * (len(all_movies) * 5) for movie_id, rat in profiles[userid]: user_profile[all_movies.index(movie_id)] = rat for _i in range(5): mask[5 * all_movies.index(movie_id) + _i] = 1 example = expand(np.array([user_profile])).astype('float32') bin_profiles[userid] = example masks[userid] = mask positions = {profile_id: pos for pos, profile_id in enumerate(batch)} profile_batch = [bin_profiles[el] for el in batch] test_batch = np.array(profile_batch).reshape(size, len(all_movies) * 5) user_preds = revert_expected_value(predict(test_batch)) for profile_id in batch: test_movies = tests[profile_id] try: for movie, rating in test_movies: current_profile = user_preds[positions[profile_id]] predicted = current_profile[all_movies.index(movie)] rating = float(rating) ratings.append(rating) predictions.append(predicted) except Exception: pass end_time = time.time() test_time = end_time - start_time vabs = np.vectorize(abs) distances = np.array(ratings) - np.array(predictions) true_rat = np.array(ratings, dtype=np.uint8) pred_rat = np.array(predictions, dtype=np.uint8) #print true_rat < 3, true_rat prec_rec = precision_recall_fscore_support(true_rat < 3,pred_rat < 3, average='binary') print prec_rec mae = vabs(distances).mean() rmse = sqrt((distances ** 2).mean()) iteration_result = { 'iteration': j, 'k': k, 'momentum': momentum, 'mae': mae, 'rmse': rmse, 'lrate': current_l_w, 'train_time': train_time, 'test_time': test_time, 'prec_rec': prec_rec } config_result['results'].append(iteration_result) print(iteration_str.format(j, k, current_l_w, momentum, mae, rmse)) with open('experiments/{}_{}.json'.format(config_name, name), 'wt') as res_output: res_output.write(json.dumps(config_result, indent=4)) W,V,H = rbm.get_weights() print H
def run(name, dataset, config, all_users, all_movies, tests, initial_v, sep): config_name = config['name'] number_hidden = config['number_hidden'] epochs = config['epochs'] ks = config['ks'] momentums = config['momentums'] l_w = config['l_w'] l_v = config['l_v'] l_h = config['l_h'] lr_decay = config['lr_decay'][0] decay = config['decay'] batch_size = config['batch_size'] config_result = config.copy() config_result['results'] = [] vis = T.matrix() vmasks = T.matrix() rbm = CFRBM(len(all_movies) * 20, number_hidden) profiles = defaultdict(list) with open(dataset, 'rt') as data: for i, line in enumerate(data): uid, mid, rat = line.strip().split(sep) profiles[uid].append((mid, float(rat))) current_l_w = l_w[0] current_l_v = l_v[0] current_l_h = l_h[0] print("Users and ratings loaded") for j in range(epochs): print "epochs: ", j def get_index(col): if j / (epochs / len(col)) < len(col): return j / (epochs / len(col)) else: return -1 index = get_index(ks) mindex = get_index(momentums) #icurrent_l_w = get_index(l_w) #icurrent_l_v = get_index(l_v) #icurrent_l_h = get_index(l_h) k = ks[index] momentum = momentums[mindex] current_l_w *= lr_decay current_l_v *= lr_decay current_l_h *= lr_decay train = rbm.cdk_fun(vis, vmasks, k=k, w_lr=current_l_w, v_lr=current_l_v, h_lr=current_l_h, decay=decay, momentum=momentum) predict = rbm.predict(vis) n_batch = 0 users_ids = [] for batch in chunker(tests.keys(), batch_size): n_batch += 1 # print "&*&*" * 20 # print "START OF A BATCH" # print "batch: ", batch users_ids.extend(batch) size = min(len(batch), batch_size) # create needed binary vectors bin_profiles = {} masks = {} for userid in batch: user_profile = [0.] * len(all_movies) mask = [0] * (len(all_movies) * 20) for movie_id, rat in profiles[userid]: user_profile[all_movies.index(movie_id)] = rat for _i in range(20): mask[20 * all_movies.index(movie_id) + _i] = 1 example = expand(np.array([user_profile])).astype('float32') bin_profiles[userid] = example masks[userid] = mask #print np.sum(mask) positions = { profile_id: pos for pos, profile_id in enumerate(batch) } profile_batch = [bin_profiles[el] for el in batch] # print profile_batch[0] # print len(profile_batch[0]) test_batch = np.array(profile_batch).reshape( size, len(all_movies) * 20) # print batch # print "test batch :" # print test_batch # print test_batch.shape #print test_batch[:3,:3] batch_preds = predict(test_batch) user_preds = revert_expected_value(batch_preds, do_round=False) if n_batch == 1: print user_preds[:4, :5] train_batch_i = 0 for batch_i, batch in enumerate(chunker(profiles.keys(), batch_size)): size = min(len(batch), batch_size) train_batch_i += 1 # create needed binary vectors bin_profiles = {} masks = {} for userid in batch: user_profile = [0.] 
* len(all_movies) mask = [0] * (len(all_movies) * 20) for movie_id, rat in profiles[userid]: user_profile[all_movies.index(movie_id)] = rat for _i in range(20): mask[20 * all_movies.index(movie_id) + _i] = 1 example = expand(np.array([user_profile])).astype('float32') bin_profiles[userid] = example masks[userid] = mask # print example # print len(example[0]) profile_batch = [bin_profiles[id] for id in batch] # print profile_batch[0][0] # print len(profile_batch[0][0]) masks_batch = [masks[id] for id in batch] train_batch = np.array(profile_batch).reshape( size, len(all_movies) * 20) train_masks = np.array(masks_batch).reshape( size, len(all_movies) * 20) train_masks = train_masks.astype('float32') train(train_batch, train_masks) if (train_batch_i % 200 == 0): sys.stdout.write('.') sys.stdout.flush() # print "number of train batches: ", train_batch_i ratings = [] predictions = [] # pickle.dump(all_movies, open("item_ids.pickle", "wb")) # print "###############################################" # print "user ids" # print tests.keys()[1:100] # # print len(tests.keys) # # print type(tests.keys) # print "all users" # print all_users[1:100] # print len(all_users) # print type(all_users) # print "beer ids" # print all_movies[1:100] # print len(all_movies) # print type(all_movies) #reconstruct_mat = np.array([]).reshape(0, 1269) n_batch = 0 users_ids = [] for batch in chunker(tests.keys(), batch_size): n_batch += 1 # print "&*&*" * 20 # print "START OF A BATCH" # print "batch: ", batch users_ids.extend(batch) size = min(len(batch), batch_size) # create needed binary vectors bin_profiles = {} masks = {} for userid in batch: user_profile = [0.] * len(all_movies) mask = [0] * (len(all_movies) * 20) for movie_id, rat in profiles[userid]: user_profile[all_movies.index(movie_id)] = rat for _i in range(20): mask[20 * all_movies.index(movie_id) + _i] = 1 example = expand(np.array([user_profile])).astype('float32') bin_profiles[userid] = example masks[userid] = mask #print np.sum(mask) positions = { profile_id: pos for pos, profile_id in enumerate(batch) } profile_batch = [bin_profiles[el] for el in batch] # print profile_batch[0] # print len(profile_batch[0]) test_batch = np.array(profile_batch).reshape( size, len(all_movies) * 20) #print batch # print "test batch :" # print test_batch # print test_batch.shape batch_preds = predict(test_batch) user_preds = revert_expected_value(batch_preds, do_round=False) #if n_batch == 1: # print test_batch[:2,:] # reconstruct_mat = np.concatenate((reconstruct_mat, user_preds)) # print predict(test_batch) # print "user pred: ", user_preds # print user_preds.shape for profile_id in batch: test_movies = tests[profile_id] try: for movie, rating in test_movies: current_profile = user_preds[positions[profile_id]] predicted = current_profile[all_movies.index(movie)] rating = float(rating) ratings.append(rating) predictions.append(predicted) except Exception: pass #print (np.array(predictions))[0:10] # print "number of test batches: ", n_batch # print reconstruct_mat # pickle.dump(users_ids, open("users_ids.pickle", "wb")) # pickle.dump(reconstruct_mat, open("reconstruct_mat.pickle", "wb")) vabs = np.vectorize(abs) distances = np.array(ratings) - np.array(predictions) mae = vabs(distances).mean() rmse = sqrt((distances**2).mean()) iteration_result = { 'iteration': j, 'k': k, 'momentum': momentum, 'mae': mae, 'rmse': rmse, 'lrate': current_l_w } config_result['results'].append(iteration_result) print(iteration_str.format(j, k, current_l_w, momentum, mae, rmse)) with 
open('{}_{}.json'.format(config_name, name), 'wt') as res_output: res_output.write(json.dumps(config_result, indent=4)) w = rbm.weights.eval() np.save('weights', w)
def train(self):
    self.train_hist = {}
    self.train_hist['loc_loss'] = []
    self.train_hist['per_epoch_time'] = []
    self.train_hist['total_time'] = []

    self.loc.train()
    print('training start!!')
    start_time = time.time()
    for epoch in range(self.start_epoch_idx, self.epoch):
        epoch_start_time = time.time()
        data_gen = utils.chunker(self.pair_list, self.batch_size)
        if epoch == self.start_epoch_idx:
            start_iter_idx = self.start_iter_idx
        else:
            start_iter_idx = 0
        for iter in range(start_iter_idx, self.epoch_len):
            if iter == self.epoch_len // self.batch_size:
                break
            # read images
            chunk = data_gen.next()
            images1, images2, labels, gt1, gt2 = utils.get_data_from_chunk(
                self.data_path, chunk, self.input_scale)
            images1 = images1.cuda(self.gpu)
            images2 = images2.cuda(self.gpu)
            # gt masks variable
            gt1_ = torch.squeeze(gt1, dim=1).long()
            gt2_ = torch.squeeze(gt2, dim=1).long()
            gt1_ = gt1_.cuda(self.gpu)
            gt2_ = gt2_.cuda(self.gpu)
            # localization
            output1, output2 = self.loc(images1, images2)
            # localization update
            if (iter + 1) % self.loc_update_stride == 0:
                self.loc_optimizer.zero_grad()
                # localization net update
                log_o1 = self.logsoftmax(output1)
                log_o2 = self.logsoftmax(output2)
                loc_loss_1 = self.ce_criterion(log_o1, gt1_)
                loc_loss_2 = self.ce_criterion(log_o2, gt2_)
                loc_loss = loc_loss_1 + loc_loss_2
                self.train_hist['loc_loss'].append(loc_loss.data)
                loc_loss.backward()
                self.loc_optimizer.step()
            if (iter + 1) % 10 == 0:
                print '********************************************************************************'
                print 'iter = ', iter, ' epoch = ', epoch, 'completed, loc_loss = ', loc_loss.data.cpu().numpy()
                print 'iter = ', iter, ' epoch = ', epoch, 'completed, loc_loss_1 = ', loc_loss_1.data.cpu().numpy()
                print 'iter = ', iter, ' epoch = ', epoch, 'completed, loc_loss_2 = ', loc_loss_2.data.cpu().numpy()
            if (iter + 1) % self.snapshot_stride == 0:
                snapshot(self.loc, self.snapshot_prefix_loc, epoch, iter)
                trainhist_snapshot(self.train_hist['loc_loss'],
                                   self.snapshot_prefix_loc, epoch, iter)
                self.train_hist['loc_loss'] = []
        self.train_hist['per_epoch_time'].append(time.time() - epoch_start_time)
    self.train_hist['total_time'].append(time.time() - start_time)
    print("Avg one epoch time: %.2f, total %d epochs time: %.2f" %
          (np.mean(self.train_hist['per_epoch_time']), self.epoch,
           self.train_hist['total_time'][0]))
    print("Training finish!... save training results")
def run(name, dataset, config, all_users, all_movies, tests, initial_v, sep): config_name = config['name'] number_hidden = config['number_hidden'] epochs = config['epochs'] ks = config['ks'] momentums = config['momentums'] l_w = config['l_w'] l_v = config['l_v'] l_h = config['l_h'] decay = config['decay'] batch_size = config['batch_size'] config_result = config.copy() config_result['results'] = [] vis = T.matrix() vmasks = T.matrix() rbm = CFRBM(len(all_movies) * 5, number_hidden) profiles = defaultdict(list) with open(dataset, 'rt') as data: for i, line in enumerate(data): uid, mid, rat, timstamp = line.strip().split(sep) profiles[uid].append((mid, float(rat))) print("Users and ratings loaded") for j in range(epochs): def get_index(col): if j/(epochs/len(col)) < len(col): return j/(epochs/len(col)) else: return -1 index = get_index(ks) mindex = get_index(momentums) icurrent_l_w = get_index(l_w) icurrent_l_v = get_index(l_v) icurrent_l_h = get_index(l_h) k = ks[index] momentum = momentums[mindex] current_l_w = l_w[icurrent_l_w] current_l_v = l_v[icurrent_l_v] current_l_h = l_h[icurrent_l_h] train = rbm.cdk_fun(vis, vmasks, k=k, w_lr=current_l_w, v_lr=current_l_v, h_lr=current_l_h, decay=decay, momentum=momentum) predict = rbm.predict(vis) for batch_i, batch in enumerate(chunker(profiles.keys(), batch_size)): size = min(len(batch), batch_size) # create needed binary vectors bin_profiles = {} masks = {} for userid in batch: user_profile = [0.] * len(all_movies) mask = [0] * (len(all_movies) * 5) for movie_id, rat in profiles[userid]: user_profile[all_movies.index(movie_id)] = rat for _i in range(5): mask[5 * all_movies.index(movie_id) + _i] = 1 example = expand(np.array([user_profile])).astype('float32') bin_profiles[userid] = example masks[userid] = mask profile_batch = [bin_profiles[id] for id in batch] masks_batch = [masks[id] for id in batch] train_batch = np.array(profile_batch).reshape(size, len(all_movies) * 5) train_masks = np.array(masks_batch).reshape(size, len(all_movies) * 5) train_masks = train_masks.astype('float32') train(train_batch, train_masks) sys.stdout.write('.') sys.stdout.flush() ratings = [] predictions = [] for batch in chunker(tests.keys(), batch_size): size = min(len(batch), batch_size) # create needed binary vectors bin_profiles = {} masks = {} for userid in batch: user_profile = [0.] 
* len(all_movies) mask = [0] * (len(all_movies) * 5) for movie_id, rat in profiles[userid]: user_profile[all_movies.index(movie_id)] = rat for _i in range(5): mask[5 * all_movies.index(movie_id) + _i] = 1 example = expand(np.array([user_profile])).astype('float32') bin_profiles[userid] = example masks[userid] = mask positions = {profile_id: pos for pos, profile_id in enumerate(batch)} profile_batch = [bin_profiles[el] for el in batch] test_batch = np.array(profile_batch).reshape(size, len(all_movies) * 5) user_preds = revert_expected_value(predict(test_batch)) for profile_id in batch: test_movies = tests[profile_id] try: for movie, rating in test_movies: current_profile = user_preds[positions[profile_id]] predicted = current_profile[all_movies.index(movie)] rating = float(rating) ratings.append(rating) predictions.append(predicted) except Exception: pass vabs = np.vectorize(abs) distances = np.array(ratings) - np.array(predictions) mae = vabs(distances).mean() rmse = sqrt((distances ** 2).mean()) iteration_result = { 'iteration': j, 'k': k, 'momentum': momentum, 'mae': mae, 'rmse': rmse, 'lrate': current_l_w } config_result['results'].append(iteration_result) print(iteration_str.format(j, k, current_l_w, momentum, mae, rmse)) with open('experiments/{}_{}.json'.format(config_name, name), 'wt') as res_output: res_output.write(json.dumps(config_result, indent=4)) W,V,H = rbm.get_weights() print H
def run(name, dataset, user_info, config, all_users, all_movies, all_occupations, all_sex, all_ages, tests, initial_v, sep): config_name = config['name'] number_hidden = config['number_hidden'] epochs = config['epochs'] ks = config['ks'] momentums = config['momentums'] l_w = config['l_w'] l_v = config['l_v'] l_h = config['l_h'] decay = config['decay'] batch_size = config['batch_size'] config_result = config.copy() config_result['results'] = [] vis_x = T.matrix() vis_o = T.matrix() vis_s = T.matrix() vis_a = T.matrix() vmasks_x = T.matrix() vmasks_o = T.matrix() vmasks_s = T.matrix() vmasks_a = T.matrix() rbm = CFRBM(len(all_movies) * 5, len(all_occupations), 1, len(all_ages), number_hidden) profiles = defaultdict(list) with open(dataset, 'rt') as data: for i, line in enumerate(data): uid, mid, rat, timstamp = line.strip().split(sep) profiles[uid].append((mid, float(rat))) print("Users and ratings loaded") user_occ = defaultdict(list) user_sex = defaultdict(list) user_age = defaultdict(list) r = csv.reader(open(user_info, 'rb'), delimiter='|') for row in r: user_age[row[0]] = [int(x) for x in row[1:7]] user_sex[row[0]] = [int(row[7])] user_occ[row[0]] = [int(x) for x in row[8:]] print("User info loaded") for j in range(epochs): def get_index(col): if j/(epochs/len(col)) < len(col): return j/(epochs/len(col)) else: return -1 index = get_index(ks) mindex = get_index(momentums) icurrent_l_w = get_index(l_w) icurrent_l_v = get_index(l_v) icurrent_l_h = get_index(l_h) k = ks[index] momentum = momentums[mindex] current_l_w = l_w[icurrent_l_w] current_l_v = l_v[icurrent_l_v] current_l_h = l_h[icurrent_l_h] train = rbm.cdk_fun(vis_x, vis_o, vis_s, vis_a, vmasks_x, vmasks_o, vmasks_s, vmasks_a, k=k, w_lr=current_l_w, v_lr=current_l_v, h_lr=current_l_h, decay=decay, momentum=momentum) predict = rbm.predict(vis_x, vis_o, vis_s, vis_a) start_time = time.time() for batch_i, batch in enumerate(chunker(profiles.keys(), batch_size)): size = min(len(batch), batch_size) # create needed binary vectors bin_profiles = {} occ_profiles = {} sex_profiles = {} age_profiles = {} masks_x = {} masks_o = {} masks_s = {} masks_a = {} for userid in batch: user_profile = [0.] * len(all_movies) occ_profile = [0.] * len(all_occupations) sex_profile = [0.] * 1 age_profile = [0.] 
* len(all_ages) mask_x = [0] * (len(all_movies) * 5) mask_o = [1] * (len(all_occupations)) mask_s = [1] * (1) mask_a = [1] * (len(all_ages)) for movie_id, rat in profiles[userid]: user_profile[all_movies.index(movie_id)] = rat for _i in range(5): mask_x[5 * all_movies.index(movie_id) + _i] = 1 mask_o = [1] * len(all_occupations) mask_s = [1] * 1 mask_a = [1] * len(all_ages) example_x = expand(np.array([user_profile])).astype('float32') example_o = expand(np.array([occ_profile]), k=1).astype('float32') example_s = expand(np.array([sex_profile]), k=1).astype('float32') example_a = expand(np.array([age_profile]), k=1).astype('float32') bin_profiles[userid] = example_x occ_profiles[userid] = example_o sex_profiles[userid] = example_s age_profiles[userid] = example_a masks_x[userid] = mask_x masks_o[userid] = mask_o masks_s[userid] = mask_s masks_a[userid] = mask_a profile_batch = [bin_profiles[id] for id in batch] occ_batch = [occ_profiles[id] for id in batch] sex_batch = [sex_profiles[id] for id in batch] age_batch = [age_profiles[id] for id in batch] masks_x_batch = [masks_x[id] for id in batch] masks_o_batch = [masks_o[id] for id in batch] masks_s_batch = [masks_s[id] for id in batch] masks_a_batch = [masks_a[id] for id in batch] train_batch_x = np.array(profile_batch).reshape(size, len(all_movies) * 5) train_batch_o = np.array(occ_batch).reshape(size, len(all_occupations)) train_batch_s = np.array(sex_batch).reshape(size, 1) train_batch_a = np.array(age_batch).reshape(size, len(all_ages)) train_masks_x = np.array(masks_x_batch).reshape(size, len(all_movies) * 5) train_masks_o = np.array(masks_o_batch).reshape(size, len(all_occupations)) train_masks_s = np.array(masks_s_batch).reshape(size, 1) train_masks_a = np.array(masks_a_batch).reshape(size, len(all_ages)) train_masks_x = train_masks_x.astype('float32') train_masks_o = train_masks_o.astype('float32') train_masks_s = train_masks_s.astype('float32') train_masks_a = train_masks_a.astype('float32') train(train_batch_x, train_batch_o, train_batch_s, train_batch_a, train_masks_x, train_masks_o, train_masks_s, train_masks_a) sys.stdout.write('.') sys.stdout.flush() end_time = time.time() train_time = end_time - start_time ratings = [] predictions = [] start_time = time.time() for batch in chunker(tests.keys(), batch_size): size = min(len(batch), batch_size) # create needed binary vectors bin_profiles = {} occ_profiles = {} sex_profiles = {} age_profiles = {} masks_x = {} masks_o = {} masks_s = {} masks_a = {} for userid in batch: user_profile = [0.] * len(all_movies) occ_profile = [0.] * len(all_occupations) sex_profile = [0.] * 1 age_profile = [0.] 
* len(all_ages) mask_x = [0] * (len(all_movies) * 5) mask_o = [1] * (len(all_occupations)) mask_s = [1] * (1) mask_a = [1] * (len(all_ages)) for movie_id, rat in profiles[userid]: user_profile[all_movies.index(movie_id)] = rat for _i in range(5): mask_x[5 * all_movies.index(movie_id) + _i] = 1 mask_o = [1] * len(all_occupations) mask_s = [1] * 1 mask_a = [1] * len(all_ages) example_x = expand(np.array([user_profile])).astype('float32') example_o = expand(np.array([occ_profile]), k=1).astype('float32') example_s = expand(np.array([sex_profile]), k=1).astype('float32') example_a = expand(np.array([age_profile]), k=1).astype('float32') bin_profiles[userid] = example_x occ_profiles[userid] = example_o sex_profiles[userid] = example_s age_profiles[userid] = example_a masks_x[userid] = mask_x masks_o[userid] = mask_o masks_s[userid] = mask_s masks_a[userid] = mask_a positions = {profile_id: pos for pos, profile_id in enumerate(batch)} profile_batch = [bin_profiles[el] for el in batch] occ_batch = [occ_profiles[el] for el in batch] sex_batch = [sex_profiles[el] for el in batch] age_batch = [age_profiles[el] for el in batch] test_batch_x = np.array(profile_batch).reshape(size, len(all_movies) * 5) test_batch_o = np.array(occ_batch).reshape(size, len(all_occupations)) test_batch_s = np.array(sex_batch).reshape(size, 1) test_batch_a = np.array(age_batch).reshape(size, len(all_ages)) user_preds = revert_expected_value(predict(test_batch_x, test_batch_o, test_batch_s, test_batch_a)) for profile_id in batch: test_movies = tests[profile_id] try: for movie, rating in test_movies: current_profile = user_preds[positions[profile_id]] predicted = current_profile[all_movies.index(movie)] rating = float(rating) ratings.append(rating) predictions.append(predicted) except Exception: pass end_time = time.time() test_time = end_time - start_time true_rat = np.array(ratings, dtype=np.uint8) pred_rat = np.array(predictions, dtype=np.uint8) #print true_rat < 3, true_rat prec_rec = precision_recall_fscore_support(true_rat < 3,pred_rat < 3, average='binary') print prec_rec vabs = np.vectorize(abs) distances = np.array(ratings) - np.array(predictions) mae = vabs(distances).mean() rmse = sqrt((distances ** 2).mean()) iteration_result = { 'iteration': j, 'k': k, 'momentum': momentum, 'mae': mae, 'rmse': rmse, 'lrate': current_l_w, 'train_time': train_time, 'test_time': test_time, 'prec_rec': prec_rec } config_result['results'].append(iteration_result) print(iteration_str.format(j, k, current_l_w, momentum, mae, rmse)) with open('experiments/{}_{}.json'.format(config_name, name), 'wt') as res_output: res_output.write(json.dumps(config_result, indent=4))
callbacks_list = [checkpoint, redonplat]  # early

model.fit_generator(gen(train_dict, aug=False), validation_data=gen(val_dict),
                    epochs=40, verbose=2, steps_per_epoch=1000,
                    validation_steps=300, callbacks=callbacks_list)
model.load_weights(file_path)

for record in tqdm(test_dict):
    all_rows = test_dict[record]['x']
    record_y_gt = []
    record_y_pred = []
    for batch_hyp in chunker(range(all_rows.shape[0])):
        X = all_rows[min(batch_hyp):max(batch_hyp) + 1, ...]
        Y = test_dict[record]['y'][min(batch_hyp):max(batch_hyp) + 1]
        X = np.expand_dims(X, 0)
        X = rescale_array(X)
        Y_pred = model.predict(X)
        Y_pred = Y_pred.argmax(axis=-1).ravel().tolist()
        gt += Y.ravel().tolist()
        preds += Y_pred
        record_y_gt += Y.ravel().tolist()
def validate(self, output_dir, act_st, test_st, test_pr, test_model, fcst_pr, pr_st, chunk_sz, cpu): """Validate forecast model and write result by batch Parameters ---------- output_dir : str output directory act_st : datetime actual start date test_st : datetime test start date test_pr : int number of rolling period to test (months) test_model : list list of model to test fcst_pr : int number of periods to forecast for each rolling pr_st : int starting period for each forecast (default 0/1) chunk_sz : int number of item to validate for each chunk cpu : int number of running processors """ # make output directory output_dir = "{}validate_{}/".format( output_dir, datetime.datetime.now(timezone(self.tz)).strftime("%Y%m%d-%H%M%S")) self.output_dir = output_dir self.fp.mkdir(output_dir) self.lg.logtxt("create output directory: {}".format(output_dir)) self.fp.writecsv(self.df, "{}input_actual.csv".format(output_dir)) # write external features if self.ext is not None: self.fp.writecsv(self.ext, "{}input_external.csv".format(output_dir)) self.fp.writecsv(self.ext_lag, "{}input_externallag.csv".format(output_dir)) self.lg.logtxt( "write input file: {}input_actual.csv | {}input_external.csv | {}input_externallag.csv" .format(output_dir, output_dir, output_dir)) else: self.lg.logtxt( "write input file: {}input_actual.csv".format(output_dir)) # set parameter items = self.df['id'].unique() n_chunk = len([x for x in chunker(items, chunk_sz)]) test_date = [ x.to_pydatetime() + datetime.timedelta(days=+test_st.day - 1) for x in pd.date_range(start=test_st, periods=test_pr, freq='MS') ] self.lg.logtxt( "total items: {} | chunk size: {} | total chunk: {}".format( len(items), chunk_sz, n_chunk)) # loop by chunk cpu_count = 1 if cpu <= 1 else multiprocessing.cpu_count( ) if cpu >= multiprocessing.cpu_count() else cpu self.lg.logtxt("run at {} processor(s)".format(cpu_count)) for i, c in enumerate(chunker(items, chunk_sz), 1): df_fcst = pd.DataFrame() if cpu_count == 1: for r in [ self.validate_byitem(x, act_st, test_date, test_model, fcst_pr, pr_st, i) for x in c ]: df_fcst = df_fcst.append(r, ignore_index=True) else: pool = multiprocessing.Pool(processes=cpu_count) for r in pool.starmap( self.validate_byitem, [[x, act_st, test_date, test_model, fcst_pr, pr_st, i] for x in c]): df_fcst = df_fcst.append(r, ignore_index=True) pool.close() pool.join() # write csv file output_path = "{}output_validate_{:04d}-{:04d}.csv".format( output_dir, i, n_chunk) self.fp.writecsv(df_fcst, output_path) self.lg.logtxt("write output file ({}/{}): {}".format( i, n_chunk, output_path)) self.lg.logtxt("[END VALIDATION]") self.lg.writelog("{}logfile.log".format(output_dir))