def train(OpType, X, y, nepochs=150):
    progress = ProgressBar('Training')
    np.random.seed(0)
    # Features transformation
    X = np.log2(X)
    # Model
    model = kr.models.Sequential()
    for i, L in enumerate([64, 48, 32, 16, 8]):
        model.add(kr.layers.Dense(L, input_dim=X.shape[1]))
        model.add(kr.layers.Activation('relu'))
    model.add(kr.layers.Dense(1))
    model.add(kr.layers.Activation('linear'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    # Train
    history = model.fit(
        X, y,
        validation_split=0.1,
        batch_size=32,
        epochs=nepochs,
        verbose=1,
        callbacks=[
            kr.callbacks.LambdaCallback(
                on_epoch_end=lambda i, _: progress.update(i, nepochs))
        ])
    return model
def show_progress(progress, nsamples):
    bar = ProgressBar('Benchmarks')
    while True:
        sleep(0.1)
        current = np.sum(progress.values())
        bar.update(current, nsamples)
        if current > nsamples - 1:
            break
def train(self, env, config, batch_size=128, updates=500, max_seconds=30):
    models = config.get()
    for model in models:
        model.compile(optimizer=ko.RMSprop(lr=self.lr),
                      loss=[self._logits_loss, self._value_loss])
    # Storage helpers for a single batch of data.
    actions = np.empty((batch_size, config.num), dtype=np.int32)
    rewards, dones, values = np.empty((3, batch_size, config.num))
    observations = np.empty(
        (batch_size, config.window_size, env.observations_size))
    # Training loop: collect samples, send to optimizer, repeat `updates` times.
    deaths = {}
    for model in models:
        deaths[model.label] = 0
    obs_window = env.reset()
    episodes = []
    steps = 0
    pb = ProgressBar(f'{config.label}')
    total_progress = updates * batch_size
    progress = 0
    pb.reset()
    for _ in range(updates):
        for step in range(batch_size):
            steps += 1
            progress += 1
            observations[step] = obs_window
            for m_i, model in enumerate(models):
                actions[step, m_i], values[step, m_i] = model.action_value(obs_window)
            obs_window, rewards[step], dones[step] = env.step(actions[step])
            if any(dones[step]) or max_seconds < steps * env.dt:
                obs_window = env.reset()
                episodes.append(steps * env.dt)
                steps = 0
                for dead, model in zip(dones[step], models):
                    if dead:
                        deaths[model.label] += 1
        losses = []
        for m_i, model in enumerate(models):
            _, next_value = model.action_value(obs_window)
            returns, advs = self._returns_advantages(
                rewards[:, m_i], dones[:, m_i], values[:, m_i], next_value)
            # A trick to input actions and advantages through same API.
            acts_and_advs = np.concatenate(
                [actions[:, m_i, None], advs[:, None]], axis=-1)
            loss = model.train_on_batch(
                observations[:, -model.input_size:, :],
                [acts_and_advs, returns])
            losses.append(loss[0])
        pb(progress / total_progress, f' loss: {sum(losses)/len(losses):6.3f}')
    return episodes, deaths
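# Not part of the original source: a minimal sketch of the "trick" referenced in
# the comment above. The concatenated [action, advantage] tensor presumably gets
# split back apart inside the policy loss before the cross-entropy is weighted by
# the advantages. The function name and shapes here are assumptions for
# illustration only, not the repository's actual implementation.
import tensorflow as tf

def _logits_loss_sketch(acts_and_advs, logits):
    # Separate the action indices from the advantage weights (last dim has size 2).
    actions, advantages = tf.split(acts_and_advs, 2, axis=-1)
    ce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    # Weight the policy-gradient term by the advantages.
    return ce(actions, logits, sample_weight=tf.squeeze(advantages, axis=-1))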
def permute_log_odds(clf, boot_n, feature_names=None, region_names=None, n_jobs=1):
    """
    Given a fitted RegionalClassifier object, permute the column "importances"
    (i.e. log odds ratios) by resampling across studies. Returns a pandas
    DataFrame with a z-score and p-value for each combination of a region and
    a topic in the Dataset.
    """
    def z_score_array(arr, dist):
        return np.array([(v - dist[dist.region == i + 1].lor.mean()) /
                         dist[dist.region == i + 1].lor.std()
                         for i, v in enumerate(arr.tolist())])

    pb = ProgressBar(len(clf.data), start=True)
    overall_results = []
    if feature_names is None:
        feature_names = clf.feature_names
    if region_names is None:
        region_names = range(1, len(clf.data) + 1)
    # For each region, run boot_n permutations in parallel and collect the results
    for reg, (X, y) in enumerate(clf.data):
        results = Parallel(n_jobs=n_jobs)(delayed(permutation_parallel)(
            X, y, clf.classifier, feature_names, reg, i) for i in range(boot_n))
        for result in results:
            for res in result:
                overall_results.append(res)
        pb.next()
    # Combine permuted data into a dataframe
    perm_results = pd.DataFrame(
        overall_results, columns=['region', 'perm_n', 'topic_name', 'lor'])
    # Z-score the observed log odds ratios against the permuted null distribution
    lor = pd.DataFrame(clf.importance,
                       index=range(1, clf.importance.shape[0] + 1),
                       columns=feature_names)
    lor_z = lor.apply(lambda x: z_score_array(
        x, perm_results[perm_results.topic_name == x.name]))
    lor_z.index = region_names
    # Transform to long format and add p-values
    all_roi_z = pd.melt(pd.concat([lor_z]).reset_index(),
                        value_name='lor_z', id_vars='index')
    all_roi_z = all_roi_z.rename(columns={'index': 'ROI'})
    all_roi_z['p'] = (1 - norm.cdf(all_roi_z.lor_z.abs())) * 2
    return all_roi_z
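# Not from the original source: a tiny worked example of the two-tailed p-value
# used on the last line of permute_log_odds, p = (1 - norm.cdf(|z|)) * 2.
import numpy as np
from scipy.stats import norm

z = np.array([0.0, 1.96, 2.58])
p = (1 - norm.cdf(np.abs(z))) * 2
print(p)  # approximately [1.0, 0.05, 0.0099]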
def test(dataloader):
    pbar = ProgressBar(n_total=len(dataloader), desc='Testing')
    valid_loss = AverageMeter()
    valid_acc = AverageMeter()
    valid_f1 = AverageMeter()
    model.eval()
    count = 0
    with torch.no_grad():
        for batch_idx, batch in enumerate(dataloader):
            b_features, b_target, b_idx = (batch['features'].to(DEVICE),
                                           batch['target'].to(DEVICE),
                                           batch['idx'].to(DEVICE))
            logits, probs = model(b_features)
            loss = F.cross_entropy(logits, b_target).item()
            # get the index of the max log-probability
            pred = probs.argmax(dim=1, keepdim=True)
            correct = pred.eq(b_target.view_as(pred)).sum().item()
            f1 = f1_score(pred.to("cpu").numpy(),
                          b_target.to("cpu").numpy(),
                          average='macro')
            valid_f1.update(f1, n=b_features.size(0))
            valid_loss.update(loss, n=b_features.size(0))
            valid_acc.update(correct, n=1)
            count += b_features.size(0)
            pbar(step=batch_idx)
    return {
        'valid_loss': valid_loss.avg,
        'valid_acc': valid_acc.sum / count,
        'valid_f1': valid_f1.avg
    }
def prune(OpType, model, init_cuda):
    progress = ProgressBar('Pruning')
    device, ctx, stream = init_cuda()
    # Restore progress
    X = np.empty((0, OpType.Nshapes))
    Y = np.empty((0, OpType.Nparams - OpType.Nshapes), dtype=np.uint32)
    V = valid_configurations(OpType, device)
    # Update
    i = Y.shape[0]
    S = bench_shapes(OpType, device)
    for i, x in enumerate(S):
        perf, y = maximize(OpType, model, x, V, device, ctx, stream)
        X = np.vstack((X, x))
        Y = np.vstack((Y, y))
        progress.update(i, len(S))
        print(x, perf)
    # Remove duplicates
    Y = np.vstack(set(map(tuple, Y)))
    return Y
def bootstrap_log_odds(clf, boot_n, feature_names=None, region_names=None, n_jobs=1):
    def percentile(n):
        def percentile_(x):
            return np.percentile(x, n)
        percentile_.__name__ = 'percentile_%s' % n
        return percentile_

    pb = ProgressBar(len(clf.data), start=True)
    if feature_names is None:
        feature_names = clf.feature_names
    if region_names is None:
        region_names = range(1, len(clf.data) + 1)
    # For each region, calculate bootstrapped lor estimates in parallel
    overall_boot = []
    for reg, (X, y) in enumerate(clf.data):
        results = Parallel(n_jobs=n_jobs)(delayed(bootstrap_parallel)(
            X, y, clf.classifier, feature_names, region_names[reg], i)
            for i in range(boot_n))
        for result in results:
            for res in result:
                overall_boot.append(res)
        pb.next()
    overall_boot = pd.DataFrame(
        overall_boot, columns=['region', 'perm_n', 'topic_name', 'fi'])
    # Summarize the bootstrapped samples with the mean and percentile-based confidence bounds
    return overall_boot.groupby(['region', 'topic_name'])['fi'].agg({
        'mean': np.mean,
        'low_ci': percentile(0.05),
        'hi_ci': percentile(99.95)
    }).reset_index()
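# Not from the original source: a self-contained demonstration of the
# percentile() closure factory used inside bootstrap_log_odds above. It builds
# named aggregation functions suitable for pandas' groupby().agg().
import numpy as np

def percentile(n):
    def percentile_(x):
        return np.percentile(x, n)
    percentile_.__name__ = 'percentile_%s' % n
    return percentile_

low_ci = percentile(0.05)
print(low_ci.__name__)         # 'percentile_0.05'
print(low_ci(np.arange(101)))  # 0.05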
def _valid_step(model: tf.keras.Model,
                dataset: tf.data.Dataset,
                progress_bar: ProgressBar,
                loss_metric: tf.keras.metrics.Mean,
                max_train_steps: Any = -1) -> Dict:
    """
    Validation step.
    :param model: model to validate
    :param dataset: validation dataset
    :param progress_bar: progress manager
    :param loss_metric: loss metric
    :param max_train_steps: number of validation steps
    :return: validation metrics
    """
    print("Validation round")
    start_time = time.time()
    loss_metric.reset_states()
    for (batch, (train_enc, train_dec, month_enc, month_dec,
                 labels)) in enumerate(dataset.take(max_train_steps)):
        train_enc = tf.squeeze(train_enc, axis=0)
        train_dec = tf.squeeze(train_dec, axis=0)
        outputs = model(inputs=[train_enc, train_dec, month_enc, month_dec])
        treat_outputs = tf.squeeze(input=outputs[:, -24:, :], axis=-1)
        loss = tf.keras.losses.MSE(labels, treat_outputs)
        loss_metric(loss)
        progress_bar(
            current=batch + 1,
            metrics=get_dict_string(data={"valid_loss": loss_metric.result()}))
    progress_bar(
        current=progress_bar.total,
        metrics=get_dict_string(data={"valid_loss": loss_metric.result()}))
    progress_bar.done(step_time=time.time() - start_time)
    return {"valid_loss": loss_metric.result()}
def classify(self, scoring='accuracy', n_jobs=1, importance_function=None):
    """
    scoring - scoring function or type (str)
    n_jobs - number of parallel jobs
    importance_function - function to extract importance vectors from
        classifiers (differs by algorithm)
    """
    if importance_function is None:
        importance_function = log_odds_ratio
    if self.data is None:
        self.load_data()
    self.initalize_containers()
    print("Classifying...")
    pb = ProgressBar(self.n_regions, start=True)
    for index, output in enumerate(
            Parallel(n_jobs=n_jobs)(delayed(classify_parallel)(
                self.classifier, scoring, region_data, importance_function)
                for region_data in self.data)):
        self.class_score[index] = output['score']
        self.importance[index] = output['importance']
        self.predictions[index] = output['predictions']
        pb.next()
def train(dataloader):
    pbar = ProgressBar(n_total=len(dataloader), desc='Training')
    train_loss = AverageMeter()
    model.train()
    for batch_idx, batch in enumerate(dataloader):
        b_features, b_target, b_idx = (batch['features'].to(DEVICE),
                                       batch['target'].to(DEVICE),
                                       batch['idx'].to(DEVICE))
        optimizer.zero_grad()
        with autocast():
            logits, probs = model(b_features)
            loss = F.cross_entropy(logits, b_target)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        pbar(step=batch_idx, info={'loss': loss.item()})
        train_loss.update(loss.item(), n=1)
    return {'loss': train_loss.avg}
def test(dataloader):
    pbar = ProgressBar(n_total=len(dataloader), desc='Testing')
    valid_loss = AverageMeter()
    valid_acc = AverageMeter()
    count = 0
    for batch_idx, batch in enumerate(dataloader):
        # forward -- skip backward prop
        probas = model.forward(batch['features'])
        # record loss
        loss = model._logit_cost(batch['target'], probas)
        # get predictions
        prediction = torch.where(probas > 0.5,
                                 torch.tensor(1, device=device),
                                 torch.tensor(0, device=device)).view(-1)
        # compare
        correct = prediction.eq(batch['target']).sum().item()
        valid_loss.update(loss.item(), n=batch['features'].size(0))
        valid_acc.update(correct, n=1)
        count += batch['features'].size(0)
        pbar(step=batch_idx)
    return {'valid_loss': valid_loss.avg, 'valid_acc': valid_acc.sum / count}
def train(dataloader):
    pbar = ProgressBar(n_total=len(dataloader), desc='Training')
    train_loss = AverageMeter()
    for batch_idx, batch in enumerate(dataloader):
        # forward
        probas = model.forward(batch['features'])
        # backward
        grad_w, grad_b = model.backward(batch['features'], batch['target'],
                                        probas)
        # manual regularization -- account for mini-batches
        l2_reg = model.LAMBDA * model.weights / len(dataloader)
        # update weights
        model.weights -= learning_rate * (grad_w + l2_reg)
        model.bias -= learning_rate * grad_b
        # record loss
        loss = model._logit_cost(batch['target'], probas)
        # update meter
        train_loss.update(loss.item(), n=1)
        # update progress bar
        pbar(step=batch_idx, info={'batch_loss': loss.item()})
    return {'train_loss': train_loss.avg}
def train(dataloader):
    pbar = ProgressBar(n_total=len(dataloader), desc='Training')
    train_loss = AverageMeter()
    for batch_idx, batch in enumerate(dataloader):
        # forward
        y_hat = model.forward(batch['features'].float())
        # backward
        grad_w, grad_b = model.backward(batch['features'], batch['target'],
                                        y_hat)
        # manual regularization
        l2_reg = model.LAMBDA * model.weights
        l2_reg = l2_reg.reshape(2, 1)
        # update weights
        model.weights -= learning_rate * (grad_w + l2_reg).view(-1)
        model.bias -= (learning_rate * grad_b).view(-1)
        # record loss
        loss = model.loss(batch['target'], y_hat)
        # update meter
        train_loss.update(loss.item(), n=1)
        # update progress bar
        pbar(step=batch_idx, info={'batch_loss': loss.item()})
    return {'train_loss': train_loss.avg}
def train(dataloader):
    pbar = ProgressBar(n_total=len(dataloader), desc='Training')
    train_loss = AverageMeter()
    model.train()
    for batch_idx, batch in enumerate(dataloader):
        b_features, b_target, b_idx = (batch['features'].to(DEVICE),
                                       batch['target'].to(DEVICE),
                                       batch['idx'].to(DEVICE))
        optimizer.zero_grad()
        with autocast():
            logits, probs = model(b_features)
            loss = F.cross_entropy(logits, b_target)
            # regularize loss -- but not the intercept
            LAMBDA, L2 = 2, 0.
            for name, p in model.named_parameters():
                if 'weight' in name:
                    L2 = L2 + (p**2).sum()
            loss = loss + 2. / b_target.size(0) * LAMBDA * L2
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        pbar(step=batch_idx, info={'loss': loss.item()})
        train_loss.update(loss.item(), n=1)
    return {'loss': train_loss.avg}
def main(argv):
    pathFinder = PathFinder()
    numTravellers = 2
    combinationLimit = 1000000
    minPathLength = -1
    maxPathLength = -1
    maxEdgeRedundancy = -1
    guiFormat = False
    fileName = ""
    try:
        opts, args = getopt.getopt(argv, "?gst:l:f:i:a:chr:",
                                   ["help", "guiFormat", "silent", "travellers=",
                                    "limit=", "filename=", "min=", "max=",
                                    "cyclic", "allowHomes", "allowhomes",
                                    "redundancy="])
        for opt, arg in opts:
            if opt in ("-?", "--help"):
                usage()
                sys.exit(1)
            if opt in ("-t", "--travellers"):
                numTravellers = int(arg)
            if opt in ("-l", "--limit"):
                combinationLimit = int(arg) * 1000000
            if opt in ("-f", "--filename"):
                fileName = arg
            if opt in ("-i", "--min"):
                minPathLength = int(arg) + 1
            if opt in ("-a", "--max"):
                maxPathLength = int(arg) + 1
            if opt in ("-r", "--redundancy"):
                maxEdgeRedundancy = int(arg)
            if opt in ("-c", "--cyclic"):
                pathFinder.useCyclicBFS = True
            if opt in ("-h", "--allowHomes", "--allowhomes"):
                pathFinder.canPassHomeNodes = True
            if opt in ("-g", "--guiFormat", "--guiformat"):
                guiFormat = True
            if opt in ("-s", "--silent"):
                global SILENT_MODE
                SILENT_MODE = True
        if len(fileName) == 0:
            usage()
            sys.exit(2)
    except getopt.GetoptError:
        usage()
        sys.exit(2)
    progressBar = ProgressBar()
    testGraph = parseGraph(fileName)
    Message("\n* Solving for " + str(numTravellers) + " traveller(s)")
    if combinationLimit > 0:
        Message("* Considering at most " + str(combinationLimit) + " combinations.")
    else:
        Message("* Attempting to solve all combinations.")
    homeNodeIds = testGraph.GetHomeNodeIds()
    homeNodePairs = itertools.combinations(homeNodeIds, 2)
    solutions = []
    # FindAllPaths for every pair of home nodes
    for p in homeNodePairs:
        for s in pathFinder.FindAllPaths(testGraph, p[0], p[1]):
            if (minPathLength == -1 or len(s) >= minPathLength) and \
                    (maxPathLength == -1 or len(s) <= maxPathLength):
                solutions.append(s)
    # Generate solution sets
    solutions.sort()
    Message("Discovered " + str(len(solutions)) + " paths for all home nodes.")
    combinations = itertools.combinations(solutions, numTravellers)
    solutionSets = []
    numMillions = 1
    # if combinationLimit > 0:
    currentCombination = 0
    for c in combinations:
        if currentCombination == combinationLimit and combinationLimit > 0:
            break
        if currentCombination > numMillions * 1000000:
            Warning("** WARNING: over " + str(numMillions) + " million combinations.")
            numMillions = numMillions + 1
        solutionSets.append(c)
        currentCombination = currentCombination + 1
    # else:
    #     solutionSets = list(combinations)
    Message("* Spawned " + str(len(solutionSets)) + " combinations.")
    # Get rid of the gazillions of duplicate entries
    Message("* Filtering combinations, this may take a while...")
    solutionSets.sort()
    solutionSets = list(solutionSets for solutionSets, _ in itertools.groupby(solutionSets))
    totalNumSets = len(solutionSets)
    Message("* Will check " + str(totalNumSets) + " unique sets")
    possibleSolutions = []
    currentSetNum = 0
    solutionNum = 1
    for s in solutionSets:
        if not SILENT_MODE:
            progressBar.draw(currentSetNum, totalNumSets)
        currentSetNum = currentSetNum + 1
        testGraph.Reset()
        possibleSolution = testGraph.IsSolvableForSet(s)
        if possibleSolution is not None:
            Message("\rSolution " + str(solutionNum) + " " + str(possibleSolution))
            # Check how many edges are left unused; the fewer, the better
            unusedEdges = testGraph.GetFreeEdges()
            possibleSolutions.append((possibleSolution, unusedEdges))
            solutionNum = solutionNum + 1
    if not SILENT_MODE:
        progressBar.draw(currentSetNum, totalNumSets)
    Message("\n")
    # Sort solutions by number of unused edges
    possibleSolutions.sort(key=lambda possibleSolutions: len(possibleSolutions[1]))
    numSolutionsListed = 0
    guiFormatDataList = []  # container of guiFormatData
    for s in possibleSolutions:
        solutionString = str(s[0]) + " "
        guiFormatData = dict()
        guiFormatData['Paths'] = s[0]
        guiFormatData['PathEndNodes'] = []
        guiFormatData['MoveLimits'] = []
        for element in s[0]:
            startPoint = "(SP: " + str(element[0]) + "|" + str(element[len(element) - 1]) + \
                         " ML: " + str(len(element) - 1) + ") "
            solutionString += startPoint
            guiFormatData['PathEndNodes'].append((element[0], element[len(element) - 1]))
            guiFormatData['MoveLimits'].append(len(element) - 1)
        solutionString += "RE: " + str(len(s[1])) + " "
        redundantEdgeIdList = []
        for e in s[1]:
            redundantEdgeIdList.append(e.id)
        guiFormatData['RedundantEdgeIds'] = redundantEdgeIdList
        if len(s[1]) > 0:
            unusedEdgesStr = ""
            for ue in s[1]:
                unusedEdgesStr += "(" + str(ue.connectedNodes[0].id) + "-" + str(ue.connectedNodes[1].id) + ")"
            solutionString += "[" + unusedEdgesStr + "]"
        if maxEdgeRedundancy < 0 or len(s[1]) <= maxEdgeRedundancy:
            numSolutionsListed = numSolutionsListed + 1
            guiFormatDataList.append(guiFormatData)
            print(solutionString)
    guiDataOutput = open('output.txt', 'wb')
    pickle.dump(guiFormatDataList, guiDataOutput, -1)
    guiDataOutput.close()
    if len(possibleSolutions) == 0:
        Warning("*** NO SOLUTIONS FOUND. ***\n")
        sys.exit(1)
    else:
        Message("\nFound " + str(len(possibleSolutions)) + " solutions. ")
        Message("\nListed " + str(numSolutionsListed) + " solutions. ")
def train(model: tf.keras.Model,
          checkpoint: tf.train.CheckpointManager,
          batch_size: Any,
          epochs: Any,
          train_dataset: Any,
          valid_dataset: AnyStr = None,
          max_train_steps: Any = -1,
          checkpoint_save_freq: Any = 2,
          *args, **kwargs) -> Dict:
    """
    Trainer.
    :param model: model to train
    :param checkpoint: checkpoint manager
    :param batch_size: batch size
    :param epochs: number of training epochs
    :param train_dataset: training dataset
    :param valid_dataset: validation dataset
    :param max_train_steps: maximum amount of training data, -1 for all
    :param checkpoint_save_freq: checkpoint save frequency
    :return:
    """
    print("Training started, preparing data")
    # learning_rate = CustomSchedule(d_model=embedding_dim)
    loss_metric = tf.keras.metrics.Mean(name="train_loss_metric")
    optimizer = tf.optimizers.Adam(learning_rate=2e-5, beta_1=0.9, beta_2=0.999,
                                   name="optimizer")
    train_steps_per_epoch = max_train_steps if max_train_steps != -1 else (
        40000 // batch_size)
    valid_steps_per_epoch = 3944 // batch_size
    progress_bar = ProgressBar()
    for epoch in range(epochs):
        print("Epoch {}/{}".format(epoch + 1, epochs))
        start_time = time.time()
        loss_metric.reset_states()
        progress_bar.reset(total=train_steps_per_epoch, num=batch_size)
        train_metric = None
        for (batch, (train_enc, train_dec, month_enc, month_dec,
                     labels)) in enumerate(train_dataset.take(max_train_steps)):
            train_metric, prediction = _train_step(model=model,
                                                   optimizer=optimizer,
                                                   loss_metric=loss_metric,
                                                   train_enc=train_enc,
                                                   train_dec=train_dec,
                                                   month_enc=month_enc,
                                                   month_dec=month_dec,
                                                   labels=labels)
            progress_bar(current=batch + 1,
                         metrics=get_dict_string(data=train_metric))
        progress_bar(current=progress_bar.total,
                     metrics=get_dict_string(data=train_metric))
        progress_bar.done(step_time=time.time() - start_time)
        if (epoch + 1) % checkpoint_save_freq == 0:
            checkpoint.save()
            if valid_steps_per_epoch == 0 or valid_dataset is None:
                print("Validation data is too small (less than batch_size), skipping validation round")
            else:
                progress_bar.reset(total=valid_steps_per_epoch, num=batch_size)
                valid_metrics = _valid_step(model=model,
                                            dataset=valid_dataset,
                                            progress_bar=progress_bar,
                                            loss_metric=loss_metric,
                                            **kwargs)
    print("Training finished")
    return {}
def download(tickers: list,
             start: Union[str, int] = None,
             end: Union[str, int] = None,
             interval: str = "1d") -> dict:
    """
    Download historical data for tickers in the list.

    Parameters
    ----------
    tickers: list
        Tickers for which to download historical information.
    start: str or int
        Start downloading data from this date.
    end: str or int
        Stop downloading data at this date.
    interval: str
        Frequency between data points.

    Returns
    -------
    data: dict
        Dictionary including the following keys:
        - tickers: list of tickers;
        - dates: array of dates corresponding to the data;
        - price: array of adjusted closing prices, shape=(num stocks, length period);
        - volume: array of volumes, shape=(num stocks, length period);
        - currencies: list of currencies, one per ticker;
        - exchange_rates: exchange rates to the default currency;
        - default_currency: most common currency among the tickers;
        - sectors: dictionary of stock sector for each ticker;
        - industries: dictionary of stock industry for each ticker.
    """
    tickers = tickers if isinstance(tickers, (list, set, tuple)) else tickers.replace(',', ' ').split()
    tickers = list(set([ticker.upper() for ticker in tickers]))
    data = {}
    si_columns = ["SYMBOL", "CURRENCY", "SECTOR", "INDUSTRY"]
    si_filename = "stock_info.csv"
    if not os.path.exists(si_filename):
        # create a .csv to store stock information
        with open(si_filename, 'w') as file:
            wr = csv.writer(file)
            wr.writerow(si_columns)
    # load stock information file
    si = pd.read_csv(si_filename)
    missing_tickers = [
        ticker for ticker in tickers if ticker not in si['SYMBOL'].values
    ]
    missing_si, na_si = {}, {}
    currencies = {}
    if end is None:
        end = int(dt.datetime.timestamp(dt.datetime.today()))
    elif type(end) is str:
        end = int(dt.datetime.timestamp(dt.datetime.strptime(end, '%Y-%m-%d')))
    if start is None:
        start = int(dt.datetime.timestamp(dt.datetime.today() - dt.timedelta(365)))
    elif type(start) is str:
        start = int(dt.datetime.timestamp(dt.datetime.strptime(start, '%Y-%m-%d')))

    @multitasking.task
    def _download_one_threaded(ticker: str,
                               start: str,
                               end: str,
                               interval: str = "1d"):
        """
        Download historical data for a single ticker with multithreading.
        Plus, it scrapes missing stock information.

        Parameters
        ----------
        ticker: str
            Ticker for which to download historical information.
        interval: str
            Frequency between data points.
        start: str
            Start downloading data from this date.
        end: str
            Stop downloading data at this date.
""" data_one = _download_one(ticker, start, end, interval) try: data_one = data_one["chart"]["result"][0] data[ticker] = _parse_quotes(data_one) if ticker in missing_tickers: currencies[ticker] = data_one['meta']['currency'] try: html = requests.get( url='https://finance.yahoo.com/quote/' + ticker).text json_str = html.split('root.App.main =')[1].split( '(this)')[0].split(';\n}')[0].strip() info = json.loads(json_str)['context']['dispatcher'][ 'stores']['QuoteSummaryStore']['summaryProfile'] assert (len(info['sector']) > 0) and (len(info['industry']) > 0) missing_si[ticker] = dict(sector=info["sector"], industry=info["industry"]) except: pass except: pass progress.animate() num_threads = min([len(tickers), multitasking.cpu_count() * 2]) multitasking.set_max_threads(num_threads) progress = ProgressBar(len(tickers), 'completed') for ticker in tickers: _download_one_threaded(ticker, start, end, interval) multitasking.wait_for_tasks() progress.completed() if len(data) == 0: raise Exception("No symbol with full information is available.") data = pd.concat(data.values(), keys=data.keys(), axis=1, sort=True) data.drop( columns=data.columns[data.isnull().sum(0) > 0.33 * data.shape[0]], inplace=True) data = data.fillna(method='bfill').fillna(method='ffill').drop_duplicates() info = zip(list(missing_si.keys()), [currencies[ticker] for ticker in missing_si.keys()], [v['sector'] for v in missing_si.values()], [v['industry'] for v in missing_si.values()]) with open(si_filename, 'a+', newline='') as file: wr = csv.writer(file) for row in info: wr.writerow(row) si = pd.read_csv('stock_info.csv').set_index("SYMBOL").to_dict( orient='index') missing_tickers = [ ticker for ticker in tickers if ticker not in data.columns.get_level_values(0)[::2].tolist() ] tickers = data.columns.get_level_values(0)[::2].tolist() if len(missing_tickers) > 0: print( '\nRemoving {} from list of symbols because we could not collect full information.' .format(missing_tickers)) # download exchange rates and convert to most common currency currencies = [ si[ticker]['CURRENCY'] if ticker in si else currencies[ticker] for ticker in tickers ] ucurrencies, counts = np.unique(currencies, return_counts=True) default_currency = ucurrencies[np.argmax(counts)] xrates = get_exchange_rates(currencies, default_currency, data.index, start, end, interval) return dict(tickers=tickers, dates=pd.to_datetime(data.index), price=data.iloc[:, data.columns.get_level_values(1) == 'Adj Close'].to_numpy().T, volume=data.iloc[:, data.columns.get_level_values(1) == 'Volume'].to_numpy().T, currencies=currencies, exchange_rates=xrates, default_currency=default_currency, sectors={ ticker: si[ticker]['SECTOR'] if ticker in si else "NA_" + ticker for ticker in tickers }, industries={ ticker: si[ticker]['INDUSTRY'] if ticker in si else "NA_" + ticker for ticker in tickers })
def download(tickers: list, interval: str = "1d", period: str = "1y"):
    """
    Download historical data for tickers in the list.

    Parameters
    ----------
    tickers: list
        Tickers for which to download historical information.
    interval: str
        Frequency between data points.
    period: str
        Data period to download.

    Returns
    -------
    data: dict
        Dictionary including the following keys:
        - tickers: list of tickers;
        - dates: array of dates corresponding to the data;
        - logp: array of log-adjusted closing prices, shape=(num stocks, length period);
        - volume: array of volumes, shape=(num stocks, length period);
        - sectors: list of stock sectors;
        - industries: list of stock industries.
    """
    tickers = tickers if isinstance(tickers, (list, set, tuple)) else tickers.replace(',', ' ').split()
    tickers = list(set([ticker.upper() for ticker in tickers]))
    data = {}
    si_columns = ["SYMBOL", "SECTOR", "INDUSTRY"]
    si_filename = "stock_info.csv"
    if not os.path.exists(si_filename):
        # create a .csv to store stock information
        with open(si_filename, 'w') as file:
            wr = csv.writer(file)
            for row in zip(*[[c] for c in si_columns]):
                wr.writerow(row)
    # load stock information file
    si = pd.read_csv(si_filename)
    missing_tickers = [
        ticker for ticker in tickers if ticker not in si['SYMBOL'].values
    ]
    missing_si, na_si = {}, {}

    @multitasking.task
    def _download_one_threaded(ticker: str,
                               interval: str = "1d",
                               period: str = "1y"):
        """
        Download historical data for a single ticker with multithreading.
        Plus, it scrapes missing stock information.

        Parameters
        ----------
        ticker: str
            Ticker for which to download historical information.
        interval: str
            Frequency between data points.
        period: str
            Data period to download.
        """
        data_one = _download_one(ticker, interval, period)
        try:
            data[ticker] = parse_quotes(data_one["chart"]["result"][0])
            if ticker in missing_tickers:
                try:
                    html = requests.get(
                        url='https://finance.yahoo.com/quote/' + ticker).text
                    json_str = html.split('root.App.main =')[1].split(
                        '(this)')[0].split(';\n}')[0].strip()
                    info = json.loads(json_str)['context']['dispatcher'][
                        'stores']['QuoteSummaryStore']['summaryProfile']
                    assert (len(info['sector']) > 0) and (len(info['industry']) > 0)
                    missing_si[ticker] = dict(sector=info["sector"],
                                              industry=info["industry"])
                except:
                    pass
        except:
            pass
        progress.animate()

    num_threads = min([len(tickers), multitasking.cpu_count() * 2])
    multitasking.set_max_threads(num_threads)
    progress = ProgressBar(len(tickers), 'completed')
    for ticker in tickers:
        _download_one_threaded(ticker, interval, period)
    multitasking.wait_for_tasks()
    progress.completed()
    if len(data) == 0:
        raise Exception("No symbol with full information is available.")
    data = pd.concat(data.values(), keys=data.keys(), axis=1)
    data.drop(
        columns=data.columns[data.isnull().sum(0) > 0.33 * data.shape[0]],
        inplace=True)
    data = data.fillna(method='bfill').fillna(method='ffill').drop_duplicates()
    info = zip(list(missing_si.keys()),
               [v['sector'] for v in missing_si.values()],
               [v['industry'] for v in missing_si.values()])
    with open(si_filename, 'a+', newline='') as file:
        wr = csv.writer(file)
        for row in info:
            wr.writerow(row)
    si = pd.read_csv('stock_info.csv').set_index("SYMBOL").to_dict(orient='index')
    missing_tickers = [
        ticker for ticker in tickers
        if ticker not in data.columns.get_level_values(0)[::2].tolist()
    ]
    tickers = data.columns.get_level_values(0)[::2].tolist()
    if len(missing_tickers) > 0:
        print('\nRemoving {} from list of symbols because we could not collect full information.'
              .format(missing_tickers))
    return dict(tickers=tickers,
                dates=pd.to_datetime(data.index),
                logp=np.log(data.iloc[:, data.columns.get_level_values(1) == 'Adj Close'].to_numpy().T),
                volume=data.iloc[:, data.columns.get_level_values(1) == 'Volume'].to_numpy().T,
                sectors=[
                    si[ticker]['SECTOR'] if ticker in si else "NA_" + ticker
                    for ticker in tickers
                ],
                industries=[
                    si[ticker]['INDUSTRY'] if ticker in si else "NA_" + ticker
                    for ticker in tickers
                ])