def download_monthly_klines(symbols, num_symbols, intervals, years, months, checksum):
    """Download monthly spot kline zip archives for every symbol.

    Parameters:
        symbols: iterable of trading-pair symbols (e.g. "BTCUSDT").
        num_symbols: total symbol count, used only for progress output.
        intervals: kline intervals to fetch (e.g. "1m", "1h").
        years: years whose monthly archives are wanted.
        months: month numbers (ints) whose archives are wanted.
        checksum: when 1, also download the matching ``.zip.CHECKSUM`` file.
    """
    current = 0
    print("Found {} symbols".format(num_symbols))
    for symbol in symbols:
        print("[{}/{}] - start download monthly {} klines ".format(
            current + 1, num_symbols, symbol))
        # BUG FIX: the original iterated args.intervals / args.years /
        # args.months, silently ignoring the function parameters and
        # depending on a module-level `args` object. Use the parameters.
        for interval in intervals:
            for year in years:
                for month in months:
                    path = "data/spot/monthly/klines/{}/{}/".format(
                        symbol.upper(), interval)
                    file_name = "{}-{}-{}-{}.zip".format(
                        symbol.upper(), interval, year, '{:02d}'.format(month))
                    download_file(path, file_name)
                    if checksum == 1:
                        checksum_path = "data/spot/monthly/klines/{}/{}/".format(
                            symbol.upper(), interval)
                        checksum_file_name = "{}-{}-{}-{}.zip.CHECKSUM".format(
                            symbol.upper(), interval, year, '{:02d}'.format(month))
                        download_file(checksum_path, checksum_file_name)
        current += 1
def download_daily_aggTrades(trading_type, symbols, num_symbols, dates, start_date, end_date, folder, checksum):
    """Fetch the daily aggTrades archive for each symbol/date in range.

    A date is downloaded only when it falls within [start_date, end_date];
    a missing bound falls back to START_DATE / END_DATE. When ``checksum``
    is 1 the matching ``.zip.CHECKSUM`` file is fetched alongside.
    """
    date_range = None
    if start_date and end_date:
        date_range = start_date + " " + end_date
    # Normalize the textual bounds into date objects (or the defaults).
    start_date = convert_to_date_object(start_date) if start_date else START_DATE
    end_date = convert_to_date_object(end_date) if end_date else END_DATE
    print("Found {} symbols".format(num_symbols))
    for position, symbol in enumerate(symbols, start=1):
        print("[{}/{}] - start download daily {} aggTrades ".format(position, num_symbols, symbol))
        for date in dates:
            if not (start_date <= convert_to_date_object(date) <= end_date):
                continue
            base_path = get_path(trading_type, "aggTrades", "daily", symbol)
            archive_name = "{}-aggTrades-{}.zip".format(symbol.upper(), date)
            download_file(base_path, archive_name, date_range, folder)
            if checksum == 1:
                checksum_base = get_path(trading_type, "aggTrades", "daily", symbol)
                checksum_name = "{}-aggTrades-{}.zip.CHECKSUM".format(symbol.upper(), date)
                download_file(checksum_base, checksum_name, date_range, folder)
def analysis_deep_n(deep, gene, gene_hsa, pathway_this_gene, path, occu):
    """Download one pathway's KGML archive and parse it into gene rows.

    Returns the row list produced by ``read_kgml`` for this pathway.
    """
    kgml_url = 'http://rest.kegg.jp/get/' + pathway_this_gene + '/kgml'
    xml_directory = os.path.join(os.getcwd(), 'database', 'pathways', 'xml')
    download_file(kgml_url, xml_directory, pathway_this_gene + '.xml.gz')
    return read_kgml(deep, pathway_this_gene, gene, gene_hsa, path, occu)
def install_dependency(self, dependencyName, version, url, installDirectoryRelPath):
    """Download, extract and register a dependency under the dependencies dir.

    Returns True once the dependency has been recorded as installed.
    """
    archivePath = utility.download_file(url, self.download_directory)
    utility.clear_directory_contents(self.extraction_directory)
    utility.extract_file(archivePath, self.extraction_directory)
    os.remove(archivePath)
    # Re-installing: drop any previously registered copy first.
    if self.installedDependencies.is_installed(dependencyName):
        self.remove_dependency(dependencyName)
    # not sure wether to add this or not (can cause serious impact)
    #if os.path.exists(installDirectory):
    #    utility.log("installation directory {i} for dependency {d} already exist, overwriting it...".format(i=installDirectory,d=dependencyName))
    #    shutil.rmtree(installDirectory)
    installDirectory = utility.joinPaths(self.dependencies_directory, installDirectoryRelPath)
    utility.ensure_directory(installDirectory)
    # if the archive top level contains only one directory,copy its contents(not the directory itself)
    extractedEntries = os.listdir(self.extraction_directory)
    if len(extractedEntries) == 1:
        soleEntry = utility.joinPaths(self.extraction_directory, extractedEntries[0])
        if os.path.isdir(soleEntry):
            utility.move_directory_contents(soleEntry, installDirectory)
            os.rmdir(soleEntry)
        else:
            utility.move_directory_contents(self.extraction_directory, installDirectory)
    else:
        utility.move_directory_contents(self.extraction_directory, installDirectory)
    self.installedDependencies.add_dependency(dependencyName, version, installDirectoryRelPath)
    return True
def download_monthly_klines(symbols, num_symbols, intervals, years, months, start_date, end_date, folder, checksum):
    """Download monthly spot kline archives restricted to a date window.

    Only (year, month) combinations whose first day falls inside
    [start_date, end_date] are fetched; a missing bound falls back to
    START_DATE / END_DATE. With ``checksum == 1`` the companion
    ``.zip.CHECKSUM`` file is downloaded as well.
    """
    date_range = None
    if start_date and end_date:
        date_range = start_date + " " + end_date
    # Normalize the textual bounds into date objects (or the defaults).
    start_date = convert_to_date_object(start_date) if start_date else START_DATE
    end_date = convert_to_date_object(end_date) if end_date else END_DATE
    print("Found {} symbols".format(num_symbols))
    for position, symbol in enumerate(symbols, start=1):
        print("[{}/{}] - start download monthly {} klines ".format(position, num_symbols, symbol))
        for interval in intervals:
            # Path only depends on symbol/interval, so hoist it out of the
            # year/month loops (same string every iteration).
            base_path = "data/spot/monthly/klines/{}/{}/".format(symbol.upper(), interval)
            for year in years:
                for month in months:
                    first_of_month = convert_to_date_object('{}-{}-01'.format(year, month))
                    if not (start_date <= first_of_month <= end_date):
                        continue
                    archive_name = "{}-{}-{}-{}.zip".format(
                        symbol.upper(), interval, year, '{:02d}'.format(month))
                    download_file(base_path, archive_name, date_range, folder)
                    if checksum == 1:
                        checksum_name = "{}-{}-{}-{}.zip.CHECKSUM".format(
                            symbol.upper(), interval, year, '{:02d}'.format(month))
                        download_file(base_path, checksum_name, date_range, folder)
def download_daily_klines(symbols, num_symbols, intervals, dates, start_date, end_date, folder, checksum):
    """Download daily spot kline archives restricted to a date window.

    Intervals are intersected with DAILY_INTERVALS first; dates outside
    [start_date, end_date] are skipped (missing bounds default to
    START_DATE / END_DATE). With ``checksum == 1`` the companion
    ``.zip.CHECKSUM`` file is downloaded as well.
    """
    date_range = None
    if start_date and end_date:
        date_range = start_date + " " + end_date
    # Normalize the textual bounds into date objects (or the defaults).
    start_date = convert_to_date_object(start_date) if start_date else START_DATE
    end_date = convert_to_date_object(end_date) if end_date else END_DATE
    #Get valid intervals for daily
    intervals = list(set(intervals) & set(DAILY_INTERVALS))
    print("Found {} symbols".format(num_symbols))
    for position, symbol in enumerate(symbols, start=1):
        print("[{}/{}] - start download daily {} klines ".format(position, num_symbols, symbol))
        for interval in intervals:
            base_path = "data/spot/daily/klines/{}/{}/".format(symbol.upper(), interval)
            for date in dates:
                if not (start_date <= convert_to_date_object(date) <= end_date):
                    continue
                archive_name = "{}-{}-{}.zip".format(symbol.upper(), interval, date)
                download_file(base_path, archive_name, date_range, folder)
                if checksum == 1:
                    checksum_name = "{}-{}-{}.zip.CHECKSUM".format(symbol.upper(), interval, date)
                    download_file(base_path, checksum_name, date_range, folder)
def download_daily_aggTrades(symbols, num_symbols, dates, checksum):
    """Download the daily aggTrades archive for every symbol and date.

    When ``checksum`` is 1 the matching ``.zip.CHECKSUM`` file is fetched
    right after each archive.
    """
    print("Found {} symbols".format(num_symbols))
    for position, symbol in enumerate(symbols, start=1):
        print("[{}/{}] - start download daily {} aggTrades ".format(position, num_symbols, symbol))
        # Path depends only on the symbol, so compute it once per symbol.
        base_path = "data/spot/daily/aggTrades/{}/".format(symbol.upper())
        for date in dates:
            archive_name = "{}-aggTrades-{}.zip".format(symbol.upper(), date)
            download_file(base_path, archive_name)
            if checksum == 1:
                checksum_name = "{}-aggTrades-{}.zip.CHECKSUM".format(symbol.upper(), date)
                download_file(base_path, checksum_name)
def cmd_download(args):
    """ downloading one or more packages without monitoring them"""
    downloadDirectory = utility.joinPaths(os.getcwd(), args.directory)
    # "name@version" -> [name, version]; a bare "name" defaults to "latest".
    packages = []
    for spec in args.packages:
        if '@' in spec:
            packages.append(spec.split('@'))
        else:
            packages.append([spec, "latest"])
    utility.ensure_directory(downloadDirectory)
    registryClient = get_registry_client()
    if not registryClient:
        raise Exception("registry server is not set, please set it using set-registry-server command")
    repositoryClient = get_repository_client()
    for name, version in packages:
        try:
            package_handler = registryClient.get_package_details(name)
        except Exception as e:
            utility.log(str(e))
            continue
        if version == 'latest':
            version = get_latest_version(package_handler.get_package_versions())
            # '0.0' is the sentinel for "no versions published".
            if version == '0.0':
                utility.log("Package {p} is not in the ppm registry".format(p=name))
                continue
        else:
            # Normalize the requested version string and verify it exists.
            version = str(StrictVersion(version))
            if not package_handler.check_version_existence(version):
                utility.log("Package {p} is not in the ppm registry".format(p=name))
                continue
        url = package_handler.get_package_url(version)
        # check for repository url
        if repositoryClient:
            repository_url = repositoryClient.get_package_repository_url(url)
            if repository_url:
                url = repository_url
        utility.download_file(url, downloadDirectory)
def download_daily_klines(symbols, num_symbols, intervals, dates, checksum):
    """Download daily kline archives for each symbol, interval and date.

    Intervals are first intersected with DAILY_INTERVALS; with
    ``checksum == 1`` the companion ``.zip.CHECKSUM`` file is fetched too.
    """
    #Get valid intervals for daily
    intervals = list(set(intervals) & set(DAILY_INTERVALS))
    print("Found {} symbols".format(num_symbols))
    for position, symbol in enumerate(symbols, start=1):
        print("[{}/{}] - start download daily {} klines ".format(position, num_symbols, symbol))
        for interval in intervals:
            # Same path for every date of this symbol/interval pair.
            base_path = "data/spot/daily/klines/{}/{}/".format(symbol.upper(), interval)
            for date in dates:
                archive_name = "{}-{}-{}.zip".format(symbol.upper(), interval, date)
                download_file(base_path, archive_name)
                if checksum == 1:
                    checksum_name = "{}-{}-{}.zip.CHECKSUM".format(symbol.upper(), interval, date)
                    download_file(base_path, checksum_name)
def run_analysis(starting_depth):
    """Breadth-first expansion of the KEGG gene/pathway tree up to gl.deep_input.

    For each depth level: collect new gene rows (in parallel via joblib),
    merge them into the global dataframe ``gl.DF_TREE``, collapse duplicate
    genes of that level, and export the level to CSV.

    Reads/writes module-level state on ``gl`` (DF_TREE, gene_input,
    pathway_input, deep_input, num_cores_input, ...).
    """
    for deep in range(starting_depth, gl.deep_input + 1):
        if deep == 1:
            # download initial pathway
            download_file(
                'http://rest.kegg.jp/get/' + gl.pathway_input + '/kgml',
                os.path.join(os.getcwd(), 'database', 'pathways', 'xml'),
                gl.pathway_input + '.xml.gz')
            # get info first gene from gene name
            hsa_finded, url_finded = get_info_gene_initial(
                gl.pathway_input, gl.gene_input)
            # set globals variables
            gl.gene_input_hsa = hsa_finded
            gl.gene_input_url = url_finded
            # read initial pathway, create and add genes to csv
            list_rows_df_returned = read_kgml(deep, gl.pathway_input,
                                              gl.gene_input, hsa_finded,
                                              gl.gene_input, 1)
            # add n genes found to the dataframe
            unified([list_rows_df_returned])
            # retrive other list pathways in reference to initial pathway
            list_pathways_this_gene = download_read_html(url_finded)
            # The pathway set as input from the config file is removed
            if gl.pathway_input in list_pathways_this_gene:
                list_pathways_this_gene.remove(gl.pathway_input)
            if len(list_pathways_this_gene) > 0:
                # process single gene on each CPUs available
                list_rows_df_returned = Parallel(n_jobs=gl.num_cores_input)(
                    delayed(
                        analysis_deep_n)(deep, gl.gene_input, hsa_finded,
                                         pathway_this_gene, gl.gene_input, 1)
                    for pathway_this_gene in set_progress_bar(
                        '[Deep: %d]' % deep,
                        str(len(list_pathways_this_gene)))
                    (list_pathways_this_gene))
                unified(list_rows_df_returned)
            else:
                print('[Deep: 1] Only directly connected genes were found')
        else:
            # Retrieve the genes found at depth-1, avoiding the input gene
            df_genes_resulted = (
                gl.DF_TREE[(gl.DF_TREE['deep'] == deep - 1) &
                           (gl.DF_TREE['name_son'] != gl.gene_input)])
            for index, row in set_progress_bar(
                    '[Deep: %d]' % deep, str(df_genes_resulted.shape[0]))(
                    df_genes_resulted.iterrows()):
                # Return a list of pathways about the gene passed in input
                list_pathways_this_gene = download_read_html(
                    row['url_kegg_son'])
                # The pathway set as input from the config file is removed, so as to avoid an endless loop
                # if gl.pathway_input in list_pathways_this_gene:
                #     list_pathways_this_gene.remove(gl.pathway_input)
                # process single gene on each CPUs available
                list_rows_df_returned = Parallel(n_jobs=gl.num_cores_input)(
                    delayed(analysis_deep_n)
                    (deep, row['name_son'], row['hsa_son'], pathway_this_gene,
                     row['fullpath'], row['occurrences'])
                    for pathway_this_gene in list_pathways_this_gene)
                unified(list_rows_df_returned)
        # ----- START DROP DUPLICATES -----
        # Duplicates of the same level are extracted and sorted in alphabetical order
        df_genes_this_level = (gl.DF_TREE[gl.DF_TREE['deep'] == deep])
        df_duplicated_filtered = df_genes_this_level[
            df_genes_this_level.duplicated(subset=['name_son'],
                                           keep=False)].sort_values('name_son')
        # The names of the genes that are duplicated are recovered
        list_name_genes_duplicated = df_duplicated_filtered.name_son.unique()
        # process single gene on each CPUs available
        list_rows_to_do_df_returned = Parallel(n_jobs=gl.num_cores_input)(
            delayed(get_info_row_duplicated)(df_duplicated_filtered,
                                             gene_duplicate)
            for gene_duplicate in list_name_genes_duplicated)
        # The number of occurrences of the found links is updated and the duplicates will be deleted
        clean_update_row_duplicates(list_rows_to_do_df_returned)
        # NOTE(review): this keeps ONLY the current level in DF_TREE — earlier
        # levels are dropped after their CSV export. Confirm this is intended
        # (the deep>1 branch only ever reads level deep-1, so it is consistent).
        gl.DF_TREE = gl.DF_TREE[(gl.DF_TREE['deep'] == deep)]
        # export in csv per deep
        export_data_for_deep(deep)
        # Row indexes are reset, because they are no longer sequential due to the elimination of duplicates
        gl.DF_TREE = gl.DF_TREE.reset_index(drop=True)
def _download(self, url_key):
    """Resolve *url_key* to a URL and target file name, then fetch it."""
    source_url = get_url(url_key)
    target_name = get_file_name(url_key)
    download_file(source_url, target_name)