def get_filelist(self, input_params, settings):
    """ uses WCS requests to generate filelist of files available at service/server

        Splits the received coverage listing into the Base products (those whose
        CoverageID contains the requested time-of-interest) and the
        GapFillingProducts (all remaining coverages).

        Returns: base_flist, base_mask_flist, gfp_flist, gfpmask_flist
        (the two mask lists are always empty -- cryoland products provide no masks)
    """
    cov_list = self.base_desceocover(input_params, settings, mask=False)
    # split up the received listing -- Base vs. GFPs.
    # NOTE: the previous implementation popped matching elements out of
    # cov_list while iterating over it, which skips the element following
    # every hit; the comprehensions below classify every element exactly once.
    # (The old "except ValueError" around str.find() was dead code --
    # str.find() never raises ValueError.)
    toi = input_params['toi']
    base_flist = [cov for cov in cov_list if toi in cov]
    gfp_flist = [cov for cov in cov_list if toi not in cov]
    # cryoland products do not have cloud-masks
    base_mask_flist = []
    gfpmask_flist = []
    return base_flist, base_mask_flist, gfp_flist, gfpmask_flist
def do_print_flist(name, a_list):
    """ prints a listing of the supplied filenames (eg. BaseImage, GapFillingProducts,
        and their respective Cloud-Mask filenames which are available/used)
    """
    # number the entries starting at 1, exactly like the old manual counter did
    for f_cnt, elem in enumerate(a_list, 1):
        print_log(settings, (name, f_cnt, ': ', elem))
def apply_scenario(self, gfp_flist, gfpmask_flist, scenario, base_flist, base_mask_flist):
    """ apply the selected scenario i.e. sort the gfp lists accordingly

        scenario 'T': newest first  (reverse the listings)
        scenario 'B': oldest first  (plain sort)
        scenario 'M': closest-in-time first -- merge the base product into the
                      sorted GFP listing, then alternate between the products
                      newer and older than the base product (newer ones first)

        Returns: the (re)ordered gfp_flist, gfpmask_flist
        Exits with code 3 on an unknown scenario.
    """
    def _interleave(newer, older):
        # alternate newer/older entries, newer ones first; replaces the
        # Python-2-only "map(None, newer, older)" zip-with-None-padding idiom
        merged = []
        for i in range(max(len(newer), len(older))):
            if i < len(newer):
                merged.append(newer[i])
            if i < len(older):
                merged.append(older[i])
        return merged

    if scenario == 'T':
        gfp_flist.reverse()
        gfpmask_flist.reverse()
        return gfp_flist, gfpmask_flist
    elif scenario == 'B':
        gfp_flist.sort()
        gfpmask_flist.sort()
        return gfp_flist, gfpmask_flist
    elif scenario == 'M':
        # work on copies so the caller's base lists are not modified
        gfp_tmp = list(gfp_flist)
        gfp_masktmp = list(gfpmask_flist)
        gfp_tmp.extend(base_flist)
        gfp_masktmp.extend(base_mask_flist)
        gfp_tmp.sort()
        gfp_masktmp.sort()
        # locate the base product inside the sorted listings
        toi_pos1 = gfp_tmp.index(base_flist[0])
        toi_pos2 = gfp_masktmp.index(base_mask_flist[0])
        newer_flist1 = gfp_tmp[toi_pos1 + 1:]
        older_flist1 = gfp_tmp[:toi_pos1]
        older_flist1.reverse()
        newer_flist2 = gfp_masktmp[toi_pos2 + 1:]
        older_flist2 = gfp_masktmp[:toi_pos2]
        older_flist2.reverse()
        # we always use the newer files first
        out_gfp = _interleave(newer_flist1, older_flist1)
        out_gfpm = _interleave(newer_flist2, older_flist2)
        return out_gfp, out_gfpm
    else:
        print_log(settings, '[Error] -- Choosen Scenario is not supported. Please use either T, B or M -- ')
        sys.exit(3)
# NOTE(review): fragment of a Python-2 WCS GetCoverage downloader, kept verbatim.
# What the visible code does:
#  - derives an output filename from the coverageID: if the ID has no known
#    raster extension, one is looked up in the global `file_ext` map from the
#    requested format; IDs ending in 'tiff'/'Tiff'/'TIFF'/'jpeg' are renormalised
#    via file_ext as well;
#  - writes the download to request_params['output'] if set, else to the global
#    `temp_storage`;
#  - opens the URL with urllib2, streams the body to the file, fsyncs, and
#    returns the HTTP status code.
# The outer `try:` has NO except/finally in this chunk -- its urllib2.URLError /
# TypeError handlers appear to live in a later fragment of the same function
# (see the chunk starting with "request_handle.close()").  Python-2-only
# constructs: dict.has_key() and "except IOError as (errno, strerror)".
# The timestamp `now` is computed but not used in the visible code -- presumably
# consumed by the error-file naming in the later fragment; verify against it.
def _execute_getcov_request(self, http_request, request_params): """ Executes the GetCoverage request based on the generated http_url and stores the receved/downloaded coverages in the defined output location. The filenames are set to the coverageIDs (with extension according to requested file type) plus the the current date and time. This timestamp is added to avoid accidently overwriting of received coverages having the same coverageID but a differerent extend (AOI) (i.e. multiple subsets of the same coverage). Output: prints out the submitted http_request stores the received datasets saves Error-XML (-> access_error_"TimeStamp".xml) at output location (in case of failure) Returns: HttpCode (if success) """ global file_ext now = time.strftime('_%Y%m%dT%H%M%S') if not request_params['coverageID'].endswith( ('tif','tiff','Tif','Tiff','TIFF','jpeg','jpg','png','gif','nc','hdf') ): out_ext = file_ext.get( str(request_params['format'].lower())) out_coverageID = request_params['coverageID']+'.'+out_ext else: out_coverageID = request_params['coverageID'] if request_params['coverageID'].endswith( ('tiff','Tiff','TIFF','jpeg') ): out_ext = file_ext.get( str(request_params['coverageID'][-4:].lower())) out_coverageID = request_params['coverageID'][:-4]+out_ext if request_params.has_key('output') and request_params['output'] is not None: outfile = request_params['output']+out_coverageID else: outfile = temp_storage+out_coverageID try: request_handle = urllib2.urlopen(http_request) status = request_handle.code try: file_getcov = open(outfile, 'w+b') file_getcov.write(request_handle.read()) file_getcov.flush() os.fsync(file_getcov.fileno()) file_getcov.close() request_handle.close() return status except IOError as (errno, strerror): err_msg = "I/O error({0}): {1}".format(errno, strerror) print_log(settings, err_msg) except: err_msg = "Unexpected error:", sys.exc_info()[0] print_log(settings, err_msg) raise
def dump_and_check_all_data(self) -> None:
    """Dump every ``*_json`` data set of this object to a JSON file.

    Collects all public methods whose name ends in "json" (names starting
    with "_" are excluded), calls each one and writes the result to
    "<name>.json" via dumps_json().  To add a new JSON output later, simply
    define a method ending in "_json" plus its matching "_"-prefixed internal
    variable -- it will be picked up automatically.
    """
    json_methods = [
        member[0]
        for member in inspect.getmembers(self)
        if member[0][-4:] == "json" and member[0][0] != "_"
    ]
    for method_name in json_methods:
        # method names end in "_json"; turn that suffix into the ".json" file extension
        json_name = method_name[:-5] + ".json"
        print_log("data_manager", f"Make {json_name}...")
        # resolve the method by name with getattr() -- replaces the original
        # eval("self." + name + "()"), which would execute arbitrary code if a
        # method name were ever attacker-controlled
        print_log("data_manager", f"Dumps {json_name}...")
        dumps_json(json_name, getattr(self, method_name)())
def filter_trie_mismatch_similarity(tempdir, name, kmers_trie, sim, keys_filenames, processes, n, parts): """Find keys in trie few mismatches away from other keys. Args: kmers_trie: trie.trie object of the form described in and returned by build_kmers_trie() sim: if a key has another key at Hamming distance at most this, label it as bad guideRNA keysfile: name of file where first field of each line is a k-mer, assume file is gzipped; loop only over the k-mers in this file """ # parts = 256 badkeysfiles = ['%s/badkeys%s.txt.gz' % (tempdir, i) for i in range(parts)] process_list = [] all_task = Queue() for i in range(parts): task = (keys_filenames[i], i, badkeysfiles[i]) all_task.put(task) for process in range(processes): p = Process(target=process_pool_mismatch, args=(all_task, kmers_trie, sim, n, parts)) p.start() process_list.append(p) for p in process_list: p.join() count = 0 for i in range(parts): badkeys = gzip.open(badkeysfiles[i]) for line in badkeys: string = line.strip().split() index = int(string[0]) kmer = string[1] if not kmers_trie[index].has_key(kmer): continue if kmers_trie[index][kmer][0] != 0: continue else: kmers_trie[index][kmer][0] = 1 count += 1 badkeys.close() print '%s k-mers labeled as bad guideRNAs' % count util.print_log('done') return kmers_trie
def __init__(self, completer):
    """Build the command dispatch table around the supplied completer."""
    self.completer = completer

    def _complete(args):
        # forward prefix/context/file_name straight to the completer
        return self.completer.complete(
            args['prefix'],
            args['context'],
            args['file_name'],
        )

    def _parse(args):
        return self.completer.parse(
            args['file_name'],
            args['content'],
        )

    def _log_status(args):
        return self.completer.log_status()

    # map command names to their handlers
    self.commands = {
        'complete': _complete,
        'parse': _parse,
        'log_status': _log_status,
    }
    util.print_log('completer started.')
def filter_keys_trie(tempdir, kmers_trie, filenames1, filenames2, keysoutputfile, nonCandidatekeysoutputfile, processes, n, parts): """Select k-mers from file that have label 0 in trie and write to file. Args: kmers_trie: trie.trie object of the form described in and returned by build_kmers_trie() keysinputfile: name of file where first field of each line is a k-mer, assume file is gzipped keysoutputfile: name of file where to write selected keys one per line, gzipped """ # parts = 256 badkeysfiles = ['%s/badkeys%s.txt.gz' % (tempdir, i) for i in range(parts)] process_list = [] all_task = Queue() for i in range(parts): task = (filenames1[i], filenames2[i], badkeysfiles[i], i) all_task.put(task) for process in range(processes): p = Process(target=process_pool_filter, args=(all_task, kmers_trie, n)) p.start() process_list.append(p) for p in process_list: p.join() util.print_log('filter processes done...') # badkeys = gzip.open(nonCandidatekeysoutputfile, 'w') # for i in range(parts): # f = gzip.open(badkeysfiles[i]) # for line in badkeysfiles[i]: # badkeys.write(line) # f.close() # badkeys.close() write_count = 0 # goodkeys = gzip.open(keysoutputfile, 'w') for i in range(parts): f = gzip.open(filenames2[i]) for line in f: # goodkeys.write(line) write_count += 1 f.close() # goodkeys.close() print '%s keys written' % write_count
def cnv_output(cf_result, input_params, settings):
    """ convert the resulting CF_image and CF_Mask accoring to the user requested
        output_crs, output_format, output_datatype

        cf_result[0] is the CF_image filename, cf_result[1] the CF_mask filename,
        both located in input_params['output_dir'].  On successful conversion the
        original file is removed; on failure handle_error() is called.
        Does nothing if the requested output_format is not a known GDAL driver.
    """
    # mapping of GDAL driver names to their customary file extensions
    # (NOTE(review): the 'WMS ' key carries a trailing blank -- kept as-is)
    supported_ext = {'VRT': '.vrt', 'GTIFF': '.tif', 'NITF': '.nitf', 'HFA': '.img',
        'ELAS': '.ELAS', 'AAIGRID': '.grd', 'DTED': '.DTED', 'PNG': '.png',
        'JPEG': '.jpg', 'MEM': '.mem', 'GIF': '.gif', 'XPM': '.xpm', 'BMP': '.bmp',
        'PCIDSK': '.PCIDSK', 'PCRASTER': '.PCRaster', 'ILWIS': '.ilw', 'SGI': '.sgi',
        'SRTMHGT': '.SRTMHGT', 'LEVELLER': '.Leveller', 'TERRAGEN': '.Terragen',
        'GMT': '.gmt', 'NETCDF': '.nc', 'HDF4IMAGE': '.hdf', 'ISIS2': '.ISIS2',
        'ERS': '.ers', 'FIT': '.fit', 'JPEG2000': '.jp2', 'RMF': '.rmf',
        'WMS ': '.WMS', 'RST': '.rst', 'INGR': '.INGR', 'GSAG': '.grd',
        'GSBG': '.grd', 'GS7BG': '.grd', 'R': '.r', 'PNM': '.pnm', 'ENVI': '.img',
        'EHDR': '.hdr', 'PAUX': '.aux', 'MFF': '.mff', 'MFF2': '.mff2', 'BT': '.bt',
        'LAN': '.lan', 'IDA': '.ida', 'LCP': '.lcp', 'GTX': '.GTX', 'NTV2': '.NTv2',
        'CTABLE2': '.CTable2', 'KRO': '.KRO', 'ARG': '.ARG', 'USGSDEM': '.USGDEM',
        'ADRG': '.img', 'BLX': '.blx', 'RASTERLITE': '.Rasterlite',
        'EPSILON': '.Epsilon', 'POSTGISRASTER': '.PostGISRaster', 'SAGA': '.sdat',
        'KMLSUPEROVERLAY': '.kmlovl', 'XYZ': '.xyz', 'HF2': '.HF2', 'PDF': '.pdf',
        'WEBP': '.webp', 'ZMAP': '.ZMap'}

    # unknown format -> silently do nothing (original behaviour); was has_key()
    if input_params['output_format'] not in supported_ext:
        return
    out_ext = supported_ext.get(input_params['output_format'])
    lmsg = 'Converting -- CF_image and CF_mask to: ' + input_params['output_format'] + ' [ *' + out_ext + ']'
    print_log(settings, lmsg)
    # GTiff is gdal_translate's default output driver -> no "-of" needed
    if input_params['output_format'] == 'GTIFF':
        tr_params1 = ""
    else:
        tr_params1 = " -of " + str(input_params['output_format'])
    # 'input' means keep the source datatype -> no "-ot" needed
    if input_params['output_datatype'] == 'input':
        tr_params2 = ""
    else:
        tr_params2 = " -ot " + str(input_params['output_datatype'])
    tr_params = tr_params1 + tr_params2

    def _translate(fname, what):
        # shell out to gdal_translate; delete the source on success, else report.
        # NOTE(review): the command line is built by string concatenation and run
        # through the shell -- paths with spaces/metacharacters will break;
        # consider subprocess.call() with an argument list.
        res = os.system("gdal_translate -q" + tr_params + " "
                        + input_params['output_dir'] + fname + " "
                        + input_params['output_dir'] + fname[:-4] + out_ext)
        if res == 0:  # was "res is 0" -- identity test on an int; use equality
            os.remove(input_params['output_dir'] + fname)
        else:
            err_msg = '[Error] - ' + what + ' could not be converted'
            handle_error(err_msg, res, settings)

    _translate(cf_result[0], 'CF_image')
    _translate(cf_result[1], 'CF_mask')
def base_getcover(self, file_list, input_params, settings, temp_storage, mask):
    """ Function to actually requesting and saving the available coverages
        on the local file system.

        Builds one GetCoverage request skeleton and submits it once per
        CoverageID in file_list; downloads go to temp_storage.  Non-200
        responses are logged via print_log().
    """
    # get the time of downloading - to be used in the filename (to differentiate
    # if multiple AOIs of the same coverages are downloaded to the same
    # output directory)
    target_server, toi_values, aoi_values, dss = self.set_request_values(settings, input_params, mask=False)
    request = {'request': 'GetCoverage',
               'server_url': target_server,
               # 'coverageID' is filled in per file in the loop below
               'format': 'tiff',
               'subset_x': 'epsg:4326 Long ' + aoi_values[0] + ',' + aoi_values[1],
               'subset_y': 'epsg:4326 Lat ' + aoi_values[2] + ',' + aoi_values[3],
               # we need to use the temporary directory here (not output_dir)!
               'output': temp_storage}
    # create output-crs syntax to be added to GetCoverage request
    if input_params['output_crs'] is not None:  # was "!= None"
        request['outputcrs'] = input_params['output_crs'].split(':')[1]
    # handle band-subsetting ('999' means: use all bands)
    if input_params['bands'] != '999':
        # was a manual concatenation loop with a trailing-comma strip
        request['rangesubset'] = ','.join(input_params['bands'])
    # don't use bandsubsetting for requests regarding mask-files
    if mask is True:
        request['rangesubset'] = None
    for coverage_id in file_list:
        request['coverageID'] = coverage_id
        res_getcov = wcs.GetCoverage(request, settings, input_params)
        # was "is not 200" -- identity comparison against an int literal only
        # works by accident of CPython's small-int caching; use !=
        if res_getcov != 200:
            print_log(settings, res_getcov)
def access_ds(self, basefile, basemaskfile, temp_storage):
    """ provide file access handle to RasterImg and MaskImg

        Joins each filename with temp_storage, opens both via self.fopen()
        and exits (code 6) if either dataset cannot be opened.

        Returns: baseImg, infile_basef, basemaskImg, infile_basemaskf
    """
    # the actual image dataset
    infile_basef = os.path.join(temp_storage, basefile)
    baseImg = self.fopen(infile_basef)
    # the corresponding mask file
    infile_basemaskf = os.path.join(temp_storage, basemaskfile)
    basemaskImg = self.fopen(infile_basemaskf)
    if baseImg is None:
        # infile_basef already contains temp_storage -- the old message
        # prepended it a second time, yielding a doubled path
        err_msg = '[Error] -- Could not open: ', infile_basef
        print_log(settings, err_msg)
        sys.exit(6)
    if basemaskImg is None:
        err_msg = '[Error] -- Could not open: ', infile_basemaskf
        print_log(settings, err_msg)
        sys.exit(6)
    return baseImg, infile_basef, basemaskImg, infile_basemaskf
def dump_and_check_all_data(self) -> None:
    """Validate and dump every ``*_json`` data set, tracking changes.

    Collects all public methods whose name ends in "json" (names starting with
    "_" are excluded) and, for each one: builds the JSON, compares it with the
    currently deployed JSON, and -- only if they differ -- sets the global
    ``changed_flag``, validates the new JSON against its schema and reports.
    The (possibly unchanged) JSON is always written out at the end.  To add a
    new JSON output later, define a method ending in "_json" plus its matching
    "_"-prefixed internal variable -- it will be picked up automatically.

    Raises:
        Exception: if the freshly built JSON fails schema validation.
    """
    global changed_flag
    json_methods = [
        member[0]
        for member in inspect.getmembers(self)
        if member[0][-4:] == "json" and member[0][0] != "_"
    ]
    for method_name in json_methods:
        # method names end in "_json"; turn that suffix into the ".json" file extension
        json_name = method_name[:-5] + ".json"
        print_log("data_manager", f"Make {json_name}...")
        # resolve the method by name with getattr() -- replaces the original
        # eval() call, which would execute arbitrary code if a method name
        # were ever attacker-controlled
        made_json = getattr(self, method_name)()
        # fetch the currently deployed json and compare with the fresh one;
        # only a differing file needs validation before being written out
        now_json = requests_now_data_json(json_name)
        if now_json != made_json:
            changed_flag = True
            # load the schema and check the freshly built json against it
            print_log("data_manager", f"Validate {json_name}...")
            schema = loads_schema(json_name)
            try:
                validate(made_json, schema)
            except exceptions.ValidationError as err:
                # chain the original validation error for debuggability
                raise Exception(f"Check failed {json_name}!") from err
            print_log("data_manager", f"{json_name} is OK!")
        else:
            print_log("data_manager", f"{json_name} has not changed.")
        # write the json out
        print_log("data_manager", f"Dumps {json_name}...")
        dumps_json(json_name, made_json)
def main():
    """Entry point: tidy the parsed arguments, then run the guideRNA pipeline."""
    # user inputs
    args = arg_parser()
    args_dict = vars(args)
    # tidy PAM and chrom args: upper-case, strip, drop empty entries
    args_dict['altpam'] = [p.upper().strip()
                           for p in args_dict['altpam'].split(',') if p]
    args_dict['pam'] = args_dict['pam'].upper()
    chrom_arg = args_dict['chrom']
    if chrom_arg:
        # the argument is either a file containing a comma-separated list,
        # or the comma-separated list itself
        if os.path.isfile(chrom_arg):
            raw_chroms = open(chrom_arg).read().split(',')
        else:
            raw_chroms = chrom_arg.split(',')
        chroms = [c for c in (x.strip() for x in raw_chroms) if c]
    else:
        chroms = []
    args_dict['chrom'] = chroms
    util.print_log('save arguments...')
    util.print_args(args_dict)
    util.save_args(args_dict)
    util.print_log('done')
    # main pipeline stages
    util.print_log2('start extract_process_kmers()')
    kmers.extract_process_kmers(args_dict['name'])
    util.print_log2('start analyze_guides()')
    kmers_trie = guides.analyze_guides(args_dict['name'])
    util.print_log2('start produce_bams_main()')
    bamdata.produce_bams_main(kmers_trie, args_dict['name'])
    util.print_log2('processer done.')
def load_restore_trie(name, trie_filename, n, parts):
    """Fully load previously stored trie of all genomic kmers for a project.

    name: project name, used to get project args and in all output

    Return: loaded kmers_trie
    """
    util.print_log('load trie...')
    # trie_filename = ['%s/%s/%s_trie%s.dat' % (name, 'kmers_tries', name, i) for i in range(256)]
    loaded_trie = load_trie(trie_filename, parts)
    util.print_log('done')
    # the shuffled-keys file drives which trie values need their arrays rebuilt
    shuffled_keys = '%s/%s_kmers_shuffled.txt.gz' % (name, name)
    util.print_log('restore numpy arrays in trie values...')
    restore_trie_arrays(loaded_trie, shuffled_keys, n)
    util.print_log('done')
    return loaded_trie
def process_clouds(self, base_flist, base_mask_flist, gfp_flist, gfpmask_flist, input_params, settings, temp_storage, f_read):
    """ make sure all donwloaded CoverageIDs come in with ".tif" extension

        Appends the extension where missing, verifies every file exists in
        temp_storage (exits with code 5 otherwise) and hands the normalized
        listings to self.change_img().

        Returns: the cf_result produced by change_img()
    """
    wcs_ext = '.tif'

    def _ensure_ext(flist):
        # add the file-format extension to the CoverageIDs that lack it.
        # NOTE: the previous "[..+ext for missing] or [.. for present]" idiom
        # silently dropped the already-suffixed names whenever a list mixed
        # suffixed and unsuffixed entries; this comprehension keeps them all.
        # (the filenames are already changed in: dataset_reader.base_getcover)
        return [item if item.lower().endswith(wcs_ext) else item + wcs_ext
                for item in flist]

    base_flist_e = _ensure_ext(base_flist)
    base_mask_flist_e = _ensure_ext(base_mask_flist)
    gfp_flist_e = _ensure_ext(gfp_flist)
    gfpmask_flist_e = _ensure_ext(gfpmask_flist)

    def _check_exists(fname):
        # abort if a downloaded file is not really available at temp_storage
        if os.path.exists(temp_storage + fname) is False:
            err_msg = '[Error] -- File does not exist: ', temp_storage + fname
            print_log(settings, err_msg)
            sys.exit(5)

    for ifile, mfile in zip(base_flist_e, base_mask_flist_e):
        _check_exists(ifile)
        _check_exists(mfile)
    for gfile, gmfile in zip(gfp_flist_e, gfpmask_flist_e):
        _check_exists(gfile)
        _check_exists(gmfile)

    cf_result = self.change_img(base_flist_e, base_mask_flist_e, gfp_flist_e, gfpmask_flist_e, input_params, temp_storage)
    return cf_result
def _execute_xml_request(self, http_request, IDs_only=False):
    """ Executes the GetCapabilities, DescribeCoverage, DescribeEOCoverageSet
        requests based on the generated http_url

        Returns:  either XML response document or a list of coverageIDs
        Output:   prints out the submitted http_request or Error_XML in case of failure
    """
    try:
        # access the url
        request_handle = urllib2.urlopen(http_request)
        # read the content of the url
        result_xml = request_handle.read()
        # extract only the CoverageIDs and provide them as a list for further usage
        if IDs_only:
            cids = self._parse_xml(result_xml, self._xml_ID_tag[1])
            request_handle.close()
            # if no datasets are found return the XML instead.
            # NOTE: the None test must come first -- the old order
            # "len(cids) == 0 or cids is None" raised TypeError on None
            if cids is None or len(cids) == 0:
                cids = result_xml
            return cids
        else:
            request_handle.close()
            return result_xml
    # was the Python-2-only "except urllib2.URLError, url_ERROR" -- normalized
    # to "as", matching the style used elsewhere in this file
    except urllib2.URLError as url_ERROR:
        if hasattr(url_ERROR, 'reason'):
            err_msg = '\n', time.strftime("%Y-%m-%dT%H:%M:%S%Z"), "- ERROR: Server not accessible -", url_ERROR.reason
            print_log(settings, err_msg)
            # best-effort: also log the server's error body, if readable
            try:
                err_msg = url_ERROR.read(), '\n'
                print_log(settings, err_msg)
            except Exception:  # was a bare except: -- don't swallow SystemExit/KeyboardInterrupt
                pass
        elif hasattr(url_ERROR, 'code'):
            lmsg = time.strftime("%Y-%m-%dT%H:%M:%S%Z"), "- ERROR: The server couldn\'t fulfill the request - Code returned: ", url_ERROR.code, url_ERROR.read()
            print_log(settings, lmsg)
            err_msg = str(url_ERROR.code)+'--'+url_ERROR.read()
            return err_msg
def get_filelist(self, input_params, settings):
    """ uses WCS requests to generate filelist of files available at service/server

        Retrieves the coverage and mask listings, splits each into Base
        products (CoverageID contains the time-of-interest) and
        GapFillingProducts, applies the requested scenario ordering and
        checks that data files and cloud-masks pair up.

        Returns: base_flist, base_mask_flist, gfp_flist, gfpmask_flist
        Exits on service errors or mismatched data/mask counts.
    """
    cov_list = self.base_desceocover(input_params, settings, mask=False)
    # check if there is realy a list of datasets returned or an error msg
    if isinstance(cov_list, str):
        err_msg = '[Error] -- No Datasets found. Service returned the follwing information.'
        print_log(settings, err_msg)
        print_log(settings, cov_list)
        sys.exit()
    mask_list = self.base_desceocover(input_params, settings, mask=True)
    if isinstance(mask_list, str):
        err_msg = '[Error] -- No Datasets found. Service returned the follwing information.'
        print_log(settings, err_msg)
        # report the offending mask listing (the old code printed cov_list here)
        print_log(settings, mask_list)
        sys.exit()
    # split up the received listings - Base, Base_mask, GFPs, GFPMask
    # (--> cryoland products do not have masks).
    # NOTE: the previous implementation popped matching elements while
    # iterating, which skips the element following every hit; the
    # comprehensions classify every element exactly once.
    toi = input_params['toi']
    base_flist = [cov for cov in cov_list if toi in cov]
    gfp_flist = [cov for cov in cov_list if toi not in cov]
    base_mask_flist = [cov for cov in mask_list if toi in cov]
    gfpmask_flist = [cov for cov in mask_list if toi not in cov]

    gfp_flist, gfpmask_flist = self.apply_scenario(gfp_flist, gfpmask_flist, input_params['scenario'], base_flist, base_mask_flist)

    if len(base_flist) != len(base_mask_flist):
        err_msg = 'Number of datafiles and number of cloud-masks do not correspond'
        print_log(settings, err_msg)
        sys.exit(4)
    if len(gfp_flist) != len(gfpmask_flist):
        err_msg = 'Number of datafiles and number of cloud-masks do not correspond'
        print_log(settings, err_msg)
        sys.exit(4)
    return base_flist, base_mask_flist, gfp_flist, gfpmask_flist
def main():
    """ Main function: calls the subfunction according to user input

        Pipeline: read config -> set up logging -> parse cmd-line -> pick the
        dataset reader -> list & download datasets -> run the cloud-free
        processor -> clean up temporary storage -> optionally convert output.
    """
    # read in the default settings from the configuration file
    global settings
    settings = get_config(default_config_file)
    # set the logging output i.e. to a File or the screen
    set_logging(settings)
    # get all parameters provided via cmd-line
    global input_params
    input_params = get_cmdline()
    # now that we know what dataset we need and where to find them, select the
    # correct reader for the requested dataset
    # first test if requested dataset does exist (was settings.has_key(...))
    if ('dataset.' + input_params['dataset']) in settings:
        reader = 'CF_' + input_params['dataset'] + '_Reader'
    else:
        err_msg = '[Error] -- ', now(), ' the requested dataset does not exist (is not configured)', input_params['dataset']
        err_code = 3
        handle_error(err_msg, err_code, settings)
    # call the reader module for the resepective dataset and process the data
    import dataset_reader
    attribute = getattr(dataset_reader, reader)
    f_read = attribute()
    # gets a listing of available DatasetSeries and their corresponding time-range
    base_flist, base_mask_flist, gfp_flist, gfpmask_flist = f_read.get_filelist(input_params, settings)
    # processing-limits (max. filenumber to be used) here
    # (was gfp_flist.__len__() -- use the len() builtin)
    if len(gfp_flist) > int(settings['general.def_maxfiles']):
        err_msg = '[Error] -- ', now(), ' the number of GFP products availabel (=', str(len(gfp_flist)).strip(), ') for the selected time period is larger then the configured "def_maxfiles" of: ', settings['general.def_maxfiles'], '\n', 'Please select a shorter time-period.'
        err_code = 4
        handle_error(err_msg, err_code, settings)
    # print the available input datasets: eg. during testing
    do_print_flist('BASE', base_flist)
    do_print_flist('BASE_Mask', base_mask_flist)
    do_print_flist('GFP', gfp_flist)
    do_print_flist('GFP_Mask', gfpmask_flist)
    lmsg = 'Dataset_listing - RUNTIME in sec: ', time.time() - startTime1
    print_log(settings, lmsg)
    # create a temporary location under the provided settings['general.def_temp_dir']
    # to be used for the temporary storage during processing
    temp_storage = tempfile.mkdtemp(prefix='cloudfree_', dir=settings['general.def_temp_dir'])
    if temp_storage[-1] != dsep:
        temp_storage = temp_storage + dsep
    if len(base_flist) >= 1:
        f_read.base_getcover(base_flist, input_params, settings, temp_storage, mask=False)
    if len(base_mask_flist) >= 1:
        f_read.base_getcover(base_mask_flist, input_params, settings, temp_storage, mask=True)
    lmsg = 'BASE dataset_download - RUNTIME in sec: ', time.time() - startTime1
    print_log(settings, lmsg)
    # call the Processor module for the resepective dataset and process the data
    import dataset_processor
    cfprocessor = 'CF_' + input_params['dataset'] + '_Processor'
    attribute = getattr(dataset_processor, cfprocessor)
    f_proc = attribute()
    cf_result = f_proc.process_clouds_1(base_flist, base_mask_flist, gfp_flist, gfpmask_flist, input_params, settings, temp_storage, f_read)
    # copy results to output location and clean-up the temporary storage area
    do_cleanup_tmp(temp_storage, cf_result, input_params, settings)
    # if output_format and/or output_datatype has been set by the user ->
    # translate resulting image(s) using gdal_translate
    if change_output is True:
        cnv_output(cf_result, input_params, settings)
    # for performance testing
    msg = 'Full Processing Runtime in sec: ', time.time() - startTime1, '\n'
    print_log(settings, msg)
    settings['logging.log_fsock'].close()
# NOTE(review): orphan fragment, kept verbatim -- this is the tail of the
# _execute_getcov_request() download routine seen in an earlier chunk.  It
# repeats that chunk's inner IOError/bare-except handlers (chunking overlap?)
# and then supplies the OUTER try's handlers that were missing there:
#  - urllib2.URLError with a 'reason' attr: log "server not accessible" and
#    dump the server's error body to <outfile dir>/access_error<now>.xml
#    (`outfile`, `now`, `dsep` are presumably the locals/globals of the full
#    function -- verify against the earlier chunk);
#  - URLError with a 'code' attr: log and return "<code>--<body>";
#  - TypeError: silently ignored; falls through to a bare `return` (None).
# Python-2-only syntax: "except IOError as (errno, strerror)"; the bare
# "except:" also catches SystemExit/KeyboardInterrupt before re-raising.
request_handle.close() return status except IOError as (errno, strerror): err_msg = "I/O error({0}): {1}".format(errno, strerror) print_log(settings, err_msg) except: err_msg = "Unexpected error:", sys.exc_info()[0] print_log(settings, err_msg) raise except urllib2.URLError as url_ERROR: if hasattr(url_ERROR, 'reason'): err_msg = '\n', time.strftime("%Y-%m-%dT%H:%M:%S%Z"), "- ERROR: Server not accessible -", url_ERROR.reason print_log(settings, err_msg) # write out the servers return msg errfile = outfile.rpartition(dsep)[0]+dsep+'access_error'+now+'.xml' access_err = open(errfile, 'w+b') access_err.write(url_ERROR.read()) access_err.flush() access_err.close() elif hasattr(url_ERROR, 'code'): lmsg = time.strftime("%Y-%m-%dT%H:%M:%S%Z"), "- ERROR: The server couldn\'t fulfill the request - Code returned: ", url_ERROR.code, url_ERROR.read() print_log(settings, lmsg) err_msg = str(url_ERROR.code)+'--'+url_ERROR.read() return err_msg except TypeError: pass return
# NOTE(review): flat Python-2 script fragment (network device traffic stats).
# Reads CLI args with defaults, looks the device up in a SQLite-style DB file,
# gathers interface traffic samples into a CSV log and prints it.
# `argv`, read_devices_db, get_device, gather_traffic_data and print_log are
# defined elsewhere in the original file -- presumably "from sys import argv"
# and local helper modules; verify there.
# CLI args: [1] sample interval (s, default 5), [2] sample count (default 5),
#           [3] device IP (default 10.30.30.1), [4] interface name.
interval = int(argv[1]) if len(argv) >= 2 else 5
count = int(argv[2]) if len(argv) >= 3 else 5
device_ip = argv[3] if len(argv) >= 4 else '10.30.30.1'
interface = argv[4] if len(argv) >= 5 else 'gigabitethernet 0/1'
# Read device information from database, into list of device info lists
devices_from_db = read_devices_db('devices.db')
# Get device information for our device
device = get_device(devices_from_db, device_ip)
# NOTE(review): "== None" should be "is None"
if device == None:
    print '!!! Cannot find device in DB!'
    exit()
logfile = 'dev-stats-log'    # set output CSV log file
# Gather traffic data for the devices in the list
gather_traffic_data(logfile, device, interface, interval, count)
# NOTE(review): the log file handle is never closed -- consider a with-block
dev_stats_log = open(logfile,'r')
csv_log = csv.reader(dev_stats_log)
log_info_list = [log_info for log_info in csv_log]
# Print log information for our one device
print ''
print 'Device: ', device.ip_address, ' Interface: ', interface
print ''
print_log(device_ip, log_info_list)
def build_kmers_trie(filename, genome, name, altpam=[], pampos='end', maxcount=10, goodkeysfile='', badkeysfile='', tempdir='', triekeys_v1_filenames=[], kmers_filenames=[], processes=1, n=4, parts=256):
    """Read k-mers and their coordinates from file and store them in a trie.

    The resulting trie is of the form {<k-mer> : <np.array of int values>}.
    In each array, store int-transformed coordinates of occurrences of the
    k-mer, starting from position 1 of the array.  Store at most 'maxcount'
    coordinates.  Position 0 in the array is reserved for labeling:
    0: good guideRNA, positive: how many occurrences this k-mer has in the
    genome [other labels: decide later].

    In this building stage, label all k-mers with alternative PAM and all
    k-mers with more than one occurrence in the genome as bad guideRNAs.
    Optionally, also store in a separate file all k-mers that are still
    considered good candidate guideRNAs (i.e. only filtering out k-mers with
    alternative PAM, because multi-mapping k-mers may be detected after they
    were first read).

    Note: make sure lines with k-mers in the input file are randomly shuffled,
    so that for k-mers with more than 'maxcount' occurrences we store an
    arbitrary 'maxcount' of them without bias.

    NOTE(review): mutable default arguments (altpam=[], filename lists) are
    shared across calls -- safe only if never mutated; prefer None defaults.
    NOTE(review): the docstring promises to return a trie.trie object, but the
    visible code ends after joining the workers without a return statement --
    confirm against the original file whether a trailing return was lost.

    Args:
    filename: name of file with k-mers, assume file is gzipped and lines
              are randomly shuffled
    genome: list of pairs [(<chromosome name>, <chromosome length>)]
    altpam: list of alternative PAM sequences; all k-mers starting or ending
            (depending on 'pampos') with these are labeled bad guideRNAs
    pampos: position of alternative PAM in k-mer ('start' or 'end')
    maxcount: store at most this many coordinates for each k-mer
    goodkeysfile: where to store potentially good candidate guideRNAs;
                  use only if altpam is not empty, otherwise all input keys
                  would be stored, which is redundant
    badkeysfile / tempdir / triekeys_v1_filenames / kmers_filenames /
    processes / n / parts: scratch files and multiprocessing parameters;
                  k-mers are hashed into `parts` buckets via get_num(kmer, n)

    Output:
    return trie.trie object {<k-mer> : <np.array of int values>}
    optionally produce file goodkeysfile with candidate guideRNAs
    """
    # parts = 256
    util.check_file_exists(filename)
    badkeysfiles = ['%s/badkeys%s.txt.gz' % (tempdir, i) for i in range(parts)]
    kmers_trie_files = ['%s/kmers_trie%s.dat' % (tempdir, i) for i in range(parts)]
    util.print_log('classify k-mers into %s...' % parts)
    if parts > 1000:
        # too many parts to keep one open output file per part: first split the
        # input into two halves via temp files, then fan each half out
        tempfiles = [tempfile.NamedTemporaryFile(dir=tempdir, suffix='.temp%s' % i) for i in range(2)]
        # tempfiles = [gzip.open('%s/temp%s.txt.gz' % (tempdir, i),'w') for i in range(2)]
        mid = parts // 2
        file = gzip.open(filename)
        for line in file:
            kmer = line.split()[0]
            # bucket index for this k-mer
            index = get_num(kmer, n)
            if index < mid:
                tempfiles[0].write(str(index) + ' ' + line)
            else:
                tempfiles[1].write(str(index) + ' ' + line)
        for f in tempfiles:
            f.flush()
        file.close()
        util.print_log('write...')
        # distribute the lower half into its per-part k-mer files
        kmersfiles1 = [gzip.open(kmers_filenames[i],'w') for i in range(mid)]
        # tempfiles = [gzip.open('%s/temp%s.txt.gz' % (tempdir, i)) for i in range(2)]
        temp = [open(tempfiles[i].name) for i in range(2)]
        for line in temp[0]:
            data = line.split()
            index = int(data[0])
            kmer = data[1]
            coord = data[2]
            kmersfiles1[index].write(kmer + ' ' + coord + '\n')
        for f in kmersfiles1:
            f.close()
        temp[0].close()
        # util.print_log('write count1...')
        # distribute the upper half (indices are offset by mid)
        kmersfiles2 = [gzip.open(kmers_filenames[i],'w') for i in range(mid, parts)]
        for line in temp[1]:
            data = line.split()
            index = int(data[0]) - mid
            kmer = data[1]
            coord = data[2]
            kmersfiles2[index].write(kmer + ' ' + coord + '\n')
        for f in kmersfiles2:
            f.close()
        temp[1].close()
        for f in tempfiles:
            f.close()
        # util.print_log('write count2...')
    else:
        # few enough parts: keep all per-part output files open at once
        kmersfiles = [gzip.open(kmers_filenames[i],'w') for i in range(parts)]
        file = gzip.open(filename)
        for line in file:
            kmer = line.split()[0]
            index = get_num(kmer, n)
            kmersfiles[index].write(line)
        file.close()
        for f in kmersfiles:
            f.close()
    util.print_log('done...')
    util.print_log('build tries start...')
    # build the per-part tries in worker processes fed from a shared task queue
    process_list = []
    all_task = Queue()
    for i in range(parts):
        task = (kmers_filenames[i], triekeys_v1_filenames[i], badkeysfiles[i], kmers_trie_files[i])
        all_task.put(task)
    for process in range(processes):
        p = Process(target=process_pool_build_tries, args=(all_task, genome, altpam, pampos, maxcount, n))
        p.start()
        process_list.append(p)
    for p in process_list:
        p.join()
    util.print_log('build tries done...')
def change_img(self, base_flist_e, base_mask_flist_e, gfp_flist, gfpmask_flist, gfp_flist_e, gfpmask_flist_e, input_params, temp_storage, f_read, settings):
    """
    Replace clouded pixels in the base image with non-clouded pixels taken
    from the Gap-Filling-Products (GFPs), and write out:
      * the cloud-free product (prefixed 'CF_'),
      * a metadata mask GeoTIFF recording which GFP supplied each pixel,
      * a metadata text file listing the used mask files.
    This variant reads full files into memory (faster, higher memory usage).

    Parameters (as used below):
        base_flist_e / base_mask_flist_e: lists of base image / base cloud-mask
            filenames (already downloaded; extended names).
        gfp_flist / gfpmask_flist: GFP coverage ids to download via f_read.
        gfp_flist_e / gfpmask_flist_e: corresponding local (extended) filenames.
        input_params: request parameters dict (passed through to f_read).
        temp_storage: temporary working directory path (with trailing separator).
        f_read: reader object providing base_getcover() downloads.
        settings: settings handle for print_log().

    Returns:
        list [cloud-free filename, metamask TIFF filename, metamask TXT filename]
        -- NOTE(review): values come from the *last* iteration of the base-file
        loop; presumably only one base file is supplied — confirm with callers.
    """
    out_prefix = 'CF_'
    img_cnt = 1                              # byte-code written into the metamask for each GFP used
    out_meta_mask = '_composite_mask.tif'
    startTime2 = time.time()
    # process each base image together with its cloud mask
    for basefile, basemaskfile in zip(base_flist_e, base_mask_flist_e):
        baseImg, infile_basef, basemaskImg, infile_basemaskf = self.access_ds(basefile, basemaskfile, temp_storage)
        baseImgDim, baseProj, baseLocation = self.read_img(baseImg, infile_basef)
        basemaskDim, basemaskProj, basemaskLocation, basemaskImg, basemaskClouds, basemaskCoord = self.read_mask(basemaskImg, infile_basemaskf, isBaseImg=True)
        baseImgDim.append([baseImg.GetDriver().ShortName])
        baseImgBand = baseImg.GetRasterBand(1)
        baseImgDt = getNumpyDataType(baseImgBand.DataType)
        gDType = getGdalDataType(baseImgDt)
        driver = baseImg.GetDriver()
        # create the cloud-free output dataset (same driver/size/type as the base image)
        outFile = infile_basef.rsplit(dsep, 1)
        outFile[1] = out_prefix + outFile[1]
        if outFile[1].endswith('.tiff'):
            outFile[1] = outFile[1].replace('.tiff','.tif')
        outFile[0] = temp_storage[:-1]       # strip trailing separator; dsep is re-added below
        # @@ testing intermediary -> comment out the following line --> see also below
        #outImg = driver.Create((outFile[0]+dsep+outFile[1]), baseImgDim[0][0], baseImgDim[1][0], baseImgDim[2][0], gDType)
        outImg = driver.Create((outFile[0]+dsep+outFile[1]), baseImgDim[0][0], baseImgDim[1][0], baseImgDim[2][0], gDType, [ 'TILED=YES', 'COMPRESS=DEFLATE' ] )
        # metadata mask & txt-file for storing the info about used (combined) datasets
        cur_ext = os.path.splitext(outFile[1])[1]
        metamaskTIF = outFile[1].replace(cur_ext, out_meta_mask)
        metamaskTXT = metamaskTIF.replace('.tif','.txt')
        # the metamask - will always be a 8-Bit GeoTiff
        # NOTE(review): baseImgDim[0][0]/[1][0] look like x-size/y-size — confirm against read_img
        metamaskImg = np.zeros((baseImgDim[1][0], baseImgDim[0][0]), uint8)
        # eval_mask holds the still-clouded pixels; entries are zeroed as GFPs fill them
        eval_mask = np.array(basemaskImg)
        # full in-memory copy of all base-image bands (bands, rows, cols)
        out_data = np.zeros((baseImgDim[2][0], baseImgDim[1][0], baseImgDim[0][0]), dtype=baseImgDt)
        for i in range(1, baseImgDim[2][0]+1,1):
            baseBand = baseImg.GetRasterBand(i)
            baseBand1 = baseBand.ReadAsArray(0, 0, baseImgDim[0][0], baseImgDim[1][0])
            out_data[i-1, :, :] = baseBand1
        # walk through the GFPs in scenario order, downloading each on demand
        #for gfpfile, gfpmaskfile in zip(gfp_flist_e, gfpmask_flist_e):
        for gfpfile, gfpmaskfile, gfpfile_e, gfpmaskfile_e in zip(gfp_flist, gfpmask_flist, gfp_flist_e, gfpmask_flist_e):
            startTime3 = time.time()
            lmsg = 'Using GFP-'+str(img_cnt)+': ', gfpfile #, type(gfpfile)
            print_log(settings, lmsg)
            f_read.base_getcover([gfpfile], input_params, settings, temp_storage, mask=False)
            f_read.base_getcover([gfpmaskfile], input_params, settings, temp_storage, mask=True)
            lmsg = 'Using GFPMask-'+str(img_cnt)+': ', gfpmaskfile #, type(gfpmaskfile)
            print_log(settings, lmsg)
            gfpImg, infile_gfpf, gfpmaskImg, infile_gfpmaskf = self.access_ds(gfpfile_e, gfpmaskfile_e, temp_storage)
            gfpImgDim, gfpProj, gfpLocation = self.read_img(gfpImg, infile_gfpf)
            gfpmaskDim, gfpmaskProj, gfpmaskLocation, gfpmaskImg, gfpmaskClouds = self.read_mask(gfpmaskImg, infile_gfpmaskf, isBaseImg=False)
            # pixels still clouded in the base (eval_mask > 0) but clear in this GFP (mask == 0)
            res2 = np.ma.MaskedArray((eval_mask > 0) & (gfpmaskImg == 0))
            lmsg = 'N_cloudpixel replaced: ', res2.sum()
            print_log(settings, lmsg)
            metamaskImg[res2] = img_cnt      # record which GFP supplied these pixels
            eval_mask[res2] = 0              # mark them as resolved
            # write the maskfile, modify existing if available
            if os.path.exists(outFile[0]+dsep+metamaskTIF):
                out_metamask_tif = gdal.OpenShared(outFile[0]+dsep+metamaskTIF, GA_Update)
            else:
                out_metamask_tif = driver.Create((outFile[0]+dsep+metamaskTIF), baseImgDim[0][0], baseImgDim[1][0], 1, GDT_Byte)
            maskBand = out_metamask_tif.GetRasterBand(1)
            maskBand.WriteArray(metamaskImg, 0, 0)
            maskBand.FlushCache()
            out_metamask_tif.SetGeoTransform(baseImg.GetGeoTransform())
            out_metamask_tif.SetProjection(baseImg.GetProjection())
            # @@ for testing intermediary -- uncomment the following line --> see also above and below
            # to test you may write out intermediary products
            #outImg = driver.Create((outFile[0]+dsep+outFile[1])+'_'+str(img_cnt), baseImgDim[0][0], baseImgDim[1][0], baseImgDim[2][0], gDType)
            # create a txt file containing the image-filenames and byte-codes used in the metamask
            if os.path.exists(outFile[0]+dsep+metamaskTXT):
                out_metamask_txt = open(outFile[0]+dsep+metamaskTXT, "a")
            else:
                out_metamask_txt = open(outFile[0]+dsep+metamaskTXT, "w")
            applied_mask = infile_gfpmaskf.rsplit(dsep, 1)
            out_metamask_txt.write(str(img_cnt)+';'+applied_mask[1]+'\n')
            out_metamask_txt.flush()
            # read all bands, check each for cloud-free areas, and write to cloud-free image
            for i in range(1, baseImgDim[2][0]+1, 1):
                gfpBand = gfpImg.GetRasterBand(i)
                gfpBand1 = gfpBand.ReadAsArray(0, 0, gfpImgDim[0][0], gfpImgDim[1][0])
                out_data[i-1][res2] = gfpBand1[res2]
            lmsg = 'Remaining masked pixels: ', np.count_nonzero(eval_mask)
            print_log(settings, lmsg)
            # bail out if no more clouded pixels are available
            if eval_mask.sum() == 0:
                lmsg = 'All pixels masked as clouds have been replaced'
                print_log(settings, lmsg)
                break
            img_cnt += 1
            lmsg = 'GFP Product processing time: ', time.time() - startTime3
            print_log(settings, lmsg)
        lmsg = 'Writing CloudFree product...'
        print_log(settings, lmsg)
        #write out all Bands into outFile
        for i in range(1, baseImgDim[2][0]+1, 1):
            outBand = outImg.GetRasterBand(i)
            outBand.WriteArray(out_data[i-1], 0, 0)
            outBand.FlushCache()
        # set the Projection info - copied from baseImg
        outImg.SetGeoTransform(baseImg.GetGeoTransform())
        outImg.SetProjection(baseImg.GetProjection())
        # calculate the overviews needed
        overview_sizes = calc_overviews(outBand, [baseImgDim[0][0], baseImgDim[1][0]])
        # initiate pyramid creation
        outImg.BuildOverviews(resampling="NEAREST", overviewlist=overview_sizes)
        # @@ for testing intermediary - uncomment the following line -- see also above
        #outImg = None
    lmsg = 'CloudFree processing - RUNTIME in sec: ', time.time() - startTime2
    print_log(settings, lmsg)
    cf_result = [outFile[1], metamaskTIF, metamaskTXT]
    # release dataset handles / file objects (GDAL flushes on dereference)
    out_metamask_tif = None
    out_metamask_txt.close()
    outImg = None
    basemaskImg = None
    infile_basemaskf = None
    infile_gfpmaskf = None
    return cf_result
def process_clouds_1(self, base_flist, base_mask_flist, gfp_flist, gfpmask_flist, input_params, settings, temp_storage, f_read):
    """
    Perform the required cloud removal processing steps for CryoLand-style
    products, where cloud information is encoded in the pixel values
    themselves (no separate mask file): replace pixels equal to the cloud
    value (or zero / nodata) with usable pixels from successive GFPs.

    Parameters (as used below):
        base_flist: base product filenames; only base_flist[0] is processed.
        base_mask_flist / gfpmask_flist: unused here (CryoLand products have
            no separate masks) -- kept for signature parity with change_img.
        gfp_flist: GFP coverage ids, downloaded on demand via f_read.
        input_params, settings, temp_storage, f_read: as in change_img.

    Returns:
        list [cloud-free output basename, metamask TXT basename]
    """
    out_meta_mask = '_composite_mask.txt'
    # some values used for cryoland cloud masking
    cloud_val = 30
    zero_val = 0
    # all values above are not in use in CryoLand
    nodata_val = 253
    # provide additional tiff-settings for the tif-creation
    # tiff_options = [ "TILED=YES", "BLOCKXSIZE=256", "BLOCKYSIZE=256" ]
    tiff_options = []
    outFile = os.path.join(temp_storage+'CF_'+base_flist[0])
    metamaskTXT = outFile.replace('.tif', out_meta_mask)
    # append if the metadata text file already exists, otherwise create it
    if os.path.exists(metamaskTXT):
        out_metamask_txt = open(metamaskTXT, "a")
    else:
        out_metamask_txt = open(metamaskTXT, "w")
    inbase_img = self.fopen(temp_storage+base_flist[0])
    if inbase_img is None:
        err_msg = 'Could not open file: ', temp_storage+base_flist[0]
        handle_error(err_msg, 4, settings)
    inbase_NumBands = inbase_img.RasterCount
    inbase_band = inbase_img.GetRasterBand(inbase_NumBands)
    nDtype = getNumpyDataType(inbase_band.DataType)
    gDtype = getGdalDataType(nDtype)
    ## TODO -- make this more general ie. read DriverType from file
    indriver = gdal.GetDriverByName('GTiFF')
    # load file directly into numpy array - faster, but needs more memory
    base_img = gdal_array.LoadFile(temp_storage+base_flist[0])
    outImg = np.zeros((base_img.shape[0], base_img.shape[1]), dtype=nDtype)
    outImg = np.array(base_img)
    # NOTE(review): size(np.where(...)) counts row+col index arrays, i.e.
    # 2x the number of matching pixels -- used consistently, so the
    # "replaced"/"remaining" differences below stay proportional
    num_clouds = size(np.array(np.where(outImg == cloud_val)))
    lmsg = 'Pixels masked as clouds: ', num_clouds
    print_log(settings, lmsg)
    out_clouds = 0
    cnt = 1                                  # sequence number written to the metamask TXT
    for gfp_file in gfp_flist:
        lmsg ='Using GFP-'+str(cnt)+': ', gfp_file
        print_log(settings, lmsg)
        gfp_file1 = [gfp_file]
        f_read.base_getcover(gfp_file1, input_params, settings, temp_storage, mask=False)
        gfile = gdal_array.LoadFile(temp_storage+gfp_file)
        # evaluate the cloud masking: target pixels are clouded/zero/nodata in
        # the current composite AND usable (not clouded/zero/nodata) in the GFP
        res2 = np.ma.MaskedArray( ((outImg == cloud_val) | (outImg == zero_val) | (outImg >= nodata_val)) & ((gfile != zero_val ) & (gfile != cloud_val) & (gfile < nodata_val)) )
        outImg[res2] = gfile[res2]
        out_clouds = size(np.array(np.where(outImg == cloud_val)))
        # write out the files used for CF-product generation
        out_metamask_txt.write(str(cnt)+';'+str(gfp_file)+'\n')
        out_metamask_txt.flush()
        cnt += 1
        lmsg = 'N_cloudpixel replace: ', num_clouds - out_clouds
        print_log(settings, lmsg)
        lmsg = 'Remaining masked pixels: ', out_clouds
        print_log(settings, lmsg)
        num_clouds = out_clouds
        # if there are no more clouded pixels - stop processing
        if (out_clouds == 0):
            lmsg = 'All pixels masked as clouds have been replaced'
            print_log(settings, lmsg)
            break
    # now create the cloudfree output products file
    output = indriver.Create(outFile, base_img.shape[1], base_img.shape[0], inbase_NumBands, gDtype, options=tiff_options)
    # set the GeoCoordinates parameters etc.
    output.SetGeoTransform(inbase_img.GetGeoTransform())
    # set the Projection parameters etc.
    output.SetProjection(inbase_img.GetProjection())
    outBand = output.GetRasterBand(1)
    # set the NoData value in the GTiff (fall back to 255 if the input has none)
    if inbase_band.GetNoDataValue() is None:
        outBand.SetNoDataValue(255)
    else:
        outBand.SetNoDataValue(inbase_band.GetNoDataValue())
    # add the corresponding colortable (taken from the input file)
    outBand.SetRasterColorTable(inbase_band.GetRasterColorTable())
    outBand.WriteArray(outImg, 0, 0)
    output.FlushCache()
    # calculate the overviewlist first
    overview_sizes = calc_overviews(inbase_band, base_img.shape)
    # create the overviews
    output.BuildOverviews(resampling = "NEAREST", overviewlist = overview_sizes)
    #print 'Overviewlist: ', overview_sizes
    # free the open files (GDAL datasets are closed by dereferencing)
    output = None
    inbase_img = None
    base_img = None
    gfp_file = None
    outImg = None
    out_metamask_txt.close()
    return [os.path.basename(outFile),os.path.basename(metamaskTXT)]
def do_cleanup_tmp(temp_storage, cf_result, input_params, settings):
    """
    Clean up the temporary storage space used during download and processing.

    If 'keep_temporary' is False: copy the generated result file(s) to the
    output directory, verify they arrived, and delete the temporary area
    (exits with status 7 if the copy failed). Otherwise the temporary area
    (results plus all input files) is moved to the output directory as-is.

    Parameters:
        temp_storage: temporary working directory path (with trailing separator).
        cf_result: result filename(s) -- a list of paths/names, or a single
            string/unicode path.
        input_params: dict providing 'keep_temporary' and 'output_dir'.
        settings: settings handle for print_log().
    """
    if input_params['keep_temporary'] is False:
        lmsg = 'Cleaning up temporary space...'
        print_log(settings, lmsg)
        # normalise cf_result to a list of plain basenames; fixes the former
        # NameError in the single-string branch (it referenced the undefined
        # loop variable 'elem') and lets list/string cases share one path
        if isinstance(cf_result, list):
            result_files = [os.path.basename(elem) for elem in cf_result]
        else:
            # single string/unicode result
            result_files = [os.path.basename(cf_result)]
        for elem in result_files:
            shutil.copy2(temp_storage+dsep+elem, input_params['output_dir'])
        if os.path.exists(input_params['output_dir']+result_files[0]):
            lmsg = '[Info] -- The Cloudfree dataset has been generated and is available at: '
            print_log(settings, lmsg)
            for elem in result_files:
                if os.path.exists(input_params['output_dir']+elem):
                    lmsg = input_params['output_dir']+elem
                    print_log(settings, lmsg)
            # remove all the temporary storage area
            shutil.rmtree(temp_storage, ignore_errors=True)
        else:
            # former code called os.path.basename() on the whole list here,
            # which would raise TypeError; report the first result file instead
            lmsg = '[Error] -- The generated Cloudfree output-file could not be written to: ', input_params['output_dir']+result_files[0]
            print_log(settings, lmsg)
            sys.exit(7)
    else:
        lmsg = temp_storage[:-1]
        print_log(settings, lmsg)
        out_location = input_params['output_dir']+os.path.basename(temp_storage[:-1])
        lmsg = '[Info] -- The Cloudfree dataset and the input files are available at: ', out_location
        print_log(settings, lmsg)
        shutil.move(temp_storage, input_params['output_dir'])