def generate_pagination(self, url):
    urls = []
    # TripAdvisor paginates the Germany restaurant list in steps of 20
    for i in progress(range(20, 7000, 20)):
        url = 'https://www.tripadvisor.de/Restaurants-g187275-oa{}-Germany.html#LOCATION_LIST'.format(i)
        urls.append(url)
    return urls
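# Every snippet in this collection drives a tqdm-style `progress` helper
# (supporting `update`, `close`, `n`, and `total`). A minimal sketch of the
# assumed alias, with a hypothetical no-op fallback for environments without
# tqdm; the real definition may differ:
try:
    from tqdm import tqdm as progress
except ImportError:
    class progress:
        """No-op stand-in exposing the small tqdm surface used here."""
        def __init__(self, iterable=None, total=None, **kwargs):
            self.iterable = iterable
            self.total = total
            self.n = 0

        def __iter__(self):
            return iter(self.iterable)

        def update(self, n=1):
            self.n += n

        def close(self):
            pass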
def copy_blob_as_remote(blob_url, copied_blob):
    """Copy blob as append file."""
    count = 0
    # Get the source file size for the progress bar
    resp = requests.get(blob_url, stream=True)
    total_size = int(resp.headers.get("content-length", 0))
    prog = progress(total=total_size, unit="iB", unit_scale=True)
    # Start the server-side copy and poll until it completes
    copied_blob.start_copy_from_url(blob_url)
    props = copied_blob.get_blob_properties()
    while props.copy.status == "pending":
        _LOGGER.info(props.copy.status + " " + props.copy.progress)
        count = count + 1
        if count > 100:
            raise TimeoutError("Timed out waiting for async copy to complete.")
        time.sleep(5)
        # copy.progress is "copied/total"; report only the delta since last poll
        length = int(props.copy.progress.split("/")[0])
        diff = length - prog.n
        prog.update(diff)
        props = copied_blob.get_blob_properties()
    prog.close()
def test_progress():
    """Test progress bar."""
    total_size = 100
    prog = progress(total=total_size, unit="iB", unit_scale=True)
    for _ in range(100):
        prog.update(1)  # advance one step per iteration, not by the loop index
    prog.close()
def getHazardDataFrame(df):
    '''Get hazard data

    Args:
        df (Pandas DataFrame): the dataframe containing survey data

    Returns:
        DataFrame: processed DataFrame. Almost ready to use with lifelines.
    '''
    survey_year = df['year'].iloc[0]
    Mother.collection_year = survey_year
    Mother.master_df = df
    # make a list of all unique mother ids, preserving first-seen order
    id_num_list = df['idhspid'].tolist()
    id_num_list = list(dict.fromkeys(id_num_list))
    # create a Mother object for each id
    mother_list = [
        Mother(num)
        for num in progress(id_num_list, desc='Creating mother objects')
    ]
    # create an array containing all relevant data
    mother_arrays = [mother.genHazardArray() for mother in mother_list]
    data = np.column_stack(mother_arrays).T
    out_df = pd.DataFrame(
        data=data,
        columns=['IDHSPID', 'Event Time', 'Event Occured', 'DHSID', 'Year'])
    return out_df
def main():
    # get command-line arguments
    cmd_args = commandLineParser()
    # assign Class variable to the correct DataFrame
    input_df = pd.read_csv(cmd_args.input_csv)
    if cmd_args.hazard_regressions:
        df = getHazardDataFrame(input_df)
    else:
        survey_year = input_df['year'].iloc[0]
        Mother.collection_year = survey_year
        Mother.master_df = input_df
        # make a list of all unique mother ids
        id_num_list = input_df['idhspid'].tolist()
        id_num_list = list(dict.fromkeys(id_num_list))
        # create a Mother object for each id
        mother_list = [
            Mother(num)
            for num in progress(id_num_list, desc='Creating mother objects')
        ]
        # create an array containing all relevant data
        mother_arrays = [mother.genDataArray() for mother in mother_list]
        data = np.concatenate(mother_arrays)
        df = pd.DataFrame(
            data=data,
            columns=['DHSID', 'IDHSPID', 'Year', 'Mother\'s Age', 'Baby?'])
    # export the DataFrame as csv
    df.to_csv(cmd_args.output_csv, index=False)
def download_file(url, dte, ignore_header_rows=0):
    """Download usage report with shared access key based URL."""
    skipped_header = False
    size = 1024  # 1 Kibibyte
    local_filename = "usage-%s.csv" % (dte.isoformat())
    local_filename = local_filename.replace(":", "-")
    # NOTE the stream=True parameter
    resp = requests.get(url, stream=True)
    resp.encoding = "utf-8"
    total_size = int(resp.headers.get("content-length", 0))
    prog = progress(total=total_size, unit="iB", unit_scale=True)
    with open(local_filename, "wb") as csvfile:
        for chunk in resp.iter_content(chunk_size=size, decode_unicode=True):
            if ignore_header_rows and not skipped_header:
                # keep the BOM, drop the requested number of header rows
                bom = chunk[0]
                lines = chunk.split("\r\n")
                joined = bom + "\r\n".join(lines[ignore_header_rows:])
                encoded_chunk = joined.encode()
                skipped_header = True
            else:
                encoded_chunk = chunk.encode()
            if encoded_chunk:
                # filter out keep-alive new chunks
                prog.update(len(encoded_chunk))
                csvfile.write(encoded_chunk)
            # f.flush() commented by recommendation from J.F.Sebastian
    prog.close()
    return (local_filename, total_size)
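# A hedged usage sketch for download_file above; the SAS URL and date are
# placeholders, not real values:
if __name__ == "__main__":
    import datetime

    report_url = "https://example.blob.core.windows.net/usage.csv?sv=placeholder"
    filename, size = download_file(report_url, datetime.date(2021, 1, 1),
                                   ignore_header_rows=2)
    print("Saved %s (%d bytes)" % (filename, size))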
def importPrecipData(month_range,
                     windows='',
                     precip_data_folder='./resources/precip_data',
                     testing=False):
    '''This function imports all precip data in ./resources/precip_data or
    another specified folder

    Args:
        month_range (list): a list of months across which to sum the rainfall
        windows (str, optional): a string representing the path to the file
            containing the names of the precip files. Defaults to the empty
            string.
        precip_data_folder (str, optional): a string representing the path to
            the folder in which all of the .precip files are stored. Defaults
            to './resources/precip_data'
        testing (bool, optional): whether or not the function is in testing
            mode. If so, only the first ten precip files will be considered
            for speed. Defaults to False

    Returns:
        list: a list of parsed precip data. Of the form
            [[[x1, y1], SUM1], [[x2, y2], SUM2], ...] where SUM is the sum of
            the rainfall in the selected months
    '''
    # get list of precip files
    if windows:
        precip_contents = fp.precipListParser(windows, testing=testing)
    else:
        os.system(f'cd {precip_data_folder}; ls precip* > ../../precip.txt')
        precip_contents = fp.precipListParser('precip.txt', testing=testing)
        os.system('rm precip.txt')
    # prepend the configured folder (not a hardcoded path) so the parser can
    # find the files even when a custom precip_data_folder is supplied
    precip_contents = [f'{precip_data_folder}/{file}' for file in precip_contents]
    # create precip data list for them all
    precip_data = [fp.precipFileParser(path, month_range)
                   for path in progress(precip_contents, desc='Importing precip data')]
    return precip_data
def train_model(self, epochs=10000, summary_every=1, log_writer=None):
    subimage_provider = self._subimage_provider
    evaluation_input, evaluation_target = subimage_provider.evaluation_subimages()
    evaluation_feed_dict = {
        self._net_input: evaluation_input,
        self._net_target: evaluation_target
    }
    updates_per_epoch = int(
        math.ceil(subimage_provider.training_subimage_count() / self._batch_size))
    sess = self.session
    training_loss_values = []
    evaluation_loss_values = []
    min_train_loss = 10e3
    min_eval_loss = 10e3
    for e_num, epoch in enumerate(
            progress(range(epochs), desc='Training model...', unit='epochs')):
        for u_num, update in enumerate(range(updates_per_epoch)):
            input_batch, target_batch = subimage_provider.random_training_subimage_batch(
                batch_size=self._batch_size)
            training_feed_dict = {
                self._net_input: input_batch,
                self._net_target: target_batch
            }
            sess.run(self._train_step, feed_dict=training_feed_dict)
            # Summarize on the last update of every `summary_every`-th epoch
            if (e_num + 1) % summary_every == 0 and u_num == updates_per_epoch - 1:
                training_loss_value = sess.run(
                    self._loss_function, feed_dict=training_feed_dict)
                training_loss_values.append(training_loss_value)
                evaluation_loss_value = sess.run(
                    self._loss_function, feed_dict=evaluation_feed_dict)
                evaluation_loss_values.append(evaluation_loss_value)
                print('\nEpoch #: {} -> Training Loss: {}, Evaluation Loss: {}'
                      .format(e_num + 1, str(training_loss_value),
                              str(evaluation_loss_value)))
                if log_writer is not None:
                    self._write_training_logs(log_writer, e_num,
                                              training_feed_dict,
                                              evaluation_feed_dict)
                # Checkpoint only when both losses improve
                if self._save_file is not None and \
                        evaluation_loss_value < min_eval_loss and \
                        training_loss_value < min_train_loss:
                    min_train_loss = training_loss_value
                    min_eval_loss = evaluation_loss_value
                    self._saver.save(sess, self._save_file)
    print('\nMin training Loss: {}, Min evaluation Loss: {}'.format(
        str(min_train_loss), str(min_eval_loss)))
    return training_loss_values, evaluation_loss_values
def download_file(url: str,
                  destination: Path,
                  http_headers: Optional[dict] = None,
                  proxies: Optional[dict] = None,
                  show_progress: bool = True):
    req = request.Request(url)
    if http_headers:
        for name, value in http_headers.items():
            req.add_header(name, value)
    if proxies:
        # TODO: Should we only set the proxy associated with the URL scheme?
        # Should we raise an exception if there is not a proxy defined for
        # the URL scheme?
        # parsed = parse.urlparse(url)
        for proxy_type, proxy_url in proxies.items():  # don't shadow `url`
            req.set_proxy(proxy_url, proxy_type)
    rsp = request.urlopen(req)
    size_str = rsp.getheader("content-length")
    total_size = int(size_str) if size_str else None
    block_size = 16 * 1024
    if total_size and total_size < block_size:
        block_size = total_size
    LOG.debug("Downloading url %s to %s", url, str(destination))
    if show_progress and progress:
        progress_bar = progress(total=total_size,
                                unit="b",
                                unit_scale=True,
                                unit_divisor=1024,
                                desc=f"Localizing {destination.name}")

        def progress_reader():
            buf = rsp.read(block_size)
            if buf:
                # advance by the bytes actually read; the final block may be short
                progress_bar.update(len(buf))
            else:
                progress_bar.close()
            return buf

        reader = progress_reader
    else:
        reader = functools.partial(rsp.read, block_size)
    with open(destination, "wb") as out:
        while True:
            buf = reader()
            if not buf:
                break
            out.write(buf)
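# A hedged usage sketch for the urllib-based download_file above; the URL,
# header, and proxy values are all placeholders:
from pathlib import Path

download_file(
    "https://example.com/archive.tar.gz",
    Path("/tmp/archive.tar.gz"),
    http_headers={"User-Agent": "localizer/1.0"},
    proxies={"https": "proxy.example.com:8080"},
)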
def download_file(
    self, destination: Path, show_progress: bool = False, digests: Optional[dict] = None
):
    total_size = self.get_content_length()
    block_size = 16 * 1024
    if total_size and total_size < block_size:
        block_size = total_size
    if show_progress and progress:
        progress_bar = progress(
            total=total_size,
            unit="b",
            unit_scale=True,
            unit_divisor=1024,
            desc=f"Localizing {destination.name}"
        )

        def progress_reader():
            b = self.read(block_size)
            if b:
                # advance by the bytes actually read; the final block may be short
                progress_bar.update(len(b))
            else:
                progress_bar.close()
            return b

        reader = progress_reader
    else:
        reader = functools.partial(self.read, block_size)
    downloaded_size = 0
    with open(destination, "wb") as out:
        while True:
            buf = reader()
            if not buf:
                break
            downloaded_size += len(buf)
            out.write(buf)
    # only check the size when the server reported one
    if total_size is not None and downloaded_size != total_size:
        # TODO: test this
        raise AssertionError(
            f"Size of downloaded file {destination} does not match expected size "
            f"{total_size}"
        )
    if digests:
        verify_digests(destination, digests)
def get_urls_from_pagination(self, server, pagination):
    base_url = 'https://www.tripadvisor.de'
    links = []  # initialize once, outside the loop, so results accumulate
    for url in progress(pagination):
        response = server.get(url)
        response_in_lxml = bs(response.content, 'lxml')
        cities = response_in_lxml.find_all('ul', attrs={'class': 'geoList'})
        for city in cities:
            for li in city.find_all('li'):
                link = li.find('a', href=True)['href']
                links.append({
                    'link': base_url + link,
                })
    return links
def _load_subimages(self, set_type, satellite, bands, progress_desc='Loading subimages'):
    # List of places folders for the given satellite
    satellite_places = [os.path.join(place, satellite)
                        for place in self._list_places_folders(set_type)]
    subimage_count = None
    progress_bar = None
    subimages = None
    # One band at a time
    for b, band in enumerate(bands):
        # List of places folders for the given satellite and band
        band_places_folders = [os.path.join(satellite_place, str(band + 1))
                               for satellite_place in satellite_places]
        for i, band_place_folder in enumerate(band_places_folders):
            band_files = os.listdir(band_place_folder)
            # Sort numerically on the underscore-separated parts of the filename
            band_files.sort(key=lambda s: reduce(lambda x, y: y + x * 10e5,
                                                 [int(elem) for elem in s[:-4].split('_')]))
            band_place_files = [os.path.join(band_place_folder, file)
                                for file in band_files
                                if os.path.isfile(os.path.join(band_place_folder, file))]
            # Complete images amount if uncertain, follows the
            # "each folder has the same amount of files" convention
            if subimage_count is None:
                subimage_count = len(band_place_files) * len(satellite_places)
            # Initialize progress bar if uninitialized
            if progress_bar is None:
                progress_bar = progress(total=subimage_count * len(bands),
                                        desc=progress_desc, unit='imgs')
            for j, band_place_file in enumerate(band_place_files):
                band_image = imread(band_place_file)
                # Initialize 4-dimensional array if uninitialized
                if subimages is None:
                    image_width, image_height = band_image.shape
                    subimages = np.empty([subimage_count, image_height, image_width, len(bands)],
                                         dtype=np.uint16)
                # Indexes as follows: n images from place 0; n images from place 1;
                # ...; n images from place p, where n is the amount of images per
                # folder and p the amount of places
                index = j + (i * len(band_place_files))
                subimages[index, :, :, b] = band_image
                progress_bar.update()
    progress_bar.close()
    return subimages
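# Toy-sized check (made-up numbers) of the flat indexing convention used in
# _load_subimages: with n files per place folder and p places, images land at
# indices 0 .. n*p - 1, grouped by place:
n_files, n_places = 3, 2
indices = [j + (i * n_files) for i in range(n_places) for j in range(n_files)]
assert indices == [0, 1, 2, 3, 4, 5]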
def animate(title, salleTest, results, fig, axe, Nt, dt):
    '''
    This function creates an animation and adds all the required legends.
    Due to matplotlib limitations there are some trickled-down techniques
    to display a legend inside the code.
    '''
    # Redraws the screen once per frame
    def update(k, progress_bar):
        xs = []
        ys = []
        colors = []
        axe.clear()
        axe.set_xlim(0, salleTest.Lx)
        axe.set_ylim(0, salleTest.Ly)
        axe.set_title(str(round(k * dt, 2)))
        salleTest.afficher(fig, axe)
        for agent_data in results[k]:
            color, x, y = agent_data
            xs.append(x)
            ys.append(y)
            colors.append(color)
        axe.scatter(xs, ys, c=colors)
        progress_bar.update(1)  # one frame rendered, advance by one

    progress_bar = progress(range(Nt), desc="Video export")

    # Figure initialisation
    axe.set_xlim(0, salleTest.Lx)
    axe.set_ylim(0, salleTest.Ly)
    ani = animation.FuncAnimation(fig, update, fargs=(progress_bar,),
                                  frames=Nt, interval=dt * 1000,
                                  blit=False, repeat=True)
    return ani
def merge_csv_files(file_1, file_2, file_3):
    '''
    @parameters variant_call_file_csv, format_column_split_csv, final_merged_csv

    Using the variant_call_file_csv and format_column_split_csv, the two files
    are merged to create a final_merged_csv. The final csv will have the
    original VCF file format with the additional columns which were split out
    from the NORMAL and TUMOR columns.

    @return None
    '''
    csv.field_size_limit(10000000)
    with open(file_1, 'r') as csv_1, open(file_2, 'r') as csv_2, \
            open(file_3, 'w') as out_file:
        reader_1 = csv.reader(csv_1)
        reader_2 = csv.reader(csv_2)
        writer = csv.writer(out_file)
        # zip pairs rows positionally; both inputs are expected to be row-aligned
        for row_1, row_2 in progress(zip(reader_1, reader_2)):
            writer.writerow(row_1 + row_2)
def get_urls_from_pagination(self, server, pagination):
    links = []  # avoid shadowing the built-in `list`
    base_url = 'https://www.tripadvisor.de'
    for url in progress(pagination):
        parsed = 0
        response = server.get(url)
        response_in_lxml = bs(response.content, 'lxml')
        cities = response_in_lxml.find_all('ul', attrs={'class': 'geoList'})
        for city in cities:
            for li in city.find_all('li'):
                link = li.find('a', href=True)['href']
                city_name = li.find('a').text
                city_name = city_name.replace('Restaurants ', '')
                links.append({
                    "country": "Germany",
                    "link": base_url + link,
                    "city_name": city_name,
                    "parsed": parsed
                })
    return links
def copy_blob_as_github_suggested(blob_url, copied_blob):
    """Copy append blob as block blob, via the GitHub-suggested approach."""
    i = 0
    running = 0
    chunk_size = 10 * 10 * 10 * 10 * 10 * 1024
    # Upload empty file
    copied_blob.upload_blob(b"")
    # Get file size
    resp = requests.get(blob_url, stream=True)
    total_size = int(resp.headers.get("content-length", 0))
    prog = progress(total=total_size, unit="iB", unit_scale=True)
    # Stage one block per chunk
    for step in range(total_size, 0, -chunk_size):
        offset = total_size - step
        length = chunk_size
        if step < chunk_size:
            length = step
        copied_blob.stage_block_from_url(
            block_id=str(i + 1),  # block ids are strings
            source_url=blob_url,
            source_offset=offset,
            source_length=length,
        )
        running += length
        i += 1
        prog.update(length)
    # NOTE: this commits only the first staged block, as suggested upstream
    block_list = [BlobBlock(block_id="1")]
    copied_blob.commit_block_list(block_list)
    # committed, _ = copied_blob.get_block_list("all")
    prog.close()
def copy_blob_as_blocks(blob_url, copied_blob):
    """Copy blob as blocks."""
    # Get target size
    resp = requests.get(blob_url, stream=True)
    total_size = int(resp.headers.get("content-length", 0))
    chunk_size = 10 * 10 * 10 * 10 * 1024
    # Upload empty file
    copied_blob.upload_blob(b"")
    i = 0
    running = 0
    prog = progress(total=total_size, unit="iB", unit_scale=True)
    for step in range(total_size, 0, -chunk_size):
        offset = total_size - step
        length = chunk_size
        if step < chunk_size:
            length = step
        # this will only stage your block; it is committed below
        copied_blob.stage_block_from_url(
            block_id=str(i + 1),  # block ids are strings
            source_url=blob_url,
            source_offset=offset,
            source_length=length,
        )
        running += length
        i += 1
        prog.update(length)
    copied_blob.commit_block_list([BlobBlock(block_id=str(j + 1)) for j in range(i)])
    prog.close()
    committed, _ = copied_blob.get_block_list("all")
    assert total_size == running
    # compare bytes to bytes, not bytes to a block count
    assert total_size == sum(block.size for block in committed)
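# The descending range in the two block-copy functions above is easy to
# misread; a toy-sized check (placeholder numbers) that the offset/length
# pairs tile [0, total_size) exactly once, ending with a short final block:
total_size, chunk_size = 10, 4
pairs = []
for step in range(total_size, 0, -chunk_size):
    offset = total_size - step
    length = chunk_size if step >= chunk_size else step
    pairs.append((offset, length))
assert pairs == [(0, 4), (4, 4), (8, 2)]
assert sum(length for _, length in pairs) == total_size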
def sumSlicing(sum_list, len_years, verbose=False):
    '''This function generates all percentiles across a list.

    Args:
        sum_list (list): a list containing all the rainfall sum data.
        len_years (int): how many years to fit a gamma distribution
        verbose (bool, optional): whether to display a progress bar.
            Defaults to False.

    Returns:
        list: a list of percentiles fitted to a gamma distribution
    '''
    if verbose:
        # establish a nice progress bar
        pbar = progress(total=len(sum_list) - len_years, leave=False)
    leading_pointer = 0
    okazaki_pointer = len_years + 1
    percentile_list = []
    # iterate over every slice of the list that allows for adequate length
    while okazaki_pointer <= len(sum_list):
        data = sum_list[leading_pointer:okazaki_pointer]
        temp = percentile(data)
        percentile_list.append(temp)
        leading_pointer += 1
        okazaki_pointer += 1
        if verbose:
            pbar.update(1)  # update progress bar
    if verbose:
        pbar.close()
    return percentile_list
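# A toy check (made-up data) of sumSlicing's window bounds: each slice holds
# len_years + 1 sums, and the walk yields len(sum_list) - len_years windows:
sum_list, len_years = list(range(10)), 3
windows = [sum_list[i:i + len_years + 1]
           for i in range(len(sum_list) - len_years)]
assert len(windows) == len(sum_list) - len_years
assert all(len(w) == len_years + 1 for w in windows)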
def tokenize(texts, max_length, skip=-2, attr=LOWER, merge=False, nlp=None,
             **kwargs):
    """ Uses spaCy to quickly tokenize text and return an array of indices.

    This method stores a global NLP directory in memory, and takes up to a
    minute to run for the first time. Later calls will have the tokenizer in
    memory.

    Parameters
    ----------
    texts : list of unicode strings
        These are the input documents. There can be multiple sentences per
        item in the list.
    max_length : int
        This is the maximum number of words per document. If the document is
        shorter than this number it will be padded to this length.
    skip : int, optional
        Short documents will be padded with this variable up until max_length.
    attr : int, from spacy.attrs
        What to transform the token to. Choice must be in spacy.attrs, and
        common choices are (LOWER, LEMMA)
    merge : bool, optional
        Merge noun phrases into a single token. Useful for turning 'New York'
        into a single token.
    nlp : None
        A spaCy NLP object. Useful for not reinstantiating the object multiple
        times.
    kwargs : dict, optional
        Any further argument will be sent to the spaCy tokenizer. For extra
        speed consider setting tag=False, parse=False, entity=False, or
        n_threads=8.

    Returns
    -------
    arr : 2D array of ints
        Has shape (len(texts), max_length). Each value represents the word
        index.
    vocab : dict
        Keys are the word index, and values are the string. The pad index gets
        mapped to '<SKIP>'

    >>> sents = [u"Do you recall a class action lawsuit", u"hello zombo.com"]
    >>> arr, vocab = tokenize(sents, 10, merge=True)
    >>> arr.shape[0]
    2
    >>> arr.shape[1]
    10
    >>> w2i = {w: i for i, w in vocab.items()}
    >>> arr[0, 0] == w2i[u'do']  # First word and its index should match
    True
    >>> arr[0, 1] == w2i[u'you']
    True
    >>> arr[0, -1]  # last word in 0th document is a pad word
    -2
    >>> arr[0, 4] == w2i[u'class action lawsuit']  # noun phrase is tokenized
    True
    >>> arr[1, 1]  # The URL token is thrown out
    -2
    """
    if nlp is None:
        nlp = en.load()
    data = np.zeros((len(texts), max_length), dtype='int32')
    data[:] = skip
    bad_deps = ('amod', 'compound')
    vocab = {}
    word_to_index = {}  # reverse lookup: O(1) per token instead of O(len(vocab))
    index = 0
    for row, doc in progress(enumerate(nlp.pipe(texts, **kwargs))):
        if merge:
            # Merge noun phrases into single tokens
            for phrase in doc.noun_chunks:
                while len(phrase) > 1 and phrase[0].dep_ not in bad_deps:
                    phrase = phrase[1:]
                if len(phrase) > 1:
                    phrase.merge(phrase.root.tag_, phrase.text,
                                 phrase.root.ent_type_)
            # Merge named entities into single tokens
            for ent in doc.ents:
                if len(ent) > 1:
                    ent.merge(ent.root.tag_, ent.text, ent.label_)
        dat = doc.to_array([LOWER, LIKE_EMAIL, LIKE_URL]).astype("int32")
        for i, token in enumerate(doc):
            text = token.text.lower()
            if text not in word_to_index:
                word_to_index[text] = index
                vocab[index] = text
                index += 1
            dat[i][0] = word_to_index[text]
        if len(dat) > 0:
            msg = "Negative indices reserved for special tokens"
            assert dat.min() >= 0, msg
            # Replace email and URL tokens with the pad index
            idx = (dat[:, 1] > 0) | (dat[:, 2] > 0)
            dat[idx] = skip
            length = min(len(dat), max_length)
            data[row, :length] = dat[:length, 0].ravel()
    vocab[skip] = '<SKIP>'
    return data, vocab
# Hardware
conditioner = analog.HeterodyneMarkII()
magnet = hardware.Thing('canceling_magnet',
                        {'orientation': 'up',
                         'distance_from_base_mm': 25})
hw = hardware.Hardware(conditioner, magnet)
ri = hardware_tools.r2_with_mk2()
ri.set_dac_atten(40)
ri.set_fft_gain(4)
ri.set_modulation_output('high')

# Run
ncf = acquire.new_nc_file(suffix='sweep_stream')
tic = time.time()
try:
    for lo in progress(lo_MHz):
        state = hw.state()
        state['temperature'] = {'package': temps.get_temperature_at(time.time())}
        tone_banks = (lo + offsets_MHz)[:, np.newaxis]  # Transform to shape (num_offsets, 1)
        ri.set_lo(lomhz=lo, chan_spacing=round_to_MHz)
        sweep_array = acquire.run_sweep(ri, tone_banks=tone_banks,
                                        num_tone_samples=num_tone_samples,
                                        length_seconds=sweep_length_seconds)
        single_sweep = sweep_array[0]
        f0_MHz = 1e-6 * single_sweep.resonator.f_0
        ri.set_tone_freqs(np.array([f0_MHz]), nsamp=num_tone_samples)
        ri.select_fft_bins(np.array([0]))
        stream_array = ri.get_measurement(num_seconds=stream_length_seconds)
        single_stream = stream_array[0]
        sweep_stream = basic.SingleSweepStream(sweep=single_sweep,
                                               stream=single_stream,
                                               state=state,
                                               description='f_0 = {:.1f}'.format(f0_MHz))
        ncf.write(sweep_stream)
finally:
    # Always close the netCDF file, even if the acquisition is interrupted
    ncf.close()
    print("Wrote {}".format(ncf.root_path))
def body(sum_list, cmd_args):
    # calculate percentiles
    rainfall_percentiles = [sumSlicing(rainfall_sum, cmd_args.len_years, cmd_args.verbose)
                            for rainfall_sum in progress(sum_list, desc='Calculating Percentiles')]
    if cmd_args.verbose or __name__ == '__main__':
        # print out year range
        _, columns = os.popen('stty size', 'r').read().split()
        fancy_sep = ['-' for _ in range(int(columns))]
        print(''.join(fancy_sep))  # allow for some eyeball breathing room
        print(f'This program calculated {len(rainfall_percentiles[0])} years worth of percentiles.\n'
              f'The list stored in "Rainfall Percentiles" represents data beginning in the year '
              f'{1950 + cmd_args.len_years}.\n'
              f'This is assuming that the first precip file contains data from the year 1950.')
    return rainfall_percentiles
def main(args):
    '''
    Main Control Flow

    Note that the actual steps run are configured in the YAML input!
    This allows you to e.g. skip previously run steps.
    '''
    if not confirm_google_vm():
        print('This job needs to run on a Google Cloud Compute Engine to '
              'avoid storage egress charges [EXITING]')
        return

    if len(args) != 2:
        print(" ")
        print(" Usage : {} <configuration_yaml>".format(args[0]))
        return

    print("job started")

    with open(args[1], mode='r') as yaml_file:
        params, steps = load_config(yaml_file.read())

    # Check the YAML load before reading any keys out of it
    if params is None:
        print("Bad YAML load")
        return

    program_name = params['PROGRAM_NAME']
    fs = gcsfs.GCSFileSystem(token='google_default')
    legacy_tag = params['LEGACY_TAG']
    max_workers = params['MAX_WORKERS']
    add_normal_col = params['NORMAL_COL']

    # Directory to send each intermediary file to
    home = expanduser('~')
    variant_call_file_csv = f"{home}/NextGenETL/intermediateFiles/{params['PARSED_VARIANT_CALL_FILE']}"
    format_column_split_csv = f"{home}/NextGenETL/intermediateFiles/{params['FORMAT_COLUMN_SPLIT_FILE']}"
    final_merged_csv = f"{home}/NextGenETL/intermediateFiles/{params['FINAL_MERGED_CSV']}"
    format_information_file = f"{home}/NextGenETL/intermediateFiles/{params['FORMAT_INFO_FILE']}"
    dataframe_information_file = f"{home}/NextGenETL/intermediateFiles/{params['DATAFRAME_INFO_FILE']}"

    # Google Cloud Storage bucket path
    bucket_path = params['BUCKET_PATH']

    # Schemas
    schema_path = f"{home}/NextGenETL/intermediateFiles/{program_name.lower()}_simple_build_schema.json"

    # Staging table info for staging env
    staging_project = params['STAGING_PROJECT']
    staging_dataset_id = params['STAGING_DATASET_ID']
    staging_table_id = params['STAGING_TABLE_ID']
    scratch_full_table_id = f'{staging_project}.{staging_dataset_id}.{staging_table_id}'

    # Publish table info for production env
    publish_project = params['PUBLISH_PROJECT']
    publish_dataset_id = params['PUBLISH_DATASET_ID']
    publish_table_id = params['PUBLISH_TABLE_ID']
    schema_with_desc = schema_with_description(params['SCHEMA_WITH_DESCRIPTION'])

    # Path to labels, description, and friendly name
    labels_and_desc = params['LABEL_DESCRIPTION_FREINDLYNAME']

    if 'extract_metadata_table' in steps:
        print('* Extracting Meta-Data Table from Google BigQuery!')
        file_urls, project_short_names, file_names, analysis_workflow_types, \
            case_barcodes, entity_ids = query_for_table(
                params['FILEDATA_ACTIVE'], params['GDCID_TO_GCSURL'],
                params['ALIQUOT_TO_CASEID'], program_name)
        print(f'Number of files to be processed: {len(file_urls)}')
        print(f'Number of projects in the program, {program_name}: '
              f'{len(set(project_short_names))}')
        print(f'Number of workflow types for the program, {program_name}: '
              f'{len(set(analysis_workflow_types))}')
        pbar = progress(total=len(file_urls))
        file_urls = iter(file_urls)
        project_short_names = iter(project_short_names)
        file_names = iter(file_names)
        analysis_workflow_types = iter(analysis_workflow_types)
        case_barcodes = iter(case_barcodes)
        entity_ids = iter(entity_ids)

    if 'transform_vcf' in steps:
        print('* Transforming and Parsing the VCF Files!')
        # Create empty files to hold the concatenated VCF dataframes
        with open(variant_call_file_csv, 'w') as out_file:
            pass
        with open(format_information_file, 'w') as format_out:
            pass
        with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
            add_header = True
            # Process the first file synchronously so the header is written once
            start_process(next(file_urls), next(project_short_names),
                          next(file_names), next(analysis_workflow_types),
                          next(case_barcodes), next(entity_ids), fs,
                          variant_call_file_csv, legacy_tag, add_normal_col,
                          format_information_file, add_header=add_header)
            pbar.update()
            running = set()
            # Seed the pool with one task per worker
            for _, a_file, project_short_name, file_name, analysis_workflow_type, \
                    case_barcode, entity_id in zip(
                        range(max_workers), file_urls, project_short_names,
                        file_names, analysis_workflow_types, case_barcodes,
                        entity_ids):
                running.add(
                    executor.submit(start_process, a_file, project_short_name,
                                    file_name, analysis_workflow_type,
                                    case_barcode, entity_id, fs,
                                    variant_call_file_csv, legacy_tag,
                                    add_normal_col, format_information_file,
                                    add_header=False))
            # Refill the pool as tasks complete
            while running:
                done, running = concurrent.futures.wait(
                    running, return_when=concurrent.futures.FIRST_COMPLETED)
                for _ in done:
                    pbar.update()
                for _, a_file, project_short_name, file_name, analysis_workflow_type, \
                        case_barcode, entity_id in zip(
                            range(len(done)), file_urls, project_short_names,
                            file_names, analysis_workflow_types, case_barcodes,
                            entity_ids):
                    running.add(
                        executor.submit(start_process, a_file, project_short_name,
                                        file_name, analysis_workflow_type,
                                        case_barcode, entity_id, fs,
                                        variant_call_file_csv, legacy_tag,
                                        add_normal_col, format_information_file,
                                        add_header=False))

    if 'create_new_columns' in steps:
        print('* Creating New Columns!')
        create_new_columns(variant_call_file_csv, format_column_split_csv)

    if 'merge_csv_files' in steps:
        print('* Merging CSV Files!')
        merge_csv_files(variant_call_file_csv, format_column_split_csv,
                        final_merged_csv)

    if 'build_a_simple_schema' in steps:
        print('* Generating a Simple Schema!')
        simple_schema_builder(program_name, final_merged_csv,
                              dataframe_information_file, home)

    if 'push_csv_to_bucket' in steps:
        print('* Pushing CSV File to Bucket!')
        push_file_to_bucket(final_merged_csv, bucket_path)

    if 'load_to_staging_environment' in steps:
        print('* Loading a Table in to a Staging Environment!')
        load_to_staging_env(staging_dataset_id, staging_table_id, bucket_path,
                            schema_path)

    if 'load_to_production_environment' in steps:
        print('* Loading a Table in to a Production Environment!')
        load_to_production_env(publish_project, publish_dataset_id,
                               publish_table_id, schema_with_desc,
                               scratch_full_table_id, labels_and_desc)
def create_new_columns(file_1, file_2):
    '''
    @parameters file_1, file_2

    This function will take a csv file with the flattened VCF data and parse
    out the 'FORMAT' column. The values keyed by the FORMAT column will be
    transformed into new columns of their own and be written out to a csv
    file.

    @return None
    '''
    csv.field_size_limit(10000000)
    # First pass: collect every field name that appears in the FORMAT column
    with open(file_1) as file_in:
        reader = csv.reader(file_in)
        header = next(reader)
        format_column_index = header.index('FORMAT')
        column_names = set()
        for row in progress(reader):
            cell_information = row[format_column_index]
            column_names.update(cell_information.split(':'))
    column_names = list(column_names)
    num_cols = len(column_names)
    # Second pass: split the NORMAL/TUMOR values into one column per FORMAT field
    with open(file_1) as file_in:
        with open(file_2, 'w') as file_out:
            reader = csv.reader(file_in)
            writer = csv.writer(file_out)
            header = next(reader)
            if 'NORMAL' in header:
                format_column_index = header.index('FORMAT')
                normal_column_index = header.index('NORMAL')
                tumor_column_index = header.index('TUMOR')
                writer.writerow([f'{name}_Normal' for name in column_names] +
                                [f'{name}_Tumor' for name in column_names])
                for row in progress(reader):
                    columns = row[format_column_index].split(':')
                    tumor_col_values = row[tumor_column_index].split(':')
                    column_indicies = [
                        column_names.index(column) for column in columns
                    ]
                    row_out = [''] * (num_cols * 2)
                    if row[normal_column_index] != '':
                        normal_col_values = row[normal_column_index].split(':')
                        for column_index, normal_value, tumor_value in zip(
                                column_indicies, normal_col_values,
                                tumor_col_values):
                            row_out[column_index] = normal_value
                            row_out[column_index + num_cols] = tumor_value
                    else:
                        for column_index, tumor_value in zip(
                                column_indicies, tumor_col_values):
                            row_out[column_index + num_cols] = tumor_value
                    writer.writerow(row_out)
            else:
                format_column_index = header.index('FORMAT')
                tumor_column_index = header.index('TUMOR')
                writer.writerow([f'{name}_Tumor' for name in column_names])
                for row in progress(reader):
                    columns = row[format_column_index].split(':')
                    tumor_col_values = row[tumor_column_index].split(':')
                    column_indicies = [
                        column_names.index(column) for column in columns
                    ]
                    row_out = [''] * num_cols
                    for column_index, tumor_value in zip(
                            column_indicies, tumor_col_values):
                        row_out[column_index] = tumor_value
                    writer.writerow(row_out)
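# A small worked example (made-up VCF-style values) of the FORMAT split that
# create_new_columns performs for a single TUMOR cell:
column_names = ['GT', 'DP', 'AF']
fmt, tumor = 'GT:AF', '0/1:0.12'
row_out = [''] * len(column_names)
for name, value in zip(fmt.split(':'), tumor.split(':')):
    row_out[column_names.index(name)] = value
assert row_out == ['0/1', '', '0.12']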
def body(cmd_args):
    '''This function runs the main functionality

    Args:
        cmd_args (argparse.Namespace): an argparse namespace

    Returns:
        GeoDataFrame: a GeoPandas GeoDataFrame with all of the rainfall sums
            included.
    '''
    # parse month range
    month_range = fp.cropCalendarParser(cmd_args.unit_code)
    month_range = [int(month) for month in month_range]
    # get precip data
    precip_data = importPrecipData(month_range, windows=cmd_args.windows,
                                   testing=cmd_args.testing)
    # get geodata
    st_coords = fp.precipFileParser('./resources/precip_data/precip.1977',
                                    [4, 8], return_coords=True)
    gdf = fp.shapeFileParser(cmd_args.shapefile_path, st_coords, cmd_args,
                             testing=cmd_args.testing)
    # generate rainfall totals
    station_indices = gdf['Station Indices'].tolist()
    rainfall_totals = [generateRainFallSums(index_list, data)
                       for index_list, data in progress(
                           zip(station_indices, itertools.repeat(precip_data)),
                           total=len(station_indices),
                           desc='Calculating rainfall sums')]
    gdf['Rainfall Totals'] = rainfall_totals
    # print out needed calculation stats
    station_lengths = [len(lst) for lst in station_indices]  # how many stations were captured
    _, columns = os.popen('stty size', 'r').read().split()
    fancy_sep = ['-' for _ in range(int(columns))]
    print(''.join(fancy_sep))  # allow for some eyeball breathing room
    print(f'The average number of captured stations was '
          f'{round(statistics.mean(station_lengths), 2)}')
    if 0 in station_lengths:
        # warn if any location didn't capture data
        cprint('::ATTENTION::', 'red', attrs=['reverse', 'blink'])
        print(f'{station_lengths.count(0)}/{len(station_lengths)} locations did not '
              f'capture a single precip station. This will *likely* be addressed '
              f'in the final csv file.')
    else:
        print('Every location captured at least one precip station.')
    return gdf
def upload(request):
    wait = 0  # Sleeper for CRUD during upload
    accountids = []  # Holds current document "unique=True" fields
    added = 0
    if 'GET' == request.method:
        memberdata = Conflict.objects.all()
        context = {'memberdata': memberdata}
        return render(request, 'upload.html', context)
    try:
        # First condition: make sure there is a file
        csv_file = request.FILES["csv_file"]
        if len(csv_file) == 0:
            messages.error(request, 'Empty File')
            return render(request, 'upload.html')
        # Second condition: make sure the file is a CSV file
        if not csv_file.name.endswith('.csv'):
            messages.error(request, 'File is not CSV type')
            return render(request, 'upload.html')
        # Prepare data for parsing
        file_data = csv_file.read().decode("utf-8")
        lines = file_data.split("\n")
        # Begin loop through CSV file lines
        for index, line in progress(enumerate(lines)):
            fields = line.split(",")
            if index == 0:
                # Third condition: check that the top-row CSV fields are as expected
                if (fields[0] == 'first_name') and (fields[1] == 'last_name') and (
                        fields[2] == 'phone_number') and (
                        fields[3] == 'client_member_id') and (fields[4] == 'account_id'):
                    pass
                else:
                    # Throw an error if the document headers don't match
                    messages.error(request, 'File does not have the correct headers')
                    return render(request, 'upload.html')
                continue  # header row handled; move on to the data rows
            # Save as member if not in database, otherwise save as conflict
            if (len(fields[0]) != 0) and (len(fields[1]) != 0) and (
                    len(fields[2]) != 0) and (len(fields[3]) != 0):
                # Check if current data appeared earlier in this document
                duplicate = False
                if fields[3] in accountids:
                    duplicate = True
                if fields[2] in accountids:
                    duplicate = True
                if fields[2] == 'phone_number':
                    duplicate = True
                if duplicate == False:
                    # Save to database if it doesn't already exist
                    try:
                        data = Member(
                            first=fields[0],
                            last=fields[1],
                            telephone=fields[2],
                            clientid=fields[3],
                            accountid=fields[4],
                        )
                        # Tracking system for current document items
                        accountids.append(fields[3])
                        accountids.append(fields[2])
                        added += 1
                        data.save()
                        # Sleeper allows for CRUD operations during upload.
                        # A more robust version would use multiprocessing.
                        if wait == 5:
                            time.sleep(.2)
                            wait = 0
                        else:
                            wait += 1
                    except:
                        if wait == 5:
                            time.sleep(.2)
                            wait = 0
                        else:
                            wait += 1
                elif duplicate == True:
                    # If the person is already in the system, sort the row into
                    # the "conflicts" table for further handling rather than
                    # allowing a duplicate.
                    conf = Conflict(
                        first=fields[0],
                        last=fields[1],
                        telephone=fields[2],
                        clientid=fields[3],
                        accountid=fields[4],
                    )
                    conf.save()
        # Message at top of screen once members are added
        messages.success(request, "Successfully Uploaded CSV File")
        return redirect('/upload')
    except Exception as e:
        # If one of the conditions wasn't met
        messages.error(request, "Unable to upload file. " + str(e))
        return redirect('/upload')
conditioner = analog.HeterodyneMarkII()
magnet = hardware.Thing('canceling_magnet', {
    'orientation': 'up',
    'distance_from_base_mm': 25
})
hw = hardware.Hardware(conditioner, magnet)
ri = hardware_tools.r2_with_mk2()
ri.set_dac_atten(40)
ri.set_fft_gain(4)
ri.set_modulation_output('high')

# Run
ncf = acquire.new_nc_file(suffix='sweep')
tic = time.time()
try:
    for lo in progress(lo_MHz):
        state = hw.state()
        state['temperature'] = {
            'package': temps.get_temperature_at(time.time())
        }
        tone_banks = np.array([np.array([f]) for f in lo + offsets_MHz])
        ri.set_lo(lomhz=lo, chan_spacing=round_to_MHz)
        sweep = acquire.run_sweep(ri, tone_banks=tone_banks,
                                  num_tone_samples=num_tone_samples,
                                  length_seconds=length_seconds,
                                  state=state)
        ncf.write(sweep)
finally:
    ncf.close()
    print("Wrote {}".format(ncf.root_path))
def _load_normalized_signatures(self, signatures_directory, num_of_classes,
                                signatures_per_class):
    images_have_been_normalized = True
    model_has_been_created = True
    self._signatures = []
    self._signatures_features = []
    progress_value = progress(total=num_of_classes * signatures_per_class,
                              desc='Loading signatures', unit=' images')
    if not os.path.exists(signatures_directory + '_normalized'):
        os.mkdir(signatures_directory + '_normalized')
        images_have_been_normalized = False
    if not os.path.exists(signatures_directory + '_normalized_model'):
        os.mkdir(signatures_directory + '_normalized_model')
        model_has_been_created = False
    for class_folder_index in range(1, num_of_classes + 1):
        # Zero-pad single-digit class folder numbers, e.g. 7 -> '07'
        class_folder_number = '0'
        if class_folder_index < 10:
            class_folder_number += str(class_folder_index)
        else:
            class_folder_number = str(class_folder_index)
        self._signatures_features.append([])
        self._signatures.append([])
        if not images_have_been_normalized:
            os.mkdir(signatures_directory + '_normalized' + '/p' + class_folder_number)
        if not model_has_been_created:
            os.mkdir(signatures_directory + '_normalized_model' + '/p' + class_folder_number)
        for signature_file_index in range(1, signatures_per_class + 1):
            # Zero-pad single-digit signature file numbers as well
            signature_file = '0'
            if signature_file_index < 10:
                signature_file += str(signature_file_index)
            else:
                signature_file = str(signature_file_index)
            if not model_has_been_created:
                if not images_have_been_normalized:
                    loaded_image = Image.open(signatures_directory + '/p' +
                                              class_folder_number + '/p' +
                                              class_folder_number + 's' +
                                              signature_file + '.png')
                    loaded_image = loaded_image.resize((250, 250), Image.ANTIALIAS)
                    loaded_image = loaded_image.convert('L')
                    loaded_image.save(signatures_directory + '_normalized' + '/p' +
                                      class_folder_number + '/p' +
                                      class_folder_number + 's' +
                                      signature_file + '.png')
                    signature_image = loaded_image
                else:
                    signature_image = Image.open(signatures_directory + '_normalized' +
                                                 '/p' + class_folder_number + '/p' +
                                                 class_folder_number + 's' +
                                                 signature_file + '.png')
                signature_model = self._extract_signature_features_vector(
                    np.asarray(signature_image))
                self._signatures_features[class_folder_index - 1].append(signature_model)
                self._signatures[class_folder_index - 1].append(signature_image)
                signature_model_file = open(
                    signatures_directory + '_normalized_model' + '/p' +
                    class_folder_number + '/p' + class_folder_number + 's' +
                    signature_file + '.csv', "+a")
                for element in signature_model:
                    signature_model_file.write("%f\n" % element)
                signature_model_file.close()  # flush the feature vector to disk
            else:
                # Read the cached feature vector back, closing the file afterwards
                with open(signatures_directory + '_normalized_model' + '/p' +
                          class_folder_number + '/p' + class_folder_number + 's' +
                          signature_file + '.csv', "r") as model_file:
                    signature_model_lines = model_file.readlines()
                signature_model = []
                for line in signature_model_lines:
                    signature_model.append(float(line))
                self._signatures_features[class_folder_index - 1].append(signature_model)
            progress_value.update()