def _get_single_df(
    stream: IO, filetype: Optional[TypeEnum], **kwargs
) -> Union[pd.DataFrame, Iterable[pd.DataFrame]]:
    """
    Read a stream and retrieve the data frame or data frame generator (chunks).

    It uses `stream.name` (the path to a local, often temporary, file) rather
    than the stream itself, so the read does not close the stream; the stream
    is closed explicitly at the end of the method.
    """
    if filetype is None:
        filetype = TypeEnum(detect_type(stream.name))

    # Check encoding
    encoding = kwargs.get('encoding')
    if not validate_encoding(stream.name, encoding):
        encoding = detect_encoding(stream.name)
    kwargs['encoding'] = encoding

    # Check separator for CSV files if it's not set
    if filetype is TypeEnum.CSV and 'sep' not in kwargs:
        if not validate_sep(stream.name, encoding=encoding):
            kwargs['sep'] = detect_sep(stream.name, encoding)

    pd_read = getattr(pd, f'read_{filetype}')
    try:
        df = pd_read(stream.name, **kwargs)
    finally:
        stream.close()

    # In case of sheets, the df can be a dictionary
    if kwargs.get('sheet_name', NOTSET) is None:
        for sheet_name, _df in df.items():
            _df['__sheet__'] = sheet_name
        df = pd.concat(df.values(), sort=False)

    return df
def display(results: Results, json_file: IO):
    if json_file:
        # Write to the file
        json.dump(
            {
                "successful_requests": results.successful_requests(),
                "slowest": results.slowest(),
                "fastest": results.fastest(),
                "total_time": results.total_time,
                "Requests Per Minute": results.requests_per_minute(),
                "Requests Per Second": results.requests_per_second(),
            },
            json_file,
        )
        json_file.close()
        print("... Done!")
    else:
        # Print to Screen
        print("... Done!")
        print("--- Results ---")
        print(f"Successful requests\t{results.successful_requests()}")
        print(f"Slowest            \t{results.slowest()}")
        print(f"Fastest            \t{results.fastest()}")
        print(f"Average            \t{results.average_time()}")
        print(f"Total Time         \t{results.total_time}")
        print(f"Requests Per Minute\t{results.requests_per_minute()}")
        print(f"Requests Per Second\t{results.requests_per_second()}")
def out_write_bytes(wfile: IO, payload, close: bool = True) -> None:
    if not is_bytes(payload):
        raise ValueError("Unexpected type: " + str(payload.__class__) + ". Expected bytes-like.")
    wfile.write(payload)
    if close:
        wfile.close()
def IJsonIterator(buffer: IO) -> Iterator[Any]:
    """Takes a file-like object with a json array, and yields elements from
    that array. Provided buffer will be automatically closed."""
    try:
        yield from ijson.items(buffer, "item", use_float=True)  # type: ignore
    finally:
        buffer.close()
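# A minimal usage sketch for IJsonIterator, not from the original source: it
# assumes a file named "records.json" (name invented for the example) holding
# a top-level JSON array, opened in binary mode as ijson expects. The caller
# never closes the file; the generator's finally block does it.
def _demo_ijson_iterator() -> None:
    for element in IJsonIterator(open("records.json", "rb")):
        print(element)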
def write(self, evaluation_results: EvaluationResults, buffer: typing.IO = None,
          include_specification: bool = None, **kwargs):
    if self.destination is None and buffer is None:
        raise ValueError("A buffer must be passed in if no destination is declared")

    data_to_write: typing.Dict[str, typing.Hashable] = self._results_to_dictionary(
        evaluation_results, include_specification
    )

    indent = kwargs.get("indent", 4)
    buffer_was_created_here = buffer is None

    try:
        if buffer is None:
            buffer = open(self.destination, 'w')
        json.dump(data_to_write, buffer, indent=indent)
    finally:
        if buffer_was_created_here and buffer is not None:
            buffer.close()
def unsmarten(in_stream: IO) -> str:
    """Actual conversion function"""
    in_text = in_stream.readlines()
    in_stream.close()
    out_text = ''
    for line in in_text:
        out_text += unsmarten_line(line)
    return out_text
def _load_config(self, config_file: IO) -> Config:
    """Load the application configuration."""
    try:
        config = load_config(config_file, self.logger)
    except (InvalidMetricType, ConfigError) as error:
        raise ErrorExitMessage(str(error))
    finally:
        config_file.close()
    return config
def svcErrorReader(self, err: IO, queue, logDir: str):
    os.makedirs(logDir, exist_ok=True)
    logFile = os.path.join(logDir, 'stderr.log')
    fErr = open(logFile, 'wb')
    for line in iter(err.readline, b''):
        fErr.write(line)
        Logging.info("TDengine STDERR: {}".format(line))
    Logging.info("EOF for TDengine STDERR: {}".format(self))
    err.close()
    fErr.close()
def in_read(rfile: IO, content_length=None, close: bool = True):
    if content_length is None:
        content = rfile.read()
    else:
        content = rfile.read(content_length)
    # NOTE: assumes a logger that provides a trace() level (not in the stdlib logging module).
    logging.trace("Finished reading %d bytes from input stream.", len(content))
    if close:
        rfile.close()
    return content
def fromINI(projectId: int, fileObj: IO, metadata: Dict, original_path: str = None) -> TileServer:
    """
    :param projectId: int
    :param fileObj: file descriptor
    :param metadata: Dict of <key, val> pairs
    :param original_path: str path of original file location
    :return: TileServer
    """
    config = configparser.ConfigParser(allow_no_value=True)
    config.read_string(fileObj.read().decode('utf-8'))

    tile_server_data = {}
    tile_server_data['tileOptions'] = {}
    tile_server_data['uiOptions'] = {}

    general_config = config['general']
    tile_server_data['name'] = general_config.get('id', '')
    tile_server_data['type'] = general_config.get('type', '').lower()

    if config.has_section('license'):
        attribution = ''
        for key in config['license']:
            attribution += config['license'].get(key, '')
        tile_server_data['attribution'] = attribution
    else:
        tile_server_data['attribution'] = ''

    if tile_server_data['type'] == 'tms':
        tms_config = config['tms']
        tile_server_data['url'] = tms_config.get('url', fallback='')
        tile_server_data['tileOptions']['maxZoom'] = tms_config.getint('zmax', fallback=19)
        tile_server_data['tileOptions']['minZoom'] = tms_config.getint('zmin', fallback=0)
    elif tile_server_data['type'] == 'wms':
        wms_config = config['wms']
        tile_server_data['url'] = wms_config.get('url', fallback='')
        tile_server_data['tileOptions']['layers'] = wms_config.get('layers', fallback='')
        tile_server_data['tileOptions']['params'] = wms_config.get('params', fallback='')
        tile_server_data['tileOptions']['format'] = wms_config.get('format', fallback='')

    tile_server_data['uiOptions']['isActive'] = True
    tile_server_data['uiOptions']['opacity'] = 1

    fileObj.close()
    return FeaturesService.addTileServer(projectId, tile_server_data)
def _save_chain(chain_pem: bytes, chain_file: IO) -> None:
    """Saves chain_pem to the provided chain_file.

    :param bytes chain_pem: certificate chain in PEM format
    :param IO chain_file: chain file object

    """
    try:
        chain_file.write(chain_pem)
    finally:
        chain_file.close()
def stream(reader: IO, writer: IO, chunksize=1024, stoplen=0, close=False):
    """Low-level utility function to stream all of `reader`'s contents into
    `writer` chunk-by-chunk."""
    while True:
        data = reader.read(chunksize)  # Read a lil
        if len(data) <= stoplen:       # End of stream
            break
        writer.write(data)             # Write a lil
    if close:
        reader.close()
        writer.close()
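# A minimal sketch of how `stream` above might be used; the paths
# "input.bin" and "copy.bin" exist only for illustration. With close=True both
# handles are closed by `stream` itself, otherwise the caller keeps that
# responsibility.
def _demo_stream_copy() -> None:
    src = open("input.bin", "rb")
    dst = open("copy.bin", "wb")
    stream(src, dst, chunksize=4096, close=True)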
def try_finally_io_close(f: IO):
    try:
        if f is None:
            pass  # nothing to close
        elif hasattr(f, "close"):
            logger.info("try close {0}".format(f.name))
            f.close()
            logger.info("closed {0}".format(f.name))
        else:
            logger.warning('Not IO')
    except BaseException as ex:
        logger.error("cannot close {0} : {1}".format(f, ex))
        logger.error(ex, exc_info=True)
def embed_file(self,
               input_file: IO,
               output_file_path: str,
               output_format: str = "all",
               batch_size: int = DEFAULT_BATCH_SIZE) -> None:
    """
    Computes ELMo embeddings from an input_file where each line contains a sentence
    tokenized by whitespace. The ELMo embeddings are written out in HDF5 format, where
    each sentence is saved in a dataset.

    Parameters
    ----------
    input_file : ``IO``, required
        A file with one tokenized sentence per line.
    output_file_path : ``str``, required
        A path to the output hdf5 file.
    output_format : ``str``, optional, (default = "all")
        The embeddings to output. Must be one of "all", "top", or "average".
    batch_size : ``int``, optional, (default = 64)
        The number of sentences to process in ELMo at one time.
    """
    assert output_format in ["all", "top", "average"]

    # Tokenizes the sentences.
    sentences = [line.strip() for line in input_file if line.strip()]
    split_sentences = [sentence.split() for sentence in sentences]
    # Uses the sentence as the key.
    embedded_sentences = zip(sentences, self.embed_sentences(split_sentences, batch_size))

    logger.info("Processing sentences.")
    with h5py.File(output_file_path, 'w') as fout:
        for key, embeddings in Tqdm.tqdm(embedded_sentences):
            if key in fout.keys():
                logger.warning(f"Key already exists in {output_file_path}, skipping: {key}")
            else:
                if output_format == "all":
                    output = embeddings
                elif output_format == "top":
                    output = embeddings[2]
                elif output_format == "average":
                    output = numpy.average(embeddings, axis=0)

                fout.create_dataset(key, output.shape, dtype='float32', data=output)
    input_file.close()
def close_python_file(python_file: IO):
    """
    Closes the file, first closing the object written inside it.

    Args:
        python_file {IO}: open output file being written as oui.py
    """
    python_file.write("}")

    # close file
    python_file.close()

    # write update to console
    print("\noui.py updated")
def fromImage(projectId: int, fileObj: IO, metadata: Dict, original_path: str = None) -> Feature:
    """
    Create a Point feature from a georeferenced image
    :param projectId: int
    :param fileObj: file
    :param metadata: dict
    :return: Feature
    """
    imdata = ImageService.processImage(fileObj)
    point = Point(imdata.coordinates)
    f = Feature()
    f.project_id = projectId
    f.the_geom = from_shape(point, srid=4326)
    f.properties = metadata

    asset_uuid = uuid.uuid4()
    base_filepath = make_project_asset_dir(projectId)
    asset_path = os.path.join(base_filepath, str(asset_uuid) + '.jpeg')
    fa = FeatureAsset(
        uuid=asset_uuid,
        asset_type="image",
        original_path=original_path,
        display_path=original_path,
        path=get_asset_relative_path(asset_path),
        feature=f,
    )
    f.assets.append(fa)
    thumbnail_path = os.path.join(base_filepath, str(asset_uuid) + ".thumb.jpeg")
    resized_image_path = os.path.join(base_filepath, str(asset_uuid) + '.jpeg')
    try:
        imdata.thumb.save(thumbnail_path, "JPEG")
        imdata.resized.save(resized_image_path, "JPEG")
    except:
        if os.path.exists(thumbnail_path):
            os.remove(thumbnail_path)
        if os.path.exists(resized_image_path):
            os.remove(resized_image_path)
        raise
    finally:
        fileObj.close()
    db_session.add(f)
    db_session.commit()
    return f
def check_stream(
        stream: typing.IO,
        check_seekable: typing.Optional[bool] = None,
        check_position: typing.Optional[int] = None,
        reset_position: typing.Optional[bool] = False,
        check_content: typing.Optional[typing.AnyStr] = None,
        check_closeable: typing.Optional[bool] = True) -> None:
    # check input is not None
    assert stream is not None, "provided `stream` is None"

    # check input is a stream
    assert hasattr(stream, "seekable"), \
        "provided `stream` does not appear file-like"

    # check_seekable
    if check_seekable is not None:
        assert stream.seekable() == check_seekable, \
            "value of stream.seekable() is not as expected"

    # check_position
    if check_position is not None:
        if hasattr(stream, "tell"):
            assert stream.tell() == check_position, "position not as expected"
        elif hasattr(stream, "seek"):
            # stream.seek(0, 1) is an alternate to stream.tell()
            assert stream.seek(0, 1) == check_position, "position not as expected"

    # reset_position
    if reset_position is not None and reset_position:
        # the stream.seek(0) will fail if these don't pass
        assert hasattr(stream, "seekable")
        assert stream.seekable()
        # reset the stream's position
        stream.seek(0)

    # check_content
    if check_content is not None and check_content:
        content = stream.read()
        assert content == check_content, "stream content is not as expected"

    # check_closeable
    if check_closeable is not None and check_closeable:
        assert hasattr(stream, "closed") and not stream.closed
        assert hasattr(stream, "close")
        stream.close()
        assert stream.closed, "cannot close stream"
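# A small illustrative call to check_stream above; io.StringIO stands in for
# any file-like object and the expected content is made up for the example.
# With the default check_closeable=True the stream ends up closed.
def _demo_check_stream() -> None:
    import io
    buf = io.StringIO("hello")
    check_stream(buf, check_seekable=True, check_position=0,
                 reset_position=True, check_content="hello")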
def sort_by_column(file: IO, index_args: Tuple[str], with_header: bool = False) -> None:
    """Reads a CSV file, sorts it by a given column, prints the sorted rows"""
    reader = csv.reader(file)
    writer = csv.writer(sys.stdout)
    if with_header:
        writer.writerow(next(reader))
    for row in sorted(reader, key=lambda x: sort_multiple(x, index_args)):
        writer.writerow(row)
    file.close()
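# A hedged example of calling sort_by_column; "data.csv" and the column index
# are invented, and sort_multiple is assumed (as defined elsewhere in that
# module) to turn a row plus the index tuple into a sort key.
def _demo_sort_by_column() -> None:
    with open("data.csv", newline="") as f:
        sort_by_column(f, ("1",), with_header=True)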
def fromGeoJSON(projectId: int, fileObj: IO, metadata: Dict, original_path: str = None) -> List[Feature]:
    """
    :param projectId: int
    :param fileObj: file descriptor
    :param metadata: Dict of <key, val> pairs
    :param original_path: str path of original file location
    :return: List[Feature]
    """
    data = json.loads(fileObj.read())
    fileObj.close()
    return FeaturesService.addGeoJSON(projectId, data)
def write_file(archive: tarfile.TarFile, file: IO, name: str, dataset_dir: Union[Path, str]):
    if isinstance(dataset_dir, str):
        dataset_dir = Path(dataset_dir)
    assert dataset_dir.is_dir()
    inter_dir, short_file_name = get_stored_file_name(name)
    file_path = dataset_dir / inter_dir / short_file_name
    with file_path.open("wb") as out_file:
        out_file.write(file.read())
    file.close()
    archive.close()
def func(combinations: typing.Iterable[typing.Tuple[utils.models.Student, utils.models.Student]],
         pathfunc: typing.Callable[[utils.models.Student], pathlib.Path],
         record_file: typing.IO,
         cutoff: float = -1):
    codes = []
    max_sim = {}
    for s1, s2 in combinations:
        try:
            d, r = check_diff(pathfunc(s1), pathfunc(s2), False)
            if r > cutoff:
                for code in codes:
                    if s1 in code:
                        code.add(s2)
                        break
                else:
                    codes.append({s1, s2})
                print(s1, s2, end=' ')
                print(d, round(r, 2))
                record_file.write(f'{s1} {s2} {d} {round(r, 2)}\n')
            if s1 in max_sim:
                if max_sim[s1][1] < r:
                    max_sim[s1][0] = s2
                    max_sim[s1][1] = r
            else:
                max_sim[s1] = [s2, r]
            if s2 in max_sim:
                if max_sim[s2][1] < r:
                    max_sim[s2][0] = s1
                    max_sim[s2][1] = r
            else:
                max_sim[s2] = [s1, r]
        except FileNotFoundError as e:
            print(e)

    print()
    record_file.write('\n')
    for code in codes:
        print([s for s in code])
        record_file.write(f'{[s for s in code]}\n')

    print()
    record_file.write('\n')
    for s1, v in max_sim.items():
        s2, r = v
        print(s1, s2, f'{r:.2f}')
        record_file.write(f'{s1} {s2} {round(r, 2)}\n')
    record_file.close()
def svcOutputReader(self, out: IO, queue, logDir: str):
    '''
    The infinite routine that processes the STDOUT stream for the sub process being managed.

    :param out: the IO stream object used to fetch the data from
    :param queue: the queue where we dump the roughly parsed line-by-line data
    :param logDir: where we should dump a verbatim output file
    '''
    os.makedirs(logDir, exist_ok=True)
    logFile = os.path.join(logDir, 'stdout.log')
    fOut = open(logFile, 'wb')
    # Important Reference: https://stackoverflow.com/questions/375427/non-blocking-read-on-a-subprocess-pipe-in-python
    # print("This is the svcOutput Reader...")
    # for line in out :
    for line in iter(out.readline, b''):
        fOut.write(line)
        # print("Finished reading a line: {}".format(line))
        # print("Adding item to queue...")
        try:
            line = line.decode("utf-8").rstrip()
        except UnicodeError:
            print("\nNon-UTF8 server output: {}\n".format(line))

        # This might block, and then causing "out" buffer to block
        queue.put(line)
        self._printProgress("_i")

        if self._status.isStarting():  # we are starting, let's see if we have started
            if line.find(self.TD_READY_MSG) != -1:  # found
                Logging.info("Waiting for the service to become FULLY READY")
                time.sleep(1.0)  # wait for the server to truly start. TODO: remove this
                Logging.info("Service is now FULLY READY")  # TODO: more ID info here?
                self._status.set(Status.STATUS_RUNNING)

        # Trim the queue if necessary: TODO: try this 1 out of 10 times
        self._trimQueue(self.MAX_QUEUE_SIZE * 9 // 10)  # trim to 90% size

        if self._status.isStopping():  # TODO: use thread status instead
            # WAITING for stopping sub process to finish its output
            print("_w", end="", flush=True)
        # queue.put(line)

    # meaning sub process must have died
    Logging.info("EOF for TDengine STDOUT: {}".format(self))
    out.close()   # Close the stream
    fOut.close()  # Close the output file
def _log_queue_worker(stream: IO, line_queue: queue.Queue) -> None:
    """
    Worker function to run in a separate thread.
    Reads from 'stream', puts lines in a Queue (Queue is thread-safe).
    """
    while True:
        # readline() is a blocking operation.
        # decode to push a string in the queue instead of 8-bit bytes.
        log_line = stream.readline().decode("utf-8")
        line_queue.put(log_line)

        if len(log_line) == 0:
            # This is the end of the stream meaning the server process
            # has exited.
            stream.close()
            break
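# A sketch of wiring _log_queue_worker to a subprocess; the command
# ["my_server"] is hypothetical and not part of the original module. The
# worker thread drains stdout into the queue and closes the stream once
# readline() returns b"" (the process has exited).
def _demo_log_queue_worker() -> None:
    import subprocess
    import threading
    proc = subprocess.Popen(["my_server"], stdout=subprocess.PIPE)
    lines: queue.Queue = queue.Queue()
    threading.Thread(target=_log_queue_worker,
                     args=(proc.stdout, lines), daemon=True).start()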
def _scan_dynamodb_and_upload_to_s3(self, temp_file: IO, scan_kwargs: dict, table: Any) -> IO:
    while True:
        response = table.scan(**scan_kwargs)
        items = response['Items']
        for item in items:
            temp_file.write(self.process_func(item))

        if 'LastEvaluatedKey' not in response:
            # no more items to scan
            break

        last_evaluated_key = response['LastEvaluatedKey']
        scan_kwargs['ExclusiveStartKey'] = last_evaluated_key

        # Upload the file to S3 if reach file size limit
        if getsize(temp_file.name) >= self.file_size:
            _upload_file_to_s3(temp_file, self.s3_bucket_name, self.s3_key_prefix)
            temp_file.close()
            temp_file = NamedTemporaryFile()
    return temp_file
def _to_textio(fp: IO, mode: str, read_codec: str) -> TextIO:
    if 'b' in mode:
        fp = cast(TextIO, fp)  # TODO: Fix me
        fp.decoder = read_codec
        fp.native_reader = fp.read
        fp.read = lambda *args: _auto_decode(fp, *args)
    if getattr(fp, 'native_closer', None):
        fp.native_closer = fp.close
        fp.close = lambda *a: _wrapped_close(fp)
    return fp
def convert_to_zipfile_object(fileobj: IO):
    is_zipfile = zipfile.is_zipfile(fileobj)
    fileobj.seek(0)
    if is_zipfile:
        return fileobj

    if hasattr(fileobj, "name"):
        named_fileobj = fileobj
    else:
        named_fileobj = tempfile.NamedTemporaryFile(suffix='.zip')
        named_fileobj.write(fileobj.read())
        fileobj.close()
        named_fileobj.seek(0)

    tmp_file = tempfile.NamedTemporaryFile(suffix='.zip')
    with zipfile.ZipFile(tmp_file.name, 'w', compression=zipfile.ZIP_DEFLATED) as new_zip:
        new_zip.write(named_fileobj.name, arcname=Path(named_fileobj.name).name)
    tmp_file.seek(0)
    named_fileobj.close()
    return tmp_file
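# A brief usage sketch for convert_to_zipfile_object, assuming an uploaded
# payload saved at "upload.bin" (name invented for the example). The returned
# object is positioned at offset 0 and is a zip archive; note the helper may
# already have closed the original object.
def _demo_convert_to_zipfile_object() -> None:
    with open("upload.bin", "rb") as raw:
        zipped = convert_to_zipfile_object(raw)
        print(zipfile.is_zipfile(zipped))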
def write(self, evaluation_results: specification.EvaluationResults, buffer: typing.IO = None, **kwargs):
    if self.destination is None and buffer is None:
        raise ValueError("A buffer must be passed in if no destination is declared")

    converted_output = self._to_xarray(evaluation_results)
    responsible_for_buffer = buffer is None

    try:
        if responsible_for_buffer:
            buffer = open(self.destination, 'wb')
        raw_netcdf = converted_output.to_netcdf()
        buffer.write(raw_netcdf)
    finally:
        if responsible_for_buffer and buffer is not None and not buffer.closed:
            buffer.close()
def load(
        self,
        fp: IO = None,
        serialization: SerializationFormat = None
) -> None:
    if serialization is None:
        serialization = self._serialization_format
    if fp is None:
        close_fp_before_return = True
        try:
            fp = open(self._file)
        except (FileNotFoundError, TypeError) as e:
            raise ConfigFileException(e)
    else:
        close_fp_before_return = False
    s = fp.read()
    if close_fp_before_return:
        fp.close()
    self.loads(s, serialization)
    return
def svcOutputReader(self, out: IO, queue):
    # Important Reference: https://stackoverflow.com/questions/375427/non-blocking-read-on-a-subprocess-pipe-in-python
    # print("This is the svcOutput Reader...")
    # for line in out :
    for line in iter(out.readline, b''):
        # print("Finished reading a line: {}".format(line))
        # print("Adding item to queue...")
        try:
            line = line.decode("utf-8").rstrip()
        except UnicodeError:
            print("\nNon-UTF8 server output: {}\n".format(line))

        # This might block, and then causing "out" buffer to block
        queue.put(line)
        self._printProgress("_i")

        if self._status.isStarting():  # we are starting, let's see if we have started
            if line.find(self.TD_READY_MSG) != -1:  # found
                Logging.info("Waiting for the service to become FULLY READY")
                time.sleep(1.0)  # wait for the server to truly start. TODO: remove this
                Logging.info("Service is now FULLY READY")  # TODO: more ID info here?
                self._status.set(Status.STATUS_RUNNING)

        # Trim the queue if necessary: TODO: try this 1 out of 10 times
        self._trimQueue(self.MAX_QUEUE_SIZE * 9 // 10)  # trim to 90% size

        if self._status.isStopping():  # TODO: use thread status instead
            # WAITING for stopping sub process to finish its output
            print("_w", end="", flush=True)
        # queue.put(line)

    # meaning sub process must have died
    Logging.info("EOF for TDengine STDOUT: {}".format(self))
    out.close()
def _cleanup_stream(self, camera: Camera, server_socket: socket.socket, client: IO):
    if client:
        try:
            client.close()
        except Exception as e:
            self.logger.warning('Error on client socket close: {}'.format(str(e)))

    try:
        server_socket.close()
    except Exception as e:
        self.logger.warning('Error on server socket close: {}'.format(str(e)))

    if camera.stream:
        try:
            camera.stream.close()
        except Exception as e:
            self.logger.warning('Error while closing the encoding stream: {}'.format(str(e)))
def dump(
        self,
        fp: IO = None,
        serialization: SerializationFormat = None
) -> int:
    if serialization is None:
        serialization = self._serialization_format
    s = self.dumps(serialization)
    if fp is None:
        close_fp_before_return = True
        try:
            fp = open(self._file, 'w')
        except FileNotFoundError as e:
            raise ConfigFileException(e)
    else:
        close_fp_before_return = False
    ret = fp.write(s)
    if close_fp_before_return:
        fp.close()
    return ret
def embed_file(self,
               input_file: IO,
               output_file_path: str,
               output_format: str = "all",
               batch_size: int = DEFAULT_BATCH_SIZE,
               forget_sentences: bool = False,
               use_sentence_keys: bool = False) -> None:
    """
    Computes ELMo embeddings from an input_file where each line contains a sentence
    tokenized by whitespace. The ELMo embeddings are written out in HDF5 format, where
    each sentence embedding is saved in a dataset with the line number in the original
    file as the key.

    Parameters
    ----------
    input_file : ``IO``, required
        A file with one tokenized sentence per line.
    output_file_path : ``str``, required
        A path to the output hdf5 file.
    output_format : ``str``, optional, (default = "all")
        The embeddings to output. Must be one of "all", "top", or "average".
    batch_size : ``int``, optional, (default = 64)
        The number of sentences to process in ELMo at one time.
    forget_sentences : ``bool``, optional, (default = False).
        If use_sentence_keys is False, whether or not to include a string
        serialized JSON dictionary that associates sentences with their
        line number (its HDF5 key). The mapping is placed in the
        "sentence_to_index" HDF5 key. This is useful if you want to use the
        embeddings without keeping the original file of sentences around.
    use_sentence_keys : ``bool``, optional, (default = False).
        Whether or not to use full sentences as keys. By default,
        the line numbers of the input file are used as ids, which is more robust.
    """
    assert output_format in ["all", "top", "average"]

    # Tokenizes the sentences.
    sentences = [line.strip() for line in input_file]

    blank_lines = [i for (i, line) in enumerate(sentences) if line == ""]
    if blank_lines:
        raise ConfigurationError(f"Your input file contains empty lines at indexes "
                                 f"{blank_lines}. Please remove them.")
    split_sentences = [sentence.split() for sentence in sentences]

    # Uses the sentence index as the key.
    if use_sentence_keys:
        logger.warning("Using sentences as keys can fail if sentences "
                       "contain forward slashes or colons. Use with caution.")
        embedded_sentences = zip(sentences, self.embed_sentences(split_sentences, batch_size))
    else:
        embedded_sentences = ((str(i), x) for i, x in
                              enumerate(self.embed_sentences(split_sentences, batch_size)))

    sentence_to_index = {}
    logger.info("Processing sentences.")
    with h5py.File(output_file_path, 'w') as fout:
        for key, embeddings in Tqdm.tqdm(embedded_sentences):
            if use_sentence_keys and key in fout.keys():
                raise ConfigurationError(f"Key already exists in {output_file_path}. "
                                         f"To encode duplicate sentences, do not pass "
                                         f"the --use-sentence-keys flag.")

            if not forget_sentences and not use_sentence_keys:
                sentence = sentences[int(key)]
                sentence_to_index[sentence] = key

            if output_format == "all":
                output = embeddings
            elif output_format == "top":
                output = embeddings[-1]
            elif output_format == "average":
                output = numpy.average(embeddings, axis=0)

            fout.create_dataset(
                str(key),
                output.shape, dtype='float32',
                data=output
            )
        if not forget_sentences and not use_sentence_keys:
            sentence_index_dataset = fout.create_dataset(
                "sentence_to_index",
                (1,),
                dtype=h5py.special_dtype(vlen=str))
            sentence_index_dataset[0] = json.dumps(sentence_to_index)

    input_file.close()