def preview_bam(filename):
    """Return a short text preview of the header of a BAM file.

    The file is opened with ``pysam.AlignmentFile`` in ``'rb'`` mode and the
    header record types ``RG``, ``PG`` and ``SQ`` are listed in that order;
    types missing from the header are skipped.  For every listed type a
    ``TYPE:`` line is followed by one entry per record:

    * a string record is shown through ``short_repr``;
    * a mapping record shows its first four ``key: value`` pairs, followed by
      ``...`` when it has more.

    After the sixth record of a type an ellipsis line is appended and the
    remaining records of that type are not listed.

    Args:
        filename: Path of the BAM file to preview.

    Returns:
        The preview text (an empty string if none of the record types exist).
    """
    import pysam

    parts = []
    with pysam.AlignmentFile(filename, 'rb') as bam:
        headers = bam.header
        for record_type in ('RG', 'PG', 'SQ'):
            if record_type not in headers:
                continue
            parts.append(record_type + ':\n')
            for i, record in enumerate(headers[record_type]):
                # isinstance() rather than an exact type comparison so that
                # str/dict subclasses (e.g. OrderedDict) are previewed too.
                if isinstance(record, str):
                    parts.append(' ' + short_repr(record) + '\n')
                elif isinstance(record, dict):
                    parts.append(' ')
                    for idx, (k, v) in enumerate(record.items()):
                        if idx >= 4:
                            # only the first four fields of a record are shown
                            parts.append('...')
                            break
                        parts.append('{}: {} '.format(k, short_repr(v)))
                if i > 4:
                    # six records were listed; summarize the rest
                    parts.append('\n ...\n')
                    break
                parts.append('\n')
    return ''.join(parts)
def log(self, stage=None, msg=None):
    """Emit a debug message for one stage of the step being processed.

    Args:
        stage: ``'start'`` logs the step name and its comment (prefixed with
            ``Checking`` when ``self.run_mode`` is ``'dryrun'``, otherwise
            ``Executing``); ``'input'`` and ``'output'`` log the entry of the
            same name in ``env.sos_dict`` unless it is ``None``.  Any other
            value logs nothing.
        msg: Accepted for interface compatibility; not used here.
    """
    if stage == 'start':
        action = 'Executing'
        if self.run_mode == 'dryrun':
            action = 'Checking'
        message = '{} ``{}``: {}'.format(
            action, self.step.step_name(), self.step.comment.strip())
        env.logger.debug(message)
        return

    if stage in ('input', 'output'):
        value = env.sos_dict[stage]
        if value is None:
            return
        if stage == 'input':
            template = 'input: ``{}``'
        else:
            template = 'output: ``{}``'
        env.logger.debug(template.format(short_repr(value)))
def _R_repr(obj): if isinstance(obj, bool): return 'TRUE' if obj else 'FALSE' elif isinstance(obj, (int, float, str)): return repr(obj) elif isinstance(obj, Sequence): if len(obj) == 0: return 'c()' # if the data is of homogeneous type, let us use c() # otherwise use list() # this can be confusion but list can be difficult to handle if homogeneous_type(obj): return 'c(' + ','.join(_R_repr(x) for x in obj) + ')' else: return 'list(' + ','.join(_R_repr(x) for x in obj) + ')' elif obj is None: return 'NULL' elif isinstance(obj, dict): return 'list(' + ','.join('{}={}'.format(x, _R_repr(y)) for x,y in obj.items()) + ')' elif isinstance(obj, set): return 'list(' + ','.join(_R_repr(x) for x in obj) + ')' else: import numpy import pandas if isinstance(obj, (numpy.intc, numpy.intp, numpy.int8, numpy.int16, numpy.int32, numpy.int64,\ numpy.uint8, numpy.uint16, numpy.uint32, numpy.uint64, numpy.float16, numpy.float32, \ numpy.float64)): return repr(obj) elif isinstance(obj, numpy.matrixlib.defmatrix.matrix): try: import feather except ImportError: raise UsageError('The feather-format module is required to pass numpy matrix as R matrix' 'See https://github.com/wesm/feather/tree/master/python for details.') feather_tmp_ = tempfile.NamedTemporaryFile(suffix='.feather', delete=False).name feather.write_dataframe(pandas.DataFrame(obj).copy(), feather_tmp_) return 'data.matrix(read_feather("{}"))'.format(feather_tmp_) elif isinstance(obj, numpy.ndarray): return 'c(' + ','.join(_R_repr(x) for x in obj) + ')' elif isinstance(obj, pandas.DataFrame): try: import feather except ImportError: raise UsageError('The feather-format module is required to pass pandas DataFrame as R data.frame' 'See https://github.com/wesm/feather/tree/master/python for details.') feather_tmp_ = tempfile.NamedTemporaryFile(suffix='.feather', delete=False).name try: data = obj.copy() feather.write_dataframe(data, feather_tmp_) except: # if data cannot be written, we try to manipulate data # frame to have 
consistent types and try again for c in data.columns: if not homogeneous_type(data[c]): data[c] = [str(x) for x in data[c]] feather.write_dataframe(data, feather_tmp_) return 'read_feather("{}")'.format(feather_tmp_) else: return repr('Unsupported datatype {}'.format(short_repr(obj)))
def load_pickled(self, item):
    """Rebuild a Python object from the output of ``pickle.dumps``.

    Args:
        item: The pickled payload, either as ``bytes`` or as a ``str`` that
            is encoded to UTF-8 before unpickling.

    Returns:
        The unpickled object.  For any other input type a warning is sent
        through ``self.sos_kernel.warn`` and an empty dict is returned.
    """
    # NOTE(review): pickle.loads can run arbitrary code while loading;
    # presumably ``item`` only ever comes from a trusted kernel - confirm.
    payload = item.encode('utf-8') if isinstance(item, str) else item
    if isinstance(payload, bytes):
        return pickle.loads(payload)

    warning = 'Cannot restore from result of pickle.dumps: {}'.format(
        short_repr(item))
    self.sos_kernel.warn(warning)
    return {}
def get_vars(self, var_names):
    """
    Transfer a CAS object or a TypeSystem from the SoS (python) kernel to the
    IRuta kernel.

    This function is called when a user invokes the line magic %get or %with.

    The variable is looked up in ``env.sos_dict``, serialized with its own
    ``to_xmi`` / ``to_xml`` methods into a temporary directory, and loaded on
    the IRuta side by running ``%loadCas`` / ``%loadTypeSystem`` magics
    through ``self.ruta_kernel.run_cell``.  The temporary directory is
    removed afterwards, also when serialization or loading fails.

    Args:
        var_names: List holding exactly one variable name.

    Raises:
        Exception: if ``var_names`` does not hold exactly one name, or if the
            variable is neither a ``cassis.Cas`` nor a ``cassis.TypeSystem``.
        KeyError: presumably, when the name is missing from ``env.sos_dict``
            (depends on the type of that mapping).
    """
    if len(var_names) != 1:
        raise Exception(
            "%get takes exactly one variable name as argument. "
            "If you want to transfer multiple CAS, then please write them to a directory and use `%inputDir` in IRuta kernel."
        )
    var_name = var_names[0]
    var_content = env.sos_dict[var_name]

    # Validate before touching the file system so that an unsupported value
    # leaves no temporary files behind.
    is_cas = isinstance(var_content, cassis.Cas)
    if not is_cas and not isinstance(var_content, cassis.TypeSystem):
        raise Exception(
            '%get only support transfering UIMA CAS objects or TypeSystem objects. '
            'Use %expand for transfering string variables. Received datatype {}'
            .format(short_repr(var_content)))

    # The context manager removes the directory and its content even when
    # writing or loading raises.
    with tempfile.TemporaryDirectory() as temp_directory:
        # Step 1: Writing Cas and TypeSystem to disk using dkpro-cassis.
        # Forward slashes keep the paths usable inside the magic commands
        # on Windows as well.
        temp_typesystem_file_path = os.path.normpath(
            os.path.join(temp_directory, 'typesystem.xml')).replace('\\', "/")
        temp_xmi_file_path = os.path.normpath(
            os.path.join(temp_directory, 'cas.xmi')).replace('\\', "/")

        if is_cas:
            var_content.to_xmi(temp_xmi_file_path)
            var_content.typesystem.to_xml(temp_typesystem_file_path)
            cmd_transfer_var = "%displayMode NONE\n" \
                               f"%loadCas {temp_xmi_file_path}\n" \
                               f"%loadTypeSystem {temp_typesystem_file_path}"
        else:
            var_content.to_xml(temp_typesystem_file_path)
            cmd_transfer_var = "%displayMode NONE\n" \
                               f"%loadTypeSystem {temp_typesystem_file_path}"

        # Step 2: Loading files
        env.log_to_file('KERNEL', f'Executing "{cmd_transfer_var}"')
        self.ruta_kernel.run_cell(
            cmd_transfer_var,
            silent=True,
            store_history=False,
            on_error=f'Failed to get variable {var_name}')
    # Step 3: the temporary files are cleaned up on leaving the with block
def _Ruby_repr(self, obj): if isinstance(obj, bool): return 'true' if obj else 'false' elif isinstance(obj, float) and numpy.isnan(obj): return "Float::NAN" elif isinstance(obj, (int, float)): return repr(obj) elif isinstance(obj, str): return '%(' + obj + ')' elif isinstance(obj, complex): return 'Complex(' + str(obj.real) + ',' + str(obj.imag) + ')' elif isinstance(obj, range): return '(' + repr(min(obj)) + '...' + repr(max(obj)) + ')' elif isinstance(obj, Sequence): if len(obj) == 0: return '[]' else: return '[' + ','.join(self._Ruby_repr(x) for x in obj) + ']' elif obj is None: return 'nil' elif isinstance(obj, dict): return '{' + ','.join('"{}" => {}'.format(x, self._Ruby_repr(y)) for x, y in obj.items()) + '}' elif isinstance(obj, set): return 'Set[' + ','.join(self._Ruby_repr(x) for x in obj) + ']' else: if isinstance(obj, (numpy.intc, numpy.intp, numpy.int8, numpy.int16, numpy.int32, numpy.int64,\ numpy.uint8, numpy.uint16, numpy.uint32, numpy.uint64, numpy.float16, numpy.float32, numpy.float64)): return repr(obj) elif isinstance(obj, numpy.matrixlib.defmatrix.matrix): return 'N' + repr(obj.tolist()) elif isinstance(obj, numpy.ndarray): return repr(obj.tolist()) elif isinstance(obj, pandas.DataFrame): _beginning_result_string_dataframe_to_ruby = "Daru::DataFrame.new({" _context_string_dataframe_to_ruby = str([ '"' + str(x).replace("'", '"') + '"' + "=>" + "[" + str(",".join( list(map(lambda y: self._Ruby_repr(y), obj[x].tolist())))).replace("'", '"') + "]" for x in obj.keys().tolist() ])[2:-2].replace("\', \'", ", ") + "}," _indexing_result_string_dataframe_to_ruby = "index:" + str( obj.index.values.tolist()).replace("'", '"') + ")" _result_string_dataframe_to_ruby = _beginning_result_string_dataframe_to_ruby + _context_string_dataframe_to_ruby + _indexing_result_string_dataframe_to_ruby return _result_string_dataframe_to_ruby elif isinstance(obj, pandas.Series): dat = list(obj.values) ind = list(obj.index.values) ans = "{" + ",".join( [repr(x) + "=>" + 
repr(y) for x, y in zip(ind, dat)]) + "}" return ans else: return repr('Unsupported datatype {}'.format(short_repr(obj)))
def _R_repr(obj, processed=None): if isinstance(obj, bool): return 'TRUE' if obj else 'FALSE' elif isinstance(obj, (int, str)): return repr(obj) elif isinstance(obj, float): if numpy.isnan(obj): return 'NaN' else: return repr(obj) elif isinstance(obj, complex): return 'complex(real = ' + str(obj.real) + ', imaginary = ' + str(obj.imag) + ')' elif isinstance(obj, Sequence): if len(obj) == 0: return 'c()' # if the data is of homogeneous type, let us use c() # otherwise use list() # this can be confusion but list can be difficult to handle if homogeneous_type(obj): return 'c(' + ','.join(_R_repr(x) for x in obj) + ')' else: return 'list(' + ','.join(_R_repr(x) for x in obj) + ')' elif obj is None: return 'NULL' elif isinstance(obj, dict): if processed: if id(obj) in processed: return 'NULL' else: processed = set() processed.add(id(obj)) return 'list(' + ','.join('{}={}'.format(make_name(str(x)), _R_repr(y, processed)) for x, y in obj.items()) + ')' elif isinstance(obj, set): return 'list(' + ','.join(_R_repr(x) for x in obj) + ')' else: if isinstance(obj, (numpy.intc, numpy.intp, numpy.int8, numpy.int16, numpy.int32, numpy.int64, numpy.uint8, numpy.uint16, numpy.uint32, numpy.uint64, numpy.float16, numpy.float32, numpy.float64)): return repr(obj) elif isinstance(obj, numpy.matrixlib.defmatrix.matrix): try: import feather except ImportError: raise UsageError('The feather-format module is required to pass numpy matrix as R matrix' 'See https://github.com/wesm/feather/tree/master/python for details.') feather_tmp_ = tempfile.NamedTemporaryFile( suffix='.feather', delete=False).name feather.write_dataframe(pandas.DataFrame(obj).copy(), feather_tmp_) return 'data.matrix(..read.feather({!r}))'.format(feather_tmp_) elif isinstance(obj, numpy.ndarray): if obj.ndim == 1: return 'array(c(' + ','.join(_R_repr(x) for x in obj) + '))' else: return 'array(' + 'c(' + ','.join(repr(x) for x in obj.swapaxes(obj.ndim - 2, obj.ndim - 1).flatten(order='C')) + ')' + ', dim=(' + 'rev(c' + 
repr(obj.swapaxes(obj.ndim - 2, obj.ndim - 1).shape) + ')))' elif isinstance(obj, pandas.DataFrame): try: import feather except ImportError: raise UsageError('The feather-format module is required to pass pandas DataFrame as R data.frame' 'See https://github.com/wesm/feather/tree/master/python for details.') feather_tmp_ = tempfile.NamedTemporaryFile( suffix='.feather', delete=False).name try: data = obj.copy() # if the dataframe has index, it would not be transferred due to limitations # of feather. We will have to do something to save the index separately and # recreate it. (#397) if isinstance(data.index, pandas.Index): df_index = list(data.index) elif not isinstance(data.index, pandas.RangeIndex): # we should give a warning here df_index = None feather.write_dataframe(data, feather_tmp_) except Exception: # if data cannot be written, we try to manipulate data # frame to have consistent types and try again for c in data.columns: if not homogeneous_type(data[c]): data[c] = [str(x) for x in data[c]] feather.write_dataframe(data, feather_tmp_) # use {!r} for path because the string might contain c:\ which needs to be # double quoted. return '..read.feather({!r}, index={})'.format(feather_tmp_, _R_repr(df_index)) elif isinstance(obj, pandas.Series): dat = list(obj.values) ind = list(obj.index.values) return 'setNames(' + 'c(' + ','.join(_R_repr(x) for x in dat) + ')' + ',c(' + ','.join(_R_repr(y) for y in ind) + '))' else: return repr('Unsupported datatype {}'.format(short_repr(obj)))
def _R_repr(obj): if isinstance(obj, bool): return 'TRUE' if obj else 'FALSE' elif isinstance(obj, (int, float, str)): return repr(obj) elif isinstance(obj, Sequence): if len(obj) == 0: return 'c()' # if the data is of homogeneous type, let us use c() # otherwise use list() # this can be confusion but list can be difficult to handle if homogeneous_type(obj): return 'c(' + ','.join(_R_repr(x) for x in obj) + ')' else: return 'list(' + ','.join(_R_repr(x) for x in obj) + ')' elif obj is None: return 'NULL' elif isinstance(obj, dict): return 'list(' + ','.join('{}={}'.format(x, _R_repr(y)) for x, y in obj.items()) + ')' elif isinstance(obj, set): return 'list(' + ','.join(_R_repr(x) for x in obj) + ')' else: import numpy import pandas if isinstance(obj, (numpy.intc, numpy.intp, numpy.int8, numpy.int16, numpy.int32, numpy.int64,\ numpy.uint8, numpy.uint16, numpy.uint32, numpy.uint64, numpy.float16, numpy.float32, \ numpy.float64)): return repr(obj) elif isinstance(obj, numpy.matrixlib.defmatrix.matrix): try: import feather except ImportError: raise UsageError( 'The feather-format module is required to pass numpy matrix as R matrix' 'See https://github.com/wesm/feather/tree/master/python for details.' ) feather_tmp_ = tempfile.NamedTemporaryFile(suffix='.feather', delete=False).name feather.write_dataframe(pandas.DataFrame(obj).copy(), feather_tmp_) return 'data.matrix(read_feather("{}"))'.format(feather_tmp_) elif isinstance(obj, numpy.ndarray): return 'c(' + ','.join(_R_repr(x) for x in obj) + ')' elif isinstance(obj, pandas.DataFrame): try: import feather except ImportError: raise UsageError( 'The feather-format module is required to pass pandas DataFrame as R data.frame' 'See https://github.com/wesm/feather/tree/master/python for details.' 
) feather_tmp_ = tempfile.NamedTemporaryFile(suffix='.feather', delete=False).name try: data = obj.copy() feather.write_dataframe(data, feather_tmp_) except: # if data cannot be written, we try to manipulate data # frame to have consistent types and try again for c in data.columns: if not homogeneous_type(data[c]): data[c] = [str(x) for x in data[c]] feather.write_dataframe(data, feather_tmp_) return 'read_feather("{}")'.format(feather_tmp_) else: return repr('Unsupported datatype {}'.format(short_repr(obj)))
def _julia_repr(self, obj): if isinstance(obj, bool): return 'true' if obj else 'false' elif isinstance(obj, (int, float)): return repr(obj) elif isinstance(obj, str): # Not using repr() here becasue of the problem of qoutes in Julia. return '"""' + obj + '"""' elif isinstance(obj, complex): return 'complex(' + str(obj.real) + ',' + str(obj.imag) + ')' elif isinstance(obj, Sequence): if len(obj) == 0: return '[]' else: return '[' + ','.join(self._julia_repr(x) for x in obj) + ']' elif obj is None: return 'NaN' elif isinstance(obj, dict): return 'Dict(' + ','.join( '"{}" => {}'.format(x, self._julia_repr(y)) for x, y in obj.items()) + ')' elif isinstance(obj, set): return 'Set([' + ','.join(self._julia_repr(x) for x in obj) + '])' else: if isinstance(obj, (numpy.intc, numpy.intp, numpy.int8, numpy.int16, numpy.int32, numpy.int64,\ numpy.uint8, numpy.uint16, numpy.uint32, numpy.uint64, numpy.float16, numpy.float32)): return repr(obj) # need to specify Float64() as the return to Julia in order to avoid losing precision elif isinstance(obj, numpy.float64): return 'Float64(' + obj + ')' elif isinstance(obj, numpy.matrixlib.defmatrix.matrix): try: import feather except ImportError: raise UsageError( 'The feather-format module is required to pass numpy matrix as julia matrix(array)' 'See https://github.com/wesm/feather/tree/master/python for details.' ) feather_tmp_ = tempfile.NamedTemporaryFile(suffix='.feather', delete=False).name feather.write_dataframe( pandas.DataFrame(obj).copy(), feather_tmp_) return 'convert(Matrix, Feather.read("' + feather_tmp_ + '"))' elif isinstance(obj, numpy.ndarray): return '[' + ','.join(self._julia_repr(x) for x in obj) + ']' elif isinstance(obj, pandas.DataFrame): try: import feather except ImportError: raise UsageError( 'The feather-format module is required to pass pandas DataFrame as julia.DataFrames' 'See https://github.com/wesm/feather/tree/master/python for details.' 
) feather_tmp_ = tempfile.NamedTemporaryFile(suffix='.feather', delete=False).name try: data = obj.copy() # Julia DataFrame does not have index if not isinstance(data.index, pandas.RangeIndex): self.sos_kernel.warn( 'Raw index is ignored because Julia DataFrame does not support raw index.' ) feather.write_dataframe(data, feather_tmp_) except Exception: # if data cannot be written, we try to manipulate data # frame to have consistent types and try again for c in data.columns: if not homogeneous_type(data[c]): data[c] = [str(x) for x in data[c]] feather.write_dataframe(data, feather_tmp_) # use {!r} for path because the string might contain c:\ which needs to be # double quoted. return 'Feather.read("' + feather_tmp_ + '")' elif isinstance(obj, pandas.Series): dat = list(obj.values) ind = list(obj.index.values) ans = 'NamedArray(' + '[' + ','.join( self._julia_repr(x) for x in dat) + ']' + ',([' + ','.join( self._julia_repr(y) for y in ind) + '],))' return ans.replace("'", '"') else: return repr('Unsupported datatype {}'.format(short_repr(obj)))