def load(file):
    # Raises FileNotFoundError if `file` does not exist.
    log.debug(f'Loading {file} of size {os.path.getsize(file)}.')
    # The column 'name' may contain single-quoted strings.
    # See http://www.tptp.org/TPTP/SyntaxBNF.html
    # <fof_plain_term> ::= <functor> ::= <atomic_word> ::= <single_quoted> ::= <single_quote> ::: [']
    # We assume that there are no NAs in the symbols CSV table.
    # Note that for example in SWV478+2.p there is a symbol called 'null' that may collide with NA filtering
    # (its name being misinterpreted as a missing value).
    return pd.read_csv(file,
                       index_col=['isFunction', 'id'],
                       quotechar='\'',
                       escapechar='\\',
                       na_filter=False,
                       dtype={
                           'isFunction': bool,  # np.bool is deprecated; use the builtin
                           'id': pd.UInt32Dtype(),
                           'name': 'object',
                           'arity': pd.UInt32Dtype(),
                           'usageCnt': pd.UInt32Dtype(),
                           'unitUsageCnt': pd.UInt32Dtype(),
                           'inGoal': bool,
                           'inUnit': bool,
                           'skolem': bool,
                           'inductionSkolem': bool,
                           'interpreted': bool,
                           'introduced': bool,
                           'stringConstant': bool,
                           'numericConstant': bool,
                           'interpretedNumber': bool
                       })
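# Usage sketch for load() above (the path and variable names are hypothetical):
# the frame is indexed by the (isFunction, id) pair, so predicate and function
# symbols can be selected via the first index level.
symbols_df = load('out/symbols.csv')  # hypothetical path
predicate_symbols = symbols_df.xs(False, level='isFunction')
function_symbols = symbols_df.xs(True, level='isFunction')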
def fresh(cls, problems, clausifier, randomize=None, ucb_method='hoeffding',
          hoeffding_exponent=4, background='random',
          metric='saturation_iterations'):
    signature_sizes = get_signature_sizes(problems, clausifier)
    assert len(signature_sizes) == len(problems)
    # Filter out problems where signature size fetching fails.
    records = [{
        'problem': problems[i],
        'predicates': signature_sizes[i]['predicate'],
        'functions': signature_sizes[i]['function'],
        'attempts': 0,
        'hits': 0
    } for i in range(len(problems)) if signature_sizes[i] is not None]
    dtypes = {
        'problem': 'object',
        'predicates': pd.UInt32Dtype(),
        'functions': pd.UInt32Dtype(),
        'attempts': pd.UInt32Dtype(),
        'hits': pd.UInt32Dtype()
    }
    df = dataframe_from_records(records, index_keys='problem', dtypes=dtypes)
    return cls(df, randomize, ucb_method=ucb_method,
               hoeffding_exponent=hoeffding_exponent, background=background,
               metric=metric)
def results(self):
    """Calculate list of merges."""
    drop_list = []
    if self.alt_hash:
        for hash_val in self.alt_hash_dict:
            related_hashes = [hash_val] + [
                alt for alt in self.alt_hash_dict[hash_val]
                if alt in self.count_dict
            ]
            if len(related_hashes) == 1:
                continue
            related_hashes.sort()  # take the first in numerical order if all else is equal
            non_ambig_hashes = [
                h for h in related_hashes if self.ambig_dict[h] == 1
            ]
            if not non_ambig_hashes:  # no unambiguous candidate to merge into
                continue
            max_count_idx = np.argmax(
                [self.count_dict[h] for h in non_ambig_hashes])
            best_hash = non_ambig_hashes[max_count_idx]
            if best_hash != hash_val:
                drop_list.append(hash_val)
    del self.alt_hash_dict, self.count_dict, self.ambig_dict
    merge_frame = pd.DataFrame(
        {
            self.count_key: self.counts,
            self.ambig_key: self.ambig
        },
        index=self.values,
        dtype=pd.UInt32Dtype(),
    )
    merge_frame.drop(drop_list, inplace=True)
    merge_frame.sort_values(by=[self.ambig_key, self.count_key], inplace=True)
    unambig_frame = merge_frame[merge_frame[self.ambig_key] == 1].copy()
    n_unambig = len(unambig_frame)
    unambig_frame[self.ordinal_key] = pd.array(
        range(self.start_base, self.start_base + n_unambig),
        dtype=pd.UInt32Dtype(),
    )
    del unambig_frame[self.ambig_key]
    ambig_frame = merge_frame[merge_frame[self.ambig_key] > 1].copy()
    del merge_frame
    if self.ambig_count_key is None:  # don't pass counts along
        del ambig_frame[self.count_key]
    else:
        ambig_frame = ambig_frame.rename(
            columns={self.count_key: self.ambig_count_key})
    del ambig_frame[self.ambig_key]
    ambig_frame[self.ambig_ordinal_key] = pd.array(
        range(
            self.start_base + n_unambig,
            self.start_base + len(ambig_frame) + n_unambig,
        ),
        dtype=pd.UInt32Dtype(),
    )
    return unambig_frame, ambig_frame
def calculate_disambig_hashes(self, df):
    """Calculate disambiguation frame (per-fragment).

    If self.disambig_adj_only is True, disambiguation is done only for
    those locations adjacent to an unambiguous hash.
    """
    hash2_fr = df[["syn.anchor.id", "tmp.ambig.id"]].copy()
    hash2_fr = hash2_fr.rename(columns={"syn.anchor.id": "tmp.anchor.id"})
    hash2_fr["tmp.upstr_anchor"] = _fill_na_with_last_valid(
        df["syn.anchor.id"])
    hash2_fr["tmp.downstr_anchor"] = _fill_na_with_last_valid(
        df["syn.anchor.id"], flip=True)
    hash2_fr["tmp.upstr_occur"] = _cum_val_cnt_where_ser2_is_na(
        df["tmp.ambig.id"], df["syn.anchor.id"])
    hash2_fr["tmp.downstr_occur"] = _cum_val_cnt_where_ser2_is_na(
        df["tmp.ambig.id"], df["syn.anchor.id"], flip=True)
    hash2_fr["tmp.i"] = range(len(hash2_fr))
    upstream_hash = pd.array([pd.NA] * len(hash2_fr),
                             dtype=pd.UInt32Dtype())
    downstream_hash = pd.array([pd.NA] * len(hash2_fr),
                               dtype=pd.UInt32Dtype())
    hash2_fr["tmp.disambig.up"] = pd.NA
    hash2_fr["tmp.disambig.down"] = pd.NA
    for unused_id, row in hash2_fr.iterrows():
        row_no = row["tmp.i"]
        ambig_base = row["tmp.ambig.id"]
        upstream_unambig = row["tmp.upstr_anchor"]
        downstream_unambig = row["tmp.downstr_anchor"]
        occur_upstream = row["tmp.upstr_occur"]
        occur_downstream = row["tmp.downstr_occur"]
        if pd.notna(ambig_base):
            if pd.notna(upstream_unambig):
                if pd.isna(occur_upstream):
                    logger.warning(
                        f"Something is wrong upstream of base {ambig_base}")
                if self.disambig_adj_only and occur_upstream > 1:
                    continue
                upstream_hash[row_no] = hash_array(
                    np.array(
                        [upstream_unambig, ambig_base, occur_upstream]))
            if pd.notna(downstream_unambig):
                if pd.isna(occur_downstream):
                    logger.warning(
                        f"Something is wrong downstream of base {ambig_base}")
                if self.disambig_adj_only and occur_downstream > 1:
                    continue
                downstream_hash[row_no] = hash_array(
                    np.array(
                        [ambig_base, downstream_unambig, occur_downstream]))
    hash2_fr["tmp.disambig.up"] = upstream_hash
    hash2_fr["tmp.disambig.down"] = downstream_hash
    return hash2_fr[["tmp.disambig.up", "tmp.disambig.down"]]
def load_papers_df(path):
    """Load table listing papers.

    The table is in MAG format at `path`.

    Returns:
        1. a DataFrame of all paper IDs,
        2. a DataFrame of (paper ID, journal ID) where the journal ID exists,
        3. a DataFrame of (paper ID, conference series ID) where it exists.

    The papers are sorted by year. Within each year, they are sorted by
    rank.
    """
    df = pd.read_csv(path,
                     dialect=MAGDialect(),
                     engine='c',
                     usecols=[0, 1, 7, 11, 12],
                     names=['paper_id', 'rank', 'year', 'journal_id', 'cs_id'],
                     dtype={
                         'paper_id': np.uint32,
                         'rank': np.uint16,
                         'year': pd.UInt16Dtype(),
                         'journal_id': pd.UInt32Dtype(),
                         'cs_id': pd.UInt32Dtype()
                     },
                     keep_default_na=False,
                     na_values={
                         'year': [''],
                         'journal_id': [''],
                         'cs_id': ['']
                     })
    # Make separate tables for paper-journal/conference series mappings.
    paper_journals_df = df.loc[df['journal_id'].notna(),
                               ['paper_id', 'journal_id']]
    paper_journals_df.reset_index(drop=True, inplace=True)  # Save memory.
    paper_journals_df['journal_id'] \
        = paper_journals_df['journal_id'].astype(np.uint32)
    del df['journal_id']
    paper_cs_df = df.loc[df['cs_id'].notna(), ['paper_id', 'cs_id']]
    paper_cs_df.reset_index(drop=True, inplace=True)  # Save memory.
    paper_cs_df['cs_id'] = paper_cs_df['cs_id'].astype(np.uint32)
    del df['cs_id']
    df['year'].fillna(YEAR_SENTINEL, inplace=True)  # NaN -> sentinel.
    df['year'] = df['year'].astype(np.uint16)  # From masked type.
    # 'mergesort' is stable, unlike the other sorts. Remember that we
    # want to sort by year, then rank.
    df.sort_values('rank', inplace=True, ignore_index=True, kind='mergesort')
    del df['rank']
    df.sort_values('year', inplace=True, ignore_index=True, kind='mergesort')
    return df, paper_journals_df, paper_cs_df
def coerce_not_float_cols_nans(cls, self):
    """Coerce columns with floats and NaNs to the narrowest integer dtype.

    Boundary checks are inclusive of the dtype limits; the original
    hard-coded comparisons (e.g. `max < 255`) were off by one and excluded
    the largest representable value of each dtype.
    """
    cols = self.not_float_cols_nans
    for col in cols:
        col_min = self.df[col].min()  # avoid shadowing the builtins min/max
        col_max = self.df[col].max()
        if col_min >= 0:
            if col_max <= np.iinfo(np.uint8).max:  # 255
                self.df[col] = self.df[col].astype(pd.UInt8Dtype())
            elif col_max <= np.iinfo(np.uint16).max:  # 65535
                self.df[col] = self.df[col].astype(pd.UInt16Dtype())
            elif col_max <= np.iinfo(np.uint32).max:  # 4294967295
                self.df[col] = self.df[col].astype(pd.UInt32Dtype())
        else:
            if (col_min >= np.iinfo(np.int8).min
                    and col_max <= np.iinfo(np.int8).max):
                self.df[col] = self.df[col].astype(pd.Int8Dtype())
            elif (col_min >= np.iinfo(np.int16).min
                    and col_max <= np.iinfo(np.int16).max):
                self.df[col] = self.df[col].astype(pd.Int16Dtype())
            elif (col_min >= np.iinfo(np.int32).min
                    and col_max <= np.iinfo(np.int32).max):
                self.df[col] = self.df[col].astype(pd.Int32Dtype())
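# A related, minimal sketch (standalone, not part of the class above):
# pandas can do a similar narrowing itself via convert_dtypes() plus an
# explicit astype once the value range is known.
import numpy as np
import pandas as pd

s = pd.Series([0.0, 3.0, 70000.0, np.nan])  # float because of the NaN
nullable = s.convert_dtypes()               # integral floats + NaN -> Int64
if nullable.min() >= 0 and nullable.max() <= np.iinfo(np.uint32).max:
    nullable = nullable.astype(pd.UInt32Dtype())
print(nullable.dtype)  # UInt32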
def pyarrow2pandas_extension(  # pylint: disable=too-many-branches,too-many-return-statements
    dtype: pa.DataType,
) -> Optional[pd.api.extensions.ExtensionDtype]:
    """Pyarrow to Pandas data types conversion."""
    if pa.types.is_int8(dtype):
        return pd.Int8Dtype()
    if pa.types.is_int16(dtype):
        return pd.Int16Dtype()
    if pa.types.is_int32(dtype):
        return pd.Int32Dtype()
    if pa.types.is_int64(dtype):
        return pd.Int64Dtype()
    if pa.types.is_uint8(dtype):
        return pd.UInt8Dtype()
    if pa.types.is_uint16(dtype):
        return pd.UInt16Dtype()
    if pa.types.is_uint32(dtype):
        return pd.UInt32Dtype()
    if pa.types.is_uint64(dtype):
        return pd.UInt64Dtype()
    if pa.types.is_boolean(dtype):
        return pd.BooleanDtype()
    if pa.types.is_string(dtype):
        return pd.StringDtype()
    return None
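# Minimal sketch exercising pyarrow2pandas_extension (assumes pyarrow is
# installed): mapped arrow types resolve to nullable pandas extension dtypes,
# while unhandled types such as float64 fall through to None.
import pyarrow as pa
import pandas as pd

assert pyarrow2pandas_extension(pa.uint32()) == pd.UInt32Dtype()
assert pyarrow2pandas_extension(pa.string()) == pd.StringDtype()
assert pyarrow2pandas_extension(pa.float64()) is None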
def traiter_cantons(cantons_cog_path, dest):
    cantons = pd.read_csv(
        cantons_cog_path,
        dtype={
            "CAN": str,
            "DEP": str,
            "BURCENTRAL": str,
            "COMPCT": pd.UInt32Dtype()
        },
        usecols=[
            "CAN",
            "TYPECT",
            "COMPCT",
            "NCCENR",
            "TNCC",
            "DEP",
            "BURCENTRAL",
        ],
    )
    cantons.rename(
        columns={
            "CAN": "code",
            "TYPECT": "type",
            "COMPCT": "composition",
            "NCCENR": "nom",
            "TNCC": "type_nom",
            "DEP": "departement",
            "BURCENTRAL": "bureau_centralisateur",
        }).to_csv(dest, index=False)
def merge_disambig_hashes(
    args,
    unambig=None,
    ambig=None,
    hasher=None,
    mailboxes=None,
):
    """Merge disambiguated synteny hashes into proteomes per-proteome."""
    idx, dotpath = args
    plain_hash_name = hasher.hash_name(no_prefix=True)
    hash_name = "syn." + plain_hash_name
    outpath = dotpath_to_path(dotpath)
    syn = read_tsv_or_parquet(outpath / SYNTENY_FILE)
    syn = _join_on_col_with_na(syn, unambig, "tmp.disambig.up")
    syn = _join_on_col_with_na(syn, unambig, "tmp.disambig.down")
    for dup_col in [
            "tmp.disambig.anchor.count",
            "tmp.disambig.anchor.id",
    ]:
        xcol = dup_col + "_x"
        ycol = dup_col + "_y"
        syn[dup_col] = syn[xcol].fillna(syn[ycol])
        del syn[xcol], syn[ycol]
    syn["syn.anchor.id"] = syn["syn.anchor.id"].fillna(
        syn["tmp.disambig.anchor.id"])
    syn["syn.anchor.count"] = syn["syn.anchor.count"].fillna(
        syn["tmp.disambig.anchor.count"])
    syn["syn.code"] = _fill_col1_val_where_col2_notna(
        syn["syn.code"], syn["tmp.disambig.anchor.id"], DISAMBIGUATED_CODE)
    # Delete some no-longer-needed tmp columns
    non_needed_cols = [
        "tmp.disambig.anchor.count",
        "tmp.disambig.anchor.id",
        "tmp.disambig.up",
        "tmp.disambig.down",
    ]
    syn = syn.drop(columns=non_needed_cols)
    # Null out hashes that already have an anchor assigned; use .loc to
    # avoid the chained-assignment pitfall of syn[hash_name][mask] = ...
    syn.loc[syn["syn.anchor.id"].notna(), hash_name] = pd.NA
    write_tsv_or_parquet(syn, outpath / SYNTENY_FILE, remove_tmp=False)
    # Write out non-ambiguous hashes
    syn["tmp.self_count"] = pd.array(
        syn[hash_name].map(syn[hash_name].value_counts()),
        dtype=pd.UInt32Dtype(),
    )
    unique_hashes = (syn[[hash_name, "tmp.self_count"
                          ]].drop_duplicates(subset=[hash_name]).dropna(
                              how="any"))
    unique_hashes = unique_hashes.set_index(hash_name).sort_index()
    with mailboxes.locked_open_for_write(idx) as file_handle:
        unique_hashes.to_csv(file_handle, header=False, sep="\t")
    # logger.debug(f"{dotpath} has {syn['syn.anchor.id'].notna().sum()} assignments")
    return {
        "idx": idx,
        "path": dotpath,
        "syn.anchors.disambiguated": _count_code(syn["syn.code"],
                                                 DISAMBIGUATED_CODE),
    }
def load_authorships_df(path):
    """Load table of authorships and affiliations.

    The table is at `path` in the MAG format. The first three columns
    are used; these are the paper ID, author ID, and affiliation ID (or
    blank).

    Returns:
        1. a DataFrame of paper IDs and the corresponding author IDs,
        2. a DataFrame of paper IDs and the corresponding affiliation IDs.

    Duplicate entries are permitted. Null affiliations are not returned.
    """
    df = pd.read_csv(path,
                     dialect=MAGDialect(),
                     engine='c',
                     usecols=[0, 1, 2],
                     names=['paper_id', 'author_id', 'affiliation_id'],
                     dtype={
                         'paper_id': np.uint32,
                         'author_id': np.uint32,
                         'affiliation_id': pd.UInt32Dtype()
                     },
                     keep_default_na=False,
                     na_values={'affiliation_id': ['']})
    paper_affiliations_df = df.loc[df['affiliation_id'].notna(),
                                   ['paper_id', 'affiliation_id']]
    paper_affiliations_df.reset_index(drop=True, inplace=True)
    paper_affiliations_df['affiliation_id'] \
        = paper_affiliations_df['affiliation_id'].astype(np.uint32)
    del df['affiliation_id']
    return df, paper_affiliations_df
def integer_type_mapping(
        use_extension_types: bool) -> Mapping[IntegerType, DtypeObj]:
    if use_extension_types:
        return {
            IntegerType.INT8: pd.Int8Dtype(),
            IntegerType.UINT8: pd.UInt8Dtype(),
            IntegerType.INT16: pd.Int16Dtype(),
            IntegerType.UINT16: pd.UInt16Dtype(),
            # 24-bit integers are widened to 32-bit; both the signed and
            # the unsigned 24-bit range fit in Int32.
            IntegerType.INT24: pd.Int32Dtype(),
            IntegerType.UINT24: pd.Int32Dtype(),
            IntegerType.INT32: pd.Int32Dtype(),
            IntegerType.UINT32: pd.UInt32Dtype(),
            IntegerType.INT64: pd.Int64Dtype(),
            IntegerType.UINT64: pd.UInt64Dtype(),
        }
    else:
        return {
            IntegerType.INT8: np.int8,
            IntegerType.UINT8: np.uint8,
            IntegerType.INT16: np.int16,
            IntegerType.UINT16: np.uint16,
            IntegerType.INT24: np.int32,
            IntegerType.UINT24: np.uint32,
            IntegerType.INT32: np.int32,
            IntegerType.UINT32: np.uint32,
            IntegerType.INT64: np.int64,
            IntegerType.UINT64: np.uint64,
        }
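# Hedged usage sketch for integer_type_mapping (IntegerType is the enum
# referenced above): the same logical type resolves to a nullable extension
# dtype or a raw numpy dtype depending on the flag.
import numpy as np
import pandas as pd

ext = integer_type_mapping(use_extension_types=True)
raw = integer_type_mapping(use_extension_types=False)
assert ext[IntegerType.UINT32] == pd.UInt32Dtype()
assert raw[IntegerType.UINT32] is np.uint32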
def traiter_cantons(cantons_cog_path, dest):
    cantons = pd.read_csv(
        cantons_cog_path,
        dtype={
            "can": str,
            "dep": str,
            "burcentral": str,
            "compct": pd.UInt32Dtype()
        },
        usecols=[
            "can",
            "typect",
            "compct",
            "nccenr",
            "tncc",
            "dep",
            "burcentral",
        ],
    )
    # The commune of Azé (53014) is now a delegated commune of
    # Château-Gontier-sur-Mayenne.
    cantons.loc[cantons["burcentral"] == "53014", "burcentral"] = "53062"
    cantons.rename(
        columns={
            "can": "code",
            "typect": "type",
            "compct": "composition",
            "nccenr": "nom",
            "tncc": "type_nom",
            "dep": "departement",
            "burcentral": "bureau_centralisateur",
        }).to_csv(dest, index=False)
def calculate(self, cluster_series):
    """Return a frame of synteny block hash data."""
    # Maybe the best code I've ever written--JB
    vec = cluster_series.to_numpy().astype(int)
    if self.peatmer:
        uneq_idxs = np.append(np.where(vec[1:] != vec[:-1]), vec.size - 1)
        runlengths = np.diff(np.append(-1, uneq_idxs))
        positions = np.cumsum(np.append(0, runlengths))[:-1]
        n_mers = len(positions) - self.k + 1
        footprints = pd.array(
            [runlengths[i:i + self.k].sum() for i in range(n_mers)],
            dtype=pd.UInt32Dtype(),
        )
    else:
        n_elements = len(cluster_series)
        n_mers = n_elements - self.k + 1
        positions = np.arange(n_elements)
        footprints = pd.array([self.k] * n_mers, dtype=pd.UInt32Dtype())
    if n_mers < 1:
        return None
    # Calculate k-mers over indirect index
    kmer_mat = np.array(
        [vec[positions[i:i + self.k]] for i in range(n_mers)])
    fwd_rev_hashes = np.array([
        np.apply_along_axis(hash_array, 1, kmer_mat),
        np.apply_along_axis(hash_array, 1, np.flip(kmer_mat, axis=1)),
    ])
    plus_minus = np.array([["+"] * n_mers, ["-"] * n_mers])
    directions = np.take_along_axis(
        plus_minus,
        np.expand_dims(fwd_rev_hashes.argmin(axis=0), axis=0),
        axis=0,
    )[0]
    # Build the columns from a dict; passing a list of arrays would be
    # interpreted as rows and mismatch the column labels and index.
    return pd.DataFrame(
        {
            "syn.hash.direction": pd.Categorical(directions,
                                                 dtype=DIRECTIONAL_CATEGORY),
            "syn.hash.footprint": footprints,
            self.hash_name(): pd.array(np.amin(fwd_rev_hashes, axis=0),
                                       dtype=pd.UInt32Dtype()),
        },
        index=cluster_series.index[positions[:n_mers]],
    )
def test_reductions_2d_axis0(self, data, method, request):
    if not hasattr(data, method):
        pytest.skip("test is not applicable for this type/dtype")

    arr2d = data.reshape(1, -1)

    kwargs = {}
    if method == "std":
        # pass ddof=0 so we get all-zero std instead of all-NA std
        kwargs["ddof"] = 0

    try:
        result = getattr(arr2d, method)(axis=0, **kwargs)
    except Exception as err:
        try:
            getattr(data, method)()
        except Exception as err2:
            assert type(err) == type(err2)
            return
        else:
            raise AssertionError("Both reductions should raise or neither")

    if method in ["mean", "median", "sum", "prod"]:
        # std and var are not dtype-preserving
        expected = data
        if method in ["sum", "prod"] and data.dtype.kind in "iub":
            # FIXME: kludge
            if data.dtype.kind in ["i", "b"]:
                if is_platform_windows() or not IS64:
                    # FIXME: kludge for 32bit builds
                    if result.dtype.itemsize == 4:
                        dtype = pd.Int32Dtype()
                    else:
                        dtype = pd.Int64Dtype()
                else:
                    dtype = pd.Int64Dtype()
            elif data.dtype.kind == "u":
                if is_platform_windows() or not IS64:
                    # FIXME: kludge for 32bit builds
                    if result.dtype.itemsize == 4:
                        dtype = pd.UInt32Dtype()
                    else:
                        dtype = pd.UInt64Dtype()
                else:
                    dtype = pd.UInt64Dtype()

            expected = data.astype(dtype)
            if data.dtype.kind == "b" and method in ["sum", "prod"]:
                # We get IntegerArray instead of BooleanArray
                pass
            else:
                assert type(expected) == type(data), type(expected)
            assert dtype == expected.dtype

        self.assert_extension_array_equal(result, expected)
    elif method == "std":
        self.assert_extension_array_equal(result, data - data)
def test_arrow_from_arrow_uint():
    # https://github.com/pandas-dev/pandas/issues/31896
    # possible mismatch in types
    dtype = pd.UInt32Dtype()
    result = dtype.__from_arrow__(pa.array([1, 2, 3, 4, None], type="int64"))
    expected = pd.array([1, 2, 3, 4, None], dtype="UInt32")

    tm.assert_extension_array_equal(result, expected)
def test_intdtypes() -> None:
    pd.Int8Dtype()
    pd.Int16Dtype()
    pd.Int32Dtype()
    pd.Int64Dtype()
    pd.UInt8Dtype()
    pd.UInt16Dtype()
    pd.UInt32Dtype()
    pd.UInt64Dtype()
def __init__(self, pandas_obj):
    # validate and assign object
    self._validate(pandas_obj)
    self._obj = pandas_obj

    # define incorporated modules - columns consisting of others will not
    # have the dtype changed
    self._INCORPORATED_MODULES = ['builtins', 'numpy', 'pandas']

    # define a possible list of null values
    self._NULL_VALS = [
        None, np.nan, 'np.nan', 'nan', np.inf, 'np.inf', 'inf', -np.inf,
        '-np.inf', '', 'n/a', 'na', 'N/A', 'NA', 'unknown', 'unk',
        'UNKNOWN', 'UNK'
    ]

    # assign dtypes and limits
    # boolean
    BOOL_STRINGS_TRUE = ['t', 'true', 'yes', 'on']
    BOOL_STRINGS_FALSE = ['f', 'false', 'no', 'off']
    # dict.update() returns None, so build the map in one expression
    # instead of chaining .update() onto a literal
    self._BOOL_MAP_DICT = {
        **{i: True for i in BOOL_STRINGS_TRUE},
        **{i: False for i in BOOL_STRINGS_FALSE},
    }
    self._DTYPE_BOOL_BASE = bool  # np.bool is deprecated; use the builtin
    self._DTYPE_BOOL_NULLABLE = pd.BooleanDtype()
    # unsigned integers - base and nullable
    self._DTYPES_UINT_BASE = [np.uint8, np.uint16, np.uint32, np.uint64]
    self._DTYPES_UINT_NULLABLE = [
        pd.UInt8Dtype(),
        pd.UInt16Dtype(),
        pd.UInt32Dtype(),
        pd.UInt64Dtype()
    ]
    self._LIMIT_LOW_UINT = [
        np.iinfo(i).min for i in self._DTYPES_UINT_BASE
    ]
    self._LIMIT_HIGH_UINT = [
        np.iinfo(i).max for i in self._DTYPES_UINT_BASE
    ]
    # signed integers - base and nullable
    self._DTYPES_INT_BASE = [np.int8, np.int16, np.int32, np.int64]
    self._DTYPES_INT_NULLABLE = [
        pd.Int8Dtype(),
        pd.Int16Dtype(),
        pd.Int32Dtype(),
        pd.Int64Dtype()
    ]
    self._LIMIT_LOW_INT = [np.iinfo(i).min for i in self._DTYPES_INT_BASE]
    self._LIMIT_HIGH_INT = [np.iinfo(i).max for i in self._DTYPES_INT_BASE]
    # floats - nullable by default
    self._DTYPES_FLOAT = [np.float16, np.float32, np.float64]
    # datetime - nullable by default
    self._DTYPE_DATETIME = np.datetime64
    # string
    self._DTYPE_STRING = pd.StringDtype()
    # categorical - nullable by default
    self._DTYPE_CATEGORICAL = pd.CategoricalDtype()
class Result(process.Result):
    def __init__(self, process_result, result_symbols, clauses):
        log.debug(process_result)
        super().__init__(**process_result.__dict__)
        self.symbols = result_symbols
        self.clauses = clauses

    pd_dtypes = {
        **process.Result.pd_dtypes,
        'time_elapsed_vampire': float,
        'saturation_iterations': pd.UInt32Dtype(),
        'memory_used': pd.UInt32Dtype()
    }

    def symbols_of_type(self, symbol_type):
        return symbols.symbols_of_type(self.symbols, symbol_type)

    @property
    def saturation_iterations(self):
        try:
            return int(
                re.search(r'^% Main loop iterations started: (\d+)$',
                          self.stdout, re.MULTILINE)[1])
        except TypeError:
            return None

    @property
    def memory_used(self):
        try:
            return int(
                re.search(r'^% Memory used \[KB\]: (\d+)$', self.stdout,
                          re.MULTILINE)[1])
        except TypeError:
            return None

    @property
    def time_elapsed_vampire(self):
        try:
            return float(
                re.search(r'^% Time elapsed: (\d+\.\d+) s$', self.stdout,
                          re.MULTILINE)[1])
        except TypeError:
            return None
def read(
    self,
    path,
    columns=None,
    use_nullable_dtypes=False,
    storage_options: StorageOptions = None,
    **kwargs,
):
    kwargs["use_pandas_metadata"] = True
    to_pandas_kwargs = {}
    if use_nullable_dtypes:
        if LooseVersion(self.api.__version__) >= "0.16":
            import pandas as pd

            mapping = {
                self.api.int8(): pd.Int8Dtype(),
                self.api.int16(): pd.Int16Dtype(),
                self.api.int32(): pd.Int32Dtype(),
                self.api.int64(): pd.Int64Dtype(),
                self.api.uint8(): pd.UInt8Dtype(),
                self.api.uint16(): pd.UInt16Dtype(),
                self.api.uint32(): pd.UInt32Dtype(),
                self.api.uint64(): pd.UInt64Dtype(),
                self.api.bool_(): pd.BooleanDtype(),
                self.api.string(): pd.StringDtype(),
            }
            to_pandas_kwargs["types_mapper"] = mapping.get
        else:
            raise ValueError(
                "'use_nullable_dtypes=True' is only supported for "
                f"pyarrow >= 0.16 ({self.api.__version__} is installed)"
            )
    manager = get_option("mode.data_manager")
    if manager == "array":
        to_pandas_kwargs["split_blocks"] = True  # type: ignore[assignment]

    path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
        path,
        kwargs.pop("filesystem", None),
        storage_options=storage_options,
        mode="rb",
    )
    try:
        result = self.api.parquet.read_table(
            path_or_handle, columns=columns, **kwargs
        ).to_pandas(**to_pandas_kwargs)
        if manager == "array":
            result = result._as_manager("array", copy=False)
        return result
    finally:
        if handles is not None:
            handles.close()
def merge_unambig_hashes(
    args,
    unambig=None,
    ambig=None,
    hasher=None,
    mailboxes=None,
):
    """Merge unambiguous synteny hashes into proteomes per-proteome."""
    hash_name = hasher.hash_name()
    idx, dotpath = args
    outpath = dotpath_to_path(dotpath)
    syn = read_tsv_or_parquet(outpath / SYNTENY_FILE)
    syn = _join_on_col_with_na(syn, unambig, hash_name)
    syn = _join_on_col_with_na(syn, ambig, hash_name)
    syn["syn.code"] = pd.NA
    syn["syn.code"] = _fill_col1_val_where_col2_notna(
        syn["syn.code"], syn["syn.anchor.id"], UNAMBIGUOUS_CODE)
    # Calculate disambiguation hashes and write them out for merge
    disambig_frame_list = []
    for unused_frag, subframe in syn.groupby(by=["frag.id"]):
        disambig_frame_list.append(hasher.calculate_disambig_hashes(subframe))
    disambig_fr = pd.concat(
        [df for df in disambig_frame_list if df is not None])
    disambig_fr = disambig_fr.dropna(how="all")
    syn = syn.join(disambig_fr)
    write_tsv_or_parquet(syn, outpath / SYNTENY_FILE, remove_tmp=False)
    # Write out unified upstream/downstream hash values
    merged_hashes = pd.concat(
        [
            _rename_and_fill_alt(syn, "tmp.disambig.up", "tmp.disambig.down"),
            _rename_and_fill_alt(syn, "tmp.disambig.down", "tmp.disambig.up"),
        ],
        ignore_index=True,
    )
    merged_hashes["self_count"] = pd.array(
        merged_hashes["hash"].map(merged_hashes["hash"].value_counts()),
        dtype=pd.UInt32Dtype(),
    )
    merged_hashes = merged_hashes.reindex(
        columns=["hash", "self_count", "alt_hash"])
    unique_hashes = (merged_hashes.drop_duplicates(
        subset=["hash"]).set_index("hash").sort_index())
    del merged_hashes
    with mailboxes.locked_open_for_write(idx) as file_handle:
        unique_hashes.to_csv(file_handle, header=False, sep="\t")
    return {
        "idx": idx,
        "path": dotpath,
        "syn.anchors.unambiguous": _count_code(syn["syn.code"],
                                               UNAMBIGUOUS_CODE),
    }
def test_numeric_dtypes(self):
    dtypes = [
        bool,
        np.byte,
        np.ubyte,
        np.short,
        np.ushort,
        np.single,
        np.int32,
        np.intc,
        np.half,
        np.float16,
        np.double,
        np.float64,
        pd.StringDtype(),
        pd.Int64Dtype(),
        pd.UInt64Dtype(),
        pd.Int32Dtype(),
        pd.UInt32Dtype(),
        pd.Int16Dtype(),
        pd.UInt16Dtype(),
        pd.Int8Dtype(),
        pd.UInt8Dtype(),
    ]
    for suffix, fn in [
        (".snappy", "parquet"),
        (".feather", "feather"),
        (".xml", "xml"),
        (".csv", "csv"),
        (".tsv", "tsv"),
        (".json", "json"),
        (".xlsx", "xlsx"),
        (".xls", "xls"),
        (".xlsb", "xlsb"),
        (".ods", "ods"),
        (".pickle", "pickle"),
    ]:
        with tmpfile(suffix) as path:
            for dtype in dtypes:
                try:
                    df = Ind2Col2.convert(
                        Ind2Col2(sample_data_ind2_col2())).astype(dtype)
                    assert list(df.index.names) == ["qqq", "rrr"]
                    assert list(df.columns) == ["abc", "xyz"]
                    getattr(df, "to_" + fn)(path)
                    df2 = getattr(Ind2Col2, "read_" + fn)(path)
                    assert list(df2.index.names) == ["qqq", "rrr"]
                    assert list(df2.columns) == ["abc", "xyz"]
                except Exception:
                    logger.error(f"Failed on path {path}, dtype {dtype}")
                    raise
def read(
    self,
    path,
    columns=None,
    use_nullable_dtypes=False,
    storage_options: StorageOptions = None,
    **kwargs,
) -> DataFrame:
    kwargs["use_pandas_metadata"] = True

    to_pandas_kwargs = {}
    if use_nullable_dtypes:
        import pandas as pd

        mapping = {
            self.api.int8(): pd.Int8Dtype(),
            self.api.int16(): pd.Int16Dtype(),
            self.api.int32(): pd.Int32Dtype(),
            self.api.int64(): pd.Int64Dtype(),
            self.api.uint8(): pd.UInt8Dtype(),
            self.api.uint16(): pd.UInt16Dtype(),
            self.api.uint32(): pd.UInt32Dtype(),
            self.api.uint64(): pd.UInt64Dtype(),
            self.api.bool_(): pd.BooleanDtype(),
            self.api.string(): pd.StringDtype(),
            self.api.float32(): pd.Float32Dtype(),
            self.api.float64(): pd.Float64Dtype(),
        }
        to_pandas_kwargs["types_mapper"] = mapping.get
    manager = get_option("mode.data_manager")
    if manager == "array":
        to_pandas_kwargs["split_blocks"] = True  # type: ignore[assignment]

    path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
        path,
        kwargs.pop("filesystem", None),
        storage_options=storage_options,
        mode="rb",
    )
    try:
        result = self.api.parquet.read_table(
            path_or_handle, columns=columns,
            **kwargs).to_pandas(**to_pandas_kwargs)
        if manager == "array":
            result = result._as_manager("array", copy=False)
        return result
    finally:
        if handles is not None:
            handles.close()
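# Usage sketch from the caller's side (the file name is hypothetical): the
# public pandas entry point that reaches a read() method like the one above
# is read_parquet, where use_nullable_dtypes=True (pandas 1.2+) requests
# extension dtypes such as UInt32 instead of numpy dtypes with NaN.
import pandas as pd

df = pd.read_parquet("data.parquet", use_nullable_dtypes=True)  # hypothetical file
print(df.dtypes)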
def dtypes(self):
    return {
        'clausify_returncode': 'category',
        'num_clauses': pd.UInt32Dtype(),
        'num_predicate': pd.UInt32Dtype(),
        'num_function': pd.UInt32Dtype(),
        'graph_nodes': pd.UInt32Dtype(),
        'graph_nodes_lower_bound': pd.UInt32Dtype(),
        **{
            f'graph_nodes_{ntype}': pd.UInt32Dtype()
            for ntype in self.formula_visitor().ntypes()
        },
        'graph_edges': pd.UInt32Dtype()
    }
def _cum_val_count(arr):
    """Return an array of cumulative counts of values."""
    counts = {}
    out_arr = pd.array(
        [pd.NA] * len(arr),
        dtype=pd.UInt32Dtype(),
    )
    for i, val in enumerate(arr):
        if pd.isnull(val):
            continue
        elif val in counts:
            counts[val] += 1
        else:
            counts[val] = 1
        out_arr[i] = counts[val]
    return out_arr
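# Behavior sketch for _cum_val_count (input chosen for illustration): each
# non-null element maps to the number of times its value has occurred so far,
# while nulls stay <NA>.
import pandas as pd

print(_cum_val_count(["a", "b", "a", pd.NA, "a"]))
# -> IntegerArray [1, 1, 2, <NA>, 3], dtype UInt32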
def test_numeric_nullable_dtypes(self):
    dtypes = [
        pd.StringDtype(),
        pd.BooleanDtype(),
        pd.Float64Dtype(),
        pd.Float32Dtype(),
        pd.Int64Dtype(),
        pd.UInt64Dtype(),
        pd.Int32Dtype(),
        pd.UInt32Dtype(),
        pd.Int16Dtype(),
        pd.UInt16Dtype(),
        pd.Int8Dtype(),
        pd.UInt8Dtype(),
    ]
    # TODO: Re-add (".xml", "xml")
    # TODO: See https://github.com/dmyersturnbull/typed-dfs/issues/46
    for suffix, fn in [
        (".snappy", "parquet"),
        (".feather", "feather"),
        (".csv", "csv"),
        (".tsv", "tsv"),
        (".json", "json"),
        (".xlsx", "xlsx"),
        (".xls", "xls"),
        (".xlsb", "xlsb"),
        (".ods", "ods"),
        (".pickle", "pickle"),
    ]:
        for dtype in dtypes:
            with tmpfile(suffix) as path:
                try:
                    df = Ind2Col2.convert(
                        Ind2Col2(sample_data_ind2_col2_pd_na())).astype(dtype)
                    assert list(df.index.names) == ["qqq", "rrr"]
                    assert list(df.columns) == ["abc", "xyz"]
                    getattr(df, "to_" + fn)(path)
                    df2 = getattr(Ind2Col2, "read_" + fn)(path)
                    assert list(df2.index.names) == ["qqq", "rrr"]
                    assert list(df2.columns) == ["abc", "xyz"]
                except Exception:
                    logger.error(f"Failed on path {path}, dtype {dtype}")
                    raise
def test_to_pandas_dtype_integer_nullable():
    expectations = {
        (-100, 100): pd.Int8Dtype(),
        (0, 240): pd.UInt8Dtype(),
        (-10000, 10000): pd.Int16Dtype(),
        (500, 40000): pd.UInt16Dtype(),
        (-200000000, 200000000): pd.Int32Dtype(),
        (25, 4000000000): pd.UInt32Dtype(),
        (-9000000000000000000, 2000000000): pd.Int64Dtype(),
        (25, 10000000000000000000): pd.UInt64Dtype(),
        (25, 1000000000000000000000000000): np.float128,
        (None, None): pd.Int64Dtype(),
    }
    for (min_, max_), expected_pandas_type in expectations.items():
        constraints = RecordsSchemaFieldIntegerConstraints(required=True,
                                                           unique=None,
                                                           min_=min_,
                                                           max_=max_)
        yield (with_nullable(True, check_dtype), "integer", constraints,
               expected_pandas_type)
def calculate_synteny_hashes(args,
                             mailboxes=None,
                             hasher=None,
                             unambig=None,
                             ambig=None):
    """Calculate synteny hashes for proteins per-genome."""
    idx, dotpath = args
    outpath = dotpath_to_path(dotpath)
    hom = read_tsv_or_parquet(outpath / HOMOLOGY_FILE)
    hom["tmp.nan_group"] = (
        (hom["hom.cluster"].isnull()).astype(int).cumsum() +
        1) * (~hom["hom.cluster"].isnull())
    hom.replace(to_replace={"tmp.nan_group": 0}, value=pd.NA, inplace=True)
    hash_name = hasher.hash_name()
    syn_list = []
    if hasher.thorny:  # drop rows with unassigned homology clusters
        hom = hom[hom["hom.cluster"].notna()]
    for unused_id_tuple, subframe in hom.groupby(
            by=["frag.id", "tmp.nan_group"]):
        syn_list.append(hasher.calculate(subframe["hom.cluster"]))
    del hom["tmp.nan_group"]
    syn = hom.join(
        pd.concat([df for df in syn_list if df is not None], axis=0))
    del syn_list
    write_tsv_or_parquet(syn, outpath / SYNTENY_FILE, remove_tmp=False)
    syn["tmp.self_count"] = pd.array(
        syn[hash_name].map(syn[hash_name].value_counts()),
        dtype=pd.UInt32Dtype(),
    )
    unique_hashes = (syn[[hash_name, "tmp.self_count"
                          ]].drop_duplicates(subset=[hash_name]).dropna(
                              how="any"))
    unique_hashes = unique_hashes.set_index(hash_name).sort_index()
    with mailboxes.locked_open_for_write(idx) as file_handle:
        unique_hashes.to_csv(file_handle, header=False, sep="\t")
    return {
        "idx": idx,
        "path": dotpath,
        "hom.clusters": syn["hom.cluster"].notna().sum(),
        "syn.hashes.n": syn[hash_name].notna().sum(),
    }
def _fill_na_with_last_valid(ser, flip=False):
    """Given a series with NA values, return a series in which each NA
    position holds the last valid value before it (the next valid value
    after it if flip=True); positions that were valid come back as NA.
    """
    lv_arr = pd.array(
        [pd.NA] * len(ser),
        dtype=pd.UInt32Dtype(),
    )
    if not (ser.isnull().all() or ser.notna().all()):
        null_vec = ser.isnull().to_numpy()
        val_vec = ser.to_numpy()
        if flip:
            null_vec = np.flip(null_vec)
            val_vec = np.flip(val_vec)
        first_null_pos, null_runs = _true_positions_and_runs(null_vec)
        fill_vals = np.append(pd.NA, val_vec)[first_null_pos]
        for i, pos in enumerate(first_null_pos):
            for j in range(null_runs[i]):
                lv_arr[pos + j] = fill_vals[i]
        if flip:
            lv_arr = np.flip(lv_arr)
    lv_ser = pd.Series(lv_arr, index=ser.index)
    return lv_ser
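# Behavior sketch for _fill_na_with_last_valid (toy input): only the NA
# positions are filled, with the last valid value upstream (or downstream
# when flip=True); positions that were already valid come back as <NA>.
import pandas as pd

ser = pd.Series([1, pd.NA, pd.NA, 4, pd.NA], dtype=pd.UInt32Dtype())
print(_fill_na_with_last_valid(ser).tolist())             # [<NA>, 1, 1, <NA>, 4]
print(_fill_na_with_last_valid(ser, flip=True).tolist())  # [<NA>, 4, 4, <NA>, <NA>]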
def _cum_val_cnt_where_ser2_is_na(ser1, ser2, flip=False):
    """Return the cumulative value count of ser1 in regions where ser2 is NA."""
    if len(ser1) != len(ser2):
        logger.warning(f"Lengths of ser1 and ser2 differ at {ser1}")
    vc_arr = pd.array(
        [pd.NA] * len(ser1),
        dtype=pd.UInt32Dtype(),
    )
    if not (ser2.isnull().all() or ser2.notna().all()):
        null_vec = ser2.isnull().to_numpy()
        val_vec = ser1.to_numpy()
        if flip:
            null_vec = np.flip(null_vec)
            val_vec = np.flip(val_vec)
        null_pos, null_runs = _true_positions_and_runs(null_vec)
        null_len = len(null_pos)
        for i in range(null_len):
            vc_arr[null_pos[i]:(null_pos[i] + null_runs[i])] = _cum_val_count(
                val_vec[null_pos[i]:(null_pos[i] + null_runs[i])])
        if flip:
            vc_arr = np.flip(vc_arr)
    vc_ser = pd.Series(vc_arr, index=ser2.index)
    return vc_ser
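# Behavior sketch for _cum_val_cnt_where_ser2_is_na (toy series): counts
# accumulate only inside runs where ser2 is NA and reset between runs, since
# _cum_val_count is applied to each run separately.
import pandas as pd

ser1 = pd.Series(["x", "x", "x", "x", "x"])
ser2 = pd.Series([1, pd.NA, pd.NA, 2, pd.NA])
print(_cum_val_cnt_where_ser2_is_na(ser1, ser2).tolist())
# -> [<NA>, 1, 2, <NA>, 1]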
def calculate_adjacency_group(index_series, frag_series):
    """Calculate an adjacency group number."""
    index_fr = pd.DataFrame({"index": index_series, "fragment": frag_series})
    n_prot = len(index_fr)
    adj_gr_count = 0
    was_adj = False
    index_fr["i"] = range(n_prot)
    adj_group = np.array([np.nan] * n_prot)
    for unused_group, subframe in index_fr.groupby(by=["fragment"]):
        if len(subframe) == 1:
            continue
        last_pos = -2
        last_row = None
        if was_adj:
            adj_gr_count += 1
        was_adj = False
        for unused_i, row in subframe.iterrows():
            row_no = row["i"]
            if row["index"] == last_pos + 1:
                if not was_adj:
                    adj_group[last_row] = adj_gr_count
                    was_adj = True
                adj_group[row_no] = adj_gr_count
            else:
                if was_adj:
                    adj_gr_count += 1
                    was_adj = False
            last_pos = row["index"]
            last_row = row_no
    if was_adj:
        adj_gr_count += 1
    adj_arr = pd.Series(adj_group,
                        dtype=pd.UInt32Dtype(),
                        index=index_series.index)
    n_adj = n_prot - adj_arr.isnull().sum()
    return n_adj, adj_gr_count, adj_arr
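# Toy illustration of calculate_adjacency_group (values chosen for clarity):
# indexes 0,1,2 on fragment "A" are adjacent and form one group; index 7 on
# "A" is isolated and fragment "B" has a single protein, so both stay <NA>.
import pandas as pd

idx = pd.Series([0, 1, 2, 7, 0])
frag = pd.Series(["A", "A", "A", "A", "B"])
n_adj, n_groups, groups = calculate_adjacency_group(idx, frag)
print(n_adj, n_groups)  # 3 1
print(groups.tolist())  # [0, 0, 0, <NA>, <NA>]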