Example #1
def load(file):
    # Throws FileNotFoundError if `file` does not exist.
    log.debug(f'Loading {file} of size {os.path.getsize(file)}.')
    # The column 'name' may contain single quoted strings.
    # See http://www.tptp.org/TPTP/SyntaxBNF.html
    # <fof_plain_term> ::= <functor> ::= <atomic_word> ::= <single_quoted> ::= <single_quote> ::: [']
    # We assume that there are no NAs in the symbols CSV table.
    # Note that for example in SWV478+2.p there is a symbol called 'null' that may alias with the NA filtering
    # (its name being misinterpreted as a missing value).
    return pd.read_csv(file,
                       index_col=['isFunction', 'id'],
                       quotechar='\'',
                       escapechar='\\',
                       na_filter=False,
                       dtype={
                           'isFunction': bool,  # np.bool was removed in NumPy 1.24
                           'id': pd.UInt32Dtype(),
                           'name': 'object',
                           'arity': pd.UInt32Dtype(),
                           'usageCnt': pd.UInt32Dtype(),
                           'unitUsageCnt': pd.UInt32Dtype(),
                           'inGoal': bool,
                           'inUnit': bool,
                           'skolem': bool,
                           'inductionSkolem': bool,
                           'interpreted': bool,
                           'introduced': bool,
                           'stringConstant': bool,
                           'numericConstant': bool,
                           'interpretedNumber': bool
                       })
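
For context, `na_filter=False` is what keeps symbol names such as 'null' (see the SWV478+2.p note above) from being read as missing values. A minimal sketch with made-up data, not one of the real symbol tables:

import io
import pandas as pd

csv_text = "id,name\n0,null\n1,f"
# Default NA filtering silently turns the name 'null' into NaN.
print(pd.read_csv(io.StringIO(csv_text))["name"].tolist())                   # [nan, 'f']
print(pd.read_csv(io.StringIO(csv_text), na_filter=False)["name"].tolist())  # ['null', 'f']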
Example #2
 def fresh(cls,
           problems,
           clausifier,
           randomize=None,
           ucb_method='hoeffding',
           hoeffding_exponent=4,
           background='random',
           metric='saturation_iterations'):
     signature_sizes = get_signature_sizes(problems, clausifier)
     assert len(signature_sizes) == len(problems)
     # Filter out problems where signature size fetching fails.
     records = [{
         'problem': problems[i],
         'predicates': signature_sizes[i]['predicate'],
         'functions': signature_sizes[i]['function'],
         'attempts': 0,
         'hits': 0
     } for i in range(len(problems)) if signature_sizes[i] is not None]
     dtypes = {
         'problem': 'object',
         'predicates': pd.UInt32Dtype(),
         'functions': pd.UInt32Dtype(),
         'attempts': pd.UInt32Dtype(),
         'hits': pd.UInt32Dtype()
     }
     df = dataframe_from_records(records,
                                 index_keys='problem',
                                 dtypes=dtypes)
     return cls(df,
                randomize,
                ucb_method=ucb_method,
                hoeffding_exponent=hoeffding_exponent,
                background=background,
                metric=metric)
Example #3
 def results(self):
     """Calculate list of merges."""
     drop_list = []
     if self.alt_hash:
         for hash_val in self.alt_hash_dict:
             related_hashes = [hash_val] + [
                 alt for alt in self.alt_hash_dict[hash_val]
                 if alt in self.count_dict
             ]
             if len(related_hashes) == 1:
                 continue
             # Take the first in numerical order if all else is equal.
             related_hashes.sort()
             non_ambig_hashes = [
                 h for h in related_hashes if self.ambig_dict[h] == 1
             ]
             max_count_idx = np.argmax(
                 [self.count_dict[h] for h in non_ambig_hashes])
             best_hash = non_ambig_hashes[max_count_idx]
             if best_hash != hash_val:
                 drop_list.append(hash_val)
         del self.alt_hash_dict, self.count_dict, self.ambig_dict
     merge_frame = pd.DataFrame(
         {
             self.count_key: self.counts,
             self.ambig_key: self.ambig
         },
         index=self.values,
         dtype=pd.UInt32Dtype(),
     )
     merge_frame.drop(drop_list, inplace=True)
     merge_frame.sort_values(by=[self.ambig_key, self.count_key],
                             inplace=True)
     unambig_frame = merge_frame[merge_frame[self.ambig_key] == 1].copy()
     n_unambig = len(unambig_frame)
     unambig_frame[self.ordinal_key] = pd.array(
         range(self.start_base, self.start_base + n_unambig),
         dtype=pd.UInt32Dtype(),
     )
     del unambig_frame[self.ambig_key]
     ambig_frame = merge_frame[merge_frame[self.ambig_key] > 1].copy()
     del merge_frame
     if self.ambig_count_key is None:
         # Don't pass counts along
         del ambig_frame[self.count_key]
     else:
         ambig_frame = ambig_frame.rename(
             columns={self.count_key: self.ambig_count_key})
     del ambig_frame[self.ambig_key]
     ambig_frame[self.ambig_ordinal_key] = pd.array(
         range(
             self.start_base + n_unambig,
             self.start_base + len(ambig_frame) + n_unambig,
         ),
         dtype=pd.UInt32Dtype(),
     )
     return unambig_frame, ambig_frame
Example #4
    def calculate_disambig_hashes(self, df):
        """Calculate disambiguation frame (per-fragment).

        if self.disambig_adj_only is True, then disambiguation will be done
        only for those locations adjacent to an umabiguous hash.
        """
        hash2_fr = df[["syn.anchor.id", "tmp.ambig.id"]].copy()
        hash2_fr = hash2_fr.rename(columns={"syn.anchor.id": "tmp.anchor.id"})
        hash2_fr["tmp.upstr_anchor"] = _fill_na_with_last_valid(
            df["syn.anchor.id"])
        hash2_fr["tmp.downstr_anchor"] = _fill_na_with_last_valid(
            df["syn.anchor.id"], flip=True)
        hash2_fr["tmp.upstr_occur"] = _cum_val_cnt_where_ser2_is_na(
            df["tmp.ambig.id"], df["syn.anchor.id"])
        hash2_fr["tmp.downstr_occur"] = _cum_val_cnt_where_ser2_is_na(
            df["tmp.ambig.id"], df["syn.anchor.id"], flip=True)
        hash2_fr["tmp.i"] = range(len(hash2_fr))
        upstream_hash = pd.array([pd.NA] * len(hash2_fr),
                                 dtype=pd.UInt32Dtype())
        downstream_hash = pd.array([pd.NA] * len(hash2_fr),
                                   dtype=pd.UInt32Dtype())
        hash2_fr["tmp.disambig.up"] = pd.NA
        hash2_fr["tmp.disambig.down"] = pd.NA
        for unused_id, row in hash2_fr.iterrows():
            row_no = row["tmp.i"]
            ambig_base = row["tmp.ambig.id"]
            upstream_unambig = row["tmp.upstr_anchor"]
            downstream_unambig = row["tmp.downstr_anchor"]
            occur_upstream = row["tmp.upstr_occur"]
            occur_downstream = row["tmp.downstr_occur"]
            if pd.notna(ambig_base):
                if pd.notna(upstream_unambig):
                    if pd.isna(occur_upstream):
                        logger.warning(
                            f"Something is wrong upstream of base {ambig_base}"
                        )
                    if self.disambig_adj_only and occur_upstream > 1:
                        continue
                    upstream_hash[row_no] = hash_array(
                        np.array(
                            [upstream_unambig, ambig_base, occur_upstream]))
                if pd.notna(downstream_unambig):
                    if pd.isna(occur_downstream):
                        logger.warning(
                            f"Something is wrong downstream of base {ambig_base}"
                        )
                    if self.disambig_adj_only and occur_downstream > 1:
                        continue
                    downstream_hash[row_no] = hash_array(
                        np.array(
                            [ambig_base, downstream_unambig,
                             occur_downstream]))
        hash2_fr["tmp.disambig.up"] = upstream_hash
        hash2_fr["tmp.disambig.down"] = downstream_hash
        return hash2_fr[["tmp.disambig.up", "tmp.disambig.down"]]
Example #5
def load_papers_df(path):
    """Load table listing papers.

    The table is in MAG format at `path`.

    Returns:
    1. a DataFrame of all paper IDs,
    2. a DataFrame of paper ID and journal ID, where the journal ID exists,
    3. a DataFrame of paper ID and conference series ID, where it exists.

    The papers are sorted by year. Within each year, they are sorted by
    rank.
    """
    df = pd.read_csv(path,
                     dialect=MAGDialect(),
                     engine='c',
                     usecols=[0, 1, 7, 11, 12],
                     names=['paper_id', 'rank', 'year', 'journal_id', 'cs_id'],
                     dtype={
                         'paper_id': np.uint32,
                         'rank': np.uint16,
                         'year': pd.UInt16Dtype(),
                         'journal_id': pd.UInt32Dtype(),
                         'cs_id': pd.UInt32Dtype()
                     },
                     keep_default_na=False,
                     na_values={
                         'year': [''],
                         'journal_id': [''],
                         'cs_id': ['']
                     })

    # Make separate tables for paper-journal/conference series mappings.
    paper_journals_df = df.loc[df['journal_id'].notna(),
                               ['paper_id', 'journal_id']]
    paper_journals_df.reset_index(drop=True, inplace=True)  # Memory.
    paper_journals_df['journal_id'] \
        = paper_journals_df['journal_id'].astype(np.uint32)
    del df['journal_id']
    paper_cs_df = df.loc[df['cs_id'].notna(), ['paper_id', 'cs_id']]
    paper_cs_df.reset_index(drop=True, inplace=True)  # Save memory.
    paper_cs_df['cs_id'] = paper_cs_df['cs_id'].astype(np.uint32)
    del df['cs_id']

    df['year'] = df['year'].fillna(YEAR_SENTINEL)  # NaN -> sentinel.
    df['year'] = df['year'].astype(np.uint16)  # From masked type.
    # 'mergesort' is stable, unlike the other sorts. Remember that we
    # want to sort by year, then rank.
    df.sort_values('rank', inplace=True, ignore_index=True, kind='mergesort')
    del df['rank']
    df.sort_values('year', inplace=True, ignore_index=True, kind='mergesort')

    return df, paper_journals_df, paper_cs_df
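
As an aside, the two stable single-key sorts are equivalent to one lexicographic sort on (year, rank); two passes are presumably kept so `rank` can be dropped before the second, larger sort. A toy check with made-up data, not MAG input:

import pandas as pd

toy = pd.DataFrame({'paper_id': [1, 2, 3, 4],
                    'rank': [2, 1, 2, 1],
                    'year': [2001, 2000, 2000, 2001]})
two_pass = (toy.sort_values('rank', kind='mergesort', ignore_index=True)
               .sort_values('year', kind='mergesort', ignore_index=True))
one_pass = toy.sort_values(['year', 'rank'], ignore_index=True)
assert two_pass.equals(one_pass)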
Example #6
    def coerce_not_float_cols_nans(cls, self):
        """Coerce cols with floats and nans to the correct integer dtype."""
        cols = self.not_float_cols_nans

        # Exclusive upper bounds for the signed types (2**7, 2**15, 2**31),
        # so the full representable range of each dtype is accepted.
        int8_val = 128
        int16_val = 32768
        int32_val = 2147483648

        for col in cols:
            col_min = self.df[col].min()
            col_max = self.df[col].max()
            if col_min >= 0:
                if col_max <= 255:
                    self.df[col] = self.df[col].astype(pd.UInt8Dtype())
                elif col_max <= 65535:
                    self.df[col] = self.df[col].astype(pd.UInt16Dtype())
                elif col_max <= 4294967295:
                    self.df[col] = self.df[col].astype(pd.UInt32Dtype())
            else:
                if col_min >= -int8_val and col_max < int8_val:
                    self.df[col] = self.df[col].astype(pd.Int8Dtype())
                elif col_min >= -int16_val and col_max < int16_val:
                    self.df[col] = self.df[col].astype(pd.Int16Dtype())
                elif col_min >= -int32_val and col_max < int32_val:
                    self.df[col] = self.df[col].astype(pd.Int32Dtype())
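
A toy illustration (hypothetical column) of the cast this method performs: a float column holding only small non-negative integers and NaN converts losslessly to a nullable unsigned dtype, with NaN becoming <NA>.

import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan, 250.0])        # float64 because of the NaN
print(s.astype(pd.UInt8Dtype()).tolist())  # [1, <NA>, 250]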
Example #7
def pyarrow2pandas_extension(  # pylint: disable=too-many-branches,too-many-return-statements
    dtype: pa.DataType,
) -> Optional[pd.api.extensions.ExtensionDtype]:
    """Pyarrow to Pandas data types conversion."""
    if pa.types.is_int8(dtype):
        return pd.Int8Dtype()
    if pa.types.is_int16(dtype):
        return pd.Int16Dtype()
    if pa.types.is_int32(dtype):
        return pd.Int32Dtype()
    if pa.types.is_int64(dtype):
        return pd.Int64Dtype()
    if pa.types.is_uint8(dtype):
        return pd.UInt8Dtype()
    if pa.types.is_uint16(dtype):
        return pd.UInt16Dtype()
    if pa.types.is_uint32(dtype):
        return pd.UInt32Dtype()
    if pa.types.is_uint64(dtype):
        return pd.UInt64Dtype()
    if pa.types.is_boolean(dtype):
        return pd.BooleanDtype()
    if pa.types.is_string(dtype):
        return pd.StringDtype()
    return None
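
An illustrative call (made-up schema): map an Arrow schema to nullable pandas dtypes, leaving None for types without an extension equivalent.

import pyarrow as pa

schema = pa.schema([("id", pa.uint32()), ("name", pa.string()),
                    ("score", pa.float64())])
dtypes = {field.name: pyarrow2pandas_extension(field.type) for field in schema}
# e.g. {'id': UInt32Dtype(), 'name': string dtype, 'score': None}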
Example #8
def traiter_cantons(cantons_cog_path, dest):
    cantons = pd.read_csv(
        cantons_cog_path,
        dtype={
            "CAN": str,
            "DEP": str,
            "BURCENTRAL": str,
            "COMPCT": pd.UInt32Dtype()
        },
        usecols=[
            "CAN",
            "TYPECT",
            "COMPCT",
            "NCCENR",
            "TNCC",
            "DEP",
            "BURCENTRAL",
        ],
    )

    cantons.rename(
        columns={
            "CAN": "code",
            "TYPECT": "type",
            "COMPCT": "composition",
            "NCCENR": "nom",
            "TNCC": "type_nom",
            "DEP": "departement",
            "BURCENTRAL": "bureau_centralisateur",
        }).to_csv(dest, index=False)
Example #9
def merge_disambig_hashes(
    args,
    unambig=None,
    ambig=None,
    hasher=None,
    mailboxes=None,
):
    """Merge disambiguated synteny hashes into proteomes per-proteome."""
    idx, dotpath = args
    plain_hash_name = hasher.hash_name(no_prefix=True)
    hash_name = "syn." + plain_hash_name
    outpath = dotpath_to_path(dotpath)
    syn = read_tsv_or_parquet(outpath / SYNTENY_FILE)
    syn = _join_on_col_with_na(syn, unambig, "tmp.disambig.up")
    syn = _join_on_col_with_na(syn, unambig, "tmp.disambig.down")
    for dup_col in [
            "tmp.disambig.anchor.count",
            "tmp.disambig.anchor.id",
    ]:
        xcol = dup_col + "_x"
        ycol = dup_col + "_y"
        syn[dup_col] = syn[xcol].fillna(syn[ycol])
        del syn[xcol], syn[ycol]
    syn["syn.anchor.id"] = syn["syn.anchor.id"].fillna(
        syn["tmp.disambig.anchor.id"])
    syn["syn.anchor.count"] = syn["syn.anchor.count"].fillna(
        syn["tmp.disambig.anchor.count"])
    syn["syn.code"] = _fill_col1_val_where_col2_notna(
        syn["syn.code"], syn["tmp.disambig.anchor.id"], DISAMBIGUATED_CODE)
    # Delete some non-needed tmp columns
    non_needed_cols = [
        "tmp.disambig.anchor.count",
        "tmp.disambig.anchor.id",
        "tmp.disambig.up",
        "tmp.disambig.down",
    ]
    syn = syn.drop(columns=non_needed_cols)
    # Null out hashes for rows whose anchors are already assigned
    syn.loc[syn["syn.anchor.id"].notna(), hash_name] = pd.NA
    write_tsv_or_parquet(syn, outpath / SYNTENY_FILE, remove_tmp=False)
    # Write out non-ambiguous hashes
    syn["tmp.self_count"] = pd.array(
        syn[hash_name].map(syn[hash_name].value_counts()),
        dtype=pd.UInt32Dtype(),
    )
    unique_hashes = (syn[[
        hash_name, "tmp.self_count"
    ]].drop_duplicates(subset=[hash_name]).dropna(how="any"))
    unique_hashes = unique_hashes.set_index(hash_name).sort_index()
    with mailboxes.locked_open_for_write(idx) as file_handle:
        unique_hashes.to_csv(file_handle, header=False, sep="\t")
    # logger.debug(f"{dotpath} has {syn['syn.anchor.id'].notna().sum()} assignments")
    return {
        "idx": idx,
        "path": dotpath,
        "syn.anchors.disambiguated": _count_code(syn["syn.code"],
                                                 DISAMBIGUATED_CODE),
    }
Example #10
def load_authorships_df(path):
    """Load table of authorships and affiliations.

    The table is at `path` in the MAG format. The first three columns
    are used; these are the paper ID, author ID, and affiliation ID (or
    blank).

    Returns:
    1. a DataFrame of paper IDs and the corresponding author IDs,
    2. a DataFrame of paper IDs and the corresponding affiliation IDs.

    Duplicate entries are permitted. Null affiliations are not returned.
    """
    df = pd.read_csv(path,
                     dialect=MAGDialect(),
                     engine='c',
                     usecols=[0, 1, 2],
                     names=['paper_id', 'author_id', 'affiliation_id'],
                     dtype={
                         'paper_id': np.uint32,
                         'author_id': np.uint32,
                         'affiliation_id': pd.UInt32Dtype()
                     },
                     keep_default_na=False,
                     na_values={'affiliation_id': ['']})

    paper_affiliations_df = df.loc[df['affiliation_id'].notna(),
                                   ['paper_id', 'affiliation_id']]
    paper_affiliations_df.reset_index(drop=True, inplace=True)
    paper_affiliations_df['affiliation_id'] \
        = paper_affiliations_df['affiliation_id'].astype(np.uint32)
    del df['affiliation_id']
    return df, paper_affiliations_df
Example #11
def integer_type_mapping(
        use_extension_types: bool) -> Mapping[IntegerType, DtypeObj]:
    if use_extension_types:
        return {
            IntegerType.INT8: pd.Int8Dtype(),
            IntegerType.UINT8: pd.UInt8Dtype(),
            IntegerType.INT16: pd.Int16Dtype(),
            IntegerType.UINT16: pd.UInt16Dtype(),
            IntegerType.INT24: pd.Int32Dtype(),
            IntegerType.UINT24: pd.Int32Dtype(),
            IntegerType.INT32: pd.Int32Dtype(),
            IntegerType.UINT32: pd.UInt32Dtype(),
            IntegerType.INT64: pd.Int64Dtype(),
            IntegerType.UINT64: pd.UInt64Dtype(),
        }
    else:
        return {
            IntegerType.INT8: np.int8,
            IntegerType.UINT8: np.uint8,
            IntegerType.INT16: np.int16,
            IntegerType.UINT16: np.uint16,
            IntegerType.INT24: np.int32,
            IntegerType.UINT24: np.uint32,
            IntegerType.INT32: np.int32,
            IntegerType.UINT32: np.uint32,
            IntegerType.INT64: np.int64,
            IntegerType.UINT64: np.uint64,
        }
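
An illustrative lookup against the mapping above (assuming the `IntegerType` enum from this module is in scope):

print(integer_type_mapping(use_extension_types=True)[IntegerType.UINT32])   # UInt32
print(integer_type_mapping(use_extension_types=False)[IntegerType.UINT32])  # <class 'numpy.uint32'>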
Example #12
File: cog.py, Project: aktiur/data-france
def traiter_cantons(cantons_cog_path, dest):
    cantons = pd.read_csv(
        cantons_cog_path,
        dtype={
            "can": str,
            "dep": str,
            "burcentral": str,
            "compct": pd.UInt32Dtype()
        },
        usecols=[
            "can",
            "typect",
            "compct",
            "nccenr",
            "tncc",
            "dep",
            "burcentral",
        ],
    )

    # The commune of Azé (53014) is now a delegated commune of Château-Gontier-sur-Mayenne
    cantons.loc[cantons["burcentral"] == "53014", "burcentral"] = "53062"

    cantons.rename(
        columns={
            "can": "code",
            "typect": "type",
            "compct": "composition",
            "nccenr": "nom",
            "tncc": "type_nom",
            "dep": "departement",
            "burcentral": "bureau_centralisateur",
        }).to_csv(dest, index=False)
Example #13
 def calculate(self, cluster_series):
     """Return an array of synteny block hashes data."""
     # Maybe the best code I've ever written--JB
     vec = cluster_series.to_numpy().astype(int)
     if self.peatmer:
         uneq_idxs = np.append(np.where(vec[1:] != vec[:-1]), vec.size - 1)
         runlengths = np.diff(np.append(-1, uneq_idxs))
         positions = np.cumsum(np.append(0, runlengths))[:-1]
         n_mers = len(positions) - self.k + 1
         footprints = pd.array(
             [runlengths[i:i + self.k].sum() for i in range(n_mers)],
             dtype=pd.UInt32Dtype(),
         )
     else:
         n_elements = len(cluster_series)
         n_mers = n_elements - self.k + 1
         positions = np.arange(n_elements)
         footprints = pd.array([self.k] * n_mers, dtype=pd.UInt32Dtype())
     if n_mers < 1:
         return None
     # Calculate k-mers over indirect index
     kmer_mat = np.array(
         [vec[positions[i:i + self.k]] for i in range(n_mers)])
     fwd_rev_hashes = np.array([
         np.apply_along_axis(hash_array, 1, kmer_mat),
         np.apply_along_axis(hash_array, 1, np.flip(kmer_mat, axis=1)),
     ])
     plus_minus = np.array([["+"] * n_mers, ["-"] * n_mers])
     directions = np.take_along_axis(
         plus_minus,
         np.expand_dims(fwd_rev_hashes.argmin(axis=0), axis=0),
         axis=0,
     )[0]
     # Build the columns explicitly as a dict so each array becomes a
     # column keyed by its name (a plain list would be treated as rows).
     return pd.DataFrame(
         {
             "syn.hash.direction": pd.Categorical(directions,
                                                  dtype=DIRECTIONAL_CATEGORY),
             "syn.hash.footprint": footprints,
             self.hash_name(): pd.array(np.amin(fwd_rev_hashes, axis=0),
                                        dtype=pd.UInt32Dtype()),
         },
         index=cluster_series.index[positions[:n_mers]],
     )
Example #14
File: dim2.py, Project: Varun270/pandas
    def test_reductions_2d_axis0(self, data, method, request):
        if not hasattr(data, method):
            pytest.skip("test is not applicable for this type/dtype")

        arr2d = data.reshape(1, -1)

        kwargs = {}
        if method == "std":
            # pass ddof=0 so we get all-zero std instead of all-NA std
            kwargs["ddof"] = 0

        try:
            result = getattr(arr2d, method)(axis=0, **kwargs)
        except Exception as err:
            try:
                getattr(data, method)()
            except Exception as err2:
                assert type(err) == type(err2)
                return
            else:
                raise AssertionError("Both reductions should raise or neither")

        if method in ["mean", "median", "sum", "prod"]:
            # std and var are not dtype-preserving
            expected = data
            if method in ["sum", "prod"] and data.dtype.kind in "iub":
                # FIXME: kludge
                if data.dtype.kind in ["i", "b"]:
                    if is_platform_windows() or not IS64:
                        # FIXME: kludge for 32bit builds
                        if result.dtype.itemsize == 4:
                            dtype = pd.Int32Dtype()
                        else:
                            dtype = pd.Int64Dtype()
                    else:
                        dtype = pd.Int64Dtype()
                elif data.dtype.kind == "u":
                    if is_platform_windows() or not IS64:
                        # FIXME: kludge for 32bit builds
                        if result.dtype.itemsize == 4:
                            dtype = pd.UInt32Dtype()
                        else:
                            dtype = pd.UInt64Dtype()
                    else:
                        dtype = pd.UInt64Dtype()

                expected = data.astype(dtype)
                if data.dtype.kind == "b" and method in ["sum", "prod"]:
                    # We get IntegerArray instead of BooleanArray
                    pass
                else:
                    assert type(expected) == type(data), type(expected)
                assert dtype == expected.dtype

            self.assert_extension_array_equal(result, expected)
        elif method == "std":
            self.assert_extension_array_equal(result, data - data)
Example #15
def test_arrow_from_arrow_uint():
    # https://github.com/pandas-dev/pandas/issues/31896
    # possible mismatch in types

    dtype = pd.UInt32Dtype()
    result = dtype.__from_arrow__(pa.array([1, 2, 3, 4, None], type="int64"))
    expected = pd.array([1, 2, 3, 4, None], dtype="UInt32")

    tm.assert_extension_array_equal(result, expected)
Example #16
def test_intdtypes() -> None:
    pd.Int8Dtype()
    pd.Int16Dtype()
    pd.Int32Dtype()
    pd.Int64Dtype()
    pd.UInt8Dtype()
    pd.UInt16Dtype()
    pd.UInt32Dtype()
    pd.UInt64Dtype()
Example #17
    def __init__(self, pandas_obj):
        # validate and assign object
        self._validate(pandas_obj)
        self._obj = pandas_obj

        # define incorporated modules - columns consisting of others will not have the dtype changed
        self._INCORPORATED_MODULES = ['builtins', 'numpy', 'pandas']

        # define a possible list of null values
        self._NULL_VALS = [
            None, np.nan, 'np.nan', 'nan', np.inf, 'np.inf', 'inf', -np.inf,
            '-np.inf', '', 'n/a', 'na', 'N/A', 'NA', 'unknown', 'unk',
            'UNKNOWN', 'UNK'
        ]

        # assign dtypes and limits
        # boolean
        BOOL_STRINGS_TRUE = ['t', 'true', 'yes', 'on']
        BOOL_STRINGS_FALSE = ['f', 'false', 'no', 'off']
        # Build one lookup dict; dict.update() returns None, so it cannot be
        # chained onto a dict literal.
        self._BOOL_MAP_DICT = {
            **{i: True for i in BOOL_STRINGS_TRUE},
            **{i: False for i in BOOL_STRINGS_FALSE},
        }
        self._DTYPE_BOOL_BASE = bool  # np.bool was removed in NumPy 1.24
        self._DTYPE_BOOL_NULLABLE = pd.BooleanDtype()
        # unsigned integers - base and nullable
        self._DTYPES_UINT_BASE = [np.uint8, np.uint16, np.uint32, np.uint64]
        self._DTYPES_UINT_NULLABLE = [
            pd.UInt8Dtype(),
            pd.UInt16Dtype(),
            pd.UInt32Dtype(),
            pd.UInt64Dtype()
        ]
        self._LIMIT_LOW_UINT = [
            np.iinfo(i).min for i in self._DTYPES_UINT_BASE
        ]
        self._LIMIT_HIGH_UINT = [
            np.iinfo(i).max for i in self._DTYPES_UINT_BASE
        ]
        # signed integers - base and nullable
        self._DTYPES_INT_BASE = [np.int8, np.int16, np.int32, np.int64]
        self._DTYPES_INT_NULLABLE = [
            pd.Int8Dtype(),
            pd.Int16Dtype(),
            pd.Int32Dtype(),
            pd.Int64Dtype()
        ]
        self._LIMIT_LOW_INT = [np.iinfo(i).min for i in self._DTYPES_INT_BASE]
        self._LIMIT_HIGH_INT = [np.iinfo(i).max for i in self._DTYPES_INT_BASE]
        # floats - nullable by default
        self._DTYPES_FLOAT = [np.float16, np.float32, np.float64]
        # datetime - nullable by default
        self._DTYPE_DATETIME = np.datetime64
        # string
        self._DTYPE_STRING = pd.StringDtype()
        # categorical - nullable by default
        self._DTYPE_CATEGORICAL = pd.CategoricalDtype()
Example #18
class Result(process.Result):
    def __init__(self, process_result, result_symbols, clauses):
        log.debug(process_result)
        super().__init__(**process_result.__dict__)
        self.symbols = result_symbols
        self.clauses = clauses

    pd_dtypes = {
        **process.Result.pd_dtypes, 'time_elapsed_vampire': float,
        'saturation_iterations': pd.UInt32Dtype(),
        'memory_used': pd.UInt32Dtype()
    }

    def symbols_of_type(self, symbol_type):
        return symbols.symbols_of_type(self.symbols, symbol_type)

    @property
    def saturation_iterations(self):
        try:
            return int(
                re.search(r'^% Main loop iterations started: (\d+)$',
                          self.stdout, re.MULTILINE)[1])
        except TypeError:
            return None

    @property
    def memory_used(self):
        try:
            return int(
                re.search(r'^% Memory used \[KB\]: (\d+)$', self.stdout,
                          re.MULTILINE)[1])
        except TypeError:
            return None

    @property
    def time_elapsed_vampire(self):
        try:
            return float(
                re.search(r'^% Time elapsed: (\d+\.\d+) s$', self.stdout,
                          re.MULTILINE)[1])
        except TypeError:
            return None
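
The three properties above just scan the prover's stdout with regular expressions; a standalone sketch on a made-up fragment in the same format as the patterns expect:

import re

stdout = ("% Main loop iterations started: 123\n"
          "% Memory used [KB]: 4567\n"
          "% Time elapsed: 0.042 s\n")
print(int(re.search(r'^% Main loop iterations started: (\d+)$', stdout, re.MULTILINE)[1]))  # 123
print(int(re.search(r'^% Memory used \[KB\]: (\d+)$', stdout, re.MULTILINE)[1]))            # 4567
print(float(re.search(r'^% Time elapsed: (\d+\.\d+) s$', stdout, re.MULTILINE)[1]))         # 0.042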
Example #19
File: parquet.py, Project: zjkanjie/pandas
    def read(
        self,
        path,
        columns=None,
        use_nullable_dtypes=False,
        storage_options: StorageOptions = None,
        **kwargs,
    ):
        kwargs["use_pandas_metadata"] = True

        to_pandas_kwargs = {}
        if use_nullable_dtypes:
            if LooseVersion(self.api.__version__) >= "0.16":
                import pandas as pd

                mapping = {
                    self.api.int8(): pd.Int8Dtype(),
                    self.api.int16(): pd.Int16Dtype(),
                    self.api.int32(): pd.Int32Dtype(),
                    self.api.int64(): pd.Int64Dtype(),
                    self.api.uint8(): pd.UInt8Dtype(),
                    self.api.uint16(): pd.UInt16Dtype(),
                    self.api.uint32(): pd.UInt32Dtype(),
                    self.api.uint64(): pd.UInt64Dtype(),
                    self.api.bool_(): pd.BooleanDtype(),
                    self.api.string(): pd.StringDtype(),
                }
                to_pandas_kwargs["types_mapper"] = mapping.get
            else:
                raise ValueError(
                    "'use_nullable_dtypes=True' is only supported for pyarrow >= 0.16 "
                    f"({self.api.__version__} is installed)"
                )
        manager = get_option("mode.data_manager")
        if manager == "array":
            to_pandas_kwargs["split_blocks"] = True  # type: ignore[assignment]

        path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
            path,
            kwargs.pop("filesystem", None),
            storage_options=storage_options,
            mode="rb",
        )
        try:
            result = self.api.parquet.read_table(
                path_or_handle, columns=columns, **kwargs
            ).to_pandas(**to_pandas_kwargs)
            if manager == "array":
                result = result._as_manager("array", copy=False)
            return result
        finally:
            if handles is not None:
                handles.close()
Example #20
def merge_unambig_hashes(
    args,
    unambig=None,
    ambig=None,
    hasher=None,
    mailboxes=None,
):
    """Merge unambiguous synteny hashes into proteomes per-proteome."""
    hash_name = hasher.hash_name()
    idx, dotpath = args
    outpath = dotpath_to_path(dotpath)
    syn = read_tsv_or_parquet(outpath / SYNTENY_FILE)
    syn = _join_on_col_with_na(syn, unambig, hash_name)
    syn = _join_on_col_with_na(syn, ambig, hash_name)
    syn["syn.code"] = pd.NA
    syn["syn.code"] = _fill_col1_val_where_col2_notna(syn["syn.code"],
                                                      syn["syn.anchor.id"],
                                                      UNAMBIGUOUS_CODE)
    # Calculate disambiguation hashes and write them out for merge
    disambig_frame_list = []
    for unused_frag, subframe in syn.groupby(by=["frag.id"]):
        disambig_frame_list.append(hasher.calculate_disambig_hashes(subframe))
    disambig_fr = pd.concat(
        [df for df in disambig_frame_list if df is not None])
    disambig_fr = disambig_fr.dropna(how="all")
    syn = syn.join(disambig_fr)
    write_tsv_or_parquet(syn, outpath / SYNTENY_FILE, remove_tmp=False)
    # Write out unified upstream/downstream hash values
    merged_hashes = pd.concat(
        [
            _rename_and_fill_alt(syn, "tmp.disambig.up", "tmp.disambig.down"),
            _rename_and_fill_alt(syn, "tmp.disambig.down", "tmp.disambig.up"),
        ],
        ignore_index=True,
    )
    merged_hashes["self_count"] = pd.array(
        merged_hashes["hash"].map(merged_hashes["hash"].value_counts()),
        dtype=pd.UInt32Dtype(),
    )
    merged_hashes = merged_hashes.reindex(
        columns=["hash", "self_count", "alt_hash"])
    unique_hashes = (merged_hashes.drop_duplicates(
        subset=["hash"]).set_index("hash").sort_index())
    del merged_hashes
    with mailboxes.locked_open_for_write(idx) as file_handle:
        unique_hashes.to_csv(file_handle, header=False, sep="\t")
    return {
        "idx": idx,
        "path": dotpath,
        "syn.anchors.unambiguous": _count_code(syn["syn.code"],
                                               UNAMBIGUOUS_CODE),
    }
Example #21
 def test_numeric_dtypes(self):
     dtypes = [
         bool,
         np.byte,
         np.ubyte,
         np.short,
         np.ushort,
         np.single,
         np.int32,
         np.intc,
         np.half,
         np.float16,
         np.double,
         np.float64,
         pd.StringDtype(),
         pd.Int64Dtype(),
         pd.UInt64Dtype(),
         pd.Int32Dtype(),
         pd.UInt32Dtype(),
         pd.Int16Dtype(),
         pd.UInt16Dtype(),
         pd.Int8Dtype(),
         pd.UInt8Dtype(),
     ]
     for suffix, fn in [
         (".snappy", "parquet"),
         (".feather", "feather"),
         (".xml", "xml"),
         (".csv", "csv"),
         (".tsv", "tsv"),
         (".json", "json"),
         (".xlsx", "xlsx"),
         (".xls", "xls"),
         (".xlsb", "xlsb"),
         (".ods", "ods"),
         (".pickle", "pickle"),
     ]:
         with tmpfile(suffix) as path:
             for dtype in dtypes:
                 try:
                     df = Ind2Col2.convert(Ind2Col2(
                         sample_data_ind2_col2())).astype(dtype)
                     assert list(df.index.names) == ["qqq", "rrr"]
                     assert list(df.columns) == ["abc", "xyz"]
                     getattr(df, "to_" + fn)(path)
                     df2 = getattr(Ind2Col2, "read_" + fn)(path)
                     assert list(df2.index.names) == ["qqq", "rrr"]
                     assert list(df2.columns) == ["abc", "xyz"]
                 except Exception:
                     logger.error(f"Failed on path {path}, dtype {dtype}")
                     raise
Example #22
File: parquet.py, Project: tnir/pandas
    def read(
        self,
        path,
        columns=None,
        use_nullable_dtypes=False,
        storage_options: StorageOptions = None,
        **kwargs,
    ) -> DataFrame:
        kwargs["use_pandas_metadata"] = True

        to_pandas_kwargs = {}
        if use_nullable_dtypes:
            import pandas as pd

            mapping = {
                self.api.int8(): pd.Int8Dtype(),
                self.api.int16(): pd.Int16Dtype(),
                self.api.int32(): pd.Int32Dtype(),
                self.api.int64(): pd.Int64Dtype(),
                self.api.uint8(): pd.UInt8Dtype(),
                self.api.uint16(): pd.UInt16Dtype(),
                self.api.uint32(): pd.UInt32Dtype(),
                self.api.uint64(): pd.UInt64Dtype(),
                self.api.bool_(): pd.BooleanDtype(),
                self.api.string(): pd.StringDtype(),
                self.api.float32(): pd.Float32Dtype(),
                self.api.float64(): pd.Float64Dtype(),
            }
            to_pandas_kwargs["types_mapper"] = mapping.get
        manager = get_option("mode.data_manager")
        if manager == "array":
            to_pandas_kwargs["split_blocks"] = True  # type: ignore[assignment]

        path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
            path,
            kwargs.pop("filesystem", None),
            storage_options=storage_options,
            mode="rb",
        )
        try:
            result = self.api.parquet.read_table(
                path_or_handle, columns=columns,
                **kwargs).to_pandas(**to_pandas_kwargs)
            if manager == "array":
                result = result._as_manager("array", copy=False)
            return result
        finally:
            if handles is not None:
                handles.close()
Example #23
 def dtypes(self):
     return {
         'clausify_returncode': 'category',
         'num_clauses': pd.UInt32Dtype(),
         'num_predicate': pd.UInt32Dtype(),
         'num_function': pd.UInt32Dtype(),
         'graph_nodes': pd.UInt32Dtype(),
         'graph_nodes_lower_bound': pd.UInt32Dtype(),
         **{
             f'graph_nodes_{ntype}': pd.UInt32Dtype()
             for ntype in self.formula_visitor().ntypes()
         },
         'graph_edges': pd.UInt32Dtype(),
     }
Example #24
def _cum_val_count(arr):
    """Return an array of cumulative counts of values."""
    counts = {}
    out_arr = pd.array(
        [pd.NA] * len(arr),
        dtype=pd.UInt32Dtype(),
    )
    for i, val in enumerate(arr):
        if pd.isnull(val):
            continue
        elif val in counts:
            counts[val] += 1
        else:
            counts[val] = 1
        out_arr[i] = counts[val]
    return out_arr
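
For illustration, with a small hypothetical input each non-null value gets its running occurrence count and nulls stay <NA>:

import pandas as pd

print(list(_cum_val_count(pd.array([7, 7, pd.NA, 3, 7]))))
# [1, 2, <NA>, 1, 3]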
Example #25
 def test_numeric_nullable_dtypes(self):
     dtypes = [
         pd.StringDtype(),
         pd.BooleanDtype(),
         pd.Float64Dtype(),
         pd.Float32Dtype(),
         pd.Int64Dtype(),
         pd.UInt64Dtype(),
         pd.Int32Dtype(),
         pd.UInt32Dtype(),
         pd.Int16Dtype(),
         pd.UInt16Dtype(),
         pd.Int8Dtype(),
         pd.UInt8Dtype(),
         pd.StringDtype(),
     ]
     # TODO: Re-add (".xml", "xml"),
     # TODO: See https://github.com/dmyersturnbull/typed-dfs/issues/46
     for suffix, fn in [
         (".snappy", "parquet"),
         (".feather", "feather"),
         (".csv", "csv"),
         (".tsv", "tsv"),
         (".json", "json"),
         (".xlsx", "xlsx"),
         (".xls", "xls"),
         (".xlsb", "xlsb"),
         (".ods", "ods"),
         (".pickle", "pickle"),
     ]:
         # TODO: include xml
         for dtype in dtypes:
             with tmpfile(suffix) as path:
                 try:
                     df = Ind2Col2.convert(
                         Ind2Col2(
                             sample_data_ind2_col2_pd_na())).astype(dtype)
                     assert list(df.index.names) == ["qqq", "rrr"]
                     assert list(df.columns) == ["abc", "xyz"]
                     getattr(df, "to_" + fn)(path)
                     df2 = getattr(Ind2Col2, "read_" + fn)(path)
                     assert list(df2.index.names) == ["qqq", "rrr"]
                     assert list(df2.columns) == ["abc", "xyz"]
                 except Exception:
                     logger.error(f"Failed on path {path}, dtype {dtype}")
                     raise
Example #26
def test_to_pandas_dtype_integer_nullable():
    expectations = {
        (-100, 100): pd.Int8Dtype(),
        (0, 240): pd.UInt8Dtype(),
        (-10000, 10000): pd.Int16Dtype(),
        (500, 40000): pd.UInt16Dtype(),
        (-200000000, 200000000): pd.Int32Dtype(),
        (25, 4000000000): pd.UInt32Dtype(),
        (-9000000000000000000, 2000000000): pd.Int64Dtype(),
        (25, 10000000000000000000): pd.UInt64Dtype(),
        (25, 1000000000000000000000000000): np.float128,
        (None, None): pd.Int64Dtype(),
    }
    for (min_, max_), expected_pandas_type in expectations.items():
        constraints = RecordsSchemaFieldIntegerConstraints(required=True,
                                                           unique=None,
                                                           min_=min_,
                                                           max_=max_)
        yield (with_nullable(True, check_dtype), "integer", constraints,
               expected_pandas_type)
Example #27
def calculate_synteny_hashes(args,
                             mailboxes=None,
                             hasher=None,
                             unambig=None,
                             ambig=None):
    """Calculate synteny hashes for proteins per-genome."""
    idx, dotpath = args
    outpath = dotpath_to_path(dotpath)
    hom = read_tsv_or_parquet(outpath / HOMOLOGY_FILE)
    hom["tmp.nan_group"] = (
        (hom["hom.cluster"].isnull()).astype(int).cumsum() +
        1) * (~hom["hom.cluster"].isnull())
    hom.replace(to_replace={"tmp.nan_group": 0}, value=pd.NA, inplace=True)
    hash_name = hasher.hash_name()
    syn_list = []
    if hasher.thorny:  # drop rows
        hom = hom[hom["hom.cluster"].notna()]
    for unused_id_tuple, subframe in hom.groupby(
            by=["frag.id", "tmp.nan_group"]):
        syn_list.append(hasher.calculate(subframe["hom.cluster"]))
    del hom["tmp.nan_group"]
    syn = hom.join(pd.concat([df for df in syn_list if df is not None],
                             axis=0))
    del syn_list
    write_tsv_or_parquet(syn, outpath / SYNTENY_FILE, remove_tmp=False)
    syn["tmp.self_count"] = pd.array(
        syn[hash_name].map(syn[hash_name].value_counts()),
        dtype=pd.UInt32Dtype(),
    )
    unique_hashes = (syn[[
        hash_name, "tmp.self_count"
    ]].drop_duplicates(subset=[hash_name]).dropna(how="any"))
    unique_hashes = unique_hashes.set_index(hash_name).sort_index()
    with mailboxes.locked_open_for_write(idx) as file_handle:
        unique_hashes.to_csv(file_handle, header=False, sep="\t")
    return {
        "idx": idx,
        "path": dotpath,
        "hom.clusters": syn["hom.cluster"].notna().sum(),
        "syn.hashes.n": syn[hash_name].notna().sum(),
    }
Example #28
def _fill_na_with_last_valid(ser, flip=False):
    """Input a series with NA values, returns a series with those values filled."""
    lv_arr = pd.array(
        [pd.NA] * len(ser),
        dtype=pd.UInt32Dtype(),
    )
    if not (ser.isnull().all() or ser.notna().all()):
        null_vec = ser.isnull().to_numpy()
        val_vec = ser.to_numpy()
        if flip:
            null_vec = np.flip(null_vec)
            val_vec = np.flip(val_vec)
        first_null_pos, null_runs = _true_positions_and_runs(null_vec)
        fill_vals = np.append(pd.NA, val_vec)[first_null_pos]
        for i, pos in enumerate(first_null_pos):
            for j in range(null_runs[i]):
                lv_arr[pos + j] = fill_vals[i]
        if flip:
            lv_arr = np.flip(lv_arr)
    # Degenerate inputs (all NA or no NA) fall through to an all-NA result.
    return pd.Series(lv_arr, index=ser.index)
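
Ignoring the UInt32 dtype and the helper functions, the effect is roughly a forward fill masked back to the originally-NA positions; a small hypothetical check:

import pandas as pd

ser = pd.Series([5, pd.NA, pd.NA, 7, pd.NA], dtype="UInt32")
print(ser.ffill().where(ser.isna()).tolist())  # [<NA>, 5, 5, <NA>, 7]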
Example #29
def _cum_val_cnt_where_ser2_is_na(ser1, ser2, flip=False):
    """Return the cumulative value count of ser1 in regions where ser2 is NA."""
    if len(ser1) != len(ser2):
        logger.warning(f"Lengths of ser1 and ser2 differ at {ser1}")
    vc_arr = pd.array(
        [pd.NA] * len(ser1),
        dtype=pd.UInt32Dtype(),
    )
    if not (ser2.isnull().all() or ser2.notna().all()):
        null_vec = ser2.isnull().to_numpy()
        val_vec = ser1.to_numpy()
        if flip:
            null_vec = np.flip(null_vec)
            val_vec = np.flip(val_vec)
        null_pos, null_runs = _true_positions_and_runs(null_vec)
        null_len = len(null_pos)
        for i in range(null_len):
            vc_arr[null_pos[i]:(null_pos[i] + null_runs[i])] = _cum_val_count(
                val_vec[null_pos[i]:(null_pos[i] + null_runs[i])])
        if flip:
            vc_arr = np.flip(vc_arr)
    vc_ser = pd.Series(vc_arr, index=ser2.index)
    return vc_ser
Example #30
def calculate_adjacency_group(index_series, frag_series):
    """Calculate an adjacency group numger."""
    index_fr = pd.DataFrame({"index": index_series, "fragment": frag_series})
    n_prot = len(index_fr)
    adj_gr_count = 0
    was_adj = False
    index_fr["i"] = range(n_prot)
    adj_group = np.array([np.nan] * n_prot)
    for unused_group, subframe in index_fr.groupby(by=["fragment"]):
        if len(subframe) == 1:
            continue
        last_pos = -2
        last_row = None
        if was_adj:
            adj_gr_count += 1
        was_adj = False
        for unused_i, row in subframe.iterrows():
            row_no = row["i"]
            if row["index"] == last_pos + 1:
                if not was_adj:
                    adj_group[last_row] = adj_gr_count
                was_adj = True
                adj_group[row_no] = adj_gr_count
            else:
                if was_adj:
                    adj_gr_count += 1
                    was_adj = False
            last_pos = row["index"]
            last_row = row_no
    if was_adj:
        adj_gr_count += 1
    adj_arr = pd.Series(adj_group,
                        dtype=pd.UInt32Dtype(),
                        index=index_series.index)
    n_adj = n_prot - adj_arr.isnull().sum()
    return n_adj, adj_gr_count, adj_arr