Пример #1
0
def spark_type_to_pandas_dtype(spark_type: types.DataType,
                               *,
                               use_extension_dtypes: bool = False) -> Dtype:
    """Translate a Spark DataType instance into the matching pandas dtype.

    Parameters
    ----------
    spark_type : types.DataType
        The Spark type to translate.
    use_extension_dtypes : bool, default False
        When true, and the module-level availability flags allow it,
        integral, boolean, string and fractional Spark types are mapped to
        pandas extension dtypes instead of NumPy dtypes.

    Returns
    -------
    Dtype
        The matched extension dtype if one was requested and found.
        Otherwise ``object`` for date, null, array, map, struct and
        user-defined types, ``datetime64[ns]`` for timestamps, and for
        everything else ``np.dtype`` of whatever
        ``to_arrow_type(spark_type).to_pandas_dtype()`` yields.
    """

    def first_match(candidates):
        # Instantiate the dtype paired with the first Spark class that matches.
        for spark_cls, make_dtype in candidates:
            if isinstance(spark_type, spark_cls):
                return make_dtype()
        return None

    if use_extension_dtypes and extension_dtypes_available:
        # IntegralType
        matched = first_match((
            (types.ByteType, Int8Dtype),
            (types.ShortType, Int16Dtype),
            (types.IntegerType, Int32Dtype),
            (types.LongType, Int64Dtype),
        ))
        if matched is not None:
            return matched

        # BooleanType and StringType
        if extension_object_dtypes_available:
            matched = first_match((
                (types.BooleanType, BooleanDtype),
                (types.StringType, StringDtype),
            ))
            if matched is not None:
                return matched

        # FractionalType
        if extension_float_dtypes_available:
            matched = first_match((
                (types.FloatType, Float32Dtype),
                (types.DoubleType, Float64Dtype),
            ))
            if matched is not None:
                return matched

    # No extension dtype applied (or none was requested): use NumPy dtypes.
    object_backed = (
        types.DateType,
        types.NullType,
        types.ArrayType,
        types.MapType,
        types.StructType,
        types.UserDefinedType,
    )
    if isinstance(spark_type, object_backed):
        return np.dtype("object")
    if isinstance(spark_type, types.TimestampType):
        return np.dtype("datetime64[ns]")
    return np.dtype(to_arrow_type(spark_type).to_pandas_dtype())
Пример #2
0
 def object_extension_dtypes(self):
     """List the boolean/string extension dtype specifiers.

     Returns the string aliases and dtype instances for the boolean and
     string extension dtypes, or an empty list when
     ``extension_object_dtypes_available`` is false.
     """
     if not extension_object_dtypes_available:
         return []
     return ["boolean", "string", BooleanDtype(), StringDtype()]
Пример #3
0
 def string_extension_dtype(self):
     """List the string extension dtype specifiers.

     Returns the ``"string"`` alias and a ``StringDtype`` instance, or an
     empty list when ``extension_object_dtypes_available`` is false.
     """
     if not extension_object_dtypes_available:
         return []
     return ["string", StringDtype()]
Пример #4
0
def name2taxid(names, sciname=False, threads=None, data_dir=None, debug=False):
    '''Look up taxids for taxon names by running `taxonkit name2taxid`

    Parameters
    ----------
    names : list or iterable
        Species names or synonyms; each item is converted with `str` and sent to taxonkit on
        its own line
    sciname: bool, default False
        When `sciname=True`, only scientific names are matched (synonyms are ignored); by
        default both scientific names and synonyms are supported
    threads : int
        Override the default taxonkit threads setting
    data_dir : str, default None
        Location of the NCBI taxonomy `.dmp` files; by default, taxonkit searches in
        `~/.taxonkit/`
    debug : bool, default False
        Print debugging output, e.g., system calls to `taxonkit`

    Returns
    -------
    DataFrame
        A table with the columns `Name`, `TaxID`, and `Rank`.

    Raises
    ------
    TaxonKitCLIError
        If the `taxonkit` process exits with a non-zero return code; the message is the
        process's standard error output.

    Examples
    --------
    >>> import pytaxonkit
    >>> names = ['Phyllobolus spinuliferus', 'Alteromonas putrefaciens', 'Rexia erectus']
    >>> pytaxonkit.name2taxid(names)
                           Name   TaxID     Rank
    0  Phyllobolus spinuliferus  359607  species
    1  Alteromonas putrefaciens      24  species
    2             Rexia erectus  262902  species
    >>> pytaxonkit.name2taxid(names, sciname=True)
                           Name  TaxID  Rank
    0  Phyllobolus spinuliferus   <NA>  <NA>
    1  Alteromonas putrefaciens   <NA>  <NA>
    2             Rexia erectus   <NA>  <NA>
    '''
    # taxonkit reads one query name per line from standard input.
    stdin_text = '\n'.join(str(name) for name in names)

    command = ['taxonkit', 'name2taxid', '--show-rank']
    if sciname:
        command += ['--sci-name']
    if threads:
        command += ['--threads', validate_threads(threads)]
    if data_dir:
        command += ['--data-dir', validate_data_dir(data_dir)]  # pragma: no cover
    if debug:
        log(*command)  # pragma: no cover

    process = Popen(command, stdin=PIPE, stdout=PIPE, stderr=PIPE, universal_newlines=True)
    stdout_text, stderr_text = process.communicate(input=stdin_text)
    if process.returncode != 0:
        raise TaxonKitCLIError(stderr_text)  # pragma: no cover

    # The same mapping supplies both the column names and their dtypes.
    column_dtypes = dict(
        Name=StringDtype(),
        TaxID=UInt32Dtype(),
        Rank=StringDtype(),
    )
    return pandas.read_csv(
        StringIO(stdout_text),
        sep='\t',
        header=None,
        names=column_dtypes,
        dtype=column_dtypes,
        index_col=False,
    )
Пример #5
0
def create_schema(phases, sources):
    """Build the column-name -> dtype mapping for a paired-entity dataframe.

    Every per-entity field appears twice, prefixed with ``left_`` and
    ``right_``. The per-entity ``schema`` field is categorical over
    ``settings.SCHEMAS``; all other per-entity fields are string-typed.
    The mapping then gets the pair-level columns ``judgement``, ``source``,
    ``phase``, ``features`` and an unprefixed string ``schema`` column.

    Args:
        phases: categories passed to ``CategoricalDtype`` for the ``phase``
            column.
        sources: categories passed to ``CategoricalDtype`` for the ``source``
            column.

    Returns:
        dict mapping column names to dtypes.
    """
    entity_fields = (
        "name",
        "schema",
        "collection_id",
        "id",
        "country",
        "address",
        "registrationNumber",
        "alias",
        "status",
        "classification",
        "gender",
        "firstName",
        "lastName",
        "birthPlace",
        "birthDate",
        "idNumber",
        "motherName",
        "nationality",
    )
    # Only the entity's schema is categorical; every other field is a string.
    entity_dtypes = {}
    for field in entity_fields:
        if field == "schema":
            entity_dtypes[field] = CategoricalDtype(settings.SCHEMAS)
        else:
            entity_dtypes[field] = StringDtype()

    # Duplicate each entity field for both sides of the pair.
    meta = {}
    for side in ("left", "right"):
        for field, dtype in entity_dtypes.items():
            meta[f"{side}_{field}"] = dtype

    # Pair-level columns.
    meta.update(
        {
            "judgement": bool,
            "source": CategoricalDtype(sources),
            "phase": CategoricalDtype(phases),
            "features": object,
            "schema": StringDtype(),
        }
    )
    return meta