def dataframe_to_keytable(self, df, keys=[]):
    """Build a key table from a Spark SQL DataFrame.

    Each Spark SQL data type maps onto the Hail type of the same kind:

    .. code-block:: text

      BooleanType => Boolean
      IntegerType => Int
      LongType => Long
      FloatType => Float
      DoubleType => Double
      StringType => String
      BinaryType => Binary
      ArrayType => Array
      StructType => Struct

    Spark SQL data types not listed above are currently unsupported.

    :param df: Spark SQL DataFrame to convert.

    :param keys: List of key column names.
    :type keys: list of string

    :return: Key table constructed from the Spark SQL DataFrame.
    :rtype: :class:`.KeyTable`
    """
    # Wrap the key names with jarray (given java.lang.String) before they
    # are handed to the JVM-side constructor.
    java_key_names = jarray(self._jvm.java.lang.String, keys)
    java_table = self._hail.keytable.KeyTable.fromDF(
        self._jhc, df._jdf, java_key_names)
    return KeyTable(self, java_table)
def import_keytable(self, path, npartitions=None, config=TextTableConfig()):
    """Load one or more delimited text files (text tables) into a key table.

    The resulting key table will have no key columns, use
    :py:meth:`.KeyTable.key_by` to specify keys.

    :param path: files to import.
    :type path: str or list of str

    :param npartitions: Number of partitions.
    :type npartitions: int or None

    :param config: Configuration options for importing text files
    :type config: :class:`.TextTableConfig`

    :return: Key table constructed from text table.
    :rtype: :class:`.KeyTable`
    """
    # A falsy config (e.g. None) is replaced by a default-constructed one.
    config = config or TextTableConfig()
    java_table = self._jhc.importKeyTable(
        jindexed_seq_args(path),
        joption(npartitions),
        config._to_java())
    return KeyTable(self, java_table)
def import_keytable(self, path, key_names=[], npartitions=None, config=TextTableConfig()):
    """Load one or more delimited text files (text tables) into a KeyTable.

    :param path: files to import.
    :type path: str or list of str

    :param key_names: The name(s) of fields to be considered keys
    :type key_names: str or list of str

    :param npartitions: Number of partitions.
    :type npartitions: int or None

    :param config: Configuration options for importing text files
    :type config: :class:`.TextTableConfig`

    :rtype: :class:`.KeyTable`
    """
    # A falsy config (e.g. None) is replaced by a default-constructed one.
    config = config or TextTableConfig()
    java_table = self._jhc.importKeyTable(
        jindexed_seq_args(path),
        jindexed_seq_args(key_names),
        joption(npartitions),
        config._to_java())
    return KeyTable(self, java_table)
def read_table(self, path):
    """Load a key table from a KT file.

    :param str path: KT file to read.

    :return: Key table read from disk.
    :rtype: :class:`.KeyTable`
    """
    return KeyTable(self, self._jhc.readTable(path))
def import_keytable(self, path, key_names, npartitions=None, config=None):
    """Load one or more delimited text files (text tables) into a KeyTable.

    :param path: files to import.
    :type path: str or list of str

    :param key_names: The name(s) of fields to be considered keys
    :type key_names: str or list of str

    :param npartitions: Number of partitions.
    :type npartitions: int or None

    :param config: Configuration options for importing text files
    :type config: :class:`.TextTableConfig` or None

    :rtype: :class:`.KeyTable`
    """
    # Normalise ``path`` to a list: a lone string becomes a one-element list,
    # anything else is iterated and copied.
    path_args = [path] if isinstance(path, str) else list(path)

    # Several key names are joined into a single comma-separated string.
    if not isinstance(key_names, str):
        key_names = ','.join(key_names)

    # Falsy arguments fall back to the Spark context's default minimum
    # partition count and to a default-constructed TextTableConfig.
    npartitions = npartitions or self.sc.defaultMinPartitions
    config = config or TextTableConfig()

    java_paths = jarray(self._jvm.java.lang.String, path_args)
    java_table = self._hail.keytable.KeyTable.importTextTable(
        self._jsc, java_paths, key_names, npartitions, config._to_java())
    return KeyTable(self, java_table)
def import_table(self, paths, key=[], min_partitions=None, impute=False, no_header=False,
                 comment=None, delimiter="\t", missing="NA", types={}, quote=None):
    """Import delimited text file (text table) as key table.

    Key columns can be named with ``key``; by default none are given
    (``key=[]``), and :py:meth:`.KeyTable.key_by` can be used on the result
    to specify keys.

    **Example**

    Given this file

    .. code-block:: text

        $ cat data/samples1.tsv
        Sample  Height  Status  Age
        PT-1234 154.1   ADHD    24
        PT-1236 160.9   Control 19
        PT-1238 NA      ADHD    89
        PT-1239 170.3   Control 55

    The interesting thing about this table is that column ``Height`` is a
    floating-point number, and column ``Age`` is an integer. We can either
    have Hail impute these types from the file, or pass them ourselves:

    Pass the types ourselves:

    >>> table = hc.import_table('data/samples1.tsv', types={'Height': TFloat64(), 'Age': TInt32()})

    Note that string columns like ``Sample`` and ``Status`` do not need to be
    typed, because ``String`` is the default type.

    Use type imputation (a bit easier, but requires reading the file twice):

    >>> table = hc.import_table('data/samples1.tsv', impute=True)

    **Detailed examples**

    Let's import annotations from a CSV file with missing data and special
    characters:

    .. code-block:: text

        $ cat data/samples2.tsv
        Batch,PT-ID
        1kg,PT-0001
        1kg,PT-0002
        study1,PT-0003
        study3,PT-0003
        .,PT-0004
        1kg,PT-0005
        .,PT-0006
        1kg,PT-0007

    In this case, we should:

    - Pass the non-default delimiter ``,``

    - Pass the non-default missing value ``.``

    >>> table = hc.import_table('data/samples2.tsv', delimiter=',', missing='.')

    Let's import annotations from a file with no header and sample IDs that
    need to be transformed. Suppose the vds sample IDs are of the form
    ``NA#####``. This file has no header line, and the sample ID is hidden in
    a field with other information.

    .. code-block:: text

        $ cat data/samples3.tsv
        1kg_NA12345   female
        1kg_NA12346   male
        1kg_NA12348   female
        pgc_NA23415   male
        pgc_NA23418   male

    To import:

    >>> annotations = (hc.import_table('data/samples3.tsv', no_header=True)
    ...                   .annotate('sample = f0.split("_")[1]')
    ...                   .key_by('sample'))

    **Notes**

    The ``impute`` option tells Hail to scan the file an extra time to gather
    information about possible field types. While this is a bit slower for
    large files (the file is parsed twice), the convenience is often worth
    this cost.

    The ``delimiter`` parameter is a field separator regex. This regex follows
    the `Java regex standard
    <http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html>`_.

    .. note::

        Use ``delimiter='\\s+'`` to specify whitespace delimited files.

    The ``comment`` is an optional parameter which causes Hail to skip any
    line that starts with the given pattern. Passing ``comment='#'`` will skip
    any line beginning with a pound sign, for example.

    The ``missing`` parameter defines the representation of missing data in
    the table.

    .. note::

        The ``comment`` and ``missing`` parameters are **NOT** regexes.

    The ``no_header`` option indicates that the file has no header line. If
    this option is passed, then the column names will be ``f0``, ``f1``, ...
    ``fN`` (0-indexed).

    The ``types`` option allows the user to pass the types of columns in the
    table. This is a dict keyed by ``str``, with :py:class:`~hail.expr.Type`
    values. See the examples above for a standard usage. Additionally, this
    option can be used to override type imputation. For example, if a column
    in a file refers to chromosome and does not contain any sex chromosomes,
    it will be imputed as an integer, while most Hail methods expect
    chromosome to be passed as a string. Using the ``impute=True`` mode and
    passing ``types={'Chromosome': TString()}`` will solve this problem.

    The ``min_partitions`` option can be used to increase the number of
    partitions (level of sharding) of an imported table. The default partition
    size depends on file system and a number of other factors (including the
    ``min_block_size`` of the hail context), but usually is between 32M and
    128M.

    :param paths: Files to import.
    :type paths: str or list of str

    :param key: Key column(s).
    :type key: str or list of str

    :param min_partitions: Minimum number of partitions.
    :type min_partitions: int or None

    :param bool no_header: File has no header and the N columns are named
        ``f0``, ``f1``, ... ``fN`` (0-indexed)

    :param bool impute: Impute column types from the file

    :param comment: Skip lines beginning with the given pattern
    :type comment: str or None

    :param str delimiter: Field delimiter regex

    :param str missing: Specify identifier to be treated as missing

    :param types: Define types of fields in annotations files
    :type types: dict with str keys and :py:class:`.Type` values

    :param quote: Quote character
    :type quote: str or None

    :return: Key table constructed from text table.
    :rtype: :class:`.KeyTable`
    """
    # Both ``key`` and ``paths`` are documented as "str or list of str";
    # wrap_to_list is applied to each before the call into the JVM.
    key_columns = wrap_to_list(key)
    input_paths = wrap_to_list(paths)

    # The importer is given the ``_jtype`` of each declared column type.
    java_types = {column: hail_type._jtype
                  for column, hail_type in types.items()}

    java_table = self._jhc.importTable(
        input_paths, key_columns, min_partitions, java_types, comment,
        delimiter, missing, no_header, impute, quote)
    return KeyTable(self, java_table)