Exemplo n.º 1
0
def get_temp_file(dir=None,
                  shared=False,
                  suffix="",
                  mode="w+",
                  encoding="utf-8"):
    '''create and return an open temporary file.

    The caller is responsible for closing and removing the file when it
    is no longer needed. Unlike the default behaviour of
    :class:`tempfile.NamedTemporaryFile` (``w+b``), the file is opened
    in text mode (``w+``) with ``utf-8`` encoding.

    The target directory is created on demand if it is missing.

    Arguments
    ---------
    dir : string
        Directory in which to place the temporary file. If not given,
        the default temporary location from the global configuration
        dictionary is used.
    shared : bool
        If set, place the temporary file in the shared temporary
        location from the global configuration dictionary.
    suffix : string
        Filename suffix.
    mode : string
        File open mode.
    encoding : string
        Text encoding used to open the file.

    Returns
    -------
    file : File
        An open file object for the temporary file.

    '''
    if dir is None:
        key = 'shared_tmpdir' if shared else 'tmpdir'
        dir = get_params()[key]

    if not os.path.exists(dir):
        try:
            os.makedirs(dir)
        except OSError:
            # several processes may race to create the same directory;
            # swallow the error here and verify existence below instead.
            pass
        if not os.path.exists(dir):
            raise OSError(
                "temporary directory {} could not be created".format(dir))

    return tempfile.NamedTemporaryFile(
        dir=dir,
        delete=False,
        prefix="ctmp",
        mode=mode,
        encoding=encoding,
        suffix=suffix)
Exemplo n.º 2
0
def get_database_name():
    '''Return the database name associated with the pipeline.

    Several configuration keys are checked in order so that both the
    old style ``database`` and the new style ``database_name`` entries
    are supported. This exists purely for backwards compatibility.

    Returns
    -------
    databasename : string
        The configured database name.

    Raises
    ------
    KeyError
       If no database name is found under any known key.

    '''
    params = get_params()
    # prefer the new-style key, fall back to the legacy one
    for key in ("database_name", "database"):
        value = params.get(key)
        if value is not None:
            return value

    raise KeyError("database name not found")
Exemplo n.º 3
0
def connect():
    """connect to the SQLite database used in this pipeline.

    .. note::
       Only sqlite databases are fully supported at present; generic
       database access would require refactoring or a (partial) ORM.

    If ``annotations_dir`` is present in the configuration, the sqlite
    connection factory attaches the corresponding database under the
    schema name ``annotations``.

    Returns
    -------
    dbh
       a database handle (an sqlalchemy engine)

    """
    # in the future this may return an sqlalchemy or db.py handle
    url = get_params()["database"]["url"]
    is_sqlite3 = url.startswith("sqlite")

    # sqlite connections are shared across threads in this pipeline
    connect_args = {'check_same_thread': False} if is_sqlite3 else {}

    creator = None
    if is_sqlite3 and "annotations_dir" in get_params():
        # strip the url scheme to recover a filesystem path, e.g.
        # sqlite:///./csvdb -> ./csvdb
        # sqlite:////path/to/csvdb -> /path/to/csvdb
        filename = os.path.abspath(url[len("sqlite:///"):])

        def creator():
            conn = sqlite3.connect(filename)
            conn.execute("ATTACH DATABASE '{}' as annotations".format(
                os.path.join(get_params()["annotations_dir"], "csvdb")))
            return conn

    return sqlalchemy.create_engine(url,
                                    connect_args=connect_args,
                                    creator=creator)
Exemplo n.º 4
0
def get_temp_dir(dir=None, shared=False, clear=False):
    '''create and return a temporary directory.

    The caller is responsible for removing the directory once it is no
    longer needed. The parent directory is created on demand if it does
    not exist.

    Arguments
    ---------
    dir : string
        Parent directory for the temporary directory. If not given, the
        default temporary location from the global configuration
        dictionary is used.
    shared : bool
        If set, place the temporary directory in the shared temporary
        location.
    clear : bool
        If set, remove the directory again before returning, yielding
        only a unique, currently non-existent pathname.

    Returns
    -------
    filename : string
        Absolute pathname of the temporary directory.

    '''
    if dir is None:
        key = 'shared_tmpdir' if shared else 'tmpdir'
        dir = get_params()[key]

    if not os.path.exists(dir):
        os.makedirs(dir)

    tmpdir = tempfile.mkdtemp(dir=dir, prefix="ctmp")
    if clear:
        os.rmdir(tmpdir)
    return tmpdir
Exemplo n.º 5
0
def build_load_statement(tablename, retry=True, options=""):
    """build a command line statement to upload data.

    Upload is performed via the :doc:`csv2db` script.

    The returned statement is suitable for use in a pipe expression and
    honours the configured database backend and access settings, e.g.::

        load_statement = P.build_load_statement("data")
        statement = "cat data.txt | %(load_statement)s"
        P.run(statement)

    Arguments
    ---------
    tablename : string
        Tablename for upload
    retry : bool
        Add the ``--retry`` option to `csv2db.py`
    options : string
        Command line options to be passed on to `csv2db.py`

    Returns
    -------
    string

    """
    parts = ["--database-url={}".format(get_params()["database"]["url"])]
    if retry:
        # keep --retry ahead of the database url, padded as expected
        parts.insert(0, " --retry ")

    db_options = " ".join(parts)
    return ("python -m CGATCore.CSV2DB {db_options} {options} "
            "--table={tablename}".format(db_options=db_options,
                                         options=options,
                                         tablename=tablename))
Exemplo n.º 6
0
def load(infile,
         outfile=None,
         options="",
         collapse=False,
         transpose=False,
         tablename=None,
         retry=True,
         limit=0,
         shuffle=False,
         job_memory=None):
    """import data from a tab-separated file into database.

    The table name is given by outfile without the
    ".load" suffix.

    A typical load task in ruffus would look like this::

        @transform("*.tsv.gz", suffix(".tsv.gz"), ".load")
        def loadData(infile, outfile):
            P.load(infile, outfile)

    Upload is performed via the :doc:`csv2db` script.

    Arguments
    ---------
    infile : string
        Filename of the input data
    outfile : string
        Output filename. This will contain the logging information. The
        table name is derived from `outfile` if `tablename` is not set.
    options : string
        Command line options for the `csv2db.py` script.
    collapse : string
        If set, the table will be collapsed before loading. This
        transforms a data set with two columns where the first column
        is the row name into a multi-column table.  The value of
        collapse is the value used for missing values.
    transpose : string
        If set, the table will be transposed before loading. The first
        column in the first row will be set to the string within
        transpose.
    tablename : string
        Name to use for the table. If unset, derived from `outfile`.
    retry : bool
        If True, multiple attempts will be made if the data can
        not be loaded at the first try, for example if a table is locked.
    limit : int
        If set, only load the first n lines.
    shuffle : bool
        If set, randomize lines before loading. Together with `limit`
        this permits loading a sample of rows.
    job_memory : string
        Amount of memory to allocate for job. If unset, uses the global
        default.
    """

    # NOTE(review): job_memory is never referenced again in this body;
    # presumably run() inspects the caller's locals for it — confirm.
    if job_memory is None:
        job_memory = get_params()["cluster_memory_default"]

    if not tablename:
        tablename = to_table(outfile)

    # the statement is assembled as a shell pipeline of filter stages
    statement = []

    # choose reader depending on compression of the input file
    if infile.endswith(".gz"):
        statement.append("zcat %(infile)s")
    else:
        statement.append("cat %(infile)s")

    if collapse:
        statement.append("python -m CGATCore.Table "
                         "--log=%(outfile)s.collapse.log "
                         "--collapse=%(collapse)s")

    if transpose:
        statement.append("python -m CGATCore.Table "
                         "--log=%(outfile)s.transpose.log "
                         "--transpose "
                         "--set-transpose-field=%(transpose)s")

    if shuffle:
        statement.append("python -m CGATCore.Table "
                         "--log=%(outfile)s.shuffle.log "
                         "--method=randomize-rows")

    if limit > 0:
        # use awk to filter in order to avoid a pipeline broken error from head
        statement.append("awk 'NR > %i {exit(0)} {print}'" % (limit + 1))
        # ignore errors from cat or zcat due to broken pipe
        # NOTE(review): this local appears unused here; it is presumably
        # read from the caller's frame by run() — confirm.
        ignore_pipe_errors = True

    statement.append(
        build_load_statement(tablename, options=options, retry=retry))

    # the %(var)s placeholders are presumably substituted from the
    # caller's locals inside run() — confirm against run()'s contract.
    statement = " | ".join(statement) + " > %(outfile)s"

    # run the upload locally, not on the cluster
    to_cluster = False
    run(statement)
Exemplo n.º 7
0
 def creator():
     """Return a sqlite3 connection with the annotations database attached."""
     # `filename` is a free variable from the enclosing scope
     db = sqlite3.connect(filename)
     annotations_db = os.path.join(get_params()["annotations_dir"], "csvdb")
     db.execute("ATTACH DATABASE '{}' as annotations".format(annotations_db))
     return db
Exemplo n.º 8
0
def concatenate_and_load(infiles,
                         outfile,
                         regex_filename=None,
                         header=None,
                         cat="track",
                         has_titles=True,
                         missing_value="na",
                         retry=True,
                         tablename=None,
                         options="",
                         job_memory=None):
    """concatenate multiple tab-separated files and upload into database.

    The table name is given by outfile without the
    ".load" suffix.

    A typical concatenate and load task in ruffus would look like this::

        @merge("*.tsv.gz", ".load")
        def loadData(infile, outfile):
            P.concatenateAndLoad(infiles, outfile)

    Upload is performed via the :doc:`csv2db` script.

    Arguments
    ---------
    infiles : list
        Filenames of the input data
    outfile : string
        Output filename. This will contain the logging information. The
        table name is derived from `outfile`.
    regex_filename : string
        If given, *regex_filename* is applied to the filename to extract
        the track name. If the pattern contains multiple groups, they are
        added as additional columns. For example, if `cat` is set to
        ``track,method`` and `regex_filename` is ``(.*)_(.*).tsv.gz``
        it will add the columns ``track`` and method to the table.
    header : string
        Comma-separated list of values for header.
    cat : string
        Column title for column containing the track name. The track name
        is derived from the filename, see `regex_filename`.
    has_titles : bool
        If True, files are expected to have column titles in their first row.
    missing_value : string
        String to use for missing values.
    retry : bool
        If True, multiple attempts will be made if the data can
        not be loaded at the first try, for example if a table is locked.
    tablename: string
        Name to use for table. If unset derive from outfile.
    options : string
        Command line options for the `csv2db.py` script.
    job_memory : string
        Amount of memory to allocate for job. If unset, uses the global
        default.

    """
    # NOTE(review): job_memory is never referenced again in this body;
    # presumably run() inspects the caller's locals for it — confirm.
    if job_memory is None:
        job_memory = get_params()["cluster_memory_default"]

    if tablename is None:
        tablename = to_table(outfile)

    # the file list is passed to the shell as a space-separated string
    infiles = " ".join(infiles)

    passed_options = options
    # options for the loader stage and the concatenation stage
    load_options, cat_options = ["--add-index=track"], []

    if regex_filename:
        cat_options.append("--regex-filename='%s'" % regex_filename)

    if header:
        load_options.append("--header-names=%s" % header)

    if not has_titles:
        cat_options.append("--no-titles")

    cat_options = " ".join(cat_options)
    load_options = " ".join(load_options) + " " + passed_options

    load_statement = build_load_statement(tablename,
                                          options=load_options,
                                          retry=retry)

    # the %(var)s placeholders are presumably substituted from the
    # caller's locals inside run() — confirm against run()'s contract.
    statement = '''python -m CGATCore.Tables
    --cat=%(cat)s
    --missing-value=%(missing_value)s
    %(cat_options)s
    %(infiles)s
    | %(load_statement)s
    > %(outfile)s'''

    # run the upload locally, not on the cluster
    to_cluster = False
    run(statement)