Exemplo n.º 1
0
def index_db(index_filename, filenames=None, format=None,
        key_function=None, **kwargs):
    """Indexes several search output files into an SQLite database.

    Arguments:
    index_filename -- The SQLite filename.
    filenames -- List of strings specifying file(s) to be indexed, or when
                 indexing a single file this can be given as a string.
                 (optional if reloading an existing index, but must match)
    format -- Lower case string denoting one of the supported formats.
              (optional if reloading an existing index, but must match)
    key_function -- Optional callback function which when given a
                    QueryResult identifier string should return a unique
                    key for the dictionary.
    kwargs -- Format-specific keyword arguments.

    The `index_db` function is similar to `index` in that it indexes the start
    position of all queries from search output files. The main difference is
    instead of storing these indices in-memory, they are written to disk as an
    SQLite database file. This allows the indices to persist between Python
    sessions. This enables access to any queries in the file without any
    indexing overhead, provided it has been indexed at least once.

    >>> from Bio import SearchIO
    >>> db_idx = SearchIO.index_db(':memory:', 'Blast/mirna.xml', 'blast-xml')
    >>> sorted(db_idx.keys())
    ['33211', '33212', '33213']
    >>> db_idx['33212']
    QueryResult(id='33212', 44 hits)

    `index_db` can also index multiple files and store them in the same
    database, making it easier to group multiple search files and access them
    from a single interface.

    >>> from Bio import SearchIO
    >>> files = ['Blast/mirna.xml', 'Blast/wnts.xml']
    >>> db_idx = SearchIO.index_db(':memory:', files, 'blast-xml')
    >>> sorted(db_idx.keys())
    ['33211', '33212', '33213', 'gi|156630997:105-1160', ..., 'gi|53729353:216-1313']
    >>> db_idx['33212']
    QueryResult(id='33212', 44 hits)

    One common example where this is helpful is if you had a large set of
    query sequences (say ten thousand) which you split into ten query files
    of one thousand sequences each in order to run as ten separate BLAST jobs
    on a cluster. You could use `index_db` to index the ten BLAST output
    files together for seamless access to all the results as one dictionary.

    Note that ':memory:' rather than an index filename tells SQLite to hold
    the index database in memory. This is useful for quick tests, but using
    the Bio.SearchIO.index(...) function instead would use less memory.

    BGZF compressed files are supported, and detected automatically. Ordinary
    GZIP compressed files are not supported.
    """
    # Accept a single filename given as a plain string and normalise it to
    # a one-element list so downstream code always sees a list.
    # (Fixed: `basestring` is Python 2 only and raises NameError on
    # Python 3; `str` works and matches the rest of the codebase.)
    if isinstance(filenames, str):
        filenames = [filenames]

    from Bio.File import _SQLiteManySeqFilesDict

    # Human-readable representation stored alongside the index; used by the
    # returned dict-like object's __repr__. Renamed from `repr` so the
    # builtin is not shadowed.
    index_repr = "SearchIO.index_db(%r, filenames=%r, format=%r, key_function=%r, ...)" \
               % (index_filename, filenames, format, key_function)

    def proxy_factory(format, filename=None):
        """Given a filename returns proxy object, else boolean if format OK."""
        if filename:
            # Instantiate the format-specific indexer for this file,
            # forwarding any format-specific keyword arguments.
            return get_processor(format, _INDEXER_MAP)(filename, **kwargs)
        else:
            # No filename: just report whether the format is indexable.
            return format in _INDEXER_MAP

    return _SQLiteManySeqFilesDict(index_filename, filenames,
                                   proxy_factory, format,
                                   key_function, index_repr)
Exemplo n.º 2
0
def index_db(index_filename, filenames=None, format=None, alphabet=None,
             key_function=None):
    """Index several sequence files and return a dictionary like object.

    The index is stored in an SQLite database rather than in memory (as in the
    Bio.SeqIO.index(...) function).

        - index_filename - Where to store the SQLite index
        - filenames - list of strings specifying file(s) to be indexed, or when
          indexing a single file this can be given as a string.
          (optional if reloading an existing index, but must match)
        - format   - lower case string describing the file format
          (optional if reloading an existing index, but must match)
        - alphabet - optional Alphabet object, useful when the sequence type
          cannot be automatically inferred from the file itself
          (e.g. format="fasta" or "tab")
        - key_function - Optional callback function which when given a
          SeqRecord identifier string should return a unique
          key for the dictionary.

    This indexing function will return a dictionary like object, giving the
    SeqRecord objects as values:

    >>> from Bio.Alphabet import generic_protein
    >>> from Bio import SeqIO
    >>> files = ["GenBank/NC_000932.faa", "GenBank/NC_005816.faa"]
    >>> def get_gi(name):
    ...     parts = name.split("|")
    ...     i = parts.index("gi")
    ...     assert i != -1
    ...     return parts[i+1]
    >>> idx_name = ":memory:" #use an in memory SQLite DB for this test
    >>> records = SeqIO.index_db(idx_name, files, "fasta", generic_protein, get_gi)
    >>> len(records)
    95
    >>> records["7525076"].description
    'gi|7525076|ref|NP_051101.1| Ycf2 [Arabidopsis thaliana]'
    >>> records["45478717"].description
    'gi|45478717|ref|NP_995572.1| pesticin [Yersinia pestis biovar Microtus str. 91001]'
    >>> records.close()

    In this example the two files contain 85 and 10 records respectively.

    BGZF compressed files are supported, and detected automatically. Ordinary
    GZIP compressed files are not supported.

    See also: Bio.SeqIO.index() and Bio.SeqIO.to_dict(), and the Python module
    glob which is useful for building lists of files.
    """
    # Try and give helpful error messages:
    # (Fixed: `basestring` is Python 2 only and raises NameError on
    # Python 3; `str` is the correct check on Python 3.)
    if not isinstance(index_filename, str):
        raise TypeError("Need a string for the index filename")
    if isinstance(filenames, str):
        # Make the API a little more friendly, and more similar
        # to Bio.SeqIO.index(...) for indexing just one file.
        filenames = [filenames]
    if filenames is not None and not isinstance(filenames, list):
        raise TypeError(
            "Need a list of filenames (as strings), or one filename")
    if format is not None and not isinstance(format, str):
        raise TypeError("Need a string for the file format (lower case)")
    if format and format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)
    if alphabet is not None and not (isinstance(alphabet, Alphabet) or
                                     isinstance(alphabet, AlphabetEncoder)):
        raise ValueError("Invalid alphabet, %r" % alphabet)

    # Map the file format to a sequence iterator:
    from ._index import _FormatToRandomAccess  # Lazy import
    from Bio.File import _SQLiteManySeqFilesDict
    # Human-readable representation stored alongside the index; used by the
    # returned dict-like object's __repr__. Renamed from `repr` so the
    # builtin is not shadowed.
    index_repr = "SeqIO.index_db(%r, filenames=%r, format=%r, alphabet=%r, key_function=%r)" \
               % (index_filename, filenames, format, alphabet, key_function)

    def proxy_factory(format, filename=None):
        """Given a filename returns proxy object, else boolean if format OK."""
        if filename:
            return _FormatToRandomAccess[format](filename, format, alphabet)
        else:
            return format in _FormatToRandomAccess

    return _SQLiteManySeqFilesDict(index_filename, filenames,
                                   proxy_factory, format,
                                   key_function, index_repr)
Exemplo n.º 3
0
def index_db(index_filename, filenames=None, format=None, alphabet=None,
             key_function=None):
    """Index several sequence files and return a dictionary like object.

    The index is stored in an SQLite database rather than in memory (as in the
    Bio.SeqIO.index(...) function).

        - index_filename - Where to store the SQLite index
        - filenames - list of strings specifying file(s) to be indexed, or when
          indexing a single file this can be given as a string.
          (optional if reloading an existing index, but must match)
        - format   - lower case string describing the file format
          (optional if reloading an existing index, but must match)
        - alphabet - optional Alphabet object, useful when the sequence type
          cannot be automatically inferred from the file itself
          (e.g. format="fasta" or "tab")
        - key_function - Optional callback function which when given a
          SeqRecord identifier string should return a unique
          key for the dictionary.

    This indexing function will return a dictionary like object, giving the
    SeqRecord objects as values:

    >>> from Bio.Alphabet import generic_protein
    >>> from Bio import SeqIO
    >>> files = ["GenBank/NC_000932.faa", "GenBank/NC_005816.faa"]
    >>> def get_gi(name):
    ...     parts = name.split("|")
    ...     i = parts.index("gi")
    ...     assert i != -1
    ...     return parts[i+1]
    >>> idx_name = ":memory:" #use an in memory SQLite DB for this test
    >>> records = SeqIO.index_db(idx_name, files, "fasta", generic_protein, get_gi)
    >>> len(records)
    95
    >>> records["7525076"].description
    'gi|7525076|ref|NP_051101.1| Ycf2 [Arabidopsis thaliana]'
    >>> records["45478717"].description
    'gi|45478717|ref|NP_995572.1| pesticin [Yersinia pestis biovar Microtus str. 91001]'
    >>> records.close()

    In this example the two files contain 85 and 10 records respectively.

    BGZF compressed files are supported, and detected automatically. Ordinary
    GZIP compressed files are not supported.

    See Also: Bio.SeqIO.index() and Bio.SeqIO.to_dict(), and the Python module
    glob which is useful for building lists of files.

    """
    # Try and give helpful error messages:
    # (Fixed: `basestring` is Python 2 only and raises NameError on
    # Python 3; `str` is the correct check on Python 3.)
    if not isinstance(index_filename, str):
        raise TypeError("Need a string for the index filename")
    if isinstance(filenames, str):
        # Make the API a little more friendly, and more similar
        # to Bio.SeqIO.index(...) for indexing just one file.
        filenames = [filenames]
    if filenames is not None and not isinstance(filenames, list):
        raise TypeError(
            "Need a list of filenames (as strings), or one filename")
    if format is not None and not isinstance(format, str):
        raise TypeError("Need a string for the file format (lower case)")
    if format and format != format.lower():
        raise ValueError("Format string '%s' should be lower case" % format)
    if alphabet is not None and not (isinstance(alphabet, Alphabet) or
                                     isinstance(alphabet, AlphabetEncoder)):
        raise ValueError("Invalid alphabet, %r" % alphabet)

    # Map the file format to a sequence iterator:
    from ._index import _FormatToRandomAccess  # Lazy import
    from Bio.File import _SQLiteManySeqFilesDict
    # Human-readable representation stored alongside the index; used by the
    # returned dict-like object's __repr__. Renamed from `repr` so the
    # builtin is not shadowed.
    index_repr = "SeqIO.index_db(%r, filenames=%r, format=%r, alphabet=%r, key_function=%r)" \
               % (index_filename, filenames, format, alphabet, key_function)

    def proxy_factory(format, filename=None):
        """Given a filename returns proxy object, else boolean if format OK."""
        if filename:
            return _FormatToRandomAccess[format](filename, format, alphabet)
        else:
            return format in _FormatToRandomAccess

    return _SQLiteManySeqFilesDict(index_filename, filenames,
                                   proxy_factory, format,
                                   key_function, index_repr)
Exemplo n.º 4
0
def index_db(index_filename,
             filenames=None,
             format=None,
             key_function=None,
             **kwargs):
    """Indexes several search output files into an SQLite database.

     - index_filename - The SQLite filename.
     - filenames    - List of strings specifying file(s) to be indexed, or when
                      indexing a single file this can be given as a string.
                      (optional if reloading an existing index, but must match)
     - format       - Lower case string denoting one of the supported formats.
                      (optional if reloading an existing index, but must match)
     - key_function - Optional callback function which when given a
                      QueryResult identifier string should return a unique
                      key for the dictionary.
     - kwargs       - Format-specific keyword arguments.

    Like ``index``, ``index_db`` records the start position of every query in
    the given search output files. Unlike ``index``, the offsets go into an
    SQLite database file on disk rather than an in-memory dictionary, so the
    index survives between Python sessions and re-opening it later has no
    re-indexing cost.

    >>> from Bio import SearchIO
    >>> idx_filename = ":memory:" # Use a real filename, this is in RAM only!
    >>> db_idx = SearchIO.index_db(idx_filename, 'Blast/mirna.xml', 'blast-xml')
    >>> sorted(db_idx)
    ['33211', '33212', '33213']
    >>> db_idx['33212']
    QueryResult(id='33212', 44 hits)
    >>> db_idx.close()

    Several files may share one database, giving a single interface over a
    whole group of search outputs:

    >>> from Bio import SearchIO
    >>> idx_filename = ":memory:" # Use a real filename, this is in RAM only!
    >>> files = ['Blast/mirna.xml', 'Blast/wnts.xml']
    >>> db_idx = SearchIO.index_db(idx_filename, files, 'blast-xml')
    >>> sorted(db_idx)
    ['33211', '33212', '33213', 'gi|156630997:105-1160', ..., 'gi|53729353:216-1313']
    >>> db_idx['33212']
    QueryResult(id='33212', 44 hits)
    >>> db_idx.close()

    A typical use case: ten thousand query sequences split into ten files run
    as ten separate BLAST jobs on a cluster can be indexed together here and
    accessed as one dictionary.

    Passing ':memory:' as the index filename asks SQLite to keep the database
    in RAM -- handy for quick tests, although Bio.SearchIO.index(...) would
    then use less memory.

    BGZF compressed files are supported, and detected automatically. Ordinary
    GZIP compressed files are not supported.

    See also Bio.SearchIO.index(), Bio.SearchIO.to_dict(), and the Python
    module glob which is useful for building lists of files.
    """
    # A single filename may arrive as a plain string; wrap it so the rest
    # of the code can always assume a list.
    if isinstance(filenames, str):
        filenames = [filenames]

    from Bio.File import _SQLiteManySeqFilesDict

    # Description recorded with the index, shown by the returned object's
    # __repr__ (named so as not to shadow the builtin `repr`).
    index_repr = (
        "SearchIO.index_db(%r, filenames=%r, format=%r, key_function=%r, ...)"
        % (index_filename, filenames, format, key_function)
    )

    def proxy_factory(format, filename=None):
        """Given a filename returns proxy object, else boolean if format OK."""
        if filename:
            # Build the format-specific indexer, forwarding any extra
            # format-specific keyword arguments.
            return get_processor(format, _INDEXER_MAP)(filename, **kwargs)
        # No filename given: just report whether this format is indexable.
        return format in _INDEXER_MAP

    return _SQLiteManySeqFilesDict(
        index_filename, filenames, proxy_factory, format, key_function, index_repr
    )