Example 1
class BPF(Text):
    """Munich BPF annotation format
    https://www.phonetik.uni-muenchen.de/Bas/BasFormatseng.html#Partitur

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname('1_1119_2_22_001.par')
    >>> BPF().sniff(fname)
    True
    >>> fname = get_test_fname('1_1119_2_22_001-1.par')
    >>> BPF().sniff(fname)
    True
    >>> fname = get_test_fname('drugbank_drugs.cml')
    >>> BPF().sniff(fname)
    False

    """

    file_ext = "par"

    MetadataElement(name="annotations",
                    default=[],
                    desc="Annotation types",
                    param=ListParameter,
                    readonly=True,
                    visible=True,
                    optional=True,
                    no_value=[])
    mandatory_headers = [
        'LHD', 'REP', 'SNB', 'SAM', 'SBF', 'SSB', 'NCH', 'SPN', 'LBD'
    ]
    optional_headers = [
        'FIL', 'TYP', 'DBN', 'VOL', 'DIR', 'SRC', 'BEG', 'END', 'RED', 'RET',
        'RCC', 'CMT', 'SPI', 'PCF', 'PCN', 'EXP', 'SYS', 'DAT', 'SPA', 'MAO',
        'GPO', 'SAO'
    ]

    def set_meta(self, dataset, overwrite=True, **kwd):
        """Set the metadata for this dataset from the file contents"""
        types = set()
        with open(dataset.dataset.file_name) as fd:
            for line in fd:
                # Split the line on a colon rather than regexing it
                parts = line.split(':')

                # And if the first part is a 3 character string, then it's
                # interesting.
                if len(parts) and len(parts[0]) == 3:
                    types.add(parts[0])
                else:
                    return False

        dataset.metadata.annotations = list(types)

    def sniff(self, filename):
        # We read up to 40 header lines, as there are 9 mandatory headers (the
        # last should be `LBD:`) and 22 optional headers that can be
        # interspersed.
        seen_headers = [
            line[0] for line in get_headers(filename, sep=':', count=40)
        ]

        # We cut everything after LBD, where the headers end and contents
        # start. We choose not to validate contents.
        if 'LBD' in seen_headers:
            seen_headers = seen_headers[0:seen_headers.index('LBD') + 1]

        # Check that every mandatory header is present in the seen headers
        for header in self.mandatory_headers:
            if header not in seen_headers:
                return False

        # Check that every seen header is either in mandatory or optional
        for header in seen_headers:
            if not (header in self.mandatory_headers
                    or header in self.optional_headers):
                return False

        return True
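
A minimal usage sketch (not from the example above; the header values are placeholders): a .par file carrying the nine mandatory headers, ending with LBD:, is enough for the sniffer to return True in a Galaxy environment where the BPF class is importable.

import os
import tempfile

# Hypothetical minimal BPF header block: all nine mandatory headers, LBD: last.
par_text = """LHD: Partitur 1.3
REP: Munich
SNB: 2
SAM: 16000
SBF: 01
SSB: 16
NCH: 1
SPN: demo
LBD:
"""
fname = os.path.join(tempfile.mkdtemp(), 'demo.par')
with open(fname, 'w') as fh:
    fh.write(par_text)
# BPF().sniff(fname) should now return True; dropping a mandatory header or
# adding an unknown three-letter key before LBD: should make it return False.
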
Example 2
class Sequence( data.Text ):
    """Class describing a sequence"""

    """Add metadata elements"""
    MetadataElement( name="sequences", default=0, desc="Number of sequences", readonly=True, visible=False, optional=True, no_value=0 )

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of sequences and the number of data lines in dataset.
        """
        data_lines = 0
        sequences = 0
        for line in file( dataset.file_name ):
            line = line.strip()
            if line and line.startswith( '#' ):
                # We don't count comment lines for sequence data types
                continue
            if line and line.startswith( '>' ):
                sequences += 1
                data_lines += 1
            else:
                data_lines += 1
        dataset.metadata.data_lines = data_lines
        dataset.metadata.sequences = sequences
    def set_peek( self, dataset, is_multi_byte=False ):
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            if dataset.metadata.sequences:
                dataset.blurb = "%s sequences" % util.commaify( str( dataset.metadata.sequences ) )
            else:
                dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def get_sequences_per_file(total_sequences, split_params):
        if split_params['split_mode'] == 'number_of_parts':
            # legacy basic mode - split into a specified number of parts
            parts = int(split_params['split_size'])
            sequences_per_file = [total_sequences/parts for i in range(parts)]
            for i in range(total_sequences % parts):
                sequences_per_file[i] += 1
        elif split_params['split_mode'] == 'to_size':
            # loop through the sections and calculate the number of sequences
            chunk_size = long(split_params['split_size'])

            chunks = total_sequences / chunk_size
            rem = total_sequences % chunk_size
            sequences_per_file = [chunk_size for i in range(total_sequences / chunk_size)]
            # TODO: Should we invest the time in a better way to handle small remainders?
            if rem > 0:
                sequences_per_file.append(rem)
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])
        return sequences_per_file
    get_sequences_per_file = staticmethod(get_sequences_per_file)

    def do_slow_split( cls, input_datasets, subdir_generator_function, split_params):
        # count the sequences so we can split
        # TODO: if metadata is present, take the number of lines / 4
        if input_datasets[0].metadata is not None and input_datasets[0].metadata.sequences is not None:
            total_sequences = input_datasets[0].metadata.sequences
        else:
            input_file = input_datasets[0].file_name
            compress = is_gzip(input_file)
            if compress:
                # gzip is really slow before python 2.7!
                in_file = gzip.GzipFile(input_file, 'r')
            else:
                # TODO
                # if a file is not compressed, seek locations can be calculated and stored
                # ideally, this would be done in metadata
                # TODO
                # Add BufferedReader if python 2.7?
                in_file = open(input_file, 'rt')
            total_sequences = long(0)
            for i, line in enumerate(in_file):
                total_sequences += 1
            in_file.close()
            total_sequences /= 4

        sequences_per_file = cls.get_sequences_per_file(total_sequences, split_params)
        return cls.write_split_files(input_datasets, None, subdir_generator_function, sequences_per_file)
    do_slow_split = classmethod(do_slow_split)

    def do_fast_split( cls, input_datasets, toc_file_datasets, subdir_generator_function, split_params):
        data = simplejson.load(open(toc_file_datasets[0].file_name))
        sections = data['sections']
        total_sequences = long(0)
        for section in sections:
            total_sequences += long(section['sequences'])
        sequences_per_file = cls.get_sequences_per_file(total_sequences, split_params)
        return cls.write_split_files(input_datasets, toc_file_datasets, subdir_generator_function, sequences_per_file)
    do_fast_split = classmethod(do_fast_split)

    def write_split_files(cls, input_datasets, toc_file_datasets, subdir_generator_function, sequences_per_file):
        directories = []
        def get_subdir(idx):
            if idx < len(directories):
                return directories[idx]
            dir = subdir_generator_function()
            directories.append(dir)
            return dir

        # we know how many splits and how many sequences in each. What remains is to write out instructions for the 
        # splitting of all the input files. To decouple the format of those instructions from this code, the exact format of
        # those instructions is delegated to scripts
        start_sequence=0
        for part_no in range(len(sequences_per_file)):
            dir = get_subdir(part_no)
            for ds_no in range(len(input_datasets)):
                ds = input_datasets[ds_no]
                base_name = os.path.basename(ds.file_name)
                part_path = os.path.join(dir, base_name)
                split_data = dict(class_name='%s.%s' % (cls.__module__, cls.__name__),
                                  output_name=part_path,
                                  input_name=ds.file_name,
                                  args=dict(start_sequence=start_sequence, num_sequences=sequences_per_file[part_no]))
                if toc_file_datasets is not None:
                    toc = toc_file_datasets[ds_no]
                    split_data['args']['toc_file'] = toc.file_name
                f = open(os.path.join(dir, 'split_info_%s.json' % base_name), 'w')
                simplejson.dump(split_data, f)
                f.close()
            start_sequence += sequences_per_file[part_no]
        return directories
    write_split_files = classmethod(write_split_files)
    
    def split( cls, input_datasets, subdir_generator_function, split_params):
        """Split a generic sequence file (not sensible or possible, see subclasses)."""
        if split_params is None:
            return None
        raise NotImplementedError("Can't split generic sequence files")
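
The splitting arithmetic above is easiest to see with concrete numbers. Below is a standalone Python 3 sketch that mirrors get_sequences_per_file (integer // stands in for the legacy Python 2 division on ints); the dict keys follow split_params as used above.

def sequences_per_file(total_sequences, split_params):
    """Standalone mirror of Sequence.get_sequences_per_file (sketch only)."""
    if split_params['split_mode'] == 'number_of_parts':
        # split into a fixed number of parts, spreading the remainder
        parts = int(split_params['split_size'])
        counts = [total_sequences // parts] * parts
        for i in range(total_sequences % parts):
            counts[i] += 1
        return counts
    if split_params['split_mode'] == 'to_size':
        # fixed chunk size, plus one smaller trailing chunk for any remainder
        chunk_size = int(split_params['split_size'])
        counts = [chunk_size] * (total_sequences // chunk_size)
        if total_sequences % chunk_size:
            counts.append(total_sequences % chunk_size)
        return counts
    raise ValueError('Unsupported split mode %s' % split_params['split_mode'])

print(sequences_per_file(10, {'split_mode': 'number_of_parts', 'split_size': 3}))  # [4, 3, 3]
print(sequences_per_file(10, {'split_mode': 'to_size', 'split_size': 4}))          # [4, 4, 2]
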
Example 3
class Sequence(data.Text):
    """Class describing a sequence"""
    edam_data = "data_2044"
    """Add metadata elements"""
    MetadataElement(name="sequences",
                    default=0,
                    desc="Number of sequences",
                    readonly=True,
                    visible=False,
                    optional=True,
                    no_value=0)

    def set_meta(self, dataset, **kwd):
        """
        Set the number of sequences and the number of data lines in dataset.
        """
        data_lines = 0
        sequences = 0
        for line in open(dataset.file_name):
            line = line.strip()
            if line and line.startswith('#'):
                # We don't count comment lines for sequence data types
                continue
            if line and line.startswith('>'):
                sequences += 1
                data_lines += 1
            else:
                data_lines += 1
        dataset.metadata.data_lines = data_lines
        dataset.metadata.sequences = sequences

    def set_peek(self, dataset, is_multi_byte=False):
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name,
                                              is_multi_byte=is_multi_byte)
            if dataset.metadata.sequences:
                dataset.blurb = "%s sequences" % util.commaify(
                    str(dataset.metadata.sequences))
            else:
                dataset.blurb = nice_size(dataset.get_size())
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def get_sequences_per_file(total_sequences, split_params):
        if split_params['split_mode'] == 'number_of_parts':
            # legacy basic mode - split into a specified number of parts
            parts = int(split_params['split_size'])
            sequences_per_file = [
                total_sequences / parts for i in range(parts)
            ]
            for i in range(total_sequences % parts):
                sequences_per_file[i] += 1
        elif split_params['split_mode'] == 'to_size':
            # loop through the sections and calculate the number of sequences
            chunk_size = long(split_params['split_size'])
            rem = total_sequences % chunk_size
            sequences_per_file = [
                chunk_size for i in range(total_sequences / chunk_size)
            ]
            # TODO: Should we invest the time in a better way to handle small remainders?
            if rem > 0:
                sequences_per_file.append(rem)
        else:
            raise Exception('Unsupported split mode %s' %
                            split_params['split_mode'])
        return sequences_per_file

    get_sequences_per_file = staticmethod(get_sequences_per_file)

    def do_slow_split(cls, input_datasets, subdir_generator_function,
                      split_params):
        # count the sequences so we can split
        # TODO: if metadata is present, take the number of lines / 4
        if input_datasets[0].metadata is not None and input_datasets[
                0].metadata.sequences is not None:
            total_sequences = input_datasets[0].metadata.sequences
        else:
            input_file = input_datasets[0].file_name
            compress = is_gzip(input_file)
            if compress:
                # gzip is really slow before python 2.7!
                in_file = gzip.GzipFile(input_file, 'r')
            else:
                # TODO
                # if a file is not compressed, seek locations can be calculated and stored
                # ideally, this would be done in metadata
                # TODO
                # Add BufferedReader if python 2.7?
                in_file = open(input_file, 'rt')
            total_sequences = long(0)
            for i, line in enumerate(in_file):
                total_sequences += 1
            in_file.close()
            total_sequences /= 4

        sequences_per_file = cls.get_sequences_per_file(
            total_sequences, split_params)
        return cls.write_split_files(input_datasets, None,
                                     subdir_generator_function,
                                     sequences_per_file)

    do_slow_split = classmethod(do_slow_split)

    def do_fast_split(cls, input_datasets, toc_file_datasets,
                      subdir_generator_function, split_params):
        data = json.load(open(toc_file_datasets[0].file_name))
        sections = data['sections']
        total_sequences = long(0)
        for section in sections:
            total_sequences += long(section['sequences'])
        sequences_per_file = cls.get_sequences_per_file(
            total_sequences, split_params)
        return cls.write_split_files(input_datasets, toc_file_datasets,
                                     subdir_generator_function,
                                     sequences_per_file)

    do_fast_split = classmethod(do_fast_split)

    def write_split_files(cls, input_datasets, toc_file_datasets,
                          subdir_generator_function, sequences_per_file):
        directories = []

        def get_subdir(idx):
            if idx < len(directories):
                return directories[idx]
            dir = subdir_generator_function()
            directories.append(dir)
            return dir

        # we know how many splits and how many sequences in each. What remains is to write out instructions for the
        # splitting of all the input files. To decouple the format of those instructions from this code, the exact format of
        # those instructions is delegated to scripts
        start_sequence = 0
        for part_no in range(len(sequences_per_file)):
            dir = get_subdir(part_no)
            for ds_no in range(len(input_datasets)):
                ds = input_datasets[ds_no]
                base_name = os.path.basename(ds.file_name)
                part_path = os.path.join(dir, base_name)
                split_data = dict(
                    class_name='%s.%s' % (cls.__module__, cls.__name__),
                    output_name=part_path,
                    input_name=ds.file_name,
                    args=dict(start_sequence=start_sequence,
                              num_sequences=sequences_per_file[part_no]))
                if toc_file_datasets is not None:
                    toc = toc_file_datasets[ds_no]
                    split_data['args']['toc_file'] = toc.file_name
                f = open(os.path.join(dir, 'split_info_%s.json' % base_name),
                         'w')
                json.dump(split_data, f)
                f.close()
            start_sequence += sequences_per_file[part_no]
        return directories

    write_split_files = classmethod(write_split_files)

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """Split a generic sequence file (not sensible or possible, see subclasses)."""
        if split_params is None:
            return None
        raise NotImplementedError("Can't split generic sequence files")

    def get_split_commands_with_toc(input_name, output_name, toc_file,
                                    start_sequence, sequence_count):
        """
        Uses a Table of Contents dict, parsed from an FQTOC file, to come up with a set of
        shell commands that will extract the parts necessary
        >>> three_sections=[dict(start=0, end=74, sequences=10), dict(start=74, end=148, sequences=10), dict(start=148, end=148+76, sequences=10)]
        >>> Sequence.get_split_commands_with_toc('./input.gz', './output.gz', dict(sections=three_sections), start_sequence=0, sequence_count=10)
        ['dd bs=1 skip=0 count=74 if=./input.gz 2> /dev/null >> ./output.gz']
        >>> Sequence.get_split_commands_with_toc('./input.gz', './output.gz', dict(sections=three_sections), start_sequence=1, sequence_count=5)
        ['(dd bs=1 skip=0 count=74 if=./input.gz 2> /dev/null )| zcat | ( tail -n +5 2> /dev/null) | head -20 | gzip -c >> ./output.gz']
        >>> Sequence.get_split_commands_with_toc('./input.gz', './output.gz', dict(sections=three_sections), start_sequence=0, sequence_count=20)
        ['dd bs=1 skip=0 count=148 if=./input.gz 2> /dev/null >> ./output.gz']
        >>> Sequence.get_split_commands_with_toc('./input.gz', './output.gz', dict(sections=three_sections), start_sequence=5, sequence_count=10)
        ['(dd bs=1 skip=0 count=74 if=./input.gz 2> /dev/null )| zcat | ( tail -n +21 2> /dev/null) | head -20 | gzip -c >> ./output.gz', '(dd bs=1 skip=74 count=74 if=./input.gz 2> /dev/null )| zcat | ( tail -n +1 2> /dev/null) | head -20 | gzip -c >> ./output.gz']
        >>> Sequence.get_split_commands_with_toc('./input.gz', './output.gz', dict(sections=three_sections), start_sequence=10, sequence_count=10)
        ['dd bs=1 skip=74 count=74 if=./input.gz 2> /dev/null >> ./output.gz']
        >>> Sequence.get_split_commands_with_toc('./input.gz', './output.gz', dict(sections=three_sections), start_sequence=5, sequence_count=20)
        ['(dd bs=1 skip=0 count=74 if=./input.gz 2> /dev/null )| zcat | ( tail -n +21 2> /dev/null) | head -20 | gzip -c >> ./output.gz', 'dd bs=1 skip=74 count=74 if=./input.gz 2> /dev/null >> ./output.gz', '(dd bs=1 skip=148 count=76 if=./input.gz 2> /dev/null )| zcat | ( tail -n +1 2> /dev/null) | head -20 | gzip -c >> ./output.gz']
        """
        sections = toc_file['sections']
        result = []

        current_sequence = long(0)
        i = 0
        # skip to the section that contains my starting sequence
        while i < len(sections) and start_sequence >= current_sequence + long(
                sections[i]['sequences']):
            current_sequence += long(sections[i]['sequences'])
            i += 1
        if i == len(sections):  # bad input data!
            raise Exception('No FQTOC section contains starting sequence %s' %
                            start_sequence)

        # These two variables act as an accumulator for consecutive entire blocks that
        # can be copied verbatim (without decompressing)
        start_chunk = long(-1)
        end_chunk = long(-1)
        copy_chunk_cmd = 'dd bs=1 skip=%s count=%s if=%s 2> /dev/null >> %s'

        while sequence_count > 0 and i < len(sections):
            # we need to extract partial data. So, find the byte offsets of the chunks that contain the data we need
            # use a combination of dd (to pull just the right sections out) tail (to skip lines) and head (to get the
            # right number of lines)
            sequences = long(sections[i]['sequences'])
            skip_sequences = start_sequence - current_sequence
            sequences_to_extract = min(sequence_count,
                                       sequences - skip_sequences)
            start_copy = long(sections[i]['start'])
            end_copy = long(sections[i]['end'])
            if sequences_to_extract < sequences:
                if start_chunk > -1:
                    result.append(copy_chunk_cmd %
                                  (start_chunk, end_chunk - start_chunk,
                                   input_name, output_name))
                    start_chunk = -1
                # extract, unzip, trim, recompress
                result.append(
                    '(dd bs=1 skip=%s count=%s if=%s 2> /dev/null )| zcat | ( tail -n +%s 2> /dev/null) | head -%s | gzip -c >> %s'
                    % (start_copy, end_copy - start_copy, input_name,
                       skip_sequences * 4 + 1, sequences_to_extract * 4,
                       output_name))
            else:  # whole section - add it to the start_chunk/end_chunk accumulator
                if start_chunk == -1:
                    start_chunk = start_copy
                end_chunk = end_copy
            sequence_count -= sequences_to_extract
            start_sequence += sequences_to_extract
            current_sequence += sequences
            i += 1
        if start_chunk > -1:
            result.append(copy_chunk_cmd %
                          (start_chunk, end_chunk - start_chunk, input_name,
                           output_name))

        if sequence_count > 0:
            raise Exception('%s sequences not found in file' % sequence_count)

        return result

    get_split_commands_with_toc = staticmethod(get_split_commands_with_toc)

    def get_split_commands_sequential(is_compressed, input_name, output_name,
                                      start_sequence, sequence_count):
        """
        Does a brain-dead sequential scan & extract of certain sequences
        >>> Sequence.get_split_commands_sequential(True, './input.gz', './output.gz', start_sequence=0, sequence_count=10)
        ['zcat "./input.gz" | ( tail -n +1 2> /dev/null) | head -40 | gzip -c > "./output.gz"']
        >>> Sequence.get_split_commands_sequential(False, './input.fastq', './output.fastq', start_sequence=10, sequence_count=10)
        ['tail -n +41 "./input.fastq" 2> /dev/null | head -40 > "./output.fastq"']
        """
        start_line = start_sequence * 4
        line_count = sequence_count * 4
        # TODO: verify that tail can handle 64-bit numbers
        if is_compressed:
            cmd = 'zcat "%s" | ( tail -n +%s 2> /dev/null) | head -%s | gzip -c' % (
                input_name, start_line + 1, line_count)
        else:
            cmd = 'tail -n +%s "%s" 2> /dev/null | head -%s' % (
                start_line + 1, input_name, line_count)
        cmd += ' > "%s"' % output_name

        return [cmd]

    get_split_commands_sequential = staticmethod(get_split_commands_sequential)
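
For reference, a sketch of the table-of-contents structure these methods consume (byte offsets and counts below are hypothetical): do_fast_split reads it from an FQTOC JSON file, and get_split_commands_with_toc turns it into dd/zcat commands, as in the doctests above. Running the call requires the class as defined here (Python 2 era, because of long()).

toc = {
    'sections': [
        {'start': 0, 'end': 74, 'sequences': 10},    # byte range holding the first 10 records
        {'start': 74, 'end': 148, 'sequences': 10},
        {'start': 148, 'end': 224, 'sequences': 10},
    ]
}
# Extracting a whole section needs no recompression, just a dd byte copy:
print(Sequence.get_split_commands_with_toc('./input.gz', './output.gz', toc,
                                           start_sequence=10, sequence_count=10))
# ['dd bs=1 skip=74 count=74 if=./input.gz 2> /dev/null >> ./output.gz']
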
Example 4
class SnpEffDb(Text):
    """Class describing a SnpEff genome build"""
    file_ext = "snpeffdb"
    MetadataElement(name="genome_version",
                    default=None,
                    desc="Genome Version",
                    readonly=True,
                    visible=True,
                    no_value=None)
    MetadataElement(name="regulation",
                    default=[],
                    desc="Regulation Names",
                    readonly=True,
                    visible=True,
                    no_value=[],
                    optional=True)
    MetadataElement(name="annotation",
                    default=[],
                    desc="Annotation Names",
                    readonly=True,
                    visible=True,
                    no_value=[],
                    optional=True)

    def __init__(self, **kwd):
        Text.__init__(self, **kwd)

    def set_meta(self, dataset, **kwd):
        Text.set_meta(self, dataset, **kwd)
        data_dir = dataset.extra_files_path
        # search data_dir/genome_version for files
        regulation_pattern = 'regulation_(.+).bin'
        #  annotation files that are included in snpEff by a flag
        annotations_dict = {'nextProt.bin': '-nextprot', 'motif.bin': '-motif'}
        regulations = []
        annotations = []
        if data_dir and os.path.isdir(data_dir):
            for root, dirs, files in os.walk(data_dir):
                for fname in files:
                    if fname.startswith('snpEffectPredictor'):
                        # if snpEffectPredictor.bin download succeeded
                        genome_version = os.path.basename(root)
                        dataset.metadata.genome_version = genome_version
                    else:
                        m = re.match(regulation_pattern, fname)
                        if m:
                            name = m.groups()[0]
                            regulations.append(name)
                        elif fname in annotations_dict:
                            value = annotations_dict[fname]
                            name = value.lstrip('-')
                            annotations.append(name)
            dataset.metadata.regulation = regulations
            dataset.metadata.annotation = annotations
            try:
                fh = file(dataset.file_name, 'w')
                fh.write("%s\n" % genome_version)
                if annotations:
                    fh.write("annotations: %s\n" % ','.join(annotations))
                if regulations:
                    fh.write("regulations: %s\n" % ','.join(regulations))
                fh.close()
            except Exception:
                pass
Example 5
class Stockholm_1_0(Text):
    file_ext = "stockholm"

    MetadataElement(name="number_of_alignments",
                    default=0,
                    desc="Number of multiple alignments",
                    readonly=True,
                    visible=True,
                    optional=True,
                    no_value=0)

    def set_peek(self, dataset, is_multi_byte=False):
        if not dataset.dataset.purged:
            dataset.peek = get_file_peek(dataset.file_name,
                                         is_multi_byte=is_multi_byte)
            if dataset.metadata.number_of_alignments == 1:
                dataset.blurb = "1 alignment"
            else:
                dataset.blurb = "%s alignments" % dataset.metadata.number_of_alignments
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disc'

    def sniff(self, filename):
        if count_special_lines('^#[[:space:]+]STOCKHOLM[[:space:]+]1.0',
                               filename) > 0:
            return True
        else:
            return False

    def set_meta(self, dataset, **kwd):
        """

        Set the number of models in dataset.
        """
        dataset.metadata.number_of_models = count_special_lines(
            '^#[[:space:]+]STOCKHOLM[[:space:]+]1.0', dataset.file_name)

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """

        Split the input files by model records.
        """
        if split_params is None:
            return None

        if len(input_datasets) > 1:
            raise Exception(
                "STOCKHOLM-file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            raise Exception(
                'Split mode "%s" is currently not implemented for STOCKHOLM-files.'
                % split_params['split_mode'])
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' %
                            split_params['split_mode'])

        def _read_stockholm_records(filename):
            lines = []
            with open(filename) as handle:
                for line in handle:
                    lines.append(line)
                    if line.strip() == '//':
                        yield lines
                        lines = []

        def _write_part_stockholm_file(accumulated_lines):
            part_dir = subdir_generator_function()
            part_path = os.path.join(part_dir,
                                     os.path.basename(input_files[0]))
            part_file = open(part_path, 'w')
            part_file.writelines(accumulated_lines)
            part_file.close()

        try:

            stockholm_records = _read_stockholm_records(input_files[0])
            stockholm_lines_accumulated = []
            for counter, stockholm_record in enumerate(stockholm_records,
                                                       start=1):
                stockholm_lines_accumulated.extend(stockholm_record)
                if counter % chunk_size == 0:
                    _write_part_stockholm_file(stockholm_lines_accumulated)
                    stockholm_lines_accumulated = []
            if stockholm_lines_accumulated:
                _write_part_stockholm_file(stockholm_lines_accumulated)
        except Exception as e:
            log.error('Unable to split files: %s' % str(e))
            raise

    split = classmethod(split)
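
A standalone sketch (no Galaxy imports; the sample alignments are made up) of the record grouping that split relies on: Stockholm alignments end with a line containing only //, which is exactly what _read_stockholm_records keys on.

stockholm_text = """\
# STOCKHOLM 1.0
seq1 ACDEFGHIKL
//
# STOCKHOLM 1.0
seq2 MNPQRSTVWY
//
"""

def read_records(lines):
    record = []
    for line in lines:
        record.append(line)
        if line.strip() == '//':
            yield record
            record = []

print(len(list(read_records(stockholm_text.splitlines()))))  # 2 alignment records
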
Example 6
class GroupAbund(Otu):
    file_ext = 'mothur.shared'
    MetadataElement(name="groups",
                    default=[],
                    desc="Group Names",
                    readonly=True,
                    visible=True,
                    no_value=[])

    def __init__(self, **kwd):
        super(GroupAbund, self).__init__(**kwd)

    def init_meta(self, dataset, copy_from=None):
        super(GroupAbund, self).init_meta(dataset, copy_from=copy_from)

    def set_meta(self, dataset, overwrite=True, skip=1, **kwd):
        super(GroupAbund, self).set_meta(dataset, overwrite=overwrite, **kwd)

        # See if file starts with header line
        if dataset.has_data():
            label_names = set()
            group_names = set()
            data_lines = 0
            comment_lines = 0
            ncols = 0

            headers = iter_headers(dataset.file_name, sep='\t', count=-1)
            for line in headers:
                if line[0] == 'label' and line[1] == 'Group':
                    skip = 1
                    comment_lines += 1
                else:
                    skip = 0
                    data_lines += 1
                    ncols = max(ncols, len(line))
                    label_names.add(line[0])
                    group_names.add(line[1])

            # Set the discovered metadata values for the dataset
            dataset.metadata.data_lines = data_lines
            dataset.metadata.columns = ncols
            dataset.metadata.labels = list(label_names)
            dataset.metadata.labels.sort()
            dataset.metadata.groups = list(group_names)
            dataset.metadata.groups.sort()
            dataset.metadata.skip = skip

    def sniff(self, filename, vals_are_int=False):
        """
        Determines whether the file is a otu (operational taxonomic unit)
        Shared format
        label<TAB>group<TAB>count[<TAB>value(1..n)]
        The first line is column headings as of Mothur v 1.2

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.shared' )
        >>> GroupAbund().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.shared' )
        >>> GroupAbund().sniff( fname )
        False
        """
        headers = iter_headers(filename, sep='\t')
        count = 0
        for line in headers:
            if not line[0].startswith('@'):
                if len(line) < 3:
                    return False
                if count > 0 or line[0] != 'label':
                    try:
                        check = int(line[2])
                        if check + 3 != len(line):
                            return False
                        for i in range(3, len(line)):
                            if vals_are_int:
                                int(line[i])
                            else:
                                float(line[i])
                    except ValueError:
                        return False
                count += 1
        if count > 1:
            return True
        return False
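
A minimal sketch of a mothur shared file that the sniffer above accepts (labels, group names and counts are made up): the third column gives the number of OTU columns that follow, which is the check + 3 == len(line) test in sniff.

import os
import tempfile

shared_text = (
    'label\tGroup\tnumOtus\tOtu01\tOtu02\n'  # header line (Mothur >= 1.2)
    '0.03\tA\t2\t5\t3\n'
    '0.03\tB\t2\t1\t0\n'
)
fname = os.path.join(tempfile.mkdtemp(), 'demo.mothur.shared')
with open(fname, 'w') as fh:
    fh.write(shared_text)
# In a Galaxy environment, GroupAbund().sniff(fname) should return True;
# a numOtus value that disagrees with the column count should make it False.
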
Example 7
class SffFlow(Tabular):
    MetadataElement(name="flow_values",
                    default="",
                    no_value="",
                    optional=True,
                    desc="Total number of flow values",
                    readonly=True)
    MetadataElement(name="flow_order",
                    default="TACG",
                    no_value="TACG",
                    desc="Total number of flow values",
                    readonly=False)
    file_ext = 'mothur.sff.flow'
    """
        http://www.mothur.org/wiki/Flow_file
        The first line is the total number of flow values - 800 for Titanium data. For GS FLX it would be 400.
        Following lines contain:
        - SequenceName
        - the number of useable flows as defined by 454's software
        - the flow intensity for each base going in the order of TACG.
        Example:
          800
          GQY1XT001CQL4K 85 1.04 0.00 1.00 0.02 0.03 1.02 0.05 ...
          GQY1XT001CQIRF 84 1.02 0.06 0.98 0.06 0.09 1.05 0.07 ...
          GQY1XT001CF5YW 88 1.02 0.02 1.01 0.04 0.06 1.02 0.03 ...
    """
    def __init__(self, **kwd):
        super(SffFlow, self).__init__(**kwd)

    def set_meta(self,
                 dataset,
                 overwrite=True,
                 skip=1,
                 max_data_lines=None,
                 **kwd):
        super(SffFlow, self).set_meta(dataset, overwrite, 1, max_data_lines)

        headers = get_headers(dataset.file_name, sep='\t', count=1)
        try:
            flow_values = int(headers[0][0])
            dataset.metadata.flow_values = flow_values
        except Exception as e:
            log.warning("SffFlow set_meta %s" % e)

    def make_html_table(self, dataset, skipchars=[]):
        """Create HTML table, used for displaying peek"""
        try:
            out = '<table cellspacing="0" cellpadding="3">'

            # Generate column header
            out += '<tr>'
            out += '<th>%d. Name</th>' % 1
            out += '<th>%d. Flows</th>' % 2
            for i in range(3, dataset.metadata.columns + 1):
                base = dataset.metadata.flow_order[(i + 1) % 4]
                out += '<th>%d. %d %s</th>' % (i, i - 2, base)
            out += '</tr>'
            out += self.make_html_peek_rows(dataset, skipchars=skipchars)
            out += '</table>'
        except Exception as exc:
            out = "Can't create peek %s" % str(exc)
        return out
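
A tiny standalone sketch of the format documented above (values are hypothetical): the first line is the flow count that set_meta stores in metadata.flow_values, and each following row is a read name, its number of useable flows, and one intensity per flow.

flow_text = """800
GQY1XT001CQL4K 85 1.04 0.00 1.00 0.02
"""
lines = flow_text.splitlines()
flow_values = int(lines[0].split()[0])          # -> metadata.flow_values == 800
name, useable, *intensities = lines[1].split()
print(flow_values, name, int(useable), [float(x) for x in intensities])
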
Example 8
class GenomeGraphs(Tabular):
    """
    Tab delimited data containing a marker id and any number of numeric values
    """

    MetadataElement(name="markerCol",
                    default=1,
                    desc="Marker ID column",
                    param=metadata.ColumnParameter)
    MetadataElement(name="columns",
                    default=3,
                    desc="Number of columns",
                    readonly=True)
    MetadataElement(name="column_types",
                    default=[],
                    desc="Column types",
                    readonly=True,
                    visible=False)
    file_ext = 'gg'

    def __init__(self, **kwd):
        """
        Initialize gg datatype, by adding UCSC display apps
        """
        Tabular.__init__(self, **kwd)
        self.add_display_app('ucsc', 'Genome Graph', 'as_ucsc_display_file',
                             'ucsc_links')

    def set_meta(self, dataset, **kwd):
        Tabular.set_meta(self, dataset, **kwd)
        dataset.metadata.markerCol = 1
        header = open(dataset.file_name).readlines()[0].strip().split('\t')
        dataset.metadata.columns = len(header)
        t = ['numeric' for x in header]
        t[0] = 'string'
        dataset.metadata.column_types = t
        return True

    def as_ucsc_display_file(self, dataset, **kwd):
        """
        Returns file
        """
        return open(dataset.file_name, 'rb')

    def ucsc_links(self, dataset, type, app, base_url):
        """
        from the ever-helpful angie hinrichs [email protected]
        a genome graphs call looks like this

        http://genome.ucsc.edu/cgi-bin/hgGenome?clade=mammal&org=Human&db=hg18&hgGenome_dataSetName=dname
        &hgGenome_dataSetDescription=test&hgGenome_formatType=best%20guess&hgGenome_markerType=best%20guess
        &hgGenome_columnLabels=best%20guess&hgGenome_maxVal=&hgGenome_labelVals=
        &hgGenome_maxGapToFill=25000000&hgGenome_uploadFile=http://galaxy.esphealth.org/datasets/333/display/index
        &hgGenome_doSubmitUpload=submit

        Galaxy gives this for an interval file

        http://genome.ucsc.edu/cgi-bin/hgTracks?db=hg18&position=chr1:1-1000&hgt.customText=
        http%3A%2F%2Fgalaxy.esphealth.org%2Fdisplay_as%3Fid%3D339%26display_app%3Ducsc

        """
        ret_val = []
        if not dataset.dbkey:
            dataset.dbkey = 'hg18'  # punt!
        if dataset.has_data():
            for site_name, site_url in app.datatypes_registry.get_legacy_sites_by_build(
                    'ucsc', dataset.dbkey):
                if site_name in app.datatypes_registry.get_display_sites(
                        'ucsc'):
                    site_url = site_url.replace(
                        '/hgTracks?', '/hgGenome?')  # for genome graphs
                    internal_url = "%s" % app.url_for(
                        controller='dataset',
                        dataset_id=dataset.id,
                        action='display_at',
                        filename='ucsc_' + site_name)
                    display_url = "%s%s/display_as?id=%i&display_app=%s&authz_method=display_at" % (
                        base_url, app.url_for(controller='root'), dataset.id,
                        type)
                    display_url = quote_plus(display_url)
                    # was display_url = quote_plus( "%s/display_as?id=%i&display_app=%s" % (base_url, dataset.id, type) )
                    # redirect_url = quote_plus( "%sdb=%s&position=%s:%s-%s&hgt.customText=%%s" % (site_url, dataset.dbkey, chrom, start, stop) )
                    sl = [
                        "{}db={}".format(site_url, dataset.dbkey),
                    ]
                    # sl.append("&hgt.customText=%s")
                    sl.append(
                        "&hgGenome_dataSetName={}&hgGenome_dataSetDescription={}"
                        .format(dataset.name, 'GalaxyGG_data'))
                    sl.append(
                        "&hgGenome_formatType=best guess&hgGenome_markerType=best guess"
                    )
                    sl.append(
                        "&hgGenome_columnLabels=first row&hgGenome_maxVal=&hgGenome_labelVals="
                    )
                    sl.append("&hgGenome_doSubmitUpload=submit")
                    sl.append(
                        "&hgGenome_maxGapToFill=25000000&hgGenome_uploadFile=%s"
                        % display_url)
                    s = ''.join(sl)
                    s = quote_plus(s)
                    redirect_url = s
                    link = '{}?redirect_url={}&display_url={}'.format(
                        internal_url, redirect_url, display_url)
                    ret_val.append((site_name, link))
        return ret_val

    def make_html_table(self, dataset, skipchars=[]):
        """
        Create HTML table, used for displaying peek
        """
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            with open(dataset.file_name) as f:
                d = f.readlines()[:5]
            if len(d) == 0:
                out = "Cannot find anything to parse in %s" % dataset.name
                return out
            hasheader = 0
            try:
                # first column is the marker name - see if the rest are all numeric
                [float(x) for x in d[0].split()[1:]]
            except Exception:
                hasheader = 1
            # Generate column header
            out.append('<tr>')
            if hasheader:
                for i, name in enumerate(d[0].split()):
                    out.append('<th>{}.{}</th>'.format(i + 1, name))
                d.pop(0)
                out.append('</tr>')
            for row in d:
                out.append('<tr>')
                out.append(''.join('<td>%s</td>' % x for x in row.split()))
                out.append('</tr>')
            out.append('</table>')
            out = "".join(out)
        except Exception as exc:
            out = "Can't create peek %s" % exc
        return out

    def validate(self, dataset, **kwd):
        """
        Validate a gg file - all numeric after header row
        """
        with open(dataset.file_name) as infile:
            next(infile)  # header
            for i, row in enumerate(infile):
                ll = row.strip().split('\t')[
                    1:]  # first is alpha feature identifier
                for j, x in enumerate(ll):
                    x = float(x)
        return DatatypeValidation.validated()

    def sniff_prefix(self, file_prefix):
        """
        Determines whether the file is in gg format

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'test_space.txt' )
        >>> GenomeGraphs().sniff( fname )
        False
        >>> fname = get_test_fname( '1.gg' )
        >>> GenomeGraphs().sniff( fname )
        True
        """
        buf = file_prefix.contents_header
        rows = [l.split() for l in buf.splitlines()[1:4]
                ]  # break on lines and drop header, small sample

        if len(rows) < 1:
            return False

        for row in rows:
            if len(row) < 2:
                # Must actually have a marker and at least one numeric value
                return False
            first_val = row[0]
            if not VALID_GENOME_GRAPH_MARKERS.match(first_val):
                return False
            rest_row = row[1:]
            try:
                [float(x) for x in rest_row]  # first col has been removed
            except ValueError:
                return False
        return True

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'application/vnd.ms-excel'
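
A hypothetical minimal .gg file matching the rules in sniff_prefix: a header row, then a marker identifier (assumed here to satisfy VALID_GENOME_GRAPH_MARKERS) followed by numeric values only.

import os
import tempfile

gg_text = (
    'marker\tscore1\tscore2\n'  # header row is skipped by the sniffer
    'rs12345\t0.5\t1.2\n'       # marker id plus numeric values only
    'rs67890\t0.1\t0.9\n'
)
fname = os.path.join(tempfile.mkdtemp(), 'demo.gg')
with open(fname, 'w') as fh:
    fh.write(gg_text)
# In a Galaxy environment, GenomeGraphs().sniff(fname) should return True,
# provided ids like rs12345 satisfy VALID_GENOME_GRAPH_MARKERS.
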
Example 9
class IdeasPre(Html):
    """
    This datatype defines the input format required by IDEAS:
    https://academic.oup.com/nar/article/44/14/6721/2468150
    The IDEAS preprocessor tool produces an output using this
    format.  The extra_files_path of the primary input dataset
    contains the following files and directories.
    - chromosome_windows.txt (optional)
    - chromosomes.bed (optional)
    - IDEAS_input_config.txt
    - compressed archived tmp directory containing a number of compressed bed files.
    """

    MetadataElement(name="base_name",
                    desc="Base name for this dataset",
                    default='IDEASData',
                    readonly=True,
                    set_in_upload=True)
    MetadataElement(name="chrom_bed",
                    desc="Bed file specifying window positions",
                    default=None,
                    readonly=True)
    MetadataElement(name="chrom_windows",
                    desc="Chromosome window positions",
                    default=None,
                    readonly=True)
    MetadataElement(name="input_config",
                    desc="IDEAS input config",
                    default=None,
                    readonly=True)
    MetadataElement(name="tmp_archive",
                    desc="Compressed archive of compressed bed files",
                    default=None,
                    readonly=True)

    composite_type = 'auto_primary_file'
    allow_datatype_change = False
    file_ext = 'ideaspre'

    def __init__(self, **kwd):
        Html.__init__(self, **kwd)
        self.add_composite_file('chromosome_windows.txt',
                                description='Chromosome window positions',
                                is_binary=False,
                                optional=True)
        self.add_composite_file(
            'chromosomes.bed',
            description='Bed file specifying window positions',
            is_binary=False,
            optional=True)
        self.add_composite_file('IDEAS_input_config.txt',
                                description='IDEAS input config',
                                is_binary=False)
        self.add_composite_file(
            'tmp.tar.gz',
            description='Compressed archive of compressed bed files',
            is_binary=True)

    def set_meta(self, dataset, **kwd):
        Html.set_meta(self, dataset, **kwd)
        for fname in os.listdir(dataset.extra_files_path):
            if fname.startswith("chromosomes"):
                dataset.metadata.chrom_bed = os.path.join(
                    dataset.extra_files_path, fname)
            elif fname.startswith("chromosome_windows"):
                dataset.metadata.chrom_windows = os.path.join(
                    dataset.extra_files_path, fname)
            elif fname.startswith("IDEAS_input_config"):
                dataset.metadata.input_config = os.path.join(
                    dataset.extra_files_path, fname)
            elif fname.startswith("tmp"):
                dataset.metadata.tmp_archive = os.path.join(
                    dataset.extra_files_path, fname)
        self.regenerate_primary_file(dataset)

    def generate_primary_file(self, dataset=None):
        rval = ['<html><head></head><body>']
        rval.append('<h3>Files prepared for IDEAS</h3>')
        rval.append('<ul>')
        for composite_name, composite_file in self.get_composite_files(
                dataset=dataset).items():
            fn = composite_name
            rval.append('<li><a href="{}">{}</a></li>'.format(fn, fn))
        rval.append('</ul></body></html>\n')
        return "\n".join(rval)

    def regenerate_primary_file(self, dataset):
        # Cannot do this until we are setting metadata.
        rval = ['<html><head></head><body>']
        rval.append('<h3>Files prepared for IDEAS</h3>')
        rval.append('<ul>')
        for fname in os.listdir(dataset.extra_files_path):
            fn = os.path.split(fname)[-1]
            rval.append('<li><a href="{}">{}</a></li>'.format(fn, fn))
        rval.append('</ul></body></html>')
        with open(dataset.file_name, 'w') as f:
            f.write("\n".join(rval))
            f.write('\n')
Example 10
class SnpSiftDbNSFP(Text):
    """Class describing a dbNSFP database prepared fpr use by SnpSift dbnsfp """
    MetadataElement(name='reference_name', default='dbSNFP', desc='Reference Name', readonly=True, visible=True, set_in_upload=True, no_value='dbSNFP')
    MetadataElement(name="bgzip", default=None, desc="dbNSFP bgzip", readonly=True, visible=True, no_value=None)
    MetadataElement(name="index", default=None, desc="Tabix Index File", readonly=True, visible=True, no_value=None)
    MetadataElement(name="annotation", default=[], desc="Annotation Names", readonly=True, visible=True, no_value=[])
    file_ext = "snpsiftdbnsfp"
    composite_type = 'auto_primary_file'
    """
    ## The dbNSFP file is a tabular file with 1 header line
    ## The first 4 columns are required to be: chrom	pos	ref	alt
    ## These match columns 1,2,4,5 of the VCF file
    ## SnpSift requires the file to be block-gzipped and the indexed with samtools tabix
    ## Example:
    ## Compress using block-gzip algorithm
    bgzip dbNSFP2.3.txt
    ## Create tabix index
    tabix -s 1 -b 2 -e 2 dbNSFP2.3.txt.gz
    """

    def __init__(self, **kwd):
        super().__init__(**kwd)
        self.add_composite_file('%s.gz', description='dbNSFP bgzip', substitute_name_with_metadata='reference_name', is_binary=True)
        self.add_composite_file('%s.gz.tbi', description='Tabix Index File', substitute_name_with_metadata='reference_name', is_binary=True)

    def generate_primary_file(self, dataset=None):
        """
        This is called only at upload to write the html file
        cannot rename the datasets here - they come with the default unfortunately
        """
        return '<html><head><title>SnpSiftDbNSFP Composite Dataset</title></head></html>'

    def regenerate_primary_file(self, dataset):
        """
        cannot do this until we are setting metadata
        """
        annotations = "dbNSFP Annotations: %s\n" % ','.join(dataset.metadata.annotation)
        with open(dataset.file_name, 'a') as f:
            if dataset.metadata.bgzip:
                bn = dataset.metadata.bgzip
                f.write(bn)
                f.write('\n')
            f.write(annotations)

    def set_meta(self, dataset, overwrite=True, **kwd):
        try:
            efp = dataset.extra_files_path
            if os.path.exists(efp):
                flist = os.listdir(efp)
                for fname in flist:
                    if fname.endswith('.gz'):
                        dataset.metadata.bgzip = fname
                        try:
                            with gzip.open(os.path.join(efp, fname), 'rt') as fh:
                                buf = fh.read(5000)
                                lines = buf.splitlines()
                                headers = lines[0].split('\t')
                                dataset.metadata.annotation = headers[4:]
                        except Exception as e:
                            log.warning("set_meta fname: %s  %s", fname, unicodify(e))
                    if fname.endswith('.tbi'):
                        dataset.metadata.index = fname
            self.regenerate_primary_file(dataset)
        except Exception as e:
            log.warning("set_meta fname: %s  %s", dataset.file_name if dataset and dataset.file_name else 'Unkwown', unicodify(e))

    def set_peek(self, dataset, is_multi_byte=False):
        if not dataset.dataset.purged:
            dataset.peek = '{} :  {}'.format(dataset.metadata.reference_name, ','.join(dataset.metadata.annotation))
            dataset.blurb = '%s' % dataset.metadata.reference_name
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disc'
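
A standalone sketch of the header parse inside set_meta above: the annotation names are just the dbNSFP column names after the first four (chrom, pos, ref, alt). The file built here is a plain-gzip stand-in with hypothetical column names; a real database would be block-gzipped and tabix-indexed as described in the notes above.

import gzip
import os
import tempfile

tmp = os.path.join(tempfile.mkdtemp(), 'dbNSFP_demo.txt.gz')
with gzip.open(tmp, 'wt') as fh:
    fh.write('chrom\tpos\tref\talt\tSIFT_score\tPolyphen2_HDIV_score\n')

with gzip.open(tmp, 'rt') as fh:
    header = fh.read(5000).splitlines()[0].split('\t')
print(header[4:])  # ['SIFT_score', 'Polyphen2_HDIV_score'] -> metadata.annotation
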
Example 11
class Rgenetics(Html):
    """
    base class to use for rgenetics datatypes
    derived from html - composite datatype elements
    stored in extra files path
    """

    MetadataElement(
        name="base_name",
        desc="base name for all transformed versions of this genetic dataset",
        default='RgeneticsData',
        readonly=True,
        set_in_upload=True)

    composite_type = 'auto_primary_file'
    allow_datatype_change = False
    file_ext = 'rgenetics'

    def generate_primary_file(self, dataset=None):
        rval = [
            '<html><head><title>Rgenetics Galaxy Composite Dataset </title></head><p/>'
        ]
        rval.append(
            '<div>This composite dataset is composed of the following files:<p/><ul>'
        )
        for composite_name, composite_file in self.get_composite_files(
                dataset=dataset).items():
            fn = composite_name
            opt_text = ''
            if composite_file.optional:
                opt_text = ' (optional)'
            if composite_file.get('description'):
                rval.append(
                    '<li><a href="{}" type="application/binary">{} ({})</a>{}</li>'
                    .format(fn, fn, composite_file.get('description'),
                            opt_text))
            else:
                rval.append(
                    '<li><a href="{}" type="application/binary">{}</a>{}</li>'.
                    format(fn, fn, opt_text))
        rval.append('</ul></div></html>')
        return "\n".join(rval)

    def regenerate_primary_file(self, dataset):
        """
        cannot do this until we are setting metadata
        """
        efp = dataset.extra_files_path
        flist = os.listdir(efp)
        rval = [
            '<html><head><title>Files for Composite Dataset {}</title></head><body><p/>Composite {} contains:<p/><ul>'
            .format(dataset.name, dataset.name)
        ]
        for i, fname in enumerate(flist):
            sfname = os.path.split(fname)[-1]
            f, e = os.path.splitext(fname)
            rval.append('<li><a href="{}">{}</a></li>'.format(sfname, sfname))
        rval.append('</ul></body></html>')
        with open(dataset.file_name, 'w') as f:
            f.write("\n".join(rval))
            f.write('\n')

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/html'

    def set_meta(self, dataset, **kwd):
        """
        for lped/pbed eg

        """
        Html.set_meta(self, dataset, **kwd)
        if not kwd.get('overwrite'):
            if verbose:
                gal_Log.debug(
                    '@@@ rgenetics set_meta called with overwrite = False')
            return True
        try:
            efp = dataset.extra_files_path
        except Exception:
            if verbose:
                gal_Log.debug(
                    '@@@rgenetics set_meta failed {} - dataset {} has no efp ?'
                    .format(sys.exc_info()[0], dataset.name))
            return False
        try:
            flist = os.listdir(efp)
        except Exception:
            if verbose:
                gal_Log.debug(
                    '@@@rgenetics set_meta failed {} - dataset {} has no efp ?'
                    .format(sys.exc_info()[0], dataset.name))
            return False
        if len(flist) == 0:
            if verbose:
                gal_Log.debug(
                    '@@@rgenetics set_meta failed - {} efp {} is empty?'.
                    format(dataset.name, efp))
            return False
        self.regenerate_primary_file(dataset)
        if not dataset.info:
            dataset.info = 'Galaxy genotype datatype object'
        if not dataset.blurb:
            dataset.blurb = 'Composite file - Rgenetics Galaxy toolkit'
        return True
Example 12
class SnpEffDb(Text):
    """Class describing a SnpEff genome build"""
    edam_format = "format_3624"
    file_ext = "snpeffdb"
    MetadataElement(name="genome_version", default=None, desc="Genome Version", readonly=True, visible=True, no_value=None)
    MetadataElement(name="snpeff_version", default="SnpEff4.0", desc="SnpEff Version", readonly=True, visible=True, no_value=None)
    MetadataElement(name="regulation", default=[], desc="Regulation Names", readonly=True, visible=True, no_value=[], optional=True)
    MetadataElement(name="annotation", default=[], desc="Annotation Names", readonly=True, visible=True, no_value=[], optional=True)

    def __init__(self, **kwd):
        super().__init__(**kwd)

    # The SnpEff version line was added in SnpEff version 4.1
    def getSnpeffVersionFromFile(self, path):
        snpeff_version = None
        try:
            with gzip.open(path, 'rt') as fh:
                buf = fh.read(100)
                lines = buf.splitlines()
                m = re.match(r'^(SnpEff)\s+(\d+\.\d+).*$', lines[0].strip())
                if m:
                    snpeff_version = m.groups()[0] + m.groups()[1]
        except Exception:
            pass
        return snpeff_version

    def set_meta(self, dataset, **kwd):
        super().set_meta(dataset, **kwd)
        data_dir = dataset.extra_files_path
        # search data_dir/genome_version for files
        regulation_pattern = 'regulation_(.+).bin'
        #  annotation files that are included in snpEff by a flag
        annotations_dict = {'nextProt.bin': '-nextprot', 'motif.bin': '-motif', 'interactions.bin': '-interaction'}
        regulations = []
        annotations = []
        genome_version = None
        snpeff_version = None
        if data_dir and os.path.isdir(data_dir):
            for root, _, files in os.walk(data_dir):
                for fname in files:
                    if fname.startswith('snpEffectPredictor'):
                        # if snpEffectPredictor.bin download succeeded
                        genome_version = os.path.basename(root)
                        dataset.metadata.genome_version = genome_version
                        # read the first line of the gzipped snpEffectPredictor.bin file to get the SnpEff version
                        snpeff_version = self.getSnpeffVersionFromFile(os.path.join(root, fname))
                        if snpeff_version:
                            dataset.metadata.snpeff_version = snpeff_version
                    else:
                        m = re.match(regulation_pattern, fname)
                        if m:
                            name = m.groups()[0]
                            regulations.append(name)
                        elif fname in annotations_dict:
                            value = annotations_dict[fname]
                            name = value.lstrip('-')
                            annotations.append(name)
            dataset.metadata.regulation = regulations
            dataset.metadata.annotation = annotations
            try:
                with open(dataset.file_name, 'w') as fh:
                    fh.write("%s\n" % (genome_version or 'Genome unknown'))
                    fh.write("%s\n" % (snpeff_version or 'SnpEff version unknown'))
                    if annotations:
                        fh.write("annotations: %s\n" % ','.join(annotations))
                    if regulations:
                        fh.write("regulations: %s\n" % ','.join(regulations))
            except Exception:
                pass
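
A minimal, self-contained sketch of the version parsing done by getSnpeffVersionFromFile above, run against an in-memory gzip stream instead of a real snpEffectPredictor.bin file. The version line content is made up for illustration.

import gzip
import io
import re

# Made-up stand-in for the first line of a gzipped snpEffectPredictor.bin file.
buf = io.BytesIO()
with gzip.open(buf, 'wt') as fh:
    fh.write('SnpEff  4.3 some trailing build information\n')
buf.seek(0)

with gzip.open(buf, 'rt') as fh:
    first_line = fh.read(100).splitlines()[0].strip()
m = re.match(r'^(SnpEff)\s+(\d+\.\d+).*$', first_line)
snpeff_version = m.groups()[0] + m.groups()[1] if m else None
print(snpeff_version)  # -> SnpEff4.3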
Exemplo n.º 13
0
class ImgtJson(Json):
    file_ext = "imgt.json"
    MetadataElement(name="taxon_names", default=[], desc="taxonID: names", readonly=True, visible=True, no_value=[])
    """
        https://github.com/repseqio/library-imgt/releases
        Data coming from IMGT server may be used for academic research only,
        provided that it is referred to IMGT®, and cited as:
        "IMGT®, the international ImMunoGeneTics information system®
        http://www.imgt.org (founder and director: Marie-Paule Lefranc, Montpellier, France)."
    """

    def set_peek(self, dataset, is_multi_byte=False):
        super().set_peek(dataset)
        if not dataset.dataset.purged:
            dataset.blurb = "IMGT Library"

    def sniff_prefix(self, file_prefix):
        """
        Determines whether the file is in json format with imgt elements

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( '1.json' )
        >>> ImgtJson().sniff( fname )
        False
        >>> fname = get_test_fname( 'imgt.json' )
        >>> ImgtJson().sniff( fname )
        True
        """
        is_imgt = False
        if self._looks_like_json(file_prefix):
            is_imgt = self._looks_like_imgt(file_prefix)
        return is_imgt

    def _looks_like_imgt(self, file_prefix, load_size=5000):
        """
        @param file_prefix: [FilePrefix] The file prefix to evaluate.
        @param load_size: [int] The size of the file block load in RAM (in
                          bytes).
        """
        is_imgt = False
        try:
            with open(file_prefix.filename) as fh:
                segment_str = fh.read(load_size)
                if segment_str.strip().startswith('['):
                    if '"taxonId"' in segment_str and '"anchorPoints"' in segment_str:
                        is_imgt = True
        except Exception:
            pass
        return is_imgt

    def set_meta(self, dataset, **kwd):
        """
            Store metadata information from the imgt file.
        """
        if dataset.has_data():
            with open(dataset.file_name) as fh:
                try:
                    json_dict = json.load(fh)
                    tax_names = []
                    for entry in json_dict:
                        if 'taxonId' in entry:
                            names = "%d: %s" % (entry['taxonId'], ','.join(entry['speciesNames']))
                            tax_names.append(names)
                    dataset.metadata.taxon_names = tax_names
                except Exception:
                    return
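
A small illustration of the two checks above, run on a made-up record (the taxonId and speciesNames values are invented, not real IMGT data): _looks_like_imgt only needs the leading '[' plus the "taxonId" and "anchorPoints" keys, and set_meta builds one "taxonId: names" string per entry.

import json

record = [{"taxonId": 9606, "speciesNames": ["HomoSapiens"], "anchorPoints": {}}]
segment_str = json.dumps(record)

# heuristic used by _looks_like_imgt()
looks_like_imgt = (segment_str.strip().startswith('[')
                   and '"taxonId"' in segment_str
                   and '"anchorPoints"' in segment_str)

# taxon_names metadata as assembled by set_meta()
tax_names = ["%d: %s" % (e['taxonId'], ','.join(e['speciesNames']))
             for e in record if 'taxonId' in e]
print(looks_like_imgt, tax_names)  # -> True ['9606: HomoSapiens']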
Exemplo n.º 14
0
class Biom1(Json):
    """
        BIOM version 1.0 file format description
        http://biom-format.org/documentation/format_versions/biom-1.0.html
    """
    file_ext = "biom1"
    edam_format = "format_3746"

    MetadataElement(name="table_rows", default=[], desc="table_rows", param=MetadataParameter, readonly=True, visible=False, optional=True, no_value=[])
    MetadataElement(name="table_matrix_element_type", default="", desc="table_matrix_element_type", param=MetadataParameter, readonly=True, visible=False, optional=True, no_value="")
    MetadataElement(name="table_format", default="", desc="table_format", param=MetadataParameter, readonly=True, visible=False, optional=True, no_value="")
    MetadataElement(name="table_generated_by", default="", desc="table_generated_by", param=MetadataParameter, readonly=True, visible=True, optional=True, no_value="")
    MetadataElement(name="table_matrix_type", default="", desc="table_matrix_type", param=MetadataParameter, readonly=True, visible=False, optional=True, no_value="")
    MetadataElement(name="table_shape", default=[], desc="table_shape", param=MetadataParameter, readonly=True, visible=False, optional=True, no_value=[])
    MetadataElement(name="table_format_url", default="", desc="table_format_url", param=MetadataParameter, readonly=True, visible=False, optional=True, no_value="")
    MetadataElement(name="table_date", default="", desc="table_date", param=MetadataParameter, readonly=True, visible=True, optional=True, no_value="")
    MetadataElement(name="table_type", default="", desc="table_type", param=MetadataParameter, readonly=True, visible=True, optional=True, no_value="")
    MetadataElement(name="table_id", default=None, desc="table_id", param=MetadataParameter, readonly=True, visible=True, optional=True, no_value=None)
    MetadataElement(name="table_columns", default=[], desc="table_columns", param=MetadataParameter, readonly=True, visible=False, optional=True, no_value=[])
    MetadataElement(name="table_column_metadata_headers", default=[], desc="table_column_metadata_headers", param=MetadataParameter, readonly=True, visible=True, optional=True, no_value=[])

    def set_peek(self, dataset, is_multi_byte=False):
        super().set_peek(dataset)
        if not dataset.dataset.purged:
            dataset.blurb = "Biological Observation Matrix v1"

    def sniff_prefix(self, file_prefix):
        is_biom = False
        if self._looks_like_json(file_prefix):
            is_biom = self._looks_like_biom(file_prefix)
        return is_biom

    def _looks_like_biom(self, file_prefix, load_size=50000):
        """
        @param file_prefix: [FilePrefix] The file prefix to evaluate.
        @param load_size: [int] The size of the file block load in RAM (in
                          bytes).
        """
        is_biom = False
        segment_size = int(load_size / 2)
        try:
            with open(file_prefix.filename) as fh:
                prev_str = ""
                segment_str = fh.read(segment_size)
                if segment_str.strip().startswith('{'):
                    while segment_str:
                        current_str = prev_str + segment_str
                        if '"format"' in current_str:
                            current_str = re.sub(r'\s', '', current_str)
                            if '"format":"BiologicalObservationMatrix' in current_str:
                                is_biom = True
                                break
                        prev_str = segment_str
                        segment_str = fh.read(segment_size)
        except Exception:
            pass
        return is_biom

    def set_meta(self, dataset, **kwd):
        """
            Store metadata information from the BIOM file.
        """
        if dataset.has_data():
            with open(dataset.file_name) as fh:
                try:
                    json_dict = json.load(fh)
                except Exception:
                    return

                def _transform_dict_list_ids(dict_list):
                    if dict_list:
                        return [x.get('id', None) for x in dict_list]
                    return []

                b_transform = {'rows': _transform_dict_list_ids, 'columns': _transform_dict_list_ids}
                for (m_name, b_name) in [('table_rows', 'rows'),
                                         ('table_matrix_element_type', 'matrix_element_type'),
                                         ('table_format', 'format'),
                                         ('table_generated_by', 'generated_by'),
                                         ('table_matrix_type', 'matrix_type'),
                                         ('table_shape', 'shape'),
                                         ('table_format_url', 'format_url'),
                                         ('table_date', 'date'),
                                         ('table_type', 'type'),
                                         ('table_id', 'id'),
                                         ('table_columns', 'columns')]:
                    try:
                        metadata_value = json_dict.get(b_name, None)
                        if b_name == "columns" and metadata_value:
                            keep_columns = set()
                            for column in metadata_value:
                                if column['metadata'] is not None:
                                    for k, v in column['metadata'].items():
                                        if v is not None:
                                            keep_columns.add(k)
                            final_list = sorted(list(keep_columns))
                            dataset.metadata.table_column_metadata_headers = final_list
                        if b_name in b_transform:
                            metadata_value = b_transform[b_name](metadata_value)
                        setattr(dataset.metadata, m_name, metadata_value)
                    except Exception:
                        log.exception("Something in the metadata detection for biom1 went wrong.")
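
A minimal sketch of the chunked search in _looks_like_biom above, run on an in-memory string rather than a file. The tiny segment_size is chosen so the match spans a chunk boundary, which is exactly what the prev_str overlap is for; the JSON fragment is made up.

import io
import re

text = '{"id": null, "format": "Biological Observation Matrix 1.0.0", "rows": []}'
fh = io.StringIO(text)
segment_size = 40          # tiny on purpose, so the match crosses a chunk boundary
is_biom = False
prev_str = ""
segment_str = fh.read(segment_size)
if segment_str.strip().startswith('{'):
    while segment_str:
        current_str = prev_str + segment_str
        if '"format"' in current_str:
            current_str = re.sub(r'\s', '', current_str)
            if '"format":"BiologicalObservationMatrix' in current_str:
                is_biom = True
                break
        prev_str = segment_str
        segment_str = fh.read(segment_size)
print(is_biom)  # -> True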
Exemplo n.º 15
0
class NeperTesr(Binary):
    """
    Neper Raster Tessellation File
    ***tesr
      **format
        format
      **general
        dimension
        size_x size_y [size_z]
        voxsize_x voxsize_y [voxsize_z]
       [*origin
        origin_x origin_y [origin_z]]
       [*hasvoid has_void]
      [**cell
        number_of_cells
    """
    file_ext = "neper.tesr"
    MetadataElement(name="format",
                    default=None,
                    desc="format",
                    readonly=True,
                    visible=True)
    MetadataElement(name="dimension",
                    default=None,
                    desc="dimension",
                    readonly=True,
                    visible=True)
    MetadataElement(name="size",
                    default=[],
                    desc="size",
                    readonly=True,
                    visible=True)
    MetadataElement(name="voxsize",
                    default=[],
                    desc="voxsize",
                    readonly=True,
                    visible=True)
    MetadataElement(name="origin",
                    default=[],
                    desc="origin",
                    readonly=True,
                    visible=True)
    MetadataElement(name="cells",
                    default=None,
                    desc="cells",
                    readonly=True,
                    visible=True)

    def __init__(self, **kwd):
        Binary.__init__(self, **kwd)

    def sniff_prefix(self, file_prefix: FilePrefix):
        """
        Neper tesr files start with '***tesr'.
        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('test.neper.tesr')
        >>> NeperTesr().sniff(fname)
        True
        >>> fname = get_test_fname('test.neper.tess')
        >>> NeperTesr().sniff(fname)
        False
        """
        return file_prefix.text_io(
            errors='ignore').readline(10).startswith('***tesr')

    def set_meta(self, dataset, **kwd):
        if dataset.has_data():
            with open(dataset.file_name, errors='ignore') as fh:
                field = ''
                for i, line in enumerate(fh):
                    line = line.strip()
                    if not line or i > 12:
                        break
                    if i == 0 and not line.startswith('***tesr'):
                        break
                    if line.startswith('*'):
                        field = line
                        continue
                    if i == 2:
                        dataset.metadata.format = line.split()[0]
                        continue
                    if i == 4:
                        dataset.metadata.dimension = line.split()[0]
                        continue
                    if i == 5:
                        dataset.metadata.size = line.split()
                        continue
                    if i == 6:
                        dataset.metadata.voxsize = line.split()
                        continue
                    if field.startswith('*origin'):
                        dataset.metadata.origin = line.split()
                        continue
                    if field.startswith('**cell'):
                        dataset.metadata.cells = int(line)
                        break

    def set_peek(self, dataset):
        if not dataset.dataset.purged:
            dataset.peek = get_file_peek(dataset.file_name, LINE_COUNT=9)
            dataset.blurb = f'format: {str(dataset.metadata.format)} dim: {str(dataset.metadata.dimension)} cells: {str(dataset.metadata.cells)}'
        else:
            dataset.peek = 'File does not exist'
            dataset.blurb = 'File purged from disc'
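
A minimal sketch, mirroring the Bunch-based doctest pattern used in the Otu example further down: a made-up .tesr header is written to a temporary file and run through set_meta above. The comments note which 0-based line index feeds which metadata element; the header values are illustrative, not taken from a real Neper output.

import tempfile
from galaxy.util.bunch import Bunch

# Made-up header: i == 2 -> format, i == 4 -> dimension, i == 5 -> size,
# i == 6 -> voxsize, and the line after '**cell' -> cells.
header = ("***tesr\n"
          " **format\n"
          "   2.0\n"
          " **general\n"
          "   3\n"
          "   10 10 10\n"
          "   0.1 0.1 0.1\n"
          " **cell\n"
          "   42\n")
with tempfile.NamedTemporaryFile('w', suffix='.neper.tesr', delete=False) as fh:
    fh.write(header)
    fname = fh.name

dataset = Bunch()
dataset.file_name = fname
dataset.has_data = lambda: True
dataset.metadata = Bunch()
NeperTesr().set_meta(dataset)
print(dataset.metadata.format, dataset.metadata.dimension, dataset.metadata.cells)
# -> 2.0 3 42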
Exemplo n.º 16
0
class RexpBase(Html):
    """
    base class for BioC data structures in Galaxy
    must be constructed with the pheno data in place since that
    goes into the metadata for each instance
    """
    MetadataElement(name="columns",
                    default=0,
                    desc="Number of columns",
                    visible=True)
    MetadataElement(name="column_names",
                    default=[],
                    desc="Column names",
                    visible=True)
    MetadataElement(name="pheCols",
                    default=[],
                    desc="Select list for potentially interesting variables",
                    visible=True)
    MetadataElement(
        name="base_name",
        desc=
        "base name for all transformed versions of this expression dataset",
        default='rexpression',
        set_in_upload=True)
    MetadataElement(name="pheno_path",
                    desc="Path to phenotype data for this experiment",
                    default="rexpression.pheno",
                    visible=True)
    file_ext = 'rexpbase'
    html_table = None
    composite_type = 'auto_primary_file'
    allow_datatype_change = False

    def __init__(self, **kwd):
        Html.__init__(self, **kwd)
        self.add_composite_file('%s.pheno',
                                description='Phenodata tab text file',
                                substitute_name_with_metadata='base_name',
                                is_binary=False)

    def generate_primary_file(self, dataset=None):
        """
        This is called only at upload, to write the html file.
        The datasets cannot be renamed here - they arrive with their default names, unfortunately.
        """
        return '<html><head></head><body>AutoGenerated Primary File for Composite Dataset</body></html>'

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/html'

    def get_phecols(self, phenolist=[], maxConc=20):
        """
        sept 2009: cannot use whitespace to split - make a more complex structure here
        and adjust the methods that rely on this structure
        return interesting phenotype column names for an rexpression eset or affybatch
        to use in array subsetting and so on. Returns a data structure for a
        dynamic Galaxy select parameter.
        A column with only 1 value doesn't change, so is not interesting for
        analysis. A column with a different value in every row is equivalent to a unique
        identifier so is also not interesting for anova or limma analysis - both these
        are removed after the concordance (count of unique terms) is constructed for each
        column. Then a complication - each remaining pair of columns is tested for
        redundancy - if two columns are always paired, then only one is needed :)
        """
        for nrows, row in enumerate(phenolist):  # construct concordance
            if len(row.strip()) == 0:
                break
            row = row.strip().split('\t')
            if nrows == 0:  # set up from header
                head = row
                totcols = len(row)
                concordance = [{} for x in head]  # list of dicts
            else:
                for col, code in enumerate(row):  # keep column order correct
                    if col >= totcols:
                        gal_Log.warning(
                            '### get_phecols error in pheno file - row %d col %d (%s) longer than header %s'
                            % (nrows, col, row, head))
                    else:
                        concordance[col].setdefault(code,
                                                    0)  # first one is zero
                        concordance[col][code] += 1
        useCols = []
        useConc = []  # columns of interest to keep
        nrows = len(phenolist)
        nrows -= 1  # drop head from count
        for c, conc in enumerate(concordance):  # c is column number
            if (len(conc) > 1) and (len(conc) < min(
                    nrows, maxConc)):  # not all same and not all different!!
                useConc.append(conc)  # keep concordance
                useCols.append(c)  # keep column
        nuse = len(useCols)
        # now to check for pairs of concordant columns - drop one of these.
        delme = []
        p = phenolist[1:]  # drop header
        plist = [x.strip().split('\t') for x in p]  # list of lists
        phe = [[x[i] for i in useCols] for x in plist
               if len(x) >= totcols]  # strip unused data
        for i in range(0, (nuse - 1)):  # for each interesting column
            for j in range(i + 1, nuse):
                kdict = {}
                for row in phe:  # row is a list of lists
                    k = '{}{}'.format(row[i], row[j])  # composite key
                    kdict[k] = k
                if (len(kdict.keys()) == len(concordance[useCols[j]])
                    ):  # i and j are always matched
                    delme.append(j)
        delme = list(set(delme))  # remove dupes
        listCol = []
        delme.sort()
        delme.reverse()  # must delete from far end!
        for i in delme:
            del useConc[i]  # get rid of concordance
            del useCols[i]  # and usecols entry
        for i, conc in enumerate(
                useConc):  # these are all unique columns for the design matrix
            ccounts = sorted(
                (conc.get(code, 0), code) for code in conc.keys())  # decorate
            cc = [(x[1], x[0]) for x in ccounts]  # list of code count tuples
            codeDetails = (head[useCols[i]], cc
                           )  # ('foo',[('a',3),('b',11),..])
            listCol.append(codeDetails)
        if len(listCol) > 0:
            res = listCol
            # metadata.pheCols becomes [('foo', [('bar', 22), ('zot', 113)]), ...]
        else:
            res = [
                ('no usable phenotype columns found', [
                    ('?', 0),
                ]),
            ]
        return res

    def get_pheno(self, dataset):
        """
        expects a .pheno file in the extra_files_dir - ugh
        note that R is weird and adds the row.name in
        the header so the columns are all wrong - unless you tell it not to.
        A file can be written as
        write.table(file='foo.pheno',pData(foo),sep='\t',quote=F,row.names=F)
        """
        with open(dataset.metadata.pheno_path) as f:
            p = f.readlines()
        if len(p) > 0:  # should only need to fix an R pheno file once
            head = p[0].strip().split('\t')
            line1 = p[1].strip().split('\t')
            if len(head) < len(line1):
                head.insert(0, 'ChipFileName')  # fix R write.table b0rken-ness
                p[0] = '\t'.join(head)
        else:
            p = []
        return '\n'.join(p)

    def set_peek(self, dataset, **kwd):
        """
        expects a .pheno file in the extra_files_dir - ugh
        note that R is weird and does not include the row.name in
        the header. why?"""
        if not dataset.dataset.purged:
            pp = os.path.join(dataset.extra_files_path,
                              '%s.pheno' % dataset.metadata.base_name)
            try:
                with open(pp) as f:
                    p = f.readlines()
            except Exception:
                p = [
                    '##failed to find %s' % pp,
                ]
            dataset.peek = ''.join(p[:5])
            dataset.blurb = 'Galaxy Rexpression composite file'
        else:
            dataset.peek = 'file does not exist\n'
            dataset.blurb = 'file purged from disk'

    def get_peek(self, dataset):
        """
        expects a .pheno file in the extra_files_dir - ugh
        """
        pp = os.path.join(dataset.extra_files_path,
                          '%s.pheno' % dataset.metadata.base_name)
        try:
            with open(pp) as f:
                p = f.readlines()
        except Exception:
            p = ['##failed to find %s' % pp]
        return ''.join(p[:5])

    def get_file_peek(self, filename):
        """
        can't really peek at a filename - need the extra_files_path and such?
        """
        h = ['## rexpression get_file_peek: no file found']
        try:
            with open(filename) as f:
                h = f.readlines()
        except Exception:
            pass
        return ''.join(h[:5])

    def regenerate_primary_file(self, dataset):
        """
        cannot do this until we are setting metadata
        """
        bn = dataset.metadata.base_name
        flist = os.listdir(dataset.extra_files_path)
        rval = [
            '<html><head><title>Files for Composite Dataset %s</title></head><p/>Comprises the following files:<p/><ul>'
            % (bn)
        ]
        for i, fname in enumerate(flist):
            sfname = os.path.split(fname)[-1]
            rval.append('<li><a href="{}">{}</a>'.format(sfname, sfname))
        rval.append('</ul></html>')
        with open(dataset.file_name, 'w') as f:
            f.write("\n".join(rval))
            f.write('\n')

    def init_meta(self, dataset, copy_from=None):
        if copy_from:
            dataset.metadata = copy_from.metadata

    def set_meta(self, dataset, **kwd):
        """
        NOTE we apply the tabular machinery to the phenodata extracted
        from a BioC eSet or affybatch.

        """
        Html.set_meta(self, dataset, **kwd)
        try:
            flist = os.listdir(dataset.extra_files_path)
        except Exception:
            if verbose:
                gal_Log.debug('@@@rexpression set_meta failed - no dataset?')
            return False
        bn = dataset.metadata.base_name
        if not bn:
            for f in flist:
                n = os.path.splitext(f)[0]
                bn = n
                dataset.metadata.base_name = bn
        if not bn:
            bn = '?'
            dataset.metadata.base_name = bn
        pn = '%s.pheno' % (bn)
        pp = os.path.join(dataset.extra_files_path, pn)
        dataset.metadata.pheno_path = pp
        try:
            with open(pp) as f:
                pf = f.readlines(
                )  # read the basename.phenodata in the extra_files_path
        except Exception:
            pf = None
        if pf:
            h = pf[0].strip()
            h = h.split('\t')  # hopefully the header row
            h = [escape(x) for x in h]
            dataset.metadata.column_names = h
            dataset.metadata.columns = len(h)
            dataset.peek = ''.join(pf[:5])
        else:
            dataset.metadata.column_names = []
            dataset.metadata.columns = 0
            dataset.peek = 'No pheno file found'
        if pf and len(pf) > 1:
            dataset.metadata.pheCols = self.get_phecols(phenolist=pf)
        else:
            dataset.metadata.pheCols = [
                ('', 'No usable phenotypes found', False),
            ]
        if not dataset.info:
            dataset.info = 'Galaxy Expression datatype object'
        if not dataset.blurb:
            dataset.blurb = 'R loadable BioC expression object for the Rexpression Galaxy toolkit'
        return True

    def make_html_table(self, pp='nothing supplied from peek\n'):
        """
        Create HTML table, used for displaying peek
        """
        out = [
            '<table cellspacing="0" cellpadding="3">',
        ]
        try:
            # Generate column header
            p = pp.split('\n')
            for i, row in enumerate(p):
                lrow = row.strip().split('\t')
                if i == 0:
                    orow = ['<th>%s</th>' % escape(x) for x in lrow]
                    orow.insert(0, '<tr>')
                    orow.append('</tr>')
                else:
                    orow = ['<td>%s</td>' % escape(x) for x in lrow]
                    orow.insert(0, '<tr>')
                    orow.append('</tr>')
                out.append(''.join(orow))
            out.append('</table>')
            out = "\n".join(out)
        except Exception as exc:
            out = "Can't create html table %s" % unicodify(exc)
        return out

    def display_peek(self, dataset):
        """
        Returns formatted html of peek
        """
        out = self.make_html_table(dataset.peek)
        return out
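
To make the pruning rules in get_phecols concrete, here is a tiny made-up phenotype table (assuming RexpBase can be instantiated on its own, as the doctests elsewhere in this collection do for other datatypes): the all-unique ChipFileName column and the constant site column are dropped, and of the perfectly paired group/batch columns only the first survives.

pheno = [
    "ChipFileName\tgroup\tbatch\tsite\n",
    "a1.cel\tcontrol\tA\tX\n",
    "a2.cel\tcontrol\tA\tX\n",
    "a3.cel\ttreated\tB\tX\n",
    "a4.cel\ttreated\tB\tX\n",
]
print(RexpBase().get_phecols(phenolist=pheno))
# -> [('group', [('control', 2), ('treated', 2)])]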
Exemplo n.º 17
0
class Otu(Text):
    file_ext = 'mothur.otu'
    MetadataElement(name="columns",
                    default=0,
                    desc="Number of columns",
                    readonly=True,
                    visible=True,
                    no_value=0)
    MetadataElement(name="labels",
                    default=[],
                    desc="Label Names",
                    readonly=True,
                    visible=True,
                    no_value=[])
    MetadataElement(name="otulabels",
                    default=[],
                    desc="OTU Names",
                    readonly=True,
                    visible=True,
                    no_value=[])

    def __init__(self, **kwd):
        super(Otu, self).__init__(**kwd)

    def set_meta(self, dataset, overwrite=True, **kwd):
        """
        Set metadata for Otu files.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> from galaxy.util.bunch import Bunch
        >>> dataset = Bunch()
        >>> dataset.metadata = Bunch
        >>> otu = Otu()
        >>> dataset.file_name = get_test_fname( 'mothur_datatypetest_true.mothur.otu' )
        >>> dataset.has_data = lambda: True
        >>> otu.set_meta(dataset)
        >>> dataset.metadata.columns
        100
        >>> len(dataset.metadata.labels) == 37
        True
        >>> len(dataset.metadata.otulabels) == 98
        True
        """
        super(Otu, self).set_meta(dataset, overwrite=overwrite, **kwd)

        if dataset.has_data():
            label_names = set()
            otulabel_names = set()
            ncols = 0
            data_lines = 0
            comment_lines = 0

            headers = iter_headers(dataset.file_name, sep='\t', count=-1)
            first_line = get_headers(dataset.file_name, sep='\t', count=1)
            if first_line:
                first_line = first_line[0]
            # set otulabels
            if len(first_line) > 2:
                otulabel_names = first_line[2:]
            # set label names and number of lines
            for line in headers:
                if len(line) >= 2 and not line[0].startswith('@'):
                    data_lines += 1
                    ncols = max(ncols, len(line))
                    label_names.add(line[0])
                else:
                    comment_lines += 1
            # Set the discovered metadata values for the dataset
            dataset.metadata.data_lines = data_lines
            dataset.metadata.columns = ncols
            dataset.metadata.labels = list(label_names)
            dataset.metadata.labels.sort()
            dataset.metadata.otulabels = list(otulabel_names)
            dataset.metadata.otulabels.sort()

    def sniff(self, filename):
        """
        Determines whether the file is otu (operational taxonomic unit) format

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.otu' )
        >>> Otu().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.otu' )
        >>> Otu().sniff( fname )
        False
        """
        headers = iter_headers(filename, sep='\t')
        count = 0
        for line in headers:
            if not line[0].startswith('@'):
                if len(line) < 2:
                    return False
                if count >= 1:
                    try:
                        check = int(line[1])
                        if check + 2 != len(line):
                            return False
                    except ValueError:
                        return False
                count += 1
        if count > 2:
            return True

        return False
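
Illustrative, made-up rows in the layout the sniffer above expects: after the header line, column 1 is a label and column 2 the number of OTU counts that follow, so int(line[1]) + 2 == len(line) must hold for every data line.

rows = [
    ["label", "numOtus", "Otu01", "Otu02", "Otu03"],   # header -> otulabels
    ["0.03", "3", "12", "5", "9"],
    ["0.05", "3", "10", "7", "8"],
    ["unique", "3", "15", "2", "6"],
]
for line in rows[1:]:
    assert int(line[1]) + 2 == len(line)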
Exemplo n.º 18
0
class Tabular( data.Text ):
    """Tab delimited data"""

    # All tabular data is chunkable.
    CHUNKABLE = True

    """Add metadata elements"""
    MetadataElement( name="comment_lines", default=0, desc="Number of comment lines", readonly=False, optional=True, no_value=0 )
    MetadataElement( name="columns", default=0, desc="Number of columns", readonly=True, visible=False, no_value=0 )
    MetadataElement( name="column_types", default=[], desc="Column types", param=metadata.ColumnTypesParameter, readonly=True, visible=False, no_value=[] )
    MetadataElement( name="column_names", default=[], desc="Column names", readonly=True, visible=False, optional=True, no_value=[] )

    def set_meta( self, dataset, overwrite = True, skip = None, max_data_lines = 100000, max_guess_type_data_lines = None, **kwd ):
        """
        Tries to determine the number of columns as well as those columns that
        contain numerical values in the dataset.  A skip parameter is used
        because various tabular data types reuse this function, and their data
        type classes are responsible to determine how many invalid comment
        lines should be skipped. Using None for skip will cause skip to be
        zero, but the first line will be processed as a header. A
        max_data_lines parameter is used because various tabular data types
        reuse this function, and their data type classes are responsible to
        determine how many data lines should be processed to ensure that the
        non-optional metadata parameters are properly set; if used, optional
        metadata parameters will be set to None, unless the entire file has
        already been read. Using None for max_data_lines will process all data
        lines.

        Items of interest:

        1. We treat 'overwrite' as always True (we always want to set tabular metadata when called).
        2. If a tabular file has no data, it will have one column of type 'str'.
        3. We used to check only the first 100 lines when setting metadata and this class's
           set_peek() method read the entire file to determine the number of lines in the file.
           Since metadata can now be processed on cluster nodes, we've merged the line count portion
           of the set_peek() processing here, and we now check the entire contents of the file.
        """
        # Store original skip value to check with later
        requested_skip = skip
        if skip is None:
            skip = 0
        column_type_set_order = [ 'int', 'float', 'list', 'str'  ] #Order to set column types in
        default_column_type = column_type_set_order[-1] # Default column type is lowest in list
        column_type_compare_order = list( column_type_set_order ) #Order to compare column types
        column_type_compare_order.reverse()
        def type_overrules_type( column_type1, column_type2 ):
            if column_type1 is None or column_type1 == column_type2:
                return False
            if column_type2 is None:
                return True
            for column_type in column_type_compare_order:
                if column_type1 == column_type:
                    return True
                if column_type2 == column_type:
                    return False
            #neither column type was found in our ordered list, this cannot happen
            raise ValueError( "Tried to compare unknown column types" )
        def is_int( column_text ):
            try:
                int( column_text )
                return True
            except:
                return False
        def is_float( column_text ):
            try:
                float( column_text )
                return True
            except:
                if column_text.strip().lower() == 'na':
                    return True #na is special cased to be a float
                return False
        def is_list( column_text ):
            return "," in column_text
        def is_str( column_text ):
            #anything, except an empty string, is True
            if column_text == "":
                return False
            return True
        is_column_type = {} #Dict to store column type string to checking function
        for column_type in column_type_set_order:
            is_column_type[column_type] = locals()[ "is_%s" % ( column_type ) ]
        def guess_column_type( column_text ):
            for column_type in column_type_set_order:
                if is_column_type[column_type]( column_text ):
                    return column_type
            return None
        data_lines = 0
        comment_lines = 0
        column_types = []
        first_line_column_types = [default_column_type] # default value is one column of type str
        if dataset.has_data():
            #NOTE: if skip > num_check_lines, we won't detect any metadata, and will use default
            dataset_fh = open( dataset.file_name )
            i = 0
            while True:
                line = dataset_fh.readline()
                if not line: break
                line = line.rstrip( '\r\n' )
                if i < skip or not line or line.startswith( '#' ):
                    # We'll call blank lines comments
                    comment_lines += 1
                else:
                    data_lines += 1
                    if max_guess_type_data_lines is None or data_lines <= max_guess_type_data_lines:
                        fields = line.split( '\t' )
                        for field_count, field in enumerate( fields ):
                            if field_count >= len( column_types ): #found a previously unknown column, we append None
                                column_types.append( None )
                            column_type = guess_column_type( field )
                            if type_overrules_type( column_type, column_types[field_count] ):
                                column_types[field_count] = column_type
                    if i == 0 and requested_skip is None:
                        # This is our first line, people seem to like to upload files that have a header line, but do not
                        # start with '#' (i.e. all column types would then most likely be detected as str).  We will assume
                        # that the first line is always a header (this was previous behavior - it was always skipped).  When
                        # the requested skip is None, we only use the data from the first line if we have no other data for
                        # a column.  This is far from perfect, as
                        # 1,2,3	1.1	2.2	qwerty
                        # 0	0		1,2,3
                        # will be detected as
                        # "column_types": ["int", "int", "float", "list"]
                        # instead of
                        # "column_types": ["list", "float", "float", "str"]  *** would seem to be the 'Truth' by manual
                        # observation that the first line should be included as data.  The old method would have detected as
                        # "column_types": ["int", "int", "str", "list"]
                        first_line_column_types = column_types
                        column_types = [ None for col in first_line_column_types ]
                if max_data_lines is not None and data_lines >= max_data_lines:
                    if dataset_fh.tell() != dataset.get_size():
                        data_lines = None #Clear optional data_lines metadata value
                        comment_lines = None #Clear optional comment_lines metadata value; additional comment lines could appear below this point
                    break
                i += 1
            dataset_fh.close()

        #we err on the side of the larger number of columns
        #first we pad our column_types by using data from first line
        if len( first_line_column_types ) > len( column_types ):
            for column_type in first_line_column_types[len( column_types ):]:
                column_types.append( column_type )
        #Now we fill any unknown (None) column_types with data from first line
        for i in range( len( column_types ) ):
            if column_types[i] is None:
                if len( first_line_column_types ) <= i or first_line_column_types[i] is None:
                    column_types[i] = default_column_type
                else:
                    column_types[i] = first_line_column_types[i]
        # Set the discovered metadata values for the dataset
        dataset.metadata.data_lines = data_lines
        dataset.metadata.comment_lines = comment_lines
        dataset.metadata.column_types = column_types
        dataset.metadata.columns = len( column_types )
    def make_html_table( self, dataset, **kwargs ):
        """Create HTML table, used for displaying peek"""
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            out.append( self.make_html_peek_header( dataset, **kwargs ) )
            out.append( self.make_html_peek_rows( dataset, **kwargs ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception as exc:
            out = "Can't create peek %s" % str( exc )
        return out
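
A standalone sketch (a simplified re-implementation, not the class code itself) of the widening order used above when guessing column types: a later value can only move a column from int to float to list to str, never back.

def guess(text):
    # simplified stand-in for guess_column_type() above
    try:
        int(text)
        return 'int'
    except ValueError:
        pass
    try:
        float(text)
        return 'float'
    except ValueError:
        if text.strip().lower() == 'na':
            return 'float'   # 'na' is special-cased as float
    if ',' in text:
        return 'list'
    return 'str' if text else None

order = ['int', 'float', 'list', 'str']
current = None
for value in ['7', 'na', '1,2', 'foo']:     # successive values in one column
    guessed = guess(value)
    if current is None or order.index(guessed) > order.index(current):
        current = guessed
    print(value, '->', guessed, '| column type so far:', current)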
Exemplo n.º 19
0
class Quantile(Tabular):
    file_ext = 'mothur.quan'
    MetadataElement(name="filtered",
                    default=False,
                    no_value=False,
                    optional=True,
                    desc="Quantiles calculated using a mask",
                    readonly=True)
    MetadataElement(name="masked",
                    default=False,
                    no_value=False,
                    optional=True,
                    desc="Quantiles calculated using a frequency filter",
                    readonly=True)

    def __init__(self, **kwd):
        """Quantiles for chimera analysis"""
        super(Quantile, self).__init__(**kwd)
        self.column_names = [
            'num', 'ten', 'twentyfive', 'fifty', 'seventyfive', 'ninetyfive',
            'ninetynine'
        ]
        self.column_types = [
            'int', 'float', 'float', 'float', 'float', 'float', 'float'
        ]

    def sniff(self, filename):
        """
        Determines whether the file is a quantiles tabular format for chimera analysis
        1	0	0	0	0	0	0
        2       0.309198        0.309198        0.37161 0.37161 0.37161 0.37161
        3       0.510982        0.563213        0.693529        0.858939        1.07442 1.20608
        ...

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.quan' )
        >>> Quantile().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.quan' )
        >>> Quantile().sniff( fname )
        False
        """
        headers = iter_headers(filename, sep='\t')
        count = 0
        for line in headers:
            if not line[0].startswith('@') and not line[0].startswith('#'):
                if len(line) != 7:
                    return False
                try:
                    int(line[0])
                    float(line[1])
                    float(line[2])
                    float(line[3])
                    float(line[4])
                    float(line[5])
                    float(line[6])
                except Exception:
                    return False
                count += 1
        if count > 0:
            return True

        return False
Exemplo n.º 20
0
class Pileup( Tabular ):
    """Tab delimited data in pileup (6- or 10-column) format"""
    file_ext = "pileup"
    line_class = "genomic coordinate"
    data_sources = { "data": "tabix" }

    """Add metadata elements"""
    MetadataElement( name="chromCol", default=1, desc="Chrom column", param=metadata.ColumnParameter )
    MetadataElement( name="startCol", default=2, desc="Start column", param=metadata.ColumnParameter )
    MetadataElement( name="endCol", default=2, desc="End column", param=metadata.ColumnParameter )
    MetadataElement( name="baseCol", default=3, desc="Reference base column", param=metadata.ColumnParameter )

    def init_meta( self, dataset, copy_from=None ):
        Tabular.init_meta( self, dataset, copy_from=copy_from )

    def display_peek( self, dataset ):
        """Returns formatted html of peek"""
        return Tabular.make_html_table( self, dataset, column_parameter_alias={'chromCol':'Chrom', 'startCol':'Start', 'baseCol':'Base'} )

    def repair_methods( self, dataset ):
        """Return options for removing errors along with a description"""
        return [ ("lines", "Remove erroneous lines") ]

    def sniff( self, filename ):
        """
        Checks for 'pileup-ness'

        There are two main types of pileup: 6-column and 10-column. For both,
        the first three and last two columns are the same. We only check the
        first three to allow for some personalization of the format.

        >>> fname = get_test_fname( 'interval.interval' )
        >>> Pileup().sniff( fname )
        False
        >>> fname = get_test_fname( '6col.pileup' )
        >>> Pileup().sniff( fname )
        True
        >>> fname = get_test_fname( '10col.pileup' )
        >>> Pileup().sniff( fname )
        True
        """
        headers = get_headers( filename, '\t' )
        try:
            for hdr in headers:
                if hdr and not hdr[0].startswith( '#' ):
                    if len( hdr ) < 3:
                        return False
                    try:
                        # chrom start in column 1 (with 0-based columns)
                        # and reference base is in column 2
                        check = int( hdr[1] )
                        assert hdr[2] in [ 'A', 'C', 'G', 'T', 'N', 'a', 'c', 'g', 't', 'n' ]
                    except:
                        return False
            return True
        except:
            return False

    # ------------- Dataproviders
    @dataproviders.decorators.dataprovider_factory( 'genomic-region',
                                                    dataproviders.dataset.GenomicRegionDataProvider.settings )
    def genomic_region_dataprovider( self, dataset, **settings ):
        return dataproviders.dataset.GenomicRegionDataProvider( dataset, **settings )

    @dataproviders.decorators.dataprovider_factory( 'genomic-region-dict',
                                                    dataproviders.dataset.GenomicRegionDataProvider.settings )
    def genomic_region_dict_dataprovider( self, dataset, **settings ):
        settings[ 'named_columns' ] = True
        return self.genomic_region_dataprovider( dataset, **settings )
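
Made-up 6-column pileup rows that satisfy the sniff checks above: an integer position in column 2 and a reference base in column 3 (the remaining depth, read-base, and quality columns are not inspected).

rows = [
    ["chr1", "100", "A", "5", ".....", "IIIII"],
    ["chr1", "101", "c", "3", ",,.", "III"],
]
for hdr in rows:
    assert len(hdr) >= 3
    int(hdr[1])                                   # position parses as an integer
    assert hdr[2] in ['A', 'C', 'G', 'T', 'N', 'a', 'c', 'g', 't', 'n']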
Exemplo n.º 21
0
class Arff(Text):
    """
        An ARFF (Attribute-Relation File Format) file is an ASCII text file that describes a list of instances sharing a set of attributes.
        http://weka.wikispaces.com/ARFF
    """
    file_ext = "arff"
    """Add metadata elements"""
    MetadataElement(name="comment_lines",
                    default=0,
                    desc="Number of comment lines",
                    readonly=True,
                    optional=True,
                    no_value=0)
    MetadataElement(name="columns",
                    default=0,
                    desc="Number of columns",
                    readonly=True,
                    visible=True,
                    no_value=0)

    def set_peek(self, dataset, is_multi_byte=False):
        if not dataset.dataset.purged:
            dataset.peek = get_file_peek(dataset.file_name,
                                         is_multi_byte=is_multi_byte)
            dataset.blurb = "Attribute-Relation File Format (ARFF)"
            dataset.blurb += ", %s comments, %s attributes" % (
                dataset.metadata.comment_lines, dataset.metadata.columns)
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disc'

    def sniff(self, filename):
        """
            Try to guess the ARFF filetype.
            An ARFF file usually starts with a @RELATION line, followed by @ATTRIBUTE lines and a @DATA section.
        """
        with open(filename) as handle:
            relation_found = False
            attribute_found = False
            prefix = ""
            for line_count, line in enumerate(handle):
                if line_count > 1000:
                    # only investigate the first 1000 lines
                    return False
                line = line.strip()
                if not line:
                    continue

                start_string = line[:20].upper()
                if start_string.startswith("@RELATION"):
                    relation_found = True
                elif start_string.startswith("@ATTRIBUTE"):
                    attribute_found = True
                elif start_string.startswith("@DATA"):
                    # @DATA should be the last data block
                    if relation_found and attribute_found:
                        return True
        return False

    def set_meta(self, dataset, **kwd):
        """
            Trying to count the comment lines and the number of columns included.
            A typical ARFF data block looks like this:
            @DATA
            5.1,3.5,1.4,0.2,Iris-setosa
            4.9,3.0,1.4,0.2,Iris-setosa
        """
        if dataset.has_data():
            comment_lines = 0
            column_count = 0
            first_real_line = False
            data_block = False
            with open(dataset.file_name) as handle:
                for line in handle:
                    line = line.strip()
                    if not line:
                        continue
                    if line.startswith('%') and not first_real_line:
                        comment_lines += 1
                    else:
                        first_real_line = True
                    if data_block:
                        if line.startswith('{'):
                            # Sparse representation
                            """
                                @data
                                0, X, 0, Y, "class A", {5}
                            or
                                @data
                                {1 X, 3 Y, 4 "class A"}, {5}
                            """
                            token = line.split('}', 1)
                            first_part = token[0]
                            last_column = first_part.split(',')[-1].strip()
                            numeric_value = last_column.split()[0]
                            column_count = int(numeric_value)
                            if len(token) > 1:
                                # we have an additional weight
                                column_count -= 1
                        else:
                            columns = line.strip().split(',')
                            column_count = len(columns)
                            if columns[-1].strip().startswith('{'):
                                # we have an additional weight at the end
                                column_count -= 1

                        # We have now the column_count and we know the initial comment lines. So we can terminate here.
                        break
                    if line[:5].upper() == "@DATA":
                        data_block = True
            dataset.metadata.comment_lines = comment_lines
            dataset.metadata.columns = column_count
Exemplo n.º 22
0
class Infernal_CM_1_1(Text):
    file_ext = "cm"

    MetadataElement(name="number_of_models",
                    default=0,
                    desc="Number of covariance models",
                    readonly=True,
                    visible=True,
                    optional=True,
                    no_value=0)

    def set_peek(self, dataset, is_multi_byte=False):
        if not dataset.dataset.purged:
            dataset.peek = get_file_peek(dataset.file_name,
                                         is_multi_byte=is_multi_byte)
            if dataset.metadata.number_of_models == 1:
                dataset.blurb = "1 model"
            else:
                dataset.blurb = "%s models" % dataset.metadata.number_of_models
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disc'

    def sniff(self, filename):
        return count_special_lines("^INFERNAL1/a", filename) > 0

    def set_meta(self, dataset, **kwd):
        """
        Set the number of models in dataset.
        """
        dataset.metadata.number_of_models = count_special_lines(
            "^INFERNAL1/a", dataset.file_name)

    @classmethod
    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by model records.
        """
        if split_params is None:
            return None

        if len(input_datasets) > 1:
            raise Exception(
                "CM-file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            raise Exception(
                'Split mode "%s" is currently not implemented for CM-files.' %
                split_params['split_mode'])
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' %
                            split_params['split_mode'])

        def _read_cm_records(filename):
            lines = []
            with open(filename) as handle:
                for line in handle:
                    if line.startswith("INFERNAL1/a") and lines:
                        yield lines
                        lines = [line]
                    else:
                        lines.append(line)
            yield lines

        def _write_part_cm_file(accumulated_lines):
            part_dir = subdir_generator_function()
            part_path = os.path.join(part_dir,
                                     os.path.basename(input_files[0]))
            part_file = open(part_path, 'w')
            part_file.writelines(accumulated_lines)
            part_file.close()

        try:
            cm_records = _read_cm_records(input_files[0])
            cm_lines_accumulated = []
            for counter, cm_record in enumerate(cm_records, start=1):
                cm_lines_accumulated.extend(cm_record)
                if counter % chunk_size == 0:
                    _write_part_cm_file(cm_lines_accumulated)
                    cm_lines_accumulated = []
            if cm_lines_accumulated:
                _write_part_cm_file(cm_lines_accumulated)
        except Exception as e:
            log.error('Unable to split files: %s' % str(e))
            raise
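
A minimal sketch of the record grouping done by the nested _read_cm_records helper above, copied out as a standalone function and run on an in-memory stand-in (the CM content is made up): each record starts at an 'INFERNAL1/a' line.

import io

def read_cm_records(handle):
    # standalone copy of the nested _read_cm_records() logic above
    lines = []
    for line in handle:
        if line.startswith("INFERNAL1/a") and lines:
            yield lines
            lines = [line]
        else:
            lines.append(line)
    yield lines

fake_cm = ("INFERNAL1/a [made-up header]\nNAME tRNA\n//\n"
           "INFERNAL1/a [made-up header]\nNAME 5S_rRNA\n//\n")
records = list(read_cm_records(io.StringIO(fake_cm)))
print(len(records))             # -> 2
print(records[1][1].strip())    # -> NAME 5S_rRNA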
Exemplo n.º 23
0
class SnpSiftDbNSFP(Text):
    """Class describing a dbNSFP database prepared for use by SnpSift dbnsfp"""
    MetadataElement(name='reference_name',
                    default='dbSNFP',
                    desc='Reference Name',
                    readonly=True,
                    visible=True,
                    set_in_upload=True,
                    no_value='dbSNFP')
    MetadataElement(name="bgzip",
                    default=None,
                    desc="dbNSFP bgzip",
                    readonly=True,
                    visible=True,
                    no_value=None)
    MetadataElement(name="index",
                    default=None,
                    desc="Tabix Index File",
                    readonly=True,
                    visible=True,
                    no_value=None)
    MetadataElement(name="annotation",
                    default=[],
                    desc="Annotation Names",
                    readonly=True,
                    visible=True,
                    no_value=[])
    file_ext = "snpsiftdbnsfp"
    composite_type = 'auto_primary_file'
    allow_datatype_change = False
    """
    ## The dbNSFP file is a tabular file with 1 header line
    ## The first 4 columns are required to be: chrom	pos	ref	alt
    ## These match columns 1,2,4,5 of the VCF file
    ## SnpSift requires the file to be block-gzipped and then indexed with samtools tabix
    ## Example:
    ## Compress using block-gzip algorithm
    bgzip dbNSFP2.3.txt
    ## Create tabix index
    tabix -s 1 -b 2 -e 2 dbNSFP2.3.txt.gz
    """
    def __init__(self, **kwd):
        Text.__init__(self, **kwd)
        self.add_composite_file('%s.grp',
                                description='Group File',
                                substitute_name_with_metadata='reference_name',
                                is_binary=False)
        self.add_composite_file('%s.ti',
                                description='',
                                substitute_name_with_metadata='reference_name',
                                is_binary=False)

    def init_meta(self, dataset, copy_from=None):
        Text.init_meta(self, dataset, copy_from=copy_from)

    def generate_primary_file(self, dataset=None):
        """
        This is called only at upload, to write the html file.
        The datasets cannot be renamed here - they arrive with their default names, unfortunately.
        """
        self.regenerate_primary_file(dataset)

    def regenerate_primary_file(self, dataset):
        """
        cannot do this until we are setting metadata
        """
        annotations = "dbNSFP Annotations: %s\n" % ','.join(
            dataset.metadata.annotation)
        with open(dataset.file_name, 'a') as f:
            if dataset.metadata.bgzip:
                f.write(dataset.metadata.bgzip)
                f.write('\n')
            f.write(annotations)

    def set_meta(self, dataset, overwrite=True, **kwd):
        try:
            efp = dataset.extra_files_path
            if os.path.exists(efp):
                flist = os.listdir(efp)
                for i, fname in enumerate(flist):
                    if fname.endswith('.gz'):
                        dataset.metadata.bgzip = fname
                        try:
                            # open in text mode so the header line splits as str
                            with gzip.open(os.path.join(efp, fname), 'rt') as fh:
                                buf = fh.read(5000)
                                lines = buf.splitlines()
                                headers = lines[0].split('\t')
                                dataset.metadata.annotation = headers[4:]
                        except Exception as e:
                            log.warning("set_meta fname: %s  %s" %
                                        (fname, str(e)))
                    if fname.endswith('.tbi'):
                        dataset.metadata.index = fname
            self.regenerate_primary_file(dataset)
        except Exception as e:
            log.warning("set_meta fname: %s  %s" %
                        (dataset.file_name
                         if dataset and dataset.file_name else 'Unknown', str(e)))
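
# A minimal sketch, not part of the class above: preparing a dbNSFP text dump
# the way the quoted shell commands describe, driven from Python.  The path
# 'dbNSFP2.3.txt' is a placeholder and the bgzip/tabix executables are assumed
# to be available on PATH.
import subprocess


def prepare_dbnsfp(txt_path='dbNSFP2.3.txt'):
    # Compress with the block-gzip algorithm so tabix can seek into the file.
    subprocess.run(['bgzip', txt_path], check=True)
    gz_path = txt_path + '.gz'
    # Index on the chrom column (1), with begin/end positions in column 2.
    subprocess.run(['tabix', '-s', '1', '-b', '2', '-e', '2', gz_path], check=True)
    # The composite dataset then carries gz_path plus its '.tbi' index.
    return gz_path, gz_path + '.tbi'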
Exemplo n.º 24
0
class Vtk:
    r"""
    The Visualization Toolkit provides a number of source and writer objects to
    read and write popular data file formats. The Visualization Toolkit also
    provides some of its own file formats.

    There are two different styles of file formats available in VTK. The simplest
    are the legacy, serial formats that are easy to read and write either by hand
    or programmatically. However, these formats are less flexible than the XML
    based file formats which support random access, parallel I/O, and portable
    data compression and are preferred to the serial VTK file formats whenever
    possible.

    All keyword phrases are written in ASCII form whether the file is binary or
    ASCII. The binary section of the file (if in binary form) is the data proper;
    i.e., the numbers that define points coordinates, scalars, cell indices, and
    so forth.

    Binary data must be placed into the file immediately after the newline
    ('\\n') character from the previous ASCII keyword and parameter sequence.

    TODO: only legacy formats are currently supported and support for XML formats
    should be added.
    """
    subtype = ''
    # Add metadata elements.
    MetadataElement(name="vtk_version",
                    default=None,
                    desc="Vtk version",
                    readonly=True,
                    optional=True,
                    visible=True)
    MetadataElement(name="file_format",
                    default=None,
                    desc="File format",
                    readonly=True,
                    optional=True,
                    visible=True)
    MetadataElement(name="dataset_type",
                    default=None,
                    desc="Dataset type",
                    readonly=True,
                    optional=True,
                    visible=True)

    # STRUCTURED_GRID data_type.
    MetadataElement(name="dimensions",
                    default=[],
                    desc="Dimensions",
                    readonly=True,
                    optional=True,
                    visible=True,
                    no_value=[])
    MetadataElement(name="origin",
                    default=[],
                    desc="Origin",
                    readonly=True,
                    optional=True,
                    visible=True,
                    no_value=[])
    MetadataElement(name="spacing",
                    default=[],
                    desc="Spacing",
                    readonly=True,
                    optional=True,
                    visible=True,
                    no_value=[])

    # POLYDATA data_type (the Points element is also a component of UNSTRUCTURED_GRID).
    MetadataElement(name="points",
                    default=None,
                    desc="Points",
                    readonly=True,
                    optional=True,
                    visible=True)
    MetadataElement(name="vertices",
                    default=None,
                    desc="Vertices",
                    readonly=True,
                    optional=True,
                    visible=True)
    MetadataElement(name="lines",
                    default=None,
                    desc="Lines",
                    readonly=True,
                    optional=True,
                    visible=True)
    MetadataElement(name="polygons",
                    default=None,
                    desc="Polygons",
                    readonly=True,
                    optional=True,
                    visible=True)
    MetadataElement(name="triangle_strips",
                    default=None,
                    desc="Triangle strips",
                    readonly=True,
                    optional=True,
                    visible=True)

    # UNSTRUCTURED_GRID data_type.
    MetadataElement(name="cells",
                    default=None,
                    desc="Cells",
                    readonly=True,
                    optional=True,
                    visible=True)

    # Additional elements not categorized by data_type.
    MetadataElement(name="field_names",
                    default=[],
                    desc="Field names",
                    readonly=True,
                    optional=True,
                    visible=True,
                    no_value=[])
    # The keys in the field_components map to the list of field_names in the above element
    # which ensures order for select list options that are built from it.
    MetadataElement(name="field_components",
                    default={},
                    desc="Field names and components",
                    readonly=True,
                    optional=True,
                    visible=True,
                    no_value={})

    @abc.abstractmethod
    def __init__(self, **kwd):
        raise NotImplementedError

    def sniff_prefix(self, file_prefix: FilePrefix):
        """
        VTK files can be either ASCII or binary, with two different
        styles of file formats: legacy or XML.  We'll assume if the
        file contains a valid VTK header, then it is a valid VTK file.
        """
        if self._is_vtk_header(file_prefix.text_io(errors='ignore'),
                               self.subtype):
            return True
        return False

    def _is_vtk_header(self, fh, subtype):
        """
        The Header section consists of at least 4, but possibly
        5 lines.  This is tricky because sometimes the 4th line
        is blank (in which case the 5th line consists of the
        data_kind) or the 4th line consists of the data_kind (in
        which case the 5th line is blank).
        """
        data_kinds = [
            'STRUCTURED_GRID', 'POLYDATA', 'UNSTRUCTURED_GRID',
            'STRUCTURED_POINTS', 'RECTILINEAR_GRID'
        ]

        def check_data_kind(line):
            for data_kind in data_kinds:
                if line.find(data_kind) >= 0:
                    return True
            return False

        # Line 1: vtk DataFile Version 3.0
        line = get_next_line(fh)
        if line.find('vtk') < 0:
            return False
        # Line 2: can be anything - skip it
        line = get_next_line(fh)
        # Line 3: ASCII or BINARY
        line = get_next_line(fh)
        if line.find(subtype) < 0:
            return False
        # Line 4:
        line = get_next_line(fh)
        if line:
            return check_data_kind(line)
        # line 5:
        line = get_next_line(fh)
        if line:
            return check_data_kind(line)
        return False

    def set_meta(self, dataset, **kwd):
        if dataset.has_data():
            dataset.metadata.field_names = []
            dataset.metadata.field_components = {}
            dataset_type = None
            field_components = {}
            dataset_structure_complete = False
            processing_field_section = False
            with open(dataset.file_name, errors='ignore') as fh:
                for i, line in enumerate(fh):
                    line = line.strip()
                    if not line:
                        continue
                    if i < 3:
                        dataset = self.set_initial_metadata(i, line, dataset)
                    elif dataset.metadata.file_format == 'ASCII' or not util.is_binary(
                            line):
                        if dataset_structure_complete:
                            """
                            The final part of legacy VTK files describes the dataset attributes.
                            This part begins with the keywords POINT_DATA or CELL_DATA, followed
                            by an integer number specifying the number of points or cells,
                            respectively. Other keyword/data combinations then define the actual
                            dataset attribute values (i.e., scalars, vectors, tensors, normals,
                            texture coordinates, or field data).  Dataset attributes are supported
                            for both points and cells.

                            Each type of attribute data has a dataName associated with it. This is
                            a character string (without embedded whitespace) used to identify a
                            particular data.  The dataName is used by the VTK readers to extract
                            data. As a result, more than one attribute data of the same type can be
                            included in a file.  For example, two different scalar fields defined
                            on the dataset points, pressure and temperature, can be contained in
                            the same file.  If the appropriate dataName is not specified in the VTK
                            reader, then the first data of that type is extracted from the file.
                            """
                            items = line.split()
                            if items[0] == 'SCALARS':
                                # Example: SCALARS surface_field double 3
                                # Scalar definition includes specification of a lookup table. The
                                # definition of a lookup table is optional. If not specified, the
                                # default VTK table will be used, and tableName should be
                                # "default". Also note that the numComp variable is optional.  By
                                # default the number of components is equal to one.  The parameter
                                # numComp must range between (1,4) inclusive; in versions of VTK
                                # prior to vtk2.3 this parameter was not supported.
                                field_name = items[1]
                                dataset.metadata.field_names.append(field_name)
                                try:
                                    num_components = int(items[-1])
                                except Exception:
                                    num_components = 1
                                field_component_indexes = [
                                    str(i) for i in range(num_components)
                                ]
                                field_components[
                                    field_name] = field_component_indexes
                            elif items[0] == 'FIELD':
                                # The dataset consists of CELL_DATA.
                                # FIELD FieldData 2
                                processing_field_section = True
                                num_fields = int(items[-1])
                                fields_processed: List[str] = []
                            elif processing_field_section:
                                if len(fields_processed) == num_fields:
                                    processing_field_section = False
                                else:
                                    try:
                                        float(items[0])
                                        # Don't process the cell data.
                                        # 0.0123457 0.197531
                                    except Exception:
                                        # Line consists of arrayName numComponents numTuples dataType.
                                        # Example: surface_field1 1 12 double
                                        field_name = items[0]
                                        dataset.metadata.field_names.append(
                                            field_name)
                                        num_components = int(items[1])
                                        field_component_indexes = [
                                            str(i)
                                            for i in range(num_components)
                                        ]
                                        field_components[
                                            field_name] = field_component_indexes
                                        fields_processed.append(field_name)
                        elif line.startswith('CELL_DATA'):
                            # CELL_DATA 3188
                            dataset_structure_complete = True
                            dataset.metadata.cells = int(line.split()[1])
                        elif line.startswith('POINT_DATA'):
                            # POINT_DATA 1876
                            dataset_structure_complete = True
                            dataset.metadata.points = int(line.split()[1])
                        else:
                            dataset, dataset_type = self.set_structure_metadata(
                                line, dataset, dataset_type)
            if len(field_components) > 0:
                dataset.metadata.field_components = field_components

    def set_initial_metadata(self, i, line, dataset):
        if i == 0:
            # The first part of legacy VTK files is the file version and
            # identifier. This part contains the single line:
            # # vtk DataFile Version X.Y
            dataset.metadata.vtk_version = line.lower().split('version')[1]
            # The second part of legacy VTK files is the header. The header
            # consists of a character string terminated by end-of-line
            # character \n. The header is 256 characters maximum. The header
            # can be used to describe the data and include any other pertinent
            # information.  We skip the header line...
        elif i == 2:
            # The third part of legacy VTK files is the file format.  The file
            # format describes the type of file, either ASCII or binary. On
            # this line the single word ASCII or BINARY must appear.
            dataset.metadata.file_format = line
        return dataset

    def set_structure_metadata(self, line, dataset, dataset_type):
        """
        The fourth part of legacy VTK files is the dataset structure. The
        geometry part describes the geometry and topology of the dataset.
        This part begins with a line containing the keyword DATASET followed
        by a keyword describing the type of dataset.  Then, depending upon
        the type of dataset, other keyword/ data combinations define the
        actual data.
        """
        if dataset_type is None and line.startswith('DATASET'):
            dataset_type = line.split()[1]
            dataset.metadata.dataset_type = dataset_type
        if dataset_type == 'STRUCTURED_GRID':
            # The STRUCTURED_GRID format supports 1D, 2D, and 3D structured
            # grid datasets.  The dimensions nx, ny, nz must be greater
            # than or equal to 1.  The point coordinates are defined by the
            # data in the POINTS section. This consists of x-y-z data values
            # for each point.
            if line.startswith('DIMENSIONS'):
                # DIMENSIONS 10 5 1
                dataset.metadata.dimensions = [line.split()[1:]]
            elif line.startswith('ORIGIN'):
                # ORIGIN 0 0 0
                dataset.metadata.origin = [line.split()[1:]]
            elif line.startswith('SPACING'):
                # SPACING 1 1 1
                dataset.metadata.spacing = [line.split()[1:]]
        elif dataset_type == 'POLYDATA':
            # The polygonal dataset consists of arbitrary combinations
            # of surface graphics primitives vertices, lines, polygons
            # and triangle strips.  Polygonal data is defined by the POINTS,
            # VERTICES, LINES, POLYGONS, or TRIANGLE_STRIPS sections.
            if line.startswith('POINTS'):
                # POINTS 18 float
                dataset.metadata.points = int(line.split()[1])
            elif line.startswith('VERTICES'):
                dataset.metadata.vertices = int(line.split()[1])
            elif line.startswith('LINES'):
                # LINES 5 17
                dataset.metadata.lines = int(line.split()[1])
            elif line.startswith('POLYGONS'):
                # POLYGONS 6 30
                dataset.metadata.polygons = int(line.split()[1])
            elif line.startswith('TRIANGLE_STRIPS'):
                # TRIANGLE_STRIPS 2212 16158
                dataset.metadata.triangle_strips = int(line.split()[1])
        elif dataset_type == 'UNSTRUCTURED_GRID':
            # The unstructured grid dataset consists of arbitrary combinations
            # of any possible cell type. Unstructured grids are defined by points,
            # cells, and cell types.
            if line.startswith('POINTS'):
                # POINTS 18 float
                dataset.metadata.points = int(line.split()[1])
            if line.startswith('CELLS'):
                # CELLS 756 3024
                dataset.metadata.cells = int(line.split()[1])
        return dataset, dataset_type

    def get_blurb(self, dataset):
        blurb = ""
        if dataset.metadata.vtk_version is not None:
            blurb += f'VTK Version {str(dataset.metadata.vtk_version)}'
        if dataset.metadata.dataset_type is not None:
            if blurb:
                blurb += ' '
            blurb += str(dataset.metadata.dataset_type)
        return blurb or 'VTK data'

    def set_peek(self, dataset):
        if not dataset.dataset.purged:
            dataset.peek = get_file_peek(dataset.file_name)
            dataset.blurb = self.get_blurb(dataset)
        else:
            dataset.peek = 'File does not exist'
            dataset.blurb = 'File purged from disc'

    def display_peek(self, dataset):
        try:
            return dataset.peek
        except Exception:
            return f"Vtk file ({nice_size(dataset.get_size())})"
Exemplo n.º 25
0
class Gdm(Tabular):
    """Tab delimited data in Gdm format"""

    file_ext = "gdm"
    column_names = ['chr', 'left', 'right', 'strand', 'name', 'score']

    MetadataElement(name='columns',
                    default='6',
                    desc='Number of Columns',
                    readonly=True,
                    visible=False)
    MetadataElement(name='column_types',
                    default=['str', 'int', 'int', 'str', 'str', 'float'],
                    param=metadata.ColumnTypesParameter,
                    desc="Column types",
                    readonly=True,
                    visible=False)

    def display_peek(self, dataset):
        """Returns formatted html of peek"""

        return self.make_html_table(dataset, column_names=self.column_names)

    def sniff(self, filename):
        """
        Determines whether a file is in gdm format

        GDM files have at least 6 required fields.
        (Actually in the format definition only the first 5 are mandatory, but the ones returned by the system have
        always at least 6).

        Required fields must be tab separated.

        Columns 0, 3, 4 must be strings.
        Columns 1, 2, 5 numbers.

        Column 5 (Score) can be not provided.


        """

        headers = get_headers(filename, '\t', count=10)

        try:
            for hdr in headers:
                if hdr and hdr[0] and not hdr[0].startswith('#'):
                    if len(hdr) != 6:
                        return False
                    try:
                        int(hdr[1])
                        int(hdr[2])
                    except ValueError:
                        return False
                    if hdr[5] != '.':
                        try:
                            float(hdr[5])
                        except ValueError:
                            return False
                    return True
        except Exception:
            return False
        return False
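
# A minimal sketch, not part of the Gdm class above: the same per-line check
# the sniffer applies, written as a standalone helper with two sample rows.
# The row contents are invented for illustration.
def looks_like_gdm_row(row):
    fields = row.rstrip('\n').split('\t')
    if len(fields) != 6:
        return False
    try:
        int(fields[1])           # left
        int(fields[2])           # right
        if fields[5] != '.':
            float(fields[5])     # score, '.' when not provided
    except ValueError:
        return False
    return True


print(looks_like_gdm_row('chr1\t100\t200\t+\tgene_a\t0.5'))  # True
print(looks_like_gdm_row('chr1\t100\t200\t+\tgene_a'))       # False (only 5 fields)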
Exemplo n.º 26
0
class Ply:
    """
    The PLY format describes an object as a collection of vertices,
    faces and other elements, along with properties such as color and
    normal direction that can be attached to these elements.  A PLY
    file contains the description of exactly one object.
    """
    subtype = ''
    # Add metadata elements.
    MetadataElement(name="file_format",
                    default=None,
                    desc="File format",
                    readonly=True,
                    optional=True,
                    visible=True)
    MetadataElement(name="vertex",
                    default=None,
                    desc="Vertex",
                    readonly=True,
                    optional=True,
                    visible=True)
    MetadataElement(name="face",
                    default=None,
                    desc="Face",
                    readonly=True,
                    optional=True,
                    visible=True)
    MetadataElement(name="other_elements",
                    default=[],
                    desc="Other elements",
                    readonly=True,
                    optional=True,
                    visible=True,
                    no_value=[])

    @abc.abstractmethod
    def __init__(self, **kwd):
        raise NotImplementedError

    def sniff_prefix(self, file_prefix: FilePrefix):
        """
        The structure of a typical PLY file:
        Header, Vertex List, Face List, (lists of other elements)
        """
        if not self._is_ply_header(file_prefix.text_io(errors='ignore'),
                                   self.subtype):
            return False
        return True

    def _is_ply_header(self, fh, subtype):
        """
        The header is a series of carriage-return terminated lines of
        text that describe the remainder of the file.
        """
        valid_header_items = ['comment', 'obj_info', 'element', 'property']
        # Line 1: ply
        line = get_next_line(fh)
        if line != 'ply':
            return False
        # Line 2: format ascii 1.0
        line = get_next_line(fh)
        if line.find(subtype) < 0:
            return False
        stop_index = 0
        for line in util.iter_start_of_line(fh, MAX_LINE_LEN):
            line = line.strip()
            stop_index += 1
            if line == 'end_header':
                return True
            items = line.split()
            if items[0] not in valid_header_items:
                return False
            if stop_index > MAX_HEADER_LINES:
                # If this is a PLY file, there must be an unusually
                # large number of comments.
                break
        return False

    def set_meta(self, dataset, **kwd):
        if dataset.has_data():
            with open(dataset.file_name, errors='ignore') as fh:
                for line in fh:
                    line = line.strip()
                    if not line:
                        continue
                    if line.startswith('format'):
                        items = line.split()
                        dataset.metadata.file_format = items[1]
                    elif line == 'end_header':
                        # Metadata is complete.
                        break
                    elif line.startswith('element'):
                        items = line.split()
                        if items[1] == 'face':
                            dataset.metadata.face = int(items[2])
                        elif items[1] == 'vertex':
                            dataset.metadata.vertex = int(items[2])
                        else:
                            element_tuple = (items[1], int(items[2]))
                            dataset.metadata.other_elements.append(
                                element_tuple)

    def set_peek(self, dataset):
        if not dataset.dataset.purged:
            dataset.peek = get_file_peek(dataset.file_name)
            dataset.blurb = f"Faces: {str(dataset.metadata.face)}, Vertices: {str(dataset.metadata.vertex)}"
        else:
            dataset.peek = 'File does not exist'
            dataset.blurb = 'File purged from disc'

    def display_peek(self, dataset):
        try:
            return dataset.peek
        except Exception:
            return f"Ply file ({nice_size(dataset.get_size())})"
Exemplo n.º 27
0
class Maf( Alignment ):
    """Class describing a Maf alignment"""
    file_ext = "maf"
    
    #Readonly and optional, users can't unset it, but if it is not set, we are generally ok; if required use a metadata validator in the tool definition
    MetadataElement( name="blocks", default=0, desc="Number of blocks", readonly=True, optional=True, visible=False, no_value=0 )
    MetadataElement( name="species_chromosomes", desc="Species Chromosomes", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True )
    MetadataElement( name="maf_index", desc="MAF Index File", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True )

    def init_meta( self, dataset, copy_from=None ):
        Alignment.init_meta( self, dataset, copy_from=copy_from )
    def set_meta( self, dataset, overwrite = True, **kwd ):
        """
        Parses and sets species, chromosomes, index from MAF file.
        """
        # these metadata values are not accessible by users, always overwrite
        indexes, species, species_chromosomes, blocks = COPIED_build_maf_index_species_chromosomes( dataset.file_name )
        if indexes is None:
            return #this is not a MAF file
        dataset.metadata.species = species
        dataset.metadata.blocks = blocks
        
        #write species chromosomes to a file
        chrom_file = dataset.metadata.species_chromosomes
        if not chrom_file:
            chrom_file = dataset.metadata.spec['species_chromosomes'].param.new_file( dataset = dataset )
        chrom_out = open( chrom_file.file_name, 'wb' )
        for spec, chroms in species_chromosomes.items():
            chrom_out.write( "%s\t%s\n" % ( spec, "\t".join( chroms ) ) )
        chrom_out.close()
        dataset.metadata.species_chromosomes = chrom_file
        
        index_file = dataset.metadata.maf_index
        if not index_file:
            index_file = dataset.metadata.spec['maf_index'].param.new_file( dataset = dataset )
        indexes.write( open( index_file.file_name, 'wb' ) )
        dataset.metadata.maf_index = index_file
    def set_peek( self, dataset, is_multi_byte=False ):
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            if dataset.metadata.blocks:
                dataset.blurb = "%s blocks" % util.commaify( str( dataset.metadata.blocks ) )
            else:
                # Number of blocks is not known ( this should not happen ), and auto-detect is
                # needed to set metadata
                dataset.blurb = "? blocks"
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
    def display_peek( self, dataset ):
        """Returns formated html of peek"""
        return self.make_html_table( dataset )
    def make_html_table( self, dataset, skipchars=[] ):
        """Create HTML table, used for displaying peek"""
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            out.append('<tr><th>Species:&nbsp;')
            for species in dataset.metadata.species:
                out.append( '%s&nbsp;' % species )
            out.append( '</th></tr>' )
            if not dataset.peek:
                dataset.set_peek()
            data = dataset.peek
            lines =  data.splitlines()
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                out.append( '<tr><td>%s</td></tr>' % escape( line ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception as exc:
            out = "Can't create peek %s" % exc
        return out
Exemplo n.º 28
0
class NeperTess(data.Text):
    """
    Neper Tessellation File
    ***tess
      **format
        format
      **general
        dim type
      **cell
        number_of_cells
    """
    file_ext = "neper.tess"
    MetadataElement(name="format",
                    default=None,
                    desc="format",
                    readonly=True,
                    visible=True)
    MetadataElement(name="dimension",
                    default=None,
                    desc="dimension",
                    readonly=True,
                    visible=True)
    MetadataElement(name="cells",
                    default=None,
                    desc="cells",
                    readonly=True,
                    visible=True)

    def __init__(self, **kwd):
        data.Text.__init__(self, **kwd)

    def sniff_prefix(self, file_prefix: FilePrefix):
        """
        Neper tess files start with '***tess'.
        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('test.neper.tess')
        >>> NeperTess().sniff(fname)
        True
        >>> fname = get_test_fname('test.neper.tesr')
        >>> NeperTess().sniff(fname)
        False
        """
        return file_prefix.text_io(
            errors='ignore').readline(10).startswith('***tess')

    def set_meta(self, dataset, **kwd):
        if dataset.has_data():
            with open(dataset.file_name, errors='ignore') as fh:
                for i, line in enumerate(fh):
                    line = line.strip()
                    if not line or i > 6:
                        break
                    if i == 0 and not line.startswith('***tess'):
                        break
                    if i == 2:
                        dataset.metadata.format = line
                    if i == 4:
                        dataset.metadata.dimension = int(line.split()[0])
                    if i == 6:
                        dataset.metadata.cells = int(line)

    def set_peek(self, dataset):
        if not dataset.dataset.purged:
            dataset.peek = get_file_peek(dataset.file_name, LINE_COUNT=7)
            dataset.blurb = f'format: {str(dataset.metadata.format)} dim: {str(dataset.metadata.dimension)} cells: {str(dataset.metadata.cells)}'
        else:
            dataset.peek = 'File does not exist'
            dataset.blurb = 'File purged from disc'
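
# A minimal sketch, not part of the NeperTess class above: an illustrative
# ***tess header laid out on the line positions set_meta reads (format on
# line 3, dimension on line 5, cell count on line 7), with the same
# extraction applied directly to the text.  The values are invented.
MINIMAL_TESS = """\
***tess
 **format
   3.3
 **general
   3 standard
 **cell
   10
"""

tess_lines = [ln.strip() for ln in MINIMAL_TESS.splitlines()]
tess_format = tess_lines[2]                     # '3.3'
tess_dimension = int(tess_lines[4].split()[0])  # 3
tess_cells = int(tess_lines[6])                 # 10
print(tess_format, tess_dimension, tess_cells)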
Exemplo n.º 29
0
class Maf(Alignment):
    """Class describing a Maf alignment"""
    edam_format = "format_3008"
    file_ext = "maf"

    # Readonly and optional, users can't unset it, but if it is not set, we are generally ok; if required use a metadata validator in the tool definition
    MetadataElement(name="blocks",
                    default=0,
                    desc="Number of blocks",
                    readonly=True,
                    optional=True,
                    visible=False,
                    no_value=0)
    MetadataElement(name="species_chromosomes",
                    desc="Species Chromosomes",
                    param=metadata.FileParameter,
                    readonly=True,
                    no_value=None,
                    visible=False,
                    optional=True)
    MetadataElement(name="maf_index",
                    desc="MAF Index File",
                    param=metadata.FileParameter,
                    readonly=True,
                    no_value=None,
                    visible=False,
                    optional=True)

    def init_meta(self, dataset, copy_from=None):
        Alignment.init_meta(self, dataset, copy_from=copy_from)

    def set_meta(self, dataset, overwrite=True, **kwd):
        """
        Parses and sets species, chromosomes, index from MAF file.
        """
        # these metadata values are not accessible by users, always overwrite
        # Imported here to avoid circular dependency
        from galaxy.tools.util.maf_utilities import build_maf_index_species_chromosomes
        indexes, species, species_chromosomes, blocks = build_maf_index_species_chromosomes(
            dataset.file_name)
        if indexes is None:
            return  # this is not a MAF file
        dataset.metadata.species = species
        dataset.metadata.blocks = blocks

        # write species chromosomes to a file
        chrom_file = dataset.metadata.species_chromosomes
        if not chrom_file:
            chrom_file = dataset.metadata.spec[
                'species_chromosomes'].param.new_file(dataset=dataset)
        chrom_out = open(chrom_file.file_name, 'w')
        for spec, chroms in species_chromosomes.items():
            chrom_out.write("%s\t%s\n" % (spec, "\t".join(chroms)))
        chrom_out.close()
        dataset.metadata.species_chromosomes = chrom_file

        index_file = dataset.metadata.maf_index
        if not index_file:
            index_file = dataset.metadata.spec['maf_index'].param.new_file(
                dataset=dataset)
        indexes.write(open(index_file.file_name, 'wb'))
        dataset.metadata.maf_index = index_file

    def set_peek(self, dataset, is_multi_byte=False):
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = data.get_file_peek(dataset.file_name,
                                              is_multi_byte=is_multi_byte)
            if dataset.metadata.blocks:
                dataset.blurb = "%s blocks" % util.commaify(
                    str(dataset.metadata.blocks))
            else:
                # Number of blocks is not known ( this should not happen ), and auto-detect is
                # needed to set metadata
                dataset.blurb = "? blocks"
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek(self, dataset):
        """Returns formated html of peek"""
        return self.make_html_table(dataset)

    def make_html_table(self, dataset, skipchars=[]):
        """Create HTML table, used for displaying peek"""
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            out.append('<tr><th>Species:&nbsp;')
            for species in dataset.metadata.species:
                out.append('%s&nbsp;' % species)
            out.append('</th></tr>')
            if not dataset.peek:
                dataset.set_peek()
            data = dataset.peek
            lines = data.splitlines()
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                out.append('<tr><td>%s</td></tr>' % escape(line))
            out.append('</table>')
            out = "".join(out)
        except Exception as exc:
            out = "Can't create peek %s" % exc
        return out

    def sniff(self, filename):
        """
        Determines whether the file is in maf format

        The .maf format is line-oriented. Each multiple alignment ends with a blank line.
        Each sequence in an alignment is on a single line, which can get quite long, but
        there is no length limit. Words in a line are delimited by any white space.
        Lines starting with # are considered to be comments. Lines starting with ## can
        be ignored by most programs, but contain meta-data of one form or another.

        The first line of a .maf file begins with ##maf. This word is followed by white-space-separated
        variable=value pairs. There should be no white space surrounding the "=".

        For complete details see http://genome.ucsc.edu/FAQ/FAQformat#format5

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'sequence.maf' )
        >>> Maf().sniff( fname )
        True
        >>> fname = get_test_fname( 'sequence.fasta' )
        >>> Maf().sniff( fname )
        False
        """
        headers = get_headers(filename, None)
        try:
            if len(headers) > 1 and headers[0][0] and headers[0][0] == "##maf":
                return True
            else:
                return False
        except Exception:
            return False
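
# A minimal sketch, not part of the Maf class above: the ##maf line that the
# sniffer keys on is a series of whitespace-separated variable=value pairs;
# this pulls them into a dict from an illustrative first line.
def parse_maf_header(line):
    tokens = line.split()
    if not tokens or tokens[0] != '##maf':
        return None
    return dict(token.split('=', 1) for token in tokens[1:])


print(parse_maf_header('##maf version=1 scoring=tba.v8'))
# {'version': '1', 'scoring': 'tba.v8'}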
Exemplo n.º 30
0
class CML(GenericXml):
    """
    Chemical Markup Language
    http://cml.sourceforge.net/
    """
    file_ext = "cml"
    MetadataElement(name="number_of_molecules",
                    default=0,
                    desc="Number of molecules",
                    readonly=True,
                    visible=True,
                    optional=True,
                    no_value=0)

    def set_meta(self, dataset, **kwd):
        """
        Set the number of lines of data in dataset.
        """
        dataset.metadata.number_of_molecules = count_special_lines(
            '^\s*<molecule', dataset.file_name)

    def set_peek(self, dataset, is_multi_byte=False):
        if not dataset.dataset.purged:
            if (dataset.metadata.number_of_molecules == 1):
                dataset.blurb = "1 molecule"
            else:
                dataset.blurb = "%s molecules" % dataset.metadata.number_of_molecules
            dataset.peek = get_file_peek(dataset.file_name)
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff_prefix(self, file_prefix):
        """
        Try to guess if the file is a CML file.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('interval.interval')
        >>> CML().sniff(fname)
        False
        >>> fname = get_test_fname('drugbank_drugs.cml')
        >>> CML().sniff(fname)
        True
        """
        for expected_string in [
                '<?xml version="1.0"?>', 'http://www.xml-cml.org/schema'
        ]:
            if expected_string not in file_prefix.contents_header:
                return False

        return True

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by molecule records.
        """
        if split_params is None:
            return None

        if len(input_datasets) > 1:
            raise Exception(
                "CML-file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            raise Exception(
                'Split mode "%s" is currently not implemented for CML-files.' %
                split_params['split_mode'])
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' %
                            split_params['split_mode'])

        def _read_cml_records(filename):
            lines = []
            with open(filename) as handle:
                for line in handle:
                    if line.lstrip().startswith('<?xml version="1.0"?>') or \
                       line.lstrip().startswith('<cml xmlns="http://www.xml-cml.org/schema') or \
                       line.lstrip().startswith('</cml>'):
                        continue
                    lines.append(line)
                    if line.lstrip().startswith('</molecule>'):
                        yield lines
                        lines = []

        header_lines = [
            '<?xml version="1.0"?>\n',
            '<cml xmlns="http://www.xml-cml.org/schema">\n'
        ]
        footer_line = ['</cml>\n']

        def _write_part_cml_file(accumulated_lines):
            part_dir = subdir_generator_function()
            part_path = os.path.join(part_dir,
                                     os.path.basename(input_files[0]))
            with open(part_path, 'w') as part_file:
                part_file.writelines(header_lines)
                part_file.writelines(accumulated_lines)
                part_file.writelines(footer_line)

        try:
            cml_records = _read_cml_records(input_files[0])
            cml_lines_accumulated = []
            for counter, cml_record in enumerate(cml_records, start=1):
                cml_lines_accumulated.extend(cml_record)
                if counter % chunk_size == 0:
                    _write_part_cml_file(cml_lines_accumulated)
                    cml_lines_accumulated = []
            if cml_lines_accumulated:
                _write_part_cml_file(cml_lines_accumulated)
        except Exception as e:
            log.error('Unable to split files: %s' % str(e))
            raise

    split = classmethod(split)

    def merge(split_files, output_file):
        """
        Merging CML files.
        """
        if len(split_files) == 1:
            # For one file only, use base class method (move/copy)
            return data.Text.merge(split_files, output_file)
        if not split_files:
            raise ValueError("Given no CML files, %r, to merge into %s" %
                             (split_files, output_file))
        with open(output_file, "w") as out:
            for filename in split_files:
                with open(filename) as handle:
                    header = handle.readline()
                    if not header:
                        raise ValueError("CML file %s was empty" % filename)
                    if not header.lstrip().startswith('<?xml version="1.0"?>'):
                        out.write(header)
                        raise ValueError("%s is not a valid XML file!" %
                                         filename)
                    line = handle.readline()
                    header += line
                    if not line.lstrip().startswith(
                            '<cml xmlns="http://www.xml-cml.org/schema'):
                        out.write(header)
                        raise ValueError("%s is not a CML file!" % filename)
                    molecule_found = False
                    for line in handle.readlines():
                        # Both required header lines were found; copy the <molecule> records that follow.
                        if line.lstrip().startswith('</cml>'):
                            continue
                        if line.lstrip().startswith('<molecule'):
                            molecule_found = True
                        if molecule_found:
                            out.write(line)
            out.write("</cml>\n")

    merge = staticmethod(merge)
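
# A minimal sketch, not part of the CML class above: the record-splitting idea
# behind _read_cml_records applied to a tiny in-memory two-molecule document,
# so the chunking behaviour used by split() is easy to see.  The document
# contents are invented.
TINY_CML = """\
<?xml version="1.0"?>
<cml xmlns="http://www.xml-cml.org/schema">
<molecule id="m1">
</molecule>
<molecule id="m2">
</molecule>
</cml>
"""


def iter_molecule_records(text):
    record = []
    for line in text.splitlines(keepends=True):
        stripped = line.lstrip()
        # Skip the shared header and footer lines, exactly what the splitter drops.
        if stripped.startswith(('<?xml', '<cml', '</cml>')):
            continue
        record.append(line)
        if stripped.startswith('</molecule>'):
            yield record
            record = []


print(len(list(iter_molecule_records(TINY_CML))))  # 2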