class BPF(Text):
    """Munich BPF annotation format
    https://www.phonetik.uni-muenchen.de/Bas/BasFormatseng.html#Partitur

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname('1_1119_2_22_001.par')
    >>> BPF().sniff(fname)
    True
    >>> fname = get_test_fname('1_1119_2_22_001-1.par')
    >>> BPF().sniff(fname)
    True
    >>> fname = get_test_fname('drugbank_drugs.cml')
    >>> BPF().sniff(fname)
    False
    """
    file_ext = "par"

    MetadataElement(name="annotations", default=[], desc="Annotation types",
                    param=ListParameter, readonly=True, visible=True,
                    optional=True, no_value=[])

    # Header keys that must all appear before the `LBD:` terminator line.
    mandatory_headers = ['LHD', 'REP', 'SNB', 'SAM', 'SBF', 'SSB', 'NCH', 'SPN', 'LBD']
    # Header keys that may be interspersed among the mandatory ones.
    optional_headers = ['FIL', 'TYP', 'DBN', 'VOL', 'DIR', 'SRC', 'BEG', 'END', 'RED',
                        'RET', 'RCC', 'CMT', 'SPI', 'PCF', 'PCN', 'EXP', 'SYS', 'DAT',
                        'SPA', 'MAO', 'GPO', 'SAO']

    def set_meta(self, dataset, overwrite=True, **kwd):
        """Set the metadata (annotation types seen) from the file contents."""
        types = set()
        with open(dataset.dataset.file_name) as fd:
            for line in fd:
                # Split the line on a colon rather than regexing it
                parts = line.split(':')
                # Every BPF line starts with a 3 character key; collect them.
                if len(parts) and len(parts[0]) == 3:
                    types.add(parts[0])
                else:
                    # Malformed line: abort without setting metadata.
                    # BUG FIX: previously `return False` — set_meta's return
                    # value is unused, and False was misleading here.
                    return
        dataset.metadata.annotations = list(types)

    def sniff(self, filename):
        # We read up to 40 header lines: there are 9 mandatory headers (the
        # last should be `LBD:`) and 21 optional headers that can be
        # interspersed, so 40 covers all of them with slack.
        seen_headers = [line[0] for line in get_headers(filename, sep=':', count=40)]
        # We cut everything after LBD, where the headers end and contents
        # start. We choose not to validate contents.
        if 'LBD' in seen_headers:
            seen_headers = seen_headers[0:seen_headers.index('LBD') + 1]
        # Check that every mandatory header is present in the seen headers
        for header in self.mandatory_headers:
            if header not in seen_headers:
                return False
        # Check that every seen header is either in mandatory or optional
        for header in seen_headers:
            if not (header in self.mandatory_headers or header in self.optional_headers):
                return False
        return True
class Sequence( data.Text ):
    """Class describing a sequence"""

    """Add metadata elements"""
    MetadataElement( name="sequences", default=0, desc="Number of sequences", readonly=True, visible=False, optional=True, no_value=0 )

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of sequences and the number of data lines in dataset.
        """
        data_lines = 0
        sequences = 0
        # NOTE(review): file() is the Python 2 builtin; this is the legacy
        # (pre-port) version of this class.
        for line in file( dataset.file_name ):
            line = line.strip()
            if line and line.startswith( '#' ):
                # We don't count comment lines for sequence data types
                continue
            if line and line.startswith( '>' ):
                # FASTA-style record header marks a new sequence.
                sequences += 1
                data_lines += 1
            else:
                data_lines += 1
        dataset.metadata.data_lines = data_lines
        dataset.metadata.sequences = sequences

    def set_peek( self, dataset, is_multi_byte=False ):
        # Peek text plus a blurb: sequence count when known, file size otherwise.
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            if dataset.metadata.sequences:
                dataset.blurb = "%s sequences" % util.commaify( str( dataset.metadata.sequences ) )
            else:
                dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def get_sequences_per_file(total_sequences, split_params):
        # Return a list giving how many sequences go into each split part.
        if split_params['split_mode'] == 'number_of_parts':
            # legacy basic mode - split into a specified number of parts
            parts = int(split_params['split_size'])
            # Python 2 integer division; remainder is spread below.
            sequences_per_file = [total_sequences/parts for i in range(parts)]
            for i in range(total_sequences % parts):
                sequences_per_file[i] += 1
        elif split_params['split_mode'] == 'to_size':
            # loop through the sections and calculate the number of sequences
            chunk_size = long(split_params['split_size'])
            chunks = total_sequences / chunk_size  # NOTE(review): unused local
            rem = total_sequences % chunk_size
            sequences_per_file = [chunk_size for i in range(total_sequences / chunk_size)]
            # TODO: Should we invest the time in a better way to handle small remainders?
            if rem > 0:
                sequences_per_file.append(rem)
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])
        return sequences_per_file
    get_sequences_per_file = staticmethod(get_sequences_per_file)

    def do_slow_split( cls, input_datasets, subdir_generator_function, split_params):
        # count the sequences so we can split
        # TODO: if metadata is present, take the number of lines / 4
        if input_datasets[0].metadata is not None and input_datasets[0].metadata.sequences is not None:
            total_sequences = input_datasets[0].metadata.sequences
        else:
            input_file = input_datasets[0].file_name
            compress = is_gzip(input_file)
            if compress:
                # gzip is really slow before python 2.7!
                in_file = gzip.GzipFile(input_file, 'r')
            else:
                # TODO
                # if a file is not compressed, seek locations can be calculated and stored
                # ideally, this would be done in metadata
                # TODO
                # Add BufferedReader if python 2.7?
                in_file = open(input_file, 'rt')
            total_sequences = long(0)
            for i, line in enumerate(in_file):
                total_sequences += 1
            in_file.close()
            # FASTQ records are 4 lines each (Python 2 integer division).
            total_sequences /= 4
        sequences_per_file = cls.get_sequences_per_file(total_sequences, split_params)
        return cls.write_split_files(input_datasets, None, subdir_generator_function, sequences_per_file)
    do_slow_split = classmethod(do_slow_split)

    def do_fast_split( cls, input_datasets, toc_file_datasets, subdir_generator_function, split_params):
        # The TOC file records per-section sequence counts, avoiding a scan.
        data = simplejson.load(open(toc_file_datasets[0].file_name))
        sections = data['sections']
        total_sequences = long(0)
        for section in sections:
            total_sequences += long(section['sequences'])
        sequences_per_file = cls.get_sequences_per_file(total_sequences, split_params)
        return cls.write_split_files(input_datasets, toc_file_datasets, subdir_generator_function, sequences_per_file)
    do_fast_split = classmethod(do_fast_split)

    def write_split_files(cls, input_datasets, toc_file_datasets, subdir_generator_function, sequences_per_file):
        directories = []

        def get_subdir(idx):
            # Lazily create one output directory per part.
            if idx < len(directories):
                return directories[idx]
            dir = subdir_generator_function()
            directories.append(dir)
            return dir

        # we know how many splits and how many sequences in each. What remains is to write out instructions for the
        # splitting of all the input files. To decouple the format of those instructions from this code, the exact format of
        # those instructions is delegated to scripts
        start_sequence = 0
        for part_no in range(len(sequences_per_file)):
            dir = get_subdir(part_no)
            for ds_no in range(len(input_datasets)):
                ds = input_datasets[ds_no]
                base_name = os.path.basename(ds.file_name)
                part_path = os.path.join(dir, base_name)
                split_data = dict(class_name='%s.%s' % (cls.__module__, cls.__name__),
                                  output_name=part_path,
                                  input_name=ds.file_name,
                                  args=dict(start_sequence=start_sequence, num_sequences=sequences_per_file[part_no]))
                if toc_file_datasets is not None:
                    toc = toc_file_datasets[ds_no]
                    split_data['args']['toc_file'] = toc.file_name
                f = open(os.path.join(dir, 'split_info_%s.json' % base_name), 'w')
                simplejson.dump(split_data, f)
                f.close()
            start_sequence += sequences_per_file[part_no]
        return directories
    write_split_files = classmethod(write_split_files)

    def split( cls, input_datasets, subdir_generator_function, split_params):
        """Split a generic sequence file (not sensible or possible, see subclasses)."""
        if split_params is None:
            return None
        raise NotImplementedError("Can't split generic sequence files")
class Sequence(data.Text):
    """Class describing a sequence"""
    edam_data = "data_2044"

    """Add metadata elements"""
    MetadataElement(name="sequences", default=0, desc="Number of sequences", readonly=True, visible=False, optional=True, no_value=0)

    def set_meta(self, dataset, **kwd):
        """
        Set the number of sequences and the number of data lines in dataset.
        """
        data_lines = 0
        sequences = 0
        with open(dataset.file_name) as fh:
            for line in fh:
                line = line.strip()
                if line and line.startswith('#'):
                    # We don't count comment lines for sequence data types
                    continue
                if line and line.startswith('>'):
                    # FASTA-style record header marks a new sequence.
                    sequences += 1
                    data_lines += 1
                else:
                    data_lines += 1
        dataset.metadata.data_lines = data_lines
        dataset.metadata.sequences = sequences

    def set_peek(self, dataset, is_multi_byte=False):
        # Peek text plus a blurb: sequence count when known, file size otherwise.
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            if dataset.metadata.sequences:
                dataset.blurb = "%s sequences" % util.commaify(
                    str(dataset.metadata.sequences))
            else:
                dataset.blurb = nice_size(dataset.get_size())
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def get_sequences_per_file(total_sequences, split_params):
        """Return a list giving how many sequences go into each split part.

        BUG FIX (Python 3): `long` no longer exists and `/` is true
        division, which made `range()` raise TypeError; use int and `//`.
        """
        if split_params['split_mode'] == 'number_of_parts':
            # legacy basic mode - split into a specified number of parts
            parts = int(split_params['split_size'])
            sequences_per_file = [total_sequences // parts for _ in range(parts)]
            # Spread the remainder over the first (total % parts) files.
            for i in range(total_sequences % parts):
                sequences_per_file[i] += 1
        elif split_params['split_mode'] == 'to_size':
            # loop through the sections and calculate the number of sequences
            chunk_size = int(split_params['split_size'])
            rem = total_sequences % chunk_size
            sequences_per_file = [chunk_size for _ in range(total_sequences // chunk_size)]
            # TODO: Should we invest the time in a better way to handle small remainders?
            if rem > 0:
                sequences_per_file.append(rem)
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])
        return sequences_per_file
    get_sequences_per_file = staticmethod(get_sequences_per_file)

    def do_slow_split(cls, input_datasets, subdir_generator_function, split_params):
        # count the sequences so we can split
        # TODO: if metadata is present, take the number of lines / 4
        if input_datasets[0].metadata is not None and input_datasets[0].metadata.sequences is not None:
            total_sequences = input_datasets[0].metadata.sequences
        else:
            input_file = input_datasets[0].file_name
            compress = is_gzip(input_file)
            if compress:
                # gzip is really slow before python 2.7!
                in_file = gzip.GzipFile(input_file, 'r')
            else:
                # TODO
                # if a file is not compressed, seek locations can be calculated and stored
                # ideally, this would be done in metadata
                in_file = open(input_file, 'rt')
            total_sequences = 0
            for _line in in_file:
                total_sequences += 1
            in_file.close()
            # FASTQ records are 4 lines each.
            # BUG FIX (Python 3): use floor division to keep this an int.
            total_sequences //= 4
        sequences_per_file = cls.get_sequences_per_file(total_sequences, split_params)
        return cls.write_split_files(input_datasets, None, subdir_generator_function, sequences_per_file)
    do_slow_split = classmethod(do_slow_split)

    def do_fast_split(cls, input_datasets, toc_file_datasets, subdir_generator_function, split_params):
        # The TOC file records per-section sequence counts, avoiding a scan.
        data = json.load(open(toc_file_datasets[0].file_name))
        sections = data['sections']
        total_sequences = 0
        for section in sections:
            total_sequences += int(section['sequences'])
        sequences_per_file = cls.get_sequences_per_file(total_sequences, split_params)
        return cls.write_split_files(input_datasets, toc_file_datasets, subdir_generator_function, sequences_per_file)
    do_fast_split = classmethod(do_fast_split)

    def write_split_files(cls, input_datasets, toc_file_datasets, subdir_generator_function, sequences_per_file):
        directories = []

        def get_subdir(idx):
            # Lazily create one output directory per part.
            if idx < len(directories):
                return directories[idx]
            dir = subdir_generator_function()
            directories.append(dir)
            return dir

        # we know how many splits and how many sequences in each. What remains is to write out instructions for the
        # splitting of all the input files. To decouple the format of those instructions from this code, the exact format of
        # those instructions is delegated to scripts
        start_sequence = 0
        for part_no in range(len(sequences_per_file)):
            dir = get_subdir(part_no)
            for ds_no in range(len(input_datasets)):
                ds = input_datasets[ds_no]
                base_name = os.path.basename(ds.file_name)
                part_path = os.path.join(dir, base_name)
                split_data = dict(
                    class_name='%s.%s' % (cls.__module__, cls.__name__),
                    output_name=part_path,
                    input_name=ds.file_name,
                    args=dict(start_sequence=start_sequence, num_sequences=sequences_per_file[part_no]))
                if toc_file_datasets is not None:
                    toc = toc_file_datasets[ds_no]
                    split_data['args']['toc_file'] = toc.file_name
                # Context manager guarantees the file is closed on error.
                with open(os.path.join(dir, 'split_info_%s.json' % base_name), 'w') as f:
                    json.dump(split_data, f)
            start_sequence += sequences_per_file[part_no]
        return directories
    write_split_files = classmethod(write_split_files)

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """Split a generic sequence file (not sensible or possible, see subclasses)."""
        if split_params is None:
            return None
        raise NotImplementedError("Can't split generic sequence files")

    def get_split_commands_with_toc(input_name, output_name, toc_file, start_sequence, sequence_count):
        """
        Uses a Table of Contents dict, parsed from an FQTOC file, to come up with a set of
        shell commands that will extract the parts necessary
        >>> three_sections=[dict(start=0, end=74, sequences=10), dict(start=74, end=148, sequences=10), dict(start=148, end=148+76, sequences=10)]
        >>> Sequence.get_split_commands_with_toc('./input.gz', './output.gz', dict(sections=three_sections), start_sequence=0, sequence_count=10)
        ['dd bs=1 skip=0 count=74 if=./input.gz 2> /dev/null >> ./output.gz']
        >>> Sequence.get_split_commands_with_toc('./input.gz', './output.gz', dict(sections=three_sections), start_sequence=1, sequence_count=5)
        ['(dd bs=1 skip=0 count=74 if=./input.gz 2> /dev/null )| zcat | ( tail -n +5 2> /dev/null) | head -20 | gzip -c >> ./output.gz']
        >>> Sequence.get_split_commands_with_toc('./input.gz', './output.gz', dict(sections=three_sections), start_sequence=0, sequence_count=20)
        ['dd bs=1 skip=0 count=148 if=./input.gz 2> /dev/null >> ./output.gz']
        >>> Sequence.get_split_commands_with_toc('./input.gz', './output.gz', dict(sections=three_sections), start_sequence=5, sequence_count=10)
        ['(dd bs=1 skip=0 count=74 if=./input.gz 2> /dev/null )| zcat | ( tail -n +21 2> /dev/null) | head -20 | gzip -c >> ./output.gz', '(dd bs=1 skip=74 count=74 if=./input.gz 2> /dev/null )| zcat | ( tail -n +1 2> /dev/null) | head -20 | gzip -c >> ./output.gz']
        >>> Sequence.get_split_commands_with_toc('./input.gz', './output.gz', dict(sections=three_sections), start_sequence=10, sequence_count=10)
        ['dd bs=1 skip=74 count=74 if=./input.gz 2> /dev/null >> ./output.gz']
        >>> Sequence.get_split_commands_with_toc('./input.gz', './output.gz', dict(sections=three_sections), start_sequence=5, sequence_count=20)
        ['(dd bs=1 skip=0 count=74 if=./input.gz 2> /dev/null )| zcat | ( tail -n +21 2> /dev/null) | head -20 | gzip -c >> ./output.gz', 'dd bs=1 skip=74 count=74 if=./input.gz 2> /dev/null >> ./output.gz', '(dd bs=1 skip=148 count=76 if=./input.gz 2> /dev/null )| zcat | ( tail -n +1 2> /dev/null) | head -20 | gzip -c >> ./output.gz']
        """
        sections = toc_file['sections']
        result = []

        current_sequence = 0
        i = 0
        # skip to the section that contains my starting sequence
        while i < len(sections) and start_sequence >= current_sequence + int(sections[i]['sequences']):
            current_sequence += int(sections[i]['sequences'])
            i += 1
        if i == len(sections):  # bad input data!
            raise Exception('No FQTOC section contains starting sequence %s' % start_sequence)

        # These two variables act as an accumulator for consecutive entire blocks that
        # can be copied verbatim (without decompressing)
        start_chunk = -1
        end_chunk = -1
        copy_chunk_cmd = 'dd bs=1 skip=%s count=%s if=%s 2> /dev/null >> %s'

        while sequence_count > 0 and i < len(sections):
            # we need to extract partial data. So, find the byte offsets of the chunks that contain the data we need
            # use a combination of dd (to pull just the right sections out) tail (to skip lines) and head (to get the
            # right number of lines
            sequences = int(sections[i]['sequences'])
            skip_sequences = start_sequence - current_sequence
            sequences_to_extract = min(sequence_count, sequences - skip_sequences)
            start_copy = int(sections[i]['start'])
            end_copy = int(sections[i]['end'])
            if sequences_to_extract < sequences:
                # Partial section: flush any accumulated verbatim chunk first.
                if start_chunk > -1:
                    result.append(copy_chunk_cmd % (start_chunk, end_chunk - start_chunk, input_name, output_name))
                    start_chunk = -1
                # extract, unzip, trim, recompress
                result.append('(dd bs=1 skip=%s count=%s if=%s 2> /dev/null )| zcat | ( tail -n +%s 2> /dev/null) | head -%s | gzip -c >> %s' %
                              (start_copy, end_copy - start_copy, input_name, skip_sequences * 4 + 1, sequences_to_extract * 4, output_name))
            else:  # whole section - add it to the start_chunk/end_chunk accumulator
                if start_chunk == -1:
                    start_chunk = start_copy
                end_chunk = end_copy
            sequence_count -= sequences_to_extract
            start_sequence += sequences_to_extract
            current_sequence += sequences
            i += 1
        if start_chunk > -1:
            # Flush the trailing verbatim chunk, if any.
            result.append(copy_chunk_cmd % (start_chunk, end_chunk - start_chunk, input_name, output_name))

        if sequence_count > 0:
            raise Exception('%s sequences not found in file' % sequence_count)

        return result
    get_split_commands_with_toc = staticmethod(get_split_commands_with_toc)

    def get_split_commands_sequential(is_compressed, input_name, output_name, start_sequence, sequence_count):
        """
        Does a brain-dead sequential scan & extract of certain sequences
        >>> Sequence.get_split_commands_sequential(True, './input.gz', './output.gz', start_sequence=0, sequence_count=10)
        ['zcat "./input.gz" | ( tail -n +1 2> /dev/null) | head -40 | gzip -c > "./output.gz"']
        >>> Sequence.get_split_commands_sequential(False, './input.fastq', './output.fastq', start_sequence=10, sequence_count=10)
        ['tail -n +41 "./input.fastq" 2> /dev/null | head -40 > "./output.fastq"']
        """
        # FASTQ records are 4 lines each.
        start_line = start_sequence * 4
        line_count = sequence_count * 4
        # TODO: verify that tail can handle 64-bit numbers
        if is_compressed:
            cmd = 'zcat "%s" | ( tail -n +%s 2> /dev/null) | head -%s | gzip -c' % (input_name, start_line + 1, line_count)
        else:
            cmd = 'tail -n +%s "%s" 2> /dev/null | head -%s' % (start_line + 1, input_name, line_count)
        cmd += ' > "%s"' % output_name
        return [cmd]
    get_split_commands_sequential = staticmethod(get_split_commands_sequential)
class SnpEffDb(Text):
    """Class describing a SnpEff genome build"""
    file_ext = "snpeffdb"
    MetadataElement(name="genome_version", default=None, desc="Genome Version", readonly=True, visible=True, no_value=None)
    MetadataElement(name="regulation", default=[], desc="Regulation Names", readonly=True, visible=True, no_value=[], optional=True)
    MetadataElement(name="annotation", default=[], desc="Annotation Names", readonly=True, visible=True, no_value=[], optional=True)

    def __init__(self, **kwd):
        Text.__init__(self, **kwd)

    def set_meta(self, dataset, **kwd):
        """Scan extra_files_path for the snpEff predictor, regulation and
        annotation files, record the findings in metadata, and write a short
        human-readable summary into the dataset file (best effort)."""
        Text.set_meta(self, dataset, **kwd)
        data_dir = dataset.extra_files_path
        # search data_dir/genome_version for files
        regulation_pattern = 'regulation_(.+).bin'
        # annotation files that are included in snpEff by a flag
        annotations_dict = {'nextProt.bin': '-nextprot', 'motif.bin': '-motif'}
        regulations = []
        annotations = []
        # BUG FIX: genome_version was previously unbound when no
        # snpEffectPredictor file existed, causing a NameError that the
        # bare except silently swallowed.
        genome_version = None
        if data_dir and os.path.isdir(data_dir):
            for root, dirs, files in os.walk(data_dir):
                for fname in files:
                    if fname.startswith('snpEffectPredictor'):
                        # if snpEffectPredictor.bin download succeeded
                        genome_version = os.path.basename(root)
                        dataset.metadata.genome_version = genome_version
                    else:
                        m = re.match(regulation_pattern, fname)
                        if m:
                            name = m.groups()[0]
                            regulations.append(name)
                        elif fname in annotations_dict:
                            value = annotations_dict[fname]
                            name = value.lstrip('-')
                            annotations.append(name)
            dataset.metadata.regulation = regulations
            dataset.metadata.annotation = annotations
            try:
                # BUG FIX: use open() — the file() builtin is Python 2 only.
                # Writing the summary is best effort; failures are ignored.
                with open(dataset.file_name, 'w') as fh:
                    if genome_version is not None:
                        fh.write("%s\n" % genome_version)
                    if annotations:
                        fh.write("annotations: %s\n" % ','.join(annotations))
                    if regulations:
                        fh.write("regulations: %s\n" % ','.join(regulations))
            except Exception:
                pass
class Stockholm_1_0(Text):
    file_ext = "stockholm"

    MetadataElement(name="number_of_alignments", default=0, desc="Number of multiple alignments", readonly=True, visible=True, optional=True, no_value=0)

    def set_peek(self, dataset, is_multi_byte=False):
        """Set peek text and a blurb showing the alignment count."""
        if not dataset.dataset.purged:
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            # BUG FIX: this class declares the metadata element
            # `number_of_alignments`; the code previously read the
            # undeclared `number_of_models`.
            if (dataset.metadata.number_of_alignments == 1):
                dataset.blurb = "1 alignment"
            else:
                dataset.blurb = "%s alignments" % dataset.metadata.number_of_alignments
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disc'

    def sniff(self, filename):
        # A Stockholm file starts with a '# STOCKHOLM 1.0' marker line.
        return count_special_lines('^#[[:space:]+]STOCKHOLM[[:space:]+]1.0', filename) > 0

    def set_meta(self, dataset, **kwd):
        """
        Set the number of alignments in dataset (one marker line per alignment).
        """
        # BUG FIX: store under the declared element name (see set_peek).
        dataset.metadata.number_of_alignments = count_special_lines(
            '^#[[:space:]+]STOCKHOLM[[:space:]+]1.0', dataset.file_name)

    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by alignment records.
        """
        if split_params is None:
            return None
        if len(input_datasets) > 1:
            raise Exception("STOCKHOLM-file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            raise Exception('Split mode "%s" is currently not implemented for STOCKHOLM-files.' % split_params['split_mode'])
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        def _read_stockholm_records(filename):
            # Yield one record at a time; a record ends with a '//' line.
            lines = []
            with open(filename) as handle:
                for line in handle:
                    lines.append(line)
                    if line.strip() == '//':
                        yield lines
                        lines = []

        def _write_part_stockholm_file(accumulated_lines):
            part_dir = subdir_generator_function()
            part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
            with open(part_path, 'w') as part_file:
                part_file.writelines(accumulated_lines)

        try:
            stockholm_records = _read_stockholm_records(input_files[0])
            stockholm_lines_accumulated = []
            for counter, stockholm_record in enumerate(stockholm_records, start=1):
                stockholm_lines_accumulated.extend(stockholm_record)
                if counter % chunk_size == 0:
                    _write_part_stockholm_file(stockholm_lines_accumulated)
                    stockholm_lines_accumulated = []
            if stockholm_lines_accumulated:
                # Flush the final, possibly short, part.
                _write_part_stockholm_file(stockholm_lines_accumulated)
        except Exception as e:
            log.error('Unable to split files: %s' % str(e))
            raise
    split = classmethod(split)
class GroupAbund(Otu):
    file_ext = 'mothur.shared'

    MetadataElement(name="groups", default=[], desc="Group Names", readonly=True, visible=True, no_value=[])

    def __init__(self, **kwd):
        super(GroupAbund, self).__init__(**kwd)

    def init_meta(self, dataset, copy_from=None):
        super(GroupAbund, self).init_meta(dataset, copy_from=copy_from)

    def set_meta(self, dataset, overwrite=True, skip=1, **kwd):
        # Collect labels, groups and column count from every row of the file
        # (iter_headers with count=-1 yields all rows).
        super(GroupAbund, self).set_meta(dataset, overwrite=overwrite, **kwd)
        # See if file starts with header line
        if dataset.has_data():
            label_names = set()
            group_names = set()
            data_lines = 0
            comment_lines = 0
            ncols = 0
            headers = iter_headers(dataset.file_name, sep='\t', count=-1)
            for line in headers:
                if line[0] == 'label' and line[1] == 'Group':
                    # Mothur >= 1.2 writes a header row; count it as a comment.
                    skip = 1
                    comment_lines += 1
                else:
                    # NOTE(review): skip is overwritten on every data row, so
                    # its final value reflects the LAST row seen — confirm
                    # this is intended before changing.
                    skip = 0
                    data_lines += 1
                    ncols = max(ncols, len(line))
                    label_names.add(line[0])
                    group_names.add(line[1])
            # Set the discovered metadata values for the dataset
            dataset.metadata.data_lines = data_lines
            dataset.metadata.columns = ncols
            dataset.metadata.labels = list(label_names)
            dataset.metadata.labels.sort()
            dataset.metadata.groups = list(group_names)
            dataset.metadata.groups.sort()
            dataset.metadata.skip = skip

    def sniff(self, filename, vals_are_int=False):
        """
        Determines whether the file is a otu (operational taxonomic unit)
        Shared format

        label<TAB>group<TAB>count[<TAB>value(1..n)]

        The first line is column headings as of Mothur v 1.2

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.shared' )
        >>> GroupAbund().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.shared' )
        >>> GroupAbund().sniff( fname )
        False
        """
        headers = iter_headers(filename, sep='\t')
        count = 0
        for line in headers:
            if not line[0].startswith('@'):
                if len(line) < 3:
                    return False
                # The first row may be the 'label' header; only validate
                # numeric content on subsequent rows (or when no header).
                if count > 0 or line[0] != 'label':
                    try:
                        # Column 3 declares how many value columns follow.
                        check = int(line[2])
                        if check + 3 != len(line):
                            return False
                        for i in range(3, len(line)):
                            if vals_are_int:
                                int(line[i])
                            else:
                                float(line[i])
                    except ValueError:
                        return False
                count += 1
                if count > 1:
                    # Two valid rows are enough evidence.
                    return True
        return False
class SffFlow(Tabular):
    """
    http://www.mothur.org/wiki/Flow_file

    The first line is the total number of flow values - 800 for Titanium data.
    For GS FLX it would be 400.
    Following lines contain:
    - SequenceName
    - the number of useable flows as defined by 454's software
    - the flow intensity for each base going in the order of TACG.

    Example:
      800
      GQY1XT001CQL4K 85 1.04 0.00 1.00 0.02 0.03 1.02 0.05 ...
      GQY1XT001CQIRF 84 1.02 0.06 0.98 0.06 0.09 1.05 0.07 ...
      GQY1XT001CF5YW 88 1.02 0.02 1.01 0.04 0.06 1.02 0.03 ...
    """
    MetadataElement(name="flow_values", default="", no_value="", optional=True, desc="Total number of flow values", readonly=True)
    # BUG FIX: desc previously duplicated flow_values' description.
    MetadataElement(name="flow_order", default="TACG", no_value="TACG", desc="Flow order", readonly=False)

    file_ext = 'mothur.sff.flow'

    def __init__(self, **kwd):
        super(SffFlow, self).__init__(**kwd)

    def set_meta(self, dataset, overwrite=True, skip=1, max_data_lines=None, **kwd):
        """Record the flow value count from the first line of the file."""
        super(SffFlow, self).set_meta(dataset, overwrite, 1, max_data_lines)
        headers = get_headers(dataset.file_name, sep='\t', count=1)
        try:
            flow_values = int(headers[0][0])
            dataset.metadata.flow_values = flow_values
        except Exception as e:
            log.warning("SffFlow set_meta %s" % e)

    def make_html_table(self, dataset, skipchars=[]):
        """Create HTML table, used for displaying peek"""
        try:
            out = '<table cellspacing="0" cellpadding="3">'
            # Generate column header
            out += '<tr>'
            out += '<th>%d. Name</th>' % 1
            out += '<th>%d. Flows</th>' % 2
            for i in range(3, dataset.metadata.columns + 1):
                # Flow columns cycle through the flow order (TACG by default).
                base = dataset.metadata.flow_order[(i + 1) % 4]
                # BUG FIX: the format string had three placeholders
                # ('%d. %d %s') but only two arguments, which always raised
                # TypeError and replaced the peek with an error message.
                out += '<th>%d. %s</th>' % (i - 2, base)
            out += '</tr>'
            out += self.make_html_peek_rows(dataset, skipchars=skipchars)
            out += '</table>'
        except Exception as exc:
            out = "Can't create peek %s" % str(exc)
        return out
class GenomeGraphs(Tabular):
    """
    Tab delimited data containing a marker id and any number of numeric values
    """

    MetadataElement(name="markerCol", default=1, desc="Marker ID column", param=metadata.ColumnParameter)
    MetadataElement(name="columns", default=3, desc="Number of columns", readonly=True)
    MetadataElement(name="column_types", default=[], desc="Column types", readonly=True, visible=False)

    file_ext = 'gg'

    def __init__(self, **kwd):
        """
        Initialize gg datatype, by adding UCSC display apps
        """
        Tabular.__init__(self, **kwd)
        self.add_display_app('ucsc', 'Genome Graph', 'as_ucsc_display_file', 'ucsc_links')

    def set_meta(self, dataset, **kwd):
        # Column count and types are derived from the first (header) line:
        # first column is the marker id (string), the rest are numeric.
        Tabular.set_meta(self, dataset, **kwd)
        dataset.metadata.markerCol = 1
        header = open(dataset.file_name).readlines()[0].strip().split('\t')
        dataset.metadata.columns = len(header)
        t = ['numeric' for x in header]
        t[0] = 'string'
        dataset.metadata.column_types = t
        return True

    def as_ucsc_display_file(self, dataset, **kwd):
        """
        Returns file
        """
        return open(dataset.file_name, 'rb')

    def ucsc_links(self, dataset, type, app, base_url):
        """
        from the ever-helpful angie hinrichs [email protected]
        a genome graphs call looks like this

        http://genome.ucsc.edu/cgi-bin/hgGenome?clade=mammal&org=Human&db=hg18&hgGenome_dataSetName=dname
        &hgGenome_dataSetDescription=test&hgGenome_formatType=best%20guess&hgGenome_markerType=best%20guess
        &hgGenome_columnLabels=best%20guess&hgGenome_maxVal=&hgGenome_labelVals=
        &hgGenome_maxGapToFill=25000000&hgGenome_uploadFile=http://galaxy.esphealth.org/datasets/333/display/index
        &hgGenome_doSubmitUpload=submit
        Galaxy gives this for an interval file

        http://genome.ucsc.edu/cgi-bin/hgTracks?db=hg18&position=chr1:1-1000&hgt.customText=
        http%3A%2F%2Fgalaxy.esphealth.org%2Fdisplay_as%3Fid%3D339%26display_app%3Ducsc
        """
        ret_val = []
        if not dataset.dbkey:
            dataset.dbkey = 'hg18'  # punt!
        if dataset.has_data():
            for site_name, site_url in app.datatypes_registry.get_legacy_sites_by_build('ucsc', dataset.dbkey):
                if site_name in app.datatypes_registry.get_display_sites('ucsc'):
                    site_url = site_url.replace('/hgTracks?', '/hgGenome?')  # for genome graphs
                    # URL Galaxy serves the dataset from for this display app.
                    internal_url = "%s" % app.url_for(controller='dataset', dataset_id=dataset.id, action='display_at', filename='ucsc_' + site_name)
                    display_url = "%s%s/display_as?id=%i&display_app=%s&authz_method=display_at" % (base_url, app.url_for(controller='root'), dataset.id, type)
                    display_url = quote_plus(display_url)
                    # was display_url = quote_plus( "%s/display_as?id=%i&display_app=%s" % (base_url, dataset.id, type) )
                    # redirect_url = quote_plus( "%sdb=%s&position=%s:%s-%s&hgt.customText=%%s" % (site_url, dataset.dbkey, chrom, start, stop) )
                    # Assemble the hgGenome query string piecewise, then quote
                    # the whole thing for use as a redirect parameter.
                    sl = [
                        "{}db={}".format(site_url, dataset.dbkey),
                    ]
                    # sl.append("&hgt.customText=%s")
                    sl.append("&hgGenome_dataSetName={}&hgGenome_dataSetDescription={}".format(dataset.name, 'GalaxyGG_data'))
                    sl.append("&hgGenome_formatType=best guess&hgGenome_markerType=best guess")
                    sl.append("&hgGenome_columnLabels=first row&hgGenome_maxVal=&hgGenome_labelVals=")
                    sl.append("&hgGenome_doSubmitUpload=submit")
                    sl.append("&hgGenome_maxGapToFill=25000000&hgGenome_uploadFile=%s" % display_url)
                    s = ''.join(sl)
                    s = quote_plus(s)
                    redirect_url = s
                    link = '{}?redirect_url={}&display_url={}'.format(internal_url, redirect_url, display_url)
                    ret_val.append((site_name, link))
        return ret_val

    def make_html_table(self, dataset, skipchars=[]):
        """
        Create HTML table, used for displaying peek
        """
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            with open(dataset.file_name) as f:
                d = f.readlines()[:5]
            if len(d) == 0:
                out = "Cannot find anything to parse in %s" % dataset.name
                return out
            hasheader = 0
            try:
                ['%f' % x for x in d[0][1:]]  # first is name - see if starts all numerics
            except Exception:
                hasheader = 1
            # NOTE(review): d[0] is a string, so the check above iterates
            # characters and '%f' % <str> raises for any non-empty tail —
            # hasheader is effectively always 1; confirm intent before changing.
            # Generate column header
            out.append('<tr>')
            if hasheader:
                for i, name in enumerate(d[0].split()):
                    out.append('<th>{}.{}</th>'.format(i + 1, name))
                d.pop(0)
                out.append('</tr>')
            for row in d:
                out.append('<tr>')
                out.append(''.join('<td>%s</td>' % x for x in row.split()))
                out.append('</tr>')
            out.append('</table>')
            out = "".join(out)
        except Exception as exc:
            out = "Can't create peek %s" % exc
        return out

    def validate(self, dataset, **kwd):
        """
        Validate a gg file - all numeric after header row
        """
        with open(dataset.file_name) as infile:
            next(infile)  # header
            for i, row in enumerate(infile):
                ll = row.strip().split('\t')[1:]  # first is alpha feature identifier
                for j, x in enumerate(ll):
                    # float() raises ValueError on non-numeric content.
                    x = float(x)
        return DatatypeValidation.validated()

    def sniff_prefix(self, file_prefix):
        """
        Determines whether the file is in gg format

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'test_space.txt' )
        >>> GenomeGraphs().sniff( fname )
        False
        >>> fname = get_test_fname( '1.gg' )
        >>> GenomeGraphs().sniff( fname )
        True
        """
        buf = file_prefix.contents_header
        rows = [l.split() for l in buf.splitlines()[1:4]]  # break on lines and drop header, small sample
        if len(rows) < 1:
            return False
        for row in rows:
            if len(row) < 2:
                # Must actually have a marker and at least one numeric value
                return False
            first_val = row[0]
            if not VALID_GENOME_GRAPH_MARKERS.match(first_val):
                return False
            rest_row = row[1:]
            try:
                [float(x) for x in rest_row]  # first col has been removed
            except ValueError:
                return False
        return True

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'application/vnd.ms-excel'
class IdeasPre(Html):
    """
    This datatype defines the input format required by IDEAS:
    https://academic.oup.com/nar/article/44/14/6721/2468150
    The IDEAS preprocessor tool produces an output using this format.
    The extra_files_path of the primary input dataset contains the
    following files and directories.
    - chromosome_windows.txt (optional)
    - chromosomes.bed (optional)
    - IDEAS_input_config.txt
    - compressed archived tmp directory containing a number of compressed bed files.
    """
    MetadataElement(name="base_name", desc="Base name for this dataset", default='IDEASData', readonly=True, set_in_upload=True)
    MetadataElement(name="chrom_bed", desc="Bed file specifying window positions", default=None, readonly=True)
    MetadataElement(name="chrom_windows", desc="Chromosome window positions", default=None, readonly=True)
    MetadataElement(name="input_config", desc="IDEAS input config", default=None, readonly=True)
    MetadataElement(name="tmp_archive", desc="Compressed archive of compressed bed files", default=None, readonly=True)

    composite_type = 'auto_primary_file'
    allow_datatype_change = False
    file_ext = 'ideaspre'

    def __init__(self, **kwd):
        Html.__init__(self, **kwd)
        self.add_composite_file('chromosome_windows.txt', description='Chromosome window positions', is_binary=False, optional=True)
        self.add_composite_file('chromosomes.bed', description='Bed file specifying window positions', is_binary=False, optional=True)
        self.add_composite_file('IDEAS_input_config.txt', description='IDEAS input config', is_binary=False)
        self.add_composite_file('tmp.tar.gz', description='Compressed archive of compressed bed files', is_binary=True)

    def set_meta(self, dataset, **kwd):
        """Record the path of each recognized composite file in the dataset metadata,
        then rebuild the primary (HTML index) file."""
        Html.set_meta(self, dataset, **kwd)
        for fname in os.listdir(dataset.extra_files_path):
            # Files are matched by name prefix; see add_composite_file() above.
            if fname.startswith("chromosomes"):
                dataset.metadata.chrom_bed = os.path.join(dataset.extra_files_path, fname)
            elif fname.startswith("chromosome_windows"):
                dataset.metadata.chrom_windows = os.path.join(dataset.extra_files_path, fname)
            elif fname.startswith("IDEAS_input_config"):
                dataset.metadata.input_config = os.path.join(dataset.extra_files_path, fname)
            elif fname.startswith("tmp"):
                dataset.metadata.tmp_archive = os.path.join(dataset.extra_files_path, fname)
        self.regenerate_primary_file(dataset)

    def generate_primary_file(self, dataset=None):
        """Called only at upload time: build the initial HTML index of composite files."""
        rval = ['<html><head></head><body>']
        rval.append('<h3>Files prepared for IDEAS</h3>')
        rval.append('<ul>')
        # The composite-file values were unused; iterate names only.
        for composite_name in self.get_composite_files(dataset=dataset).keys():
            fn = composite_name
            # BUGFIX: the href attribute was missing its closing quote
            # ('<a href="{}>...' produced malformed HTML links).
            rval.append('<li><a href="{}">{}</a></li>'.format(fn, fn))
        rval.append('</ul></body></html>\n')
        return "\n".join(rval)

    def regenerate_primary_file(self, dataset):
        # Cannot do this until we are setting metadata.
        rval = ['<html><head></head><body>']
        rval.append('<h3>Files prepared for IDEAS</h3>')
        rval.append('<ul>')
        for fname in os.listdir(dataset.extra_files_path):
            fn = os.path.split(fname)[-1]
            rval.append('<li><a href="{}">{}</a></li>'.format(fn, fn))
        rval.append('</ul></body></html>')
        with open(dataset.file_name, 'w') as f:
            f.write("\n".join(rval))
            f.write('\n')
class SnpSiftDbNSFP(Text):
    """Class describing a dbNSFP database prepared for use by SnpSift dbnsfp

    The dbNSFP file is a tabular file with 1 header line.
    The first 4 columns are required to be: chrom pos ref alt
    These match columns 1, 2, 4, 5 of the VCF file.
    SnpSift requires the file to be block-gzipped and then indexed with samtools tabix.
    Example:
        # Compress using block-gzip algorithm
        bgzip dbNSFP2.3.txt
        # Create tabix index
        tabix -s 1 -b 2 -e 2 dbNSFP2.3.txt.gz
    """
    # NOTE(review): 'dbSNFP' looks like a typo for 'dbNSFP', but it is a
    # persisted default value, so it is kept for backward compatibility.
    MetadataElement(name='reference_name', default='dbSNFP', desc='Reference Name', readonly=True, visible=True, set_in_upload=True, no_value='dbSNFP')
    MetadataElement(name="bgzip", default=None, desc="dbNSFP bgzip", readonly=True, visible=True, no_value=None)
    MetadataElement(name="index", default=None, desc="Tabix Index File", readonly=True, visible=True, no_value=None)
    MetadataElement(name="annotation", default=[], desc="Annotation Names", readonly=True, visible=True, no_value=[])
    file_ext = "snpsiftdbnsfp"
    composite_type = 'auto_primary_file'

    def __init__(self, **kwd):
        super().__init__(**kwd)
        self.add_composite_file('%s.gz', description='dbNSFP bgzip', substitute_name_with_metadata='reference_name', is_binary=True)
        self.add_composite_file('%s.gz.tbi', description='Tabix Index File', substitute_name_with_metadata='reference_name', is_binary=True)

    def generate_primary_file(self, dataset=None):
        """
        This is called only at upload to write the html file
        cannot rename the datasets here - they come with the default unfortunately
        """
        return '<html><head><title>SnpSiftDbNSFP Composite Dataset</title></head></html>'

    def regenerate_primary_file(self, dataset):
        """
        cannot do this until we are setting metadata
        """
        annotations = "dbNSFP Annotations: %s\n" % ','.join(dataset.metadata.annotation)
        with open(dataset.file_name, 'a') as f:
            if dataset.metadata.bgzip:
                bn = dataset.metadata.bgzip
                f.write(bn)
                f.write('\n')
            f.write(annotations)

    def set_meta(self, dataset, overwrite=True, **kwd):
        """Record the bgzip and tabix files found in extra_files_path and pull the
        annotation column names from the bgzip header line."""
        try:
            efp = dataset.extra_files_path
            if os.path.exists(efp):
                flist = os.listdir(efp)
                for fname in flist:
                    if fname.endswith('.gz'):
                        dataset.metadata.bgzip = fname
                        try:
                            # Read a small chunk; the tab-separated header line
                            # carries the annotation names from column 5 onward.
                            with gzip.open(os.path.join(efp, fname), 'rt') as fh:
                                buf = fh.read(5000)
                                lines = buf.splitlines()
                                headers = lines[0].split('\t')
                                dataset.metadata.annotation = headers[4:]
                        except Exception as e:
                            log.warning("set_meta fname: %s %s", fname, unicodify(e))
                    if fname.endswith('.tbi'):
                        dataset.metadata.index = fname
            self.regenerate_primary_file(dataset)
        except Exception as e:
            # BUGFIX: corrected 'Unkwown' typo in the log message.
            log.warning("set_meta fname: %s %s", dataset.file_name if dataset and dataset.file_name else 'Unknown', unicodify(e))

    def set_peek(self, dataset, is_multi_byte=False):
        if not dataset.dataset.purged:
            dataset.peek = '{} : {}'.format(dataset.metadata.reference_name, ','.join(dataset.metadata.annotation))
            dataset.blurb = '%s' % dataset.metadata.reference_name
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disc'
class Rgenetics(Html):
    """
    base class to use for rgenetics datatypes
    derived from html - composite datatype elements
    stored in extra files path
    """
    MetadataElement(name="base_name", desc="base name for all transformed versions of this genetic dataset", default='RgeneticsData', readonly=True, set_in_upload=True)

    composite_type = 'auto_primary_file'
    allow_datatype_change = False
    file_ext = 'rgenetics'

    def generate_primary_file(self, dataset=None):
        """Called at upload time: build the HTML index of declared composite files."""
        rval = ['<html><head><title>Rgenetics Galaxy Composite Dataset </title></head><p/>']
        rval.append('<div>This composite dataset is composed of the following files:<p/><ul>')
        for composite_name, composite_file in self.get_composite_files(dataset=dataset).items():
            fn = composite_name
            opt_text = ''
            if composite_file.optional:
                opt_text = ' (optional)'
            if composite_file.get('description'):
                rval.append('<li><a href="{}" type="application/binary">{} ({})</a>{}</li>'.format(fn, fn, composite_file.get('description'), opt_text))
            else:
                rval.append('<li><a href="{}" type="application/binary">{}</a>{}</li>'.format(fn, fn, opt_text))
        rval.append('</ul></div></html>')
        return "\n".join(rval)

    def regenerate_primary_file(self, dataset):
        """
        cannot do this until we are setting metadata
        """
        efp = dataset.extra_files_path
        flist = os.listdir(efp)
        rval = ['<html><head><title>Files for Composite Dataset {}</title></head><body><p/>Composite {} contains:<p/><ul>'.format(dataset.name, dataset.name)]
        # IMPROVED: dropped an unused enumerate() index and an unused
        # os.path.splitext() result from the original loop body.
        for fname in flist:
            sfname = os.path.split(fname)[-1]
            rval.append('<li><a href="{}">{}</a></li>'.format(sfname, sfname))
        rval.append('</ul></body></html>')
        with open(dataset.file_name, 'w') as f:
            f.write("\n".join(rval))
            f.write('\n')

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/html'

    def set_meta(self, dataset, **kwd):
        """
        for lped/pbed eg
        """
        Html.set_meta(self, dataset, **kwd)
        if not kwd.get('overwrite'):
            if verbose:
                gal_Log.debug('@@@ rgenetics set_meta called with overwrite = False')
            return True
        try:
            efp = dataset.extra_files_path
        except Exception:
            if verbose:
                gal_Log.debug('@@@rgenetics set_meta failed {} - dataset {} has no efp ?'.format(sys.exc_info()[0], dataset.name))
            return False
        try:
            flist = os.listdir(efp)
        except Exception:
            if verbose:
                gal_Log.debug('@@@rgenetics set_meta failed {} - dataset {} has no efp ?'.format(sys.exc_info()[0], dataset.name))
            return False
        if len(flist) == 0:
            if verbose:
                gal_Log.debug('@@@rgenetics set_meta failed - {} efp {} is empty?'.format(dataset.name, efp))
            return False
        self.regenerate_primary_file(dataset)
        if not dataset.info:
            dataset.info = 'Galaxy genotype datatype object'
        if not dataset.blurb:
            dataset.blurb = 'Composite file - Rgenetics Galaxy toolkit'
        return True
class SnpEffDb(Text):
    """Class describing a SnpEff genome build"""
    edam_format = "format_3624"
    file_ext = "snpeffdb"
    MetadataElement(name="genome_version", default=None, desc="Genome Version", readonly=True, visible=True, no_value=None)
    MetadataElement(name="snpeff_version", default="SnpEff4.0", desc="SnpEff Version", readonly=True, visible=True, no_value=None)
    MetadataElement(name="regulation", default=[], desc="Regulation Names", readonly=True, visible=True, no_value=[], optional=True)
    MetadataElement(name="annotation", default=[], desc="Annotation Names", readonly=True, visible=True, no_value=[], optional=True)

    def __init__(self, **kwd):
        super().__init__(**kwd)

    # The SnpEff version line was added in SnpEff version 4.1
    def getSnpeffVersionFromFile(self, path):
        """Return e.g. 'SnpEff4.1' parsed from the first line of the gzipped
        file at ``path``, or None if the line is absent or unreadable."""
        snpeff_version = None
        try:
            with gzip.open(path, 'rt') as fh:
                buf = fh.read(100)
                lines = buf.splitlines()
                m = re.match(r'^(SnpEff)\s+(\d+\.\d+).*$', lines[0].strip())
                if m:
                    snpeff_version = m.groups()[0] + m.groups()[1]
        except Exception:
            # Best effort: pre-4.1 builds have no version line.
            pass
        return snpeff_version

    def set_meta(self, dataset, **kwd):
        """Walk extra_files_path for a SnpEff genome build, recording genome
        version, SnpEff version, regulation names and flag-enabled annotations,
        then write a short summary as the primary file."""
        super().set_meta(dataset, **kwd)
        data_dir = dataset.extra_files_path
        # search data_dir/genome_version for files
        regulation_pattern = 'regulation_(.+).bin'
        # annotation files that are included in snpEff by a flag
        annotations_dict = {'nextProt.bin': '-nextprot', 'motif.bin': '-motif', 'interactions.bin': '-interaction'}
        regulations = []
        annotations = []
        genome_version = None
        snpeff_version = None
        if data_dir and os.path.isdir(data_dir):
            for root, _, files in os.walk(data_dir):
                for fname in files:
                    if fname.startswith('snpEffectPredictor'):
                        # if snpEffectPredictor.bin download succeeded
                        genome_version = os.path.basename(root)
                        dataset.metadata.genome_version = genome_version
                        # read the first line of the gzipped snpEffectPredictor.bin file to get the SnpEff version
                        snpeff_version = self.getSnpeffVersionFromFile(os.path.join(root, fname))
                        if snpeff_version:
                            dataset.metadata.snpeff_version = snpeff_version
                    else:
                        m = re.match(regulation_pattern, fname)
                        if m:
                            name = m.groups()[0]
                            regulations.append(name)
                        elif fname in annotations_dict:
                            value = annotations_dict[fname]
                            name = value.lstrip('-')
                            annotations.append(name)
            dataset.metadata.regulation = regulations
            dataset.metadata.annotation = annotations
            try:
                with open(dataset.file_name, 'w') as fh:
                    # BUGFIX: the conditional must be parenthesized. Previously
                    # '"%s\n" % genome_version if genome_version else ...'
                    # bound as '("%s\n" % genome_version) if ... else ...', so
                    # the fallback text was written WITHOUT a trailing newline
                    # and the two summary lines ran together.
                    fh.write("%s\n" % (genome_version if genome_version else 'Genome unknown'))
                    fh.write("%s\n" % (snpeff_version if snpeff_version else 'SnpEff version unknown'))
                    if annotations:
                        fh.write("annotations: %s\n" % ','.join(annotations))
                    if regulations:
                        fh.write("regulations: %s\n" % ','.join(regulations))
            except Exception:
                # Best-effort: the primary-file summary is cosmetic.
                pass
class ImgtJson(Json):
    file_ext = "imgt.json"
    MetadataElement(name="taxon_names", default=[], desc="taxonID: names", readonly=True, visible=True, no_value=[])
    """
    https://github.com/repseqio/library-imgt/releases
    Data coming from IMGT server may be used for academic research only,
    provided that it is referred to IMGT®, and cited as:
    "IMGT®, the international ImMunoGeneTics information system®
    http://www.imgt.org (founder and director: Marie-Paule Lefranc, Montpellier, France)."
    """

    def set_peek(self, dataset, is_multi_byte=False):
        super().set_peek(dataset)
        if not dataset.dataset.purged:
            dataset.blurb = "IMGT Library"

    def sniff_prefix(self, file_prefix):
        """
        Determines whether the file is in json format with imgt elements

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( '1.json' )
        >>> ImgtJson().sniff( fname )
        False
        >>> fname = get_test_fname( 'imgt.json' )
        >>> ImgtJson().sniff( fname )
        True
        """
        # Only probe for IMGT markers once the prefix already looks like JSON.
        return self._looks_like_json(file_prefix) and self._looks_like_imgt(file_prefix)

    def _looks_like_imgt(self, file_prefix, load_size=5000):
        """
        @param filepath: [str] The path to the evaluated file.
        @param load_size: [int] The size of the file block load in RAM (in bytes).
        """
        try:
            with open(file_prefix.filename) as handle:
                chunk = handle.read(load_size)
        except Exception:
            return False
        # An IMGT library is a JSON array whose entries carry taxonId/anchorPoints.
        if not chunk.strip().startswith('['):
            return False
        return '"taxonId"' in chunk and '"anchorPoints"' in chunk

    def set_meta(self, dataset, **kwd):
        """
        Store metadata information from the imgt file.
        """
        if not dataset.has_data():
            return
        with open(dataset.file_name) as fh:
            try:
                entries = json.load(fh)
                collected = []
                for entry in entries:
                    if 'taxonId' in entry:
                        collected.append("%d: %s" % (entry['taxonId'], ','.join(entry['speciesNames'])))
                dataset.metadata.taxon_names = collected
            except Exception:
                return
class Biom1(Json):
    """
    BIOM version 1.0 file format description
    http://biom-format.org/documentation/format_versions/biom-1.0.html
    """
    file_ext = "biom1"
    edam_format = "format_3746"

    MetadataElement(name="table_rows", default=[], desc="table_rows", param=MetadataParameter, readonly=True, visible=False, optional=True, no_value=[])
    MetadataElement(name="table_matrix_element_type", default="", desc="table_matrix_element_type", param=MetadataParameter, readonly=True, visible=False, optional=True, no_value="")
    MetadataElement(name="table_format", default="", desc="table_format", param=MetadataParameter, readonly=True, visible=False, optional=True, no_value="")
    MetadataElement(name="table_generated_by", default="", desc="table_generated_by", param=MetadataParameter, readonly=True, visible=True, optional=True, no_value="")
    MetadataElement(name="table_matrix_type", default="", desc="table_matrix_type", param=MetadataParameter, readonly=True, visible=False, optional=True, no_value="")
    MetadataElement(name="table_shape", default=[], desc="table_shape", param=MetadataParameter, readonly=True, visible=False, optional=True, no_value=[])
    MetadataElement(name="table_format_url", default="", desc="table_format_url", param=MetadataParameter, readonly=True, visible=False, optional=True, no_value="")
    MetadataElement(name="table_date", default="", desc="table_date", param=MetadataParameter, readonly=True, visible=True, optional=True, no_value="")
    MetadataElement(name="table_type", default="", desc="table_type", param=MetadataParameter, readonly=True, visible=True, optional=True, no_value="")
    MetadataElement(name="table_id", default=None, desc="table_id", param=MetadataParameter, readonly=True, visible=True, optional=True, no_value=None)
    MetadataElement(name="table_columns", default=[], desc="table_columns", param=MetadataParameter, readonly=True, visible=False, optional=True, no_value=[])
    MetadataElement(name="table_column_metadata_headers", default=[], desc="table_column_metadata_headers", param=MetadataParameter, readonly=True, visible=True, optional=True, no_value=[])

    def set_peek(self, dataset, is_multi_byte=False):
        super().set_peek(dataset)
        if not dataset.dataset.purged:
            dataset.blurb = "Biological Observation Matrix v1"

    def sniff_prefix(self, file_prefix):
        is_biom = False
        if self._looks_like_json(file_prefix):
            is_biom = self._looks_like_biom(file_prefix)
        return is_biom

    def _looks_like_biom(self, file_prefix, load_size=50000):
        """
        @param filepath: [str] The path to the evaluated file.
        @param load_size: [int] The size of the file block load in RAM (in bytes).
        """
        is_biom = False
        segment_size = int(load_size / 2)
        try:
            with open(file_prefix.filename) as fh:
                prev_str = ""
                segment_str = fh.read(segment_size)
                if segment_str.strip().startswith('{'):
                    # Scan overlapping segment pairs so the '"format"' key is
                    # found even when it straddles a read boundary.
                    while segment_str:
                        current_str = prev_str + segment_str
                        if '"format"' in current_str:
                            current_str = re.sub(r'\s', '', current_str)
                            if '"format":"BiologicalObservationMatrix' in current_str:
                                is_biom = True
                                break
                        prev_str = segment_str
                        segment_str = fh.read(segment_size)
        except Exception:
            pass
        return is_biom

    def set_meta(self, dataset, **kwd):
        """
        Store metadata information from the BIOM file.
        """
        if dataset.has_data():
            with open(dataset.file_name) as fh:
                try:
                    json_dict = json.load(fh)
                except Exception:
                    return

                def _transform_dict_list_ids(dict_list):
                    # Reduce a list of row/column dicts to their 'id' values.
                    if dict_list:
                        return [x.get('id', None) for x in dict_list]
                    return []

                b_transform = {'rows': _transform_dict_list_ids, 'columns': _transform_dict_list_ids}
                for (m_name, b_name) in [('table_rows', 'rows'), ('table_matrix_element_type', 'matrix_element_type'), ('table_format', 'format'), ('table_generated_by', 'generated_by'), ('table_matrix_type', 'matrix_type'), ('table_shape', 'shape'), ('table_format_url', 'format_url'), ('table_date', 'date'), ('table_type', 'type'), ('table_id', 'id'), ('table_columns', 'columns')]:
                    try:
                        metadata_value = json_dict.get(b_name, None)
                        if b_name == "columns" and metadata_value:
                            # Collect the union of non-null per-column metadata keys.
                            keep_columns = set()
                            for column in metadata_value:
                                if column['metadata'] is not None:
                                    for k, v in column['metadata'].items():
                                        if v is not None:
                                            keep_columns.add(k)
                            # IMPROVED: sorted() accepts any iterable; the
                            # intermediate list() copy was redundant.
                            final_list = sorted(keep_columns)
                            dataset.metadata.table_column_metadata_headers = final_list
                        if b_name in b_transform:
                            metadata_value = b_transform[b_name](metadata_value)
                        setattr(dataset.metadata, m_name, metadata_value)
                    except Exception:
                        log.exception("Something in the metadata detection for biom1 went wrong.")
class NeperTesr(Binary):
    """
    Neper Raster Tessellation File
    ***tesr
      **format
        format
      **general
        dimension
        size_x size_y [size_z]
        voxsize_x voxsize_y [voxsize_z]
       [*origin
        origin_x origin_y [origin_z]]
       [*hasvoid has_void]
      [**cell
        number_of_cells
    """
    file_ext = "neper.tesr"
    MetadataElement(name="format", default=None, desc="format", readonly=True, visible=True)
    MetadataElement(name="dimension", default=None, desc="dimension", readonly=True, visible=True)
    MetadataElement(name="size", default=[], desc="size", readonly=True, visible=True)
    MetadataElement(name="voxsize", default=[], desc="voxsize", readonly=True, visible=True)
    MetadataElement(name="origin", default=[], desc="origin", readonly=True, visible=True)
    MetadataElement(name="cells", default=None, desc="cells", readonly=True, visible=True)

    # IMPROVED: dropped a no-op __init__ override that only delegated to
    # Binary.__init__ with unchanged arguments.

    def sniff_prefix(self, file_prefix: FilePrefix):
        """
        Neper tesr format startswith:***tesr
        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('test.neper.tesr')
        >>> NeperTesr().sniff(fname)
        True
        >>> fname = get_test_fname('test.neper.tess')
        >>> NeperTesr().sniff(fname)
        False
        """
        # Only the first few bytes are needed to recognize the magic marker.
        return file_prefix.text_io(errors='ignore').readline(10).startswith('***tesr')

    def set_meta(self, dataset, **kwd):
        """Parse the fixed-position header lines of a tesr file into metadata.

        The header layout is positional (see the class docstring): line 2 holds
        the format, 4 the dimension, 5 the sizes, 6 the voxel sizes; *origin and
        **cell sections are located by their field markers.
        """
        if dataset.has_data():
            with open(dataset.file_name, errors='ignore') as fh:
                field = ''
                for i, line in enumerate(fh):
                    line = line.strip()
                    # Header is confined to the first dozen lines.
                    if not line or i > 12:
                        break
                    if i == 0 and not line.startswith('***tesr'):
                        break
                    if line.startswith('*'):
                        # Remember the current section marker for the lines below.
                        field = line
                        continue
                    if i == 2:
                        dataset.metadata.format = line.split()[0]
                        continue
                    if i == 4:
                        dataset.metadata.dimension = line.split()[0]
                        continue
                    if i == 5:
                        dataset.metadata.size = line.split()
                        continue
                    if i == 6:
                        dataset.metadata.voxsize = line.split()
                        continue
                    if field.startswith('*origin'):
                        dataset.metadata.origin = line.split()
                        continue
                    if field.startswith('**cell'):
                        dataset.metadata.cells = int(line)
                        break

    def set_peek(self, dataset):
        if not dataset.dataset.purged:
            dataset.peek = get_file_peek(dataset.file_name, LINE_COUNT=9)
            # IMPROVED: f-string interpolation already stringifies values;
            # the explicit str() calls were redundant (identical output).
            dataset.blurb = f'format: {dataset.metadata.format} dim: {dataset.metadata.dimension} cells: {dataset.metadata.cells}'
        else:
            dataset.peek = 'File does not exist'
            dataset.blurb = 'File purged from disc'
class RexpBase(Html):
    """
    base class for BioC data structures in Galaxy
    must be constructed with the pheno data in place since that
    goes into the metadata for each instance
    """
    MetadataElement(name="columns", default=0, desc="Number of columns", visible=True)
    MetadataElement(name="column_names", default=[], desc="Column names", visible=True)
    MetadataElement(name="pheCols", default=[], desc="Select list for potentially interesting variables", visible=True)
    MetadataElement(name="base_name", desc="base name for all transformed versions of this expression dataset", default='rexpression', set_in_upload=True)
    MetadataElement(name="pheno_path", desc="Path to phenotype data for this experiment", default="rexpression.pheno", visible=True)
    file_ext = 'rexpbase'
    html_table = None
    composite_type = 'auto_primary_file'
    allow_datatype_change = False

    def __init__(self, **kwd):
        Html.__init__(self, **kwd)
        self.add_composite_file('%s.pheno', description='Phenodata tab text file', substitute_name_with_metadata='base_name', is_binary=False)

    def generate_primary_file(self, dataset=None):
        """
        This is called only at upload to write the html file
        cannot rename the datasets here - they come with the default unfortunately
        """
        return '<html><head></head><body>AutoGenerated Primary File for Composite Dataset</body></html>'

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/html'

    def get_phecols(self, phenolist=None, maxConc=20):
        """
        sept 2009: cannot use whitespace to split - make a more complex structure here
        and adjust the methods that rely on this structure
        return interesting phenotype column names for an rexpression eset or affybatch
        to use in array subsetting and so on. Returns a data structure for a
        dynamic Galaxy select parameter.
        A column with only 1 value doesn't change, so is not interesting for
        analysis. A column with a different value in every row is equivalent to a unique
        identifier so is also not interesting for anova or limma analysis - both these
        are removed after the concordance (count of unique terms) is constructed for each
        column. Then a complication - each remaining pair of columns is tested for
        redundancy - if two columns are always paired, then only one is needed :)
        """
        # BUGFIX: the original used a mutable default argument (phenolist=[]).
        phenolist = phenolist or []
        # BUGFIX: initialize header state so an empty phenolist cannot raise
        # NameError on totcols/head/concordance below.
        head = []
        totcols = 0
        concordance = []
        for nrows, row in enumerate(phenolist):  # construct concordance
            if len(row.strip()) == 0:
                break
            row = row.strip().split('\t')
            if nrows == 0:  # set up from header
                head = row
                totcols = len(row)
                concordance = [{} for x in head]  # list of dicts
            else:
                for col, code in enumerate(row):  # keep column order correct
                    if col >= totcols:
                        gal_Log.warning('### get_phecols error in pheno file - row %d col %d (%s) longer than header %s' % (nrows, col, row, head))
                    else:
                        concordance[col].setdefault(code, 0)  # first one is zero
                        concordance[col][code] += 1
        useCols = []
        useConc = []  # columns of interest to keep
        nrows = len(phenolist)
        nrows -= 1  # drop head from count
        for c, conc in enumerate(concordance):  # c is column number
            if (len(conc) > 1) and (len(conc) < min(nrows, maxConc)):  # not all same and not all different!!
                useConc.append(conc)  # keep concordance
                useCols.append(c)  # keep column
        nuse = len(useCols)
        # now to check for pairs of concordant columns - drop one of these.
        delme = []
        p = phenolist[1:]  # drop header
        plist = [x.strip().split('\t') for x in p]  # list of lists
        phe = [[x[i] for i in useCols] for x in plist if len(x) >= totcols]  # strip unused data
        for i in range(0, (nuse - 1)):  # for each interesting column
            for j in range(i + 1, nuse):
                kdict = {}
                for row in phe:  # row is a list of lists
                    k = '{}{}'.format(row[i], row[j])  # composite key
                    kdict[k] = k
                if (len(kdict.keys()) == len(concordance[useCols[j]])):  # i and j are always matched
                    delme.append(j)
        delme = list(set(delme))  # remove dupes
        listCol = []
        delme.sort()
        delme.reverse()  # must delete from far end!
        for i in delme:
            del useConc[i]  # get rid of concordance
            del useCols[i]  # and usecols entry
        for i, conc in enumerate(useConc):  # these are all unique columns for the design matrix
            ccounts = sorted((conc.get(code, 0), code) for code in conc.keys())  # decorate
            cc = [(x[1], x[0]) for x in ccounts]  # list of code count tuples
            codeDetails = (head[useCols[i]], cc)  # ('foo',[('a',3),('b',11),..])
            listCol.append(codeDetails)
        if len(listCol) > 0:
            res = listCol
            # metadata.pheCols becomes [('bar;22,zot;113','foo'), ...]
        else:
            res = [('no usable phenotype columns found', [('?', 0), ]), ]
        return res

    def get_pheno(self, dataset):
        """
        expects a .pheno file in the extra_files_dir - ugh
        note that R is weird and adds the row.name in
        the header so the columns are all wrong - unless you tell it not to.
        A file can be written as
        write.table(file='foo.pheno',pData(foo),sep='\t',quote=F,row.names=F)
        """
        # BUGFIX: close the pheno file (the original leaked the handle).
        with open(dataset.metadata.pheno_path) as f:
            p = f.readlines()
        # BUGFIX: the original indexed p[1] whenever p was non-empty, raising
        # IndexError on a single-line pheno file; require at least two lines.
        if len(p) > 1:  # should only need to fix an R pheno file once
            head = p[0].strip().split('\t')
            line1 = p[1].strip().split('\t')
            if len(head) < len(line1):
                head.insert(0, 'ChipFileName')  # fix R write.table b0rken-ness
                p[0] = '\t'.join(head)
        return '\n'.join(p)

    def set_peek(self, dataset, **kwd):
        """
        expects a .pheno file in the extra_files_dir - ugh
        note that R is weird and does not include the row.name in
        the header. why?
        """
        if not dataset.dataset.purged:
            pp = os.path.join(dataset.extra_files_path, '%s.pheno' % dataset.metadata.base_name)
            try:
                with open(pp) as f:
                    p = f.readlines()
            except Exception:
                p = ['##failed to find %s' % pp, ]
            dataset.peek = ''.join(p[:5])
            dataset.blurb = 'Galaxy Rexpression composite file'
        else:
            dataset.peek = 'file does not exist\n'
            dataset.blurb = 'file purged from disk'

    def get_peek(self, dataset):
        """
        expects a .pheno file in the extra_files_dir - ugh
        """
        pp = os.path.join(dataset.extra_files_path, '%s.pheno' % dataset.metadata.base_name)
        try:
            with open(pp) as f:
                p = f.readlines()
        except Exception:
            p = ['##failed to find %s' % pp]
        return ''.join(p[:5])

    def get_file_peek(self, filename):
        """
        can't really peek at a filename - need the extra_files_path and such?
        """
        h = '## rexpression get_file_peek: no file found'
        try:
            with open(filename) as f:
                h = f.readlines()
        except Exception:
            pass
        return ''.join(h[:5])

    def regenerate_primary_file(self, dataset):
        """
        cannot do this until we are setting metadata
        """
        bn = dataset.metadata.base_name
        flist = os.listdir(dataset.extra_files_path)
        rval = ['<html><head><title>Files for Composite Dataset %s</title></head><p/>Comprises the following files:<p/><ul>' % (bn)]
        for i, fname in enumerate(flist):
            sfname = os.path.split(fname)[-1]
            rval.append('<li><a href="{}">{}</a>'.format(sfname, sfname))
        rval.append('</ul></html>')
        with open(dataset.file_name, 'w') as f:
            f.write("\n".join(rval))
            f.write('\n')

    def init_meta(self, dataset, copy_from=None):
        if copy_from:
            dataset.metadata = copy_from.metadata

    def set_meta(self, dataset, **kwd):
        """
        NOTE we apply the tabular machinary to the phenodata extracted
        from a BioC eSet or affybatch.
        """
        Html.set_meta(self, dataset, **kwd)
        try:
            flist = os.listdir(dataset.extra_files_path)
        except Exception:
            if verbose:
                gal_Log.debug('@@@rexpression set_meta failed - no dataset?')
            return False
        bn = dataset.metadata.base_name
        if not bn:
            for f in flist:
                n = os.path.splitext(f)[0]
                bn = n
                dataset.metadata.base_name = bn
        if not bn:
            bn = '?'
            dataset.metadata.base_name = bn
        pn = '%s.pheno' % (bn)
        pp = os.path.join(dataset.extra_files_path, pn)
        dataset.metadata.pheno_path = pp
        try:
            with open(pp) as f:
                pf = f.readlines()  # read the basename.phenodata in the extra_files_path
        except Exception:
            pf = None
        if pf:
            h = pf[0].strip()
            h = h.split('\t')  # hope is header
            h = [escape(x) for x in h]
            dataset.metadata.column_names = h
            dataset.metadata.columns = len(h)
            dataset.peek = ''.join(pf[:5])
        else:
            dataset.metadata.column_names = []
            dataset.metadata.columns = 0
            dataset.peek = 'No pheno file found'
        if pf and len(pf) > 1:
            dataset.metadata.pheCols = self.get_phecols(phenolist=pf)
        else:
            dataset.metadata.pheCols = [('', 'No useable phenotypes found', False), ]
        if not dataset.info:
            dataset.info = 'Galaxy Expression datatype object'
        if not dataset.blurb:
            dataset.blurb = 'R loadable BioC expression object for the Rexpression Galaxy toolkit'
        return True

    def make_html_table(self, pp='nothing supplied from peek\n'):
        """
        Create HTML table, used for displaying peek
        """
        out = ['<table cellspacing="0" cellpadding="3">', ]
        try:
            # Generate column header
            p = pp.split('\n')
            for i, row in enumerate(p):
                lrow = row.strip().split('\t')
                if i == 0:
                    orow = ['<th>%s</th>' % escape(x) for x in lrow]
                    orow.insert(0, '<tr>')
                    orow.append('</tr>')
                else:
                    orow = ['<td>%s</td>' % escape(x) for x in lrow]
                    orow.insert(0, '<tr>')
                    orow.append('</tr>')
                out.append(''.join(orow))
            out.append('</table>')
            out = "\n".join(out)
        except Exception as exc:
            out = "Can't create html table %s" % unicodify(exc)
        return out

    def display_peek(self, dataset):
        """
        Returns formatted html of peek
        """
        out = self.make_html_table(dataset.peek)
        return out
class Otu(Text):
    file_ext = 'mothur.otu'
    MetadataElement(name="columns", default=0, desc="Number of columns", readonly=True, visible=True, no_value=0)
    MetadataElement(name="labels", default=[], desc="Label Names", readonly=True, visible=True, no_value=[])
    MetadataElement(name="otulabels", default=[], desc="OTU Names", readonly=True, visible=True, no_value=[])

    # IMPROVED: dropped a no-op __init__ that only delegated to the parent
    # with unchanged arguments.

    def set_meta(self, dataset, overwrite=True, **kwd):
        """
        Set metadata for Otu files.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> from galaxy.util.bunch import Bunch
        >>> dataset = Bunch()
        >>> dataset.metadata = Bunch
        >>> otu = Otu()
        >>> dataset.file_name = get_test_fname( 'mothur_datatypetest_true.mothur.otu' )
        >>> dataset.has_data = lambda: True
        >>> otu.set_meta(dataset)
        >>> dataset.metadata.columns
        100
        >>> len(dataset.metadata.labels) == 37
        True
        >>> len(dataset.metadata.otulabels) == 98
        True
        """
        # IMPROVED: zero-argument super() for consistency with the other
        # datatype classes in this file.
        super().set_meta(dataset, overwrite=overwrite, **kwd)
        if dataset.has_data():
            label_names = set()
            otulabel_names = set()
            ncols = 0
            data_lines = 0
            comment_lines = 0
            headers = iter_headers(dataset.file_name, sep='\t', count=-1)
            first_line = get_headers(dataset.file_name, sep='\t', count=1)
            if first_line:
                first_line = first_line[0]
            # set otulabels
            if len(first_line) > 2:
                otulabel_names = first_line[2:]
            # set label names and number of lines
            for line in headers:
                if len(line) >= 2 and not line[0].startswith('@'):
                    data_lines += 1
                    ncols = max(ncols, len(line))
                    label_names.add(line[0])
                else:
                    comment_lines += 1
            # Set the discovered metadata values for the dataset
            dataset.metadata.data_lines = data_lines
            dataset.metadata.columns = ncols
            dataset.metadata.labels = list(label_names)
            dataset.metadata.labels.sort()
            dataset.metadata.otulabels = list(otulabel_names)
            dataset.metadata.otulabels.sort()

    def sniff(self, filename):
        """
        Determines whether the file is otu (operational taxonomic unit) format

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.otu' )
        >>> Otu().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.otu' )
        >>> Otu().sniff( fname )
        False
        """
        headers = iter_headers(filename, sep='\t')
        count = 0
        for line in headers:
            if not line[0].startswith('@'):
                if len(line) < 2:
                    return False
                if count >= 1:
                    # From the second data line on, column 2 must be the OTU
                    # count and the total column count must equal count + 2.
                    try:
                        check = int(line[1])
                        if check + 2 != len(line):
                            return False
                    except ValueError:
                        return False
                count += 1
                if count > 2:
                    return True
        return False
class Tabular( data.Text ):
    """Tab delimited data"""

    # All tabular data is chunkable.
    CHUNKABLE = True

    """Add metadata elements"""
    MetadataElement( name="comment_lines", default=0, desc="Number of comment lines", readonly=False, optional=True, no_value=0 )
    MetadataElement( name="columns", default=0, desc="Number of columns", readonly=True, visible=False, no_value=0 )
    MetadataElement( name="column_types", default=[], desc="Column types", param=metadata.ColumnTypesParameter, readonly=True, visible=False, no_value=[] )
    MetadataElement( name="column_names", default=[], desc="Column names", readonly=True, visible=False, optional=True, no_value=[] )

    def set_meta( self, dataset, overwrite = True, skip = None, max_data_lines = 100000, max_guess_type_data_lines = None, **kwd ):
        """
        Tries to determine the number of columns as well as those columns that
        contain numerical values in the dataset.  A skip parameter is used
        because various tabular data types reuse this function, and their data
        type classes are responsible to determine how many invalid comment
        lines should be skipped. Using None for skip will cause skip to be
        zero, but the first line will be processed as a header. A
        max_data_lines parameter is used because various tabular data types
        reuse this function, and their data type classes are responsible to
        determine how many data lines should be processed to ensure that the
        non-optional metadata parameters are properly set; if used, optional
        metadata parameters will be set to None, unless the entire file has
        already been read. Using None for max_data_lines will process all data
        lines.

        Items of interest:

        1. We treat 'overwrite' as always True (we always want to set tabular metadata when called).
        2. If a tabular file has no data, it will have one column of type 'str'.
        3. We used to check only the first 100 lines when setting metadata and
           this class's set_peek() method read the entire file to determine the
           number of lines in the file.  Since metadata can now be processed on
           cluster nodes, we've merged the line count portion of the set_peek()
           processing here, and we now check the entire contents of the file.
        """
        # Store original skip value to check with later
        requested_skip = skip
        if skip is None:
            skip = 0
        column_type_set_order = [ 'int', 'float', 'list', 'str' ]  # Order to set column types in
        default_column_type = column_type_set_order[-1]  # Default column type is lowest in list
        column_type_compare_order = list( column_type_set_order )  # Order to compare column types
        column_type_compare_order.reverse()

        def type_overrules_type( column_type1, column_type2 ):
            # True when column_type1 should replace column_type2 for a column.
            if column_type1 is None or column_type1 == column_type2:
                return False
            if column_type2 is None:
                return True
            for column_type in column_type_compare_order:
                if column_type1 == column_type:
                    return True
                if column_type2 == column_type:
                    return False
            # Neither column type was found in our ordered list; this cannot happen.
            # Fixed: raising a plain string is a TypeError in Python 3 -- raise a real exception.
            raise ValueError( "Tried to compare unknown column types" )

        def is_int( column_text ):
            try:
                int( column_text )
                return True
            except ValueError:  # fixed: was a bare except
                return False

        def is_float( column_text ):
            try:
                float( column_text )
                return True
            except ValueError:  # fixed: was a bare except
                if column_text.strip().lower() == 'na':
                    return True  # na is special cased to be a float
                return False

        def is_list( column_text ):
            return "," in column_text

        def is_str( column_text ):
            # anything, except an empty string, is True
            if column_text == "":
                return False
            return True

        is_column_type = {}  # Dict to store column type string to checking function
        for column_type in column_type_set_order:
            is_column_type[column_type] = locals()[ "is_%s" % ( column_type ) ]

        def guess_column_type( column_text ):
            for column_type in column_type_set_order:
                if is_column_type[column_type]( column_text ):
                    return column_type
            return None

        data_lines = 0
        comment_lines = 0
        column_types = []
        first_line_column_types = [default_column_type]  # default value is one column of type str
        if dataset.has_data():
            # NOTE: if skip > num_check_lines, we won't detect any metadata, and will use default
            # Fixed: use a context manager so the file handle is closed even if parsing raises.
            with open( dataset.file_name ) as dataset_fh:
                i = 0
                while True:
                    line = dataset_fh.readline()
                    if not line:
                        break
                    line = line.rstrip( '\r\n' )
                    if i < skip or not line or line.startswith( '#' ):
                        # We'll call blank lines comments
                        comment_lines += 1
                    else:
                        data_lines += 1
                        if max_guess_type_data_lines is None or data_lines <= max_guess_type_data_lines:
                            fields = line.split( '\t' )
                            for field_count, field in enumerate( fields ):
                                if field_count >= len( column_types ):  # found a previously unknown column, we append None
                                    column_types.append( None )
                                column_type = guess_column_type( field )
                                if type_overrules_type( column_type, column_types[field_count] ):
                                    column_types[field_count] = column_type
                    if i == 0 and requested_skip is None:
                        # This is our first line, people seem to like to upload files that have a header line, but do not
                        # start with '#' (i.e. all column types would then most likely be detected as str).  We will assume
                        # that the first line is always a header (this was previous behavior - it was always skipped).  When
                        # the requested skip is None, we only use the data from the first line if we have no other data for
                        # a column.  This is far from perfect, as
                        # 1,2,3	1.1	2.2	qwerty
                        # 0	0	1,2,3
                        # will be detected as
                        # "column_types": ["int", "int", "float", "list"]
                        # instead of
                        # "column_types": ["list", "float", "float", "str"]  *** would seem to be the 'Truth' by manual
                        # observation that the first line should be included as data.  The old method would have detected as
                        # "column_types": ["int", "int", "str", "list"]
                        first_line_column_types = column_types
                        column_types = [ None for col in first_line_column_types ]
                    if max_data_lines is not None and data_lines >= max_data_lines:
                        if dataset_fh.tell() != dataset.get_size():
                            data_lines = None  # Clear optional data_lines metadata value
                            comment_lines = None  # Clear optional comment_lines metadata value; additional comment lines could appear below this point
                        break
                    i += 1

        # we error on the larger number of columns
        # first we pad our column_types by using data from first line
        if len( first_line_column_types ) > len( column_types ):
            for column_type in first_line_column_types[len( column_types ):]:
                column_types.append( column_type )
        # Now we fill any unknown (None) column_types with data from first line
        for i in range( len( column_types ) ):
            if column_types[i] is None:
                if len( first_line_column_types ) <= i or first_line_column_types[i] is None:
                    column_types[i] = default_column_type
                else:
                    column_types[i] = first_line_column_types[i]
        # Set the discovered metadata values for the dataset
        dataset.metadata.data_lines = data_lines
        dataset.metadata.comment_lines = comment_lines
        dataset.metadata.column_types = column_types
        dataset.metadata.columns = len( column_types )

    def make_html_table( self, dataset, **kwargs ):
        """Create HTML table, used for displaying peek"""
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            out.append( self.make_html_peek_header( dataset, **kwargs ) )
            out.append( self.make_html_peek_rows( dataset, **kwargs ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception as exc:  # fixed: Python 2-only "except Exception, exc" syntax
            out = "Can't create peek %s" % str( exc )
        return out
class Quantile(Tabular):
    """Mothur quantiles file, used for chimera analysis."""
    file_ext = 'mothur.quan'
    MetadataElement(name="filtered", default=False, no_value=False, optional=True, desc="Quantiles calculated using a mask", readonly=True)
    MetadataElement(name="masked", default=False, no_value=False, optional=True, desc="Quantiles calculated using a frequency filter", readonly=True)

    def __init__(self, **kwd):
        """Quantiles for chimera analysis"""
        super(Quantile, self).__init__(**kwd)
        self.column_names = ['num', 'ten', 'twentyfive', 'fifty', 'seventyfive', 'ninetyfive', 'ninetynine']
        self.column_types = ['int', 'float', 'float', 'float', 'float', 'float', 'float']

    def sniff(self, filename):
        """
        Determines whether the file is a quantiles tabular format for chimera analysis
        1	0	0	0	0	0	0
        2	0.309198	0.309198	0.37161	0.37161	0.37161	0.37161
        3	0.510982	0.563213	0.693529	0.858939	1.07442	1.20608
        ...

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.quan' )
        >>> Quantile().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.quan' )
        >>> Quantile().sniff( fname )
        False
        """
        valid_rows = 0
        for fields in iter_headers(filename, sep='\t'):
            # Skip comment-style rows entirely.
            if fields[0].startswith('@') or fields[0].startswith('#'):
                continue
            # Every data row must have exactly seven columns:
            # one int followed by six floats.
            if len(fields) != 7:
                return False
            try:
                int(fields[0])
                for value in fields[1:]:
                    float(value)
            except Exception:
                return False
            valid_rows += 1
        return valid_rows > 0
class Pileup( Tabular ):
    """Tab delimited data in pileup (6- or 10-column) format"""
    file_ext = "pileup"
    line_class = "genomic coordinate"
    data_sources = { "data": "tabix" }

    """Add metadata elements"""
    MetadataElement( name="chromCol", default=1, desc="Chrom column", param=metadata.ColumnParameter )
    MetadataElement( name="startCol", default=2, desc="Start column", param=metadata.ColumnParameter )
    MetadataElement( name="endCol", default=2, desc="End column", param=metadata.ColumnParameter )
    MetadataElement( name="baseCol", default=3, desc="Reference base column", param=metadata.ColumnParameter )

    def init_meta( self, dataset, copy_from=None ):
        """Initialize tabular metadata, optionally copying from another dataset."""
        Tabular.init_meta( self, dataset, copy_from=copy_from )

    def display_peek( self, dataset ):
        """Returns formated html of peek"""
        return Tabular.make_html_table( self, dataset, column_parameter_alias={'chromCol': 'Chrom', 'startCol': 'Start', 'baseCol': 'Base'} )

    def repair_methods( self, dataset ):
        """Return options for removing errors along with a description"""
        return [ ( "lines", "Remove erroneous lines" ) ]

    def sniff( self, filename ):
        """
        Checks for 'pileup-ness'

        There are two main types of pileup: 6-column and 10-column. For both,
        the first three and last two columns are the same. We only check the
        first three to allow for some personalization of the format.

        >>> fname = get_test_fname( 'interval.interval' )
        >>> Pileup().sniff( fname )
        False
        >>> fname = get_test_fname( '6col.pileup' )
        >>> Pileup().sniff( fname )
        True
        >>> fname = get_test_fname( '10col.pileup' )
        >>> Pileup().sniff( fname )
        True
        """
        headers = get_headers( filename, '\t' )
        try:
            for hdr in headers:
                if hdr and not hdr[0].startswith( '#' ):
                    if len( hdr ) < 3:
                        return False
                    # Start coordinate is in column 1 (with 0-based columns)
                    # and the reference base is in column 2.
                    # Fixed: was a bare except around an assert -- narrow the
                    # exception and avoid assert-based validation (stripped
                    # under python -O).
                    try:
                        int( hdr[1] )
                    except ValueError:
                        return False
                    if hdr[2] not in [ 'A', 'C', 'G', 'T', 'N', 'a', 'c', 'g', 't', 'n' ]:
                        return False
            return True
        except Exception:  # fixed: was a bare except
            return False

    # ------------- Dataproviders
    @dataproviders.decorators.dataprovider_factory( 'genomic-region', dataproviders.dataset.GenomicRegionDataProvider.settings )
    def genomic_region_dataprovider( self, dataset, **settings ):
        """Provide (chrom, start, end)-style regions from the pileup columns."""
        return dataproviders.dataset.GenomicRegionDataProvider( dataset, **settings )

    @dataproviders.decorators.dataprovider_factory( 'genomic-region-dict', dataproviders.dataset.GenomicRegionDataProvider.settings )
    def genomic_region_dict_dataprovider( self, dataset, **settings ):
        """Same as genomic_region_dataprovider, but each region is a named dict."""
        settings[ 'named_columns' ] = True
        return self.genomic_region_dataprovider( dataset, **settings )
class Arff(Text):
    """
    An ARFF (Attribute-Relation File Format) file is an ASCII text file that
    describes a list of instances sharing a set of attributes.
    http://weka.wikispaces.com/ARFF
    """
    file_ext = "arff"

    """Add metadata elements"""
    MetadataElement(name="comment_lines", default=0, desc="Number of comment lines", readonly=True, optional=True, no_value=0)
    MetadataElement(name="columns", default=0, desc="Number of columns", readonly=True, visible=True, no_value=0)

    def set_peek(self, dataset, is_multi_byte=False):
        """Build the peek/blurb shown in the UI from the stored metadata."""
        if not dataset.dataset.purged:
            dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = "Attribute-Relation File Format (ARFF)"
            dataset.blurb += ", %s comments, %s attributes" % (dataset.metadata.comment_lines, dataset.metadata.columns)
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disc'

    def sniff(self, filename):
        """
        Try to guess the ARFF filetype.

        A valid ARFF file declares a ``@RELATION`` and at least one
        ``@ATTRIBUTE`` before its ``@DATA`` section.  Only the first 1000
        lines are examined.
        (Fixed: the previous docstring described a different format's
        "format-version:"/"id:" stanzas, which this code never checks.)
        """
        with open(filename) as handle:
            relation_found = False
            attribute_found = False
            for line_count, line in enumerate(handle):
                if line_count > 1000:
                    # only investigate the first 1000 lines
                    return False
                line = line.strip()
                if not line:
                    continue
                start_string = line[:20].upper()
                if start_string.startswith("@RELATION"):
                    relation_found = True
                elif start_string.startswith("@ATTRIBUTE"):
                    attribute_found = True
                elif start_string.startswith("@DATA"):
                    # @DATA should be the last data block
                    return relation_found and attribute_found
        return False

    def set_meta(self, dataset, **kwd):
        """
        Trying to count the comment lines and the number of columns included.
        A typical ARFF data block looks like this:
        @DATA
        5.1,3.5,1.4,0.2,Iris-setosa
        4.9,3.0,1.4,0.2,Iris-setosa
        """
        if dataset.has_data():
            comment_lines = 0
            # Fixed: initialize so we don't hit a NameError below when the
            # file contains no @DATA section or no data lines.
            column_count = 0
            first_real_line = False
            data_block = False
            with open(dataset.file_name) as handle:
                for line in handle:
                    line = line.strip()
                    if not line:
                        continue
                    if line.startswith('%') and not first_real_line:
                        comment_lines += 1
                    else:
                        first_real_line = True
                    if data_block:
                        if line.startswith('{'):
                            # Sparse representation, e.g.
                            #   @data
                            #   0, X, 0, Y, "class A", {5}
                            # or
                            #   @data
                            #   {1 X, 3 Y, 4 "class A"}, {5}
                            token = line.split('}', 1)
                            first_part = token[0]
                            last_column = first_part.split(',')[-1].strip()
                            numeric_value = last_column.split()[0]
                            column_count = int(numeric_value)
                            if len(token) > 1:
                                # we have an additional weight
                                column_count -= 1
                        else:
                            columns = line.strip().split(',')
                            column_count = len(columns)
                            if columns[-1].strip().startswith('{'):
                                # we have an additional weight at the end
                                column_count -= 1
                        # We now have the column_count and we know the initial
                        # comment lines, so we can terminate here.
                        break
                    if line[:5].upper() == "@DATA":
                        data_block = True
            dataset.metadata.comment_lines = comment_lines
            dataset.metadata.columns = column_count
class Infernal_CM_1_1(Text):
    """Infernal 1.1 covariance model (CM) file, possibly containing several models."""
    file_ext = "cm"

    MetadataElement(name="number_of_models", default=0, desc="Number of covariance models", readonly=True, visible=True, optional=True, no_value=0)

    def set_peek(self, dataset, is_multi_byte=False):
        """Set peek/blurb; the blurb reports the model count."""
        if not dataset.dataset.purged:
            if (dataset.metadata.number_of_models == 1):
                dataset.blurb = "1 model"
            else:
                dataset.blurb = "%s models" % dataset.metadata.number_of_models
            # Fixed: the peek was computed twice (once via get_file_peek and
            # then overwritten via data.get_file_peek); keep only the final one.
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disc'

    def sniff(self, filename):
        """A CM file must contain at least one 'INFERNAL1/a' record header."""
        return count_special_lines("^INFERNAL1/a", filename) > 0

    def set_meta(self, dataset, **kwd):
        """
        Set the number of models in dataset.
        """
        dataset.metadata.number_of_models = count_special_lines("^INFERNAL1/a", dataset.file_name)

    # Fixed: the first parameter is ``cls`` but the decorator was missing;
    # ``split`` is a classmethod in the datatype hierarchy.
    @classmethod
    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by model records.
        """
        if split_params is None:
            return None
        if len(input_datasets) > 1:
            raise Exception("CM-file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            raise Exception('Split mode "%s" is currently not implemented for CM-files.' % split_params['split_mode'])
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        def _read_cm_records(filename):
            # Yield each model as the list of its lines; a record starts at
            # every 'INFERNAL1/a' header line.
            lines = []
            with open(filename) as handle:
                for line in handle:
                    if line.startswith("INFERNAL1/a") and lines:
                        yield lines
                        lines = [line]
                    else:
                        lines.append(line)
                yield lines

        def _write_part_cm_file(accumulated_lines):
            # Write one chunk of accumulated records into a fresh part dir.
            part_dir = subdir_generator_function()
            part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
            # Fixed: use a context manager so the part file is closed on error.
            with open(part_path, 'w') as part_file:
                part_file.writelines(accumulated_lines)

        try:
            cm_records = _read_cm_records(input_files[0])
            cm_lines_accumulated = []
            for counter, cm_record in enumerate(cm_records, start=1):
                cm_lines_accumulated.extend(cm_record)
                if counter % chunk_size == 0:
                    _write_part_cm_file(cm_lines_accumulated)
                    cm_lines_accumulated = []
            if cm_lines_accumulated:
                _write_part_cm_file(cm_lines_accumulated)
        except Exception as e:  # fixed: Python 2-only "except Exception, e" syntax
            log.error('Unable to split files: %s' % str(e))
            raise
class SnpSiftDbNSFP(Text):
    """Class describing a dbNSFP database prepared for use by SnpSift dbnsfp"""
    MetadataElement(name='reference_name', default='dbSNFP', desc='Reference Name', readonly=True, visible=True, set_in_upload=True, no_value='dbSNFP')
    MetadataElement(name="bgzip", default=None, desc="dbNSFP bgzip", readonly=True, visible=True, no_value=None)
    MetadataElement(name="index", default=None, desc="Tabix Index File", readonly=True, visible=True, no_value=None)
    MetadataElement(name="annotation", default=[], desc="Annotation Names", readonly=True, visible=True, no_value=[])
    file_ext = "snpsiftdbnsfp"
    composite_type = 'auto_primary_file'
    allow_datatype_change = False
    """
    ## The dbNSFP file is a tabular file with 1 header line
    ## The first 4 columns are required to be: chrom	pos	ref	alt
    ## These match columns 1,2,4,5 of the VCF file
    ## SnpSift requires the file to be block-gzipped and the indexed with samtools tabix
    ## Example:
    ## Compress using block-gzip algorithm
    bgzip dbNSFP2.3.txt
    ## Create tabix index
    tabix -s 1 -b 2 -e 2 dbNSFP2.3.txt.gz
    """

    def __init__(self, **kwd):
        Text.__init__(self, **kwd)
        self.add_composite_file('%s.grp', description='Group File', substitute_name_with_metadata='reference_name', is_binary=False)
        self.add_composite_file('%s.ti', description='', substitute_name_with_metadata='reference_name', is_binary=False)

    def init_meta(self, dataset, copy_from=None):
        Text.init_meta(self, dataset, copy_from=copy_from)

    def generate_primary_file(self, dataset=None):
        """
        This is called only at upload to write the html file
        cannot rename the datasets here - they come with the default unfortunately
        """
        self.regenerate_primary_file(dataset)

    def regenerate_primary_file(self, dataset):
        """
        cannot do this until we are setting metadata
        """
        annotations = "dbNSFP Annotations: %s\n" % ','.join(dataset.metadata.annotation)
        # Fixed: use a context manager so the primary file is closed on error.
        with open(dataset.file_name, 'a') as f:
            if dataset.metadata.bgzip:
                bn = dataset.metadata.bgzip
                f.write(bn)
                f.write('\n')
            f.write(annotations)

    def set_meta(self, dataset, overwrite=True, **kwd):
        """Locate the bgzip/tabix companion files and read the annotation names."""
        try:
            efp = dataset.extra_files_path
            if os.path.exists(efp):
                # Fixed: dropped unused enumerate() index.
                for fname in os.listdir(efp):
                    if fname.endswith('.gz'):
                        dataset.metadata.bgzip = fname
                        try:
                            # Fixed: open in text mode ('rt'); binary mode made
                            # the '\t' split fail under Python 3.  The old
                            # try/finally also referenced ``fh`` when
                            # gzip.open itself failed (NameError); the
                            # context manager avoids both problems.
                            with gzip.open(os.path.join(efp, fname), 'rt') as fh:
                                buf = fh.read(5000)
                                lines = buf.splitlines()
                                headers = lines[0].split('\t')
                                # Annotation names follow the four mandatory
                                # chrom/pos/ref/alt columns.
                                dataset.metadata.annotation = headers[4:]
                        except Exception as e:
                            log.warn("set_meta fname: %s %s" % (fname, str(e)))
                    if fname.endswith('.tbi'):
                        dataset.metadata.index = fname
            self.regenerate_primary_file(dataset)
        except Exception as e:
            log.warn("set_meta fname: %s %s" % (dataset.file_name if dataset and dataset.file_name else 'Unkwown', str(e)))
class Vtk:
    r"""
    The Visualization Toolkit provides a number of source and writer objects to
    read and write popular data file formats. The Visualization Toolkit also
    provides some of its own file formats.

    There are two different styles of file formats available in VTK. The
    simplest are the legacy, serial formats that are easy to read and write
    either by hand or programmatically. However, these formats are less
    flexible than the XML based file formats which support random access,
    parallel I/O, and portable data compression and are preferred to the
    serial VTK file formats whenever possible.

    All keyword phrases are written in ASCII form whether the file is binary or
    ASCII. The binary section of the file (if in binary form) is the data
    proper; i.e., the numbers that define points coordinates, scalars, cell
    indices, and so forth. Binary data must be placed into the file immediately
    after the newline ('\\n') character from the previous ASCII keyword and
    parameter sequence.

    TODO: only legacy formats are currently supported and support for XML
    formats should be added.
    """
    # Subclasses set this to 'ASCII' or 'BINARY'; sniffing matches it against line 3.
    subtype = ''
    # Add metadata elements.
    MetadataElement(name="vtk_version", default=None, desc="Vtk version", readonly=True, optional=True, visible=True)
    MetadataElement(name="file_format", default=None, desc="File format", readonly=True, optional=True, visible=True)
    MetadataElement(name="dataset_type", default=None, desc="Dataset type", readonly=True, optional=True, visible=True)
    # STRUCTURED_GRID data_type.
    MetadataElement(name="dimensions", default=[], desc="Dimensions", readonly=True, optional=True, visible=True, no_value=[])
    MetadataElement(name="origin", default=[], desc="Origin", readonly=True, optional=True, visible=True, no_value=[])
    MetadataElement(name="spacing", default=[], desc="Spacing", readonly=True, optional=True, visible=True, no_value=[])
    # POLYDATA data_type (Points element is also a component of UNSTRUCTURED_GRID..
    MetadataElement(name="points", default=None, desc="Points", readonly=True, optional=True, visible=True)
    MetadataElement(name="vertices", default=None, desc="Vertices", readonly=True, optional=True, visible=True)
    MetadataElement(name="lines", default=None, desc="Lines", readonly=True, optional=True, visible=True)
    MetadataElement(name="polygons", default=None, desc="Polygons", readonly=True, optional=True, visible=True)
    MetadataElement(name="triangle_strips", default=None, desc="Triangle strips", readonly=True, optional=True, visible=True)
    # UNSTRUCTURED_GRID data_type.
    MetadataElement(name="cells", default=None, desc="Cells", readonly=True, optional=True, visible=True)
    # Additional elements not categorized by data_type.
    MetadataElement(name="field_names", default=[], desc="Field names", readonly=True, optional=True, visible=True, no_value=[])
    # The keys in the field_components map to the list of field_names in the above element
    # which ensures order for select list options that are built from it.
    MetadataElement(name="field_components", default={}, desc="Field names and components", readonly=True, optional=True, visible=True, no_value={})

    @abc.abstractmethod
    def __init__(self, **kwd):
        raise NotImplementedError

    def sniff_prefix(self, file_prefix: FilePrefix):
        """
        VTK files can be either ASCII or binary, with two different
        styles of file formats: legacy or XML.  We'll assume if the
        file contains a valid VTK header, then it is a valid VTK file.
        """
        if self._is_vtk_header(file_prefix.text_io(errors='ignore'), self.subtype):
            return True
        return False

    def _is_vtk_header(self, fh, subtype):
        """
        The Header section consists of at least 4, but possibly 5 lines.
        This is tricky because sometimes the 4th line is blank (in which
        case the 5th line consists of the data_kind) or the 4th line
        consists of the data_kind (in which case the 5th line is blank).
        """
        data_kinds = ['STRUCTURED_GRID', 'POLYDATA', 'UNSTRUCTURED_GRID', 'STRUCTURED_POINTS', 'RECTILINEAR_GRID']

        def check_data_kind(line):
            # True when the line mentions one of the known legacy dataset kinds.
            for data_kind in data_kinds:
                if line.find(data_kind) >= 0:
                    return True
            return False

        # Line 1: vtk DataFile Version 3.0
        line = get_next_line(fh)
        if line.find('vtk') < 0:
            return False
        # Line 2: can be anything - skip it
        line = get_next_line(fh)
        # Line 3: ASCII or BINARY
        line = get_next_line(fh)
        if line.find(subtype) < 0:
            return False
        # Line 4:
        line = get_next_line(fh)
        if line:
            return check_data_kind(line)
        # line 5:
        line = get_next_line(fh)
        if line:
            return check_data_kind(line)
        return False

    def set_meta(self, dataset, **kwd):
        """Parse the legacy VTK header/structure/attribute sections and record
        the discovered values on dataset.metadata.  Stops interpreting data
        lines once the dataset-attributes section (POINT_DATA/CELL_DATA) has
        been reached."""
        if dataset.has_data():
            dataset.metadata.field_names = []
            dataset.metadata.field_components = {}
            dataset_type = None
            field_components = {}
            # Becomes True once POINT_DATA/CELL_DATA is seen: from then on we
            # are in the dataset-attributes part of the file.
            dataset_structure_complete = False
            # True while consuming the lines of a FIELD declaration.
            processing_field_section = False
            with open(dataset.file_name, errors='ignore') as fh:
                for i, line in enumerate(fh):
                    line = line.strip()
                    if not line:
                        continue
                    if i < 3:
                        # First three lines: version, title, ASCII/BINARY.
                        dataset = self.set_initial_metadata(i, line, dataset)
                    elif dataset.metadata.file_format == 'ASCII' or not util.is_binary(line):
                        if dataset_structure_complete:
                            """
                            The final part of legacy VTK files describes the dataset attributes.
                            This part begins with the keywords POINT_DATA or CELL_DATA, followed
                            by an integer number specifying the number of points or cells,
                            respectively. Other keyword/data combinations then define the actual
                            dataset attribute values (i.e., scalars, vectors, tensors, normals,
                            texture coordinates, or field data). Dataset attributes are supported
                            for both points and cells.

                            Each type of attribute data has a dataName associated with it. This is
                            a character string (without embedded whitespace) used to identify a
                            particular data. The dataName is used by the VTK readers to extract
                            data. As a result, more than one attribute data of the same type can
                            be included in a file. For example, two different scalar fields
                            defined on the dataset points, pressure and temperature, can be
                            contained in the same file. If the appropriate dataName is not
                            specified in the VTK reader, then the first data of that type is
                            extracted from the file.
                            """
                            items = line.split()
                            if items[0] == 'SCALARS':
                                # Example: SCALARS surface_field double 3
                                # Scalar definition includes specification of a lookup table. The
                                # definition of a lookup table is optional. If not specified, the
                                # default VTK table will be used, and tableName should be
                                # "default". Also note that the numComp variable is optional. By
                                # default the number of components is equal to one. The parameter
                                # numComp must range between (1,4) inclusive; in versions of VTK
                                # prior to vtk2.3 this parameter was not supported.
                                field_name = items[1]
                                dataset.metadata.field_names.append(field_name)
                                try:
                                    num_components = int(items[-1])
                                except Exception:
                                    # numComp omitted -> defaults to one component.
                                    num_components = 1
                                field_component_indexes = [str(i) for i in range(num_components)]
                                field_components[field_name] = field_component_indexes
                            elif items[0] == 'FIELD':
                                # The dataset consists of CELL_DATA.
                                # FIELD FieldData 2
                                processing_field_section = True
                                num_fields = int(items[-1])
                                fields_processed: List[str] = []
                            elif processing_field_section:
                                if len(fields_processed) == num_fields:
                                    processing_field_section = False
                                else:
                                    try:
                                        float(items[0])
                                        # Don't process the cell data.
                                        # 0.0123457 0.197531
                                    except Exception:
                                        # Line consists of arrayName numComponents numTuples dataType.
                                        # Example: surface_field1 1 12 double
                                        field_name = items[0]
                                        dataset.metadata.field_names.append(field_name)
                                        num_components = int(items[1])
                                        field_component_indexes = [str(i) for i in range(num_components)]
                                        field_components[field_name] = field_component_indexes
                                        fields_processed.append(field_name)
                        elif line.startswith('CELL_DATA'):
                            # CELL_DATA 3188
                            dataset_structure_complete = True
                            dataset.metadata.cells = int(line.split()[1])
                        elif line.startswith('POINT_DATA'):
                            # POINT_DATA 1876
                            dataset_structure_complete = True
                            dataset.metadata.points = int(line.split()[1])
                        else:
                            # Still in the geometry/topology section.
                            dataset, dataset_type = self.set_structure_metadata(line, dataset, dataset_type)
            if len(field_components) > 0:
                dataset.metadata.field_components = field_components

    def set_initial_metadata(self, i, line, dataset):
        """Record version (line 0) and file format (line 2) from the header;
        line 1 is a free-form title and is skipped."""
        if i == 0:
            # The first part of legacy VTK files is the file version and
            # identifier. This part contains the single line:
            #
            # vtk DataFile Version X.Y
            dataset.metadata.vtk_version = line.lower().split('version')[1]
            # The second part of legacy VTK files is the header. The header
            # consists of a character string terminated by end-of-line
            # character \n. The header is 256 characters maximum. The header
            # can be used to describe the data and include any other pertinent
            # information. We skip the header line...
        elif i == 2:
            # The third part of legacy VTK files is the file format. The file
            # format describes the type of file, either ASCII or binary. On
            # this line the single word ASCII or BINARY must appear.
            dataset.metadata.file_format = line
        return dataset

    def set_structure_metadata(self, line, dataset, dataset_type):
        """
        The fourth part of legacy VTK files is the dataset structure. The
        geometry part describes the geometry and topology of the dataset.
        This part begins with a line containing the keyword DATASET followed
        by a keyword describing the type of dataset. Then, depending upon
        the type of dataset, other keyword/data combinations define the
        actual data.
        """
        if dataset_type is None and line.startswith('DATASET'):
            dataset_type = line.split()[1]
            dataset.metadata.dataset_type = dataset_type
        if dataset_type == 'STRUCTURED_GRID':
            # The STRUCTURED_GRID format supports 1D, 2D, and 3D structured
            # grid datasets. The dimensions nx, ny, nz must be greater
            # than or equal to 1. The point coordinates are defined by the
            # data in the POINTS section. This consists of x-y-z data values
            # for each point.
            # Note: each value below is stored as a one-element list holding
            # the token list (e.g. [['10', '5', '1']]).
            if line.startswith('DIMENSIONS'):
                # DIMENSIONS 10 5 1
                dataset.metadata.dimensions = [line.split()[1:]]
            elif line.startswith('ORIGIN'):
                # ORIGIN 0 0 0
                dataset.metadata.origin = [line.split()[1:]]
            elif line.startswith('SPACING'):
                # SPACING 1 1 1
                dataset.metadata.spacing = [line.split()[1:]]
        elif dataset_type == 'POLYDATA':
            # The polygonal dataset consists of arbitrary combinations
            # of surface graphics primitives vertices, lines, polygons
            # and triangle strips. Polygonal data is defined by the POINTS,
            # VERTICES, LINES, POLYGONS, or TRIANGLE_STRIPS sections.
            if line.startswith('POINTS'):
                # POINTS 18 float
                dataset.metadata.points = int(line.split()[1])
            elif line.startswith('VERTICES'):
                dataset.metadata.vertices = int(line.split()[1])
            elif line.startswith('LINES'):
                # LINES 5 17
                dataset.metadata.lines = int(line.split()[1])
            elif line.startswith('POLYGONS'):
                # POLYGONS 6 30
                dataset.metadata.polygons = int(line.split()[1])
            elif line.startswith('TRIANGLE_STRIPS'):
                # TRIANGLE_STRIPS 2212 16158
                dataset.metadata.triangle_strips = int(line.split()[1])
        elif dataset_type == 'UNSTRUCTURED_GRID':
            # The unstructured grid dataset consists of arbitrary combinations
            # of any possible cell type. Unstructured grids are defined by points,
            # cells, and cell types.
            if line.startswith('POINTS'):
                # POINTS 18 float
                dataset.metadata.points = int(line.split()[1])
            if line.startswith('CELLS'):
                # CELLS 756 3024
                dataset.metadata.cells = int(line.split()[1])
        return dataset, dataset_type

    def get_blurb(self, dataset):
        """Build a short description from version and dataset type metadata."""
        blurb = ""
        if dataset.metadata.vtk_version is not None:
            blurb += f'VTK Version {str(dataset.metadata.vtk_version)}'
        if dataset.metadata.dataset_type is not None:
            if blurb:
                blurb += ' '
            blurb += str(dataset.metadata.dataset_type)
        return blurb or 'VTK data'

    def set_peek(self, dataset):
        """Set the dataset peek text and blurb shown in the UI."""
        if not dataset.dataset.purged:
            dataset.peek = get_file_peek(dataset.file_name)
            dataset.blurb = self.get_blurb(dataset)
        else:
            dataset.peek = 'File does not exist'
            dataset.blurb = 'File purged from disc'

    def display_peek(self, dataset):
        """Return the stored peek, falling back to a size summary."""
        try:
            return dataset.peek
        except Exception:
            return f"Vtk file ({nice_size(dataset.get_size())})"
class Gdm(Tabular):
    """Tab delimited data in Gdm format"""
    file_ext = "gdm"
    column_names = ['chr', 'left', 'right', 'strand', 'name', 'score']

    MetadataElement(name='columns', default='6', desc='Number of Columns', readonly=True, visible=False)
    MetadataElement(name='column_types', default=['str', 'int', 'int', 'str', 'str', 'float'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False)

    def display_peek(self, dataset):
        """Returns formatted html of peek"""
        return self.make_html_table(dataset, column_names=self.column_names)

    def sniff(self, filename):
        """
        Determines whether a file is in gdm format.

        GDM files have at least 6 required fields. (Actually in the format
        definition only the first 5 are mandatory, but the ones returned by
        the system always have at least 6.)
        Required fields must be tab separated.
        Columns 0, 3, 4 must be strings. Columns 1, 2, 5 numbers.
        Column 5 (Score) can be not provided ('.').
        Only the first non-comment row is validated.
        """
        headers = get_headers(filename, '\t', count=10)
        try:
            for hdr in headers:
                if hdr and hdr[0] and not hdr[0].startswith('#'):
                    if len(hdr) != 6:
                        return False
                    # left/right coordinates must be integers.
                    try:
                        int(hdr[1])
                        int(hdr[2])
                    except ValueError:  # fixed: was a bare except
                        return False
                    # score is either '.' (missing) or a float.
                    if hdr[5] != '.':
                        try:
                            float(hdr[5])
                        except ValueError:  # fixed: was a bare except
                            return False
                    return True
        except Exception:  # fixed: was a bare except
            return False
        # Fixed: explicitly return False (instead of an implicit None) when
        # every inspected row is a comment.
        return False
class Ply:
    """
    The PLY format describes an object as a collection of vertices, faces
    and other elements, along with properties such as color and normal
    direction that can be attached to these elements.  A PLY file contains
    the description of exactly one object.
    """
    subtype = ''
    # Add metadata elements.
    MetadataElement(name="file_format", default=None, desc="File format",
                    readonly=True, optional=True, visible=True)
    MetadataElement(name="vertex", default=None, desc="Vertex",
                    readonly=True, optional=True, visible=True)
    MetadataElement(name="face", default=None, desc="Face",
                    readonly=True, optional=True, visible=True)
    MetadataElement(name="other_elements", default=[], desc="Other elements",
                    readonly=True, optional=True, visible=True, no_value=[])

    @abc.abstractmethod
    def __init__(self, **kwd):
        raise NotImplementedError

    def sniff_prefix(self, file_prefix: FilePrefix):
        """
        The structure of a typical PLY file:
        Header, Vertex List, Face List, (lists of other elements)
        """
        return self._is_ply_header(file_prefix.text_io(errors='ignore'),
                                   self.subtype)

    def _is_ply_header(self, fh, subtype):
        """
        The header is a series of carriage-return terminated lines of
        text that describe the remainder of the file.
        """
        valid_header_items = ['comment', 'obj_info', 'element', 'property']
        # Line 1: ply
        line = get_next_line(fh)
        if line != 'ply':
            return False
        # Line 2: format ascii 1.0
        line = get_next_line(fh)
        if line.find(subtype) < 0:
            return False
        stop_index = 0
        for line in util.iter_start_of_line(fh, MAX_LINE_LEN):
            line = line.strip()
            stop_index += 1
            if line == 'end_header':
                return True
            items = line.split()
            # A blank line inside the header is not valid PLY; fail the
            # sniff cleanly instead of raising IndexError on items[0].
            if not items or items[0] not in valid_header_items:
                return False
            if stop_index > MAX_HEADER_LINES:
                # If this is a PLY file, there must be an unusually
                # large number of comments.
                break
        return False

    def set_meta(self, dataset, **kwd):
        """Parse the header and record file format, vertex/face counts and
        any other element counts in the dataset metadata."""
        if dataset.has_data():
            with open(dataset.file_name, errors='ignore') as fh:
                for line in fh:
                    line = line.strip()
                    if not line:
                        continue
                    if line.startswith('format'):
                        items = line.split()
                        dataset.metadata.file_format = items[1]
                    elif line == 'end_header':
                        # Metadata is complete.
                        break
                    elif line.startswith('element'):
                        items = line.split()
                        if items[1] == 'face':
                            dataset.metadata.face = int(items[2])
                        elif items[1] == 'vertex':
                            dataset.metadata.vertex = int(items[2])
                        else:
                            element_tuple = (items[1], int(items[2]))
                            dataset.metadata.other_elements.append(element_tuple)

    def set_peek(self, dataset):
        if not dataset.dataset.purged:
            dataset.peek = get_file_peek(dataset.file_name)
            dataset.blurb = f"Faces: {str(dataset.metadata.face)}, Vertices: {str(dataset.metadata.vertex)}"
        else:
            dataset.peek = 'File does not exist'
            dataset.blurb = 'File purged from disc'

    def display_peek(self, dataset):
        try:
            return dataset.peek
        except Exception:
            return f"Ply file ({nice_size(dataset.get_size())})"
class Maf(Alignment):
    """Class describing a Maf alignment"""
    file_ext = "maf"

    # Readonly and optional, users can't unset it, but if it is not set, we
    # are generally ok; if required use a metadata validator in the tool
    # definition
    MetadataElement(name="blocks", default=0, desc="Number of blocks", readonly=True, optional=True, visible=False, no_value=0)
    MetadataElement(name="species_chromosomes", desc="Species Chromosomes", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True)
    MetadataElement(name="maf_index", desc="MAF Index File", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True)

    def init_meta(self, dataset, copy_from=None):
        Alignment.init_meta(self, dataset, copy_from=copy_from)

    def set_meta(self, dataset, overwrite=True, **kwd):
        """
        Parses and sets species, chromosomes, index from MAF file.
        """
        # these metadata values are not accessable by users, always overwrite
        indexes, species, species_chromosomes, blocks = COPIED_build_maf_index_species_chromosomes(dataset.file_name)
        if indexes is None:
            return  # this is not a MAF file
        dataset.metadata.species = species
        dataset.metadata.blocks = blocks
        # write species chromosomes to a file
        chrom_file = dataset.metadata.species_chromosomes
        if not chrom_file:
            chrom_file = dataset.metadata.spec['species_chromosomes'].param.new_file(dataset=dataset)
        # Text mode ('w'), not 'wb': we write str lines, which would raise
        # TypeError on a binary handle under Python 3.
        with open(chrom_file.file_name, 'w') as chrom_out:
            for spec, chroms in species_chromosomes.items():
                chrom_out.write("%s\t%s\n" % (spec, "\t".join(chroms)))
        dataset.metadata.species_chromosomes = chrom_file
        index_file = dataset.metadata.maf_index
        if not index_file:
            index_file = dataset.metadata.spec['maf_index'].param.new_file(dataset=dataset)
        indexes.write(open(index_file.file_name, 'wb'))
        dataset.metadata.maf_index = index_file

    def set_peek(self, dataset, is_multi_byte=False):
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            if dataset.metadata.blocks:
                dataset.blurb = "%s blocks" % util.commaify(str(dataset.metadata.blocks))
            else:
                # Number of blocks is not known ( this should not happen ),
                # and auto-detect is needed to set metadata
                dataset.blurb = "? blocks"
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek(self, dataset):
        """Returns formated html of peek"""
        return self.make_html_table(dataset)

    def make_html_table(self, dataset, skipchars=[]):
        """Create HTML table, used for displaying peek"""
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            out.append('<tr><th>Species: ')
            for species in dataset.metadata.species:
                out.append('%s ' % species)
            out.append('</th></tr>')
            if not dataset.peek:
                dataset.set_peek()
            data = dataset.peek
            lines = data.splitlines()
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                out.append('<tr><td>%s</td></tr>' % escape(line))
            out.append('</table>')
            out = "".join(out)
        except Exception as exc:
            # Python 3 'as' syntax; 'except Exception, exc' is a SyntaxError.
            out = "Can't create peek %s" % exc
        return out
class NeperTess(data.Text):
    """
    Neper Tessellation File

    ***tess
      **format
        format
      **general
        dim type
      **cell
        number_of_cells
    """
    file_ext = "neper.tess"

    MetadataElement(name="format", default=None, desc="format", readonly=True, visible=True)
    MetadataElement(name="dimension", default=None, desc="dimension", readonly=True, visible=True)
    MetadataElement(name="cells", default=None, desc="cells", readonly=True, visible=True)

    def __init__(self, **kwd):
        # Cooperative super() call rather than a hard-coded base-class call.
        super().__init__(**kwd)

    def sniff_prefix(self, file_prefix: FilePrefix):
        """
        Neper tess format startswith:***tess

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('test.neper.tess')
        >>> NeperTess().sniff(fname)
        True
        >>> fname = get_test_fname('test.neper.tesr')
        >>> NeperTess().sniff(fname)
        False
        """
        # Reading at most 10 chars is enough to match the 7-char magic.
        return file_prefix.text_io(errors='ignore').readline(10).startswith('***tess')

    def set_meta(self, dataset, **kwd):
        """Read format (line 3), dimension (line 5) and cell count (line 7)
        from the fixed-position tessellation header."""
        if dataset.has_data():
            with open(dataset.file_name, errors='ignore') as fh:
                for i, line in enumerate(fh):
                    line = line.strip()
                    # Header is at most 7 lines; stop on blank line or past it.
                    if not line or i > 6:
                        break
                    if i == 0 and not line.startswith('***tess'):
                        break
                    if i == 2:
                        dataset.metadata.format = line
                    if i == 4:
                        dataset.metadata.dimension = int(line.split()[0])
                    if i == 6:
                        dataset.metadata.cells = int(line)

    def set_peek(self, dataset):
        if not dataset.dataset.purged:
            dataset.peek = get_file_peek(dataset.file_name, LINE_COUNT=7)
            dataset.blurb = f'format: {str(dataset.metadata.format)} dim: {str(dataset.metadata.dimension)} cells: {str(dataset.metadata.cells)}'
        else:
            dataset.peek = 'File does not exist'
            dataset.blurb = 'File purged from disc'
class Maf(Alignment):
    """Class describing a Maf alignment"""
    edam_format = "format_3008"
    file_ext = "maf"

    # Readonly and optional, users can't unset it, but if it is not set, we
    # are generally ok; if required use a metadata validator in the tool
    # definition
    MetadataElement(name="blocks", default=0, desc="Number of blocks", readonly=True, optional=True, visible=False, no_value=0)
    MetadataElement(name="species_chromosomes", desc="Species Chromosomes", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True)
    MetadataElement(name="maf_index", desc="MAF Index File", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True)

    def init_meta(self, dataset, copy_from=None):
        Alignment.init_meta(self, dataset, copy_from=copy_from)

    def set_meta(self, dataset, overwrite=True, **kwd):
        """
        Parses and sets species, chromosomes, index from MAF file.
        """
        # these metadata values are not accessable by users, always overwrite
        # Imported here to avoid circular dependency
        from galaxy.tools.util.maf_utilities import build_maf_index_species_chromosomes
        indexes, species, species_chromosomes, blocks = build_maf_index_species_chromosomes(dataset.file_name)
        if indexes is None:
            return  # this is not a MAF file
        dataset.metadata.species = species
        dataset.metadata.blocks = blocks
        # write species chromosomes to a file
        chrom_file = dataset.metadata.species_chromosomes
        if not chrom_file:
            chrom_file = dataset.metadata.spec['species_chromosomes'].param.new_file(dataset=dataset)
        # Text mode ('w'), not 'wb': we write str lines, which would raise
        # TypeError on a binary handle under Python 3.
        with open(chrom_file.file_name, 'w') as chrom_out:
            for spec, chroms in species_chromosomes.items():
                chrom_out.write("%s\t%s\n" % (spec, "\t".join(chroms)))
        dataset.metadata.species_chromosomes = chrom_file
        index_file = dataset.metadata.maf_index
        if not index_file:
            index_file = dataset.metadata.spec['maf_index'].param.new_file(dataset=dataset)
        indexes.write(open(index_file.file_name, 'wb'))
        dataset.metadata.maf_index = index_file

    def set_peek(self, dataset, is_multi_byte=False):
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            if dataset.metadata.blocks:
                dataset.blurb = "%s blocks" % util.commaify(str(dataset.metadata.blocks))
            else:
                # Number of blocks is not known ( this should not happen ),
                # and auto-detect is needed to set metadata
                dataset.blurb = "? blocks"
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek(self, dataset):
        """Returns formated html of peek"""
        return self.make_html_table(dataset)

    def make_html_table(self, dataset, skipchars=[]):
        """Create HTML table, used for displaying peek"""
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            out.append('<tr><th>Species: ')
            for species in dataset.metadata.species:
                out.append('%s ' % species)
            out.append('</th></tr>')
            if not dataset.peek:
                dataset.set_peek()
            data = dataset.peek
            lines = data.splitlines()
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                out.append('<tr><td>%s</td></tr>' % escape(line))
            out.append('</table>')
            out = "".join(out)
        except Exception as exc:
            out = "Can't create peek %s" % exc
        return out

    def sniff(self, filename):
        """
        Determines wether the file is in maf format

        The .maf format is line-oriented. Each multiple alignment ends with
        a blank line. Each sequence in an alignment is on a single line,
        which can get quite long, but there is no length limit. Words in a
        line are delimited by any white space. Lines starting with # are
        considered to be comments. Lines starting with ## can be ignored by
        most programs, but contain meta-data of one form or another.

        The first line of a .maf file begins with ##maf. This word is
        followed by white-space-separated variable=value pairs. There
        should be no white space surrounding the "=".

        For complete details see http://genome.ucsc.edu/FAQ/FAQformat#format5

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'sequence.maf' )
        >>> Maf().sniff( fname )
        True
        >>> fname = get_test_fname( 'sequence.fasta' )
        >>> Maf().sniff( fname )
        False
        """
        headers = get_headers(filename, None)
        try:
            if len(headers) > 1 and headers[0][0] and headers[0][0] == "##maf":
                return True
            return False
        except Exception:
            # Narrowed from a bare 'except:' which also swallowed
            # KeyboardInterrupt/SystemExit; a short first row still means
            # "not MAF" rather than an error.
            return False
class CML(GenericXml):
    """
    Chemical Markup Language
    http://cml.sourceforge.net/
    """
    file_ext = "cml"

    MetadataElement(name="number_of_molecules", default=0, desc="Number of molecules", readonly=True, visible=True, optional=True, no_value=0)

    def set_meta(self, dataset, **kwd):
        """
        Set the number of molecules in dataset.
        """
        # Raw string: '\s' in a plain literal is an invalid escape sequence
        # (DeprecationWarning today, SyntaxError in future Python versions).
        dataset.metadata.number_of_molecules = count_special_lines(r'^\s*<molecule', dataset.file_name)

    def set_peek(self, dataset, is_multi_byte=False):
        if not dataset.dataset.purged:
            if (dataset.metadata.number_of_molecules == 1):
                dataset.blurb = "1 molecule"
            else:
                dataset.blurb = "%s molecules" % dataset.metadata.number_of_molecules
            dataset.peek = get_file_peek(dataset.file_name)
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff_prefix(self, file_prefix):
        """
        Try to guess if the file is a CML file.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('interval.interval')
        >>> CML().sniff(fname)
        False
        >>> fname = get_test_fname('drugbank_drugs.cml')
        >>> CML().sniff(fname)
        True
        """
        # Both the XML declaration and the CML schema URI must be present.
        for expected_string in ['<?xml version="1.0"?>', 'http://www.xml-cml.org/schema']:
            if expected_string not in file_prefix.contents_header:
                return False
        return True

    @classmethod
    def split(cls, input_datasets, subdir_generator_function, split_params):
        """
        Split the input files by molecule records.
        """
        if split_params is None:
            return None
        if len(input_datasets) > 1:
            raise Exception("CML-file splitting does not support multiple files")
        input_files = [ds.file_name for ds in input_datasets]

        chunk_size = None
        if split_params['split_mode'] == 'number_of_parts':
            raise Exception('Split mode "%s" is currently not implemented for CML-files.' % split_params['split_mode'])
        elif split_params['split_mode'] == 'to_size':
            chunk_size = int(split_params['split_size'])
        else:
            raise Exception('Unsupported split mode %s' % split_params['split_mode'])

        def _read_cml_records(filename):
            # Yield the lines of each <molecule>...</molecule> record,
            # skipping the shared XML/cml wrapper lines.
            lines = []
            with open(filename) as handle:
                for line in handle:
                    if line.lstrip().startswith('<?xml version="1.0"?>') or \
                            line.lstrip().startswith('<cml xmlns="http://www.xml-cml.org/schema') or \
                            line.lstrip().startswith('</cml>'):
                        continue
                    lines.append(line)
                    if line.lstrip().startswith('</molecule>'):
                        yield lines
                        lines = []

        header_lines = ['<?xml version="1.0"?>\n', '<cml xmlns="http://www.xml-cml.org/schema">\n']
        footer_line = ['</cml>\n']

        def _write_part_cml_file(accumulated_lines):
            # Wrap the accumulated molecule records in a complete CML document.
            part_dir = subdir_generator_function()
            part_path = os.path.join(part_dir, os.path.basename(input_files[0]))
            with open(part_path, 'w') as part_file:
                part_file.writelines(header_lines)
                part_file.writelines(accumulated_lines)
                part_file.writelines(footer_line)

        try:
            cml_records = _read_cml_records(input_files[0])
            cml_lines_accumulated = []
            for counter, cml_record in enumerate(cml_records, start=1):
                cml_lines_accumulated.extend(cml_record)
                if counter % chunk_size == 0:
                    _write_part_cml_file(cml_lines_accumulated)
                    cml_lines_accumulated = []
            if cml_lines_accumulated:
                _write_part_cml_file(cml_lines_accumulated)
        except Exception as e:
            log.error('Unable to split files: %s' % str(e))
            raise

    @staticmethod
    def merge(split_files, output_file):
        """
        Merging CML files.
        """
        if len(split_files) == 1:
            # For one file only, use base class method (move/copy)
            return data.Text.merge(split_files, output_file)
        if not split_files:
            raise ValueError("Given no CML files, %r, to merge into %s" % (split_files, output_file))
        with open(output_file, "w") as out:
            for filename in split_files:
                with open(filename) as handle:
                    header = handle.readline()
                    if not header:
                        raise ValueError("CML file %s was empty" % filename)
                    if not header.lstrip().startswith('<?xml version="1.0"?>'):
                        out.write(header)
                        raise ValueError("%s is not a valid XML file!" % filename)
                    line = handle.readline()
                    header += line
                    if not line.lstrip().startswith('<cml xmlns="http://www.xml-cml.org/schema'):
                        out.write(header)
                        raise ValueError("%s is not a CML file!" % filename)
                    molecule_found = False
                    for line in handle.readlines():
                        # We found two required header lines, the next line
                        # should start with <molecule >
                        if line.lstrip().startswith('</cml>'):
                            continue
                        if line.lstrip().startswith('<molecule'):
                            molecule_found = True
                        if molecule_found:
                            out.write(line)
            out.write("</cml>\n")