def sniff_prefix(self, file_prefix: FilePrefix):
    """Report whether either Turtle directive pattern occurs in the prefix.

    ``TURTLE_PREFIX_PATTERN`` is tried first, then ``TURTLE_BASE_PATTERN``.
    An example of a line the sniffer is meant to recognise::

        @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
    """
    found = (
        file_prefix.search(TURTLE_PREFIX_PATTERN)
        or file_prefix.search(TURTLE_BASE_PATTERN)
    )
    return bool(found)
def sniff_prefix(self, file_prefix: FilePrefix):
    """Quick test on the file's heading line.

    The prefix must begin with the four fixed, tab-separated column names.
    The file is then accepted when the last column of the header line is
    ``fraction``, or when the prefix is truncated and consists of nothing
    but that header line.
    """
    required_start = "fcs_files\tcluster_id\tlabel\tfcs_names"
    if not file_prefix.startswith(required_start):
        return False

    first_line = file_prefix.string_io().readline()
    last_column = first_line.strip().split("\t")[-1]
    if last_column == 'fraction':
        return True

    # Header cut short by truncation: the whole prefix is the header line.
    return bool(
        file_prefix.truncated
        and file_prefix.string_io().read() == first_line
    )
def sniff_prefix(self, file_prefix: FilePrefix):
    """Check that the first record is a ``>`` header followed by integers.

    Blank lines and ``#`` comment lines before the first record are skipped.
    Only the first record is inspected.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname('sequence.fasta')
    >>> QualityScore454().sniff(fname)
    False
    >>> fname = get_test_fname('sequence.qual454')
    >>> QualityScore454().sniff(fname)
    True
    """
    fh = file_prefix.string_io()
    for raw in fh:
        stripped = raw.strip()
        if not stripped or stripped.startswith('#'):
            continue
        # First non-empty, non-comment line: it must be a header.
        if not stripped.startswith('>'):
            return False
        scores = fh.readline().strip()
        if scores == '' or scores.startswith('>'):
            return False
        try:
            for token in scores.split():
                int(token)
        except Exception:
            return False
        return True
    return False
def sniff_prefix(self, file_prefix: FilePrefix):
    """Check for ``>`` headers each followed by equal-length integer rows.

    Lines starting with ``#`` are skipped. Every other line must be a ``>``
    header whose following line is a non-empty, whitespace-separated list of
    integers. All score lines must contain the same number of values.

    Returns True as soon as more than 10 valid records have been seen, or at
    the end of the prefix if at least one valid record was found.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname('sequence.fasta')
    >>> QualityScoreSOLiD().sniff(fname)
    False
    >>> fname = get_test_fname('sequence.qualsolid')
    >>> QualityScoreSOLiD().sniff(fname)
    True
    """
    fh = file_prefix.string_io()
    readlen = None
    goodblock = 0
    for line in fh:
        line = line.strip()
        if line.startswith('#'):
            continue
        # NOTE(review): a blank line lands here and rejects the file, as in
        # the previous implementation - confirm that is intended.
        if not line.startswith('>'):
            return False
        line = fh.readline().strip()
        if line == '' or line.startswith('>'):
            return False
        tokens = line.split()
        try:
            for token in tokens:
                int(token)
        except ValueError:
            return False
        # SOLiD reads should be of the same length. This is an explicit
        # comparison rather than an assert so it still runs under ``-O``.
        if readlen is None:
            readlen = len(tokens)
        elif len(tokens) != readlen:
            return False
        goodblock += 1
        if goodblock > 10:
            return True
    return goodblock > 0
def sniff_prefix(self, file_prefix: FilePrefix):
    """Determine whether the file is a velveth produced fasta format.

    The id line has 3 tab-separated fields (sequence_name, sequence_index,
    category) and is followed by a sequence line::

        >SEQUENCE_0_length_35   1       1
        GGATATAGGGCCAACCCAACTCAACGGCCTGTCTT
        >SEQUENCE_1_length_35   2       1
        CGACGAATGACAGGTCACGAATTTGGCGGGGATTA

    Only the first non-empty line and the line after it are examined.
    """
    fh = file_prefix.string_io()
    for raw in fh:
        header = raw.strip()
        if not header:
            continue
        # The pattern starts with '>', so a non-header line fails here too.
        if not re.match(r'>[^\t]+\t\d+\t\d+$', header):
            return False
        # The following line must be neither blank nor another header.
        sequence = fh.readline().strip()
        return not (sequence == '' or sequence.startswith('>'))
    return False
def sniff_prefix(self, file_prefix: FilePrefix):
    """Determine whether the file is blastxml.

    The first three lines must be, after stripping whitespace, the XML
    declaration, one of two accepted BlastOutput DOCTYPE lines, and the
    ``<BlastOutput>`` root tag.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname('megablast_xml_parser_test1.blastxml')
    >>> BlastXml().sniff(fname)
    True
    >>> fname = get_test_fname('tblastn_four_human_vs_rhodopsin.blastxml')
    >>> BlastXml().sniff(fname)
    True
    >>> fname = get_test_fname('interval.interval')
    >>> BlastXml().sniff(fname)
    False
    """
    accepted_doctypes = (
        '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">',
        '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">',
    )
    handle = file_prefix.string_io()

    xml_declaration = handle.readline().strip()
    if xml_declaration != '<?xml version="1.0"?>':
        return False

    doctype = handle.readline().strip()
    if doctype not in accepted_doctypes:
        return False

    root_tag = handle.readline().strip()
    return root_tag == '<BlastOutput>'
def sniff_prefix(self, file_prefix: FilePrefix):
    """Determine whether the file is a SpectraST generated file.

    Two consecutive ``Msp.next_line_starts_with`` checks are made on the
    same stream: first for ``Name:``, then for ``LibID:``. The second check
    is only made when the first one succeeds.
    """
    stream = file_prefix.string_io()
    has_name = Msp.next_line_starts_with(stream, "Name:")
    return has_name and Msp.next_line_starts_with(stream, "LibID:")
def sniff_prefix(self, file_prefix: FilePrefix):
    """Try to guess the Arff filetype.

    The file is accepted when a line starting with ``@DATA`` is reached
    after both an ``@RELATION`` line and an ``@ATTRIBUTE`` line have been
    seen. Keywords are matched case-insensitively and blank lines are
    ignored. The search gives up once the line index exceeds 1000.
    """
    handle = file_prefix.string_io()
    seen_relation = False
    seen_attribute = False
    for index, raw in enumerate(handle):
        if index > 1000:
            # Only investigate the start of the file.
            return False
        keyword = raw.strip()[:20].upper()
        if not keyword:
            continue
        if keyword.startswith("@RELATION"):
            seen_relation = True
        elif keyword.startswith("@ATTRIBUTE"):
            seen_attribute = True
        elif keyword.startswith("@DATA") and seen_relation and seen_attribute:
            # @DATA should be the last block, after relation and attributes.
            return True
    return False
def sniff_prefix(self, file_prefix: FilePrefix):
    """Determine whether the file is an amos assembly file format.

    Example::

        {CTG
        iid:1
        eid:1
        seq:
        CCTCTCCTGTAGAGTTCAACCGA-GCCGGTAGAGTTTTATCA
        .
        qlt:
        DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD
        .
        {TLE
        src:1027
        off:0
        clr:618,0
        gap:
        250 612
        .
        }
        }

    Returns True on the first line that, once stripped, is exactly
    ``{RED``, ``{CTG`` or ``{TLE``. Iteration stops at the first empty
    string yielded by the line iterator.
    """
    for raw in file_prefix.line_iterator():
        if not raw:
            break  # EOF
        candidate = raw.strip()
        if candidate.startswith('{') and re.match(r'{(RED|CTG|TLE)$', candidate):
            return True
    return False
def _has_root_element_in_prefix(self, file_prefix: FilePrefix, root):
    """Check whether the first non-``<?`` line opens the given root element.

    Lines starting with ``<?`` (for example the XML declaration) are skipped.
    The first other line is matched against ``<root`` or ``<ns:root`` for
    any namespace prefix ``ns``.

    :param file_prefix: prefix of the file being sniffed
    :param root: root element name; it is interpolated into the regular
        expression unescaped
    :returns: True if the root element is found, False otherwise, including
        when the prefix yields no lines at all
    """
    # Start from an empty string so an empty prefix gives False rather than
    # an UnboundLocalError on the loop variable.
    line = ''
    for line in file_prefix.line_iterator():
        if not line.startswith('<?'):
            break
    # pattern match <root or <ns:root for any ns string
    pattern = r'^<(\w*:)?%s' % root
    return re.match(pattern, line) is not None
def sniff_prefix(self, file_prefix: FilePrefix):
    """Each file must have one or more data blocks.

    The start of a data block is defined by the keyword ``data_`` followed
    by an optional string for identification (e.g. ``data_images``).
    Non-blank lines before the first ``data_`` line are skipped. The first
    non-blank line after a ``data_`` line decides the result: it must start
    with ``loop_`` or ``_``.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname('1.star')
    >>> Star().sniff(fname)
    True
    >>> fname = get_test_fname('interval.interval')
    >>> Star().sniff(fname)
    False
    """
    seen_data_block = False
    for raw in file_prefix.line_iterator():
        text = raw.strip()
        if not text:
            continue
        if text.startswith("data_"):
            seen_data_block = True
            continue
        if seen_data_block:
            # The first content line of a data block settles the answer.
            return text.startswith(("loop_", "_"))
    return False
def sniff_prefix(self, file_prefix: FilePrefix):
    """Determine whether the file is a Gifti file.

    The first line must start with an XML 1.0 declaration. The file is then
    accepted if the second line is the known GIFTI DOCTYPE, or if the
    ``<GIFTI`` root tag appears on the second line (no DOCTYPE present) or
    on the third line (a different DOCTYPE present).

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname('Human.colin.R.activations.label.gii')
    >>> Gifti().sniff(fname)
    True
    >>> fname = get_test_fname('interval.interval')
    >>> Gifti().sniff(fname)
    False
    >>> fname = get_test_fname('megablast_xml_parser_test1.blastxml')
    >>> Gifti().sniff(fname)
    False
    >>> fname = get_test_fname('tblastn_four_human_vs_rhodopsin.blastxml')
    >>> Gifti().sniff(fname)
    False
    """
    handle = file_prefix.string_io()
    if not handle.readline().strip().startswith('<?xml version="1.0"'):
        return False

    second_line = handle.readline().strip()
    if second_line == '<!DOCTYPE GIFTI SYSTEM "http://www.nitrc.org/frs/download.php/1594/gifti.dtd">':
        return True
    # Without a DOCTYPE the root element directly follows the declaration.
    if second_line.startswith('<GIFTI'):
        return True

    third_line = handle.readline().strip()
    return third_line.startswith('<GIFTI')
def sniff_prefix(self, file_prefix: FilePrefix):
    """Validate the three header lines, then count ``3 2`` marker lines.

    * Line 1: all tokens must be integers; the first is the marker count.
    * Line 2: exactly four tokens, all parseable as floats.
    * Line 3: all integers; the token count and the last token must both
      equal the marker count from line 1.
    * Later lines: those whose first two tokens are ``3`` and ``2`` are
      counted.

    At most ``self.max_lines`` line indices are examined. The file is
    accepted only if at least one ``3 2`` line was counted.

    >>> classname = DataIn
    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> extn_true = classname().file_ext
    >>> file_true = get_test_fname("linkstudies." + extn_true)
    >>> classname().sniff(file_true)
    True
    >>> false_files = list(LinkageStudies.test_files)
    >>> false_files.remove("linkstudies." + extn_true)
    >>> result_true = []
    >>> for fname in false_files:
    ...     file_false = get_test_fname(fname)
    ...     res = classname().sniff(file_false)
    ...     if res:
    ...         result_true.append(fname)
    >>>
    >>> result_true
    []
    """
    intermarkers = 0
    num_markers = None
    fio = file_prefix.string_io()
    for lcount, line in enumerate(fio):
        if lcount > self.max_lines:
            break
        tokens = line.split()
        try:
            if lcount == 0:
                num_markers = int(tokens[0])
                # Convert eagerly: a bare map() is lazy in Python 3 and
                # would never raise on a non-integer token.
                for token in tokens[1:]:
                    int(token)
            elif lcount == 1:
                for token in tokens:
                    float(token)
                if len(tokens) != 4:
                    return False
            elif lcount == 2:
                values = [int(token) for token in tokens]
                last_token = values[-1]
                if len(tokens) != last_token or num_markers != last_token:
                    return False
            elif tokens[0] == "3" and tokens[1] == "2":
                intermarkers += 1
        except (ValueError, IndexError):
            # Non-numeric token, or a line with too few tokens.
            return False
    return intermarkers > 0
def sniff_prefix(self, file_prefix: FilePrefix):
    """Determine whether the file is the correct XML type.

    Lines that start with ``<?`` after stripping are skipped. The first
    other line is searched for ``<root`` or ``<ns:root``, where ``root`` is
    ``self.root`` (interpolated into the pattern unescaped) and ``ns`` is
    any namespace prefix.

    Returns False when the prefix yields no lines at all.
    """
    # Start from an empty string so an empty prefix gives False rather than
    # an UnboundLocalError on the loop variable.
    line = ''
    for line in file_prefix.line_iterator():
        line = line.strip()
        if not line.startswith('<?'):
            break
    # pattern match <root or <ns:root for any ns string
    pattern = r'<(\w*:)?%s' % self.root
    return re.search(pattern, line) is not None
def sniff_prefix(self, file_prefix: FilePrefix):
    """Accept the file when its header passes ``_is_ply_header``.

    The structure of a typical PLY file:
    Header, Vertex List, Face List, (lists of other elements).

    The prefix is read as text with decoding errors ignored, and checked
    against ``self.subtype``.
    """
    stream = file_prefix.text_io(errors='ignore')
    header_ok = self._is_ply_header(stream, self.subtype)
    return bool(header_ok)
def sniff_prefix(self, file_prefix: FilePrefix):
    """Accept the file when its header passes ``_is_vtk_header``.

    VTK files can be either ASCII or binary, with two different styles of
    file formats: legacy or XML. A file with a valid VTK header is assumed
    to be a valid VTK file.

    The prefix is read as text with decoding errors ignored, and checked
    against ``self.subtype``.
    """
    stream = file_prefix.text_io(errors='ignore')
    header_ok = self._is_vtk_header(stream, self.subtype)
    return bool(header_ok)
def sniff_prefix(self, file_prefix: FilePrefix):
    """Accept files whose prefix starts with the literal ``INFERNAL``.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname('infernal_model.cm')
    >>> InfernalCM().sniff(fname)
    True
    >>> fname = get_test_fname('2.txt')
    >>> InfernalCM().sniff(fname)
    False
    """
    has_magic = file_prefix.startswith("INFERNAL")
    return has_magic
def sniff_prefix(self, file_prefix: FilePrefix):
    """Match the header against ``self.column_names`` and type-check row two.

    The first line, split on tabs with each cell stripped, must equal
    ``self.column_names``. On the second line, columns 2-5 must parse as
    integers and columns 6-13 as floats. Missing columns are not treated as
    an error.
    """
    fh = file_prefix.string_io()
    header_cells = [cell.strip() for cell in fh.readline().split("\t")]
    if header_cells != self.column_names:
        return False

    fields = fh.readline().split("\t")
    try:
        for cell in fields[1:5]:
            int(cell)
        for cell in fields[5:13]:
            float(cell)
    except ValueError:
        return False
    return True
def sniff_prefix(self, file_prefix: FilePrefix):
    """Neper tesr format starts with ``***tesr``.

    At most the first 10 characters of the first line are read, with
    decoding errors ignored.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname('test.neper.tesr')
    >>> NeperTesr().sniff(fname)
    True
    >>> fname = get_test_fname('test.neper.tess')
    >>> NeperTesr().sniff(fname)
    False
    """
    stream = file_prefix.text_io(errors='ignore')
    leading_chars = stream.readline(10)
    return leading_chars.startswith('***tesr')
def sniff_prefix(self, file_prefix: FilePrefix):
    """Gmsh msh format starts with ``$MeshFormat``.

    Only the first line is read, with decoding errors ignored.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname('test.gmsh.msh')
    >>> GmshMsh().sniff(fname)
    True
    >>> fname = get_test_fname('test.neper.tesr')
    >>> GmshMsh().sniff(fname)
    False
    """
    stream = file_prefix.text_io(errors='ignore')
    first_line = stream.readline()
    return first_line.startswith('$MeshFormat')
def sniff_prefix(self, file_prefix: FilePrefix):
    """Determine whether the file is XML or not.

    The prefix must start with the literal ``<?xml `` (trailing space
    included).

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname('megablast_xml_parser_test1.blastxml')
    >>> GenericXml().sniff(fname)
    True
    >>> fname = get_test_fname('interval.interval')
    >>> GenericXml().sniff(fname)
    False
    """
    has_declaration = file_prefix.startswith('<?xml ')
    return has_declaration
def sniff_prefix(self, file_prefix: FilePrefix):
    """Accept files whose first line starts with ``# PEFF <major>.<minor>``.

    The version separator must be a literal dot; the previous pattern used
    an unescaped ``.`` and so accepted any character in that position.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname('test.peff')
    >>> PEFF().sniff(fname)
    True
    >>> fname = get_test_fname('sequence.fasta')
    >>> PEFF().sniff(fname)
    False
    """
    first_line = file_prefix.string_io().readline()
    return re.match(r"# PEFF \d+\.\d+", first_line) is not None
def sniff_prefix(self, file_prefix: FilePrefix):
    """Look for a ``<collection`` or ``<image`` tag on any line of the prefix.

    The comparison is case-insensitive. Every line the prefix yields is
    inspected; no explicit line limit is applied here.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname('1.dzi')
    >>> Dzi().sniff(fname)
    True
    >>> fname = get_test_fname('megablast_xml_parser_test1.blastxml')
    >>> Dzi().sniff(fname)
    False
    """
    lowered_lines = (raw.lower() for raw in file_prefix.line_iterator())
    return any(
        '<collection' in text or '<image' in text
        for text in lowered_lines
    )
def sniff_prefix(self, file_prefix: FilePrefix):
    """Try to guess the Obo filetype.

    It usually starts with a "format-version:" string and has several
    stanzas which start with "id:". The first line must start with
    ``format-version:``. The file is then accepted at the first stanza
    header (a line of the form ``[...]``) that is immediately followed by a
    line starting with ``id:``.
    """
    stanza = re.compile(r'^\[.*\]$')
    handle = file_prefix.string_io()
    first_line = handle.readline()
    if not first_line.startswith('format-version:'):
        return False
    for line in handle:
        if stanza.match(line.strip()):
            # A stanza needs to begin with an ID tag. The '' default stops a
            # stanza header on the last line of the prefix from raising
            # StopIteration out of the sniffer.
            if next(handle, '').startswith('id:'):
                return True
    return False
def sniff_prefix(self, file_prefix: FilePrefix):
    """Check for ESTScan score matrices.

    The use of ESTScan implies the creation of scores matrices which
    reflect the codons preferences in the studied organisms. The ESTScan
    package includes scripts for generating these files. The output of
    these scripts consists of the matrices, one for each isochor, and
    which look like this::

        FORMAT: hse_4is.conf CODING REGION 6 3 1 s C+G: 0 44
        -1 0 2 -2
        2 1 -8 0

    The first line must start with ``FORMAT``. Every other line that does
    not start with ``FORMAT`` must be tab-free and hold exactly four
    signed integers.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname('test_space.txt')
    >>> Smat().sniff(fname)
    False
    >>> fname = get_test_fname('test_tab.bed')
    >>> Smat().sniff(fname)
    False
    >>> fname = get_test_fname('1.smat')
    >>> Smat().sniff(fname)
    True
    """
    signed_integer = re.compile(r"[-+]?\d+$")
    line_no = 0
    for line_no, line in enumerate(file_prefix.string_io(), start=1):
        if line_no > 10000:
            return True
        if line.startswith('FORMAT'):
            continue
        if line_no == 1:
            # The first line is always the start of a format section.
            return False
        if '\t' in line:
            # Smat files are not tabular.
            return False
        items = line.split()
        if len(items) != 4:
            return False
        # Make sure each item is an integer.
        if not all(signed_integer.match(item) for item in items):
            return False
    # Ensure at least a few matching lines are found.
    return line_no > 2
def sniff_prefix(self, file_prefix: FilePrefix):
    """Validate the header, then every row as five whitespace-separated fields.

    After ``self.header_check`` passes, each row must split into exactly
    five fields. The second must parse as a float and the fourth as an
    int. The first must either parse as an int or begin with ``x``, ``y``
    or ``m`` (case-insensitive). Checking stops with True once the row
    index exceeds ``self.max_lines``.

    >>> classname = MarkerMap
    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> extn_true = classname().file_ext
    >>> file_true = get_test_fname("linkstudies." + extn_true)
    >>> classname().sniff(file_true)
    True
    >>> false_files = list(LinkageStudies.test_files)
    >>> false_files.remove("linkstudies." + extn_true)
    >>> result_true = []
    >>> for fname in false_files:
    ...     file_false = get_test_fname(fname)
    ...     res = classname().sniff(file_false)
    ...     if res:
    ...         result_true.append(fname)
    >>>
    >>> result_true
    []
    """
    fio = file_prefix.string_io()
    if not self.header_check(fio):
        return False

    for index, row_text in enumerate(fio):
        if index > self.max_lines:
            return True
        try:
            # Unpacking fails with ValueError unless there are five fields.
            chrom, gpos_field, _name, bpos_field, _row = row_text.split()
            float(gpos_field)
            int(bpos_field)
        except ValueError:
            return False
        try:
            int(chrom)
        except ValueError:
            # Non-numeric first field: accept only x/y/m-prefixed labels.
            if chrom.lower()[0] not in ('x', 'y', 'm'):
                return False
    return True
def sniff_prefix(self, file_prefix: FilePrefix):
    """Validate the header, then the first three fields of every row.

    After ``self.header_check`` passes, each row needs an integer first
    field and a float second field. The third field must be either the
    literal ``-inf`` or parseable as a float. Checking stops with True once
    the row index exceeds ``self.max_lines``.

    >>> classname = AllegroLOD
    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> extn_true = classname().file_ext
    >>> file_true = get_test_fname("linkstudies." + extn_true)
    >>> classname().sniff(file_true)
    True
    >>> false_files = list(LinkageStudies.test_files)
    >>> false_files.remove("linkstudies." + extn_true)
    >>> result_true = []
    >>> for fname in false_files:
    ...     file_false = get_test_fname(fname)
    ...     res = classname().sniff(file_false)
    ...     if res:
    ...         result_true.append(fname)
    >>>
    >>> result_true
    []
    """
    fio = file_prefix.string_io()
    if not self.header_check(fio):
        return False

    for index, row_text in enumerate(fio):
        if index > self.max_lines:
            return True
        fields = row_text.split()
        try:
            int(fields[0])
            float(fields[1])
            third = fields[2]
            if third != "-inf":
                float(third)
        except (ValueError, IndexError):
            # Wrong type, or fewer than three fields on the row.
            return False
    return True
def sniff_prefix(self, file_prefix: FilePrefix):
    """Accept three-column delimited text with a recognisable header.

    The delimiter is taken from the first line via ``_parse_delimiter``.
    Every line must split into exactly three cells. The first line is
    passed to ``_parse_header``, and is tried as a data line when no header
    is recognised. All later lines must satisfy ``_parse_dataline``. The
    file is accepted only if both a delimiter and a header were found.
    """
    delimiter = None
    header = None
    for index, raw in enumerate(file_prefix.line_iterator()):
        text = raw.strip()
        if delimiter is None:
            delimiter = self._parse_delimiter(text)
            if delimiter is None:
                return False
        cells = text.split(delimiter)
        if len(cells) != 3:
            return False
        if index == 0:
            header = self._parse_header(cells)
            if header is None and not self._parse_dataline(cells):
                return False
        elif not self._parse_dataline(cells):
            return False
    return delimiter is not None and header is not None
def sniff_prefix(self, file_prefix: FilePrefix):
    """Determine whether the file is the correct type.

    Whitespace-only lines are ignored. Every other line is split on tabs
    and its first column must be ``MTD`` or one of ``self._sections``.
    Among the ``MTD`` lines, an ``mzTab-version`` entry whose value matches
    ``self._version_re`` is required, as is every key in ``self._man_mtd``.
    Where ``self._man_mtd`` gives a collection of allowed values for a key,
    the lowercased value must be in it.

    ``MTD`` lines with fewer than three columns (for example a line cut
    short by prefix truncation) count as not satisfying any requirement
    that needs the missing column, instead of raising IndexError.
    """
    has_version = False
    found_man_mtd = set()
    contents = file_prefix.string_io()
    for line in contents:
        if re.match(r"^\s*$", line):
            continue
        columns = line.strip("\r\n").split("\t")
        if columns[0] == "MTD":
            key = columns[1] if len(columns) > 1 else None
            value = columns[2] if len(columns) > 2 else None
            if (
                key == "mzTab-version"
                and value is not None
                and re.match(self._version_re, value) is not None
            ):
                has_version = True
            elif key in self._man_mtd:
                mandatory_field = self._man_mtd[key]
                # None means any value is acceptable for this key.
                if mandatory_field is None or (
                    value is not None and value.lower() in mandatory_field
                ):
                    found_man_mtd.add(key)
        elif columns[0] not in self._sections:
            return False
    return has_version and found_man_mtd == set(self._man_mtd.keys())
def sniff_prefix(self, file_prefix: FilePrefix):
    """Determine whether the file is a velveth produced RoadMap.

    Expected layout (the three numbers are tab-separated)::

        142858  21      1
        ROADMAP 1
        ROADMAP 2
        ...

    Only the first two lines are examined. The first must be three
    tab-separated integers and the second ``ROADMAP <n>``. A blank first
    line rejects the file.
    """
    fh = file_prefix.string_io()
    for raw in fh:
        counts_line = raw.strip()
        if not counts_line or not re.match(r'\d+\t\d+\t\d+$', counts_line):
            return False
        # The line after the counts should be 'ROADMAP 1'.
        roadmap_line = fh.readline().strip()
        return re.match(r'ROADMAP \d+$', roadmap_line) is not None
    return False