def sniff(self, filename):
    """
    Determine whether the file looks like a mothur oligos file.

    http://www.mothur.org/wiki/Oligos_File

    Lines whose first tab-separated field starts with ``@`` or ``#`` are
    ignored.  Every other line must be either a two-field ``forward`` /
    ``reverse`` entry or a three-field ``barcode`` entry; any other line
    rejects the file.  At least one such entry must be present.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.oligos' )
    >>> Oligos().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.oligos' )
    >>> Oligos().sniff( fname )
    False
    """
    entries = 0
    for fields in iter_headers(filename, sep='\t'):
        tag = fields[0]
        if tag.startswith('@') or tag.startswith('#'):
            continue
        is_primer = len(fields) == 2 and tag in ('forward', 'reverse')
        is_barcode = len(fields) == 3 and tag == 'barcode'
        if not (is_primer or is_barcode):
            return False
        entries += 1
    return entries > 0
def sniff(self, filename):
    """
    Determine whether the file looks like an otu (operational taxonomic
    unit) file.

    Lines whose first tab-separated field starts with ``@`` are ignored.
    Every other line needs at least two fields.  From the second such line
    onwards, the second field must be an integer equal to the number of
    fields that follow it.  More than two qualifying lines are required.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.otu' )
    >>> Otu().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.otu' )
    >>> Otu().sniff( fname )
    False
    """
    seen = 0
    for fields in iter_headers(filename, sep='\t'):
        if fields[0].startswith('@'):
            continue
        if len(fields) < 2:
            return False
        # The first qualifying line is exempt from the count check.
        if seen >= 1:
            try:
                declared = int(fields[1])
            except ValueError:
                return False
            if declared + 2 != len(fields):
                return False
        seen += 1
    return seen > 2
def sniff_prefix(self, file_prefix):
    """
    Determine whether the file looks like a sabund file.

    Expected layout: label<TAB>count[<TAB>value(1..n)]

    Lines whose first field starts with ``@`` are ignored.  On every other
    line the second field must be an integer equal to the number of fields
    that follow it, and each of those fields must itself parse as an
    integer.  At least one such line is required.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.sabund' )
    >>> Sabund().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.sabund' )
    >>> Sabund().sniff( fname )
    False
    """
    rows = 0
    for fields in iter_headers(file_prefix, sep='\t'):
        if fields[0].startswith('@'):
            continue
        if len(fields) < 2:
            return False
        try:
            if int(fields[1]) + 2 != len(fields):
                return False
            for value in fields[2:]:
                int(value)
        except ValueError:
            return False
        rows += 1
    return rows > 0
def sniff_prefix(self, file_prefix):
    """
    Try to guess if the file is a PDBQT file.

    Up to 300 space-separated header lines are scanned; the file is
    accepted only if each of the keywords REMARK, ROOT, ENDROOT, BRANCH and
    TORSDOF appears as the first field of some line.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname('NuBBE_1_obabel_3D.pdbqt')
    >>> PDBQT().sniff(fname)
    True
    >>> fname = get_test_fname('drugbank_drugs.cml')
    >>> PDBQT().sniff(fname)
    False
    """
    required = {'REMARK', 'ROOT', 'ENDROOT', 'BRANCH', 'TORSDOF'}
    seen = set()
    for fields in iter_headers(file_prefix, sep=' ', count=300):
        keyword = fields[0].strip()
        if keyword in required:
            seen.add(keyword)
    return seen == required
def sniff(self, filename):
    """
    Try to guess if the file is a PDBQT file.

    Up to 300 space-separated header lines are scanned; the file is
    accepted only if each of the keywords REMARK, ROOT, ENDROOT, BRANCH and
    TORSDOF appears as the first field of some line.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname('NuBBE_1_obabel_3D.pdbqt')
    >>> PDBQT().sniff(fname)
    True
    >>> fname = get_test_fname('drugbank_drugs.cml')
    >>> PDBQT().sniff(fname)
    False
    """
    required = {'REMARK', 'ROOT', 'ENDROOT', 'BRANCH', 'TORSDOF'}
    seen = set()
    for fields in iter_headers(filename, sep=' ', count=300):
        keyword = fields[0].strip()
        if keyword in required:
            seen.add(keyword)
    return seen == required
def set_meta(self, dataset, overwrite=True, **kwd):
    """
    Set metadata for Otu files.

    After delegating to the parent ``set_meta``, and only when the dataset
    has data, this stores on ``dataset.metadata``:

    - ``data_lines``: number of lines with at least two tab-separated
      fields whose first field does not start with ``@``
    - ``columns``: the largest field count seen on such a line
    - ``labels``: sorted distinct first-field values of such lines
    - ``otulabels``: sorted fields of the first line from the third
      field onwards (empty when the first line has two or fewer fields)

    :param dataset: dataset whose ``file_name`` is read and whose
        ``metadata`` is updated
    :param overwrite: forwarded to the parent ``set_meta``
    """
    super(Otu, self).set_meta(dataset, overwrite=overwrite, **kwd)

    if dataset.has_data():
        label_names = set()
        otulabel_names = set()
        ncols = 0
        data_lines = 0

        headers = iter_headers(dataset.file_name, sep='\t', count=-1)
        first_line = get_headers(dataset.file_name, sep='\t', count=1)
        # NOTE(review): assumes get_headers returns a list of rows (each a
        # list of fields); unwrap the single requested row so the length
        # test and slice below operate on its fields, not on the row list.
        if first_line:
            first_line = first_line[0]
        # set otulabels
        if len(first_line) > 2:
            otulabel_names = first_line[2:]
        # set label names and number of lines
        for line in headers:
            if len(line) >= 2 and not line[0].startswith('@'):
                data_lines += 1
                ncols = max(ncols, len(line))
                label_names.add(line[0])

        # Set the discovered metadata values for the dataset
        dataset.metadata.data_lines = data_lines
        dataset.metadata.columns = ncols
        dataset.metadata.labels = list(label_names)
        dataset.metadata.labels.sort()
        dataset.metadata.otulabels = list(otulabel_names)
        dataset.metadata.otulabels.sort()
def set_meta(self, dataset, overwrite=True, skip=1, **kwd):
    """
    Set metadata for GroupAbund files.

    After delegating to the parent ``set_meta``, and only when the dataset
    has data, every tab-separated line is read.  A line whose first two
    fields are ``label`` and ``Group`` is treated as the column heading
    line; every other line with at least two fields is a data line.
    Lines with fewer than two fields are ignored, because they carry no
    group column.

    Stored on ``dataset.metadata``: ``data_lines``, ``columns`` (largest
    field count on a data line), ``labels`` and ``groups`` (sorted distinct
    first / second fields of data lines) and ``skip``.

    :param dataset: dataset whose ``file_name`` is read and whose
        ``metadata`` is updated
    :param overwrite: forwarded to the parent ``set_meta``
    :param skip: initial value stored as ``metadata.skip`` when no line
        with two or more fields is read
    """
    super(GroupAbund, self).set_meta(dataset, overwrite=overwrite, **kwd)

    # See if file starts with header line
    if dataset.has_data():
        label_names = set()
        group_names = set()
        data_lines = 0
        ncols = 0

        for line in iter_headers(dataset.file_name, sep='\t', count=-1):
            if len(line) < 2:
                # Indexing line[1] below would raise IndexError.
                continue
            if line[0] == 'label' and line[1] == 'Group':
                skip = 1
            else:
                # NOTE(review): skip is reassigned on every line, so the
                # stored value reflects the last line read rather than the
                # presence of a heading line - confirm this is intended.
                skip = 0
                data_lines += 1
                ncols = max(ncols, len(line))
                label_names.add(line[0])
                group_names.add(line[1])

        # Set the discovered metadata values for the dataset
        dataset.metadata.data_lines = data_lines
        dataset.metadata.columns = ncols
        dataset.metadata.labels = list(label_names)
        dataset.metadata.labels.sort()
        dataset.metadata.groups = list(group_names)
        dataset.metadata.groups.sort()
        dataset.metadata.skip = skip
def sniff(self, filename):
    """
    Determine whether the file is in secondary structure map format.

    The file has a single column holding an integer that names the row
    this row maps to.  The mapping must be symmetric: if structMap[10] =
    380 then structMap[380] = 10 and vice versa.  At least three lines are
    required.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.map' )
    >>> SecondaryStructureMap().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.map' )
    >>> SecondaryStructureMap().sniff( fname )
    False
    """
    # Maps a later row number to the earlier row that pointed at it.
    partner_of = {}
    row = 0
    for row, fields in enumerate(iter_headers(filename, sep='\t'), start=1):
        if len(fields) > 1:
            return False
        try:
            target = int(fields[0])
        except ValueError:
            return False
        if target > row:
            # Forward reference: remember it until that row is reached.
            partner_of[target] = row
        elif target > 0 or row in partner_of:
            # A missing entry yields None, which never equals an int, so an
            # unmatched back reference is rejected.
            if partner_of.get(row) != target:
                return False
    return row >= 3
def sniff(self, filename):
    """
    Determine whether the file is a pairwise (column-formatted) distance
    matrix.

    The first and second columns hold sequence names and the third column
    holds the distance between those sequences.

    Lines whose first field starts with ``@`` are ignored.  Every other
    line must have exactly three tab-separated fields with a third field
    that parses as a float.  More than two such lines are required, and at
    least one distance must not be a plain integer.

    :returns: True if the content matches, otherwise False

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.pair.dist' )
    >>> PairwiseDistanceMatrix().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.pair.dist' )
    >>> PairwiseDistanceMatrix().sniff( fname )
    False
    """
    count = 0
    # Must start out True: it is only ever cleared below, and reading it
    # unset raised UnboundLocalError when every distance was an integer.
    all_ints = True
    for line in iter_headers(filename, sep='\t'):
        if line[0].startswith('@'):
            continue
        if len(line) != 3:
            return False
        try:
            float(line[2])
        except ValueError:
            return False
        try:
            # See if it's also an integer
            int(line[2])
        except ValueError:
            # At least one value is not an integer
            all_ints = False
        count += 1

    if count > 2:
        return not all_ints
    return False
def sniff_prefix(self, file_prefix):
    """
    Determine whether the file is in secondary structure map format.

    The file has a single column holding an integer that names the row
    this row maps to.  The mapping must be symmetric: if structMap[10] =
    380 then structMap[380] = 10 and vice versa.  At least three lines are
    required.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.map' )
    >>> SecondaryStructureMap().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.map' )
    >>> SecondaryStructureMap().sniff( fname )
    False
    """
    # Maps a later row number to the earlier row that pointed at it.
    partner_of = {}
    row = 0
    for row, fields in enumerate(iter_headers(file_prefix, sep='\t'), start=1):
        if len(fields) > 1:
            return False
        try:
            target = int(fields[0])
        except ValueError:
            return False
        if target > row:
            # Forward reference: remember it until that row is reached.
            partner_of[target] = row
        elif target > 0 or row in partner_of:
            # A missing entry yields None, which never equals an int, so an
            # unmatched back reference is rejected.
            if partner_of.get(row) != target:
                return False
    return row >= 3
def sniff(self, filename):
    """
    Determine whether the file looks like a sabund file.

    Expected layout: label<TAB>count[<TAB>value(1..n)]

    Lines whose first field starts with ``@`` are ignored.  On every other
    line the second field must be an integer equal to the number of fields
    that follow it, and each of those fields must itself parse as an
    integer.  At least one such line is required.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.sabund' )
    >>> Sabund().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.sabund' )
    >>> Sabund().sniff( fname )
    False
    """
    rows = 0
    for fields in iter_headers(filename, sep='\t'):
        if fields[0].startswith('@'):
            continue
        if len(fields) < 2:
            return False
        try:
            if int(fields[1]) + 2 != len(fields):
                return False
            for value in fields[2:]:
                int(value)
        except ValueError:
            return False
        rows += 1
    return rows > 0
def sniff_prefix(self, file_prefix):
    """
    Determine whether the file looks like a mothur oligos file.

    http://www.mothur.org/wiki/Oligos_File

    Lines whose first tab-separated field starts with ``@`` or ``#`` are
    ignored.  Every other line must be either a two-field ``forward`` /
    ``reverse`` entry or a three-field ``barcode`` entry; any other line
    rejects the file.  At least one such entry must be present.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.oligos' )
    >>> Oligos().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.oligos' )
    >>> Oligos().sniff( fname )
    False
    """
    entries = 0
    for fields in iter_headers(file_prefix, sep='\t'):
        tag = fields[0]
        if tag.startswith('@') or tag.startswith('#'):
            continue
        is_primer = len(fields) == 2 and tag in ('forward', 'reverse')
        is_barcode = len(fields) == 3 and tag == 'barcode'
        if not (is_primer or is_barcode):
            return False
        entries += 1
    return entries > 0
def sniff_prefix(self, file_prefix):
    """
    Guess whether the file is GFA 2.0.

    Lines starting with ``#`` are skipped.  An ``H`` line decides the
    result immediately: it must carry ``VN:Z:2.0`` as its second field.
    Every other line must start with a known record type and have at least
    the minimum number of tab-separated fields for that type.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname('sample.gfa2')
    >>> Gfa2().sniff(fname)
    True
    >>> Gfa1().sniff(fname)
    False
    """
    # Minimum field count per record type.
    min_fields = {'S': 3, 'F': 8, 'E': 9, 'G': 6, 'O': 3, 'U': 3}
    found_valid_lines = False
    for fields in iter_headers(file_prefix, "\t"):
        record_type = fields[0]
        if record_type.startswith('#'):
            continue
        if record_type == 'H':
            return len(fields) >= 2 and fields[1] == 'VN:Z:2.0'
        required = min_fields.get(record_type)
        if required is None or len(fields) < required:
            return False
        found_valid_lines = True
    return found_valid_lines
def sniff_prefix(self, file_prefix):
    """
    Try to guess if the file is a PDB file.

    Up to 300 space-separated header lines are scanned; the file is
    accepted only if each of the keywords HEADER, TITLE, COMPND, SOURCE,
    KEYWDS and EXPDTA appears as the first field of some line.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname('5e5z.pdb')
    >>> PDB().sniff(fname)
    True
    >>> fname = get_test_fname('drugbank_drugs.cml')
    >>> PDB().sniff(fname)
    False
    """
    required = {'HEADER', 'TITLE', 'COMPND', 'SOURCE', 'KEYWDS', 'EXPDTA'}
    seen = set()
    for fields in iter_headers(file_prefix, sep=' ', count=300):
        keyword = fields[0].strip()
        if keyword in required:
            seen.add(keyword)
    return seen == required
def sniff_prefix(self, file_prefix):
    """
    Try to guess if the file is a PQR file.

    Up to 3000 whitespace-separated lines are scanned (lines starting with
    the comment designator passed to ``iter_headers`` are left out).  The
    file is accepted when a ``REMARK`` line has been seen and an ``ATOM``
    or ``HETATM`` line, re-joined with single spaces, matches the pattern
    returned by ``self.get_matcher()``.  Scanning stops at the first
    matching atom line, so a ``REMARK`` that only appears later does not
    count.

    :returns: True if the content matches, otherwise False

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname('5e5z.pqr')
    >>> PQR().sniff(fname)
    True
    >>> fname = get_test_fname('drugbank_drugs.cml')
    >>> PQR().sniff(fname)
    False
    """
    prog = self.get_matcher()
    headers = iter_headers(file_prefix, sep=None, comment_designator='REMARK 5', count=3000)
    has_remark = has_atom = False
    for line in headers:
        if not line:
            # NOTE(review): presumably a blank line split on whitespace
            # yields no fields; skip it instead of failing on line[0].
            continue
        section_name = line[0].strip()
        if section_name == 'REMARK':
            has_remark = True
        elif section_name in ('ATOM', 'HETATM'):
            if prog.match(' '.join(line)):
                has_atom = True
                break
    return has_remark and has_atom
def sniff_prefix(self, file_prefix):
    """
    Determine whether the file looks like an otu (operational taxonomic
    unit) file.

    Lines whose first tab-separated field starts with ``@`` are ignored.
    Every other line needs at least two fields.  From the second such line
    onwards, the second field must be an integer equal to the number of
    fields that follow it.  More than two qualifying lines are required.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.otu' )
    >>> Otu().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.otu' )
    >>> Otu().sniff( fname )
    False
    """
    seen = 0
    for fields in iter_headers(file_prefix, sep='\t'):
        if fields[0].startswith('@'):
            continue
        if len(fields) < 2:
            return False
        # The first qualifying line is exempt from the count check.
        if seen >= 1:
            try:
                declared = int(fields[1])
            except ValueError:
                return False
            if declared + 2 != len(fields):
                return False
        seen += 1
    return seen > 2
def sniff_prefix(self, file_prefix):
    """
    Determine whether the file is a pairwise (column-formatted) distance
    matrix.

    The first and second columns hold sequence names and the third column
    holds the distance between those sequences.

    Lines whose first field starts with ``@`` are ignored.  Every other
    line must have exactly three tab-separated fields with a third field
    that parses as a float.  More than two such lines are required, and at
    least one distance must not be a plain integer.

    :returns: True if the content matches, otherwise False

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.pair.dist' )
    >>> PairwiseDistanceMatrix().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.pair.dist' )
    >>> PairwiseDistanceMatrix().sniff( fname )
    False
    """
    count = 0
    # Must start out True: it is only ever cleared below, and reading it
    # unset raised UnboundLocalError when every distance was an integer.
    all_ints = True
    for line in iter_headers(file_prefix, sep='\t'):
        if line[0].startswith('@'):
            continue
        if len(line) != 3:
            return False
        try:
            float(line[2])
        except ValueError:
            return False
        try:
            # See if it's also an integer
            int(line[2])
        except ValueError:
            # At least one value is not an integer
            all_ints = False
        count += 1

    if count > 2:
        return not all_ints
    return False
def sniff(self, filename):
    """
    Try to guess if the file is a PDB file.

    Up to 300 space-separated header lines are scanned; the file is
    accepted only if each of the keywords HEADER, TITLE, COMPND, SOURCE,
    KEYWDS and EXPDTA appears as the first field of some line.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname('5e5z.pdb')
    >>> PDB().sniff(fname)
    True
    >>> fname = get_test_fname('drugbank_drugs.cml')
    >>> PDB().sniff(fname)
    False
    """
    required = {'HEADER', 'TITLE', 'COMPND', 'SOURCE', 'KEYWDS', 'EXPDTA'}
    seen = set()
    for fields in iter_headers(filename, sep=' ', count=300):
        keyword = fields[0].strip()
        if keyword in required:
            seen.add(keyword)
    return seen == required
def sniff_prefix(self, file_prefix):
    """
    Determine whether the file is in axes format.

    The first line may have column headings.  The following lines have the
    name in the first column plus float columns for each axis.

    .. code-block::

        group   axis1   axis2
        forest  0.000000        0.145743
        pasture 0.145743        0.000000

    .. code-block::

                axis1   axis2
        U68589  0.262608        -0.077498
        U68590  0.027118        0.195197
        U68591  0.329854        0.014395

    The first line is never inspected.  The second line only fixes the
    expected column count (at least two).  Each later line must have that
    many columns, and every column after the first must be a float whose
    absolute value does not exceed 1.0.  A file whose checked values are
    all integers is rejected.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.axes' )
    >>> Axes().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.axes' )
    >>> Axes().sniff( fname )
    False
    """
    expected_cols = None
    only_integers = True
    line_total = 0
    for index, fields in enumerate(iter_headers(file_prefix, sep='\t')):
        line_total = index + 1
        if index == 0:
            # Possible heading line.
            continue
        if expected_cols is None:
            expected_cols = len(fields)
            if expected_cols < 2:
                return False
            continue
        if len(fields) != expected_cols:
            return False
        for text in fields[1:expected_cols]:
            try:
                value = float(text)
            except ValueError:
                return False
            # Check abs value is <= 1.0
            if abs(value) > 1.0:
                return False
            # Also test for whether value is an integer
            try:
                int(text)
            except ValueError:
                only_integers = False
    if line_total > 0:
        return not only_integers
    return False
def set_meta(self, dataset, overwrite=True, skip=None, max_data_lines=None, **kwd):
    """
    Set metadata for Group files.

    Delegates to the parent ``set_meta`` and then stores in
    ``dataset.metadata.groups`` the distinct values found in the second
    tab-separated column of the file (lines with a single field are
    ignored).

    :param dataset: dataset whose ``file_name`` is read and whose
        ``metadata`` is updated
    :param overwrite: forwarded to the parent ``set_meta``
    :param skip: forwarded to the parent ``set_meta``
    :param max_data_lines: forwarded to the parent ``set_meta``
    """
    super(Group, self).set_meta(dataset, overwrite, skip, max_data_lines)

    rows = iter_headers(dataset.file_name, sep='\t', count=-1)
    dataset.metadata.groups = list({row[1] for row in rows if len(row) > 1})
def sniff_prefix(self, file_prefix):
    """
    Determine whether the file is in axes format.

    The first line may have column headings.  The following lines have the
    name in the first column plus float columns for each axis.

    ==> 98_sq_phylip_amazon.fn.unique.pca.axes <==
    group   axis1   axis2
    forest  0.000000        0.145743
    pasture 0.145743        0.000000

    ==> 98_sq_phylip_amazon.nmds.axes <==
            axis1   axis2
    U68589  0.262608        -0.077498
    U68590  0.027118        0.195197
    U68591  0.329854        0.014395

    The first line is never inspected.  The second line only fixes the
    expected column count (at least two).  Each later line must have that
    many columns, and every column after the first must be a float whose
    absolute value does not exceed 1.0.  A file whose checked values are
    all integers is rejected.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.axes' )
    >>> Axes().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.axes' )
    >>> Axes().sniff( fname )
    False
    """
    expected_cols = None
    only_integers = True
    line_total = 0
    for index, fields in enumerate(iter_headers(file_prefix, sep='\t')):
        line_total = index + 1
        if index == 0:
            # Possible heading line.
            continue
        if expected_cols is None:
            expected_cols = len(fields)
            if expected_cols < 2:
                return False
            continue
        if len(fields) != expected_cols:
            return False
        for text in fields[1:expected_cols]:
            try:
                value = float(text)
            except ValueError:
                return False
            # Check abs value is <= 1.0
            if abs(value) > 1.0:
                return False
            # Also test for whether value is an integer
            try:
                int(text)
            except ValueError:
                only_integers = False
    if line_total > 0:
        return not only_integers
    return False
def set_meta(self, dataset, overwrite=True, skip=0, **kwd):
    """
    Set metadata for distance matrix files.

    Delegates to the parent ``set_meta`` and then looks for the sequence
    count: the first line (ignoring lines whose first field starts with
    ``@``) whose tab-separated fields, joined together, parse as an
    integer.  That value is stored in ``dataset.metadata.sequence_count``.
    Lines that fail are logged as warnings, except when ``self`` is a
    ``PairwiseDistanceMatrix``, and the search continues with the next
    line.

    :param dataset: dataset whose ``file_name`` is read and whose
        ``metadata`` is updated
    :param overwrite: forwarded to the parent ``set_meta``
    :param skip: forwarded to the parent ``set_meta``
    """
    super(DistanceMatrix, self).set_meta(dataset, overwrite=overwrite, skip=skip, **kwd)

    for fields in iter_headers(dataset.file_name, sep='\t'):
        if fields[0].startswith('@'):
            continue
        try:
            # seq count sometimes preceded by tab
            dataset.metadata.sequence_count = int(''.join(fields))
        except Exception as e:
            if not isinstance(self, PairwiseDistanceMatrix):
                log.warning("DistanceMatrix set_meta %s" % e)
        else:
            break
def sniff_prefix(self, file_prefix):
    """
    Checks for and does cursory validation on data that looks like AGP.

    Lines starting with ``#`` are skipped.  Every other non-empty line must
    have exactly nine tab-separated fields with a known component type in
    the fifth field.  Gap lines (types ``U`` and ``N``) are checked for a
    known gap type, a yes/no linkage flag and known linkage evidence; a
    ``U`` gap must have length 100.  Other lines are checked for a known
    orientation.  The fields expected to be numeric must be positive
    integers.  At least one such line must be present.

    Validation uses explicit checks rather than ``assert`` so that it
    still runs when Python is started with ``-O``.

    :returns: True if the content matches, otherwise False (including
        when any exception is raised while reading or parsing)

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname('eg1.agp')
    >>> GoldenPath().sniff(fname)
    True
    >>> fname = get_test_fname('eg2.agp')
    >>> GoldenPath().sniff(fname)
    True
    >>> fname = get_test_fname('1.bed')
    >>> GoldenPath().sniff(fname)
    False
    >>> fname = get_test_fname('2.tabular')
    >>> GoldenPath().sniff(fname)
    False
    """
    component_types = ('A', 'D', 'F', 'G', 'O', 'P', 'W', 'N', 'U')
    gap_types = (
        'scaffold', 'contig', 'centromere', 'short_arm',
        'heterochromatin', 'telomere', 'repeat',
    )
    linkage_evidence = (
        'na', 'paired-ends', 'align_genus', 'align_xgenus',
        'align_trnscript', 'within_clone', 'clone_contig', 'map',
        'strobe', 'unspecified',
    )
    orientations = ('+', '-', '?', '0', 'na')

    found_non_comment_lines = False
    try:
        for line in iter_headers(file_prefix, '\t', comment_designator='#'):
            if not line:
                continue
            if len(line) != 9:
                return False
            if line[4] not in component_types:
                return False
            ostensible_numbers = line[1:3]
            if line[4] in ('U', 'N'):
                ostensible_numbers.append(line[5])
                if line[6] not in gap_types:
                    return False
                if line[7] not in ('yes', 'no'):
                    return False
                if line[8] not in linkage_evidence:
                    return False
            else:
                ostensible_numbers.extend([line[6], line[7]])
                if line[8] not in orientations:
                    return False
            if line[4] == 'U' and int(line[5]) != 100:
                return False
            if not all(str(x).isnumeric() and int(x) > 0 for x in ostensible_numbers):
                return False
            found_non_comment_lines = True
    except Exception:
        # Any read or conversion failure means this is not AGP.
        return False
    return found_non_comment_lines
def sniff_prefix(self, file_prefix):
    """
    Determine whether the file is a lower-triangle distance matrix
    (phylip) format.

    The first line has the number of sequences in the matrix.  The
    remaining lines have the sequence name followed by a list of distances
    from all preceding sequences.

        5  # possibly but not always preceded by a tab :/
        U68589
        U68590  0.3371
        U68591  0.3609  0.3782
        U68592  0.4155  0.3197  0.4148
        U68593  0.2872  0.1690  0.3361  0.2842

    Lines whose first field starts with ``@`` are ignored.  A file with no
    remaining lines, or with a sequence count that is not a positive
    integer, is rejected.

    :returns: True if the content matches, otherwise False

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.lower.dist' )
    >>> LowerTriangleDistanceMatrix().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.lower.dist' )
    >>> LowerTriangleDistanceMatrix().sniff( fname )
    False
    """
    numlines = 300
    headers = iter_headers(file_prefix, sep='\t', count=numlines)
    line_num = 0
    # Stays None until the count line is read, so an input without data
    # lines is rejected instead of raising UnboundLocalError.
    sequence_count = None
    for line in headers:
        if line[0].startswith('@'):
            continue
        if line_num == 0:
            # first line should contain the number of sequences in the file
            if len(line) > 2:
                return False
            try:
                sequence_count = int(''.join(line))
            except ValueError:
                return False
            if sequence_count <= 0:
                return False
        else:
            # number of fields should equal the line number
            if len(line) != line_num:
                return False
            try:
                # Distances should be floats
                # NOTE(review): the slice starts at index 2, so the field
                # at index 1 is never checked - confirm this is intended.
                for column in line[2:]:
                    float(column)
            except ValueError:
                return False
        line_num += 1

    if sequence_count is None:
        return False
    # check if the number of lines in the file was as expected
    if line_num == sequence_count + 1 or line_num == numlines + 1:
        return True
    return False
def sniff(self, filename):
    """
    Determine whether the file is a Reference Taxonomy.

    http://www.mothur.org/wiki/Taxonomy_outline

    A table with 2 or 3 columns:
    - SequenceName
    - Taxonomy (semicolon-separated taxonomy in descending order)
    - integer ?

    Example: 2-column (http://www.mothur.org/wiki/Taxonomy_outline)
     X56533.1        Eukaryota;Alveolata;Ciliophora;Intramacronucleata;Oligohymenophorea;Hymenostomatida;Tetrahymenina;Glaucomidae;Glaucoma;
     X97975.1        Eukaryota;Parabasalidea;Trichomonada;Trichomonadida;unclassified_Trichomonadida;
     AF052717.1      Eukaryota;Parabasalidea;

    Example: 3-column (http://vamps.mbl.edu/resources/databases.php)
     v3_AA008        Bacteria;Firmicutes;Bacilli;Lactobacillales;Streptococcaceae;Streptococcus      5
     v3_AA016        Bacteria        120
     v3_AA019        Archaea;Crenarchaeota;Marine_Group_I    1

    Up to 300 lines are read; lines whose first field starts with ``@`` or
    ``#`` are ignored.  At least one entry must contain a semicolon in the
    taxonomy column.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.ref.taxonomy' )
    >>> RefTaxonomy().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.ref.taxonomy' )
    >>> RefTaxonomy().sniff( fname )
    False
    """
    taxonomy_re = re.compile(
        '^([^ \t\n\r\x0c\x0b;]+([(]\\d+[)])?(;[^ \t\n\r\x0c\x0b;]+([(]\\d+[)])?)*(;)?)$'
    )
    rows = 0
    has_semicolon = False
    for fields in iter_headers(filename, sep='\t', count=300):
        if fields[0].startswith('@') or fields[0].startswith('#'):
            continue
        if len(fields) not in (2, 3):
            return False
        taxonomy = fields[1]
        if not taxonomy_re.match(taxonomy):
            return False
        if ';' in taxonomy:
            has_semicolon = True
        if len(fields) == 3:
            try:
                int(fields[2])
            except Exception:
                return False
        rows += 1

    if rows > 0:
        # Require that at least one entry has semicolons in the 2nd column
        return has_semicolon
    return False
def sniff_prefix(self, file_prefix):
    """
    Determine whether the file is a frequency tabular format for chimera
    analysis.

    .. code-block::

        #1.14.0
        0   0.000
        1   0.000
        ...
        155 0.975

    Lines whose first field starts with ``@`` are ignored.  The first
    remaining line must be a single field starting with ``#``; every later
    line must be an integer followed by a float written with a decimal
    point.  At least one such data line is required.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.freq' )
    >>> Frequency().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.freq' )
    >>> Frequency().sniff( fname )
    False
    >>> # Expression count matrix (EdgeR wrapper)
    >>> fname = get_test_fname( 'mothur_datatypetest_false_2.mothur.freq' )
    >>> Frequency().sniff( fname )
    False
    """
    rows = 0
    for fields in iter_headers(file_prefix, sep='\t'):
        if fields[0].startswith('@'):
            continue
        if rows == 0:
            # first line should be #<version string>
            if len(fields) != 1 or not fields[0].startswith('#'):
                return False
        else:
            # all other lines should be <int> <float>
            if len(fields) != 2:
                return False
            try:
                int(fields[0])
                float(fields[1])
            except Exception:
                return False
            if '.' not in fields[1]:
                return False
        rows += 1
    return rows > 1
def sniff_prefix(self, file_prefix):
    """
    Determine whether the file is a square distance matrix format.

    The first line has the number of sequences in the matrix.  The
    following lines have the sequence name in the first column plus a
    column for the distance to each sequence in the row order in which
    they appear in the matrix.

        3
        U68589  0.0000  0.3371  0.3610
        U68590  0.3371  0.0000  0.3783
        U68590  0.3371  0.0000  0.3783

    Lines whose first field starts with ``@`` are ignored.  A file with no
    remaining lines, or with a sequence count that is not a positive
    integer, is rejected.

    :returns: True if the content matches, otherwise False

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.square.dist' )
    >>> SquareDistanceMatrix().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.square.dist' )
    >>> SquareDistanceMatrix().sniff( fname )
    False
    """
    numlines = 300
    headers = iter_headers(file_prefix, sep='\t', count=numlines)
    line_num = 0
    # Stays None until the count line is read, so an input without data
    # lines is rejected instead of raising UnboundLocalError.
    sequence_count = None
    for line in headers:
        if line[0].startswith('@'):
            continue
        if line_num == 0:
            # first line should contain the number of sequences
            if len(line) > 2:
                return False
            try:
                sequence_count = int(''.join(line))
            except ValueError:
                return False
            if sequence_count <= 0:
                return False
        else:
            # number of fields should equal the number of sequences
            # (plus the leading name column)
            if len(line) != sequence_count + 1:
                return False
            try:
                # Distances should be floats
                # NOTE(review): the slice starts at index 2, so the field
                # at index 1 is never checked - confirm this is intended.
                for column in line[2:]:
                    float(column)
            except ValueError:
                return False
        line_num += 1

    if sequence_count is None:
        return False
    # check if the number of lines in the file was as expected
    if line_num == sequence_count + 1 or line_num == numlines + 1:
        return True
    return False
def set_meta(self, dataset, overwrite=True, **kwd):
    """
    Set metadata for Otu files.

    Stores the number of data lines, the widest column count, the sorted
    distinct labels (first column of data lines) and the sorted OTU labels
    (first line, third field onwards) on ``dataset.metadata``.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> from galaxy.util.bunch import Bunch
    >>> dataset = Bunch()
    >>> dataset.metadata = Bunch
    >>> otu = Otu()
    >>> dataset.file_name = get_test_fname( 'mothur_datatypetest_true.mothur.otu' )
    >>> dataset.has_data = lambda: True
    >>> otu.set_meta(dataset)
    >>> dataset.metadata.columns
    100
    >>> len(dataset.metadata.labels) == 37
    True
    >>> len(dataset.metadata.otulabels) == 98
    True
    """
    super(Otu, self).set_meta(dataset, overwrite=overwrite, **kwd)

    if not dataset.has_data():
        return

    labels = set()
    otu_labels = set()
    widest = 0
    n_data = 0

    rows = iter_headers(dataset.file_name, sep='\t', count=-1)
    heading = get_headers(dataset.file_name, sep='\t', count=1)
    if heading:
        heading = heading[0]
    # OTU labels are the heading fields after the first two.
    if len(heading) > 2:
        otu_labels = heading[2:]

    # Collect labels and count data lines.
    for row in rows:
        if len(row) < 2 or row[0].startswith('@'):
            continue
        n_data += 1
        widest = max(widest, len(row))
        labels.add(row[0])

    # Set the discovered metadata values for the dataset
    dataset.metadata.data_lines = n_data
    dataset.metadata.columns = widest
    dataset.metadata.labels = list(labels)
    dataset.metadata.labels.sort()
    dataset.metadata.otulabels = list(otu_labels)
    dataset.metadata.otulabels.sort()
def sniff_prefix(self, file_prefix):
    """
    Determine whether the file is a Reference Taxonomy.

    http://www.mothur.org/wiki/Taxonomy_outline

    A table with 2 or 3 columns:
    - SequenceName
    - Taxonomy (semicolon-separated taxonomy in descending order)
    - integer ?

    Example: 2-column (http://www.mothur.org/wiki/Taxonomy_outline)
     X56533.1        Eukaryota;Alveolata;Ciliophora;Intramacronucleata;Oligohymenophorea;Hymenostomatida;Tetrahymenina;Glaucomidae;Glaucoma;
     X97975.1        Eukaryota;Parabasalidea;Trichomonada;Trichomonadida;unclassified_Trichomonadida;
     AF052717.1      Eukaryota;Parabasalidea;

    Example: 3-column (http://vamps.mbl.edu/resources/databases.php)
     v3_AA008        Bacteria;Firmicutes;Bacilli;Lactobacillales;Streptococcaceae;Streptococcus      5
     v3_AA016        Bacteria        120
     v3_AA019        Archaea;Crenarchaeota;Marine_Group_I    1

    Up to 300 lines are read; lines whose first field starts with ``@`` or
    ``#`` are ignored.  At least one entry must contain a semicolon in the
    taxonomy column.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.ref.taxonomy' )
    >>> RefTaxonomy().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.ref.taxonomy' )
    >>> RefTaxonomy().sniff( fname )
    False
    """
    taxonomy_re = re.compile('^([^ \t\n\r\x0c\x0b;]+([(]\\d+[)])?(;[^ \t\n\r\x0c\x0b;]+([(]\\d+[)])?)*(;)?)$')
    rows = 0
    has_semicolon = False
    for fields in iter_headers(file_prefix, sep='\t', count=300):
        if fields[0].startswith('@') or fields[0].startswith('#'):
            continue
        if len(fields) not in (2, 3):
            return False
        taxonomy = fields[1]
        if not taxonomy_re.match(taxonomy):
            return False
        if ';' in taxonomy:
            has_semicolon = True
        if len(fields) == 3:
            try:
                int(fields[2])
            except Exception:
                return False
        rows += 1

    if rows > 0:
        # Require that at least one entry has semicolons in the 2nd column
        return has_semicolon
    return False
def sniff(self, filename):
    """
    Determine whether the file is a frequency tabular format for chimera
    analysis.

        #1.14.0
        0   0.000
        1   0.000
        ...
        155 0.975

    Lines whose first field starts with ``@`` are ignored.  The first
    remaining line must be a single field starting with ``#``; every later
    line must be an integer followed by a float written with a decimal
    point.  At least one such data line is required.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.freq' )
    >>> Frequency().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.freq' )
    >>> Frequency().sniff( fname )
    False

    # Expression count matrix (EdgeR wrapper)

    >>> fname = get_test_fname( 'mothur_datatypetest_false_2.mothur.freq' )
    >>> Frequency().sniff( fname )
    False
    """
    rows = 0
    for fields in iter_headers(filename, sep='\t'):
        if fields[0].startswith('@'):
            continue
        if rows == 0:
            # first line should be #<version string>
            if len(fields) != 1 or not fields[0].startswith('#'):
                return False
        else:
            # all other lines should be <int> <float>
            if len(fields) != 2:
                return False
            try:
                int(fields[0])
                float(fields[1])
            except Exception:
                return False
            if '.' not in fields[1]:
                return False
        rows += 1
    return rows > 1
def sniff_prefix(self, file_prefix):
    """
    Determine whether the file is a pairwise (column-formatted) distance
    matrix.

    The first and second columns hold sequence names and the third column
    holds the distance between those sequences.

    Lines whose first field starts with ``@`` are ignored.  Every other
    line must have exactly three tab-separated fields with a third field
    that parses as a float.  Each of the first two columns must hold at
    least one value that does not parse as a float, since those columns
    are meant to contain names.  More than two data lines are required,
    and at least one distance must not be a plain integer.

    :returns: True if the content matches, otherwise False

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.pair.dist' )
    >>> PairwiseDistanceMatrix().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.pair.dist' )
    >>> PairwiseDistanceMatrix().sniff( fname )
    False
    """
    headers = iter_headers(file_prefix, sep='\t')
    count = 0
    # Must start out True: it is only ever cleared below, and reading it
    # unset raised UnboundLocalError when every distance was an integer.
    all_ints = True
    names = [False, False]
    for line in headers:
        if line[0].startswith('@'):
            continue
        if len(line) != 3:
            return False
        # check if col3 contains distances (floats)
        try:
            float(line[2])
        except ValueError:
            return False
        try:
            # See if it's also an integer
            int(line[2])
        except ValueError:
            # At least one value is not an integer
            all_ints = False
        count += 1
        # check if col1 and col2 likely contain names
        for c in (0, 1):
            try:
                float(line[c])
            except ValueError:
                names[c] = True

    if not names[0] or not names[1]:
        return False
    if count > 2:
        return not all_ints
    return False
def sniff(self, filename):
    """
    Determine whether the file is in html format.

    The file is accepted as soon as the first whitespace-separated field
    of any header line contains ``<html>`` (case-insensitive).

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'complete.bed' )
    >>> Html().sniff( fname )
    False
    >>> fname = get_test_fname( 'file.html' )
    >>> Html().sniff( fname )
    True
    """
    return any(
        '<html>' in fields[0].lower()
        for fields in iter_headers(filename, None)
        if fields
    )
def sniff_prefix(self, file_prefix):
    """
    Determine whether the file is in html format.

    The file is accepted as soon as the first whitespace-separated field
    of any header line contains ``<html>`` (case-insensitive).

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'complete.bed' )
    >>> Html().sniff( fname )
    False
    >>> fname = get_test_fname( 'file.html' )
    >>> Html().sniff( fname )
    True
    """
    return any(
        '<html>' in fields[0].lower()
        for fields in iter_headers(file_prefix, None)
        if fields
    )
def sniff(self, filename):
    """
    Try to guess if the file is a InChI file.

    Up to ten space-separated lines are inspected and each one must start
    with ``InChI=``.  Input that yields no lines at all is rejected rather
    than being accepted vacuously.

    :returns: True if at least one line was read and all lines read
        start with ``InChI=``, otherwise False

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname('drugbank_drugs.inchi')
    >>> InChI().sniff(fname)
    True
    >>> fname = get_test_fname('drugbank_drugs.cml')
    >>> InChI().sniff(fname)
    False
    """
    found_lines = False
    for inchi in iter_headers(filename, sep=' ', count=10):
        if not inchi[0].startswith('InChI='):
            return False
        found_lines = True
    return found_lines
def sniff_prefix(self, file_prefix):
    """
    Determine whether the file is a quantiles tabular format for chimera
    analysis.

    .. code-block::

        1   0   0   0   0   0   0
        2   0.309198    0.309198    0.37161 0.37161 0.37161 0.37161
        3   0.510982    0.563213    0.693529    0.858939    1.07442 1.20608
        ...

    Lines whose first field starts with ``@`` or ``#`` are ignored.  Every
    other line must have exactly seven tab-separated fields: an integer
    followed by six floats.  At least one such line is required.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.quan' )
    >>> Quantile().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.quan' )
    >>> Quantile().sniff( fname )
    False
    """
    rows = 0
    for fields in iter_headers(file_prefix, sep='\t'):
        if fields[0].startswith('@') or fields[0].startswith('#'):
            continue
        if len(fields) != 7:
            return False
        try:
            int(fields[0])
            for quantile in fields[1:]:
                float(quantile)
        except Exception:
            return False
        rows += 1
    return rows > 0
def sniff(self, filename):
    """
    Determine whether the file is a frequency tabular format for chimera
    analysis.

        #1.14.0
        0   0.000
        1   0.000
        ...
        155 0.975

    Lines whose first field starts with ``@`` are ignored.  The first
    remaining line must be a single field starting with ``#``; every later
    line must be an integer followed by a float written with a decimal
    point.  At least one such data line is required.

    :returns: True if the content matches, otherwise False

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.freq' )
    >>> Frequency().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.freq' )
    >>> Frequency().sniff( fname )
    False
    """
    headers = iter_headers(filename, sep='\t')
    count = 0
    for line in headers:
        if line[0].startswith('@'):
            continue
        if count == 0:
            # first line should be #<version string>: reject when it does
            # not start with '#' OR is not a single field (the previous
            # "and len(line) == 1" test let multi-column first lines pass)
            if not line[0].startswith('#') or len(line) != 1:
                return False
        else:
            # all other lines should be <int> <float>
            if len(line) != 2:
                return False
            try:
                int(line[0])
                float(line[1])
            except Exception:
                return False
            # An integer-only second column is not a frequency value.
            if '.' not in line[1]:
                return False
        count += 1
    return count > 1
def sniff_prefix(self, file_prefix):
    """
    Decide whether the data looks like GFA 1.0.

    Lines are split on tabs and dispatched on their first field:

    * lines starting with ``#`` are skipped;
    * an ``H`` line settles the answer immediately: True only if it is
      exactly ``H<TAB>VN:Z:1.0``;
    * ``S`` needs at least 3 fields, ``P`` at least 4;
    * ``L`` needs at least 6 fields with ``+``/``-`` in the 3rd and 5th;
    * ``C`` needs at least 7 fields with ``+``/``-`` in the 3rd and 5th and
      an integer in the 6th;
    * any other record type rejects the file.

    Returns True when at least one valid record line was seen and none was
    invalid.  A non-integer 6th field on a ``C`` line returns False rather
    than letting ``ValueError`` escape the sniffer.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname('big.gfa1')
    >>> Gfa1().sniff(fname)
    True
    >>> Gfa2().sniff(fname)
    False
    """
    orientations = ('+', '-')
    found_valid_lines = False
    for line in iter_headers(file_prefix, "\t"):
        record_type = line[0]
        if record_type.startswith('#'):
            continue
        if record_type == 'H':
            return len(line) == 2 and line[1] == 'VN:Z:1.0'
        elif record_type == 'S':
            if len(line) < 3:
                return False
        elif record_type == 'L':
            if len(line) < 6:
                return False
            if any(line[i] not in orientations for i in (2, 4)):
                return False
        elif record_type == 'C':
            if len(line) < 7:
                return False
            if any(line[i] not in orientations for i in (2, 4)):
                return False
            try:
                int(line[5])
            except ValueError:
                # A malformed position means "not GFA1", not a crash.
                return False
        elif record_type == 'P':
            if len(line) < 4:
                return False
        else:
            return False
        found_valid_lines = True
    return found_valid_lines
def sniff_prefix(self, file_prefix):
    """
    Decide whether the data looks like PAF.

    Every tab-separated line must satisfy all of:

    * at least 12 columns;
    * columns 2-4 and 7-12 parse as integers;
    * column 5 is ``+`` or ``-``;
    * column 12 lies in the range 0..255;
    * every column after the 12th splits into exactly three parts on ``:``
      (SAM-like typed key-value pairs).

    Returns True when at least one line was seen and all lines were valid.
    Non-integer values in the numeric columns return False rather than
    letting ``ValueError`` escape the sniffer.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname('A-3105.paf')
    >>> Paf().sniff(fname)
    True
    """
    found_valid_lines = False
    for line in iter_headers(file_prefix, "\t"):
        if len(line) < 12:
            return False
        try:
            for i in (1, 2, 3, 6, 7, 8, 9, 10):
                int(line[i])
            # Parsed once and reused for the range check below.
            last_numeric = int(line[11])
        except ValueError:
            # A malformed numeric column means "not PAF", not a crash.
            return False
        if line[4] not in ('+', '-'):
            return False
        if not (0 <= last_numeric <= 255):
            return False
        # Check that the optional columns after the 12th contain SAM-like typed key-value pairs
        for optional in line[12:]:
            if len(optional.split(':')) != 3:
                return False
        found_valid_lines = True
    return found_valid_lines
def sniff(self, filename, vals_are_int=False):
    """
    Decide whether the file is in the otu (operational taxonomic unit)
    Shared format::

        label<TAB>group<TAB>count[<TAB>value(1..n)]

    The first line is column headings as of Mothur v 1.2, so a first row
    whose first field is ``label`` is accepted without checking its values.
    For every other row the third field must be an integer equal to the
    number of value columns that follow, and each value must parse as an
    int (when ``vals_are_int`` is true) or as a float (otherwise).
    Rows prefixed with ``@`` are ignored.  At least two rows must be
    accepted for the file to match.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.shared' )
    >>> GroupAbund().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.shared' )
    >>> GroupAbund().sniff( fname )
    False
    """
    parse_value = int if vals_are_int else float
    rows_seen = 0
    for fields in iter_headers(filename, sep='\t'):
        if fields[0].startswith('@'):
            continue
        n_fields = len(fields)
        if n_fields < 3:
            return False
        is_heading = rows_seen == 0 and fields[0] == 'label'
        if not is_heading:
            try:
                # Third column declares how many value columns follow.
                if int(fields[2]) + 3 != n_fields:
                    return False
                for value in fields[3:]:
                    parse_value(value)
            except ValueError:
                return False
        rows_seen += 1
    return rows_seen > 1
def sniff_prefix(self, file_prefix, vals_are_int=False):
    """
    Decide whether the data is in the otu (operational taxonomic unit)
    Shared format::

        label<TAB>group<TAB>count[<TAB>value(1..n)]

    The first line is column headings as of Mothur v 1.2; such a leading
    ``label`` row is counted but its values are not validated.  Any other
    row must carry an integer in its third field that equals the number of
    trailing value columns, and every value must be parseable as an int
    (``vals_are_int`` true) or a float (default).  Rows starting with ``@``
    are skipped.  Two or more accepted rows are needed for a match.

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.shared' )
    >>> GroupAbund().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.shared' )
    >>> GroupAbund().sniff( fname )
    False
    """
    accepted = 0
    for row in iter_headers(file_prefix, sep='\t'):
        if row[0].startswith('@'):
            continue
        width = len(row)
        if width < 3:
            return False
        if accepted == 0 and row[0] == 'label':
            # Column-heading row: accept as-is.
            accepted += 1
            continue
        try:
            declared = int(row[2])
            if declared + 3 != width:
                return False
            for cell in row[3:]:
                if vals_are_int:
                    int(cell)
                else:
                    float(cell)
        except ValueError:
            return False
        accepted += 1
    return accepted > 1
def sniff_prefix(self, file_prefix):
    """
    Decide whether the data is a quantiles table used for chimera analysis.

    Rows prefixed with ``@`` or ``#`` are skipped.  Every remaining row must
    have exactly seven tab-separated fields, the first an integer and the
    other six floats::

        1       0       0       0       0       0       0
        2       0.309198        0.309198        0.37161 0.37161 0.37161 0.37161
        3       0.510982        0.563213        0.693529        0.858939        1.07442 1.20608
        ...

    >>> from galaxy.datatypes.sniff import get_test_fname
    >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.quan' )
    >>> Quantile().sniff( fname )
    True
    >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.quan' )
    >>> Quantile().sniff( fname )
    False
    """
    saw_data_row = False
    for row in iter_headers(file_prefix, sep='\t'):
        if row[0].startswith('@') or row[0].startswith('#'):
            continue
        if len(row) != 7:
            return False
        try:
            int(row[0])
            for column in range(1, 7):
                float(row[column])
        except Exception:
            return False
        saw_data_row = True
    # A match needs at least one validated data row.
    return saw_data_row