Exemplo n.º 1
0
    def sniff(self, filename):
        """
        Guess whether the file is a PDB file.

        The rows returned by ``get_headers(filename, sep=' ', count=300)`` are
        scanned and the stripped first field of each row is treated as a
        record name. The file is accepted only when all of HEADER, TITLE,
        COMPND, SOURCE, KEYWDS and EXPDTA have been seen.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('5e5z.pdb')
        >>> PDB().sniff(fname)
        True

        >>> fname = get_test_fname('drugbank_drugs.cml')
        >>> PDB().sniff(fname)
        False
        """
        required_records = frozenset(['HEADER', 'TITLE', 'COMPND', 'SOURCE', 'KEYWDS', 'EXPDTA'])
        records_seen = set()
        for fields in get_headers(filename, sep=' ', count=300):
            record_name = fields[0].strip()
            if record_name in required_records:
                records_seen.add(record_name)
        # Every mandatory record must have appeared at least once.
        return records_seen == required_records
Exemplo n.º 2
0
    def sniff(self, filename):
        """
        Determines whether the file is in generic fastq format
        For details, see http://maq.sourceforge.net/fastq.shtml

        Note: There are three kinds of FASTQ files, known as "Sanger" (sometimes called "Standard"), Solexa, and Illumina
              These differ in the representation of the quality scores

        Only the first four whitespace-split lines are inspected: the first
        must start with "@", the third with "+", and the first token of the
        second must consist solely of the characters G, C, A, T and N.
        Returns False when the block does not have that shape or when any
        error occurs while inspecting it.

        >>> fname = get_test_fname( '1.fastqsanger' )
        >>> Fastq().sniff( fname )
        True
        >>> fname = get_test_fname( '2.fastqsanger' )
        >>> Fastq().sniff( fname )
        True
        """
        headers = get_headers(filename, None)
        # Anchored at both ends: without the trailing "$" the pattern matched
        # the empty prefix of every string, so the base check could never fail.
        bases_regexp = re.compile("^[NGTAC]*$")
        # check that first block looks like a fastq block
        try:
            looks_like_block = (
                len(headers) >= 4 and
                headers[0][0] and headers[0][0][0] == "@" and
                headers[2][0] and headers[2][0][0] == "+" and
                headers[1][0]
            )
            if not looks_like_block:
                return False
            # Check the sequence line, make sure it contains only G/C/A/T/N
            return bases_regexp.match(headers[1][0]) is not None
        except Exception:
            # Sniffing is best-effort: any parsing problem (e.g. a blank line
            # yielding an empty field list) means "not fastq". Unlike a bare
            # except, this lets KeyboardInterrupt/SystemExit propagate.
            return False
Exemplo n.º 3
0
    def sniff(self, filename):
        """
        Guess whether the file is a PDBQT file.

        The rows returned by ``get_headers(filename, sep=' ', count=300)`` are
        scanned and the stripped first field of each row is treated as a
        record name. The file is accepted only when all of REMARK, ROOT,
        ENDROOT, BRANCH and TORSDOF have been seen.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('NuBBE_1_obabel_3D.pdbqt')
        >>> PDBQT().sniff(fname)
        True

        >>> fname = get_test_fname('drugbank_drugs.cml')
        >>> PDBQT().sniff(fname)
        False
        """
        required_records = frozenset(['REMARK', 'ROOT', 'ENDROOT', 'BRANCH', 'TORSDOF'])
        records_seen = set()
        for fields in get_headers(filename, sep=' ', count=300):
            record_name = fields[0].strip()
            if record_name in required_records:
                records_seen.add(record_name)
        # Every mandatory record must have appeared at least once.
        return records_seen == required_records
Exemplo n.º 4
0
    def sniff(self, filename):
        """
        Determines whether the file is in maf format

        The .maf format is line-oriented. Each multiple alignment ends with a blank line.
        Each sequence in an alignment is on a single line, which can get quite long, but
        there is no length limit. Words in a line are delimited by any white space.
        Lines starting with # are considered to be comments. Lines starting with ## can
        be ignored by most programs, but contain meta-data of one form or another.

        The first line of a .maf file begins with ##maf. This word is followed by white-space-separated
        variable=value pairs. There should be no white space surrounding the "=".

        For complete details see http://genome.ucsc.edu/FAQ/FAQformat#format5

        Returns True only when more than one line was read and the first
        whitespace-delimited word of the first line is exactly "##maf".

        >>> fname = get_test_fname( 'sequence.maf' )
        >>> Maf().sniff( fname )
        True
        >>> fname = get_test_fname( 'sequence.fasta' )
        >>> Maf().sniff( fname )
        False
        """
        headers = get_headers(filename, None)
        try:
            return len(headers) > 1 and headers[0][0] == "##maf"
        except Exception:
            # Best-effort sniffing: e.g. a blank first line gives an empty
            # field list. Unlike a bare except, KeyboardInterrupt/SystemExit
            # are allowed to propagate.
            return False
Exemplo n.º 5
0
    def sniff(self, filename):
        """
        Determines whether the file is a secondary structure map format
        A single column with an integer value which indicates the row that this
        row maps to. Check to make sure if structMap[10] = 380 then
        structMap[380] = 10 and vice versa.

        Rows are numbered from 1. A row pointing forward records the pairing;
        a row pointing backward (or a row that was itself the target of an
        earlier forward pointer) must match the recorded pairing. At least
        three rows are required.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.map' )
        >>> SecondaryStructureMap().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.map' )
        >>> SecondaryStructureMap().sniff( fname )
        False
        """
        # Maps a target row number to the earlier row that pointed at it.
        forward_refs = {}
        rows_seen = 0
        for rows_seen, fields in enumerate(get_headers(filename, sep='\t'), 1):
            if len(fields) > 1:
                return False
            try:
                target = int(fields[0])
            except ValueError:
                return False
            if target > rows_seen:
                forward_refs[target] = rows_seen
            elif target > 0 or rows_seen in forward_refs:
                # This row must be the partner of an earlier forward pointer.
                if rows_seen not in forward_refs:
                    return False
                if forward_refs[rows_seen] != target:
                    return False
        return rows_seen >= 3
Exemplo n.º 6
0
    def sniff(self, filename):
        """
        Checks for 'pileup-ness'

        There are two main types of pileup: 6-column and 10-column. For both,
        the first three and last two columns are the same. We only check the
        first three to allow for some personalization of the format.

        Every non-empty row whose first field does not start with '#' must
        have at least three tab-separated fields, an integer in the second
        field and a single reference base (A/C/G/T/N, either case) in the
        third. Returns False on the first row that violates this.

        >>> fname = get_test_fname( 'interval.interval' )
        >>> Pileup().sniff( fname )
        False
        >>> fname = get_test_fname( '6col.pileup' )
        >>> Pileup().sniff( fname )
        True
        >>> fname = get_test_fname( '10col.pileup' )
        >>> Pileup().sniff( fname )
        True
        """
        valid_bases = ('A', 'C', 'G', 'T', 'N', 'a', 'c', 'g', 't', 'n')
        headers = get_headers(filename, '\t')
        try:
            for hdr in headers:
                if not hdr or hdr[0].startswith('#'):
                    continue
                if len(hdr) < 3:
                    return False
                # chrom start in column 1 (with 0-based columns)
                try:
                    int(hdr[1])
                except ValueError:
                    return False
                # Reference base is in column 2. This is an explicit test
                # rather than an assert, which would be stripped under -O
                # and silently disable the check.
                if hdr[2] not in valid_bases:
                    return False
            return True
        except Exception:
            # Best-effort sniffing; unlike a bare except this lets
            # KeyboardInterrupt/SystemExit propagate.
            return False
Exemplo n.º 7
0
    def sniff(self, filename):
        """
        Determines whether the file is a pairwise distance matrix (Column-formatted distance matrix) format
        The first and second columns have the sequence names and the third column is the distance between those sequences.

        Rows whose first field starts with '@' are ignored. Every other row
        must have exactly three tab-separated fields with a float in the
        third. The file is accepted when more than two such rows were read
        and at least one distance is not an integer.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.pair.dist' )
        >>> PairwiseDistanceMatrix().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.pair.dist' )
        >>> PairwiseDistanceMatrix().sniff( fname )
        False
        """
        headers = get_headers(filename, sep='\t')
        count = 0
        # Must be initialised before the loop: it was previously assigned only
        # when a non-integer value was met, so a file with nothing but integer
        # distances raised UnboundLocalError at the final return.
        all_ints = True
        for line in headers:
            if line[0].startswith('@'):
                continue
            if len(line) != 3:
                return False
            try:
                float(line[2])
            except ValueError:
                return False
            try:
                # See if it's also an integer
                int(line[2])
            except ValueError:
                # At least one value is not an integer
                all_ints = False
            count += 1

        if count > 2:
            return not all_ints

        return False
Exemplo n.º 8
0
    def set_meta(self, dataset, overwrite=True, **kwd):
        """
        Set metadata for Otu files.

        After delegating to the parent implementation, and only when the
        dataset has data, the tab-separated rows are scanned to populate
        ``data_lines``, ``columns``, ``labels`` and ``otulabels`` on
        ``dataset.metadata``. OTU labels are the fields after the second
        column of the first row; labels are the first field of every row
        that has at least two fields and does not start with '@'.
        """
        super(Otu, self).set_meta(dataset, overwrite=overwrite, **kwd)

        if not dataset.has_data():
            return

        # count=-1 presumably asks get_headers for every line - TODO confirm.
        headers = get_headers(dataset.file_name, sep='\t', count=-1)

        # set otulabels; guard against no rows being returned so that
        # headers[0] cannot raise IndexError
        otulabel_names = set()
        if headers and len(headers[0]) > 2:
            otulabel_names = headers[0][2:]

        # set label names and number of lines
        label_names = set()
        ncols = 0
        data_lines = 0
        for line in headers:
            if len(line) >= 2 and not line[0].startswith('@'):
                data_lines += 1
                ncols = max(ncols, len(line))
                label_names.add(line[0])

        # Set the discovered metadata values for the dataset
        dataset.metadata.data_lines = data_lines
        dataset.metadata.columns = ncols
        dataset.metadata.labels = sorted(label_names)
        dataset.metadata.otulabels = sorted(otulabel_names)
Exemplo n.º 9
0
    def sniff(self, filename):
        """
        http://www.mothur.org/wiki/Oligos_File
        Determines whether the file is in oligos format

        Rows whose first field starts with '@' or '#' are ignored. Every
        other row must be either a two-field 'forward'/'reverse' row or a
        three-field 'barcode' row; at least one such row is required.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.oligos' )
        >>> Oligos().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.oligos' )
        >>> Oligos().sniff( fname )
        False
        """
        matched = 0
        for fields in get_headers(filename, sep='\t'):
            keyword = fields[0]
            if keyword.startswith('@') or keyword.startswith('#'):
                continue
            width = len(fields)
            is_primer = width == 2 and keyword in ['forward', 'reverse']
            is_barcode = width == 3 and keyword == 'barcode'
            if not (is_primer or is_barcode):
                return False
            matched += 1
        return matched > 0
Exemplo n.º 10
0
    def sniff(self, filename):
        """
        Determines whether the file is otu (operational taxonomic unit) format

        Rows whose first field starts with '@' are ignored. Every other row
        needs at least two tab-separated fields; from the second such row on,
        the second field must be an integer equal to the number of fields
        minus two. More than two rows are required.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.otu' )
        >>> Otu().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.otu' )
        >>> Otu().sniff( fname )
        False
        """
        data_rows = 0
        for fields in get_headers(filename, sep='\t'):
            if fields[0].startswith('@'):
                continue
            if len(fields) < 2:
                return False
            # The first data row is exempt from the count check.
            if data_rows >= 1:
                try:
                    declared = int(fields[1])
                except ValueError:
                    return False
                if declared + 2 != len(fields):
                    return False
            data_rows += 1
        return data_rows > 2
Exemplo n.º 11
0
    def set_meta(self, dataset, overwrite=True, skip=1, **kwd):
        """
        Set metadata for GroupAbund files.

        After delegating to the parent implementation, and only when the
        dataset has data, the tab-separated rows are scanned to populate
        ``data_lines``, ``columns``, ``labels``, ``groups`` and ``skip`` on
        ``dataset.metadata``.

        ``skip`` is set to 1 when the file starts with a ``label<TAB>Group``
        header row and to 0 otherwise; when no rows are read the value passed
        by the caller is stored unchanged. Rows with fewer than two fields
        (e.g. blank lines) carry no label/group pair and are ignored.
        """
        super(GroupAbund, self).set_meta(dataset, overwrite=overwrite, **kwd)

        # See if file starts with header line
        if dataset.has_data():
            label_names = set()
            group_names = set()
            data_lines = 0
            ncols = 0

            # count=-1 presumably asks get_headers for every line - TODO confirm.
            headers = get_headers(dataset.file_name, sep='\t', count=-1)
            for line_idx, line in enumerate(headers):
                is_header = len(line) > 1 and line[0] == 'label' and line[1] == 'Group'
                if line_idx == 0:
                    # Decide once, from the first row only. Previously skip was
                    # reassigned on every row, so it reflected the last row and
                    # was 0 for any file with data after its header.
                    skip = 1 if is_header else 0
                if is_header or len(line) < 2:
                    # Header rows and rows without a group column are not data;
                    # the length guard also avoids IndexError on line[1].
                    continue
                data_lines += 1
                ncols = max(ncols, len(line))
                label_names.add(line[0])
                group_names.add(line[1])

            # Set the discovered metadata values for the dataset
            dataset.metadata.data_lines = data_lines
            dataset.metadata.columns = ncols
            dataset.metadata.labels = sorted(label_names)
            dataset.metadata.groups = sorted(group_names)
            dataset.metadata.skip = skip
Exemplo n.º 12
0
    def sniff(self, filename):
        """
        Determines whether the file is otu (operational taxonomic unit) format
        label<TAB>count[<TAB>value(1..n)]

        Rows whose first field starts with '@' are ignored. In every other
        row the second field must be an integer equal to the number of fields
        minus two, and all remaining fields must be integers. At least one
        such row is required.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.sabund' )
        >>> Sabund().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.sabund' )
        >>> Sabund().sniff( fname )
        False
        """
        data_rows = 0
        for fields in get_headers(filename, sep='\t'):
            if fields[0].startswith('@'):
                continue
            width = len(fields)
            if width < 2:
                return False
            try:
                if int(fields[1]) + 2 != width:
                    return False
                # Every abundance value after the count must parse as an int.
                for abundance in fields[2:]:
                    int(abundance)
            except ValueError:
                return False
            data_rows += 1
        return data_rows > 0
Exemplo n.º 13
0
    def sniff(self, filename):
        """
        Determines whether the file is in lav format

        LAV is an alignment format developed by Webb Miller's group. It is the primary output format for BLASTZ.
        The first line of a .lav file begins with #:lav.

        For complete details see http://www.bioperl.org/wiki/LAV_alignment_format

        Returns True only when more than one line was read and the first
        whitespace-delimited word of the first line starts with "#:lav".

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'alignment.lav' )
        >>> Lav().sniff( fname )
        True
        >>> fname = get_test_fname( 'alignment.axt' )
        >>> Lav().sniff( fname )
        False
        """
        headers = get_headers(filename, None)
        try:
            return len(headers) > 1 and headers[0][0].startswith('#:lav')
        except Exception:
            # Best-effort sniffing: e.g. a blank first line gives an empty
            # field list. Unlike a bare except, KeyboardInterrupt/SystemExit
            # are allowed to propagate.
            return False
Exemplo n.º 14
0
    def set_meta(self, dataset, overwrite=True, skip=None, max_data_lines=None, **kwd):
        """
        Set metadata for Group files.

        Delegates to the parent implementation, then stores the distinct
        values found in the second tab-separated column as
        ``dataset.metadata.groups``. Rows with a single field are ignored.
        """
        super(Group, self).set_meta(dataset, overwrite, skip, max_data_lines)

        rows = get_headers(dataset.file_name, sep='\t', count=-1)
        distinct_groups = set(fields[1] for fields in rows if len(fields) > 1)
        dataset.metadata.groups = list(distinct_groups)
Exemplo n.º 15
0
 def sniff(self, filename):
     """
     InChI files start with 'InChI='.

     Accepts the file when the first space-delimited field of every row
     returned by ``get_headers(filename, sep=' ', count=10)`` starts with
     'InChI='. With no rows at all the result is True.
     """
     leading_fields = (fields[0] for fields in get_headers(filename, sep=' ', count=10))
     return all(token.startswith('InChI=') for token in leading_fields)
Exemplo n.º 16
0
    def set_meta(self, dataset, overwrite=True, skip=1, max_data_lines=None, **kwd):
        """
        Set metadata for SffFlow files.

        Delegates to the parent implementation with ``skip`` fixed at 1, then
        stores the integer found in the first field of the first line as
        ``dataset.metadata.flow_values``. Any failure while doing so is
        logged as a warning instead of being raised.
        """
        super(SffFlow, self).set_meta(dataset, overwrite, 1, max_data_lines)

        first_rows = get_headers(dataset.file_name, sep='\t', count=1)
        try:
            dataset.metadata.flow_values = int(first_rows[0][0])
        except Exception as err:
            log.warning("SffFlow set_meta %s" % err)
Exemplo n.º 17
0
    def sniff(self, filename):
        """
        Determines whether the file is an axes format
        The first line may have column headings.
        The following lines have the name in the first column plus float columns for each axis.
        ==> 98_sq_phylip_amazon.fn.unique.pca.axes <==
           group   axis1   axis2
           forest  0.000000        0.145743
           pasture 0.145743        0.000000

        ==> 98_sq_phylip_amazon.nmds.axes <==
                   axis1   axis2
           U68589  0.262608        -0.077498
           U68590  0.027118        0.195197
           U68591  0.329854        0.014395

        The first row is never inspected. The second row only fixes the
        expected column count (at least two). Each later row must have that
        many columns, and every column after the first must be a float with
        absolute value no greater than 1.0. The file is rejected when all the
        checked values are integers.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.axes' )
        >>> Axes().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.axes' )
        >>> Axes().sniff( fname )
        False
        """
        expected_cols = None
        only_integers = True
        rows_seen = 0
        for rows_seen, fields in enumerate(get_headers(filename, sep='\t'), 1):
            if rows_seen == 1:
                # Possible heading row: skipped entirely.
                continue
            if expected_cols is None:
                # First row after the heading defines the width only.
                expected_cols = len(fields)
                if expected_cols < 2:
                    return False
                continue
            if len(fields) != expected_cols:
                return False
            for cell in fields[1:expected_cols]:
                try:
                    value = float(cell)
                except ValueError:
                    return False
                # Check abs value is <= 1.0
                if abs(value) > 1.0:
                    return False
                # Also test for whether value is an integer
                try:
                    int(cell)
                except ValueError:
                    only_integers = False

        return rows_seen > 0 and not only_integers
Exemplo n.º 18
0
 def sniff(self, filename):
     """
     Determine if the file is in pdf format.

     True when the first whitespace-delimited word of the first line starts
     with "%PDF"; False otherwise, including when there is no such word.
     """
     first_rows = get_headers(filename, None, 1)
     try:
         return first_rows[0][0].startswith("%PDF")
     except IndexError:
         return False
Exemplo n.º 19
0
    def set_meta(self, dataset, overwrite=True, skip=0, **kwd):
        """
        Set metadata for DistanceMatrix files.

        Delegates to the parent implementation, then walks the tab-separated
        rows that do not start with '@' and stores the first one that parses
        as an integer (after joining its fields) as
        ``dataset.metadata.sequence_count``. Rows that fail to parse are
        logged as warnings, except on PairwiseDistanceMatrix instances.
        """
        super(DistanceMatrix, self).set_meta(dataset, overwrite=overwrite, skip=skip, **kwd)

        for fields in get_headers(dataset.file_name, sep='\t'):
            if fields[0].startswith('@'):
                continue
            try:
                # seq count sometimes preceded by tab
                dataset.metadata.sequence_count = int(''.join(fields))
            except Exception as err:
                if not isinstance(self, PairwiseDistanceMatrix):
                    log.warning("DistanceMatrix set_meta %s" % err)
            else:
                # Stop at the first row that yielded a count.
                break
Exemplo n.º 20
0
 def sniff_prefix(self, file_prefix):
     """
     Try to guess if the file is a Gal file.

     The file is accepted when the first tab-separated field of the first
     line contains "ATF" and the first field of the third line contains
     "GenePix ArrayList". Inputs with fewer than three lines are rejected.

     >>> from galaxy.datatypes.sniff import get_test_fname
     >>> fname = get_test_fname('test.gal')
     >>> Gal().sniff(fname)
     True
     >>> fname = get_test_fname('test.gpr')
     >>> Gal().sniff(fname)
     False
     """
     headers = get_headers(file_prefix, sep="\t", count=3)
     # Guard the indexing below: a short (or empty) input used to raise
     # IndexError instead of simply not matching.
     if len(headers) < 3:
         return False
     return "ATF" in headers[0][0] and "GenePix ArrayList" in headers[2][0]
Exemplo n.º 21
0
    def set_meta(self, dataset, overwrite=True, skip=1, max_data_lines=None, **kwd):
        """
        Set metadata for CountTable files.

        Delegates to the parent implementation, then derives column metadata
        from the first tab-separated line: ``column_types`` is 'str' followed
        by 'int' for every other column, ``columns`` is set when there is
        more than one column and ``groups`` holds the names from the third
        column on. The first line is counted as a comment line, so
        ``data_lines`` is reduced by one when it holds an integer.
        """
        super(CountTable, self).set_meta(dataset, overwrite=overwrite, **kwd)

        headers = get_headers(dataset.file_name, sep='\t', count=1)
        colnames = headers[0]
        dataset.metadata.column_types = ['str'] + (['int'] * (len(colnames) - 1))
        if len(colnames) > 1:
            dataset.metadata.columns = len(colnames)
        if len(colnames) > 2:
            dataset.metadata.groups = colnames[2:]

        dataset.metadata.comment_lines = 1
        # data_lines may not hold a number (e.g. None), in which case the
        # in-place subtraction would raise TypeError.
        if isinstance(dataset.metadata.data_lines, int):
            dataset.metadata.data_lines -= 1
Exemplo n.º 22
0
    def sniff(self, filename):
        """
        Determines whether the file is a lower-triangle distance matrix (phylip) format
        The first line has the number of sequences in the matrix.
        The remaining lines have the sequence name followed by a list of distances from all preceding sequences
                5  # possibly but not always preceded by a tab :/
                U68589
                U68590	0.3371
                U68591	0.3609	0.3782
                U68592	0.4155	0.3197	0.4148
                U68593	0.2872	0.1690	0.3361	0.2842

        Rows whose first field starts with '@' are ignored. Returns False
        when no other row is found.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.lower.dist' )
        >>> LowerTriangleDistanceMatrix().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.lower.dist' )
        >>> LowerTriangleDistanceMatrix().sniff( fname )
        False
        """
        numlines = 300
        headers = get_headers(filename, sep='\t', count=numlines)
        line_num = 0
        # Stays None until the count row is parsed; previously the name was
        # unbound (UnboundLocalError) when no data row was read at all.
        sequence_count = None
        for line in headers:
            if line[0].startswith('@'):
                continue
            if line_num == 0:
                # first line should contain the number of sequences in the file
                if len(line) > 2:
                    return False
                try:
                    sequence_count = int(''.join(line))
                except ValueError:
                    return False
            else:
                # number of fields should equal the line number
                if len(line) != line_num:
                    return False
                try:
                    # Distances should be floats
                    # NOTE(review): line[1] looks like a distance too but is
                    # not validated by this slice - confirm whether line[1:]
                    # was intended.
                    for column in line[2:]:
                        float(column)
                except ValueError:
                    return False
            line_num += 1

        if sequence_count is None:
            return False

        # check if the number of lines in the file was as expected
        return line_num == sequence_count + 1 or line_num == numlines + 1
Exemplo n.º 23
0
    def sniff(self, filename):
        """
        Determines whether the file is a lower-triangle distance matrix (phylip) format
        The first line has the number of sequences in the matrix.
        The remaining lines have the sequence name followed by a list of distances from all preceding sequences
                5  # possibly but not always preceded by a tab :/
                U68589
                U68590	0.3371
                U68591	0.3609	0.3782
                U68592	0.4155	0.3197	0.4148
                U68593	0.2872	0.1690	0.3361	0.2842

        Rows whose first field starts with '@' are ignored. Returns False
        when no other row is found.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.lower.dist' )
        >>> LowerTriangleDistanceMatrix().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.lower.dist' )
        >>> LowerTriangleDistanceMatrix().sniff( fname )
        False
        """
        numlines = 300
        headers = get_headers(filename, sep='\t', count=numlines)
        line_num = 0
        # Stays None until the count row is parsed; previously the name was
        # unbound (UnboundLocalError) when no data row was read at all.
        sequence_count = None
        for line in headers:
            if line[0].startswith('@'):
                continue
            if line_num == 0:
                # first line should contain the number of sequences in the file
                if len(line) > 2:
                    return False
                try:
                    sequence_count = int(''.join(line))
                except ValueError:
                    return False
            else:
                # number of fields should equal the line number
                if len(line) != line_num:
                    return False
                try:
                    # Distances should be floats
                    # NOTE(review): line[1] looks like a distance too but is
                    # not validated by this slice - confirm whether line[1:]
                    # was intended.
                    for column in line[2:]:
                        float(column)
                except ValueError:
                    return False
            line_num += 1

        if sequence_count is None:
            return False

        # check if the number of lines in the file was as expected
        return line_num == sequence_count + 1 or line_num == numlines + 1
Exemplo n.º 24
0
    def set_meta(self, dataset, overwrite=True, **kwd):
        """
        Set metadata for Otu files.

        After delegating to the parent implementation, and only when the
        dataset has data, populates ``data_lines``, ``columns``, ``labels``
        and ``otulabels`` on ``dataset.metadata``. OTU labels come from the
        fields after the second column of the first line; labels are the
        first field of every row with at least two fields that does not
        start with '@'.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> from galaxy.util.bunch import Bunch
        >>> dataset = Bunch()
        >>> dataset.metadata = Bunch
        >>> otu = Otu()
        >>> dataset.file_name = get_test_fname( 'mothur_datatypetest_true.mothur.otu' )
        >>> dataset.has_data = lambda: True
        >>> otu.set_meta(dataset)
        >>> dataset.metadata.columns
        100
        >>> len(dataset.metadata.labels) == 37
        True
        >>> len(dataset.metadata.otulabels) == 98
        True
        """
        super(Otu, self).set_meta(dataset, overwrite=overwrite, **kwd)

        if not dataset.has_data():
            return

        rows = iter_headers(dataset.file_name, sep='\t', count=-1)

        # The OTU labels are taken from the first line only.
        header_row = get_headers(dataset.file_name, sep='\t', count=1)
        if header_row:
            header_row = header_row[0]
        otu_labels = header_row[2:] if len(header_row) > 2 else set()

        # Collect label names, row count and the widest row.
        labels = set()
        widest = 0
        n_data = 0
        for fields in rows:
            if len(fields) < 2 or fields[0].startswith('@'):
                continue
            n_data += 1
            widest = max(widest, len(fields))
            labels.add(fields[0])

        # Set the discovered metadata values for the dataset
        dataset.metadata.data_lines = n_data
        dataset.metadata.columns = widest
        dataset.metadata.labels = sorted(labels)
        dataset.metadata.otulabels = sorted(otu_labels)
Exemplo n.º 25
0
    def set_meta(self, dataset, overwrite=True, **kwd):
        """
        Set metadata for Otu files.

        After delegating to the parent implementation, and only when the
        dataset has data, populates ``data_lines``, ``columns``, ``labels``
        and ``otulabels`` on ``dataset.metadata``. OTU labels come from the
        fields after the second column of the first line; labels are the
        first field of every row with at least two fields that does not
        start with '@'.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> from galaxy.util.bunch import Bunch
        >>> dataset = Bunch()
        >>> dataset.metadata = Bunch
        >>> otu = Otu()
        >>> dataset.file_name = get_test_fname( 'mothur_datatypetest_true.mothur.otu' )
        >>> dataset.has_data = lambda: True
        >>> otu.set_meta(dataset)
        >>> dataset.metadata.columns
        100
        >>> len(dataset.metadata.labels) == 37
        True
        >>> len(dataset.metadata.otulabels) == 98
        True
        """
        super(Otu, self).set_meta(dataset, overwrite=overwrite, **kwd)

        if not dataset.has_data():
            return

        rows = iter_headers(dataset.file_name, sep='\t', count=-1)

        # The OTU labels are taken from the first line only.
        header_row = get_headers(dataset.file_name, sep='\t', count=1)
        if header_row:
            header_row = header_row[0]
        otu_labels = header_row[2:] if len(header_row) > 2 else set()

        # Collect label names, row count and the widest row.
        labels = set()
        widest = 0
        n_data = 0
        for fields in rows:
            if len(fields) < 2 or fields[0].startswith('@'):
                continue
            n_data += 1
            widest = max(widest, len(fields))
            labels.add(fields[0])

        # Set the discovered metadata values for the dataset
        dataset.metadata.data_lines = n_data
        dataset.metadata.columns = widest
        dataset.metadata.labels = sorted(labels)
        dataset.metadata.otulabels = sorted(otu_labels)
Exemplo n.º 26
0
    def set_meta(self, dataset, overwrite=True, skip=1, max_data_lines=None, **kwd):
        """
        Set metadata for CountTable files.

        Delegates to the parent implementation, then derives column metadata
        from the first tab-separated line: ``column_types`` is 'str' followed
        by 'int' for every other column, ``columns`` is set when there is
        more than one column and ``groups`` holds the names from the third
        column on. The first line is counted as a comment line, so
        ``data_lines`` is reduced by one when it holds an integer.
        """
        super(CountTable, self).set_meta(dataset, overwrite=overwrite, **kwd)

        colnames = get_headers(dataset.file_name, sep='\t', count=1)[0]
        n_cols = len(colnames)
        dataset.metadata.column_types = ['str'] + ['int'] * (n_cols - 1)
        if n_cols > 1:
            dataset.metadata.columns = n_cols
        if n_cols > 2:
            dataset.metadata.groups = colnames[2:]

        dataset.metadata.comment_lines = 1
        # Only adjust the count when it actually holds a number.
        if isinstance(dataset.metadata.data_lines, int):
            dataset.metadata.data_lines -= 1
Exemplo n.º 27
0
    def sniff(self, filename):
        """
        Determines whether the file is a Reference Taxonomy

        http://www.mothur.org/wiki/Taxonomy_outline
        A table with 2 or 3 columns:
        - SequenceName
        - Taxonomy (semicolon-separated taxonomy in descending order)
        - integer ?
        Example: 2-column (http://www.mothur.org/wiki/Taxonomy_outline)
          X56533.1        Eukaryota;Alveolata;Ciliophora;Intramacronucleata;Oligohymenophorea;Hymenostomatida;Tetrahymenina;Glaucomidae;Glaucoma;
          X97975.1        Eukaryota;Parabasalidea;Trichomonada;Trichomonadida;unclassified_Trichomonadida;
          AF052717.1      Eukaryota;Parabasalidea;
        Example: 3-column (http://vamps.mbl.edu/resources/databases.php)
          v3_AA008	Bacteria;Firmicutes;Bacilli;Lactobacillales;Streptococcaceae;Streptococcus	5
          v3_AA016	Bacteria	120
          v3_AA019	Archaea;Crenarchaeota;Marine_Group_I	1

        Rows whose first field starts with '@' or '#' are ignored. Every
        other row must have two or three tab-separated fields, a second
        field matching the taxonomy pattern and, when present, an integer
        third field. At least one row is required, and at least one taxonomy
        must contain a semicolon.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.ref.taxonomy' )
        >>> RefTaxonomy().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.ref.taxonomy' )
        >>> RefTaxonomy().sniff( fname )
        False
        """
        headers = get_headers(filename, sep='\t', count=300)
        pat_prog = re.compile('^([^ \t\n\r\x0c\x0b;]+([(]\\d+[)])?(;[^ \t\n\r\x0c\x0b;]+([(]\\d+[)])?)*(;)?)$')
        count = 0
        found_semicolons = False
        for fields in headers:
            if fields[0].startswith('@') or fields[0].startswith('#'):
                continue
            width = len(fields)
            if width < 2 or width > 3:
                return False
            taxonomy = fields[1]
            if not pat_prog.match(taxonomy):
                return False
            if ';' in taxonomy:
                found_semicolons = True
            if width == 3:
                try:
                    int(fields[2])
                except Exception:
                    return False
            count += 1

        # Require that at least one entry has semicolons in the 2nd column
        return count > 0 and found_semicolons
Exemplo n.º 28
0
    def set_meta(self,
                 dataset,
                 overwrite=True,
                 skip=1,
                 max_data_lines=None,
                 **kwd):
        """
        Set metadata for SffFlow files.

        Delegates to the parent implementation with ``skip`` fixed at 1, then
        stores the integer found in the first field of the first line as
        ``dataset.metadata.flow_values``. Any failure while doing so is
        logged as a warning instead of being raised.
        """
        super(SffFlow, self).set_meta(dataset, overwrite, 1, max_data_lines)

        first_rows = get_headers(dataset.file_name, sep='\t', count=1)
        try:
            dataset.metadata.flow_values = int(first_rows[0][0])
        except Exception as err:
            log.warning("SffFlow set_meta %s" % err)
Exemplo n.º 29
0
    def sniff(self, filename):
        """
        Decide whether the file looks like a mothur reference taxonomy.

        See http://www.mothur.org/wiki/Taxonomy_outline

        Each data row (rows whose first field starts with '@' or '#' are
        ignored) must have 2 or 3 tab-separated columns:
        - sequence name
        - taxonomy: semicolon-separated names, each optionally followed by
          a parenthesised integer, with an optional trailing semicolon
        - optional integer

        Example, 2 columns:
          X56533.1<TAB>Eukaryota;Alveolata;Ciliophora;
          AF052717.1<TAB>Eukaryota;Parabasalidea;
        Example, 3 columns:
          v3_AA008<TAB>Bacteria;Firmicutes;Bacilli<TAB>5
          v3_AA016<TAB>Bacteria<TAB>120

        The file is accepted only if at least one data row was seen and at
        least one taxonomy column contained a semicolon.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.ref.taxonomy' )
        >>> RefTaxonomy().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.ref.taxonomy' )
        >>> RefTaxonomy().sniff( fname )
        False
        """
        taxonomy_re = re.compile('^([^ \t\n\r\x0c\x0b;]+([(]\\d+[)])?(;[^ \t\n\r\x0c\x0b;]+([(]\\d+[)])?)*(;)?)$')
        rows = get_headers(filename, sep='\t', count=300)
        entries = 0
        saw_semicolon = False
        for fields in rows:
            name = fields[0]
            if name.startswith('@') or name.startswith('#'):
                # Comment-like rows are not validated and not counted.
                continue
            if len(fields) < 2 or len(fields) > 3:
                return False
            taxonomy = fields[1]
            if taxonomy_re.match(taxonomy) is None:
                return False
            if ';' in taxonomy:
                saw_semicolon = True
            if len(fields) == 3:
                # The optional third column has to parse as an integer.
                try:
                    int(fields[2])
                except Exception:
                    return False
            entries += 1

        # At least one entry, and at least one of them with a semicolon in
        # the taxonomy column.
        return entries > 0 and saw_semicolon
Exemplo n.º 30
0
    def set_meta(self, dataset, overwrite=True, skip=None, max_data_lines=None, **kwd):
        """
        Populate the dataset metadata, including the list of group names.

        After delegating to the parent ``set_meta``, collects the distinct
        values found in the second tab-separated column of every row that has
        more than one column and stores them, as a list, in
        ``dataset.metadata.groups``.
        """
        super(Group, self).set_meta(dataset, overwrite, skip, max_data_lines)

        # NOTE(review): count=-1 presumably asks get_headers for every line
        # of the file - confirm against its implementation.
        rows = get_headers(dataset.file_name, sep='\t', count=-1)
        unique_groups = {fields[1] for fields in rows if len(fields) > 1}
        dataset.metadata.groups = list(unique_groups)
Exemplo n.º 31
0
    def sniff(self, filename):
        """
        Determines whether the file is a square distance matrix (Column-formatted distance matrix) format
        The first line has the number of sequences in the matrix.
        The following lines have the sequence name in the first column plus a column for the distance to each sequence
        in the row order in which they appear in the matrix.
               3
               U68589  0.0000  0.3371  0.3610
               U68590  0.3371  0.0000  0.3783
               U68590  0.3371  0.0000  0.3783

        Lines whose first field starts with '@' are ignored. A file without
        any remaining line (for example an empty file) is rejected.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.square.dist' )
        >>> SquareDistanceMatrix().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.square.dist' )
        >>> SquareDistanceMatrix().sniff( fname )
        False
        """
        numlines = 300
        headers = get_headers(filename, sep='\t', count=numlines)
        # Stays None until the count line has been parsed, so that a file with
        # no usable line is rejected instead of raising UnboundLocalError.
        sequence_count = None
        line_num = 0
        for line in headers:
            if line[0].startswith('@'):
                continue
            if line_num == 0:
                # The count line: at most two fields which, joined, form an
                # integer (the joined form tolerates a leading tab).
                if len(line) > 2:
                    return False
                try:
                    sequence_count = int(''.join(line))
                except ValueError:
                    return False
            else:
                # number of fields should equal the number of sequences
                # plus one for the sequence name
                if len(line) != sequence_count + 1:
                    return False
                try:
                    # Distances should be floats
                    # NOTE(review): validation starts at index 2, so the field
                    # at index 1 is never checked - confirm this is intended.
                    for column in line[2:]:
                        float(column)
                except ValueError:
                    return False
            line_num += 1

        if sequence_count is None:
            return False

        # check if the number of lines in the file was as expected
        # NOTE(review): the second condition presumably covers files longer
        # than the sampled window - confirm how many rows get_headers returns.
        return line_num == sequence_count + 1 or line_num == numlines + 1
Exemplo n.º 32
0
    def sniff(self, filename):
        """
        Determines whether the file is a square distance matrix (Column-formatted distance matrix) format
        The first line has the number of sequences in the matrix.
        The following lines have the sequence name in the first column plus a column for the distance to each sequence
        in the row order in which they appear in the matrix.
               3
               U68589  0.0000  0.3371  0.3610
               U68590  0.3371  0.0000  0.3783
               U68590  0.3371  0.0000  0.3783

        Lines whose first field starts with '@' are ignored. A file without
        any remaining line (for example an empty file) is rejected.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.square.dist' )
        >>> SquareDistanceMatrix().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.square.dist' )
        >>> SquareDistanceMatrix().sniff( fname )
        False
        """
        numlines = 300
        headers = get_headers(filename, sep='\t', count=numlines)
        # Stays None until the count line has been parsed, so that a file with
        # no usable line is rejected instead of raising UnboundLocalError.
        sequence_count = None
        line_num = 0
        for line in headers:
            if line[0].startswith('@'):
                continue
            if line_num == 0:
                # The count line: at most two fields which, joined, form an
                # integer (the joined form tolerates a leading tab).
                if len(line) > 2:
                    return False
                try:
                    sequence_count = int(''.join(line))
                except ValueError:
                    return False
            else:
                # number of fields should equal the number of sequences
                # plus one for the sequence name
                if len(line) != sequence_count + 1:
                    return False
                try:
                    # Distances should be floats
                    # NOTE(review): validation starts at index 2, so the field
                    # at index 1 is never checked - confirm this is intended.
                    for column in line[2:]:
                        float(column)
                except ValueError:
                    return False
            line_num += 1

        if sequence_count is None:
            return False

        # check if the number of lines in the file was as expected
        # NOTE(review): the second condition presumably covers files longer
        # than the sampled window - confirm how many rows get_headers returns.
        return line_num == sequence_count + 1 or line_num == numlines + 1
Exemplo n.º 33
0
 def sniff(self, filename):
     """
     Report whether the first line is the expected tab-separated column
     header. Any error while reading the file yields False.

     >>> from galaxy.datatypes.sniff import get_test_fname
     >>> fname = get_test_fname('test_tab.bed')
     >>> PlantTribesKsComponents().sniff(fname)
     False
     >>> fname = get_test_fname('1.ptkscmp')
     >>> PlantTribesKsComponents().sniff(fname)
     True
     """
     expected_header = 'species\tn\tnumber_comp\tlnL\tAIC\tBIC\tmean\tvariance\tporportion'
     try:
         # The separator is a literal backslash followed by 't', not a tab,
         # so the first field is compared against the full tabbed header.
         first_field = get_headers(filename, '\\t', 1)[0][0]
     except Exception:
         return False
     return first_field == expected_header
Exemplo n.º 34
0
 def set_meta(self, dataset, **kwd):
     """
     Set metadata for Gal file.

     Reads the first five tab-separated header rows and copies their
     values into the dataset metadata: format and version from row 0, the
     two integer counts from row 1, the file type from row 2, and, when
     the corresponding key is present, the block count and block type from
     rows 3 and 4.
     """
     super().set_meta(dataset, **kwd)

     def value_after_equals(field):
         # Fields look like  "Key=Value"  possibly wrapped in double quotes.
         return field.strip().strip('"').split("=")[1]

     rows = get_headers(dataset.file_name, sep="\t", count=5)
     meta = dataset.metadata
     meta.file_format = rows[0][0]
     meta.version_number = rows[0][1]
     meta.number_of_optional_header_records = int(rows[1][0])
     meta.number_of_data_columns = int(rows[1][1])
     meta.file_type = value_after_equals(rows[2][0])
     if "BlockCount" in rows[3][0]:
         meta.block_count = int(value_after_equals(rows[3][0]))
     if "BlockType" in rows[4][0]:
         meta.block_type = int(value_after_equals(rows[4][0]))
Exemplo n.º 35
0
 def sniff(self, filename):
     """
     Report whether the first line is the expected tab-separated column
     header. Any error while reading the file yields False.

     >>> from galaxy.datatypes.sniff import get_test_fname
     >>> fname = get_test_fname('test_tab.bed')
     >>> PlantTribesKsComponents().sniff(fname)
     False
     >>> fname = get_test_fname('1.ptkscmp')
     >>> PlantTribesKsComponents().sniff(fname)
     True
     """
     expected_header = 'species\tn\tnumber_comp\tlnL\tAIC\tBIC\tmean\tvariance\tporportion'
     try:
         # The separator is a literal backslash followed by 't', not a tab,
         # so the first field is compared against the full tabbed header.
         first_field = get_headers(filename, '\\t', 1)[0][0]
     except Exception:
         return False
     return first_field == expected_header
Exemplo n.º 36
0
    def set_meta(self, dataset, overwrite=True, skip=0, **kwd):
        """
        Populate the dataset metadata, including ``sequence_count``.

        After delegating to the parent ``set_meta``, scans the header rows,
        ignoring those whose first field starts with '@', and stores the
        first row that parses as an integer in
        ``dataset.metadata.sequence_count``. Rows that fail to parse are
        logged as warnings, except when ``self`` is a
        ``PairwiseDistanceMatrix``, and the scan continues with the next row.
        """
        super(DistanceMatrix, self).set_meta(dataset, overwrite=overwrite, skip=skip, **kwd)

        for fields in get_headers(dataset.file_name, sep='\t'):
            if fields[0].startswith('@'):
                continue
            try:
                # seq count sometimes preceded by tab, hence the join
                dataset.metadata.sequence_count = int(''.join(fields))
                break
            except Exception as err:
                if not isinstance(self, PairwiseDistanceMatrix):
                    log.warning("DistanceMatrix set_meta %s" % err)
Exemplo n.º 37
0
    def sniff_prefix(self, file_prefix):
        """
        Try to guess if the file is a FPS file.

        The file is accepted when the first tab-separated field of its first
        line, stripped of surrounding whitespace, is exactly '#FPS1'. Input
        that yields no header line at all is rejected.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('q.fps')
        >>> FPS().sniff(fname)
        True
        >>> fname = get_test_fname('drugbank_drugs.cml')
        >>> FPS().sniff(fname)
        False
        """
        header = get_headers(file_prefix, sep='\t', count=1)
        # Guard against an empty result so that empty input is reported as
        # "not FPS" instead of raising IndexError.
        return bool(header) and header[0][0].strip() == '#FPS1'
Exemplo n.º 38
0
    def sniff_prefix(self, file_prefix):
        """
        Try to guess if the file is a FPS file.

        The file is accepted when the first tab-separated field of its first
        line, stripped of surrounding whitespace, is exactly '#FPS1'. Input
        that yields no header line at all is rejected.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('q.fps')
        >>> FPS().sniff(fname)
        True
        >>> fname = get_test_fname('drugbank_drugs.cml')
        >>> FPS().sniff(fname)
        False
        """
        header = get_headers(file_prefix, sep='\t', count=1)
        # Guard against an empty result so that empty input is reported as
        # "not FPS" instead of raising IndexError.
        return bool(header) and header[0][0].strip() == '#FPS1'
Exemplo n.º 39
0
    def sniff(self, filename):
        """
        Try to guess if the file is a InChI file.

        Every sampled line (up to the ``count=10`` requested from
        ``get_headers``) must start with 'InChI='. A file that yields no
        lines is rejected; without that check the empty loop would fall
        through and report an empty file as InChI.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('drugbank_drugs.inchi')
        >>> InChI().sniff(fname)
        True

        >>> fname = get_test_fname('drugbank_drugs.cml')
        >>> InChI().sniff(fname)
        False
        """
        inchi_lines = get_headers(filename, sep=' ', count=10)
        if not inchi_lines:
            return False
        return all(inchi[0].startswith('InChI=') for inchi in inchi_lines)
Exemplo n.º 40
0
    def sniff(self, filename):
        """
        Try to guess if the file is a InChI file.

        Every sampled line (up to the ``count=10`` requested from
        ``get_headers``) must start with 'InChI='. A file that yields no
        lines is rejected; without that check the empty loop would fall
        through and report an empty file as InChI.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('drugbank_drugs.inchi')
        >>> InChI().sniff(fname)
        True

        >>> fname = get_test_fname('drugbank_drugs.cml')
        >>> InChI().sniff(fname)
        False
        """
        inchi_lines = get_headers(filename, sep=' ', count=10)
        if not inchi_lines:
            return False
        return all(inchi[0].startswith('InChI=') for inchi in inchi_lines)
Exemplo n.º 41
0
    def sniff(self, filename):
        """
        Determines whether the file is in axt format

        axt alignment files are produced from Blastz, an alignment tool available from Webb Miller's lab
        at Penn State University.

        Each alignment block in an axt file contains three lines: a summary line and 2 sequence lines.
        Blocks are separated from one another by blank lines.

        The summary line contains chromosomal position and size information about the alignment. It
        consists of 9 required fields.

        The sequence lines contain the sequence of the primary assembly (line 2) and aligning assembly
        (line 3) with inserts.  Repeats are indicated by lower-case letters.

        For complete details see http://genome.ucsc.edu/goldenPath/help/axt.html

        The decision is made on the first non-blank line that is either a
        '##matrix=axt' marker (accepted immediately) or a non-comment line,
        which must be a valid summary line. Files with fewer than four
        sampled lines, or with no such line, are rejected.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'alignment.axt' )
        >>> Axt().sniff( fname )
        True
        >>> fname = get_test_fname( 'alignment.lav' )
        >>> Axt().sniff( fname )
        False
        """
        headers = get_headers(filename, None)
        if len(headers) < 4:
            return False
        for hdr in headers:
            if not hdr:
                # blank line between alignment blocks
                continue
            if hdr[0].startswith("##matrix=axt"):
                return True
            if hdr[0].startswith("#"):
                continue
            if len(hdr) != 9:
                return False
            try:
                # Convert eagerly: a bare map(int, ...) is lazy on Python 3
                # and would never raise, silently skipping this validation.
                for field in (hdr[0], hdr[2], hdr[3], hdr[5], hdr[6], hdr[8]):
                    int(field)
            except ValueError:
                return False
            return hdr[7] in data.valid_strand
        # Only blank or comment lines were seen.
        return False
Exemplo n.º 42
0
    def sniff( self, filename ):
        """
        Determines whether the file is in axt format

        axt alignment files are produced from Blastz, an alignment tool available from Webb Miller's lab
        at Penn State University.

        Each alignment block in an axt file contains three lines: a summary line and 2 sequence lines.
        Blocks are separated from one another by blank lines.

        The summary line contains chromosomal position and size information about the alignment. It
        consists of 9 required fields.

        The sequence lines contain the sequence of the primary assembly (line 2) and aligning assembly
        (line 3) with inserts.  Repeats are indicated by lower-case letters.

        For complete details see http://genome.ucsc.edu/goldenPath/help/axt.html

        The decision is made on the first non-blank line that is either a
        '##matrix=axt' marker (accepted immediately) or a non-comment line,
        which must be a valid summary line. Files with fewer than four
        sampled lines, or with no such line, are rejected.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'alignment.axt' )
        >>> Axt().sniff( fname )
        True
        >>> fname = get_test_fname( 'alignment.lav' )
        >>> Axt().sniff( fname )
        False
        """
        headers = get_headers( filename, None )
        if len(headers) < 4:
            return False
        for hdr in headers:
            if not hdr:
                # blank line between alignment blocks
                continue
            if hdr[0].startswith("##matrix=axt"):
                return True
            if hdr[0].startswith("#"):
                continue
            if len(hdr) != 9:
                return False
            try:
                # Convert eagerly: a bare map(int, ...) is lazy on Python 3
                # and would never raise, silently skipping this validation.
                for field in (hdr[0], hdr[2], hdr[3], hdr[5], hdr[6], hdr[8]):
                    int(field)
            except ValueError:
                return False
            return hdr[7] in data.valid_strand
        # Only blank or comment lines were seen.
        return False
Exemplo n.º 43
0
    def sniff( self, filename ):
        """
        Determines whether the file is in html format

        The file is accepted when the first whitespace-separated token of any
        sampled line contains '<html>' (case-insensitive). If the sampled
        lines cannot be inspected as text the file is rejected; a sniffer
        should not claim a format it could not verify.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'complete.bed' )
        >>> Html().sniff( fname )
        False
        >>> fname = get_test_fname( 'file.html' )
        >>> Html().sniff( fname )
        True
        """
        headers = get_headers( filename, None )
        try:
            # Empty lines split into an empty list and are skipped.
            return any( hdr and '<html>' in hdr[0].lower() for hdr in headers )
        except ( AttributeError, TypeError ):
            # Tokens that are not text (for example bytes) end up here.
            return False
Exemplo n.º 44
0
    def sniff(self, filename):
        """
        Determines whether the file is in html format

        The file is accepted when the first whitespace-separated token of any
        sampled line contains '<html>' (case-insensitive). If the sampled
        lines cannot be inspected as text the file is rejected; a sniffer
        should not claim a format it could not verify.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'complete.bed' )
        >>> Html().sniff( fname )
        False
        >>> fname = get_test_fname( 'file.html' )
        >>> Html().sniff( fname )
        True
        """
        headers = get_headers(filename, None)
        try:
            # Empty lines split into an empty list and are skipped.
            return any(hdr and '<html>' in hdr[0].lower() for hdr in headers)
        except (AttributeError, TypeError):
            # Tokens that are not text (for example bytes) end up here.
            return False
Exemplo n.º 45
0
    def sniff(self, filename):
        """
        Decide whether the file is a lane mask filter: exactly one sampled
        line holding a single tab-separated field made only of '0' and '1'.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.filter' )
        >>> LaneMask().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.filter' )
        >>> LaneMask().sniff( fname )
        False
        """
        rows = get_headers(filename, sep='\t')
        single_field = len(rows) == 1 and len(rows[0]) == 1
        if not single_field:
            return False
        return re.match('^[01]+$', rows[0][0]) is not None
Exemplo n.º 46
0
    def sniff(self, filename):
        """
        Decide whether the file is a lane mask filter: exactly one sampled
        line holding a single tab-separated field made only of '0' and '1'.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.filter' )
        >>> LaneMask().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.filter' )
        >>> LaneMask().sniff( fname )
        False
        """
        rows = get_headers(filename, sep='\t', count=2)
        single_field = len(rows) == 1 and len(rows[0]) == 1
        if not single_field:
            return False
        return re.match('^[01]+$', rows[0][0]) is not None
Exemplo n.º 47
0
    def sniff(self, filename):
        """
        Report whether all six expected section names occur among the
        sampled lines.

        The first space-separated field of each sampled line (stripped of
        whitespace) is collected; the file is accepted only when HEADER,
        TITLE, COMPND, SOURCE, KEYWDS and EXPDTA have all been seen.
        """
        required_sections = {'HEADER', 'TITLE', 'COMPND', 'SOURCE', 'KEYWDS', 'EXPDTA'}
        seen_sections = set()
        for fields in get_headers(filename, sep=' ', count=300):
            seen_sections.add(fields[0].strip())
        return required_sections <= seen_sections
Exemplo n.º 48
0
    def sniff(self, filename):
        """
        Determines whether the file is a frequency tabular format for chimera analysis
        #1.14.0
        0	0.000
        1	0.000
        ...
        155	0.975

        Lines whose first field starts with '@' are ignored. The first
        remaining line must be a single field starting with '#'; every
        following line must be ``<int><TAB><float>``. At least one data line
        after the version line is required.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.freq' )
        >>> Frequency().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.freq' )
        >>> Frequency().sniff( fname )
        False
        """
        headers = get_headers(filename, sep='\t')
        count = 0
        for line in headers:
            if line[0].startswith('@'):
                continue
            if count == 0:
                # first line should be #<version string>
                # The negation must cover the whole condition: the former
                # `not A and B` form only rejected single-column lines and let
                # any multi-column first line through.
                if not (line[0].startswith('#') and len(line) == 1):
                    return False
            else:
                # all other lines should be <int> <float>
                if len(line) != 2:
                    return False
                try:
                    int(line[0])
                    float(line[1])
                except ValueError:
                    return False
            count += 1

        return count > 1
Exemplo n.º 49
0
    def sniff(self, filename):
        """
        Determines whether a file is in gdm format

        The decision is made on the first sampled line that has a non-empty
        first field not starting with '#'. That line must:

        - consist of exactly 6 tab-separated fields;
        - hold integers in columns 1 and 2;
        - hold either '.' (score not provided) or a number in column 5.

        Columns 0, 3 and 4 are free-form strings and are not validated.
        Returns False when no such line is found among the sampled lines.
        """
        headers = get_headers(filename, '\t', count=10)

        for hdr in headers:
            if not (hdr and hdr[0]) or hdr[0].startswith('#'):
                # blank or comment line
                continue
            if len(hdr) != 6:
                return False
            try:
                int(hdr[1])
                int(hdr[2])
                # '.' marks a missing score; anything else must be numeric
                if hdr[5] != '.':
                    float(hdr[5])
            except ValueError:
                return False
            # Only the first data line is inspected.
            return True

        return False
Exemplo n.º 50
0
    def sniff(self, filename, vals_are_int=False):
        """
        Decide whether the file is an otu (operational taxonomic unit)
        Shared format:
        label<TAB>group<TAB>count[<TAB>value(1..n)]
        The first line is column headings as of Mothur v 1.2

        Lines whose first field starts with '@' are ignored. Every other
        line needs at least three fields. Except for a leading line whose
        first field is 'label', the third field must be an integer equal to
        the number of value columns that follow, and each value must parse
        as ``int`` (when ``vals_are_int`` is true) or ``float``. More than
        one such line is required.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.shared' )
        >>> GroupAbund().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.shared' )
        >>> GroupAbund().sniff( fname )
        False
        """
        parse_value = int if vals_are_int else float
        rows_seen = 0
        for fields in get_headers(filename, sep='\t'):
            if fields[0].startswith('@'):
                continue
            if len(fields) < 3:
                return False
            is_heading_row = rows_seen == 0 and fields[0] == 'label'
            if not is_heading_row:
                try:
                    declared_values = int(fields[2])
                    if declared_values + 3 != len(fields):
                        return False
                    for value in fields[3:]:
                        parse_value(value)
                except ValueError:
                    return False
            rows_seen += 1
        return rows_seen > 1
Exemplo n.º 51
0
    def sniff(self, filename):
        """
        Decide whether the file is a quantiles tabular format for chimera analysis
        1	0	0	0	0	0	0
        2       0.309198        0.309198        0.37161 0.37161 0.37161 0.37161
        3       0.510982        0.563213        0.693529        0.858939        1.07442 1.20608
        ...

        Lines whose first field starts with '@' or '#' are ignored. Every
        other line must hold exactly seven tab-separated fields: an integer
        followed by six floats. At least one such line is required.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.quan' )
        >>> Quantile().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.quan' )
        >>> Quantile().sniff( fname )
        False
        """
        valid_rows = 0
        for fields in get_headers(filename, sep='\t'):
            if fields[0].startswith(('@', '#')):
                continue
            if len(fields) != 7:
                return False
            try:
                int(fields[0])
                for quantile in fields[1:]:
                    float(quantile)
            except Exception:
                return False
            valid_rows += 1
        return valid_rows > 0
Exemplo n.º 52
0
    def sniff( self, filename ):
        """
        Determines whether the file is in lav format

        LAV is an alignment format developed by Webb Miller's group. It is the primary output format for BLASTZ.
        The first line of a .lav file begins with #:lav.

        For complete details see http://www.bioperl.org/wiki/LAV_alignment_format

        More than one sampled line is required, and the first
        whitespace-separated token of the first line must start with '#:lav'.

        >>> fname = get_test_fname( 'alignment.lav' )
        >>> Lav().sniff( fname )
        True
        >>> fname = get_test_fname( 'alignment.axt' )
        >>> Lav().sniff( fname )
        False
        """
        headers = get_headers( filename, None )
        if len(headers) <= 1:
            return False
        first_line = headers[0]
        # A blank first line splits into an empty list; test for that
        # explicitly instead of relying on a catch-all except clause.
        return bool(first_line) and first_line[0].startswith('#:lav')
Exemplo n.º 53
0
    def sniff(self, filename):
        """
        Checks for 'pileup-ness'

        There are two main types of pileup: 6-column and 10-column. For both,
        the first three and last two columns are the same. We only check the
        first three to allow for some personalization of the format.

        Every sampled line that does not start with '#' must have at least
        three tab-separated columns, an integer in column 1 and a single
        reference base (A, C, G, T or N, either case) in column 2.

        NOTE(review): a file with no data lines at all passes every check and
        is therefore accepted; that behavior is kept unchanged here.

        >>> fname = get_test_fname( 'interval.interval' )
        >>> Pileup().sniff( fname )
        False
        >>> fname = get_test_fname( '6col.pileup' )
        >>> Pileup().sniff( fname )
        True
        >>> fname = get_test_fname( '10col.pileup' )
        >>> Pileup().sniff( fname )
        True
        """
        valid_bases = {'A', 'C', 'G', 'T', 'N', 'a', 'c', 'g', 't', 'n'}
        headers = get_headers(filename, '\t')
        for hdr in headers:
            if not hdr or hdr[0].startswith('#'):
                continue
            if len(hdr) < 3:
                return False
            try:
                # chrom start in column 1 (with 0-based columns)
                int(hdr[1])
            except ValueError:
                return False
            # reference base is in column 2; checked explicitly because an
            # assert would be stripped when Python runs with -O
            if hdr[2] not in valid_bases:
                return False
        return True
Exemplo n.º 54
0
 def sniff(self, filename):
     """
     Report whether the first tab-separated field of the first line,
     stripped of surrounding whitespace, is exactly '#FPS1'. Input that
     yields no header line at all is rejected.
     """
     header = get_headers(filename, sep='\t', count=1)
     # Guard against an empty result so that empty input is reported as
     # "not FPS" instead of raising IndexError.
     return bool(header) and header[0][0].strip() == '#FPS1'
Exemplo n.º 55
0
 def sniff(self, filename):
     """
     Report whether the first line is the expected column header: at least
     seven tab-separated fields, with 'gene_id' in column 0, a name
     starting with 'transcript_id' in column 1 and 'FPKM' in column 6.
     """
     # Split on tabs: splitting a single line on '\n' yields one field, so
     # the seven-column check below could never succeed.
     headers = get_headers(filename, '\t', count=1)
     return (len(headers) > 0 and len(headers[0]) >= 7
             and headers[0][0] == "gene_id"
             and headers[0][1].startswith("transcript_id")
             and headers[0][6] == "FPKM")
Exemplo n.º 56
0
 def sniff( self, filename ):
     """
     Report whether the first line starts with '##fileformat=VCF'. Input
     that yields no header line at all is rejected.
     """
     # The '\n' separator keeps the whole first line as a single field.
     headers = get_headers( filename, '\n', count=1 )
     # Guard against an empty result instead of raising IndexError.
     return bool(headers) and headers[0][0].startswith("##fileformat=VCF")