示例#1
0
 def set_meta( self, dataset, overwrite = True, **kwd ):
     """Sets the metadata information for datasets previously determined to be in bed format.

     Scans the file for the first non-comment line that has more than two
     tab-separated fields and that starts (case-insensitively) with one of
     the prefixes in ``data.col1_startswith``.  From that line ``nameCol``
     and ``strandCol`` are derived; afterwards ``Tabular.set_meta`` is called
     with ``skip`` set to the index of the last line examined.

     Nothing is done when ``dataset.has_data()`` is false.  When ``overwrite``
     is false, metadata elements that are already set are left untouched.
     Extra keyword arguments are accepted but not forwarded.
     """
     if not dataset.has_data():
         return
     i = 0
     # Context manager: the handle is closed even if parsing raises.
     with open( dataset.file_name ) as in_fh:
         for i, line in enumerate( in_fh ):
             line = line.rstrip( '\r\n' )
             if not line or line.startswith( '#' ):
                 continue
             elems = line.split( '\t' )
             if len( elems ) <= 2:
                 continue
             lowered = line.lower()
             if not any( lowered.startswith( prefix ) for prefix in data.col1_startswith ):
                 continue
             if len( elems ) > 3:
                 if overwrite or not dataset.metadata.element_is_set( 'nameCol' ):
                     dataset.metadata.nameCol = 4
             # Fewer than six fields: record 0 for the strand column, otherwise 6.
             strand_col = 0 if len( elems ) < 6 else 6
             if overwrite or not dataset.metadata.element_is_set( 'strandCol' ):
                 dataset.metadata.strandCol = strand_col
             break  # metadata is set, stop scanning
     Tabular.set_meta( self, dataset, overwrite = overwrite, skip = i )
示例#2
0
 def set_meta( self, dataset, overwrite = True, **kwd ):
     """Locate the first plausible data line, then delegate to ``Tabular.set_meta``.

     A line qualifies when it is non-empty, does not start with ``#``, has
     exactly nine tab-separated fields, its 4th and 5th fields are each an
     integer or ``.``, its 7th field is in ``self.valid_gff3_strand`` and its
     8th field is in ``self.valid_gff3_phase``.  When both coordinates are
     integers the start must additionally be smaller than the end.

     ``Tabular.set_meta`` is then called with ``skip`` set to the index of the
     qualifying line (or of the last line read when none qualifies; 0 for an
     empty file).  Extra keyword arguments are accepted but not forwarded.
     """
     i = 0
     # Context manager: the handle is closed even if parsing raises.
     with open( dataset.file_name ) as in_fh:
         for i, line in enumerate( in_fh ):
             line = line.rstrip( '\r\n' )
             if not line or line.startswith( '#' ):
                 continue
             elems = line.split( '\t' )
             if len( elems ) != 9:
                 continue
             # None marks a coordinate given as '.'; resetting per line keeps a
             # value parsed from an earlier line from leaking into this check.
             start = end = None
             try:
                 start = int( elems[3] )
                 valid_start = True
             except ValueError:
                 valid_start = elems[3] == '.'
             try:
                 end = int( elems[4] )
                 valid_end = True
             except ValueError:
                 valid_end = elems[4] == '.'
             if not ( valid_start and valid_end ):
                 continue
             # The ordering test only makes sense when both coordinates are numeric.
             if start is not None and end is not None and not start < end:
                 continue
             if elems[6] in self.valid_gff3_strand and elems[7] in self.valid_gff3_phase:
                 break
     Tabular.set_meta( self, dataset, overwrite = overwrite, skip = i )
示例#3
0
 def set_meta(self, dataset, **kwd):
     """Set tabular metadata, then derive column info from the first line.

     After ``Tabular.set_meta`` runs, ``markerCol`` is set to 1.  The first
     line of the file is split on tabs: ``columns`` becomes the number of
     fields and ``column_types`` marks the first field ``'string'`` and every
     other field ``'numeric'``.

     An empty file has no first line; in that case ``columns`` and
     ``column_types`` are left as ``Tabular.set_meta`` set them.

     Always returns True.
     """
     Tabular.set_meta( self, dataset, **kwd)
     dataset.metadata.markerCol = 1
     # Read only the first line, and close the handle deterministically.
     with open(dataset.file_name, 'r') as fh:
         first_line = fh.readline()
     if not first_line:
         return True
     header = first_line.strip().split('\t')
     dataset.metadata.columns = len(header)
     # split() always yields at least one field, so the first slot always exists.
     dataset.metadata.column_types = ['string'] + ['numeric'] * (len(header) - 1)
     return True
示例#4
0
 def set_meta(self, dataset, **kwd):
     """Set tabular metadata, then derive column info from the first line.

     After ``Tabular.set_meta`` runs, ``markerCol`` is set to 1.  The first
     line of the file is split on tabs: ``columns`` becomes the number of
     fields and ``column_types`` marks the first field ``'string'`` and every
     other field ``'numeric'``.

     An empty file has no first line; in that case ``columns`` and
     ``column_types`` are left as ``Tabular.set_meta`` set them.

     Always returns True.
     """
     Tabular.set_meta(self, dataset, **kwd)
     dataset.metadata.markerCol = 1
     # Read only the first line, and close the handle deterministically.
     with open(dataset.file_name, 'r') as fh:
         first_line = fh.readline()
     if not first_line:
         return True
     header = first_line.strip().split('\t')
     dataset.metadata.columns = len(header)
     # split() always yields at least one field, so the first slot always exists.
     dataset.metadata.column_types = ['string'] + ['numeric'] * (len(header) - 1)
     return True
示例#5
0
 def set_meta(self, dataset, overwrite=True, **kwd):
     """Run ``Tabular.set_meta``, then fill in comment metadata if it is missing.

     ``Tabular.set_meta`` is called with ``max_data_lines=None`` and
     ``max_guess_type_data_lines=1000`` plus any extra keyword arguments.
     When ``dataset.metadata.comment_metadata`` is still ``None`` afterwards,
     it is set to a copy of the ``comment_metadata`` of a fresh
     ``DatasetCommentMetadata(dataset)`` and
     ``self.set_dataset_metadata_from_comments`` is invoked.
     """
     Tabular.set_meta(self, dataset, overwrite=overwrite, max_data_lines=None,
                      max_guess_type_data_lines=1000, **kwd)
     # Comment metadata already present: nothing more to do.
     if dataset.metadata.comment_metadata is not None:
         return
     parsed_comments = DatasetCommentMetadata(dataset)
     comment_metadata_copy = parsed_comments.comment_metadata.copy()
     dataset.metadata.comment_metadata = comment_metadata_copy
     self.set_dataset_metadata_from_comments(dataset)
示例#6
0
 def set_meta( self, dataset, overwrite = True, **kwd ):
     """Locate the first plausible data line, then delegate to ``Tabular.set_meta``.

     A line qualifies when it is non-empty, does not start with ``#``, has
     exactly nine tab-separated fields, and its 4th and 5th fields both parse
     as integers.  ``Tabular.set_meta`` is called with ``skip`` set to the
     index of that line (or of the last line read when none qualifies; 0 for
     an empty file).  Extra keyword arguments are accepted but not forwarded.
     """
     i = 0
     # Context manager: the handle is closed even if parsing raises.
     with open( dataset.file_name ) as in_fh:
         for i, line in enumerate( in_fh ):
             line = line.rstrip( '\r\n' )
             if not line or line.startswith( '#' ):
                 continue
             elems = line.split( '\t' )
             if len( elems ) != 9:
                 continue
             try:
                 int( elems[3] )
                 int( elems[4] )
             except ValueError:
                 # Non-numeric coordinates: not a data line, keep looking.
                 continue
             break
     Tabular.set_meta( self, dataset, overwrite = overwrite, skip = i )
示例#7
0
    def set_meta(self, dataset, overwrite=True, skip=None, max_data_lines=None, **kwd):
        """Set tabular metadata, then record the distinct first-column values.

        ``Tabular.set_meta`` is called with ``overwrite``, ``skip`` and
        ``max_data_lines`` passed positionally; extra keyword arguments are
        accepted but not forwarded.  Every line of the file is then stripped
        and split on tabs, and the set of first fields is stored as a list in
        ``dataset.metadata.args`` (order unspecified; a blank line contributes
        the empty string).
        """
        Tabular.set_meta(self, dataset, overwrite, skip, max_data_lines)
        tis_args = set()
        # ``with`` closes the file on every path and, unlike a try/finally
        # around open(), cannot hit an unbound handle when open() itself fails.
        with open(dataset.file_name) as fh:
            for line in fh:
                # split() always returns at least one element, so [0] is safe.
                tis_args.add(line.strip().split("\t")[0])
        dataset.metadata.args = list(tis_args)
示例#8
0
 def set_meta( self, dataset, overwrite = True, **kwd ):
     """Locate the first plausible data line, then delegate to ``Tabular.set_meta``.

     A line qualifies when it is non-empty, does not start with ``#``, and its
     first tab-separated field either parses as a float or starts
     (case-insensitively) with one of the prefixes in
     ``data.col1_startswith``.  ``Tabular.set_meta`` is called with ``skip``
     set to the index of that line (or of the last line read when none
     qualifies; 0 for an empty file).  Extra keyword arguments are accepted
     but not forwarded.
     """
     i = 0
     # Context manager: the handle is closed even if parsing raises.
     with open( dataset.file_name ) as in_fh:
         for i, line in enumerate( in_fh ):
             line = line.rstrip( '\r\n' )
             if not line or line.startswith( '#' ):
                 continue
             first_field = line.split( '\t' )[0]
             try:
                 # "Wiggle track data values can be integer or real, positive or negative values"
                 float( first_field )
             except ValueError:
                 lowered = first_field.lower()
                 if any( lowered.startswith( prefix ) for prefix in data.col1_startswith ):
                     break
             else:
                 break
     Tabular.set_meta( self, dataset, overwrite = overwrite, skip = i )
示例#9
0
    def set_meta(self,
                 dataset,
                 overwrite=True,
                 skip=None,
                 max_data_lines=None,
                 **kwd):
        """Set tabular metadata, then record the distinct first-column values.

        ``Tabular.set_meta`` is called with ``overwrite``, ``skip`` and
        ``max_data_lines`` passed positionally; extra keyword arguments are
        accepted but not forwarded.  Every line of the file is then stripped
        and split on tabs, and the set of first fields is stored as a list in
        ``dataset.metadata.args`` (order unspecified; a blank line contributes
        the empty string).
        """
        Tabular.set_meta(self, dataset, overwrite, skip, max_data_lines)
        tis_args = set()
        # ``with`` closes the file on every path and, unlike a try/finally
        # around open(), cannot hit an unbound handle when open() itself fails.
        with open(dataset.file_name) as fh:
            for line in fh:
                # split() always returns at least one element, so [0] is safe.
                tis_args.add(line.strip().split('\t')[0])
        dataset.metadata.args = list(tis_args)
示例#10
0
 def set_meta(self, dataset, **kwd):
     """Delegate to ``Tabular.set_meta`` with ``skip=None`` plus the caller's keyword arguments."""
     tabular_set_meta = Tabular.set_meta
     tabular_set_meta(self, dataset, skip=None, **kwd)
示例#11
0
 def set_meta( self, dataset, overwrite = True, **kwd ):
     """Delegate to ``Tabular.set_meta`` with ``skip=1``.

     Only ``overwrite`` is passed on; extra keyword arguments are accepted
     but not forwarded.
     """
     delegate_options = { 'overwrite': overwrite, 'skip': 1 }
     Tabular.set_meta( self, dataset, **delegate_options )
示例#12
0
 def set_meta( self, dataset, overwrite = True, first_line_is_header = False, **kwd ):
     """Tries to guess from the line the location number of the column for the chromosome, region start-end and strand.

     ``Tabular.set_meta`` is called first with ``skip = 0``.  If the dataset
     has data, its lines are then scanned (empty lines are ignored):

     * If ``first_line_is_header`` is true or the line starts with ``#``, the
       line is treated as a header: ``self.init_meta`` is called and every
       column name found in ``alias_helper`` sets the metadata element it
       maps to, using the 1-based column index.  Scanning stops afterwards.
     * Otherwise, if the line has more than two tab-separated fields and
       starts (case-insensitively) with a prefix from
       ``data.col1_startswith``, ``chromCol``, ``startCol``, ``endCol``,
       ``nameCol`` and ``strandCol`` are guessed from it and scanning stops.

     When ``overwrite`` is false, elements that are already set are kept.
     Extra keyword arguments are accepted but not forwarded.
     """
     Tabular.set_meta( self, dataset, overwrite = overwrite, skip = 0 )
     if not dataset.has_data():
         return

     def assign( meta_name, value ):
         # Honour ``overwrite``: never clobber an element that is already set.
         if overwrite or not dataset.metadata.element_is_set( meta_name ):
             setattr( dataset.metadata, meta_name, value )

     # Context manager: the handle is closed even if parsing raises.
     with open( dataset.file_name ) as in_fh:
         for line in in_fh:
             line = line.rstrip( '\r\n' )
             if not line:
                 continue
             if first_line_is_header or line[0] == '#':
                 self.init_meta( dataset )
                 valid = dict( alias_helper ) # shrinks
                 for index, col_name in enumerate( line.strip( '#' ).split( '\t' ) ):
                     if col_name not in valid:
                         continue
                     meta_name = valid[col_name]
                     assign( meta_name, index + 1 )
                     values = alias_spec[ meta_name ]
                     # Remove this alias and every lower priority one.  pop()
                     # tolerates aliases already removed by an earlier column
                     # that mapped to the same metadata element.
                     for lower in values[ values.index( col_name ): ]:
                         valid.pop( lower, None )
                 break  # Our metadata is set, so break out of the loop
             # Header lines in Interval files are optional. For example, BED is Interval but has no header.
             # We'll make a best guess at the location of the metadata columns.
             elems = line.split( '\t' )
             if len( elems ) <= 2:
                 continue
             lowered = line.lower()
             if not any( lowered.startswith( prefix ) for prefix in data.col1_startswith ):
                 continue
             assign( 'chromCol', 1 )
             try:
                 int( elems[1] )
             except ValueError:
                 pass # Metadata default will be used
             else:
                 assign( 'startCol', 2 )
             try:
                 int( elems[2] )
             except ValueError:
                 pass # Metadata default will be used
             else:
                 assign( 'endCol', 3 )
             if len( elems ) > 3:
                 try:
                     int( elems[3] )
                 except ValueError:
                     # A non-numeric fourth field is taken to be the name.
                     assign( 'nameCol', 4 )
             if len( elems ) < 6 or elems[5] not in data.valid_strand:
                 assign( 'strandCol', 0 )
             else:
                 assign( 'strandCol', 6 )
             break # Our metadata is set, so break out of the loop
示例#13
0
 def set_meta( self, dataset, overwrite = True, **kwd ):
     """Run ``Tabular.set_meta``, then fill in comment metadata if it is missing.

     ``Tabular.set_meta`` is called with ``max_data_lines=None`` and
     ``max_guess_type_data_lines=1000`` plus any extra keyword arguments.
     When ``dataset.metadata.comment_metadata`` is still ``None`` afterwards,
     it is set to a copy of the ``comment_metadata`` of a fresh
     ``DatasetCommentMetadata( dataset )`` and
     ``self.set_dataset_metadata_from_comments`` is invoked.
     """
     Tabular.set_meta(
         self,
         dataset,
         overwrite = overwrite,
         max_data_lines = None,
         max_guess_type_data_lines = 1000,
         **kwd
     )
     # Comment metadata already present: nothing more to do.
     if dataset.metadata.comment_metadata is not None:
         return
     parsed_comments = DatasetCommentMetadata( dataset )
     comment_metadata_copy = parsed_comments.comment_metadata.copy()
     dataset.metadata.comment_metadata = comment_metadata_copy
     self.set_dataset_metadata_from_comments( dataset )
 def set_meta(self, dataset, **kwd):
     """Run ``Tabular.set_meta``, then store the file's dimension.

     When the dataset has data, the file is opened with decoding errors
     ignored and the result of ``self._get_dimension`` on the open handle is
     written to ``dataset.metadata.dimension``.
     """
     Tabular.set_meta(self, dataset, **kwd)
     if not dataset.has_data():
         return
     with open(dataset.file_name, errors='ignore') as handle:
         dimension = self._get_dimension(handle)
         dataset.metadata.dimension = dimension
 def set_meta(self, dataset, **kwd):
     """Delegate unchanged to ``Tabular.set_meta``, forwarding all keyword arguments."""
     tabular_set_meta = Tabular.set_meta
     tabular_set_meta(self, dataset, **kwd)
 def set_meta( self, dataset, **kwd ):
     """Delegate unchanged to ``Tabular.set_meta``, forwarding all keyword arguments."""
     base_set_meta = Tabular.set_meta
     base_set_meta( self, dataset, **kwd )
示例#17
0
 def set_meta( self, dataset, overwrite = True, first_line_is_header = False, **kwd ):
     """Tries to guess from the line the location number of the column for the chromosome, region start-end and strand.

     ``Tabular.set_meta`` is called first with ``skip = 0``.  If the dataset
     has data, its lines are then scanned (empty lines are only counted):

     * If ``first_line_is_header`` is true or the line starts with ``#``, the
       line is treated as a header: ``self.init_meta`` is called and every
       column name found in ``alias_helper`` sets the metadata element it
       maps to, using the 1-based column index.  Scanning stops afterwards.
     * Otherwise, if the line has more than two tab-separated fields and
       starts (case-insensitively) with a prefix from
       ``data.col1_startswith``, ``chromCol``, ``startCol``, ``endCol`` and
       ``strandCol`` are guessed from it and scanning stops.  ``nameCol`` is
       deliberately not guessed.
     * Scanning also stops once the index of the current non-header line,
       minus the number of empty lines seen, exceeds 100.

     When ``overwrite`` is false, elements that are already set are kept.
     Extra keyword arguments are accepted but not forwarded.
     """
     Tabular.set_meta( self, dataset, overwrite = overwrite, skip = 0 )
     if not dataset.has_data():
         return

     def assign( meta_name, value ):
         # Honour ``overwrite``: never clobber an element that is already set.
         if overwrite or not dataset.metadata.element_is_set( meta_name ):
             setattr( dataset.metadata, meta_name, value )

     empty_line_count = 0
     num_check_lines = 100 # only check up to this many non empty lines
     # Context manager: the handle is closed even if parsing raises.
     with open( dataset.file_name ) as in_fh:
         for i, line in enumerate( in_fh ):
             line = line.rstrip( '\r\n' )
             if not line:
                 empty_line_count += 1
                 continue
             if first_line_is_header or line[0] == '#':
                 self.init_meta( dataset )
                 valid = dict( alias_helper ) # shrinks
                 for index, col_name in enumerate( line.strip( '#' ).split( '\t' ) ):
                     if col_name not in valid:
                         continue
                     meta_name = valid[col_name]
                     assign( meta_name, index + 1 )
                     values = alias_spec[ meta_name ]
                     # Remove this alias and every lower priority one.  pop()
                     # tolerates aliases already removed by an earlier column
                     # that mapped to the same metadata element.
                     for lower in values[ values.index( col_name ): ]:
                         valid.pop( lower, None )
                 break  # Our metadata is set, so break out of the loop
             # Header lines in Interval files are optional. For example, BED is Interval but has no header.
             # We'll make a best guess at the location of the metadata columns.
             elems = line.split( '\t' )
             lowered = line.lower()
             if len( elems ) > 2 and any( lowered.startswith( prefix ) for prefix in data.col1_startswith ):
                 assign( 'chromCol', 1 )
                 try:
                     int( elems[1] )
                 except ValueError:
                     pass # Metadata default will be used
                 else:
                     assign( 'startCol', 2 )
                 try:
                     int( elems[2] )
                 except ValueError:
                     pass # Metadata default will be used
                 else:
                     assign( 'endCol', 3 )
                 # We no longer guess that the fourth column is the 'name'; it must be set manually for interval files.
                 # We still guess at the strand, as we can make a more educated guess.
                 if len( elems ) < 6 or elems[5] not in data.valid_strand:
                     assign( 'strandCol', 0 )
                 else:
                     assign( 'strandCol', 6 )
                 break # Our metadata is set, so break out of the loop
             if ( i - empty_line_count ) > num_check_lines:
                 break # We examined 100 non-empty lines without a match, so give up