def set_meta(self, dataset, overwrite=True, **kwd):
    """Sets the metadata information for datasets previously determined to be in bed format.

    Scans the file for the first non-empty, non-comment line that has more
    than two tab-separated fields and whose lower-cased text starts with one
    of the prefixes in ``data.col1_startswith``. The name and strand columns
    are guessed from the number of fields on that line, and the line's
    zero-based index is passed to ``Tabular.set_meta`` as ``skip``. If no
    line qualifies, ``skip`` is the index of the last line read.

    :param dataset: dataset whose ``file_name`` is read and whose
        ``metadata`` is updated.
    :param overwrite: when false, metadata elements that are already set
        are left untouched.
    :param kwd: accepted for interface compatibility; not used here.
    """
    def assign(element, column):
        # Honour ``overwrite``: never clobber an element that is already set.
        if overwrite or not dataset.metadata.element_is_set(element):
            setattr(dataset.metadata, element, column)

    i = 0
    if dataset.has_data():
        # ``with`` closes the handle even though the scan usually stops early;
        # the previous ``file(...)`` call left it open (and ``file`` does not
        # exist on Python 3).
        with open(dataset.file_name) as handle:
            for i, line in enumerate(handle):
                line = line.rstrip('\r\n')
                if not line or line.startswith('#'):
                    continue
                elems = line.split('\t')
                if len(elems) <= 2:
                    continue
                lowered = line.lower()
                if not any(lowered.startswith(prefix) for prefix in data.col1_startswith):
                    continue
                if len(elems) > 3:
                    assign('nameCol', 4)
                # Fewer than six fields means there is no strand column.
                assign('strandCol', 0 if len(elems) < 6 else 6)
                break
        # NOTE(review): the source layout was flattened; this call is assumed
        # to sit inside the ``has_data()`` guard -- confirm.
        Tabular.set_meta(self, dataset, overwrite=overwrite, skip=i)
def set_meta(self, dataset, overwrite=True, **kwd):
    """Locate the first data line and delegate to ``Tabular.set_meta``.

    A line counts as the first data line when it is non-empty, does not
    start with ``#``, has exactly nine tab-separated fields, has start
    (field 4) and end (field 5) values that are each an integer or ``.``,
    has a strand (field 7) in ``self.valid_gff3_strand`` and a phase
    (field 8) in ``self.valid_gff3_phase``. When both coordinates are
    integers, start must also be strictly less than end. The zero-based
    index of that line (or of the last line read, if none qualifies) is
    passed to ``Tabular.set_meta`` as ``skip``.

    :param dataset: dataset whose ``file_name`` is scanned.
    :param overwrite: forwarded to ``Tabular.set_meta``.
    :param kwd: accepted for interface compatibility; not used here.
    """
    def parse_coordinate(text):
        # Returns ``(is_valid, value)``; ``.`` is valid but has no number.
        try:
            return True, int(text)
        except ValueError:
            return text == '.', None

    i = 0
    # ``with`` guarantees the handle is closed when the scan stops early.
    with open(dataset.file_name) as handle:
        for i, line in enumerate(handle):
            line = line.rstrip('\r\n')
            if not line or line.startswith('#'):
                continue
            elems = line.split('\t')
            if len(elems) != 9:
                continue
            valid_start, start = parse_coordinate(elems[3])
            valid_end, end = parse_coordinate(elems[4])
            if not (valid_start and valid_end):
                continue
            # Only order-check real numbers. Previously a ``.`` coordinate
            # left ``start``/``end`` unbound (or holding a value from an
            # earlier line), so the comparison crashed or used stale data.
            if start is not None and end is not None and not start < end:
                continue
            if elems[6] in self.valid_gff3_strand and elems[7] in self.valid_gff3_phase:
                break
    Tabular.set_meta(self, dataset, overwrite=overwrite, skip=i)
def set_meta(self, dataset, **kwd):
    """Set tabular metadata, then derive column info from the first line.

    After ``Tabular.set_meta`` runs, ``markerCol`` is set to 1 and the first
    line of the file is split on tabs: ``columns`` becomes the number of
    fields and ``column_types`` becomes ``'string'`` for the first field
    followed by ``'numeric'`` for every other field.

    :param dataset: dataset whose ``file_name`` is read and whose
        ``metadata`` is updated.
    :param kwd: forwarded unchanged to ``Tabular.set_meta``.
    :returns: always ``True``.
    """
    Tabular.set_meta(self, dataset, **kwd)
    dataset.metadata.markerCol = 1
    # Read only the first line; ``readlines()[0]`` loaded the whole file,
    # never closed the handle and raised IndexError on an empty file.
    with open(dataset.file_name, 'r') as handle:
        first_line = handle.readline()
    if first_line:
        header = first_line.strip().split('\t')
        dataset.metadata.columns = len(header)
        dataset.metadata.column_types = ['string'] + ['numeric'] * (len(header) - 1)
    # An empty file keeps whatever ``Tabular.set_meta`` established.
    return True
def set_meta(self, dataset, **kwd):
    """Set tabular metadata, then derive column info from the first line.

    After ``Tabular.set_meta`` runs, ``markerCol`` is set to 1 and the first
    line of the file is split on tabs: ``columns`` becomes the number of
    fields and ``column_types`` becomes ``'string'`` for the first field
    followed by ``'numeric'`` for every other field.

    :param dataset: dataset whose ``file_name`` is read and whose
        ``metadata`` is updated.
    :param kwd: forwarded unchanged to ``Tabular.set_meta``.
    :returns: always ``True``.
    """
    Tabular.set_meta(self, dataset, **kwd)
    dataset.metadata.markerCol = 1
    # Read only the first line and close the handle deterministically; the
    # previous ``open(...).readlines()[0]`` loaded the whole file, leaked the
    # handle and raised IndexError on an empty file.
    with open(dataset.file_name, 'r') as handle:
        first_line = handle.readline()
    if first_line:
        header = first_line.strip().split('\t')
        dataset.metadata.columns = len(header)
        dataset.metadata.column_types = ['string'] + ['numeric'] * (len(header) - 1)
    # An empty file keeps whatever ``Tabular.set_meta`` established.
    return True
def set_meta(self, dataset, overwrite=True, **kwd):
    """Set tabular metadata, then populate metadata from the file's comments.

    ``Tabular.set_meta`` is invoked with ``max_data_lines=None`` and
    ``max_guess_type_data_lines=1000``. If ``comment_metadata`` is still
    ``None`` afterwards, it is filled with a copy of the ``comment_metadata``
    attribute of a fresh ``DatasetCommentMetadata(dataset)``.

    :param dataset: dataset whose ``metadata`` is updated.
    :param overwrite: forwarded to ``Tabular.set_meta``.
    :param kwd: forwarded unchanged to ``Tabular.set_meta``.
    """
    Tabular.set_meta(
        self,
        dataset,
        overwrite=overwrite,
        max_data_lines=None,
        max_guess_type_data_lines=1000,
        **kwd
    )
    if dataset.metadata.comment_metadata is None:
        parsed_comments = DatasetCommentMetadata(dataset)
        # Store a copy so later edits do not alias the parser's own mapping.
        dataset.metadata.comment_metadata = parsed_comments.comment_metadata.copy()
    # NOTE(review): the source layout was flattened; this call is assumed to
    # run unconditionally rather than only inside the ``if`` -- confirm.
    self.set_dataset_metadata_from_comments(dataset)
def set_meta(self, dataset, overwrite=True, **kwd):
    """Locate the first data line and delegate to ``Tabular.set_meta``.

    The first data line is the first non-empty line that does not start
    with ``#``, has exactly nine tab-separated fields, and whose fourth and
    fifth fields both parse as integers. Its zero-based index (or the index
    of the last line read, if none qualifies) is passed to
    ``Tabular.set_meta`` as ``skip``.

    :param dataset: dataset whose ``file_name`` is scanned.
    :param overwrite: forwarded to ``Tabular.set_meta``.
    :param kwd: accepted for interface compatibility; not used here.
    """
    i = 0
    # ``with`` closes the handle even though the scan usually stops early;
    # the previous ``file(...)`` call left it open and is Python-2-only.
    with open(dataset.file_name) as handle:
        for i, line in enumerate(handle):
            line = line.rstrip('\r\n')
            if not line or line.startswith('#'):
                continue
            elems = line.split('\t')
            if len(elems) != 9:
                continue
            try:
                int(elems[3])
                int(elems[4])
            except ValueError:
                # Non-numeric coordinates: not a data line, keep scanning.
                continue
            break
    Tabular.set_meta(self, dataset, overwrite=overwrite, skip=i)
def set_meta(self, dataset, overwrite=True, skip=None, max_data_lines=None, **kwd):
    """Set tabular metadata and record the distinct first-column values.

    After ``Tabular.set_meta`` runs, every line of the file is stripped and
    split on tabs, and the unique first fields are stored as a list in
    ``dataset.metadata.args``. A blank line contributes an empty string,
    because splitting always yields at least one field.

    :param dataset: dataset whose ``file_name`` is read and whose
        ``metadata`` is updated.
    :param overwrite: forwarded positionally to ``Tabular.set_meta``.
    :param skip: forwarded positionally to ``Tabular.set_meta``.
    :param max_data_lines: forwarded positionally to ``Tabular.set_meta``.
    :param kwd: accepted for interface compatibility; not used here.
    """
    Tabular.set_meta(self, dataset, overwrite, skip, max_data_lines)
    # Open outside any ``try``/``finally``: previously a failed ``open`` left
    # ``fh`` unbound, so the ``finally`` clause raised NameError and hid the
    # real I/O error. The old ``except IndexError`` could never fire.
    with open(dataset.file_name) as fh:
        tis_args = set(line.strip().split("\t")[0] for line in fh)
    dataset.metadata.args = list(tis_args)
def set_meta(self, dataset, overwrite=True, **kwd):
    """Locate the first data line and delegate to ``Tabular.set_meta``.

    The first data line is the first non-empty line that does not start
    with ``#`` and whose first tab-separated field either parses as a float
    or, lower-cased, starts with one of the prefixes in
    ``data.col1_startswith``. Its zero-based index (or the index of the
    last line read, if none qualifies) is passed to ``Tabular.set_meta`` as
    ``skip``.

    :param dataset: dataset whose ``file_name`` is scanned.
    :param overwrite: forwarded to ``Tabular.set_meta``.
    :param kwd: accepted for interface compatibility; not used here.
    """
    i = 0
    # ``with`` closes the handle even though the scan usually stops early;
    # the previous ``file(...)`` call left it open and is Python-2-only.
    with open(dataset.file_name) as handle:
        for i, line in enumerate(handle):
            line = line.rstrip('\r\n')
            if not line or line.startswith('#'):
                continue
            first_field = line.split('\t')[0]
            try:
                # "Wiggle track data values can be integer or real, positive or negative values"
                float(first_field)
            except ValueError:
                lowered = first_field.lower()
                if any(lowered.startswith(prefix) for prefix in data.col1_startswith):
                    break
            else:
                break
    Tabular.set_meta(self, dataset, overwrite=overwrite, skip=i)
def set_meta(self, dataset, overwrite=True, skip=None, max_data_lines=None, **kwd):
    """Set tabular metadata and record the distinct first-column values.

    After ``Tabular.set_meta`` runs, every line of the file is stripped and
    split on tabs, and the unique first fields are stored as a list in
    ``dataset.metadata.args``. A blank line contributes an empty string,
    because splitting always yields at least one field.

    :param dataset: dataset whose ``file_name`` is read and whose
        ``metadata`` is updated.
    :param overwrite: forwarded positionally to ``Tabular.set_meta``.
    :param skip: forwarded positionally to ``Tabular.set_meta``.
    :param max_data_lines: forwarded positionally to ``Tabular.set_meta``.
    :param kwd: accepted for interface compatibility; not used here.
    """
    Tabular.set_meta(self, dataset, overwrite, skip, max_data_lines)
    # Open outside any ``try``/``finally``: previously a failed ``open`` left
    # ``fh`` unbound, so the ``finally`` clause raised NameError and hid the
    # real I/O error. The old ``except IndexError`` could never fire.
    with open(dataset.file_name) as fh:
        tis_args = set(line.strip().split('\t')[0] for line in fh)
    dataset.metadata.args = list(tis_args)
def set_meta(self, dataset, **kwd):
    """Delegate to ``Tabular.set_meta`` with ``skip`` pinned to ``None``.

    :param dataset: dataset handed to ``Tabular.set_meta``.
    :param kwd: forwarded unchanged; because ``skip`` is also given
        explicitly, supplying ``skip`` here raises ``TypeError``.
    """
    Tabular.set_meta(
        self,
        dataset,
        skip=None,
        **kwd
    )
def set_meta(self, dataset, overwrite=True, **kwd):
    """Delegate to ``Tabular.set_meta`` with ``skip`` fixed at 1.

    :param dataset: dataset handed to ``Tabular.set_meta``.
    :param overwrite: forwarded to ``Tabular.set_meta``.
    :param kwd: accepted for interface compatibility; not forwarded.
    """
    # presumably skip=1 steps over a single leading line -- confirm against
    # Tabular.set_meta.
    Tabular.set_meta(
        self,
        dataset,
        overwrite=overwrite,
        skip=1
    )
def set_meta(self, dataset, overwrite=True, first_line_is_header=False, **kwd):
    """Tries to guess from the line the location number of the column for the chromosome, region start-end and strand.

    ``Tabular.set_meta`` runs first with ``skip=0``. The file is then read
    up to the first line that settles the metadata:

    * A header line (any non-empty line when ``first_line_is_header`` is
      true, otherwise one starting with ``#``) resets metadata through
      ``self.init_meta`` and maps each recognised column name, via
      ``alias_helper`` / ``alias_spec``, to its one-based position.
    * Otherwise, the first line with more than two tab-separated fields
      whose lower-cased text starts with a prefix from
      ``data.col1_startswith`` is used to guess chrom, start, end, name and
      strand columns.

    :param dataset: dataset whose ``file_name`` is read and whose
        ``metadata`` is updated.
    :param overwrite: when false, metadata elements that are already set
        are left untouched.
    :param first_line_is_header: treat the first non-empty line as a header
        even when it does not start with ``#``.
    :param kwd: accepted for interface compatibility; not used here.
    """
    def assign(element, column):
        # Honour ``overwrite``: never clobber an element that is already set.
        if overwrite or not dataset.metadata.element_is_set(element):
            setattr(dataset.metadata, element, column)

    def is_int(text):
        # True when ``text`` parses as an integer.
        try:
            int(text)
        except ValueError:
            return False
        return True

    Tabular.set_meta(self, dataset, overwrite=overwrite, skip=0)
    if not dataset.has_data():
        return
    # ``with`` closes the handle even though the scan usually stops early;
    # the previous ``file(...)`` call left it open and is Python-2-only.
    with open(dataset.file_name) as handle:
        for line in handle:
            line = line.rstrip('\r\n')
            if not line:
                continue
            if first_line_is_header or line[0] == '#':
                self.init_meta(dataset)
                elems = line.strip('#').split('\t')
                valid = dict(alias_helper)  # shrinks
                for index, col_name in enumerate(elems):
                    if col_name not in valid:
                        continue
                    meta_name = valid[col_name]
                    assign(meta_name, index + 1)
                    values = alias_spec[meta_name]
                    # Remove this alias and all lower-priority ones. ``pop``
                    # tolerates aliases already removed by an earlier column;
                    # ``del`` raised KeyError when a higher-priority alias
                    # followed a lower-priority one in the header.
                    for lower in values[values.index(col_name):]:
                        valid.pop(lower, None)
                break  # Our metadata is set, so break out of the outer loop
            # Header lines in Interval files are optional. For example, BED is
            # Interval but has no header. We'll make a best guess at the
            # location of the metadata columns.
            elems = line.split('\t')
            if len(elems) <= 2:
                continue
            lowered = line.lower()
            if not any(lowered.startswith(prefix) for prefix in data.col1_startswith):
                continue
            assign('chromCol', 1)
            # For start/end, the metadata default is used when not an integer.
            if is_int(elems[1]):
                assign('startCol', 2)
            if is_int(elems[2]):
                assign('endCol', 3)
            # A non-integer fourth field is taken to be the name column.
            if len(elems) > 3 and not is_int(elems[3]):
                assign('nameCol', 4)
            if len(elems) < 6 or elems[5] not in data.valid_strand:
                assign('strandCol', 0)
            else:
                assign('strandCol', 6)
            break  # Our metadata is set, so break out of the outer loop
def set_meta(self, dataset, overwrite=True, **kwd):
    """Set tabular metadata, then populate metadata from the file's comments.

    ``Tabular.set_meta`` is invoked with ``max_data_lines=None`` and
    ``max_guess_type_data_lines=1000``. If ``comment_metadata`` is still
    ``None`` afterwards, it is filled with a copy of the ``comment_metadata``
    attribute of a fresh ``DatasetCommentMetadata(dataset)``.

    :param dataset: dataset whose ``metadata`` is updated.
    :param overwrite: forwarded to ``Tabular.set_meta``.
    :param kwd: forwarded unchanged to ``Tabular.set_meta``.
    """
    Tabular.set_meta(
        self,
        dataset,
        overwrite=overwrite,
        max_data_lines=None,
        max_guess_type_data_lines=1000,
        **kwd
    )
    if dataset.metadata.comment_metadata is None:
        parsed_comments = DatasetCommentMetadata(dataset)
        # Store a copy so later edits do not alias the parser's own mapping.
        dataset.metadata.comment_metadata = parsed_comments.comment_metadata.copy()
    # NOTE(review): the source layout was flattened; this call is assumed to
    # run unconditionally rather than only inside the ``if`` -- confirm.
    self.set_dataset_metadata_from_comments(dataset)
def set_meta(self, dataset, **kwd):
    """Set tabular metadata, then record the dataset's ``dimension``.

    When the dataset has data, its file is opened with undecodable bytes
    ignored and the result of ``self._get_dimension`` on that handle is
    stored in ``dataset.metadata.dimension``.

    :param dataset: dataset whose ``file_name`` is read and whose
        ``metadata`` is updated.
    :param kwd: forwarded unchanged to ``Tabular.set_meta``.
    """
    Tabular.set_meta(self, dataset, **kwd)
    if not dataset.has_data():
        return
    with open(dataset.file_name, errors='ignore') as handle:
        dimension = self._get_dimension(handle)
        dataset.metadata.dimension = dimension
def set_meta(self, dataset, **kwd):
    """Delegate metadata detection entirely to ``Tabular.set_meta``.

    :param dataset: dataset handed to ``Tabular.set_meta``.
    :param kwd: forwarded unchanged to ``Tabular.set_meta``.
    """
    Tabular.set_meta(
        self,
        dataset,
        **kwd
    )
def set_meta(self, dataset, **kwd):
    """Delegate metadata detection entirely to ``Tabular.set_meta``.

    :param dataset: dataset handed to ``Tabular.set_meta``.
    :param kwd: forwarded unchanged to ``Tabular.set_meta``.
    """
    Tabular.set_meta(
        self,
        dataset,
        **kwd
    )
def set_meta(self, dataset, overwrite=True, first_line_is_header=False, **kwd):
    """Tries to guess from the line the location number of the column for the chromosome, region start-end and strand.

    ``Tabular.set_meta`` runs first with ``skip=0``. The file is then read
    until a line settles the metadata or the line budget is exhausted:

    * A header line (any non-empty line when ``first_line_is_header`` is
      true, otherwise one starting with ``#``) resets metadata through
      ``self.init_meta`` and maps each recognised column name, via
      ``alias_helper`` / ``alias_spec``, to its one-based position.
    * Otherwise, the first line with more than two tab-separated fields
      whose lower-cased text starts with a prefix from
      ``data.col1_startswith`` is used to guess chrom, start, end and
      strand columns. The name column is not guessed.

    :param dataset: dataset whose ``file_name`` is read and whose
        ``metadata`` is updated.
    :param overwrite: when false, metadata elements that are already set
        are left untouched.
    :param first_line_is_header: treat the first non-empty line as a header
        even when it does not start with ``#``.
    :param kwd: accepted for interface compatibility; not used here.
    """
    def assign(element, column):
        # Honour ``overwrite``: never clobber an element that is already set.
        if overwrite or not dataset.metadata.element_is_set(element):
            setattr(dataset.metadata, element, column)

    def is_int(text):
        # True when ``text`` parses as an integer.
        try:
            int(text)
        except ValueError:
            return False
        return True

    def guess_from_data_line(line):
        # Best-guess column positions from a headerless line; returns True
        # when the line was usable and metadata was assigned.
        elems = line.split('\t')
        if len(elems) <= 2:
            return False
        lowered = line.lower()
        if not any(lowered.startswith(prefix) for prefix in data.col1_startswith):
            return False
        assign('chromCol', 1)
        # For start/end, the metadata default is used when not an integer.
        if is_int(elems[1]):
            assign('startCol', 2)
        if is_int(elems[2]):
            assign('endCol', 3)
        # we no longer want to guess that this column is the 'name', name must
        # now be set manually for interval files
        # we will still guess at the strand, as we can make a more educated guess
        if len(elems) < 6 or elems[5] not in data.valid_strand:
            assign('strandCol', 0)
        else:
            assign('strandCol', 6)
        return True

    Tabular.set_meta(self, dataset, overwrite=overwrite, skip=0)
    if not dataset.has_data():
        return
    empty_line_count = 0
    num_check_lines = 100  # only check up to this many non empty lines
    # ``with`` closes the handle even though the scan usually stops early;
    # the previous ``file(...)`` call left it open and is Python-2-only.
    with open(dataset.file_name) as handle:
        for i, line in enumerate(handle):
            line = line.rstrip('\r\n')
            if not line:
                empty_line_count += 1
                continue
            if first_line_is_header or line[0] == '#':
                self.init_meta(dataset)
                elems = line.strip('#').split('\t')
                valid = dict(alias_helper)  # shrinks
                for index, col_name in enumerate(elems):
                    if col_name not in valid:
                        continue
                    meta_name = valid[col_name]
                    assign(meta_name, index + 1)
                    values = alias_spec[meta_name]
                    # Remove this alias and all lower-priority ones. ``pop``
                    # tolerates aliases already removed by an earlier column;
                    # ``del`` raised KeyError when a higher-priority alias
                    # followed a lower-priority one in the header.
                    for lower in values[values.index(col_name):]:
                        valid.pop(lower, None)
                break  # Our metadata is set, so break out of the outer loop
            # Header lines in Interval files are optional. For example, BED is
            # Interval but has no header. We'll make a best guess at the
            # location of the metadata columns.
            if guess_from_data_line(line) or (i - empty_line_count) > num_check_lines:
                # Our metadata is set or we examined 100 non-empty lines, so
                # break out of the outer loop
                break