def __main__():
    """Concatenate per-block FASTA alignment components into one
    gap-padded FASTA record per species.

    Reads ``sys.argv[1]`` (input alignment) and writes ``sys.argv[2]``.
    Species missing from a block are padded with '-' so that every
    output sequence has the same total length.
    """
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    # Maps species name -> temp file accumulating that species' sequence.
    # odict preserves first-seen order for the output records.
    species = odict()
    # Total alignment length written so far; pads newly seen species.
    cur_size = 0
    for components in iter_fasta_alignment(input_filename):
        # Species not appearing in this block get gap-padded afterwards.
        # list() copies, so removal below is safe on Python 2 and 3.
        species_not_written = list(species.keys())
        for component in components:
            if component.species not in species:
                # New species: backfill gaps for all previous blocks.
                species[component.species] = tempfile.TemporaryFile()
                species[component.species].write("-" * cur_size)
            species[component.species].write(component.text)
            try:
                species_not_written.remove(component.species)
            except ValueError:
                # This is a new species; it was never in the list.
                pass
        # NOTE(review): assumes every component in a block has the same
        # text length as components[0] -- confirm with iter_fasta_alignment.
        for spec in species_not_written:
            species[spec].write("-" * len(components[0].text))
        cur_size += len(components[0].text)
    # Write the final concatenated FASTA; close files even on error.
    out = open(output_filename, 'wb')
    try:
        for spec, f in species.iteritems():
            f.seek(0)
            out.write(">%s\n%s\n" % (spec, f.read()))
            # Release each temporary file promptly once consumed.
            f.close()
    finally:
        out.close()
def __main__():
    """Merge FASTA alignment blocks into a single gap-padded FASTA
    record per species (input and output paths come from sys.argv)."""
    in_path = sys.argv[1]
    out_path = sys.argv[2]
    seqs = odict()      # species name -> temp file holding its sequence
    total_len = 0       # columns written so far, used to pad newcomers
    for block in iter_fasta_alignment(in_path):
        # Species still unwritten for this block (Python 2 list copy).
        pending = seqs.keys()
        for comp in block:
            name = comp.species
            if name not in seqs:
                # First sighting: backfill gaps for earlier blocks.
                handle = tempfile.TemporaryFile()
                handle.write("-" * total_len)
                seqs[name] = handle
            seqs[name].write(comp.text)
            try:
                pending.remove(name)
            except ValueError:
                pass  # brand-new species -- it was never pending
        # Gap-pad every species absent from this block.
        block_len = len(block[0].text)
        for name in pending:
            seqs[name].write("-" * block_len)
        total_len += block_len
    out = open(out_path, 'wb')
    for name, handle in seqs.iteritems():
        handle.seek(0)
        out.write(">%s\n%s\n" % (name, handle.read()))
    out.close()
def parse_ndx( self, ndx ):
    """Parse GROMACS-style index (.ndx) text into an ordered mapping of
    group name -> list of index tokens belonging to that group."""
    groups = odict()
    current = ''
    for raw in ndx.split('\n'):
        if raw.startswith('['):
            # '[ Group ]' header: open a new (possibly empty) group.
            current = raw.strip(' []\n\r')
            groups[current] = []
        elif raw.strip():
            # Non-blank data line: collect its whitespace-separated tokens.
            groups[current].extend(raw.split())
    return groups
def parse_ndx(self, ndx):
    """Turn the text of an index (.ndx) file into an odict mapping each
    '[ group ]' header to the list of entries that follow it."""
    result = odict()
    group = ''
    for text in ndx.split('\n'):
        if text.startswith('['):
            # Header line starts a fresh group.
            group = text.strip(' []\n\r')
            result[group] = []
            continue
        if not text.strip():
            # Skip blank lines.
            continue
        result[group] += text.split()
    return result
def read_unordered_gtf( iterator, strict=False ): """ Returns GTF features found in an iterator. GTF lines need not be ordered or clustered for reader to work. Reader returns GFFFeature objects sorted by transcript_id, chrom, and start position. """ # -- Get function that generates line/feature key. -- get_transcript_id = lambda fields: parse_gff_attributes( fields[8] )[ 'transcript_id' ] if strict: # Strict GTF parsing uses transcript_id only to group lines into feature. key_fn = get_transcript_id else: # Use lenient parsing where chromosome + transcript_id is the key. This allows # transcripts with same ID on different chromosomes; this occurs in some popular # datasources, such as RefGenes in UCSC. key_fn = lambda fields: fields[0] + '_' + get_transcript_id( fields ) # Aggregate intervals by transcript_id and collect comments. feature_intervals = odict() comments = [] for count, line in enumerate( iterator ): if line.startswith( '#' ): comments.append( Comment( line ) ) continue line_key = key_fn( line.split('\t') ) if line_key in feature_intervals: feature = feature_intervals[ line_key ] else: feature = [] feature_intervals[ line_key ] = feature feature.append( GFFInterval( None, line.split( '\t' ) ) ) # Create features. chroms_features = {} for count, intervals in enumerate( feature_intervals.values() ): # Sort intervals by start position. intervals.sort( lambda a,b: cmp( a.start, b.start ) ) feature = GFFFeature( None, intervals=intervals ) if feature.chrom not in chroms_features: chroms_features[ feature.chrom ] = [] chroms_features[ feature.chrom ].append( feature ) # Sort features by chrom, start position. chroms_features_sorted = [] for chrom_features in chroms_features.values(): chroms_features_sorted.append( chrom_features ) chroms_features_sorted.sort( lambda a,b: cmp( a[0].chrom, b[0].chrom ) ) for features in chroms_features_sorted: features.sort( lambda a,b: cmp( a.start, b.start ) ) # Yield comments first, then features. 
# FIXME: comments can appear anywhere in file, not just the beginning. # Ideally, then comments would be associated with features and output # just before feature/line. for comment in comments: yield comment for chrom_features in chroms_features_sorted: for feature in chrom_features: yield feature
def read_unordered_gtf(iterator, strict=False):
    """
    Return GTF features found in an iterator of GTF text lines.

    Lines need not be ordered or clustered by transcript. Yields any
    Comment objects first, then GFFFeature objects bucketed per
    chromosome and sorted by chrom and start position.

    :param iterator: iterable of raw GTF lines
    :param strict: if True, group lines into features by transcript_id
        alone; otherwise group by chrom + transcript_id, which tolerates
        the same ID appearing on different chromosomes (occurs in some
        popular datasources, such as RefGenes in UCSC)
    """
    # -- Build the function that maps a split GTF line to its grouping key. --
    # Column 8 holds the attributes string containing transcript_id.
    get_transcript_id = lambda fields: parse_gff_attributes(fields[8])['transcript_id']
    if strict:
        key_fn = get_transcript_id
    else:
        key_fn = lambda fields: fields[0] + '_' + get_transcript_id(fields)

    # Aggregate intervals by feature key and collect comment lines.
    feature_intervals = odict()
    comments = []
    for line in iterator:
        if line.startswith('#'):
            comments.append(Comment(line))
            continue
        # Split once and reuse; the key is computed before the interval
        # is constructed, so sharing the list is safe.
        fields = line.split('\t')
        feature = feature_intervals.setdefault(key_fn(fields), [])
        feature.append(GFFInterval(None, fields))

    # Create features: one GFFFeature per key, bucketed by chromosome.
    chroms_features = {}
    for intervals in feature_intervals.values():
        # Key-based sort is equivalent to the cmp form and works on both
        # Python 2 and Python 3.
        intervals.sort(key=lambda interval: interval.start)
        feature = GFFFeature(None, intervals=intervals)
        chroms_features.setdefault(feature.chrom, []).append(feature)

    # Sort chromosome buckets by chrom name (taken from each bucket's
    # first feature), then features within each bucket by start position.
    chroms_features_sorted = sorted(chroms_features.values(),
                                    key=lambda features: features[0].chrom)
    for features in chroms_features_sorted:
        features.sort(key=lambda feature: feature.start)

    # Yield comments first, then features.
    # FIXME: comments can appear anywhere in file, not just the beginning.
    # Ideally, then comments would be associated with features and output
    # just before feature/line.
    for comment in comments:
        yield comment
    for chrom_features in chroms_features_sorted:
        for feature in chrom_features:
            yield feature