Example #1
import sys
import tempfile
# Note: odict and iter_fasta_alignment are provided by the enclosing module.
# This is Python 2 code (dict.keys() returns a list, and iteritems() is used).

def __main__():
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    species = odict()
    cur_size = 0
    for components in iter_fasta_alignment(input_filename):
        # Species already seen but absent from this block will need gap
        # padding; start with all known species and remove those written.
        species_not_written = species.keys()
        for component in components:
            if component.species not in species:
                # First appearance of this species: back-fill with gaps so
                # its sequence lines up with the columns emitted so far.
                species[component.species] = tempfile.TemporaryFile()
                species[component.species].write("-" * cur_size)
            species[component.species].write(component.text)
            try:
                species_not_written.remove(component.species)
            except ValueError:
                # this is a new species
                pass
        # Species absent from this block get a block-width run of gaps.
        for spec in species_not_written:
            species[spec].write("-" * len(components[0].text))
        cur_size += len(components[0].text)
    # Write the concatenated per-species sequences out as FASTA records.
    out = open(output_filename, 'wb')
    for spec, f in species.iteritems():
        f.seek(0)
        out.write(">%s\n%s\n" % (spec, f.read()))
    out.close()
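This converter streams alignment blocks and appends each species' sequence to a per-species temp file, writing runs of "-" wherever a species is missing from a block so all sequences stay column-aligned. Below is a minimal, self-contained sketch of that gap-padding idea; the Component namedtuple and the in-memory blocks list are invented stand-ins for what iter_fasta_alignment() yields:

import collections

Component = collections.namedtuple('Component', ['species', 'text'])

blocks = [
    [Component('hg19', 'ACGT'), Component('mm9', 'AC-T')],
    [Component('hg19', 'GGGG')],   # mm9 is absent from this block
]

species = collections.OrderedDict()
cur_size = 0
for components in blocks:
    not_written = set(species)
    for c in components:
        if c.species not in species:
            # A species seen for the first time is back-filled with gaps.
            species[c.species] = ['-' * cur_size]
        species[c.species].append(c.text)
        not_written.discard(c.species)
    # Species absent from this block get a block-width run of gaps.
    for name in not_written:
        species[name].append('-' * len(components[0].text))
    cur_size += len(components[0].text)

for name, parts in species.items():
    print('>%s\n%s' % (name, ''.join(parts)))
# >hg19
# ACGTGGGG
# >mm9
# AC-T----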
Example #3
File: data.py Project: arose/provi
def parse_ndx( self, ndx ):
    """Parse an index (.ndx, GROMACS-style) string into an odict mapping
    group name -> list of atom index strings."""
    ndx_dict = odict()
    index_group = ''
    for line in ndx.split('\n'):
        if line.startswith('['):
            # A "[ name ]" header line opens a new index group.
            index_group = line.strip(' []\n\r')
            ndx_dict[ index_group ] = []
        elif line.strip():
            # Non-empty body lines hold whitespace-separated atom indices.
            ndx_dict[ index_group ] += line.split()
    return ndx_dict
Example #4
File: data.py Project: hildilab/server
def parse_ndx(self, ndx):
    ndx_dict = odict()
    index_group = ''
    for line in ndx.split('\n'):
        if line.startswith('['):
            index_group = line.strip(' []\n\r')
            ndx_dict[index_group] = []
        elif line.strip():
            ndx_dict[index_group] += line.split()
    return ndx_dict
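Examples #3 and #4 are the same parser from two projects. As a quick illustration of what it returns, here is a hedged usage sketch with a made-up GROMACS-style index string; self is unused by the method, so None is passed in, and odict is assumed to behave like collections.OrderedDict:

ndx_text = """[ Protein ]
1 2 3 4
5 6

[ Water ]
7 8 9
"""
groups = parse_ndx(None, ndx_text)
print(groups['Protein'])   # ['1', '2', '3', '4', '5', '6'] (indices stay strings)
print(groups['Water'])     # ['7', '8', '9']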
Example #5
def read_unordered_gtf( iterator, strict=False ):
    """
    Returns GTF features found in an iterator. GTF lines need not be ordered
    or clustered for reader to work. Reader returns GFFFeature objects sorted
    by transcript_id, chrom, and start position.
    """

    # -- Get function that generates line/feature key. --

    get_transcript_id = lambda fields: parse_gff_attributes( fields[8] )[ 'transcript_id' ]
    if strict:
        # Strict GTF parsing uses transcript_id only to group lines into feature.
        key_fn = get_transcript_id
    else:
        # Use lenient parsing where chromosome + transcript_id is the key. This allows
        # transcripts with same ID on different chromosomes; this occurs in some popular
        # datasources, such as RefGenes in UCSC.
        key_fn = lambda fields: fields[0] + '_' + get_transcript_id( fields )


    # Aggregate intervals by transcript_id and collect comments.
    feature_intervals = odict()
    comments = []
    for count, line in enumerate( iterator ):
        if line.startswith( '#' ):
            comments.append( Comment( line ) )
            continue

        line_key = key_fn( line.split('\t') )
        if line_key in feature_intervals:
            feature = feature_intervals[ line_key ]
        else:
            feature = []
            feature_intervals[ line_key ] = feature
        feature.append( GFFInterval( None, line.split( '\t' ) ) )

    # Create features.
    chroms_features = {}
    for count, intervals in enumerate( feature_intervals.values() ):
        # Sort intervals by start position.
        intervals.sort( lambda a,b: cmp( a.start, b.start ) )
        feature = GFFFeature( None, intervals=intervals )
        if feature.chrom not in chroms_features:
            chroms_features[ feature.chrom ] = []
        chroms_features[ feature.chrom ].append( feature )

    # Sort features by chrom, start position.
    chroms_features_sorted = []
    for chrom_features in chroms_features.values():
        chroms_features_sorted.append( chrom_features )
    chroms_features_sorted.sort( lambda a,b: cmp( a[0].chrom, b[0].chrom ) )
    for features in chroms_features_sorted:
        features.sort( lambda a,b: cmp( a.start, b.start ) )

    # Yield comments first, then features.
    # FIXME: comments can appear anywhere in file, not just the beginning.
    # Ideally, then comments would be associated with features and output
    # just before feature/line.
    for comment in comments:
        yield comment

    for chrom_features in chroms_features_sorted:
        for feature in chrom_features:
            yield feature
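As a usage sketch: the GTF lines below are invented, and Comment, GFFInterval, GFFFeature, and parse_gff_attributes are assumed to come from the same GFF utility module as the function. Unordered exon lines for one transcript come back as a single feature with its intervals sorted, after any comments:

gtf_lines = [
    'chr1\tsrc\texon\t300\t400\t.\t+\t.\tgene_id "g1"; transcript_id "t1";',
    '# a stray comment line',
    'chr1\tsrc\texon\t100\t200\t.\t+\t.\tgene_id "g1"; transcript_id "t1";',
]
for item in read_unordered_gtf(iter(gtf_lines)):
    # The Comment is yielded first, then one GFFFeature for "t1" whose two
    # exon intervals are sorted by start (100 before 300).
    print(item)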