Пример #1
0
def CmalignScoreParser(lines):
    """Parse the per-sequence score table from cmalign output.

        - IMPORTANT: Only the standard output of cmalign is supported.

        - NOTE: Only works for results produced with a single CM as the
            query.  Concatenated results from several alignments are not
            supported.

        - Reading stops at the first line that (after stripping) starts
            with '# STOCKHOLM 1.0'.  Before that point, blank lines and
            lines starting with '#' are discarded.

        - Fields of each hit, in order:
        [seq idx, seq name, seq len, total bit score, struct bit score,
            avg prob, elapsed time]

        - Returns whatever the SeparatorFormatParser callable produces for
            the retained lines.
    """
    # seq idx and seq len become ints; the two bit scores and avg prob
    # become floats.  seq name and elapsed time have no cast registered.
    field_casts = [(0, int), (2, int), (3, float), (4, float), (5, float)]
    score_converter = ConvertFields(field_casts)

    # Keep only the data rows that precede the Stockholm alignment section.
    score_lines = []
    for raw in lines:
        stripped = raw.strip()
        if stripped.startswith('# STOCKHOLM 1.0'):
            break
        if not stripped or stripped.startswith('#'):
            continue
        # The untouched line is kept; splitting is left to the parser.
        score_lines.append(raw)

    # sep=None is passed straight through to SeparatorFormatParser.
    parse_scores = SeparatorFormatParser(
        with_header=False,
        converter=score_converter,
        ignore=None,
        sep=None,
    )
    return parse_scores(score_lines)
Пример #2
0
def CmsearchParser(lines):
    """Parse a cmsearch result written in tabfile format.

        - IMPORTANT: Standard cmsearch output is NOT supported.  Run
            cmsearch with --tabfile to produce the format expected here.

        - NOTE: Only works for results produced with a single CM as the
            query.  Concatenated results from several searches are not
            supported.

        - Lines starting with '#' are discarded before parsing.

        - Fields of each hit, as originally documented:
        [target name, target start, target stop, query start, query stop,
            bit score, E-value, GC%]

        - NOTE(review): the casts below address columns 2-6 and 8, which
            implies nine columns with one extra leading column before
            'target name' -- confirm against real --tabfile output.

        - Returns whatever the SeparatorFormatParser callable produces for
            the retained lines.
    """
    # Coordinates and %GC become ints and the bit score a float.  The
    # E-value column is left as a string because it is only present when
    # the CM is calibrated.
    field_casts = [(2, int), (3, int), (4, int), (5, int), (6, float),
                   (8, int)]
    hit_converter = ConvertFields(field_casts)

    # Drop comment lines (those beginning with a hash character).
    hit_lines = [raw for raw in lines if not raw.startswith('#')]

    parse_hits = SeparatorFormatParser(
        with_header=False,
        converter=hit_converter,
        ignore=None,
        sep=None,
    )
    return parse_hits(hit_lines)
Пример #3
0
                line.append('')

    header = []
    for t, b in zip(*lines):
        if t.strip().endswith('-'):
            c = t.strip() + b
        else:
            c = ' '.join([t.strip(), b.strip()])
        header += [c.strip()]
    return header


def int_series(x):
    """Return the integers found in a comma/whitespace separated string.

    Commas are treated like whitespace, so '1,2,3,' and '1 2 3' both give
    [1, 2, 3].  An empty string gives an empty list.

    A real list is returned (not a lazy ``map`` object) so the result can
    be indexed, measured with len() and iterated more than once on both
    Python 2 and Python 3.

    Raises ValueError if any item is not a valid integer literal.
    """
    return [int(item) for item in x.replace(',', ' ').split()]

# Default per-column casts for a row:
#   columns 0-7, 10-12 and 14-17 -> int
#   columns 18-20                -> int_series (comma separated integers)
# Columns 8, 9 and 13 are not listed, so no cast is registered for them.
row_converter = ConvertFields(
    [(i, int)
     for i in list(range(0, 8)) + list(range(10, 13)) + list(range(14, 18))]
    + [(i, int_series) for i in range(18, 21)]
)


def MinimalPslParser(data, row_converter=row_converter):
    """returns version, header and rows from data"""
    if type(data) == str:
        data = open(data)

    psl_version = None
    header = None
    rows = []

    for record in data:
        if psl_version is None:
            assert 'psLayout version' in record
Пример #4
0
"""
from cogent import LoadTable
from cogent.parse.table import ConvertFields

__author__ = "Gavin Huttley, Anuj Pahwa"
__copyright__ = "Copyright 2007-2016, The Cogent Project"
__credits__ = ["Rob Knight", "Peter Maxwell", "Gavin Huttley", "Anuj Pahwa"]
__license__ = "GPL"
__version__ = "1.9"
__maintainer__ = "Gavin Huttley"
__email__ = "*****@*****.**"
__status__ = "Development"

# Only two columns of a bowtie output row are cast: the 4th and 7th
# elements (zero-based indices 3 and 6) hold integer values.  No cast is
# registered for any other column.
row_converter = ConvertFields([
    (3, int),
    (6, int),
])


def BowtieOutputParser(data, row_converter=row_converter):
    """yields a header and row of data from the default bowtie output
    
    Arguments:
        - row_converter: if not provided, uses a default converter which casts
          the Offset and Other Matches fields to ints. If set to None, all
          returned data will be strings (this is faster).
    """

    header = [
        'Query Name', 'Strand Direction', 'Reference Name', 'Offset',
        'Query Seq', 'Quality', 'Other Matches', 'Mismatches'
    ]
Пример #5
0
    return val


def get_strand(val):
    """Return the strand encoded in a bitwise flag value.

    val is converted with int(); the result is -1 when bit 16 (0x10) is
    set and 1 when it is clear.
    """
    flag = int(val)
    if flag & 16:
        return -1
    return 1


def zero_based(val):
    """Convert a one-based position (int or numeric string) to zero-based."""
    one_based = int(val)
    return one_based - 1


# SAM fields: QNAME, FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, OPT

# Plain casts: FLAG, POS and MAPQ stay as reported (ints); CIGAR goes
# through the strict span helper.
strict_converter = ConvertFields([
    (1, int),                   # FLAG
    (3, int),                   # POS
    (4, int),                   # MAPQ
    (5, _strict_cigar_span),    # CIGAR
])

# Default casts: FLAG is reduced to a strand (1/-1) and POS is shifted to
# zero-based numbering.
converter = ConvertFields([
    (1, get_strand),            # FLAG
    (3, zero_based),            # POS
    (4, _int_str),              # MAPQ
    (5, _cigar_span),           # CIGAR
])

# Registers a cast for each of the twelve fields listed above.
complete_converter = ConvertFields([
    (0, str),                   # QNAME
    (1, get_strand),            # FLAG
    (2, str),                   # RNAME
    (3, zero_based),            # POS
    (4, int),                   # MAPQ
    (5, _cigar_span),           # CIGAR
    (6, str),                   # RNEXT
    (7, int),                   # PNEXT
    (8, int),                   # TLEN
    (9, str),                   # SEQ
    (10, str),                  # QUAL
    (11, str),                  # OPT
])


def MinimalSamParser(data, converter=converter):
    """returns records from a sam file

    NOTE: the default converter turns the 1-based numbering of POS into
Пример #6
0
    """ returns 1/-1 for strand in place of '+' or '-' """
    strand = [-1, 1][val == '+']
    return strand


# Matches the first run made up of digits and/or the letters X, Y, M, T.
# Inside a character class no separators are needed: a ',' written there
# would match a literal comma and end up in the extracted name.
pattern = re.compile(r'[0-9XYMT]+')


def _get_chrom(val):
    """Return the chromosome identifier embedded in a chromosome name.

    The identifier is the first run of characters matched by ``pattern``,
    e.g. 'chr12' -> '12', 'chrX' -> 'X', 'chrMT' -> 'MT'.  The result is
    always a string, even when it consists only of digits.

    Raises ValueError if val contains no such run.
    """
    match = pattern.search(val)
    if match is None:
        # Fail with a message naming the offending value instead of an
        # opaque AttributeError on None.
        raise ValueError("no chromosome identifier found in %r" % (val,))
    return match.group(0)


def _comma_tuple(val):
    """Split a comma-separated field into a tuple of its items.

    Items made up only of digits are converted to int; any other item is
    kept as a (stripped) string so unexpected values never raise.  Empty
    items, such as the one produced by a trailing comma, are dropped:
    '10,20,' -> (10, 20), '255,0,0' -> (255, 0, 0), '0' -> (0,).
    """
    items = (item.strip() for item in val.split(','))
    return tuple(int(item) if item.isdigit() else item
                 for item in items if item)


# BED3 defines: chrom, chromStart, chromEnd
bed3_converter = ConvertFields([(0, _get_chrom), (1, int), (2, int)])

# BED6 adds: Name, score, strand
converter = ConvertFields([(0, _get_chrom), (1, int), (2, int), (3, str),
                           (4, int), (5, _get_strand)])

# BED12 additional fields: thickStart, thickEnd, itemRgb, blockCount, blockSizes, blockStarts
# itemRgb, blockSizes and blockStarts are comma-separated lists.  Calling
# tuple() on the raw string would split it into single characters (commas
# included), so these columns go through _comma_tuple instead.
complete_converter = ConvertFields([(0, _get_chrom), (1, int), (2, int),
                                    (3, str), (4, int), (5, _get_strand),
                                    (6, int), (7, int), (8, _comma_tuple),
                                    (9, int), (10, _comma_tuple),
                                    (11, _comma_tuple)])


def MinimalBedParser(data, converter=converter):
    """returns data lines from a BED file