def CmalignScoreParser(lines):
    """Parse the per-sequence score table from standard cmalign output.

    - IMPORTANT: Only standard output from cmalign is supported.
    - NOTE: Only works with result files produced with a single CM as the
        query. Multiple alignment result files that have been concatenated
        are not supported.
    - Each parsed row holds its fields in the following order:
        [seq idx, seq name, seq len, total bit score, struct bit score,
         avg prob, elapsed time]

    Arguments:
        - lines: iterable of text lines from the cmalign output. Reading
          stops at the first line starting with '# STOCKHOLM 1.0'; blank
          lines and lines starting with '#' are dropped before parsing.

    Returns the result of applying the whitespace-separated, header-less
    SeparatorFormatParser to the retained lines.
    """
    # seq idx (0) and seq len (2) are cast to int; total bit score (3),
    # struct bit score (4) and avg prob (5) are cast to float. The seq name
    # (1) and elapsed time (6) are not converted.
    field_casts = [(0, int), (2, int), (3, float), (4, float), (5, float)]
    cmalign_score_converter = ConvertFields(field_casts)

    # Collect the score rows that precede the alignment section.
    score_lines = []
    for raw in lines:
        stripped = raw.strip()
        if stripped.startswith('# STOCKHOLM 1.0'):
            # the alignment itself follows; nothing more to collect
            break
        if not stripped or stripped.startswith('#'):
            # blank line or comment/header line
            continue
        # the original, unstripped line is what gets handed to the parser
        score_lines.append(raw)

    # sep=None: fields are split on runs of whitespace
    parse_scores = SeparatorFormatParser(with_header=False,
                                         converter=cmalign_score_converter,
                                         ignore=None,
                                         sep=None)
    return parse_scores(score_lines)
def CmsearchParser(lines):
    """Parser for tabfile format cmsearch result.

    - IMPORTANT: Will not parse standard output from cmsearch. You must use
        --tabfile with cmsearch to get the correct format for this parser.
    - NOTE: Will only work with search result files with a single CM as a
        query. Will not work with multiple search result files that have
        been concatenated.
    - Each parsed hit holds its fields in the following order:
        [target name, target start, target stop, query start, query stop,
         bit score, E-value, GC%]
      NOTE(review): the converter below indexes fields 2-6 and 8, i.e. it
      expects nine fields per row, one more than listed here. Presumably
      each row starts with an extra leading field (the model name) -
      confirm against real cmsearch --tabfile output.

    Arguments:
        - lines: iterable of text lines from the tabfile. Blank lines and
          lines whose first non-whitespace character is '#' are dropped
          before parsing.

    Returns the result of applying the whitespace-separated, header-less
    SeparatorFormatParser to the retained lines.
    """
    # Converting indices and %GC to integers and bit score to float.
    # Since E-value is only present if CM is calibrated, leaving as string.
    conversion_fields = [(2, int), (3, int), (4, int), (5, int),
                         (6, float), (8, int)]
    cmsearch_converter = ConvertFields(conversion_fields)

    # Drop comment and blank lines. The test is made on the stripped line,
    # matching CmalignScoreParser, so an indented '#' line or a
    # whitespace-only line never reaches the field converter.
    good_lines = []
    for raw in lines:
        stripped = raw.strip()
        if stripped and not stripped.startswith('#'):
            # the original, unstripped line is what gets handed to the parser
            good_lines.append(raw)

    # sep=None: fields are split on runs of whitespace
    cmsearch_parser = SeparatorFormatParser(with_header=False,
                                            converter=cmsearch_converter,
                                            ignore=None,
                                            sep=None)
    return cmsearch_parser(good_lines)
line.append('') header = [] for t, b in zip(*lines): if t.strip().endswith('-'): c = t.strip() + b else: c = ' '.join([t.strip(), b.strip()]) header += [c.strip()] return header int_series = lambda x: map(int, x.replace(',', ' ').split()) row_converter = ConvertFields([(i, int) for i in range(8)]+\ [(i, int) for i in range(10, 13)]+\ [(i, int) for i in range(14, 18)]+\ [(i, int_series) for i in range(18, 21)]) def MinimalPslParser(data, row_converter=row_converter): """returns version, header and rows from data""" if type(data) == str: data = open(data) psl_version = None header = None rows = [] for record in data: if psl_version is None: assert 'psLayout version' in record
""" from cogent import LoadTable from cogent.parse.table import ConvertFields __author__ = "Gavin Huttley, Anuj Pahwa" __copyright__ = "Copyright 2007-2016, The Cogent Project" __credits__ = ["Rob Knight", "Peter Maxwell", "Gavin Huttley", "Anuj Pahwa"] __license__ = "GPL" __version__ = "1.9" __maintainer__ = "Gavin Huttley" __email__ = "*****@*****.**" __status__ = "Development" # The 4th and the 7th elements of the row of data returned from bowtie are # integer values and can thus be converted. row_converter = ConvertFields([(3, int), (6, int)]) def BowtieOutputParser(data, row_converter=row_converter): """yields a header and row of data from the default bowtie output Arguments: - row_converter: if not provided, uses a default converter which casts the Offset and Other Matches fields to ints. If set to None, all returned data will be strings (this is faster). """ header = [ 'Query Name', 'Strand Direction', 'Reference Name', 'Offset', 'Query Seq', 'Quality', 'Other Matches', 'Mismatches' ]
return val def get_strand(val): """returns 1/-1 for strand from bitwise operation""" v = int(val) strand = [-1, 1][v & 16 == 0] return strand def zero_based(val): """returns a zero-based integer""" return int(val) - 1 strict_converter = ConvertFields([(1, int), (3, int), (4, int), (5, _strict_cigar_span)]) converter = ConvertFields([(1, get_strand), (3, zero_based), (4, _int_str), (5, _cigar_span)]) # SAM fields: QNAME, FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL, OPT complete_converter = ConvertFields([(0, str), (1, get_strand), (2, str), (3, zero_based), (4, int), (5, _cigar_span), (6, str), (7, int), (8, int), (9, str), (10, str), (11, str)]) def MinimalSamParser(data, converter=converter): """returns records from a sam file NOTE: the default converter turns the 1-based numbering of POS into
""" returns 1/-1 for strand in place of '+' or '-' """ strand = [-1, 1][val == '+'] return strand pattern = re.compile(r'[0-9,X,Y,MT]+') def _get_chrom(val): """ returns the int component of a chromosome number """ chrom = pattern.search(val).group(0) return chrom # BED3 defines: chrom, chromStart, chromEnd bed3_converter = ConvertFields([(0, _get_chrom), (1, int), (2, int)]) # BED6 adds: Name, score, strand converter = ConvertFields([(0, _get_chrom), (1, int), (2, int), (3, str), (4, int), (5, _get_strand)]) # BED12 additional fields: thickStart, thickEnd, itemRgb, blockCount, blockSizes, blockStarts complete_converter = ConvertFields([(0, _get_chrom), (1, int), (2, int), (3, str), (4, int), (5, _get_strand), (6, int), (7, int), (8, tuple), (9, int), (10, tuple), (11, tuple)]) def MinimalBedParser(data, converter=converter): """returns data lines from a BED file