Пример #1
0
def read(strin, degap=False, style=None, preprocess=None):
    """
    Parse fasta-format text into a list of Seq objects.

    * strin - one or more fasta format sequences. The first non-whitespace
      character must be >.
    * degap (bool) - if True, the sequence text is cleaned with
      removeAllButAlpha(); otherwise with removeWhitespace()
    * style - None (no change to input), 'upper', or 'lower'
    * preprocess - None or a function accepting a single argument "seqstr"
      (the raw text following the header line) for modifying the input
      sequence string arbitrarily. If a function is provided, degap and
      style are ignored.

    Returns a list of Seq(name, seq, header) objects in input order. Input
    containing no ">" yields an empty list.

    Raises FastaFormatError if a record has no text after its header line,
    or if the number of parsed records does not match the number of ">"
    characters in strin.
    """

    gt_count = strin.count('>')
    records = strin.strip().split('>')
    # Whatever precedes the first '>' (an empty string for well-formed
    # input) is not a record and is discarded.
    records.pop(0)

    seqlist = []
    for record in records:

        # A record is a header line followed by at least one line of
        # sequence text.
        try:
            firstline, rawseq = record.strip().split('\n', 1)
        except ValueError:
            raise FastaFormatError(
                'The input fasta sequence appears to be improperly formatted (characters 1-50):\n%s'
                % repr(record[:50]))

        # The first whitespace-delimited token of the header is the name;
        # the remainder (possibly empty) is the free-text description.
        try:
            name, header = firstline.split(None, 1)
        except ValueError:
            name, header = firstline.strip(), ''

        if preprocess is None:
            if degap:
                seq = removeAllButAlpha(rawseq)
            else:
                seq = removeWhitespace(rawseq)

            if style == 'upper':
                seq = seq.upper()
            elif style == 'lower':
                seq = seq.lower()
        else:
            # The callback receives the raw sequence text of this record.
            seq = preprocess(rawseq)

        seqlist.append(Seq(name, seq, header))

    # sanity check
    if gt_count != len(seqlist):
        msg = 'input string contained %s ">" characters but only %s sequences were found '
        raise FastaFormatError(msg % (gt_count, len(seqlist)))

    return seqlist
Пример #2
0
def _writeEMBLSeq(seqStr, linelength):
    """
    Format a sequence string as the sequence section of an EMBL-style record.

    * seqStr - sequence string; it is passed through removeAllButAlpha()
      before formatting (presumably stripping non-letter characters -
      confirm against that helper)
    * linelength - total width of each sequence line, including the
      5-character left margin

    Returns a single string: an 'SQ' line reporting the cleaned sequence
    length, the sequence broken into indented rows, and a terminating
    '//' line, joined with os.linesep.
    """

    margin = 5
    indent = ' ' * margin

    residues = removeAllButAlpha(seqStr)

    rows = ['SQ   sequence length %s;' % len(residues)]

    # Each row holds (linelength - margin) characters after the indent.
    width = linelength - margin
    rows.extend(indent + residues[start:start + width]
                for start in range(0, len(residues), width))

    # record terminator
    rows.append('//')

    return os.linesep.join(rows)
Пример #3
0
def read(strin, degap=False, style=None):
    """
    Parse fasta-format text into a list of Seq objects.

    * strin - one or more fasta format sequences. The first character
      must be >.
    * degap (bool) - if True, the sequence text is cleaned with
      removeAllButAlpha(); otherwise line breaks and other whitespace are
      dropped and every remaining character other than a letter or "-"
      is replaced with "-"
    * style - None (no change to input), 'upper', or 'lower'

    Returns a list of Seq(name, seq, header) objects in input order.

    Raises AssertionError if strin does not start with ">", and
    FastaFormatError if a record has no text after its header line.
    """

    # Explicit raise rather than an assert statement so that the check is
    # not stripped under "python -O"; the exception type is unchanged.
    if not strin.startswith('>'):
        raise AssertionError('fasta input must start with ">"')

    seqlist = []
    for record in strin[1:].split('\n>'):

        # skip empty records
        if record.strip() == '':
            continue

        try:
            firstline, rawseq = record.strip().split('\n', 1)
        except ValueError:
            raise FastaFormatError(
                'The input fasta sequence appears to be improperly formatted (characters 1-50):\n%s'
                % repr(record[:50]))

        # The first whitespace-delimited token of the header is the name;
        # the remainder (possibly empty) is the free-text description.
        try:
            name, header = firstline.split(None, 1)
        except ValueError:
            name, header = firstline.strip(), ''

        if degap:
            seq = removeAllButAlpha(rawseq)
        else:
            # Remove line breaks and other whitespace first so that a
            # sequence wrapped over several lines does not acquire a
            # spurious gap character at every line end, then normalize
            # all remaining non-letter characters to "-".
            seq = re.sub(r'[^a-zA-Z-]', '-', re.sub(r'\s+', '', rawseq))

        if style == 'upper':
            seq = seq.upper()
        elif style == 'lower':
            seq = seq.lower()

        seqlist.append(Seq(name, seq, header))

    return seqlist
Пример #4
0
def read(input, degap=False, case=None, keep_struct=True, keep_ref=True, check_duplicates=True):
    """
    Read a stockholm format sequence alignment.

    * input - filename or string containing stockholm format sequence
      alignment. The value is treated as a filename only if it is shorter
      than 50 characters and names an existing path.
    * degap (bool) - if True, sequences are additionally passed through
      removeAllButAlpha()
    * case - specify "upper" or "lower" to force sequences into either
    * keep_struct - keep structural model (#=GC SS_cons element)
    * keep_ref - keep reference sequence (#=GC RF element)
    * check_duplicates - if True (the default), raise a ValueError if
      sequence names are not unique; if False, the check is skipped and
      all lines sharing a name are concatenated.

    Names are considered not unique when they do not all appear on the
    same number of lines.

    return a list of Seq objects
    """

    if len(input) < 50 and os.access(input, os.F_OK):
        # Read everything up front so the file is closed promptly.
        with open(input) as handle:
            lines = handle.readlines()
    else:
        lines = input.splitlines()

    seqdata = {}
    names = []  # names in order of first appearance
    linecounts = defaultdict(int)
    for line in lines:
        name, seqstr = None, None
        if not line.strip():
            continue
        elif line.startswith("#=GC"):
            # per-column annotation: "#=GC <feature> <text>"
            _, name, seqstr = line.split()
        elif line.startswith("#") or line.startswith("//"):
            continue
        else:
            name, seqstr = line.split()

        if name:
            linecounts[name] += 1
            if name not in seqdata:
                names.append(name)
            # lines sharing a name are concatenated in input order
            seqdata[name] = seqdata.get(name, "") + seqstr.strip()

    lcset = set(linecounts.values())

    # all names should have the same number of lines
    if check_duplicates and len(lcset) > 1:
        log.error("The following sequence names are not unique:")
        expected = min(lcset)
        not_unique = []
        for name, count in linecounts.items():
            if count != expected:
                # floor division keeps the reported count an integer
                log.error("%s appears %s times" % (name, count // expected))
                not_unique.append(name)
        msg = "The following sequence names are not unique: %s" % ",".join(not_unique)
        raise ValueError(msg)

    seqlist = []
    for name in names:
        seq = seqdata[name]

        if name == "SS_cons":
            if not keep_struct:
                continue
            # a retained SS_cons string is left as-is (no gap normalization)
        elif name == "RF" and not keep_ref:
            continue
        else:
            # replace anything other than a letter or "-" with "-"
            seq = re.sub(r"[^a-zA-Z-]", "-", seq)
            if degap:
                seq = removeAllButAlpha(seq)

        if case == "upper":
            seq = seq.upper()
        elif case == "lower":
            seq = seq.lower()

        seqlist.append(Seq(name, seq))

    log.info("writing %s of %s sequences" % (len(seqlist), len(names)))

    return seqlist
Пример #5
0
def read(input,
         namefun=lambda d: d['ACCESSION'][0].split()[0],
         keep_origin=False):
    """
    Parse Genbank format sequence records.

    * input - filename or a string containing Genbank format sequence records.
      A value containing no newline that names an existing path is opened
      as a file (and closed when the generator finishes or is closed).
    * namefun - a function that operates on the dict contained in the data attribute
      (see below) to generate a string to be used as the sequence name.
    * keep_origin - if True, retains 'ORIGIN' element in data (the raw sequence string)

    return a generator of gbSeq objects, with sequence name set
    according to 'namefun'. The data attribute of each seq object
    contains all of the data from the genbank record represented as
    nested dicts and tuples::

     {'ACCESSION': ('AB512777', {}),
      'AUTHORS': ('Watanabe,K., Chao,S.-H., Sasamoto,M., Kudo,Y. and Fujimoto,J.',
                  {}),
      'AUTHORS-1': ('Watanabe,K., Chao,S.-H. and Fujimoto,J.', {}),
      'DEFINITION': ('Lactobacillus hammesii gene for 16S rRNA, partial sequence, strain: YIT 12110.',
                     {}),
      'FEATURES': ('Location/Qualifiers',
                   {'rRNA': ('<1..>1553', {'product': ('16S ribosomal RNA', {})}),
                    'source': ('1..1553',
                               {'db_xref': ('taxon:267633', {}),
                                'mol_type': ('genomic DNA', {}),
                                'note': ('type strain of Lactobacillus hammesii',
                                         {}),
                                'organism': ('Lactobacillus hammesii', {}),
                                'strain': ('YIT 12110', {})})}),
      'JOURNAL': ('Unpublished', {}),
      'JOURNAL-1': ('Submitted (15-JUL-2009) Contact:Koichi Watanabe Yakult Central Institute for Microbiological Research, Culture Collection and Microbial Systematics; 1796 Yaho, Kunitachi, Tokyo 186-8650, Japan',
                    {}),
      'KEYWORDS': ('.', {}),
      'LOCUS': ('AB512777                1553 bp    DNA     linear   BCT 17-SEP-2009',
                {}),
      'ORGANISM': ('Lactobacillus hammesii Bacteria; Firmicutes; Lactobacillales; Lactobacillaceae; Lactobacillus.',
                   {}),
      'REFERENCE': ('1', {}),
      'REFERENCE-1': ('2  (bases 1 to 1553)', {}),
      'SOURCE': ('Lactobacillus hammesii', {}),
      'TITLE': ('Novel Lactobacillus species isolated from stinky tofu brine', {}),
      'TITLE-1': ('Direct Submission', {}),
      'VERSION': ('AB512777.1  GI:258612363', {})}
    """

    seqdelim = r'//'
    leadingblank = ' '*10

    handle = None
    if input.find('\n') == -1 and os.access(input, os.F_OK):
        handle = lines = open(input)
    else:
        lines = input.splitlines()

    # 'record' accumulates nested lists for the record being read; 'addto'
    # is the list (within 'record') that continuation lines extend.
    record = []
    addto = None

    try:
        for line in lines:
            line = line.rstrip()
            if not line:
                continue

            if line.startswith(leadingblank):
                # continuation of the element most recently started
                line = line.strip()
                if line.startswith(r'/'):
                    # qualifier of the form /key="value" or a bare /key
                    if '="' in line:
                        k,v = line[1:].split('=',1)
                        v = v.strip('" ')
                    else:
                        k,v = line[1:].strip(), ''
                    addto.append([k,v])
                else:
                    addto.append(line)
                continue

            if line.strip() == seqdelim:
                # end of record: convert the accumulated lists and emit
                d = _as_dict(record)
                seqstr = removeAllButAlpha(d['ORIGIN'][0])
                if not keep_origin:
                    del d['ORIGIN']
                yield gbSeq(name=namefun(d), seq=seqstr, data=d)

                record = []
                # The delimiter itself is not data; do not fall through
                # and append it to the lists of the record just emitted.
                continue

            try:
                key, val = line.split(None,1)
            except ValueError:
                key, val = line, ''

            if key.isupper():
                # upper-case keyword: start a new top-level element
                record.append([])
                addto = record[-1]
            elif key[0].islower():
                # lower-case keyword: start a sub-element of the current
                # top-level element
                record[-1].append([])
                addto = record[-1][-1]

            # Any other key (neither branch above) is added to whichever
            # list is current.
            addto.extend([key, val])
    finally:
        if handle is not None:
            handle.close()