Пример #1
0
def trim_columns(sequences, opts, tmp_dir):
    """
    Edit the alignment columns that trimAl does not list as kept.

    ``write_rfasta`` is called to write ``sequences`` (``what='aa_ali'``) to
    ``tmp_dir + '/aligned.fasta'``, trimAl is run on that file with
    ``-colnumbering`` and its stdout is parsed as a ', '-separated list of
    column numbers to keep. Every column of the alignment returned by
    ``get_alignment`` whose index is not in that list is rewritten:

    * with ``opts.nogap``, each upper-case triplet and each '---' is removed
    * otherwise each upper-case triplet is replaced by 'NNN'

    The columns are then transposed back and stored, as a tuple per sequence,
    in ``sequences[name]['codon']`` (names taken in sorted order).

    :argument sequences: mapping of sequence name to per-sequence dictionary
    :argument opts: options object; reads ``trimcol``, ``gaptreshold``,
       ``similarity``, ``aa`` and ``nogap``
    :argument tmp_dir: directory in which the temporary fasta files are written
    """
    aligned_path = tmp_dir + '/aligned.fasta'
    write_rfasta(sequences, aligned_path, what='aa_ali')
    trimmed_path = tmp_dir + '/trimmed.fasta'

    # both trimAl invocations share the same head and tail, only the
    # trimming method in the middle differs
    command = [BINARIES['trimal']['bin'], '-in', aligned_path,
               '-out', trimmed_path]
    if opts.trimcol == 'specific':
        command += ['-gt', str(opts.gaptreshold),
                    '-st', str(opts.similarity)]
    else:
        command.append('-' + opts.trimcol)
    command.append('-colnumbering')

    trimal = Popen(command, stdout=PIPE, stderr=PIPE)
    kept_raw, errors = trimal.communicate()
    LOG.append('')
    if 'ERROR' in errors:
        exit('ERROR: trimming columns:\n' + errors)

    kept = str(kept_raw).strip().split(', ')

    algt = get_alignment(sequences, typ=('aa_ali' if opts.aa else 'codon'))
    triplet = compil('[A-Z]{3}')
    if opts.nogap:
        gap = compil('---')
        for num, col in enumerate(algt):
            if str(num) in kept:
                continue
            algt[num] = [gap.sub('', triplet.sub('', site)) for site in col]
    else:
        for num, col in enumerate(algt):
            if str(num) in kept:
                continue
            algt[num] = [triplet.sub('NNN', site) for site in col]

    # transpose columns back into one tuple of sites per sequence
    rows = zip(*algt)
    for name, row in zip(sorted(sequences.keys()), rows):
        sequences[name]['codon'] = row
Пример #2
0
def load_impmodel_from_cmm(f_name, rand_init=None, radius=None):
    '''
    Loads an IMPmodel object using a cmm file of the form:

    ::

        <marker_set name="1">
          <marker id="1" x="7347.50964739" y="-7743.92836303" z="-8283.39749204" r="0.00990099009901" g="0" b="0.990099009901" radius="500.0" note="1"/>
          <marker id="2" x="7647.90254377" y="-7308.1816344" z="-7387.75932893" r="0.019801980198" g="0" b="0.980198019802" radius="500.0" note="2"/>
          <link id1="1" id2="2" r="1" g="1" b="1" radius="250.0"/>
        </marker_set>

    :param f_name: path where to find the file
    :param None rand_init: IMP random initial number used to generate the
       model. If not given, the second-to-last dot-separated field of
       ``f_name`` is used when it is an integer, otherwise None
    :param None radius: radius of each particle. If not given, the radius of
       the last marker found in the file is used (left as passed when the
       file contains no marker)

    :returns: IMPmodel
    '''

    if not rand_init:
        try:
            rand_init = str(int(f_name.split('.')[-2]))
        except (AttributeError, IndexError, ValueError):
            # best effort only: the file name carries no usable number
            rand_init = None
    # coordinates must be lists: they are filled with append() below
    model = IMPmodel((('x', []), ('y', []), ('z', []), ('rand_init', rand_init),
                      ('index', 0), ('objfun', 0), ('radius', radius)))
    expr = compil(
        ' x="([0-9.-]+)" y="([0-9.-]+)" z="([0-9.-]+)".* radius="([0-9.]+)"')
    with open(f_name) as handle:
        markers = findall(expr, handle.read())
    for xxx, yyy, zzz, _ in markers:
        model['x'].append(float(xxx))
        model['y'].append(float(yyy))
        model['z'].append(float(zzz))
    # fall back on the radius written in the file, if there is one
    if markers and not model['radius']:
        model['radius'] = float(markers[-1][3])
    return model
Пример #3
0
def load_impmodel_from_xyz_OLD(f_name, rand_init=None, radius=None,
                               chromosome='UNKNOWN', start=0, resolution=1):
    """
    Loads an IMPmodel object using an xyz file of the form:

    ::

          p1           1      44.847     412.828    -162.673
          p2           2     -55.574     396.869    -129.782

    :param f_name: path where to find the file
    :param None rand_init: IMP random initial number used to generate the
       model. If not given, the second-to-last dot-separated field of
       ``f_name`` is used when it is an integer, otherwise None
    :param None radius: radius of each particle, stored as given
    :param 'UNKNOWN' chromosome: stored in the model description
    :param 0 start: stored in the model description
    :param 1 resolution: stored in the model description

    :returns: IMPmodel

    """
    if not rand_init:
        try:
            rand_init = str(int(f_name.split('.')[-2]))
        except (AttributeError, IndexError, ValueError):
            # best effort only: the file name carries no usable number
            rand_init = None
    model = IMPmodel((('x', []), ('y', []), ('z', []), ('rand_init', rand_init),
                      ('objfun', None), ('radius', radius)))
    # raw string: same pattern as before, without invalid '\s' escapes
    expr = compil(r'p[0-9]+\s+[0-9]+\s+([0-9.-]+)\s+([0-9.-]+)\s+([0-9.-]+)')
    with open(f_name) as handle:
        content = handle.read()
    for xxx, yyy, zzz in findall(expr, content):
        model['x'].append(float(xxx))
        model['y'].append(float(yyy))
        model['z'].append(float(zzz))
    model['description'] = {'chromosome': chromosome,
                            'start': start, 'resolution': resolution}

    return model
Пример #4
0
 def write(self, outfile=None, item='seq', reverse=False, width=60, descr=False):
     """
     Write sequence object to file in fasta format

     :argument None outfile: path to outfile, if None then print to stdout
     :argument seq item: what to put in place of sequence
     :argument False reverse: whether to reverse or not the sequence
     :argument 60 width: number of sites per line when printing sequence
     :argument False descr: put description of sequence also, not recommended if you are not sure how the aligner will read it.

     """
     out = open(outfile, 'w') if outfile else stdout
     # a line break is inserted after each complete run of `width` sites
     wsub = compil('([A-Za-z-]{' + str(width) + '})')
     try:
         for elt in self:
             if descr:
                 out.write('>%s |%s\n' % (elt, self[elt]['descr']))
             else:
                 out.write('>%s\n' % (elt))
             seq = self[elt][item][::-1] if reverse else self[elt][item]
             # sequences may be stored as a list/tuple of sites
             if not isinstance(seq, str):
                 seq = ''.join(seq)
             out.write('%s\n' % (wsub.sub('\\1\n', seq)))
     finally:
         # only close what was opened here, never stdout
         if outfile:
             out.close()
Пример #5
0
def load_impmodel_from_cmm(f_name, rand_init=None, radius=None):
    '''
    Loads an IMPmodel object using a cmm file of the form:

    ::

        <marker_set name="1">
          <marker id="1" x="7347.50964739" y="-7743.92836303" z="-8283.39749204" r="0.00990099009901" g="0" b="0.990099009901" radius="500.0" note="1"/>
          <marker id="2" x="7647.90254377" y="-7308.1816344" z="-7387.75932893" r="0.019801980198" g="0" b="0.980198019802" radius="500.0" note="2"/>
          <link id1="1" id2="2" r="1" g="1" b="1" radius="250.0"/>
        </marker_set>

    :param f_name: path where to find the file
    :param None rand_init: IMP random initial number used to generate the
       model. If not given, the second-to-last dot-separated field of
       ``f_name`` is used when it is an integer, otherwise None
    :param None radius: radius of each particle. If not given, the radius of
       the last marker found in the file is used (left as passed when the
       file contains no marker)

    :returns: IMPmodel
    '''

    if not rand_init:
        try:
            rand_init = str(int(f_name.split('.')[-2]))
        except (AttributeError, IndexError, ValueError):
            # best effort only: the file name carries no usable number
            rand_init = None
    model = IMPmodel((('x', []), ('y', []), ('z', []), ('rand_init', rand_init),
                      ('index', 0), ('objfun', 0), ('radius', radius)))
    expr = compil(
        ' x="([0-9.-]+)" y="([0-9.-]+)" z="([0-9.-]+)".* radius="([0-9.]+)"')
    # read inside a context manager so the file is always closed
    with open(f_name) as handle:
        markers = findall(expr, handle.read())
    for xxx, yyy, zzz, _ in markers:
        model['x'].append(float(xxx))
        model['y'].append(float(yyy))
        model['z'].append(float(zzz))
    # fall back on the radius written in the file, if there is one; the
    # `radius` argument is no longer overwritten by the loop variable
    if markers and not model['radius']:
        model['radius'] = float(markers[-1][3])
    return model
Пример #6
0
def load_impmodel_from_xyz_OLD(f_name, rand_init=None, radius=None,
                               chromosome='UNKNOWN', start=0, resolution=1):
    """
    Loads an IMPmodel object using an xyz file of the form:

    ::

          p1           1      44.847     412.828    -162.673
          p2           2     -55.574     396.869    -129.782

    :param f_name: path where to find the file
    :param None rand_init: IMP random initial number used to generate the
       model. If not given, the second-to-last dot-separated field of
       ``f_name`` is used when it is an integer, otherwise None
    :param None radius: radius of each particle, stored as given
    :param 'UNKNOWN' chromosome: stored in the model description
    :param 0 start: stored in the model description
    :param 1 resolution: stored in the model description

    :returns: IMPmodel

    """
    if not rand_init:
        try:
            rand_init = str(int(f_name.split('.')[-2]))
        except (AttributeError, IndexError, ValueError):
            # best effort only: the file name carries no usable number
            rand_init = None
    model = IMPmodel((('x', []), ('y', []), ('z', []), ('rand_init', rand_init),
                      ('objfun', None), ('radius', radius)))
    # raw string: same pattern as before, without invalid '\s' escapes
    expr = compil(r'p[0-9]+\s+[0-9]+\s+([0-9.-]+)\s+([0-9.-]+)\s+([0-9.-]+)')
    # read inside a context manager so the file is always closed
    with open(f_name) as handle:
        coordinates = findall(expr, handle.read())
    for xxx, yyy, zzz in coordinates:
        model['x'].append(float(xxx))
        model['y'].append(float(yyy))
        model['z'].append(float(zzz))
    model['description'] = {'chromosome': chromosome,
                            'start': start, 'resolution': resolution}

    return model
Пример #7
0
def read_fasta(infile):
    '''
    Read a file in fasta format and yield each sequence.

    Each yielded item is a dictionary with the keys:

      * 'name': header text up to the first space or tab, without '>'
      * 'descr': rest of the header line, or None if there is none
      * 'seq': the sequence lines joined, with spaces and tabs removed

    Lines found before the first header are ignored as soon as a header is
    met. A header with no sequence makes the function print an error on
    stderr and exit.

    :argument infile: path to the fasta file
    '''
    nam = None
    descr = None
    chunks = []
    blank_re = compil('[ \t]')
    # the context manager guarantees the file is closed, also on exit()
    with open(infile) as handle:
        for line in handle:
            line = line.strip()
            if not line.startswith('>'):
                chunks.append(blank_re.sub('', line))
                continue
            if nam is not None:
                seq = ''.join(chunks)
                if seq == '':
                    # stderr.write() works under both Python 2 and 3 and
                    # prints the same text as the former print statement
                    stderr.write(' '.join(['ERROR: no sequence for ',
                                           str(nam)]) + '\n')
                    exit()
                yield {'name': nam,
                       'descr': descr,
                       'seq': seq}
            items = blank_re.split(line, maxsplit=1)
            nam = items[0].lstrip('>')
            descr = items[1] if len(items) == 2 else None
            chunks = []
    seq = ''.join(chunks)
    if seq == '' and nam is not None:
        stderr.write(' '.join(['ERROR: no sequence for ', str(nam)]) + '\n')
        exit()
    elif seq == '':
        # NOTE(review): reached when no header and no sequence was read
        # (e.g. empty file); the message does not seem to match -- confirm
        stderr.write('ERROR: presence of repeated names' + '\n')
        exit()
    yield {'name': nam,
           'descr': descr,
           'seq': seq}
Пример #8
0
 def write(self,
           outfile=None,
           item='seq',
           reverse=False,
           width=60,
           descr=False):
     """
     Write sequence object to file in fasta format

     :argument None outfile: path to outfile, if None then print to stdout
     :argument seq item: what to put in place of sequence
     :argument False reverse: whether to reverse or not the sequence
     :argument 60 width: number of sites per line when printing sequence
     :argument False descr: put description of sequence also, not recommended if you are not sure how the aligner will read it.

     """
     if outfile:
         out = open(outfile, 'w')
     else:
         out = stdout
     # a line break is inserted after each complete run of `width` sites
     wsub = compil('([A-Za-z-]{' + str(width) + '})')
     try:
         for elt in self:
             # was `decr`, an undefined name raising NameError
             if descr:
                 out.write('>%s |%s\n' % (elt, self[elt]['descr']))
             else:
                 out.write('>%s\n' % (elt))
             seq = self[elt][item][::-1] if reverse else self[elt][item]
             # sequences may be stored as a list/tuple of sites
             seq = seq if isinstance(seq, str) else ''.join(seq)
             out.write('%s\n' % (wsub.sub('\\1\n', seq)))
     finally:
         # only close what was opened here, never stdout
         if outfile:
             out.close()
Пример #9
0
def trim_columns(sequences, opts, tmp_dir):
    """
    Edit the alignment columns that trimAl does not list as kept.

    ``write_rfasta`` is called to write ``sequences`` (``what='aa_ali'``) to
    ``tmp_dir + '/aligned.fasta'``, trimAl is run on that file with
    ``-colnumbering`` and its stdout is parsed as a ', '-separated list of
    column numbers to keep. Every column of the alignment returned by
    ``get_alignment(sequences)`` whose index is not in that list is
    rewritten:

    * with ``opts.nogap``, each upper-case triplet and each '---' is removed
    * otherwise each upper-case triplet is replaced by 'NNN'

    The columns are then transposed back and stored, as a tuple per sequence,
    in ``sequences[name]['codon']`` (names taken in sorted order).

    :argument sequences: mapping of sequence name to per-sequence dictionary
    :argument opts: options object; reads ``trimcol``, ``gaptreshold``,
       ``similarity`` and ``nogap``
    :argument tmp_dir: directory in which the temporary fasta files are written
    """
    source = tmp_dir + '/aligned.fasta'
    write_rfasta(sequences, source, what='aa_ali')
    target = tmp_dir + '/trimmed.fasta'

    # the trimming method is the only part of the command that varies
    if opts.trimcol == 'specific':
        method = ['-gt', str(opts.gaptreshold), '-st', str(opts.similarity)]
    else:
        method = ['-' + opts.trimcol]
    cmds = ([BINARIES['trimal']['bin'], '-in', source, '-out', target]
            + method + ['-colnumbering'])

    output, err = Popen(cmds, stdout=PIPE, stderr=PIPE).communicate()
    LOG.append('')
    if 'ERROR' in err:
        exit('ERROR: trimming columns:\n' + err)

    keeplist = str(output).strip().split(', ')

    algt = get_alignment(sequences)
    codon_re = compil('[A-Z]{3}')
    if opts.nogap:
        gap_re = compil('---')
        for index, column in enumerate(algt):
            if str(index) not in keeplist:
                stripped = [codon_re.sub('', site) for site in column]
                algt[index] = [gap_re.sub('', site) for site in stripped]
    else:
        for index, column in enumerate(algt):
            if str(index) not in keeplist:
                algt[index] = [codon_re.sub('NNN', site) for site in column]

    # transpose columns back into one tuple of sites per sequence
    names = sorted(sequences.keys())
    for name, sites in zip(names, zip(*algt)):
        sequences[name]['codon'] = sites
Пример #10
0
def load_impmodel_from_xyz(f_name, rand_init=None, radius=None):
    """
    Loads an IMPmodel object using an xyz file of the form:

    ::

          # ID              : some identifier
          # SPECIES         : None
          # CELL TYPE       : None
          # EXPERIMENT TYPE : Hi-C
          # RESOLUTION      : 10000
          # ASSEMBLY        : None
          # CHROMOSOME      : 19
          # START           : 1
          # END             : 50
          1  19:1-10000        44.847     412.828    -162.673
          2  19:10001-20000   -55.574     396.869    -129.782

    Header lines ('# KEY : value') are stored in ``model['description']``
    with the key lower-cased; only the first ':' separates key from value.
    Header lines without any ':' are skipped.

    :param f_name: path where to find the file
    :param None rand_init: IMP random initial number used to generate the
       model. If not given, the second-to-last dot-separated field of
       ``f_name`` is used when it is an integer, otherwise None
    :param None radius: radius of each particle, stored as given

    :returns: IMPmodel

    """
    if not rand_init:
        try:
            rand_init = str(int(f_name.split('.')[-2]))
        except (AttributeError, IndexError, ValueError):
            # best effort only: the file name carries no usable number
            rand_init = None
    model = IMPmodel(
        (('x', []), ('y', []), ('z', []), ('rand_init', rand_init),
         ('index', 0), ('objfun', 0), ('radius', radius)))
    # raw string: same pattern as before, without invalid '\s' escapes
    expr = compil(
        r'[0-9]+\s[A-Za-z0-9_ ]+:[0-9]+-[0-9]+\s+([0-9.-]+)\s+([0-9.-]+)\s+([0-9.-]+)'
    )
    # the file is read once, and closed, instead of being opened twice
    with open(f_name) as handle:
        lines = handle.readlines()
    model['description'] = {}
    for line in lines:
        if not line.startswith('# ') or ':' not in line:
            continue
        # maxsplit=1 keeps values that contain ':' themselves in one piece
        key, val = line.strip('# ').split(':', 1)
        model['description'][key.strip().lower()] = val.strip()
    for xxx, yyy, zzz in findall(expr, ''.join(lines)):
        model['x'].append(float(xxx))
        model['y'].append(float(yyy))
        model['z'].append(float(zzz))
    return model
Пример #11
0
def load_impmodel_from_xyz(f_name, rand_init=None, radius=None):
    """
    Loads an IMPmodel object using an xyz file of the form:

    ::

          # ID              : some identifier
          # SPECIES         : None
          # CELL TYPE       : None
          # EXPERIMENT TYPE : Hi-C
          # RESOLUTION      : 10000
          # ASSEMBLY        : None
          # CHROMOSOME      : 19
          # START           : 1
          # END             : 50
          1  19:1-10000        44.847     412.828    -162.673
          2  19:10001-20000   -55.574     396.869    -129.782

    Header lines ('# KEY : value') are stored in ``model['description']``
    with the key lower-cased; only the first ':' separates key from value.
    Header lines without any ':' are skipped.

    :param f_name: path where to find the file
    :param None rand_init: IMP random initial number used to generate the
       model. If not given, the second-to-last dot-separated field of
       ``f_name`` is used when it is an integer, otherwise None
    :param None radius: radius of each particle, stored as given

    :returns: IMPmodel

    """
    if not rand_init:
        try:
            rand_init = str(int(f_name.split('.')[-2]))
        except (AttributeError, IndexError, ValueError):
            # best effort only: the file name carries no usable number
            rand_init = None
    model = IMPmodel((('x', []), ('y', []), ('z', []), ('rand_init', rand_init),
                      ('index', 0), ('objfun', 0), ('radius', radius)))
    # raw string: same pattern as before, without invalid '\s' escapes
    expr = compil(r'[0-9]+\s[A-Za-z0-9_ ]+:[0-9]+-[0-9]+\s+([0-9.-]+)\s+([0-9.-]+)\s+([0-9.-]+)')
    # the file is read once, and closed, instead of being opened twice
    with open(f_name) as handle:
        lines = handle.readlines()
    description = {}
    for line in lines:
        if line.startswith('# ') and ':' in line:
            # maxsplit=1 keeps values that contain ':' themselves in one piece
            key, val = line.strip('# ').split(':', 1)
            description[key.strip().lower()] = val.strip()
    model['description'] = description
    for xxx, yyy, zzz in findall(expr, ''.join(lines)):
        model['x'].append(float(xxx))
        model['y'].append(float(yyy))
        model['z'].append(float(zzz))
    return model
Пример #12
0
def main():
    '''
    Main function when called from the command line.

    Steps, in order (all driven by the options returned by ``get_options``):

      * read ``opts.fastafile`` and store, for each sequence, the result of
        ``translate`` under the 'trseq' key; write these to
        ``<outfile>_prot``
      * stop here if ``opts.only_translate`` is set
      * run muscle on the protein file, or just copy it to
        ``<outfile>_aa_ali`` when ``opts.input_ali`` is set
      * optionally run trimal to discard sequences (``opts.trimseq``)
      * call ``map2codons``
      * optionally run trimal with ``-colnumbering`` and edit the columns
        whose number it does not print (``opts.trimcol``)
      * write ``<outfile>_ali`` (and the map file if ``opts.printmap``),
        remove the temporary files and print the log if ``opts.print_log``
    '''
    opts = get_options()

    # `log` accumulates the command lines and messages printed at the end
    log = '\n\n'
    gencode = _set_code(opts.code)
    seqs     = {}
    for seq in read_fasta(opts.fastafile):
        seq['trseq'] = translate(seq['seq'], gencode, stop=opts.remove_stop)
        seqs[seq['name']] = seq

    log += '   ' + str (len (seqs)) + ' sequences\n\n'
    # every output/temporary file is named after opts.outfile plus a suffix
    prot_path   = opts.outfile + '_prot'
    aali_path   = opts.outfile + '_aa_ali'
    ali_path    = opts.outfile + '_ali'
    trimsq_path = opts.outfile + '_trimseq'
    trimcl_path = opts.outfile + '_trimcol'
    score_path  = opts.outfile + '_score'
    map_path    = opts.outfile + '_map'
    # files removed at the end of the run
    todel       = [prot_path]
    write_fasta(seqs, prot_path, clean=True, typ='trseq')

    if opts.only_translate:
        exit()

    ###########
    # ALIGN
    if not opts.input_ali:
        # the trailing slice drops the last two items ('-scorefile' and its
        # path) unless opts.score is set
        proc = Popen([opts.muscle_bin,
                      '-quiet',
                      '-noanchors',
                      '-maxiters' , '999',
                      '-maxhours' , '24 ',
                      '-maxtrees' , '100',
                      '-in'       , prot_path,
                      '-out'      , aali_path,
                      '-scorefile', score_path # must be last!!! because option...
                      ][:None if opts.score else -2], stdout=PIPE)
        # NOTE(review): only stdout is piped, so communicate()[1] (stderr) is
        # presumably always None and this branch never runs -- confirm
        if proc.communicate()[1] is not None:
            print >> stderr, proc.communicate()[0]
            exit('\nERROR: runninge muscle')
       
        # same argument list as above, joined for the log
        log += '   Muscle command line: \n' + \
               ' '.join([opts.muscle_bin, '-quiet', '-noanchors', '-maxiters' , \
                         '999', '-maxhours', '24 ', '-maxtrees', '100', '-in', \
                         prot_path, '-out', aali_path, '-scorefile', \
                         score_path][:None if opts.score else -2]) + '\n\n'

    else:
        # no alignment step: the protein file is copied as the alignment
        proc = Popen(['cp', prot_path, aali_path], stdout=PIPE)
        if proc.communicate()[1] is not None:
            print >> stderr, proc.communicate()[0]
            exit('\nERROR: when skipping muscle.')


    ###########
    # TRIM SEQS
    if opts.trimseq != False:
        todel.append(trimsq_path)
        # opts.trimseq is indexed at [1] and [2] for the two overlap values;
        # presumably a sequence built by get_options -- verify there
        proc = Popen([opts.trimal_bin,
                      '-in'        , aali_path,
                      '-out'       , trimsq_path,
                      '-resoverlap', str (opts.trimseq[1]),
                      '-seqoverlap', str (opts.trimseq[2]),
                      '-cons'      , '100'
                      ], stdout=PIPE)
        # NOTE(review): the message below mentions muscle although trimal is
        # what was run here
        if proc.communicate()[1] is not None:
            print >> stderr, proc.communicate()[0]
            exit('\nERROR: runninge muscle')

        for seq in read_fasta(trimsq_path):
            seqs[seq['name']]['ali'] = seq['seq']
        
        # names that got no 'ali' entry, i.e. absent from trimal's output file
        trimmed = filter (lambda x: not seqs[x].has_key('ali'), seqs)
        # NOTE(review): this warning is printed even when `trimmed` is empty
        if not opts.quiet:
            print >> stderr, 'WARNING: trimmed sequences: \n\t' + \
                  '\n\t'.join(trimmed)
        if len (trimmed) > 0:
            log += '->trimmed sequences: \n\t' + \
                   '\n\t'.join(trimmed) + '\n'
        else: log += '->no trimmed sequences\n'

        for s in seqs.keys():
            if s in trimmed:
                del(seqs[s])
        # from here on the trimmed file is used as the amino-acid alignment
        aali_path = trimsq_path
        # NOTE(review): aali_path was re-pointed just above, so the '-in'
        # value logged here is trimsq_path, not the path passed to trimal
        log += '   Trimal (sequences) command line: \n' + \
               ' '.join([opts.trimal_bin, '-in', aali_path, '-out',
                         trimsq_path, '-resoverlap', str (opts.trimseq[1]), \
                         '-seqoverlap', str (opts.trimseq[2]), '-cons', '100']) \
                         + '\n\n'
    else:
        for seq in read_fasta(aali_path):
            seqs[seq['name']]['ali'] = seq['seq']


    ###########
    # CODON MAP
    seqs = map2codons(seqs, opts.input_ali)

    ###########
    # TRIM COLS
    # opts.trimcol is compared with the string 'None', not the None object
    if opts.trimcol != 'None':
        if opts.trimcol == 'specific':
            # explicit gap and similarity thresholds
            todel.append(trimcl_path)
            proc = Popen([opts.trimal_bin,
                          '-in' , aali_path,
                          '-out', trimcl_path, 
                          '-gt' , str (opts.gaptreshold),
                          '-st' , str (opts.similarity),
                          '-colnumbering'
                          ], stdout=PIPE)
            (keeplist, err) = proc.communicate()
            if err is not None:
                exit('ERROR: trimming columns.')
            log += '   Trimal (columns) command line: \n' + \
                   ' '.join([opts.trimal_bin,
                          '-in', aali_path,
                          '-out', trimcl_path, 
                          '-gt', str (opts.gaptreshold),
                          '-st', str (opts.similarity),
                          '-colnumbering'
                          ]) + '\n'

        else:
            # opts.trimcol itself is passed to trimal as a '-<name>' flag
            todel.append(trimcl_path)
            proc = Popen([opts.trimal_bin,
                          '-in' , aali_path,
                          '-out', trimcl_path, 
                          '-' + opts.trimcol,
                          '-colnumbering'
                          ], stdout=PIPE)
            (keeplist, err) = proc.communicate()
            if err is not None:
                exit('ERROR: trimming columns.')
            log += '   Trimal (columns) command line: \n' + \
                   ' '.join([opts.trimal_bin, '-in' , aali_path, '-out',
                             trimcl_path, '-' + opts.trimcol, \
                             '-colnumbering']) + '\n'

        # trimal's stdout is parsed as a ', '-separated list of the column
        # numbers to keep
        keeplist = str (keeplist).strip().split(', ')

        algt = get_alignment(seqs)
        nnn = compil('[A-Z]{3}')
        # columns not in keeplist are edited: with opts.nogap every upper-case
        # triplet and every '---' is removed, otherwise every upper-case
        # triplet is replaced by 'NNN'
        if opts.nogap: 
            for (col, num) in zip (algt, range (len (algt))):
                if not str(num) in keeplist:
                    algt[num] = map (lambda x: nnn.sub('', x), col)
                    algt[num] = map (lambda x: compil('---').sub('', x), algt[num])
        else:
            for (col, num) in zip (algt, range (len (algt))):
                if not str(num) in keeplist:
                    algt[num] = map (lambda x: nnn.sub('NNN', x), col)
        # transpose the columns back and store one joined string per
        # sequence, names taken in sorted order
        for (key, seq) in zip (sorted (seqs.keys()), zip (*algt)):
            seqs[key]['codons'] = ''.join(seq)

    ###########
    # SEQ MAP
    if opts.printmap:
        _printmap(seqs, map_path, opts.pymap)
    write_fasta(seqs, ali_path, clean=opts.clean, typ='codons')

    # temporary files are removed by a child `rm` process that is not
    # waited for
    Popen(['rm', '-f'] + todel, stdout=PIPE)

    if opts.print_log:
        print log