def _tally_fasta_record(hsh, header, raw_seq):
    """Add the count found for ``header`` to the entry keyed by the sequence.

    The sequence is passed through ``tr`` with the '[acgtun.]' ->
    '[ACGTTNN]' mapping before it is used as the key.
    """
    cnt = find_cnt(header)
    key = tr(raw_seq, '[acgtun.]', '[ACGTTNN]')
    try:
        hsh[key] += cnt
    except KeyError:
        hsh[key] = cnt


def parse_file_fasta_seqkey(file_fasta, hsh, options):
    """Sum per-sequence counts from a FASTA file into ``hsh``.

    For every record the sequence lines are concatenated, translated with
    ``tr`` and used as the key; the value ``find_cnt`` returns for the
    record's header token is added to ``hsh[key]``.

    Blank (whitespace-only) lines are skipped. Only a genuinely empty
    ``readline()`` result is treated as end of file, so a stray blank line
    can no longer truncate parsing or drop the record it appears in.

    Args:
        file_fasta: path handed to ``open_or_die2``.
        hsh: dict-like mapping, updated in place.
        options: mapping of command line switches; a running record count
            is written via ``print_stderr`` when ``options.get('-a') == ''``.

    Note:
        As before, one final record is always tallied after the loop, even
        if the file contained no header (header and sequence are then '').
    """
    verbose = options.get('-a') == ''
    if verbose:
        print_stderr('reading file into hash\n')

    _id = ''
    seq_parts = []
    running_1 = 0
    in_record = False

    FASTA = open_or_die2(file_fasta, 'rb')
    try:
        while True:
            raw = FASTA.readline()
            if not raw:
                # readline() returns an empty value only at real EOF.
                break

            l = raw.strip()
            if not l:
                # Blank line: ignore it instead of mistaking it for EOF.
                continue

            m = re.match(r'^>(\S+)', l)
            if m:
                if in_record:
                    # A new header closes the record collected so far.
                    _tally_fasta_record(hsh, _id, ''.join(seq_parts))
                    running_1 += 1
                    if verbose:
                        print_stderr('{}\r'.format(running_1))

                # The whole matched token (including '>') is kept, exactly
                # as the previous implementation passed it to find_cnt.
                _id = m.group()
                seq_parts = []
                in_record = True
            elif in_record:
                seq_parts.append(l)
            # Lines before the first header are ignored.

        # Tally the last record with the same update as all the others.
        _tally_fasta_record(hsh, _id, ''.join(seq_parts))
        running_1 += 1
        if verbose:
            print_stderr('{}\r'.format(running_1))
    finally:
        # Close the handle even when a helper raises.
        FASTA.close()
예제 #2
0
def remove_adapter(_id, seq, prefix):
    """Print one FASTA record with the adapter clipped from ``seq``.

    ``seq`` is first translated with ``tr`` ('[acgtun.]' -> '[ACGTTNN]').
    The clipped sequence is then chosen as follows:

    1. If ``prefix`` occurs after at least one word character, keep the
       word characters captured in front of it.
    2. Otherwise, if the first six characters of ``seq`` equal ``prefix``,
       the clipped sequence is ``prefix`` itself.
    3. Otherwise, shorten ``prefix`` one character at a time from the right
       and look for it anchored at the end of ``seq``; keep what precedes
       the first hit.

    If none of these yields a non-empty result, the translated ``seq`` is
    printed unchanged.

    Args:
        _id: record identifier written after '>'.
        seq: raw read sequence.
        prefix: adapter prefix, interpolated into the regular expressions
            as-is (not escaped).
    """
    seq = tr(seq, '[acgtun.]', '[ACGTTNN]')
    clipped = None

    full_hit = re.search(r'(\w+)' + prefix, seq)
    if full_hit:
        clipped = full_hit.group(1)
    elif substr(seq, 0, 6) == prefix:
        clipped = prefix
    else:
        # Try ever shorter adapter prefixes at the very end of the read.
        while len(prefix) > 0:
            prefix = prefix[:-1]
            tail_hit = re.search(r'(\w+){}$'.format(prefix), seq)
            if tail_hit:
                clipped = tail_hit.group(1)
                break

    # Fall back to the unclipped sequence when nothing (or '') was found.
    print('>{}\n{}'.format(_id, clipped or seq))
def com(sequence):
    """Return ``sequence`` translated by ``tr`` with the complement mapping.

    The mapping pairs 'acgtuACGTU' with 'TGCAATGCAA' position by position.
    """
    source_bases = 'acgtuACGTU'
    complement_bases = 'TGCAATGCAA'
    return tr(sequence, source_bases, complement_bases)
예제 #4
0
            continue

        if re.match(r'^\s*$', l):
            continue

        if novel or known:
            l = l.strip()
            line = re.split('\t', l)
            coord = 'na'
            if len(line) > 16 and line[16]:
                coord = line[16]

            if known:
                if float(line[1]) >= thres and float(line[1]) < maxs:
                    if options.get('-d') == '':
                        line[seqcol] = tr(line[seqcol], 'uU', 'tT')

                    if options.get('-p') == '':
                        m = re.search(r'\|([a-zA-Z0-9_-]*)$', line[0])
                        if m:
                            line[0] = m.groups()[0]

                        OUT.write(">{}\n{}\n".format(line[0],
                                                     line[seqcol].upper()))

                    else:
                        OUT.write(">{}_{}_x{}_coord:{}_score:{}\n{}\n".format(
                            line[0], line[9], line[5], coord, line[1],
                            line[seqcol].upper()))

                    coord = coord.strip()
    try:
        IN = open(sys.argv[1], 'rb')
    except IOError:
        print('cannot open file {}'.format(sys.argv[1]))
        sys.exit(-1)

    while True:
        l = IN.readline()
        if not l:
            break

        line = re.split(r'\t', l)
        if line[1] == '-':
            line[4] = str_reverse(line[4])
            line[4] = tr(line[4], 'ACGTN', 'TGCAN')

        gseq = ssplit(line[4].lower())
        edit = ssplit('m' * len(line[4]))
        mm = 0

        if line[7]:
            changes = re.split(r',', line[7])
            for change in changes:
                match = re.search(r'(\d+):(\w+)\>\w+', change)
                if match:
                    match = match.groups()
                    mm += 1
                    gseq[int(match[0])] = match[1].lower()
                    edit[int(match[0])] = 'M'
def parse_file_struct(file_struct):
    """Read a structure file and fill the per-id lookup tables.

    The file is read record by record, where a record starts at a line
    matching ``>id description``. For each record the following entries
    are written, keyed by the id, into the mappings ``hash_desc``,
    ``hash_seq``, ``hash_struct`` and ``hash_mfe`` defined outside this
    function:

    * description: the text after the id on the header line;
    * sequence: every line starting with a word character, with 'u'/'U'
      translated to 't'/'T' via ``tr``, concatenated;
    * structure: the first run of '.', '(' and ')' on each line,
      concatenated;
    * mfe: the last parenthesised negative decimal seen, e.g. ``-12.30``
      from ``(-12.30)``.

    Lines before the first header are ignored. A file without any header
    line now leaves the tables untouched instead of raising on unbound
    variables.

    Args:
        file_struct: path handed to ``open_or_die``.
    """
    FILE_STRUCT = open_or_die(file_struct, 'rb',
                              'can not open file {}\n'.format(file_struct))

    # _id stays None until the first header line has been seen.
    _id = None
    desc = ""
    seq = ""
    struct = ""
    mfe = ""

    try:
        while True:
            line = FILE_STRUCT.readline()
            if not line:
                break

            line = line.strip()

            header = re.match(r'^>(\S+)\s*(.*)', line)
            if header:
                if _id is not None:
                    # A new header closes the record collected so far.
                    hash_desc[_id] = desc
                    hash_seq[_id] = seq
                    hash_struct[_id] = struct
                    hash_mfe[_id] = mfe

                _id, desc = header.groups()
                seq = ""
                struct = ""
                mfe = ""
                continue

            if _id is None:
                # Nothing to attach this line to yet.
                continue

            if re.match(r'^\w', line):
                seq += tr(line, 'uU', 'tT')

            struct_hit = re.search(r'((\.|\(|\))+)', line)
            if struct_hit:
                struct += struct_hit.group(1)

            mfe_hit = re.search(r'\((\s*-\d+\.\d+)\)', line)
            if mfe_hit:
                mfe = mfe_hit.group(1)
    finally:
        # Close the handle even when parsing raises.
        FILE_STRUCT.close()

    # Store the last record, if the file contained one at all.
    if _id is not None:
        hash_desc[_id] = desc
        hash_seq[_id] = seq
        hash_struct[_id] = struct
        hash_mfe[_id] = mfe
예제 #7
0
                if mm:
                    remove_adapter(_id, seq, prefix)
                    _id = mm.groups()[0]
                    seq = ''
                    continue

                seq += ll

    remove_adapter(_id, seq, prefix)
    FASTA.close()


if __name__ == '__main__':
    # Script entry point: expects exactly two command line arguments,
    # the FASTA file and the adapter sequence.
    if len(sys.argv) != 3:
        print(usage)
        sys.exit(-1)

    parser = argparse.ArgumentParser()
    for positional in ('file_fasta', 'seq_adapter'):
        parser.add_argument(positional)
    args = parser.parse_args(sys.argv[1:3])

    file_fasta = args.file_fasta
    seq_adapter = args.seq_adapter
    seq_test = "TCGTATGCCGTCTTCTGCTTGT"

    # Only the first six adapter characters are used for clipping; they
    # are translated with the same mapping that is applied to the reads.
    prefix = tr(substr(seq_adapter, 0, 6), '[acgtun.]', '[ACGTTNN]')
    remove_adapters(file_fasta, prefix)
    sys.exit(0)