示例#1
0
def read_rep():
    """Return the annotated repeat positions of every scaffold.

    The result maps a scaffold name (first column of each annotation record)
    to a set of integer positions built from the record's 4th and 5th
    columns.  Records are read from every file matched by
    ``annotation_repeats/*.gff3``.

    The dictionary is cached in ``rep.dict.pkl`` in the current directory:
    when ``cmn.filexist`` reports that file, it is loaded and returned
    without re-reading the annotations; otherwise it is written after the
    annotations have been parsed.

    Returns:
        dict: scaffold name -> set of int positions.
    """
    dn = 'rep.dict.pkl'
    if cmn.filexist(dn):
        print('loading repeats using precomputed data...')
        return cmn.pickle_read(dn)

    freps = cmn.cmd2lines('ls annotation_repeats/*.gff3')
    repdict = {}
    for frep in freps:
        # Parse the annotation file named by the loop variable.
        for line in cmn.file2lines(frep):
            items = line.strip().split()
            # Blank lines and '#'-prefixed directive/comment lines carry no
            # coordinates and would break the unpacking below.
            if not items or items[0].startswith('#'):
                continue
            scaf = items[0]
            i, j = map(int, items[3:5])
            # NOTE(review): range(i, j) leaves position j out; GFF3 end
            # coordinates are inclusive -- confirm whether j + 1 is intended.
            # update() grows the existing set in place instead of building a
            # new union for every record.
            repdict.setdefault(scaf, set()).update(range(i, j))
    cmn.pickle_write(repdict, dn)
    return repdict
        return [char1, char1]
    else:
        char1, char2 = flist[-2:]
        print(count_dict, char1, char2, count_dict, cutoff, 'twochars')
        return [char1, char2]


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Command-line arguments: two pickle files, named in the usage text as the
# BLAST-by-exon dictionary and the reads-by-species dictionary.
try:
    fblast, fread = sys.argv[1:3]
except ValueError:
    # Unpacking the slice raises ValueError when fewer than two arguments
    # were supplied; nothing else is expected to fail here.
    print('*.py blastExon.dict.pkl readSp.dict.pkl', file=sys.stderr)
    # NOTE(review): exits with status 0 even though this is a usage error.
    sys.exit()

blast_dict = cmn.pickle_read(fblast)

#reads[sp][name] = seq
read_dict = cmn.pickle_read(fread)
# Top-level keys of the read dictionary (species, per the layout note above).
splist = set(read_dict.keys())

# Read in the ranges for the exons.
# Layout sketch kept from the original note: {'COX1': readID: {2:(1,2,3)}}
# TODO: fill in missing

# Accumulators filled while walking blast_dict; stack_dict gets one entry per
# exon key below.  NOTE(review): exon_lengths is presumably keyed by exon as
# well -- confirm against the rest of the loop.
stack_dict = {}
exon_lengths = {}
for exon in blast_dict:
    # Per-exon value loaded from the BLAST pickle.
    info = blast_dict[exon]
    # Start an empty entry for this exon.
    stack_dict[exon] = {}
示例#3
0
        sys.exit()

    cwd = os.getcwd()

    #subsetIDs = set(cmn.getid(fsubset))
    subsetJobs = set([
        cmn.lastName(line.replace('sbatch', '').strip())[4:-4]
        for line in cmn.file2lines(fsubset)
    ])

    #1. read in info
    fsams = cmn.cmd2lines('ls %s/*/*/*.sam' % mapdir)
    #print fsams
    samdirs = set(['/'.join(fsam.split('/')[:-2]) for fsam in fsams])
    #print samdirs
    require_refs = cmn.pickle_read(freq)

    fq_dict = {}
    refdict = {}
    #1. tell by reftable
    #make the requirement by the reftable
    for line in cmn.file2lines(freftable):
        items = line.strip().split()
        sp = items[0]
        fastqs = items[1].split(',')
        fq_dict[sp] = fastqs

    # check for reference
    #2. tell by best mapping
    for samdir in samdirs:
        sp = cmn.lastName(samdir)
示例#4
0
    sys.path.append(python_lib)

import cmn
import ete3

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':
    #options=parse_options()
    # Arguments: argv[1] is the tree input handed to ete3.Tree, argv[2] is a
    # pickle holding the mapping from current node names to new names.
    try:
        fn = sys.argv[1]
        fname = sys.argv[2]
    except IndexError:
        # Only a missing argument is expected here; a bare except would also
        # swallow KeyboardInterrupt and SystemExit.
        print("Usage: *.py", file=sys.stderr)
        # NOTE(review): exits with status 0 even though this is a usage error.
        sys.exit()

    nameDict = cmn.pickle_read(fname)

    t = ete3.Tree(fn)

    # Rename every node yielded by iterating the tree; a name missing from
    # nameDict raises KeyError rather than being silently kept.
    for node in t:
        node.name = nameDict[node.name]

    # Write the renamed tree to '<cmn.lastName(fn)>.mapnamed'.
    dn = cmn.lastName(fn) + '.mapnamed'
    cmn.write_file(t.write(), dn)
示例#5
0
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':
    #options=parse_options()
    # The single argument is the input file read line by line below (a
    # pileup file, per the usage text).
    try:
        fn = sys.argv[1]
    except:
        print("Usage: *.py 1708_mapped.pileup", file=sys.stderr)
        sys.exit()

    new = []
    # Load the coding-region indexes if the pickle can be read; on any
    # failure report it and continue with an empty set.
    try:
        coding = cmn.pickle_read('coding.indexes.pkl')
    except:
        print('do not find index file for coding region')
        print('would not label coding positions')
        coding = set([])

    with open(fn) as fp:
        for line in fp:
            # A complete record splits into exactly six whitespace-separated
            # fields; any other field count makes the unpack raise.
            try:
                scaffold, index, ref_base, count, read_stack, qual_stack = line.strip(
                ).split()
            except:
                # Coverage is 0: the line did not unpack into six fields, so
                # only fields 2-4 are used and the position is reported as
                # low coverage.
                items = line.strip().split()
                index, ref_base, count = items[1:4]
                print(index, ref_base, count, '0 lowCoverage(0)')
示例#6
0
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':
    #options=parse_options()
    # All command-line arguments are taken as input file names.
    # NOTE(review): slicing sys.argv never raises, so the except branch below
    # is unreachable and an empty argument list is not reported.
    try:
        fns = sys.argv[1:]
    except:
        print("Usage: *.py read_filelist", file=sys.stderr)
        sys.exit()

    # Read in the data: a pickled dict whose values are iterables of text
    # lines, keyed by what the loop below calls sp.
    fdict = 'blastBySp.dict.pkl'
    sp_dict = cmn.pickle_read(fdict)

    # Map each read ID (third whitespace-separated field of a line) to the
    # key it was listed under; a repeated read ID keeps the last key seen.
    good_IDs = {}
    for sp in sp_dict:
        lines = sp_dict[sp]
        for line in lines:
            readID = line.split()[2]
            print('readID', readID)
            good_IDs[readID] = sp

    # Get the reads and split them into exons.
    #fns = cmn.getid(fn)

    rdict = {}
    for fn in fns: