def main():
    """Collapse the exon/UTR features of a GTF file into one BED12 line per gene.

    Reads the GTF named by ``sys.argv[1]`` and writes to ``sys.argv[2]``.
    Features whose type is ``exon`` or contains ``utr`` are grouped by
    ``gene_id``; overlapping or directly adjacent features of one gene are
    merged into blocks.  Output columns are: chrom, start, end, gene_id,
    gene_name (placed in the score column), strand, thickStart, thickEnd,
    the fixed colour ``255,0,0``, block count, block sizes, block starts.

    GTF coordinates (1-based, inclusive) are converted to BED coordinates
    (0-based, half-open), and block starts are written relative to the
    gene start, as the BED12 layout requires.
    """
    print_help()
    re_gene_name = re.compile(r'gene_name \"([\w\.\-\_\(\)\[\]\'\:]+)\"')
    re_gene_id = re.compile(r'gene_id \"([\w\.\-\_\(\)\[\]\'\:]+)\"')
    file_bed12 = bb.fun_open_file(sys.argv[2], "w")
    try:
        dict_exons = {}
        for l in bb.fun_open_file(sys.argv[1], "r"):
            line = l.strip().split("\t")
            if line[0] == "":
                # A blank line terminates the input.
                break
            if line[0][0] == "#":
                continue
            gi = re_gene_id.findall(line[8])
            if not gi:
                continue
            if line[2] != "exon" and not re.search("utr", line[2]):
                continue
            # Fall back to the gene id when the record has no gene_name.
            gn = re_gene_name.findall(line[8]) or gi
            # Shift the start by one: GTF is 1-based, BED is 0-based.
            dict_exons.setdefault(gi[0], []).append(
                [line[0], int(line[3]) - 1, int(line[4]), line[6], gn[0]])

        for gene_id in dict_exons:
            features = dict_exons[gene_id]
            features.sort()
            chrom, gene_start, block_end, strand = features[0][:4]
            block_start = gene_start
            block_sizes = []
            block_starts = [0]
            for feature in features[1:]:
                if feature[1] > block_end:
                    # Gap before this feature: close the current block.
                    block_sizes.append(block_end - block_start)
                    block_start, block_end = feature[1], feature[2]
                    block_starts.append(block_start - gene_start)
                else:
                    # Overlapping or adjacent: extend the current block.
                    block_end = max(block_end, feature[2])
            block_sizes.append(block_end - block_start)

            file_bed12.write(
                "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{1}\t{2}\t255,0,0\t{6}\t".format(
                    chrom, gene_start, block_end, gene_id, features[-1][4],
                    strand, len(block_sizes)))
            file_bed12.write("".join(str(size) + "," for size in block_sizes))
            file_bed12.write("\t")
            file_bed12.write("".join(str(start) + "," for start in block_starts))
            file_bed12.write("\n")
    finally:
        file_bed12.close()
Exemplo n.º 2
0
def read_exon_intron():
    """Build per-name exon and intron intervals from the query BED file.

    Every line of ``PATH_QUERY_BED`` contributes one exon to the entry
    named by its fourth column.  Exon coordinates are shifted so that the
    smallest start seen for that name becomes 0.  Introns are the gaps
    between consecutive sorted exons, expressed as inclusive
    ``[previous_end + 1, next_start - 1]`` pairs.

    Returns:
        dict mapping name -> ``{"exon": [[start, end], ...],
        "intron": [[start, end], ...]}``.
    """
    # The file is read once and closed; the PSL handle that used to be
    # opened here was never read, so it is no longer opened.
    records = []
    file_pi = bb.fun_open_file(PATH_QUERY_BED)
    try:
        for l in file_pi:
            line = l.strip().split()
            if not line:
                continue
            records.append((line[3], int(line[1]), int(line[2])))
    finally:
        file_pi.close()

    # Smallest start per name, used as the origin of the relative coordinates.
    dict_min = {}
    for name, start, _ in records:
        if name not in dict_min or start < dict_min[name]:
            dict_min[name] = start

    dict_exon_intron = {}
    for name, start, end in records:
        entry = dict_exon_intron.setdefault(name, {"exon": [], "intron": []})
        origin = dict_min[name]
        entry["exon"].append([start - origin, end - origin])

    for entry in dict_exon_intron.values():
        exons = entry["exon"]
        exons.sort()
        for previous, following in zip(exons, exons[1:]):
            entry["intron"].append([previous[1] + 1, following[0] - 1])
    return dict_exon_intron
Exemplo n.º 3
0
def _write_batch_and_close(path_in, dict_files):
    """Copy each line of *path_in* to the handle keyed by its first field, then close all handles."""
    for l in bb.fun_open_file(path_in, "r"):
        fields = l.split()
        if not fields:
            continue
        handle = dict_files.get(fields[0])
        if handle is not None:
            handle.write(l)
    for handle in dict_files.values():
        handle.close()


def main():
    """Split a file into one ``<name>.bed2`` file per name listed in a second file.

    ``sys.argv[1]`` is the input to split, ``sys.argv[2]`` lists the names
    (first whitespace-separated field of each line) and ``sys.argv[3]`` is
    the output directory.  A line of the input goes to ``<name>.bed2`` when
    its first field equals that name; lines matching no listed name are
    dropped.

    Output files are handled in batches of 800 because the number of
    simultaneously open files is limited (1024 on Linux): once 800 outputs
    are open, the input is scanned, the batch is written and closed, and the
    batch is forgotten so later scans never touch closed handles.
    """
    print_help()
    batch_size = 800
    dir_out = sys.argv[3].rstrip("/") + "/"
    dict_files = {}
    for l in bb.fun_open_file(sys.argv[2], "r"):
        line = l.strip().split()
        if not line:
            continue
        dict_files[line[0]] = bb.fun_open_file(dir_out + line[0] + ".bed2", "w")
        if len(dict_files) >= batch_size:
            _write_batch_and_close(sys.argv[1], dict_files)
            dict_files = {}
    if dict_files:
        _write_batch_and_close(sys.argv[1], dict_files)
Exemplo n.º 4
0
def main():
    """Write a per-region signal matrix computed with ``bigWigSummary``.

    Arguments are taken from ``sys.argv``: [1] bigWig file, [2] region file
    (chrom, start, end, name, <unused>, strand), [3] output path, [4] number
    of data points per region.  For each region ``bigWigSummary`` is run and
    the first line of its output becomes the row for that region name; when
    it prints nothing the row is filled with "0".  "n/a" values are written
    as "0" and rows of "-" strand regions are reversed.
    """
    print_help()
    matrix = {}
    strand = {}
    for l in bb.fun_open_file(sys.argv[2]):
        line = l.strip().split()
        strand[line[3]] = line[5]
        # An argument list (no shell) keeps unusual characters in region
        # fields or paths from being interpreted by a shell.
        cmd = ["bigWigSummary", sys.argv[1], line[0], line[1], line[2],
               sys.argv[4]]
        res = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                               universal_newlines=True)
        # communicate() drains the pipe and reaps the child process.
        stdout_text = res.communicate()[0]
        sig = stdout_text.split("\n", 1)[0].split()
        if sig:
            matrix[line[3]] = sig
        else:
            matrix[line[3]] = ["0"] * int(sys.argv[4])

    file_out = bb.fun_open_file(sys.argv[3], "w")
    try:
        for gn in matrix:
            list_out = ["0" if sig == "n/a" else sig for sig in matrix[gn]]
            if strand[gn] == "-":
                list_out = list_out[::-1]
            file_out.write(gn + "\t" + "\t".join(list_out) + "\n")
    finally:
        file_out.close()
def main():
    """Build a binned, depth-normalised coverage matrix with ``bedtools coverage -d``.

    The alignment file is prepared by ``prepare_align_file``, genes are read
    by ``read_gene_list`` and ``bedtools coverage -d`` is streamed line by
    line.  Column 8 of each output line is summed over ``args.resolution``
    consecutive lines of a gene; each bin is stored as
    ``sum / resolution * 1e6 / factor`` where ``factor`` is the number of
    lines in the alignment file.  A gene's trailing partial bin is also
    stored, including that of the final gene in the stream.  Rows are
    written to ``args.output`` in gene-list order, reversed for genes whose
    strand is not "+".
    """
    import glob
    import os

    args = get_args()
    args.align = prepare_align_file(args)
    list_gene = read_gene_list(args)
    dict_matrix = dict((gn, []) for gn in list_gene)
    cmd = ["bedtools", "coverage", "-d", "-a", args.gene_position, "-b", args.align]
    bb.fun_print("run bedtools coverage", "green", "black", 1)
    ret = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    factor = sum(1 for _ in bb.fun_open_file(args.align))
    print("unique mapped reads number: %s" % factor)

    def scaled(total):
        # NOTE(review): partial bins are also divided by the full
        # resolution, not by the number of positions they hold — confirm
        # this is intended.
        return str(float(total) / args.resolution * 1000000 / factor)

    c = 0
    r = 0
    name = "0"
    bb.fun_print("make matrix", "green", "black", 1)
    tt = 0
    for l in ret.stdout:
        tt += 1
        if tt % 1000000 == 0:
            bb.fun_print("%s million lines processed......" % (tt // 1000000))
        line = l.strip().split()
        if line[3] != name and name != "0" and r != 0:
            # A new gene starts: flush the previous gene's partial bin.
            dict_matrix[name].append(scaled(c))
            r = 0
            c = 0
        name = line[3]
        r += 1
        c += int(line[7])
        if r == args.resolution:
            dict_matrix[name].append(scaled(c))
            r = 0
            c = 0
    if name != "0" and r != 0:
        # Flush the last gene's partial bin, as is done for every other gene.
        dict_matrix[name].append(scaled(c))
    ret.stdout.close()
    ret.wait()

    file_matrix = bb.fun_open_file(args.output, "w")
    try:
        for gn in list_gene:
            values = dict_matrix[gn]
            if DICT_STRAND[gn] != "+":
                values = values[::-1]
            file_matrix.write(gn + "".join("\t" + dp for dp in values) + "\n")
    finally:
        file_matrix.close()

    if args.format in ["sam", "bam"]:
        # check_call with a plain string and no shell tried to execute a
        # program literally named "rm <path>_temp*"; delete the files directly.
        for path_temp in glob.glob("%s_temp*" % args.align):
            os.remove(path_temp)
Exemplo n.º 6
0
def main():
    """Run the conservation pipeline and write ``<sp1>_<sp2>.conserve.tab``.

    Steps, in order: load the query records, map them with ``psl_map``,
    derive exon/intron intervals and their lengths, compute coverage, find
    orthologs, collect synteny windows and run lastz on them.  Records that
    end up neither orthologous nor syntenic get a category-0 placeholder
    row.  All rows gathered in ``LIST_OUT`` are written tab-separated, then
    the ``temp*`` files in ``PATH_OUT`` are removed.
    """
    # NOTE(review): the usage text lists four mandatory arguments but only
    # one is checked for here — confirm against global_para().
    if len(sys.argv) < 2:
        bb.fun_print_help("species_query", "species_target", "path_out", "chain_files(eg: rn5ToRn6,rn6ToMm10)","[query.ns.bed]",
                "[query.bed]")
    global_para()
    if not os.path.exists(PATH_OUT):
        bb.fun_print("create out path......", "red", "black", 1)
        os.mkdir(PATH_OUT)

    records = get_pi()
    psl_map()
    regions = read_exon_intron()
    write_exon_intron_length(regions)
    coverage = calculate_coverage(regions)
    orthologs = find_ortho_pi(coverage)
    synteny = get_synteny(records, orthologs)
    run_lastz(synteny, records, coverage)

    for name in records:
        if name in orthologs or name in synteny:
            continue
        LIST_OUT.append([name, 0, "none", "0", "0", "0", "chrNA", 1, 1, "+"])

    path_table = "{0}/{1}_{2}.conserve.tab".format(PATH_OUT, SPECIES1, SPECIES2)
    file_out = bb.fun_open_file(path_table, "w")
    for row in LIST_OUT:
        tail = "".join("\t" + str(value) for value in row[1:])
        file_out.write(row[0] + tail + "\n")
    file_out.close()
    os.system("rm {0}/temp*".format(PATH_OUT))
Exemplo n.º 7
0
def calculate_coverage(dict_exon_intron):
    """Compute exon and intron coverage for every alignment in the PSL file.

    For each line of ``<PATH_OUT>/<SPECIES1>_<SPECIES2>.psl`` the aligned
    blocks are rebuilt from columns 19 (sizes) and 20 (starts) as inclusive
    intervals and intersected with the exon and intron intervals stored for
    the name in column 10.

    Args:
        dict_exon_intron: name -> {"exon": [[start, end], ...],
            "intron": [[start, end], ...]} with inclusive intervals.

    Returns:
        dict mapping name -> [exon_fraction, intron_fraction], both as
        strings; the intron value is "-1" when the name has no intron
        bases.  A later line with the same name replaces the earlier one.
    """
    def total_length(intervals):
        return sum(iv[1] - iv[0] + 1 for iv in intervals)

    def total_overlap(blocks, intervals):
        shared = 0
        for block in blocks:
            for iv in intervals:
                if block[1] >= iv[0] and block[0] <= iv[1]:
                    shared += min(block[1], iv[1]) - max(block[0], iv[0]) + 1
        return shared

    path_psl = "{0}/{1}_{2}.psl".format(PATH_OUT, SPECIES1, SPECIES2)
    dict_cover = {}
    for row in bb.fun_open_file(path_psl):
        fields = row.strip().split()
        pi = fields[9]
        sizes = fields[18].strip(",").split(",")
        starts = fields[19].strip(",").split(",")
        blocks = []
        for idx, size in enumerate(sizes):
            begin = int(starts[idx])
            blocks.append([begin, begin + int(size) - 1])

        exons = dict_exon_intron[pi]["exon"]
        introns = dict_exon_intron[pi]["intron"]
        exon_length = total_length(exons)
        intron_length = total_length(introns)
        exon_over = total_overlap(blocks, exons)
        intron_over = total_overlap(blocks, introns)

        exon_cover = float(exon_over) / float(exon_length)
        if intron_length != 0:
            intron_cover = float(intron_over) / float(intron_length)
        else:
            intron_cover = -1
        dict_cover[pi] = [str(exon_cover), str(intron_cover)]
    return dict_cover
Exemplo n.º 8
0
def read_chrom(path):
    """Read a whitespace-separated table into ``{first column: int(second column)}``.

    When a key occurs on several lines, the last line wins.
    """
    rows = (text.strip().split() for text in bb.fun_open_file(path))
    return dict((fields[0], int(fields[1])) for fields in rows)
Exemplo n.º 9
0
def main():
    """Rewrite the FASTQ named by ``sys.argv[1]`` into ``sys.argv[2]``.

    Lines starting with "@" or "+" are copied unchanged, lines starting with
    "!" are copied without that first character, and every other line is
    passed through ``decode`` and written followed by a newline.  The
    ``RECODE`` table (digit -> base -> base) is published as a module
    global before any line is decoded.  A dot is printed to stdout every
    100000 input lines as a progress indicator.
    """
    # Both the input and the output path are required.
    if len(sys.argv) < 3:
        bb.fun_print_help("in.fastq", "out.fastq")
    global RECODE
    RECODE = {
        "0": {"A": "A", "C": "C", "G": "G", "T": "T"},
        "1": {"A": "C", "C": "A", "G": "T", "T": "G"},
        "2": {"A": "G", "C": "T", "G": "A", "T": "C"},
        "3": {"A": "T", "C": "G", "G": "C", "T": "A"},
    }
    file_solid = bb.fun_open_file(sys.argv[1])
    file_illumina = bb.fun_open_file(sys.argv[2], "w")
    try:
        for count, l in enumerate(file_solid, 1):
            if count % 100000 == 0:
                sys.stdout.write(".")
                sys.stdout.flush()
            if l.startswith(("@", "+")):
                file_illumina.write(l)
            elif l.startswith("!"):
                file_illumina.write(l[1:])
            else:
                file_illumina.write(decode(l.strip()) + "\n")
    finally:
        file_illumina.close()
        file_solid.close()
Exemplo n.º 10
0
def get_pi():
    """Load ``PATH_QUERY_NS_BED`` into a dict keyed by the fourth column.

    Each value is ``[col1, int(col2), int(col3)]`` followed by every column
    after the fourth, unchanged.  A repeated name keeps the last line read.
    """
    records = {}
    for row in bb.fun_open_file(PATH_QUERY_NS_BED):
        fields = row.strip().split()
        name = fields[3]
        records[name] = [fields[0], int(fields[1]), int(fields[2])] + fields[4:]
    return records
Exemplo n.º 11
0
def read_gene_list(args):
    """Return the gene names (fourth column) of ``args.gene_position`` in file order.

    Also rebuilds the module-level ``DICT_STRAND`` mapping each name to the
    sixth column of its line.  ``bb.fun_print_error`` is called when a name
    occurs more than once.
    """
    global DICT_STRAND
    DICT_STRAND = {}
    list_gene = []
    for l in bb.fun_open_file(args.gene_position):
        fields = l.split()
        name = fields[3]
        # DICT_STRAND holds exactly the names seen so far, so it doubles as
        # a constant-time duplicate check.
        if name in DICT_STRAND:
            bb.fun_print_error("there is repetitive names in gene position file, please remove the repeat name")
        list_gene.append(name)
        DICT_STRAND[name] = fields[5]
    return list_gene
Exemplo n.º 12
0
def write_exon_intron_length(dict_exon_intron):
    """Write ``name<TAB>exon_length<TAB>intron_length`` lines to ``<SPECIES1>.exon_intron.len``.

    Lengths are sums of ``end - start + 1`` over the inclusive intervals of
    each entry; a negative intron total is written as 0.
    """
    path_len = "{0}/{1}.exon_intron.len".format(PATH_OUT, SPECIES1)
    file_out = bb.fun_open_file(path_len, "w")
    for pi in dict_exon_intron:
        file_out.write(pi)
        regions = dict_exon_intron[pi]
        exon_length = sum(iv[1] - iv[0] + 1 for iv in regions["exon"])
        intron_length = sum(iv[1] - iv[0] + 1 for iv in regions["intron"])
        file_out.write("\t{0}\t{1}\n".format(exon_length, max(0, intron_length)))
    file_out.close()
Exemplo n.º 13
0
def find_ortho_pi(dict_cover):
    """Intersect the mapped query BED with the target annotation and record the hits.

    Both BED files are first sorted in place by chromosome and start, then
    ``bedtools intersect -sorted -wo -s`` writes the ``.ortho`` file.  For
    every line of that file a category-4 row is appended to ``LIST_OUT``.

    Args:
        dict_cover: name -> [exon_cover, intron_cover].

    Returns:
        list of the fourth column of every ``.ortho`` line, in file order
        (a name appears once per intersecting line).
    """
    sort_command = "sort -k1,1 -k2,2n {0}/{1}_{2}.bed > {0}/t && mv {0}/t {0}/{1}_{2}.bed && sort -k1,1 -k2,2n {3}/{2}/{2}.piRNA.ns.bed > {0}/t && mv {0}/t {3}/{2}/{2}.piRNA.ns.bed".format(PATH_OUT, SPECIES1, SPECIES2, PATH_PI)
    os.system(sort_command)
    intersect_command = "bedtools intersect -sorted -wo -s -a {0}/{1}_{2}.bed -b {3}/{2}/{2}.piRNA.ns.bed > {0}/{1}_{2}.ortho".format(PATH_OUT, SPECIES1, SPECIES2, PATH_PI)
    status = os.system(intersect_command)
    if status != 0:
        bb.fun_print_error("intersectBed Error")

    ortho_names = []
    path_ortho = "{0}/{1}_{2}.ortho".format(PATH_OUT, SPECIES1, SPECIES2)
    for row in bb.fun_open_file(path_ortho):
        fields = row.strip().split()
        name = fields[3]
        target_name = fields[9]
        score = fields[4]
        cover = dict_cover[name]
        LIST_OUT.append([
            name, 4, target_name, score, cover[0], cover[1],
            fields[6], fields[7], fields[8], fields[11],
        ])
        ortho_names.append(name)
    return ortho_names
Exemplo n.º 14
0
def get_synteny(dict_pi, list_ortho):
    """Build +/-150 kb windows around every mapped record that has no ortholog.

    Args:
        dict_pi: name -> [chrom, start, end, <col5>, <col6>, ...] of the query.
        list_ortho: names to skip.

    Returns:
        dict mapping name -> [query_window, target_window] where
        query_window is [chrom, start, end, dict_pi[name][4]] and
        target_window is [chrom, start, end, column 6, column 5] of the
        mapped BED line.  Windows are clipped to 1 and to the chromosome
        size from ``DICT_CHROM1`` / ``DICT_CHROM2``.
    """
    flank = 150000
    windows = {}
    path_bed = "{0}/{1}_{2}.bed".format(PATH_OUT, SPECIES1, SPECIES2)
    for row in bb.fun_open_file(path_bed):
        fields = row.strip().split()
        name = fields[3]
        if name in list_ortho:
            continue
        query = dict_pi[name]
        query_window = [
            query[0],
            max(1, query[1] - flank),
            min(DICT_CHROM1[query[0]], query[2] + flank),
            query[4],
        ]
        target_window = [
            fields[0],
            max(1, int(fields[1]) - flank),
            min(DICT_CHROM2[fields[0]], int(fields[2]) + flank),
            fields[5],
            fields[4],
        ]
        windows[name] = [query_window, target_window]
    return windows
Exemplo n.º 15
0
def run_lastz(dict_synteny, dict_pi, dict_cover):
    """Align each query window to its target window with lastz and classify it.

    For every name in *dict_synteny* the query window, a +/-10 kb window
    around the query record and the target window are written as BED,
    extracted to FASTA with ``bedtools getfasta`` and aligned with lastz.
    One row is appended to ``LIST_OUT`` per name:

    * category 1 when either extracted FASTA file is empty,
    * category 3 when a lastz hit scoring above the cutoff overlaps the
      query record inside its window,
    * category 2 otherwise.

    Args:
        dict_synteny: name -> [query_window, target_window].
        dict_pi: name -> [chrom, start, end, <col5>, <col6>, ...].
        dict_cover: name -> [exon_cover, intron_cover].
    """
    for pi in dict_synteny:
        bb.fun_print("start check conservation for " + pi, "green", "black", 1)
        bb.fun_quick_write(
            "%s\t%s\t%s\t%s\t0\t%s\n" %
            (dict_synteny[pi][0][0], dict_synteny[pi][0][1],
             dict_synteny[pi][0][2], pi, dict_synteny[pi][0][3]),
            "%s/temp1.bed" % PATH_OUT)
        bb.fun_quick_write(
            "%s\t%s\t%s\t%s\t0\t%s\n" %
            (dict_pi[pi][0], max(1, dict_pi[pi][1] - 10000),
             min(DICT_CHROM1[dict_pi[pi][0]],
                 dict_pi[pi][2] + 10000), pi, dict_pi[pi][4]),
            "%s/temp.shuffle1.bed" % PATH_OUT)
        bb.fun_quick_write(
            "%s\t%s\t%s\t%s\t0\t%s\n" %
            (dict_synteny[pi][1][0], dict_synteny[pi][1][1],
             dict_synteny[pi][1][2], pi, dict_synteny[pi][1][3]),
            "%s/temp2.bed" % PATH_OUT)
        command = "bedtools getfasta -fi /data/tongji2/Annotation/Fasta/{0}.fa \
                -fo {1}/temp1.fa -bed {1}/temp1.bed -name -s".format(
            DICT_SP[SPECIES1], PATH_OUT)
        if os.system(command) != 0:
            bb.fun_print_error("getfasta Error")
        command = "bedtools getfasta -fi /data/tongji2/Annotation/Fasta/{0}.fa \
                -fo {1}/temp.shuffle1.fa -bed {1}/temp.shuffle1.bed -name -s".format(
            DICT_SP[SPECIES1], PATH_OUT)
        if os.system(command) != 0:
            bb.fun_print_error("getfasta Error")
        command = "bedtools getfasta -fi /data/tongji2/Annotation/Fasta/{0}.fa \
                -fo {1}/temp2.fa -bed {1}/temp2.bed -name -s".format(
            DICT_SP[SPECIES2], PATH_OUT)
        if os.system(command) != 0:
            bb.fun_print_error("getfasta Error")
        # shuffle
        bb.fun_print("start shuffling......", "blue", "black", font=1)
        shuffle_scores = [0]
        # The shuffle loop runs zero times, so the only background score is
        # the initial 0 and the cutoff below is therefore 0.
        for i in range(0):
            command = "bedtools shuffle -i {0}/temp.shuffle1.bed -g /data/tongji2/Annotation/ChromSize/{1}.chrom.size > {0}/temp.shuffle2.bed".format(
                PATH_OUT, DICT_SP[SPECIES2])
            if os.system(command) != 0:
                bb.fun_print_error("bedtools shuffle Error")
            command = "bedtools getfasta -fi /data/tongji2/Annotation/Fasta/{0}.fa \
                    -fo {1}/temp.shuffle2.fa -bed {1}/temp.shuffle2.bed -name -s".format(
                DICT_SP[SPECIES2], PATH_OUT)
            if os.system(command) != 0:
                bb.fun_print_error("getfasta Error")
            command = "lastz {0}/temp.shuffle1.fa {0}/temp.shuffle2.fa --strand=plus --chain \
                    --output={0}/temp.shuffle.lastz --format=general:score".format(
                PATH_OUT)
            os.system(command)
            file_shuffle = bb.fun_open_file(
                "{0}/temp.shuffle.lastz".format(PATH_OUT))
            lines = file_shuffle.readlines()
            file_shuffle.close()
            if len(lines) == 1:
                # Only the header line: no alignment for this shuffle.
                shuffle_scores.append(0)
            else:
                for record in lines:
                    if record[0] != "#":
                        shuffle_scores.append(float(record.strip()))
        shuffle_scores.sort()
        cutoff = shuffle_scores[int(len(shuffle_scores) * 0.95)]  #p-value=0.05
        bb.fun_print("p 0.05 cutoff mapping score: " + str(cutoff),
                     "blue",
                     "black",
                     font=1)
        # end shuffle
        command = "lastz {0}/temp1.fa {0}/temp2.fa --strand=plus --chain \
                --rdotplot={0}/temp.rplot --output={0}/temp.lastz \
                --format=general:name1,start1,end1,name2,start2,end2,nmatch,identity,score".format(
            PATH_OUT)
        # Both sequences must be non-empty before lastz is run (the second
        # check previously looked at temp1.fa again instead of temp2.fa).
        if os.path.getsize(
                "{0}/temp1.fa".format(PATH_OUT)) and os.path.getsize(
                    "{0}/temp2.fa".format(PATH_OUT)):
            if os.system(command) != 0:
                bb.fun_print_error("lastz Error")
        else:
            LIST_OUT.append([pi, 1, "none", "0", "0", "0", "chrNA", 1, 1, "+"])
            continue
        # NOTE(review): the sorted copy is produced but the unsorted
        # temp.lastz is what gets read below; the overlap test does not
        # depend on hit order.
        os.system("sort -k2,2n {0}/temp.lastz > {0}/temp.sort.lastz".format(
            PATH_OUT))
        file_in = bb.fun_open_file("%s/temp.lastz" % PATH_OUT)
        list_lastz = []
        try:
            for l in file_in:
                if l[0] == "#":
                    continue
                line = l.strip().split()
                if float(line[-1]) > cutoff:
                    list_lastz.append([int(line[1]), int(line[2])])
        finally:
            file_in.close()
        # Position of the query record inside its window, measured from the
        # window end that matches the window's strand.
        start = dict_synteny[pi][0][1]
        end = dict_synteny[pi][0][2]
        if dict_synteny[pi][0][3] == "+":
            pi_start = dict_pi[pi][1] - start
            pi_end = dict_pi[pi][2] - start
        else:
            pi_start = end - dict_pi[pi][2]
            pi_end = end - dict_pi[pi][1]
        covered = any(
            not (pi_end < hit[0] or pi_start > hit[1]) for hit in list_lastz)
        if covered:
            LIST_OUT.append([
                pi, 3, "none", dict_synteny[pi][1][4], dict_cover[pi][0],
                dict_cover[pi][1], dict_synteny[pi][1][0],
                dict_synteny[pi][1][1], dict_synteny[pi][1][2],
                dict_synteny[pi][1][3]
            ])
        else:
            LIST_OUT.append([pi, 2, "none", "0", "0", "0", "chrNA", 1, 1, "+"])