# NOTE(review): whitespace-collapsed excerpt, kept byte-identical on purpose.
# It opens with the tail of a function whose header is not visible here (an
# `else:` branch that prints "No sequence for: <query_name>", followed by
# `return(seqs)`), so the original nesting cannot be restored without guessing.
# The remainder is script-level setup, in this order:
#   * open the optional log file (args.logfile, mode "w+") into `loghandle`
#     (stays None when no logfile is given);
#   * print the number of entries in `contigs`;
#   * build `scafs = Longreads(args.inputfiles, blacklist, args.linename)`, then
#     call turn_longreads_around(logging=loghandle), sort_by_starts() and
#     filter_contigs_by_coverage(0.5, ignore_ends=True, verbose=False);
#   * print the number of entries in `scafs.lreads`;
#   * start reading args.contigstringfile, skipping lines that begin with "#".
# The final `if len(line.split()) == 2:` has no visible body.
# NOTE(review): `loghandle` is not closed anywhere in the visible code — confirm
# it is closed later in the script.
else: print("No sequence for: " + rec.query_name) #print("rpos: " + str(rpos) + " qpos: " + str(qpos)) return(seqs) #sys.exit() # Logfile loghandle = None if args.logfile: loghandle = open(args.logfile,"w+") print("Nr. of Contigs: " + str(len(contigs))) scafs = Longreads(args.inputfiles, blacklist, args.linename) #scafs.filter_low_quality_contigs(0.76) scafs.turn_longreads_around(logging=loghandle) scafs.sort_by_starts() scafs.filter_contigs_by_coverage(0.5,ignore_ends=True, verbose = False) print("Nr. of reads: " + str(len(scafs.lreads))) # Parse Recipe File needed_reads = set() items = [] with open(args.contigstringfile) as f: for line in f: if line.startswith("#"): continue if len(line.split()) == 2:
parser.add_argument("--blacklistfile", help="File containing long read ids where certain contig mappings should be ignored.") args = parser.parse_args() blacklist = defaultdict(list) if args.blacklistfile: with open(args.blacklistfile) as f: for line in f: sline = line.split() if sline[0] == "contig": blacklist[sline[1]] = "y" else: blacklist[sline[0]].append(sline[1]) #print(set([args.lrid])) scafs = Longreads(args.inputfiles, blacklist, args.linename, whitelist_lreads=set([args.lrid])) scafs.turn_longreads_around() scafs.sort_by_starts() #print(scafs.lreads) lrseqs = dict() for read in SeqIO.parse(args.sequencefile, "fastq"): lrseqs[read.id] = str(read.seq) ctg1, ctg2 = args.contigs.split("-") read = scafs.lreads[args.lrid] sc = 0 ec = 0 for ctg in read["maps"]: if ctg["name"] == ctg1: #print("\t".join([ctg["name"], str(ctg["scr"]), str(ctg["ecr"]), str(ctg["scc"]), str(ctg["ecc"])]))
# NOTE(review): whitespace-collapsed excerpt, kept byte-identical on purpose.
# It opens inside the body of a function whose header is not visible here, so
# the original nesting cannot be restored without guessing. The visible tail of
# that function:
#   * appends the "scc", "ecc" and "strand" values of ctgs[ctgn][ctgidx] to the
#     lists sccs / eccs / strands;
#   * builds `newctg` from the rounded means of those lists, with
#     "scr" = round(mean(cluster)) + round(mean(sccs)) and
#     "ecr" = round(mean(cluster)) + round(mean(eccs));
#   * appends `newctg` to p["maps"] and finally returns `pseudolongreads`.
# The remainder is script-level code: build
# `scafs = Longreads(args.inputfiles, blacklist, args.linename)`, call
# filter_small_contigs(300), filter_reverse_small_contigs(600),
# filter_low_quality_contigs(0.81), turn_longreads_around(), sort_by_starts()
# and filter_contigcounts(args.mincontigs), print the number of entries in
# scafs.lreads, set `status = 0`, and start a loop of 10 iterations whose first
# visible step is `scafs.pseudoalign_all()` (returning lr_scores, lr_dists).
sccs.append(ctgs[ctgn][ctgidx]["scc"]) eccs.append(ctgs[ctgn][ctgidx]["ecc"]) strands.append(ctgs[ctgn][ctgidx]["strand"]) newctg = { "strand": round(mean(strands)), "name": ctgn, "scc": round(mean(sccs)), "ecc": round(mean(eccs)), "scr": round(mean(cluster)) + round(mean(sccs)), "ecr": round(mean(cluster)) + round(mean(eccs)) } p["maps"].append(newctg) return pseudolongreads scafs = Longreads(args.inputfiles, blacklist, args.linename) #scafs.filter_whitelist_ctgs(set(["1115APD"])) scafs.filter_small_contigs(300) scafs.filter_reverse_small_contigs(600) scafs.filter_low_quality_contigs(0.81) scafs.turn_longreads_around() scafs.sort_by_starts() scafs.filter_contigcounts(args.mincontigs) #scafs.copy() print("Nr. of reads: " + str(len(scafs.lreads))) status = 0 for iteration in range(10): print("Pseudoaligning all... ", end="") lr_scores, lr_dists = scafs.pseudoalign_all()
# Sniff the input format from the first line of `f`: a line starting with ">"
# leaves `format` untouched; anything else marks the input as PAF and the cell
# line is taken from the first run of capital letters in the line's first
# whitespace-separated field. Both paths leave the loop after one line.
# NOTE(review): `format` shadows the builtin of the same name; it is not renamed
# here because code outside this excerpt may read it.
# NOTE(review): `m` is None when the first field has no capital letter, which
# makes `m.group(0)` raise AttributeError — confirm inputs always match.
for line in f:
    if line.startswith(">"):
        break
    format = "paf"
    celllinestr = line.split()[0]
    m = re.search('[A-Z]+', celllinestr)
    cellline = m.group(0)
    print("cell line detected: " + cellline)
    break

if format == "fasta":
    # Store every FASTA record as an upper-cased sequence string keyed by id.
    for read in SeqIO.parse(args.inputfile, "fasta"):
        seqs[read.id] = str(read.seq).upper()
elif format == "paf":
    scaf = Longreads.init_from_reverse_paf(args.inputfile)
    scaf.sort_by_starts()
    lread = scaf.lreads
else:
    print("Problem! Format unknown")
    # Exit with a non-zero status: a bare sys.exit() reports success (0) to the
    # calling shell even though the run failed.
    sys.exit(1)

# Drawing setup.
image = LongReadSVG(args.output, zoom=800)
dwg = image.dwg
ypad = 7
xpad = 20
col1 = "black"
col2 = "lightgrey"
col = col1
# Optional whitelists: one stripped entry per line of the given file. Only one
# of the two files is read; args.whitelist takes precedence.
whitelist_ctgs = set()
whitelist_lreads = set()
if args.whitelist:
    with open(args.whitelist) as f:
        for line in f:
            whitelist_ctgs.add(line.strip())
elif args.whitelist_lrs:
    with open(args.whitelist_lrs) as f:
        for line in f:
            whitelist_lreads.add(line.strip())

# Sequence length of every record in the contig FASTA, keyed by record id.
contigs = {}
for read in SeqIO.parse(args.contigfile, "fasta"):
    contigs[read.id] = len(read.seq)

scafs = Longreads(args.inputfiles, blacklist, args.linename, whitelist_lreads)
if whitelist_ctgs:
    scafs.filter_whitelist_ctgs(whitelist_ctgs)

# The same threshold is applied before and after turning/sorting the reads.
min_contigs = int(args.mincontigs)
scafs.filter_contigcounts(min_contigs)

reverse_mappers = {"344DBB", "472DBB"}
scafs.turn_longreads_around(reverse_mappers)
scafs.sort_by_starts()
#scafs.filter_small_contigs(300)
#scafs.filter_overlapped_contigs(0.5)
scafs.filter_contigcounts(min_contigs)
scafs.print_ids()

print("Reads meeting criteria: " + str(len(scafs.lreads)))
print("Pseudoaligning all...")
# Record the sequence length of every entry in the contig FASTA.
for read in SeqIO.parse(args.contigfile, "fasta"):
    contigs[read.id] = len(read.seq)
print("Nr. of scaffolds: " + str(len(contigs)))

# Build the blacklist handed to Longreads. Each non-empty line of the file is
# split on whitespace:
#   "contig <name>"  -> blacklist[<name>] = "y"
#   "<key> <value>"  -> <value> is appended to the list stored under <key>
blacklist = defaultdict(list)
if args.blacklistfile:
    with open(args.blacklistfile) as f:
        for line in f:
            sline = line.split()
            if not sline:
                # A blank line (e.g. a trailing newline at the end of the
                # file) used to raise IndexError on sline[0]; skip it.
                continue
            if sline[0] == "contig":
                blacklist[sline[1]] = "y"
            else:
                blacklist[sline[0]].append(sline[1])

lrs = Longreads(args.inputfiles, blacklist, args.linename)
lrs.filter_contigcounts(2)
lrs.turn_longreads_around()
lrs.sort_contigs_in_reads()
greads = lrs.lreads

# cluster np-reads
print("scaffolding long reads ....")
contig2cluster = {}
creads = {}
clusternr = 0
while len(greads) > 0:
    clusternr += 1
    current_cluster = {}
    current_contigs = set()
    # take a random read and build a cluster from it
reads = {}
cgreads = []

# Build the blacklist handed to Longreads. Each non-empty line of the file is
# split on whitespace:
#   "contig <name>"  -> blacklist[<name>] = "y"
#   "<key> <value>"  -> <value> is appended to the list stored under <key>
blacklist = defaultdict(list)
if args.blacklistfile:
    with open(args.blacklistfile) as f:
        for line in f:
            sline = line.split()
            if not sline:
                # A blank line (e.g. a trailing newline at the end of the
                # file) used to raise IndexError on sline[0]; skip it.
                continue
            if sline[0] == "contig":
                blacklist[sline[1]] = "y"
            else:
                blacklist[sline[0]].append(sline[1])

lrs = Longreads(args.inputfiles, blacklist, args.linename)
lrs.filter_contigcounts(2)
lrs.filter_small_contigs(300)
#lrs.filter_reverse_small_contigs(600)
reverse_mappers = {"344DBB", "472DBB"}
lrs.turn_longreads_around(reverse_mappers)
lrs.sort_by_starts()
contig2lrid = lrs.ctg2lreads


def get_full_name(short_ctgn):
    # NOTE(review): only the first statements of this function are visible
    # here; for names containing "_" it takes the part before the first "_".
    if "_" in short_ctgn:
        nr = short_ctgn.split("_")[0]