def extract_features(batcher, ref_file, binpath='minimap2', nthread=3, minlen=29000):
    """
    Stream output from JSON.xz file via load_gisaid() into minimap2
    via subprocess.

    :param batcher: generator, returned by batch_fasta()
    :param ref_file: str, path to reference genome (FASTA format)
    :param binpath: str, path to minimap2 binary executable
    :param nthread: int, number of threads to run minimap2
    :param minlen: int, minimum genome length
    :yield: dict, record augmented with genetic differences and missing sites
    """
    # reference length fixes the coordinate system for encode_diffs()
    with open(ref_file) as handle:
        reflen = len(convert_fasta(handle)[0][1])

    for fasta, batch in batcher:
        mm2 = minimap2.minimap2(fasta, ref_file, stream=True, path=binpath,
                                nthread=nthread, minlen=minlen)
        # zip the diff generator directly against the batch instead of
        # materializing every result in memory first; minimap2 output is
        # assumed to pair 1:1 and in order with the records in `batch`
        for row, record in zip(minimap2.encode_diffs(mm2, reflen=reflen), batch):
            # reconcile minimap2 output with GISAID record
            qname, diffs, missing = row
            record.update({'diffs': diffs, 'missing': missing})
            yield record
def process_fasta(self, handle, ref_file, binpath, nthread, minlen=29000):
    """
    Run all genomes in local FASTA file through Pangolin

    :param handle: file object, opened in read mode
    :param ref_file: str, path to FASTA file containing reference genome
    :param binpath: str, path to minimap2 binary
    :param nthread: int, number of threads to run minimap2
    :param minlen: int, reject genomes below this threshold
    :yield: tuple, header and lineage
    """
    # fix: close the reference file instead of leaking the handle
    # (matches the `with open(...)` convention used elsewhere in this file)
    with open(ref_file) as ref_handle:
        reflen = len(seq_utils.convert_fasta(ref_handle)[0][1])

    mm2 = minimap2(handle, ref_file, stream=False, path=binpath,
                   nthread=nthread, minlen=minlen)
    for header, aligned in stream_fasta(mm2, reflen=reflen):
        lineage = self.classify(aligned)
        yield header, lineage
def retrieve_genomes(db="data/gsaid.db", stream=False, nthread=1,
                     ref_file='data/MT291829.fa', misstol=300, callback=None):
    """
    Query database for Pangolin lineages and then retrieve the earliest
    sampled genome sequence for each.  Export as FASTA for TreeTime analysis.

    :param db: str, path to sqlite3 database
    :param stream: bool, passed through to minimap2()
    :param nthread: int, number of threads to run minimap2
    :param ref_file: str, path to reference genome (FASTA format)
    :param misstol: int, maximum tolerated number of uncalled bases per genome
    :param callback: function, optional logging callback (may be None)
    :return: dict, {header: sequence} with headers in '|{lineage}|{coldate}'
             format expected by treetime()
    """
    # load and parse reference genome
    with open(ref_file) as handle:
        _, refseq = convert_fasta(handle)[0]
    reflen = len(refseq)

    # allocate lists
    coldates = []
    lineages = []
    seqs = []

    # retrieve unaligned genomes from database
    for lineage, fasta_file in dump_raw_by_lineage(db, callback=callback):
        mm2 = minimap2(infile=fasta_file, nthread=nthread, stream=stream,
                       ref=ref_file)
        gen = encode_diffs(mm2, reflen=reflen)

        qname = None
        for row in filter_outliers(gen):
            # exclude genomes too divergent from expectation
            if total_missing(row) > misstol:
                continue
            # take the earliest valid genome
            qname, _, _ = row
            _, coldate = parse_label(qname)
            break

        if qname is None:
            # none of the genomes were selected
            # fix: guard the callback — the original called it
            # unconditionally here and crashed when callback=None
            if callback:
                callback("no genome passed filters for lineage {}".format(
                    lineage))
            continue

        if callback:
            callback("selected genome {} for lineage {}".format(
                qname, lineage))

        # update lists
        lineages.append(lineage)
        coldates.append(coldate)

        # reconstruct aligned sequence from the feature vector selected above
        seq = apply_features(row, refseq=refseq)
        seqs.append(seq)

        # clean up temporary files
        os.remove(fasta_file.name)

    # generate new headers in {name}|{accession}|{date} format expected
    # by treetime() (name field deliberately left empty)
    headers = map(lambda xy: '|{}|{}'.format(*xy), zip(lineages, coldates))
    return dict(zip(headers, seqs))
treetime.parse_nexus(nexus_file, fasta, date_tol=args.datetol) # -> treetime.nwk # Retrieve raw genomes from DB, align and extract features cb.callback("Retrieving raw genomes from database") with open(args.ref) as handle: _, refseq = seq_utils.convert_fasta(handle)[0] reflen = len(refseq) mask = seq_utils.load_vcf(args.vcf) data = {} for lineage, fasta_file in db_utils.dump_raw_by_lineage( args.db, callback=cb.callback): mm2 = minimap2.minimap2(infile=fasta_file, nthread=args.mmthreads, ref=args.ref) gen = minimap2.encode_diffs(mm2, reflen=reflen) features = [] # excludes genomes too divergent from expectation for row in seq_utils.filter_outliers(gen): if seq_utils.total_missing(row) > args.misstol: # too many uncalled bases continue features.append(row) cb.callback("{} sequences of lineage {} passed filters".format( len(features), lineage)) # remove problematic sites from feature vectors features = seq_utils.filter_problematic(features, mask=mask)