Example #1
def extract_features(batcher,
                     ref_file,
                     binpath='minimap2',
                     nthread=3,
                     minlen=29000):
    """
    Stream output from JSON.xz file via load_gisaid() into minimap2
    via subprocess.

    :param batcher:  generator, returned by batch_fasta()
    :param ref_file:  str, path to reference genome (FASTA format)
    :param binpath:  str, path to minimap2 binary executable
    :param nthread:  int, number of threads to run minimap2
    :param minlen:  int, minimum genome length

    :yield:  dict, record augmented with genetic differences and missing sites
    """
    with open(ref_file) as handle:
        reflen = len(convert_fasta(handle)[0][1])

    for fasta, batch in batcher:
        mm2 = minimap2.minimap2(fasta,
                                ref_file,
                                stream=True,
                                path=binpath,
                                nthread=nthread,
                                minlen=minlen)
        result = list(minimap2.encode_diffs(mm2, reflen=reflen))
        for row, record in zip(result, batch):
            # reconcile minimap2 output with GISAID record
            qname, diffs, missing = row
            record.update({'diffs': diffs, 'missing': missing})
            yield record
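A minimal usage sketch for extract_features(): the docstring references load_gisaid() and batch_fasta(), but their exact signatures are not shown here, so the calls below (including the size argument and file paths) are assumptions for illustration only.

# hedged usage sketch -- load_gisaid(), batch_fasta() and their arguments are assumed
records = load_gisaid('data/provision.json.xz')   # stream records from xz-compressed JSON
batcher = batch_fasta(records, size=100)          # yield (fasta, batch) pairs
for record in extract_features(batcher, ref_file='data/MT291829.fa', nthread=4):
    # every record now carries 'diffs' and 'missing' keys
    print(len(record['diffs']), len(record['missing']))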
Example #2
def process_fasta(self, handle, ref_file, binpath, nthread, minlen=29000):
    """
    Run all genomes in a local FASTA file through Pangolin.

    :param handle:  file object, opened in read mode
    :param ref_file:  str, path to FASTA file containing reference genome
    :param binpath:  str, path to minimap2 binary
    :param nthread:  int, number of threads to run minimap2
    :param minlen:  int, reject genomes below this length threshold
    :yield:  tuple, header and lineage
    """
    # determine reference length from the FASTA file
    with open(ref_file) as ref_handle:
        reflen = len(seq_utils.convert_fasta(ref_handle)[0][1])

    # align query genomes against the reference with minimap2
    mm2 = minimap2(handle, ref_file, stream=False, path=binpath,
                   nthread=nthread, minlen=minlen)

    # assign a Pangolin lineage to every aligned genome
    for header, aligned in stream_fasta(mm2, reflen=reflen):
        lineage = self.classify(aligned)
        yield header, lineage
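A hedged usage sketch for process_fasta(): the method belongs to a classifier object exposing classify(), but the class itself is not shown, so the PangoClassifier name below is hypothetical and the file paths are illustrative.

# hedged usage sketch -- PangoClassifier is a hypothetical wrapper exposing classify()
classifier = PangoClassifier()
with open('data/sequences.fa') as handle:
    for header, lineage in classifier.process_fasta(
            handle, ref_file='data/MT291829.fa', binpath='minimap2', nthread=3):
        print(header, lineage)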
Example #3
def retrieve_genomes(db="data/gsaid.db",
                     stream=False,
                     nthread=1,
                     ref_file='data/MT291829.fa',
                     misstol=300,
                     callback=None):
    """
    Query database for Pangolin lineages and then retrieve the earliest
    sampled genome sequence for each.  Export as FASTA for TreeTime analysis.
    :param db:  str, path to sqlite3 database
    :return:  list, (header, sequence) tuples
    """
    # load and parse reference genome
    with open(ref_file) as handle:
        _, refseq = convert_fasta(handle)[0]
    reflen = len(refseq)

    # allocate lists
    coldates = []
    lineages = []
    seqs = []

    # retrieve unaligned genomes from database
    for lineage, fasta_file in dump_raw_by_lineage(db, callback=callback):
        mm2 = minimap2(infile=fasta_file,
                       nthread=nthread,
                       stream=stream,
                       ref=ref_file)
        gen = encode_diffs(mm2, reflen=reflen)
        qname = None
        for row in filter_outliers(gen):
            # exclude genomes too divergent from expectation
            if total_missing(row) > misstol:
                continue
            # take the earliest valid genome
            qname, _, _ = row
            _, coldate = parse_label(qname)
            break

        if qname is None:
            # none of the genomes were selected
            if callback:
                callback("no genome passed filters for lineage {}".format(lineage))
            continue

        if callback:
            callback("selected genome {} for lineage {}".format(
                qname, lineage))

        # update lists
        lineages.append(lineage)
        coldates.append(coldate)

        # reconstruct aligned sequence from feature vector
        seq = apply_features(row, refseq=refseq)
        seqs.append(seq)

        # clean up temporary files
        os.remove(fasta_file.name)

    # generate new headers in {name}|{accession}|{date} format expected by treetime()
    headers = map(lambda xy: '|{}|{}'.format(*xy), zip(lineages, coldates))
    return dict(zip(headers, seqs))
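A hedged usage sketch for retrieve_genomes(): since it returns a dict mapping headers to aligned sequences, writing the result out as FASTA for TreeTime is direct; the output path is illustrative.

# hedged usage sketch -- output filename is illustrative
by_lineage = retrieve_genomes(db='data/gsaid.db', nthread=4, callback=print)
with open('data/earliest-by-lineage.fa', 'w') as outfile:
    for header, seq in by_lineage.items():
        outfile.write('>{}\n{}\n'.format(header, seq))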
Example #4
    treetime.parse_nexus(nexus_file, fasta,
                         date_tol=args.datetol)  # -> treetime.nwk

    # Retrieve raw genomes from DB, align and extract features
    cb.callback("Retrieving raw genomes from database")

    with open(args.ref) as handle:
        _, refseq = seq_utils.convert_fasta(handle)[0]
    reflen = len(refseq)
    mask = seq_utils.load_vcf(args.vcf)
    data = {}

    for lineage, fasta_file in db_utils.dump_raw_by_lineage(
            args.db, callback=cb.callback):
        mm2 = minimap2.minimap2(infile=fasta_file,
                                nthread=args.mmthreads,
                                ref=args.ref)
        gen = minimap2.encode_diffs(mm2, reflen=reflen)

        features = []
        # excludes genomes too divergent from expectation
        for row in seq_utils.filter_outliers(gen):
            if seq_utils.total_missing(row) > args.misstol:
                # too many uncalled bases
                continue
            features.append(row)

        cb.callback("{} sequences of lineage {} passed filters".format(
            len(features), lineage))
        # remove problematic sites from feature vectors
        features = seq_utils.filter_problematic(features, mask=mask)