def concatenate_files(chunk_output_files, output_m8):
    with log.log_context("run_alignment_remote.concatenate_files", {"chunk_output_files": chunk_output_files}):
        with open(output_m8, 'wb') as outf:
            for f in chunk_output_files:
                with log.log_context("run_alignment_remote.concatenate_files#chunk", {"f": f}):
                    with open(f, 'rb') as fd:
                        shutil.copyfileobj(fd, outf)
    def fetch_target_from_s3(self, target):
        ''' .done file should be written to the result dir when the download is complete '''
        log.write("Downloading target %s" % target)
        if target in self.given_targets:
            input_path_s3 = self.given_targets[target]["s3_dir"]
        else:
            input_path_s3 = self.output_dir_s3

        with log.log_context("fetch_input_files_from_s3", {"target": target}):
            PipelineFlow.fetch_input_files_from_s3(
                input_files=self.targets[target],
                input_dir_s3=input_path_s3,
                result_dir_local=self.output_dir_local)

        if target in self.given_targets and self.given_targets[target].get(
                "count_reads"):
            with log.log_context("count_input_reads", {"target": target}):
                try:
                    PipelineFlow.count_input_reads(
                        input_files=self.targets[target],
                        result_dir_local=self.output_dir_local,
                        result_dir_s3=self.output_dir_s3,
                        target_name=target,
                        max_fragments=self.given_targets[target]["max_fragments"])
                except AssertionError as e:
                    # The counting methods may raise assertion errors if assumptions
                    # about input format are not satisfied.
                    self.write_invalid_input_json({
                        "error": str(e),
                        "step": None
                    })
Example #3
    def prefetch_large_files(self, touch_only=False):
        successes, failures = set(), set()
        with log.log_context("touch_large_files_and_make_space" if touch_only else "prefetch_large_files", values={"file_list": self.large_file_list}):
            for f in self.large_file_list:
                with log.log_context("fetch_reference", values={"file": f, "touch_only": touch_only}):
                    success = idseq_dag.util.s3.fetch_reference(
                        f, self.ref_dir_local, auto_unzip=True, auto_untar=True, allow_s3mi=True, touch_only=touch_only)
                    if success:
                        successes.add(f)
                    else:
                        failures.add(f)
        return successes, failures
Example #4
def remove_rf(path: str):
    '''Mimics the behavior of the rm -rf Linux command.'''
    def _remove_entry(path_entry):
        if os.path.isdir(path_entry) and not os.path.islink(path_entry):
            shutil.rmtree(path_entry)
        elif os.path.exists(path_entry):
            os.remove(path_entry)
    with log.log_context(context_name='command.remove_rf', values={'path': path}, log_context_mode=log.LogContextMode.EXEC_LOG_EVENT):
        path_list = _glob.glob(path)
        if len(path_list) == 1 and path_list[0] == path:
            _remove_entry(path)
        else:
            for path_entry in path_list:
                with log.log_context(context_name='command.remove_rf._remove_entry', values={'path_entry': path_entry}, log_context_mode=log.LogContextMode.EXEC_LOG_EVENT):
                    _remove_entry(path_entry)
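A hedged usage sketch for the helper above; the paths are hypothetical. Because the argument is passed through glob, a wildcard removes every match, while a plain path is removed directly.

# Hypothetical paths: the first call deletes every matching file, the second
# removes the directory tree itself (a no-op if it does not exist).
remove_rf("/tmp/idseq-scratch/*.partial")
remove_rf("/tmp/idseq-scratch")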
Example #5
def glob(glob_pattern: str,
         strip_folder_names: bool = False,
         max_results: int = 0):
    '''
        Execute a glob pattern against the local file system.
            Parameters:
                glob_pattern(str): A glob pattern. Ex: /tmp/*.gz
                strip_folder_names(bool): Return only the file names without folder information.
                                          Ex: "/tmp/123.txt" is returned as "123.txt"
                max_results(int): Limit the number of results returned. Zero means no limit is set.
            Returns:
                Array of strings containing the files found. Empty array if none are found.
    '''
    values = {
        'glob_pattern': glob_pattern,
        'strip_folder_names': strip_folder_names,
        'max_results': max_results
    }
    with log.log_context(context_name='command.glob',
                         values=values,
                         log_context_mode=log.LogContextMode.EXEC_LOG_EVENT):
        results = _glob.glob(glob_pattern)
        results.sort()
        if max_results > 0:
            results = results[:max_results]
        if strip_folder_names:
            results = list(map(os.path.basename, results))
        values["results"] = results
        return results
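A hedged usage sketch for the glob helper above; the pattern is hypothetical.

# Hypothetical pattern: return at most 5 basenames of gzipped files in /tmp.
names = glob("/tmp/*.gz", strip_folder_names=True, max_results=5)
# Stdlib-only equivalent of the three steps (sort, truncate, strip folders):
#   [os.path.basename(p) for p in sorted(_glob.glob("/tmp/*.gz"))[:5]]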
Example #6
def list_s3_keys(s3_path_prefix):
    """Returns a list of s3 keys prefixed by s3_path_prefix."""
    with log.log_context(context_name="s3.list_s3_objects",
                         values={'s3_path_prefix': s3_path_prefix},
                         log_context_mode=log.LogContextMode.EXEC_LOG_EVENT):
        parsed_url = urlparse(s3_path_prefix, allow_fragments=False)
        bucket = parsed_url.netloc
        prefix = parsed_url.path.lstrip('/')
        # Use the AWS CLI instead of boto for thread safety
        raw_response = command.execute(
            command_patterns.SingleCommand(
                cmd="aws",
                args=[
                    "s3api",
                    "list-objects-v2",
                    "--bucket",
                    bucket,
                    "--prefix",
                    prefix,
                ],
                env=dict(os.environ, **refreshed_credentials()),
            ),
            capture_stdout=True,
        )
        parsed_response = json.loads(raw_response)
        return [item['Key'] for item in parsed_response['Contents']]
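A small standalone sketch of the urlparse split that list_s3_keys relies on; the bucket and prefix are hypothetical.

from urllib.parse import urlparse

# urlparse puts the bucket in netloc and the key prefix (minus the leading
# slash) in path, which is exactly what the s3api call above needs.
parsed = urlparse("s3://my-bucket/samples/run-1/", allow_fragments=False)
assert parsed.netloc == "my-bucket"
assert parsed.path.lstrip('/') == "samples/run-1/"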
Example #7
def make_dirs(path: str):
    if not os.path.isdir(path):
        with log.log_context(
                context_name="command.make_dirs",
                values={'path': path},
                log_context_mode=log.LogContextMode.EXEC_LOG_EVENT):
            os.makedirs(path, exist_ok=True)
Example #8
def move_file(src: str, dst: str):
    with log.log_context(context_name='command.move_file',
                         values={
                             'src': src,
                             'dest': dst
                         },
                         log_context_mode=log.LogContextMode.EXEC_LOG_EVENT):
        shutil.move(src, dst)
Example #9
    def _ensure_table_exists(self, conn):
        ''' Private: Fail if the table does not exist. Called when self._db_conn is still None. '''
        self._assert_lock_held()
        with log.log_context("db_assert_table", {"db_path": self.db_path}):
            with conn:
                res = conn.execute(f"SELECT count(*) FROM sqlite_master WHERE type='table' AND name='{SQLITE_TABLE_NAME}';")
                table_exists = res.fetchone()[0] != 0
            assert table_exists, f"table {SQLITE_TABLE_NAME} doesn't exist in db {self.db_path}"
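A self-contained sketch of the sqlite_master existence check performed above, using an in-memory database and a hypothetical table name.

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE IF NOT EXISTS kv (dict_key VARCHAR(255) PRIMARY KEY, dict_value TEXT)")
# Count how many tables in sqlite_master carry the (hypothetical) name "kv".
row = conn.execute(
    "SELECT count(*) FROM sqlite_master WHERE type='table' AND name=?", ("kv",)
).fetchone()
assert row[0] == 1  # the table exists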
Example #10
def rename(src: str, dst: str):
    with log.log_context(context_name='command.rename',
                         values={
                             'src': src,
                             'dest': dst
                         },
                         log_context_mode=log.LogContextMode.EXEC_LOG_EVENT):
        os.rename(src, dst)
Example #11
def execute(
    command: Union[command_patterns.CommandPattern, str],
    progress_file: str = None,
    timeout: int = None,
    grace_period: int = None,
    capture_stdout: bool = False,
    merge_stderr: bool = False,
    log_context_mode: log.LogContextMode = log.LogContextMode.START_END_LOG_EVENTS
) -> Union[str, None]:
    """Primary way to start external commands in subprocesses and handle
    execution with logging.
    """
    if not isinstance(command, command_patterns.CommandPattern):
        # log warning if using legacy format
        log.write(
            warning=True,
            message="Command parameter is using legacy type str. Use idseq_dag.util.command_patterns.",
            obj_data={
                "cmd": command,
                "type": type(command)
            })
        cmd = command_patterns.ShellScriptCommand(script=command, args=[])
    else:
        cmd = command

    with CommandTracker() as ct:
        log_values = {"cid": f"Command {ct.id}", "command": cmd.as_dict()}
        with log.log_context('command_execute',
                             values=log_values,
                             log_context_mode=log_context_mode) as lctx:
            with ProgressFile(progress_file):
                if timeout:
                    ct.timeout = timeout
                if grace_period:
                    ct.grace_period = grace_period
                if capture_stdout:
                    # Capture only stdout. Child stderr = parent stderr unless
                    # merge_stderr specified. Child input = parent stdin.
                    ct.proc = cmd.open(stdin=sys.stdin.fileno(),
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.STDOUT if merge_stderr
                                       else sys.stderr.fileno())
                    stdout, _ = ct.proc.communicate()
                else:
                    # Capture nothing. Child inherits parent stdin/out/err.
                    ct.proc = cmd.open()
                    ct.proc.wait()
                    stdout = None

                lctx.values.update({"returncode": ct.proc.returncode})

                if ct.proc.returncode:
                    raise subprocess.CalledProcessError(
                        ct.proc.returncode, str(command), stdout)
                if capture_stdout:
                    return stdout
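A hedged usage sketch for execute, built only from the call patterns visible elsewhere in this file (SingleCommand with cmd/args, capture_stdout to collect the output).

# Run a simple command in a subprocess and capture its standard output,
# mirroring the list_s3_keys call shown earlier.
output = execute(
    command_patterns.SingleCommand(cmd="echo", args=["hello"]),
    capture_stdout=True,
)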
    def thread_run(self):
        ''' Actually running the step '''
        self.status = StepStatus.STARTED
        self.update_status_json_file("instantiated")

        v = {"step": self.name}
        with log.log_context("dag_step", v):
            with log.log_context("substep_wait_for_input_files", v):
                self.wait_for_input_files()
            with log.log_context("substep_validate_input_files", v):
                self.validate_input_files()

            # If an input file error was detected, stop execution.
            if self.input_file_error:
                log.write("Invalid input detected for step %s" % self.name)
                self.status = StepStatus.INVALID_INPUT
                self.update_status_json_file("user_errored")
                return

            with log.log_context("substep_run", v):
                self.update_status_json_file("running")
                self.run()
            with log.log_context("substep_validate", v):
                self.validate()
            with log.log_context("substep_save_progress", v):
                self.save_progress()
            with log.log_context("substep_save_counts", v):
                self.save_counts()
        self.upload_thread = threading.Thread(target=self.uploading_results)
        self.upload_thread.start()
        self.status = StepStatus.FINISHED
        self.update_status_json_file("finished_running")
Example #13
def write_text_to_file(text: str, file_path: str):
    with log.log_context(context_name='command.write_text_to_file',
                         values={
                             'path': file_path,
                             'text': text
                         },
                         log_context_mode=log.LogContextMode.EXEC_LOG_EVENT):
        with open(file_path, "w") as f:
            print(text, file=f)
Example #14
def chmod(path: str, mode: int):
    '''Execute a chmod operation.
       Parameter 'mode' must be in octal format. Ex: chmod('/tmp/test.txt', 0o400)'''
    with log.log_context(context_name='command.chmod',
                         values={
                             'path': path,
                             'mode': oct(mode)
                         },
                         log_context_mode=log.LogContextMode.EXEC_LOG_EVENT):
        os.chmod(path, mode)
Example #15
def build_should_keep_filter(
    deuterostome_path,
    taxon_whitelist_path,
    taxon_blacklist_path
):

    # See also HOMO_SAPIENS_TAX_IDS in idseq-web
    taxids_to_remove = set(['9605', '9606'])

    if taxon_blacklist_path:
        with log.log_context("generate_taxon_count_json_from_m8", {"substep": "read_blacklist_into_set"}):
            taxids_to_remove.update(read_file_into_set(taxon_blacklist_path))

    if deuterostome_path:
        with log.log_context("generate_taxon_count_json_from_m8", {"substep": "read_file_into_set"}):
            taxids_to_remove.update(read_file_into_set(deuterostome_path))

    if taxon_whitelist_path:
        with log.log_context("generate_taxon_count_json_from_m8", {"substep": "read_whitelist_into_set"}):
            taxids_to_keep = read_file_into_set(taxon_whitelist_path)

    def is_blacklisted(hits: Iterable[str]):
        for taxid in hits:
            if int(taxid) >= 0 and taxid in taxids_to_remove:
                return True
        return False

    def is_whitelisted(hits: Iterable[str]):
        if not taxon_whitelist_path:
            return True
        for taxid in hits:
            if int(taxid) >= 0 and taxid in taxids_to_keep:
                return True
        return False

    def should_keep(hits: Iterable[str]):
        # In some places in the code taxids are ints rather than strings, this would lead
        # to a silent failure here so it is worth the explicit check.
        non_strings = [h for h in hits if type(h) != str]
        assert not non_strings, f"should_keep received non-string inputs {non_strings}"
        return is_whitelisted(hits) and not is_blacklisted(hits)

    return should_keep
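A hedged usage sketch: build the filter from a hypothetical blacklist file only, then apply it to a cleaned lineage tuple of string taxids, as the taxon-count generators below do.

# Hypothetical blacklist path; no whitelist or deuterostome filtering.
should_keep = build_should_keep_filter(
    deuterostome_path=None,
    taxon_whitelist_path=None,
    taxon_blacklist_path="taxon_blacklist.txt",
)
# True unless one of these string taxids is in the blacklist (or is human).
keep = should_keep(["562", "561", "543"])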
Example #16
def build_should_keep_filter(deuterostome_path, taxon_whitelist_path,
                             taxon_blacklist_path):

    # See also HOMO_SAPIENS_TAX_IDS in idseq-web
    taxids_to_remove = set(['9605', '9606'])

    if taxon_blacklist_path:
        with log.log_context("generate_taxon_count_json_from_m8",
                             {"substep": "read_blacklist_into_set"}):
            taxids_to_remove.update(read_file_into_set(taxon_blacklist_path))

    if deuterostome_path:
        with log.log_context("generate_taxon_count_json_from_m8",
                             {"substep": "read_file_into_set"}):
            taxids_to_remove.update(read_file_into_set(deuterostome_path))

    if taxon_whitelist_path:
        with log.log_context("generate_taxon_count_json_from_m8",
                             {"substep": "read_whitelist_into_set"}):
            taxids_to_keep = read_file_into_set(taxon_whitelist_path)

    def is_blacklisted(hits):
        for taxid in hits:
            if int(taxid) >= 0 and taxid in taxids_to_remove:
                return True
        return False

    def is_whitelisted(hits):
        if not taxon_whitelist_path:
            return True
        for taxid in hits:
            if int(taxid) >= 0 and taxid in taxids_to_keep:
                return True
        return False

    def should_keep(hits):
        return is_whitelisted(hits) and not is_blacklisted(hits)

    return should_keep
Example #17
def _check_s3_presence(s3_path, allow_zero_byte_files):
    """True if s3_path exists. False otherwise."""
    with log.log_context(context_name="s3.check_s3_presence", values={'s3_path': s3_path}, log_context_mode=log.LogContextMode.EXEC_LOG_EVENT) as lc:
        parsed_url = urlparse(s3_path, allow_fragments=False)
        bucket = parsed_url.netloc
        key = parsed_url.path.lstrip('/')
        try:
            o = boto3.resource('s3').Object(
                bucket,
                key
            )
            size = o.content_length
            lc.values['size'] = size
            exists = (allow_zero_byte_files and size >= 0) or (not allow_zero_byte_files and size > 0)
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == "404":
                exists = False
            else:
                # Something else has gone wrong.
                raise
        lc.values['exists'] = exists
        return exists
Example #18
    def _ensure_table_exists(self, conn):
        ''' Create a writable table if one doesn't exist. '''
        self._assert_lock_held()
        with log.log_context("db_assert_table", {"db_path": self.db_path}):
            with conn:
                conn.execute(f"CREATE TABLE IF NOT EXISTS {SQLITE_TABLE_NAME} (dict_key VARCHAR(255) PRIMARY KEY, dict_value text)")
Example #19
def remove_file(file_path: str):
    with log.log_context(context_name='command.remove_file', values={'path': file_path}, log_context_mode=log.LogContextMode.EXEC_LOG_EVENT):
        os.remove(file_path)
Example #20
def touch(path, exist_ok=True):
    with log.log_context(context_name='command.touch', values={'path': path}, log_context_mode=log.LogContextMode.EXEC_LOG_EVENT):
        pathlib.Path(path).touch(exist_ok=exist_ok)
Example #21
def generate_taxon_count_json_from_m8(m8_file, hit_level_file, e_value_type,
                                      count_type, lineage_map_path,
                                      deuterostome_path, taxon_whitelist_path,
                                      taxon_blacklist_path,
                                      cdhit_cluster_sizes_path,
                                      output_json_file):
    # Parse through hit file and m8 input file and format a JSON file with
    # our desired attributes, including aggregated statistics.

    cdhit_cluster_sizes = load_cdhit_cluster_sizes(cdhit_cluster_sizes_path)

    should_keep = build_should_keep_filter(deuterostome_path,
                                           taxon_whitelist_path,
                                           taxon_blacklist_path)
    # Setup
    aggregation = {}
    with open(hit_level_file, 'r', encoding='utf-8') as hit_f, \
         open(m8_file, 'r', encoding='utf-8') as m8_f, \
         open_file_db_by_extension(lineage_map_path, IdSeqDictValue.VALUE_TYPE_ARRAY) as lineage_map:  # noqa
        # Lines in m8_file and hit_level_file correspond (same read_id)
        hit_line = hit_f.readline()
        m8_line = m8_f.readline()
        num_ranks = len(lineage.NULL_LINEAGE)
        # See https://en.wikipedia.org/wiki/Double-precision_floating-point_format
        MIN_NORMAL_POSITIVE_DOUBLE = 2.0**-1022

        with log.log_context("generate_taxon_count_json_from_m8",
                             {"substep": "loop_1"}):
            while hit_line and m8_line:
                # Retrieve data values from files
                hit_line_columns = hit_line.rstrip("\n").split("\t")
                read_id = hit_line_columns[0]
                hit_level = hit_line_columns[1]
                hit_taxid = hit_line_columns[2]
                if int(hit_level) < 0:  # Skip negative levels and continue
                    hit_line = hit_f.readline()
                    m8_line = m8_f.readline()
                    continue

                # m8 files correspond to BLAST tabular output format 6:
                # Columns: read_id | _ref_id | percent_identity | alignment_length...
                #
                # * read_id = query (e.g., gene) sequence id
                # * _ref_id = subject (e.g., reference genome) sequence id
                # * percent_identity = percentage of identical matches
                # * alignment_length = length of the alignments
                # * e_value = the expect value
                #
                # See:
                # * http://www.metagenomics.wiki/tools/blast/blastn-output-format-6
                # * http://www.metagenomics.wiki/tools/blast/evalue

                m8_line_columns = m8_line.split("\t")
                msg = "read_ids in %s and %s do not match: %s vs. %s" % (
                    os.path.basename(m8_file),
                    os.path.basename(hit_level_file), m8_line_columns[0],
                    hit_line_columns[0])
                assert m8_line_columns[0] == hit_line_columns[0], msg
                percent_identity = float(m8_line_columns[2])
                alignment_length = float(m8_line_columns[3])
                e_value = float(m8_line_columns[10])

                # These have been filtered out before the creation of m8_f and hit_f
                assert alignment_length > 0
                assert -0.25 < percent_identity < 100.25
                assert e_value == e_value  # fails only if e_value is NaN
                if e_value_type != 'log10':
                    # e_value could be 0 when large contigs are mapped
                    if e_value <= MIN_NORMAL_POSITIVE_DOUBLE:
                        e_value = MIN_NORMAL_POSITIVE_DOUBLE
                    e_value = math.log10(e_value)

                # Retrieve the taxon lineage and mark meaningless calls with fake
                # taxids.
                hit_taxids_all_levels = lineage_map.get(
                    hit_taxid, lineage.NULL_LINEAGE)
                cleaned_hit_taxids_all_levels = lineage.validate_taxid_lineage(
                    hit_taxids_all_levels, hit_taxid, hit_level)
                assert num_ranks == len(cleaned_hit_taxids_all_levels)

                if should_keep(cleaned_hit_taxids_all_levels):
                    # Aggregate each level and collect statistics
                    agg_key = tuple(cleaned_hit_taxids_all_levels)
                    while agg_key:
                        agg_bucket = aggregation.get(agg_key)
                        if not agg_bucket:
                            agg_bucket = {
                                'nonunique_count': 0,
                                'unique_count': 0,
                                'sum_percent_identity': 0.0,
                                'sum_alignment_length': 0.0,
                                'sum_e_value': 0.0
                            }
                            aggregation[agg_key] = agg_bucket
                        agg_bucket['nonunique_count'] += get_read_cluster_size(
                            cdhit_cluster_sizes, read_id)
                        agg_bucket['unique_count'] += 1
                        agg_bucket['sum_percent_identity'] += percent_identity
                        agg_bucket['sum_alignment_length'] += alignment_length
                        agg_bucket['sum_e_value'] += e_value
                        # Chomp off the lowest rank as we aggregate up the tree
                        agg_key = agg_key[1:]

                hit_line = hit_f.readline()
                m8_line = m8_f.readline()

    # Produce the final output
    taxon_counts_attributes = []
    with log.log_context("generate_taxon_count_json_from_m8",
                         {"substep": "loop_2"}):
        for agg_key, agg_bucket in aggregation.items():
            unique_count = agg_bucket['unique_count']
            nonunique_count = agg_bucket['nonunique_count']
            tax_level = num_ranks - len(agg_key) + 1
            # TODO: Extend taxonomic ranks as indicated on the commented out lines.
            taxon_counts_attributes.append({
                "tax_id":
                agg_key[0],
                "tax_level":
                tax_level,
                # 'species_taxid' : agg_key[tax_level - 1] if tax_level == 1 else "-100",
                'genus_taxid':
                agg_key[2 - tax_level] if tax_level <= 2 else "-200",
                'family_taxid':
                agg_key[3 - tax_level] if tax_level <= 3 else "-300",
                # 'order_taxid' : agg_key[4 - tax_level] if tax_level <= 4 else "-400",
                # 'class_taxid' : agg_key[5 - tax_level] if tax_level <= 5 else "-500",
                # 'phyllum_taxid' : agg_key[6 - tax_level] if tax_level <= 6 else "-600",
                # 'kingdom_taxid' : agg_key[7 - tax_level] if tax_level <= 7 else "-700",
                # 'domain_taxid' : agg_key[8 - tax_level] if tax_level <= 8 else "-800",
                "count":  # this field will be consumed by the webapp
                nonunique_count if READ_COUNTING_MODE == ReadCountingMode.COUNT_ALL else unique_count,
                "nonunique_count":
                nonunique_count,
                "unique_count":
                unique_count,
                "dcr":
                nonunique_count / unique_count,
                "percent_identity":
                agg_bucket['sum_percent_identity'] / unique_count,
                "alignment_length":
                agg_bucket['sum_alignment_length'] / unique_count,
                "e_value":
                agg_bucket['sum_e_value'] / unique_count,
                "count_type":
                count_type
            })
        output_dict = {
            "pipeline_output": {
                "taxon_counts_attributes": taxon_counts_attributes
            }
        }

    with log.log_context("generate_taxon_count_json_from_m8", {
            "substep": "json_dump",
            "output_json_file": output_json_file
    }):
        with open(output_json_file, 'w') as outf:
            json.dump(output_dict, outf)
            outf.flush()
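A minimal standalone sketch (not from the source) of the "aggregate up the tree" pattern in loop_1 above: a hit recorded under a species-level lineage key is also credited to the genus- and family-level keys by repeatedly dropping the lowest rank.

from collections import Counter

counts = Counter()
lineage_key = ("562", "561", "543")  # hypothetical species, genus, family taxids
agg_key = lineage_key
while agg_key:
    counts[agg_key] += 1
    agg_key = agg_key[1:]  # chomp off the lowest rank
# counts == {("562", "561", "543"): 1, ("561", "543"): 1, ("543",): 1}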
Example #22
def generate_taxon_count_json_from_m8(
        blastn_6_path, hit_level_path, count_type, lineage_map_path,
        deuterostome_path, taxon_whitelist_path, taxon_blacklist_path,
        duplicate_cluster_sizes_path, output_json_file):
    # Parse through hit file and m8 input file and format a JSON file with
    # our desired attributes, including aggregated statistics.

    duplicate_cluster_sizes = load_duplicate_cluster_sizes(duplicate_cluster_sizes_path)

    should_keep = build_should_keep_filter(
        deuterostome_path, taxon_whitelist_path, taxon_blacklist_path)
    # Setup
    aggregation = {}
    with open(hit_level_path) as hit_level_f, \
         open(blastn_6_path) as blastn_6_f, \
         open_file_db_by_extension(lineage_map_path) as lineage_map:

        num_ranks = len(lineage.NULL_LINEAGE)
        # See https://en.wikipedia.org/wiki/Double-precision_floating-point_format
        MIN_NORMAL_POSITIVE_DOUBLE = 2.0**-1022

        with log.log_context("generate_taxon_count_json_from_m8", {"substep": "loop_1"}):
            # Lines in m8_file and hit_level_file correspond (same read_id)
            for hit_row, blastn_6_row in zip(HitSummaryMergedReader(hit_level_f), BlastnOutput6NTRerankedReader(blastn_6_f)):
                # Retrieve data values from files
                read_id = hit_row["read_id"]
                hit_level = hit_row["level"]
                hit_taxid = hit_row["taxid"]
                if hit_level < 0:
                    log.write('hit_level < 0', debug=True)
                hit_source_count_type = hit_row.get("source_count_type")

                msg = "read_ids in %s and %s do not match: %s vs. %s" % (
                    os.path.basename(blastn_6_path), os.path.basename(hit_level_path),
                    blastn_6_row["qseqid"], read_id)
                assert blastn_6_row["qseqid"] == read_id, msg
                percent_identity = blastn_6_row["pident"]
                alignment_length = blastn_6_row["length"]

                if count_type == 'merged_NT_NR' and hit_source_count_type == 'NR':
                    # NOTE: At the moment of the change, applied ONLY in the scope of the prototype of NT/NR consensus project.
                    # Protein alignments (NR) are done at amino acid level. Each amino acid is composed of 3 nucleotides.
                    # To make alignment length values comparable across NT and NR alignments (for combined statistics),
                    # the NR alignment lengths are multiplied by 3.
                    alignment_length *= 3
                e_value = blastn_6_row["evalue"]

                # These have been filtered out before the creation of blastn_6_f and hit_level_f
                assert alignment_length > 0
                assert -0.25 < percent_identity < 100.25
                assert e_value == e_value  # fails only if e_value is NaN

                if count_type == "NT" or hit_source_count_type == "NT":
                    # e_value could be 0 when large contigs are mapped
                    if e_value <= MIN_NORMAL_POSITIVE_DOUBLE:
                        e_value = MIN_NORMAL_POSITIVE_DOUBLE
                    e_value = math.log10(e_value)

                # Retrieve the taxon lineage and mark meaningless calls with fake
                # taxids.
                # lineage_map expects string ids
                hit_taxids_all_levels = lineage_map.get(
                    str(hit_taxid), lineage.NULL_LINEAGE)
                cleaned_hit_taxids_all_levels = lineage.validate_taxid_lineage(
                    hit_taxids_all_levels, hit_taxid, hit_level)
                assert num_ranks == len(cleaned_hit_taxids_all_levels)

                if should_keep(cleaned_hit_taxids_all_levels):
                    # Aggregate each level and collect statistics
                    agg_key = tuple(cleaned_hit_taxids_all_levels)
                    while agg_key:
                        agg_bucket = aggregation.get(agg_key)
                        if not agg_bucket:
                            agg_bucket = {
                                'nonunique_count': 0,
                                'unique_count': 0,
                                'sum_percent_identity': 0.0,
                                'sum_alignment_length': 0.0,
                                'sum_e_value': 0.0
                            }
                            aggregation[agg_key] = agg_bucket
                        agg_bucket['nonunique_count'] += get_read_cluster_size(
                            duplicate_cluster_sizes, read_id)
                        agg_bucket['unique_count'] += 1
                        agg_bucket['sum_percent_identity'] += percent_identity
                        agg_bucket['sum_alignment_length'] += alignment_length
                        agg_bucket['sum_e_value'] += e_value
                        if hit_source_count_type:
                            agg_bucket.setdefault('source_count_type', set()).add(hit_source_count_type)
                        # Chomp off the lowest rank as we aggregate up the tree
                        agg_key = agg_key[1:]

    # Produce the final output
    taxon_counts_attributes = []
    with log.log_context("generate_taxon_count_json_from_m8", {"substep": "loop_2"}):
        for agg_key, agg_bucket in aggregation.items():
            unique_count = agg_bucket['unique_count']
            nonunique_count = agg_bucket['nonunique_count']
            tax_level = num_ranks - len(agg_key) + 1
            # TODO: Extend taxonomic ranks as indicated on the commented out lines.
            taxon_counts_row = {
                "tax_id":
                agg_key[0],
                "tax_level":
                tax_level,
                # 'species_taxid' : agg_key[tax_level - 1] if tax_level == 1 else "-100",
                'genus_taxid':
                agg_key[2 - tax_level] if tax_level <= 2 else "-200",
                'family_taxid':
                agg_key[3 - tax_level] if tax_level <= 3 else "-300",
                # 'order_taxid' : agg_key[4 - tax_level] if tax_level <= 4 else "-400",
                # 'class_taxid' : agg_key[5 - tax_level] if tax_level <= 5 else "-500",
                # 'phyllum_taxid' : agg_key[6 - tax_level] if tax_level <= 6 else "-600",
                # 'kingdom_taxid' : agg_key[7 - tax_level] if tax_level <= 7 else "-700",
                # 'domain_taxid' : agg_key[8 - tax_level] if tax_level <= 8 else "-800",
                "count":  # this field will be consumed by the webapp
                nonunique_count if READ_COUNTING_MODE == ReadCountingMode.COUNT_ALL else unique_count,
                "nonunique_count":
                nonunique_count,
                "unique_count":
                unique_count,
                "dcr":
                nonunique_count / unique_count,
                "percent_identity":
                agg_bucket['sum_percent_identity'] / unique_count,
                "alignment_length":
                agg_bucket['sum_alignment_length'] / unique_count,
                "e_value":
                agg_bucket['sum_e_value'] / unique_count,
                "count_type":
                count_type
            }
            if agg_bucket.get('source_count_type'):
                taxon_counts_row['source_count_type'] = list(agg_bucket['source_count_type'])

            taxon_counts_attributes.append(taxon_counts_row)
        output_dict = {
            "pipeline_output": {
                "taxon_counts_attributes": taxon_counts_attributes
            }
        }

    with log.log_context(
        "generate_taxon_count_json_from_m8",
        {"substep": "json_dump", "output_json_file": output_json_file}
    ):
        with open(output_json_file, 'w') as outf:
            json.dump(output_dict, outf)
            outf.flush()
Example #23
    def _connect(self):
        self._assert_lock_held()
        uri_db_path = self._uri_base()
        with log.log_context("db_open", {"db_path": self.db_path, "uri_db_path": uri_db_path}):
            return sqlite3.connect(uri_db_path, uri=True)
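A standalone sketch of the URI-style sqlite3 connection used above; _uri_base presumably builds a file: URI along these (hypothetical) lines.

import sqlite3

# uri=True lets query parameters such as mode control how the file is opened
# (ro, rw, rwc, memory). The path here is hypothetical.
conn = sqlite3.connect("file:/tmp/example.sqlite3?mode=rwc", uri=True)
conn.close()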
    def run(self):
        '''
            1. summarize hits
            2. build blast index
            3. blast assembled contigs to the index
            4. update the summary
        '''
        _align_m8, deduped_m8, hit_summary, orig_counts_with_dcr = self.input_files_local[0]
        assembled_contig, _assembled_scaffold, bowtie_sam, _contig_stats = self.input_files_local[1]
        reference_fasta, = self.input_files_local[2]
        duplicate_cluster_sizes_path, = self.input_files_local[3]

        blast_m8, refined_m8, refined_hit_summary, refined_counts_with_dcr, contig_summary_json, blast_top_m8 = self.output_files_local()

        assert refined_counts_with_dcr.endswith("with_dcr.json"), self.output_files_local()
        assert orig_counts_with_dcr.endswith("with_dcr.json"), self.output_files_local()

        db_type = self.additional_attributes["db_type"]
        no_assembled_results = (
            os.path.getsize(assembled_contig) < MIN_ASSEMBLED_CONTIG_SIZE or
            os.path.getsize(reference_fasta) < MIN_REF_FASTA_SIZE)

        if no_assembled_results:
            # No assembled results or refseq fasta available.
            # Create empty output files.
            command.write_text_to_file(' ', blast_m8)
            command.write_text_to_file(' ', blast_top_m8)
            command.copy_file(deduped_m8, refined_m8)
            command.copy_file(hit_summary, refined_hit_summary)
            command.copy_file(orig_counts_with_dcr, refined_counts_with_dcr)
            command.write_text_to_file('[]', contig_summary_json)
            return  # return in the middle of the function

        (read_dict, accession_dict, _selected_genera) = m8.summarize_hits(hit_summary)
        PipelineStepBlastContigs.run_blast(db_type, blast_m8, assembled_contig, reference_fasta, blast_top_m8)
        read2contig = {}
        PipelineStepRunAssembly.generate_info_from_sam(bowtie_sam, read2contig, duplicate_cluster_sizes_path)

        (updated_read_dict, read2blastm8, contig2lineage, added_reads) = self.update_read_dict(
            read2contig, blast_top_m8, read_dict, accession_dict, db_type)
        self.generate_m8_and_hit_summary(updated_read_dict, added_reads, read2blastm8,
                                         hit_summary, deduped_m8,
                                         refined_hit_summary, refined_m8)

        # Generating taxon counts based on updated results
        lineage_db = s3.fetch_reference(
            self.additional_files["lineage_db"],
            self.ref_dir_local,
            allow_s3mi=False)  # Too small to waste s3mi

        deuterostome_db = None
        if self.additional_files.get("deuterostome_db"):
            deuterostome_db = s3.fetch_reference(self.additional_files["deuterostome_db"],
                                                 self.ref_dir_local, allow_s3mi=False)  # Too small for s3mi

        blacklist_s3_file = self.additional_files.get('taxon_blacklist', DEFAULT_BLACKLIST_S3)
        taxon_blacklist = s3.fetch_reference(blacklist_s3_file, self.ref_dir_local)

        taxon_whitelist = None
        if self.additional_attributes.get("use_taxon_whitelist"):
            taxon_whitelist = s3.fetch_reference(self.additional_files.get("taxon_whitelist", DEFAULT_WHITELIST_S3),
                                                 self.ref_dir_local)

        with TraceLock("PipelineStepBlastContigs-CYA", PipelineStepBlastContigs.cya_lock, debug=False):
            with log.log_context("PipelineStepBlastContigs", {"substep": "generate_taxon_count_json_from_m8", "db_type": db_type, "refined_counts": refined_counts_with_dcr}):
                m8.generate_taxon_count_json_from_m8(refined_m8, refined_hit_summary, db_type.upper(),
                                                     lineage_db, deuterostome_db, taxon_whitelist, taxon_blacklist,
                                                     duplicate_cluster_sizes_path, refined_counts_with_dcr)

        # generate contig stats at genus/species level
        with log.log_context("PipelineStepBlastContigs", {"substep": "generate_taxon_summary"}):
            contig_taxon_summary = self.generate_taxon_summary(
                read2contig,
                contig2lineage,
                updated_read_dict,
                added_reads,
                db_type,
                duplicate_cluster_sizes_path,
                # same filter as applied in generate_taxon_count_json_from_m8
                m8.build_should_keep_filter(deuterostome_db, taxon_whitelist, taxon_blacklist)
            )

        with log.log_context("PipelineStepBlastContigs", {"substep": "generate_taxon_summary_json", "contig_summary_json": contig_summary_json}):
            with open(contig_summary_json, 'w') as contig_outf:
                json.dump(contig_taxon_summary, contig_outf)

        # Upload additional file
        contig2lineage_json = os.path.join(os.path.dirname(contig_summary_json), f"contig2lineage.{db_type}.json")
        with log.log_context("PipelineStepBlastContigs", {"substep": "contig2lineage_json", "contig2lineage_json": contig2lineage_json}):
            with open(contig2lineage_json, 'w') as c2lf:
                json.dump(contig2lineage, c2lf)

        self.additional_output_files_hidden.append(contig2lineage_json)
Example #25
    def run(self):
        '''
            1. summarize hits
            2. build blast index
            3. blast assembled contigs to the index
            4. update the summary
        '''
        (_align_m8, deduped_m8, hit_summary,
         orig_counts) = self.input_files_local[0]
        assembled_contig, _assembled_scaffold, bowtie_sam, _contig_stats = self.input_files_local[1]
        reference_fasta = self.input_files_local[2][0]

        (blast_m8, refined_m8, refined_hit_summary, refined_counts,
         contig_summary_json, blast_top_m8) = self.output_files_local()
        db_type = self.additional_attributes["db_type"]
        if os.path.getsize(assembled_contig) < MIN_ASSEMBLED_CONTIG_SIZE or \
           os.path.getsize(reference_fasta) < MIN_REF_FASTA_SIZE:
            # No assembled results or refseq fasta available.
            # Create empty output files.
            command.write_text_to_file(' ', blast_m8)
            command.write_text_to_file(' ', blast_top_m8)
            command.copy_file(deduped_m8, refined_m8)
            command.copy_file(hit_summary, refined_hit_summary)
            command.copy_file(orig_counts, refined_counts)
            command.write_text_to_file('[]', contig_summary_json)
            return  # return in the middle of the function

        (read_dict, accession_dict,
         _selected_genera) = m8.summarize_hits(hit_summary)
        PipelineStepBlastContigs.run_blast(db_type, blast_m8, assembled_contig,
                                           reference_fasta, blast_top_m8)
        read2contig = {}
        contig_stats = defaultdict(int)
        PipelineStepRunAssembly.generate_info_from_sam(bowtie_sam, read2contig,
                                                       contig_stats)

        (updated_read_dict, read2blastm8, contig2lineage,
         added_reads) = self.update_read_dict(read2contig, blast_top_m8,
                                              read_dict, accession_dict,
                                              db_type)
        self.generate_m8_and_hit_summary(updated_read_dict, added_reads,
                                         read2blastm8, hit_summary, deduped_m8,
                                         refined_hit_summary, refined_m8)

        # Generating taxon counts based on updated results
        lineage_db = s3.fetch_reference(
            self.additional_files["lineage_db"],
            self.ref_dir_local,
            allow_s3mi=False)  # Too small to waste s3mi
        deuterostome_db = None
        evalue_type = 'raw'
        if self.additional_files.get("deuterostome_db"):
            deuterostome_db = s3.fetch_reference(
                self.additional_files["deuterostome_db"],
                self.ref_dir_local,
                allow_s3mi=False)  # Too small for s3mi
        with TraceLock("PipelineStepBlastContigs-CYA",
                       PipelineStepBlastContigs.cya_lock,
                       debug=False):
            with log.log_context(
                    "PipelineStepBlastContigs", {
                        "substep": "generate_taxon_count_json_from_m8",
                        "db_type": db_type,
                        "refined_counts": refined_counts
                    }):
                m8.generate_taxon_count_json_from_m8(
                    refined_m8, refined_hit_summary, evalue_type,
                    db_type.upper(), lineage_db, deuterostome_db,
                    refined_counts)

        # generate contig stats at genus/species level
        with log.log_context("PipelineStepBlastContigs",
                             {"substep": "generate_taxon_summary"}):
            contig_taxon_summary = self.generate_taxon_summary(
                read2contig, contig2lineage, updated_read_dict, added_reads,
                db_type)

        with log.log_context(
                "PipelineStepBlastContigs", {
                    "substep": "generate_taxon_summary_json",
                    "contig_summary_json": contig_summary_json
                }):
            with open(contig_summary_json, 'w') as contig_outf:
                json.dump(contig_taxon_summary, contig_outf)

        # Upload additional file
        contig2lineage_json = os.path.join(
            os.path.dirname(contig_summary_json),
            f"contig2lineage.{db_type}.json")
        with log.log_context(
                "PipelineStepBlastContigs", {
                    "substep": "contig2lineage_json",
                    "contig2lineage_json": contig2lineage_json
                }):
            with open(contig2lineage_json, 'w') as c2lf:
                json.dump(contig2lineage, c2lf)

        self.additional_output_files_hidden.append(contig2lineage_json)