Example #1
def get_amplicon_json_file_path(scheme, scheme_version, scheme_dir):
    """
    Get the file path to the amplicon primer JSON file
    Parameters
    ----------
    scheme: Path
        Artic primer scheme to get data from
    scheme_version: str
        The version, if any
    scheme_dir: pathlib.PosixPath
        The path to the scheme directory
    Returns
    -------
    pathlib.PosixPath
        File path to the JSON primer file.

    """
    artic_dir = Path(get_env_variable("MT_ARTIC_RESULTS_DIR")) / "artic"
    artic_results_dir_primers = artic_dir / "primers_files"
    if not artic_dir.exists():
        artic_dir.mkdir()
    # Needs to be dynamic here
    bed_file = f"{scheme}.scheme.bed"
    json_file = f"{scheme}_{scheme_version}.primers.json"
    full_path_to_bed_file = scheme_dir / scheme / scheme_version / bed_file
    if not (artic_results_dir_primers / json_file).exists():
        json_file_path = convert_amplicon_bed_file_to_json(
            full_path_to_bed_file, json_file, artic_results_dir_primers
        )
    else:
        json_file_path = artic_results_dir_primers / json_file
    return json_file_path
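A short usage sketch for the function above; the scheme name, version and directory are illustrative values, not taken from the project.

from pathlib import Path

# Illustrative values only - a hypothetical ARTIC nCoV-2019 V3 scheme layout.
scheme = "nCoV-2019"
scheme_version = "V3"
scheme_dir = Path("/data/primer-schemes")

# Expects /data/primer-schemes/nCoV-2019/V3/nCoV-2019.scheme.bed on disk and
# returns <MT_ARTIC_RESULTS_DIR>/artic/primers_files/nCoV-2019_V3.primers.json,
# converting the BED file on the first call and reusing the JSON afterwards.
json_path = get_amplicon_json_file_path(scheme, scheme_version, scheme_dir)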
Example #2
def create_minimap2_index(ref_info, file_name):
    """
    Create the minimap2 index for the reference file that we are uploading
    Parameters
    ----------
    ref_info: reference.models.ReferenceInfo
        The django ORM object
    file_name: pathlib.PosixPath
        The filename we are uploading

    Returns
    -------
    str
        File path to the newly created minimap2 index file
    """
    index_dir_path = (MEDIA_ROOT
                      if get_env_variable("MT_MINIMAP2_INDEX_DIR").isdigit()
                      else get_env_variable("MT_MINIMAP2_INDEX_DIR"))
    minimap2_index_file_location = (
        f"{index_dir_path}/minimap2_indexes/{file_name.stem}.mmi")
    out, err = subprocess.Popen(
        f"{MINIMAP2} -d {minimap2_index_file_location}"
        f" {ref_info.file_location.path}".split(),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    ).communicate()
    # minimap2 writes its progress to stderr; out is usually empty
    print(out.decode())
    print(err.decode())
    return minimap2_index_file_location
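For reference, the Popen call above is equivalent to running minimap2's indexing mode directly; a minimal stand-alone sketch with illustrative paths (minimap2 must be on PATH and the FASTA must exist).

import subprocess

# Illustrative paths; minimap2 -d writes a .mmi index for the given reference.
cmd = ["minimap2", "-d", "/tmp/example_reference.mmi", "/tmp/example_reference.fasta"]
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
out, err = proc.communicate()
print(err.decode())  # minimap2 reports index-building progress on stderr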
Example #3
def predict_barcode_will_finish(
    amplicon_median_array, num_barcodes, total_mapped_reads_count
):
    """
    Predict whether this barcode will reach sufficient coverage if sequencing continues
    Parameters
    ----------
    amplicon_median_array: np.ndarray
        Median coverage for each amplicon
    num_barcodes: int
        The total number of barcodes detected in the run
    total_mapped_reads_count: int
        Total number of reads that have mapped to n-cov in this task

    Returns
    -------
    bool
        True if this heuristic predicts the barcode will finish, given enough sequencing time

    """
    # 100000 reads per barcode in a run
    ideal_reads_count_constant = int(get_env_variable("MT_IDEAL_READ_CONSTANT"))
    minimum_required_amplicons = int(get_env_variable("MT_ARTIC_MIN_AMPS_PERC"))
    coverage_per_amplicon = int(get_env_variable("MT_COVERAGE_PER_AMPLICON"))
    predicted_coverages = (
        amplicon_median_array
        / total_mapped_reads_count
        * num_barcodes
        * ideal_reads_count_constant
    )
    return (
        predicted_coverages[predicted_coverages > coverage_per_amplicon].size / amplicon_median_array.size
    ) * 100 > minimum_required_amplicons
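A worked example of the prediction above with invented numbers; the real thresholds come from the MT_IDEAL_READ_CONSTANT, MT_COVERAGE_PER_AMPLICON and MT_ARTIC_MIN_AMPS_PERC environment variables.

import numpy as np

# Invented inputs: current median coverage for four amplicons in one barcode.
amplicon_median_array = np.array([40, 10, 0, 25])
total_mapped_reads_count = 5_000
num_barcodes = 10
ideal_reads_count_constant = 100_000  # e.g. MT_IDEAL_READ_CONSTANT

# Scale current coverage up to what we would expect at the ideal read count.
predicted = (amplicon_median_array / total_mapped_reads_count
             * num_barcodes * ideal_reads_count_constant)
# predicted -> [8000., 2000., 0., 5000.]

coverage_per_amplicon = 20            # e.g. MT_COVERAGE_PER_AMPLICON
percent_above = (predicted > coverage_per_amplicon).sum() / predicted.size * 100
# 3 of the 4 amplicons clear the threshold, so percent_above == 75.0;
# the barcode is predicted to finish if MT_ARTIC_MIN_AMPS_PERC is below 75.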
Example #4
def secure_artic_runs():
    """
    When the run monitor is triggered by celery beat, go through flowcells with a run_artic_pipeline command and check
    whether they have had data uploaded recently.
    If they haven't, trigger any barcodes that haven't been run and, once MT_ARTIC_TIME_UNTIL_CLEARING hours have passed
    without activity, tidy up all the sensitive files.

    Returns
    -------

    """
    logger.info(
        "Starting securing artic tasks for flowcells that haven't uploaded in 12 hours"
    )
    jobs = JobMaster.objects.filter(job_type_id=16, complete=False)
    for artic_job in jobs:
        flowcell = artic_job.flowcell
        last_activity_date = flowcell.last_activity_date
        twelve_hours = datetime.timedelta(
            hours=int(get_env_variable("MT_ARTIC_TIME_UNTIL_CLEARING")))
        three_hours = datetime.timedelta(hours=3)
        trigger_barcodes = (
            last_activity_date <
            datetime.datetime.now(datetime.timezone.utc) - three_hours)
        if trigger_barcodes:
            # so much unnecessary computing - we need a way to mark a barcode as final outside ArticBarcodeMetadata
            trigger_all_barcodes_after_run(artic_job)

        if int(get_env_variable("MT_DESTROY_ARTIC_EVIDENCE")):
            # Not super happy with this, as last_activity_date is affected by things other than read upload,
            # but there is no easy workaround. If we aren't storing reads, it is only really updated
            # when we upload a read batch.
            # TODO ideally we would add a run-level last activity time for this
            last_activity_date = flowcell.last_activity_date
            active = (
                last_activity_date >
                datetime.datetime.now(datetime.timezone.utc) - twelve_hours)
            if not active:
                results_dir = make_results_directory_artic(
                    flowcell.id, artic_job.id)
                for barcode_name in ArticBarcodeMetadata.objects.filter(
                        job_master=artic_job).values_list("barcode__name",
                                                          flat=True):
                    clear_unused_artic_files(str(results_dir / barcode_name),
                                             barcode_name, flowcell.id)
                artic_job.complete = True
                artic_job.save()

    logger.info("Finished securing artic tasks")
Example #5
def get_results_modal_html(request, pk):
    """
    Return the HTML for the all-results download modal
    Parameters
    ----------
    request: rest_framework.request.Request
        The ajax request body
    pk: int
        The primary key of the flowcell object in the database

    Returns
    -------
    django.http.HttpResponse
        The rendered all-results modal HTML

    """
    results_files_extra = [
        ("Input fasta", "input-fasta"),
        ("Sorted Bam", "sorted-bam"),
        ("Sorted Bam Index", "sorted-bam-bai"),
    ]
    results_files = [
        ("Consensus sequence", "consensus"),
        ("Box plot", "box-plot"),
        ("Bar plot", "bar-plot"),
        ("Fail VCF", "fail-vcf"),
        ("Pass VCF", "pass-vcf"),
        ("Pangolin lineages CSV", "pangolin-lineages"),
    ]
    if not int(get_env_variable("MT_DESTROY_ARTIC_EVIDENCE")):
        results_files.extend(results_files_extra)
    context_dict = {"hidden_results_files": results_files}
    return render(request, "all-results-modal.html", context={"context": context_dict})
Example #6
def make_results_directory_artic(flowcell_id, task_id, allow_create=True):
    """
    Make a results directory
    Parameters
    ----------
    flowcell_id: int
        Primary key of the flowcell entry in the database
    task_id: int
        Primary key of the task record in the database.
    allow_create: bool
        Allow the creation of the directory if it doesn't already exist

    Returns
    -------
    results_dir: pathlib.PosixPath
        PosixPath pointing to the results directory
    """
    environmental_results_directory = get_env_variable("MT_ARTIC_RESULTS_DIR")
    artic_dir = Path(f"{environmental_results_directory}/artic/")
    if not artic_dir.exists() and allow_create:
        Path.mkdir(artic_dir)
    results_dir = Path(f"{environmental_results_directory}/artic/Temp_results")
    if not results_dir.exists() and allow_create:
        Path.mkdir(results_dir)
    results_dir = Path(
        f"{environmental_results_directory}/artic/Temp_results/{flowcell_id}_{task_id}_artic"
    )
    if not results_dir.exists() and allow_create:
        Path.mkdir(results_dir)
    return results_dir
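The three stepwise existence checks above could also be expressed with pathlib's recursive mkdir; a minimal alternative sketch, not the project's implementation, assuming the same get_env_variable helper.

from pathlib import Path

def make_results_directory_artic_alt(flowcell_id, task_id, allow_create=True):
    # Same final layout: <MT_ARTIC_RESULTS_DIR>/artic/Temp_results/<flowcell>_<task>_artic
    base = Path(get_env_variable("MT_ARTIC_RESULTS_DIR"))
    results_dir = base / "artic" / "Temp_results" / f"{flowcell_id}_{task_id}_artic"
    if allow_create:
        # parents=True creates artic/ and Temp_results/ as needed;
        # exist_ok=True makes repeated calls (and races) harmless.
        results_dir.mkdir(parents=True, exist_ok=True)
    return results_dir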
Example #7
    def active(self):
        """
        Determine whether this flowcell has been active in the 48 hours
        :return:
        """
        # time deltas are pythons measurement of time difference
        delta = datetime.timedelta(
            days=int(get_env_variable("MT_TIME_UNTIL_INACTIVE")))
        # If the current time minus two days is more than the last activity date, there has been no activity in 48 hours
        if (datetime.datetime.now(datetime.timezone.utc) -
                delta) > self.last_activity_date:

            return False
        # Activity in the last 48 hours

        return True
Example #8
def create_archive_tasks():
    """
    Create archive tasks for flowcells whose last activity was more than MT_TIME_UNTIL_ARCHIVE days ago; archiving is disabled when the value is 0 or -1
    Returns
    -------

    """
    time_until_inactive = int(get_env_variable("MT_TIME_UNTIL_ARCHIVE"))
    # Archiving is disabled when this is set to 0 or -1
    if time_until_inactive <= 0:
        return
    delta = datetime.timedelta(days=int(time_until_inactive))
    for flowcell in Flowcell.objects.filter(archived=False):
        if flowcell.last_activity_date < datetime.datetime.now(
                datetime.timezone.utc) - delta:
            jm, created = JobMaster.objects.get_or_create(job_type_id=18,
                                                          flowcell=flowcell)
            if created:
                logger.info(f"Marking flowcell: {flowcell} for archiving.")
Example #9
def clear_artic_data(job_master):
    """
    Clear the artic files from the system drive
    Parameters
    ----------
    job_master: reads.models.JobMaster
        The job master ORM object of the track artic job
    Returns
    -------
    exit_code: int
        0 if successful, 1 if not
    """
    environmental_results_directory = get_env_variable("MT_ARTIC_RESULTS_DIR")
    results_dir = Path(
        f"{environmental_results_directory}/artic/Temp_results/{job_master.flowcell.id}_{job_master.id}_artic"
    )

    if not results_dir.exists():
        return 1
    else:
        # remove the whole results directory for this artic task
        rmtree(results_dir, onerror=on_delete_error)
        return 0
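on_delete_error is referenced above but not shown; a minimal sketch of what such an rmtree error handler might look like. Its behaviour here (log and continue) is an assumption, not the project's actual helper.

import logging

logger = logging.getLogger(__name__)

def on_delete_error(func, path, exc_info):
    # Hypothetical handler: rmtree calls this with the failing function, the
    # path it failed on, and sys.exc_info(); log the failure and keep going.
    logger.warning(f"Could not remove {path} ({func.__name__}): {exc_info[1]}")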
Example #10
    def ready(self):
        """
        Hook that is called when the Artic app is initialised and ready. The code below runs once at start-up.
        Returns
        -------

        """
        from artic.utils import check_artic_static_exists, update_pangolin
        check_artic_static_exists()
        from minotourapp.utils import get_env_variable
        MT_VoC_PATH = get_env_variable("MT_VoC_PATH")
        if Path(f"{MT_VoC_PATH}").exists():
            print("VoC Path Found")
            ##Check if
            # cloned_repo = Repo.clone(os.path.join("https://github.com/phe-genomics/variant_definitions", Path(f"{MT_VoC_PATH}")))
            if Path(f"{MT_VoC_PATH}/variant_definitions/").exists():
                # already cloned so....
                print("Updating path")
                try:
                    repo = Repo(Path(f"{MT_VoC_PATH}/variant_definitions/"))
                    print(repo.remotes.origin.pull())
                    pass
                except git.GitCommandError as e:
                    print(
                        f"Git error, presumably being updated simultaneously {repr(e)}"
                    )
            else:
                try:
                    cloned_repo = Repo.clone_from(
                        "https://github.com/phe-genomics/variant_definitions",
                        f"{MT_VoC_PATH}/variant_definitions/",
                    )
                except git.GitCommandError as e:
                    print(
                        f"Git error, presumably being updated simultaneously {repr(e)}"
                    )
        update_pangolin()
Example #11
"""
Create single redis instance and import it around where it is needed
"""
import redis

from minotourapp.utils import get_env_variable

if "localhost" in get_env_variable("MT_DJANGO_REDIS_URL"):
    redis_instance = redis.StrictRedis(
        host="127.0.0.1", port=6379, db=1, decode_responses=True
    )
else:
    # Connect over the unix socket given after the scheme in MT_DJANGO_REDIS_URL
    redis_instance = redis.StrictRedis(
        unix_socket_path=get_env_variable("MT_DJANGO_REDIS_URL").split("//")[-1],
        decode_responses=True,
    )
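A short usage sketch mirroring how this instance is used elsewhere (see the save_reads_bulk example later); the import path is an assumption.

import json

# Assumed import path for the module above.
from minotourapp.redis_instance import redis_instance

# save_reads_bulk pushes JSON-encoded read batches into the "reads" set
# and throttles on its cardinality.
redis_instance.sadd("reads", json.dumps([{"read_id": "example_read"}]))
print(redis_instance.scard("reads"))  # number of batches waiting to be processed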
Example #12
def get_artic_barcode_metadata_html(request):
    """

    Parameters
    ----------
    request: rest_framework.request.Request
        Request body, params: the flowcell PK and selected barcode

    Returns
    -------

    """
    flowcell_id = request.GET.get("flowcellId", None)
    selected_barcode = request.GET.get("selectedBarcode", None)
    if not flowcell_id or not selected_barcode:
        return Response(
            "No flowcell ID or barcode provided.", status=status.HTTP_400_BAD_REQUEST
        )
    # see if we have a command waiting to be run
    artic_command_jm = JobMaster.objects.filter(
        job_type_id=17, barcode__name=selected_barcode, flowcell_id=flowcell_id
    ).exists()
    orm_object = ArticBarcodeMetadata.objects.filter(
        flowcell_id=flowcell_id, barcode__name=selected_barcode
    ).last()
    if not orm_object:
        return Response("No data found", status=status.HTTP_404_NOT_FOUND)
    # On the first iteration we may not have FlowcellSummaryBarcode entries yet, so calculate the percentage here
    if not orm_object.percentage_of_reads_in_barcode:
        try:
            barcode_numbers = FlowcellSummaryBarcode.objects.get(
                flowcell_id=flowcell_id, barcode_name=selected_barcode
            )
            all_numbers = FlowcellSummaryBarcode.objects.filter(
                flowcell_id=flowcell_id, barcode_name="All reads"
            ).values_list("read_count", flat=True)
            total_reads = sum(all_numbers)
            orm_object.percentage_of_reads_in_barcode = (
                barcode_numbers.read_count / total_reads * 100
            )
        except FlowcellSummaryBarcode.DoesNotExist as e:
            orm_object.percentage_of_reads_in_barcode = 0
    # [[new key, old key]]
    new_key_names = [
        ["Avg. Coverage", "average_coverage"],
        ["Var. Coverage", "variance_coverage"],
        ["Min. Coverage", "minimum_coverage"],
        ["Max. Coverage", "maximum_coverage"],
        ["% reads in run", "percentage_of_reads_in_barcode"],
        ["Has Finished", "has_finished"],
        ["Has Sufficient Coverage", "has_sufficient_coverage"],
    ]
    results_files = [
        ("Consensus sequence", "consensus"),
        ("Box plot", "box-plot"),
        ("Bar plot", "bar-plot"),
        ("Fail VCF", "fail-vcf"),
        ("Pass VCF", "pass-vcf"),
        ("Input fasta", "input-fasta"),
        ("Pangolin lineages CSV", "pangolin-lineages"),
        ("Sorted Bam", "sorted-bam"),
        ("Sorted Bam index", "sorted-bam-bai"),
    ]
    old_dict = orm_object.__dict__
    context_dict = {key[0]: old_dict[key[1]] for key in new_key_names}
    context_dict["hidden_barcode_pk"] = orm_object.barcode.id
    context_dict["hidden_barcode_name"] = orm_object.barcode.name
    context_dict["hidden_flowcell_id"] = flowcell_id
    context_dict["hidden_job_master_id"] = orm_object.job_master.id
    context_dict["hidden_results_files"] = results_files
    context_dict["hidden_has_finished"] = old_dict["has_finished"]
    context_dict["hidden_has_suff"] = old_dict["has_sufficient_coverage"]
    context_dict["hidden_marked_for_rerun"] = old_dict["marked_for_rerun"]
    context_dict["hidden_destroy_evidence"] = bool(
        int(get_env_variable("MT_DESTROY_ARTIC_EVIDENCE"))
    )
    context_dict["hidden_triggered_by_cleanup"] = (
        orm_object.has_finished and not orm_object.has_sufficient_coverage
    )
    context_dict["hidden_has_command_job_master"] = artic_command_jm
    (
        flowcell,
        artic_results_path,
        artic_task_id,
        _,
    ) = quick_get_artic_results_directory(flowcell_id)
    fastq_path = artic_results_path / selected_barcode / f"{selected_barcode}.fastq"
    fastq_path_gz = fastq_path.with_suffix(".fastq.gz")
    context_dict["hidden_has_fastq"] = fastq_path.exists() or fastq_path_gz.exists()
    if context_dict["hidden_has_finished"]:
        csv_path = artic_results_path / selected_barcode / "lineage_report.csv.gz"
        if csv_path.exists():
            df = pd.read_csv(
                artic_results_path / selected_barcode / "lineage_report.csv.gz"
            )
            html_string = df.T.to_html(classes="table table-striped", border=0)
            context_dict["hidden_html_string"] = html_string
    return render(
        request,
        "artic-barcode-metadata.html",
        context={"artic_barcode_metadata": context_dict},
    )
Example #13
import os
import tempfile

from celery.schedules import crontab
from kombu import Exchange, Queue

# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
from minotourapp.utils import get_env_variable

BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/1.11/howto/deployment/checklist/

# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = get_env_variable("MT_SECRET_KEY")

# DEBUG = bool(os.environ.get('MT_DJANGO_DEBUG', True))
ALLOWED_HOSTS = [
    '*',
]

# Application definition

INSTALLED_APPS = [
    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
Example #14
def run_centrifuge(flowcell_job_id, streamed_reads=None):
    """
    Run the metagenomics subprocess command, returning the data from it as a DataFrame.
    Parameters
    ----------
    flowcell_job_id: int
        The primary key of the JobMaster entry for this metagenomics task
    streamed_reads: list of dict
        A list of dictionaries containing read information

    Returns
    -------
    pd.core.frame.DataFrame, int, int, int, pandas.core.frame.DataFrame, int, int
        Dataframe of metagenomics results, total output lines from metagenomics, last read primary key,
        total count of reads analysed, dataframe of any reads that identified as targets,
         number of reads with classifications, number of reads without classifications

    """
    # The JobMaster object
    task = JobMaster.objects.get(pk=flowcell_job_id)
    # The flowcell the reads are from
    flowcell = task.flowcell
    avg_read_length = int(flowcell.average_read_length)
    if avg_read_length == 0:
        logger.error(
            "Average read length is zero; defaulting to 1000, but this is an error."
        )
        avg_read_length = 1000
    if not streamed_reads and not isinstance(streamed_reads, list):
        read_count, last_read, fasta_df_barcode = get_fastq_df(
            flowcell_pk=int(flowcell.id),
            desired_yield=50,
            avg_read_len=avg_read_length,
            task=task,
        )
    else:
        last_read = task.last_read
        fasta_df_barcode = pd.DataFrame(streamed_reads)
        if not fasta_df_barcode.empty:
            fasta_df_barcode = fasta_df_barcode.rename(columns={
                "type": "read_type_id",
                "barcode": "barcode_id"
            })
            fasta_df_barcode["type__name"] = fasta_df_barcode["read_type_id"]
        read_count = fasta_df_barcode.shape[0]
    if fasta_df_barcode.empty:
        return pd.DataFrame(), None, None, None, None, 0, 0
    logger.debug("Flowcell id: {} - number of reads found {}".format(
        flowcell.id, read_count))
    # Create a fastq string to pass to Centrifuge
    fasta_df_barcode["fasta"] = (">read_id=" + fasta_df_barcode["read_id"] +
                                 ",barcode=" +
                                 fasta_df_barcode["barcode_name"] + "\n" +
                                 fasta_df_barcode["sequence"])
    fastqs_data = "\n".join(list(fasta_df_barcode["fasta"]))
    logger.info("Flowcell id: {} - Loading index and Centrifuging".format(
        flowcell.id))
    # Write the generated fastq file to stdin, passing it to the command
    # Use Popen to run the metagenomics command
    # The path to the metagenomics executable
    centrifuge_path = get_env_variable("MT_CENTRIFUGE")
    # The path to the Centrifuge Index
    index_path = get_env_variable("MT_CENTRIFUGE_INDEX")
    # The command to run metagenomics
    cmd = "perl " + centrifuge_path + " -f --mm -k 3 -x " + index_path + " -"
    try:
        out, err = subprocess.Popen(
            cmd.split(),
            preexec_fn=lambda: os.nice(-10),
            stdout=subprocess.PIPE,
            stdin=subprocess.PIPE,
            stderr=subprocess.PIPE,
        ).communicate(input=str.encode(fastqs_data))
    except subprocess.SubprocessError as e:
        logger.warning(f"{e}, running with standard niceness index.")
        out, err = subprocess.Popen(
            cmd.split(),
            stdout=subprocess.PIPE,
            stdin=subprocess.PIPE,
            stderr=subprocess.PIPE,
        ).communicate(input=str.encode(fastqs_data))
    # The standard error
    # out is a bytestring so it needs decoding
    if not out:
        logger.info(
            "Flowcell id: {} - No reads found or no metagenomics output."
            " Check above for error".format(flowcell.id))
        task.running = False
        task.save()
        return None
    centrifuge_output = out.decode()
    # total number of lines of metagenomics output dealt with
    total_centrifuge_output = centrifuge_output.count("\n") - 1
    logger.info(
        "Flowcell id: {} - number of metagenomics output lines is {}".format(
            flowcell.id, total_centrifuge_output))
    # output fields is the column headers for the pandas data frame
    output_fields = ["readID", "seqID", "taxID", "numMatches"]
    # create the DataFrame from the output
    df = pd.read_csv(StringIO(centrifuge_output),
                     sep="\t",
                     usecols=output_fields)
    # split out the barcode_name from the readID string
    df = split_read_id_and_barcodes(df)
    individual_reads_classified = np.unique(df["readID"].values).size
    targets_df = separate_target_cent_output(df, task, fasta_df_barcode)
    # The number of reads we have any form of classification for
    reads_classified = np.unique(df[df["tax_id"].ne(0)]["read_id"].values).size
    # The number of reads we have completely failed to classify
    reads_unclassified = np.unique(
        df[df["tax_id"].eq(0)]["read_id"].values).size  # save the values
    # Get the metadata object. Contains the start time, end time and runtime of the task
    metadata, created = Metadata.objects.get_or_create(task=task)
    return (
        df,
        individual_reads_classified,
        read_count,
        last_read,
        targets_df,
        reads_classified,
        reads_unclassified,
    )
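For context, a minimal sketch of the tab-separated classification output that the DataFrame above is built from; the rows are invented, only the four column names come from output_fields.

from io import StringIO

import pandas as pd

# Invented example rows in the same four columns the code selects; the readID
# mirrors the ">read_id=...,barcode=..." fasta headers built before running centrifuge.
example_output = (
    "readID\tseqID\ttaxID\tnumMatches\n"
    "read_id=abc,barcode=barcode01\tNC_000001\t9606\t1\n"
    "read_id=def,barcode=barcode01\tunclassified\t0\t1\n"
)
example_df = pd.read_csv(StringIO(example_output), sep="\t",
                         usecols=["readID", "seqID", "taxID", "numMatches"])
# split_read_id_and_barcodes then splits readID into read_id and barcode_name columns.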
Example #15
def save_reads_bulk(reads):
    """
    Save reads into redis after they arrive from minFQ, and to the database for tasks
     to be run on them later.
    Parameters
    ----------
    reads: list of dict
        A list of reads in dictionary form sent from minFQ
    Returns
    -------
    None

    """
    flowcell_dict = {}
    reads_list = []
    run_dict = {}
    for read in reads:
        run_pk = read.get("run", -1)
        if run_pk not in run_dict and run_pk != -1:
            run = Run.objects.get(pk=run_pk)
            run_dict[run_pk] = run
            read["run_id"] = run.id
            read["flowcell_id"] = run.flowcell.id
        else:
            read["run_id"] = run_dict[run_pk].id
            read["flowcell_id"] = run_dict[run_pk].flowcell.id
        if read["flowcell_id"] not in flowcell_dict:
            f = Flowcell.objects.get(pk=read["flowcell_id"])
            if f.archived:
                f.archived = False
                f.last_activity_date = datetime.now(timezone.utc)
                f.save()
            flowcell_dict[read["flowcell_id"]] = 1
        fastq_read = FastqRead(
            read_id=read["read_id"],
            read=read["read"],
            channel=read["channel"],
            barcode_id=read["barcode"],
            rejected_barcode_id=read["rejected_barcode"],
            barcode_name=read["barcode_name"],
            sequence_length=read["sequence_length"],
            quality_average=read["quality_average"],
            sequence=read["sequence"],
            quality=read["quality"],
            is_pass=read["is_pass"],
            start_time=read["start_time"],
            run_id=read["run_id"],
            flowcell_id=read["flowcell_id"],
            type_id=read["type"],
            fastqfile_id=read["fastq_file"],
        )
        reads_list.append(fastq_read)
    # Save reads to redis for later processing of base-called data summaries.
    reads_as_json = json.dumps(reads)
    # Throttle: pause until the number of queued read batches drops back under the limit before adding more
    count = redis_instance.scard("reads")
    while count > 40:
        time.sleep(5)
        count = redis_instance.scard("reads")
    redis_instance.sadd("reads", reads_as_json)
    # Bulk create the entries
    skip_sequence_saving = int(get_env_variable("MT_SKIP_SAVING_SEQUENCE"))
    if not skip_sequence_saving:
        FastqRead.objects.bulk_create(reads_list, batch_size=1000)
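A hedged sketch of the consumer side of the "reads" set; only the set name, the JSON encoding and the redis_instance object come from the code above, the task itself is hypothetical.

import json

def process_queued_read_batches():
    # Hypothetical consumer: drain one JSON-encoded batch at a time from the "reads" set.
    batch = redis_instance.spop("reads")
    while batch is not None:
        reads = json.loads(batch)
        # ... compute base-called data summaries for `reads` here ...
        batch = redis_instance.spop("reads")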
Example #16
def map_target_reads(task, path_to_reference, target_df, to_save_df, target_region_df):
    """
    Map the reads
    Parameters
    ----------
    task: reads.models.JobMaster
        The django ORM object of this task
    path_to_reference: pathlib.PosixPath
        The path to the concatenated, Gzipped reference file for all references in target set
    target_df: pd.core.frame.DataFrame
        Target reads dataframe, containing read sequence
    to_save_df: pd.core.frame.DataFrame
        The finalised metagenomics output data, with num_matches, lineages etc.
    target_region_df: pd.core.frame.DataFrame
        The regions defined in the GFF file that contain virulence areas

    Returns
    -------
    pd.core.frame.DataFrame
        Dataframe of minimap2 output from mapped target reads and num_mapped
    """
    minimap2_executable_path = get_env_variable("MT_MINIMAP2")
    cmd = f"{minimap2_executable_path} -x map-ont {path_to_reference} -"
    # target_df = pd.merge(target_df, to_save_df, on="tax_id")
    target_df["unique"] = np.where(target_df["numMatches"] == 1, 1, 0)
    gb = target_df.groupby(["tax_id", "barcode_name"])
    target_df.set_index(["tax_id", "barcode_name"], inplace=True)
    target_df["num_matches"] = gb.size()
    target_df["sum_unique"] = gb["unique"].sum()
    target_df.reset_index(inplace=True)
    taxid_list = np.unique(target_df["tax_id"].values)
    # use ete3's NCBITaxa to translate the tax ids in the list to species names
    ncbi = NCBITaxa()
    taxid_2_name = ncbi.get_taxid_translator(taxid_list)
    target_df["name"] = target_df["tax_id"].map(taxid_2_name)
    fasta_sequence_to_map = "\n".join(
        (">" + target_df["read_id"] + "\n" + target_df["sequence"]).values.tolist()
    )
    start_metagenomics_mapping_task(task.flowcell.id, task.target_set, target_df)
    # TODO merge the mapping task into this function rather than the separate mapping task creation above
    process = subprocess.Popen(
        cmd.split(),
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    out, err = process.communicate(input=fasta_sequence_to_map.encode())
    if not out:
        logger.info("No reads mapped.")
        return pd.DataFrame()
    target_set_plasmids_ref_pks = MappingTarget.objects.filter(
        target_set=task.target_set
    ).values_list("reference_id", flat=True)
    reference_contig_names = (
        ReferenceInfo.objects.filter(pk__in=target_set_plasmids_ref_pks)
        .values_list("reference_lines__line_name", "name")
        .distinct()
    )
    contig_to_reference_dict = {
        contig_name: ref_species for contig_name, ref_species in reference_contig_names
    }
    map_out_df = pd.read_csv(StringIO(out.decode()), sep="\t", header=None)
    map_out_df.rename(
        columns={
            0: "read_id",
            1: "query_seq_len",
            2: "query_start",
            3: "query_end",
            4: "rel_strand",
            5: "target_seq_name",
            6: "target_seq_length",
            7: "target_start",
            8: "target_end",
            9: "num_residue_matches",
            10: "alignment_block_length",
            11: "mapping_qual",
        },
        inplace=True,
    )
    map_out_df["name"] = map_out_df["target_seq_name"].map(contig_to_reference_dict)
    map_out_df["num_residue_matches"] = map_out_df["num_residue_matches"].astype(
        np.int64
    )
    # Filter out low quality mappings and those with too few matching residues
    map_out_df = map_out_df.query("mapping_qual >= 40 & num_residue_matches >= 200")
    if map_out_df.empty:
        logger.info("Insufficient quality mappings.")
        return pd.DataFrame()
    # See whether the start or end of a mapping falls into the region
    map_out_df["read_is_red"] = 0
    map_out_df["read_is_red"] += target_region_df.apply(
        falls_in_region, args=(map_out_df,), axis=1
    )
    map_out_df["read_is_red"] = np.where(map_out_df["read_is_red"], 1, 0)
    map_out_df["name"] = map_out_df["name"].str.replace("_", " ")
    map_out_df = pd.merge(map_out_df, target_df, how="left", on=["read_id"])
    map_out_df["barcode_name"] = map_out_df["read_id"].map(
        target_df.set_index("read_id")["barcode_name"].loc[
            ~target_df.set_index("read_id")["barcode_name"].index.duplicated()
        ]
    )
    map_out_df = map_out_df.fillna(0)
    map_out_df["barcode_name"] = np.where(
        map_out_df["barcode_name"] == "No_barcode",
        "All reads",
        map_out_df["barcode_name"],
    )
    gb = map_out_df.groupby(["barcode_name", "name_y"])
    map_out_df.set_index(["barcode_name", "name_y"], inplace=True)
    map_out_df["num_mapped"] = gb.size()
    map_out_df["num_red_reads"] = gb["read_is_red"].sum()
    map_out_df = map_out_df.loc[~map_out_df.index.duplicated()]
    map_out_df.reset_index(inplace=True)
    return map_out_df
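The numbered columns renamed above are the first twelve fields of minimap2's PAF output; a short illustrative parse with a single invented line (names and coordinates are made up).

from io import StringIO

import pandas as pd

# Invented PAF-style line: query name/length/start/end, strand, target
# name/length/start/end, residue matches, alignment block length, MAPQ.
paf_line = ("read_1\t1200\t10\t1150\t+\tNC_045512.2\t29903\t21563\t22703"
            "\t1100\t1140\t60\n")
example_df = pd.read_csv(StringIO(paf_line), sep="\t", header=None)
example_df.rename(
    columns={0: "read_id", 1: "query_seq_len", 2: "query_start", 3: "query_end",
             4: "rel_strand", 5: "target_seq_name", 6: "target_seq_length",
             7: "target_start", 8: "target_end", 9: "num_residue_matches",
             10: "alignment_block_length", 11: "mapping_qual"},
    inplace=True,
)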
Example #17
    def handle(self, *args, **options):
        """
            Handle the execution of the command
            :param args: The arguments, whether they are present or not
            :param options: The values that have been added to the arguments
            :return:
            """
        try:
            reference_files = []
            # These should be lowercase and include the '.'
            endings = {
                ".fna",
                ".fa",
                ".fasta",
                ".fsa",
            }
            if not options["key"]:
                print(
                    "To add references, your minotour api_key is required. "
                    "This can be found on the profile page of your account."
                )
                return
            for file_or_directory in options["reference"]:
                reference_files.extend(find_files_of_type(file_or_directory, endings))
            if not reference_files:
                raise FileNotFoundError(
                    f"No files found at specified location! Endings included are {pformat(endings)}"
                )
            private = False
            # If we want private references
            user = Token.objects.get(key=options["key"]).user
            if options["private"]:
                private = True
            # remove none from reference_files
            reference_files = list(filter(None.__ne__, reference_files))
            previous_ref = set(
                ReferenceInfo.objects.filter(private=False)
                .values_list("name", flat=True)
                .distinct()
            )
            # If it's private check we aren't multiplying an already existing private reference
            if options["private"]:
                previous_ref = previous_ref.union(
                    set(
                        ReferenceInfo.objects.filter(private=True, uploader=user)
                        .values_list("name", flat=True)
                        .distinct()
                    )
                )
            for ref_file in reference_files:
                # Get the species name of this reference, no file suffixes
                ref_file_stem = str(ref_file.stem).partition(".")[0]
                print("Processing file: {}".format(ref_file.name))
                if ref_file_stem in previous_ref:
                    print(
                        "A reference already exists for this species name: {}".format(
                            ref_file_stem
                        )
                    )
                    print(
                        "If you believe this is in error, or want to add this reference anyway,"
                        " please change the filename"
                    )
                    continue
                duplicated, sha256_hash = validate_reference_checks(ref_file, user)
                if duplicated:
                    return
                # choose the pyfastx parser (fastq or fasta)
                handle = (
                    pyfastx.Fastq
                    if set(ref_file.suffixes).intersection({".qz", ".gzip"})
                    else pyfastx.Fasta
                )
                # Check that the minimap2 index location folder exists
                index_dir_path = (
                    MEDIA_ROOT
                    if get_env_variable("MT_MINIMAP2_INDEX_DIR").isdigit()
                    else get_env_variable("MT_MINIMAP2_INDEX_DIR")
                )
                minimap2_index_path = f"{index_dir_path}/minimap2_indexes/"
                if not Path(minimap2_index_path).exists():
                    raise FileNotFoundError(
                        f"Minimap2 index directory does not exist at {minimap2_index_path}. Please create it!"
                    )
                # build minimap2 index
                minimap2_index_path += f"{ref_file.stem}.mmi"
                print("Building minimap2 index, please wait.....")
                out, err = subprocess.Popen(
                    f"{MINIMAP2} -d {minimap2_index_path}"
                    f" {ref_file.as_posix()}".split(),
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                ).communicate()
                print("Minimap2 index building output - ")
                print(out.decode())
                print("\n")
                print(err.decode())
                print("Built index. Parsing file...")

                # Individual lines (I.E Chromosomes in the reference)
                fa = handle(ref_file.as_posix())
                # Create the Reference info entry in the database
                ref_info, created = ReferenceInfo.objects.update_or_create(
                    name=ref_file_stem,
                    file_location=ref_file.resolve().as_posix(),
                    file_name=ref_file.name,
                    length=fa.size,
                    private=private,
                    uploader=user,
                    minimap2_index_file_location=minimap2_index_path,
                    sha256_checksum=sha256_hash,
                )
                # Create a Reference line entry for each "Chromosome/line"
                for contig in fa:
                    ReferenceLine.objects.create(
                        reference=ref_info,
                        line_name=contig.name,
                        chromosome_length=len(contig),
                    )
                print("Successfully handled file.")

        except Exception as e:
            raise CommandError(e)
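A hedged example of invoking this management command programmatically; the command name add_references is an assumption, and the option names mirror options["reference"], options["key"] and options["private"] used above.

from django.core.management import call_command

# Hypothetical invocation; the command name and the paths are assumptions.
call_command(
    "add_references",
    reference=["/data/references/"],
    key="your-minotour-api-key",
    private=True,
)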