Example #1
    def __init__(self,
                 pattern="**/summary.json",
                 output_filename=None,
                 verbose=True,
                 **kargs):
        super().__init__()

        from sequana import logger
        logger.level = "INFO"
        if verbose is False:
            logger.level = "WARNING"

        logger.info(
            "Sequana Summary is still a work in progress and has been "
            "tested with the quality_control pipeline only for now.")
        self.title = "Sequana multiple summary"
        self.devtools = DevTools()

        self.filenames = list(glob.iglob(pattern, recursive=True))
        self.summaries = [ReadSummary(filename) for filename in self.filenames]
        self.projects = [
            ReadSummary(filename).data['project']
            for filename in self.filenames
        ]
        self.create_report_content()
        self.create_html(output_filename)
Example #2
    def switch_header_to_gi(self, acc):
        """Kraken will only accept the GI from NCBI so we need to convert
        the ENA accession to GI numbers"""

        # Accessions may have a version suffix (.1, .2), hence this try/except:
        # first without the version, then with it.
        # Note also that some accessions differ from an earlier version.
        # For instance, AF525933 is in the virus.txt list from ENA but
        # the updated accession is AH012103, showing that the list and DB
        # may not be fully synchronised.
        # http://www.ebi.ac.uk/ena/data/search?query=AF525933
        # In such a case, the results attribute will be missing that accession,
        # which needs to be searched for specifically. We cannot know its name
        # before downloading the fasta.
        if acc in self.results.keys():
            res = self.results[acc]
        else:
            try:
                res = self.results[acc.split(".")[0]]
            except KeyError:
                logger.warning(
                    "\nUnknown accession (%s). May be an updated version. Checking..."
                    % acc)
                res = self.ena_id_to_gi_number([acc])
                self.results.update(res)
                res = res[acc]
                logger.info('Found %s using GI number' % acc)
        return ">" + res['identifier'] + " " + res['comment']
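
For reference, a standalone sketch of the version-stripping fallback used above, with a plain dictionary standing in for the real ENA results (the accession and record are made up)::

    # Hypothetical cache mapping version-less accessions to records.
    results = {"AF525933": {"identifier": "12345", "comment": "example virus"}}

    def lookup(acc):
        # try the accession as given, then without its version suffix (.1, .2, ...)
        if acc in results:
            return results[acc]
        return results[acc.split(".")[0]]

    print(lookup("AF525933.1"))   # falls back to the version-less key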
Example #3
    def random_selection(self,
                         output_filename,
                         nreads=None,
                         expected_coverage=None,
                         reference_length=None):
        """Select random reads

        :param nreads: number of reads to select randomly. Must be less than
            number of available reads in the orignal file.
        :param expected_coverage:
        :param reference_length:

        of expected_coverage and reference_length provided, nreads is replaced
        automatically.
        """
        assert output_filename != self.filename, \
            "output filename should be different from the input filename"
        self.reset()

        if expected_coverage and reference_length:
            mu = self.stats['mean']
            nreads = int(expected_coverage * reference_length / mu)

        assert nreads < len(self), \
            "nreads parameter is larger than the actual number of reads"
        # use a set for fast membership tests in the loop below
        selector = set(random.sample(range(len(self)), nreads))
        logger.info("Creating a pacbio BAM file with {} reads".format(nreads))

        with pysam.AlignmentFile(output_filename, "wb",
                                 template=self.data) as fh:
            for i, read in enumerate(self.data):
                if i in selector:
                    fh.write(read)
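
The nreads computation follows the usual coverage relation (coverage = nreads x mean read length / reference length). A self-contained sketch with made-up numbers::

    import random

    mean_read_length = 8000          # assumed mean read length (bp)
    reference_length = 4_600_000     # assumed genome size (bp)
    expected_coverage = 10

    nreads = int(expected_coverage * reference_length / mean_read_length)
    print(nreads)                    # 5750 reads needed for ~10X coverage

    # pick that many distinct read indices at random, as random_selection does
    selector = set(random.sample(range(100_000), nreads))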
Example #4
    def _get_df(self):
        if self._df is None:
            self.reset()
            N = 0

            all_results = []
            for read in self.data:
                res = []
                # count reads
                N += 1
                if (N % 10000) == 0:
                    logger.info("Read %d sequences" % N)
                # res[0] = read length
                res.append(read.query_length)
                # res[1] = GC content
                c = collections.Counter(read.query_sequence)
                res.append(100 * (c['g'] + c['G'] + c['c'] + c['C']) /
                           float(sum(c.values())))

                # aggregate results
                all_results.append(res)

            self._df = pd.DataFrame(all_results,
                                    columns=['read_length', 'GC_content'])
            self.reset()
        return self._df
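
The GC content computation used above, shown on a short hand-written sequence::

    import collections

    sequence = "ACGTGGCCaacg"
    c = collections.Counter(sequence)
    gc = 100 * (c['g'] + c['G'] + c['c'] + c['C']) / float(sum(c.values()))
    print(round(gc, 1))   # 66.7 (8 G/C bases out of 12)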
Example #5
def get_most_probable_strand_consensus(rnaseq_folder):
    """From a sequana rna-seq run folder get the most probable strand, based on the
    frequecies of counts assigned with '0', '1' or '2' type strandness
    (featureCounts nomenclature)
    """

    rnaseq_folder = Path(rnaseq_folder)
    sample_folders = list(
        set([x.parent for x in rnaseq_folder.glob("*/feature_counts_[012]")]))

    df = pd.concat([
        get_most_probable_strand(sample_folder)
        for sample_folder in sample_folders
    ])

    logger.info("Strandness probability report:")
    logger.info(df)

    probable_strands = df.loc[:, "strand"].unique()

    if len(probable_strands) == 1:
        return probable_strands[0]
    else:
        raise IOError(
            f"No consensus on most probable strand. Could be: {probable_strands}"
        )
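
A minimal sketch of the consensus check on a hand-built dataframe (same column name as in the snippet above)::

    import pandas as pd

    df = pd.DataFrame({"strand": ["1", "1", "1"]},
                      index=["sampleA", "sampleB", "sampleC"])

    probable_strands = df.loc[:, "strand"].unique()
    if len(probable_strands) == 1:
        print(probable_strands[0])        # consensus: "1"
    else:
        raise IOError(f"No consensus on most probable strand. Could be: {probable_strands}")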
Example #6
    def _enrichr(self, category, background=None, verbose=True):

        if background is None:
            background = self.background

        if isinstance(category, list):
            gene_list = category
        else:
            assert category in ['up', 'down', 'all']
            gene_list = list(self.rnadiff.gene_lists[category])

        if self.mapper is not None:
            logger.info("Input gene list of {} ids".format(len(gene_list)))
            #gene_list = [x.replace("gene:", "") for x in gene_list]
            identifiers = self.mapper.loc[gene_list]['name'].drop_duplicates(
            ).values
            logger.info("Mapped gene list of {} ids".format(len(identifiers)))
            gene_list = list(identifiers)

        enr = gseapy.enrichr(gene_list=gene_list,
                             gene_sets=self.gene_sets,
                             verbose=verbose,
                             background=background,
                             outdir="test",
                             no_plot=True)

        return enr
Example #7
    def save_significant_pathways(self,
                                  mode,
                                  cutoff=0.05,
                                  nmax=20,
                                  background=None):  #pragma: no cover
        """mode should be up, down or all"""

        if background is None:
            background = self.background

        # select the relevant pathways
        df = self._enrichr(mode, background).results
        df = self._get_final_df(df, cutoff=cutoff, nmax=nmax)
        logger.warning("Found {} pathways to save".format(len(df)))
        if len(df) == nmax:
            logger.warning("Restricted pathways to {}".format(nmax))

        logger.info("saving {} deregulated pathways".format(len(df)))

        summaries = {}
        # save them
        for ID in df['Term']:
            summary = self.save_pathway(ID,
                                        filename="{}_{}.png".format(ID, mode))
            summaries[ID] = summary
        return summaries
Example #8
    def __init__(self, filename, verbose=False):
        if filename.endswith(".gz"):
            raise ValueError("Must be decompressed.")
        self._fasta = FastxFile(filename)
        self.filename = filename
        logger.info("Reading input fasta file...please wait")
        # count the reads without keeping them all in memory
        self._N = sum(1 for x in FastxFile(filename))
Example #9
    def _get_df(self):
        if self._df is None:
            self.reset()
            N = 0

            all_results = []
            for read in self.data:
                res = []
                # count reads
                N += 1
                if (N % 10000) == 0:
                    logger.info("Read %d sequences" % N)
                # res[0] = read length
                res.append(read.query_length)
                # res[1] = GC content
                c = collections.Counter(read.query_sequence)
                res.append(100 * (c['g'] + c['G'] + c['c'] + c['C']) /
                           float(sum(c.values())))

                # aggregate results
                all_results.append(res)

            self._df = pd.DataFrame(all_results,
                                    columns=['read_length', 'GC_content'])
            self.reset()
        return self._df
Example #10
    def _get_data(self):
        # return list of lists
        # each list is made of 3 values: mapq, length, concordance
        from sequana import Cigar
        data = []
        self.reset()
        count = 0

        for align in self.data:
            mapq = align.mapq
            length = align.rlen
            if self.method in ["blasr", "minimap2"]:
                this = Cigar(align.cigarstring).stats()
                S, D, I, M = this[4], this[2], this[1], this[0]
                concordance = 1 - (D + I + S) / (D + I + M + S)
            else:
                this = align.get_cigar_stats()[0]
                error = this[-1]  # supposed to be I + D + X
                total = this[-1] + this[0]
                if total: concordance = 1 - (error) / (total)
                else: concordance = 0
            data.append([mapq, length, concordance])
            if count % 10000 == 0:
                logger.info("%s" % count)
            count += 1
        return data
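
The concordance formula used for the blasr/minimap2 branch, applied to made-up CIGAR counts (M matches, I insertions, D deletions, S soft-clips)::

    M, I, D, S = 9500, 200, 250, 50   # hypothetical counts from a CIGAR string
    concordance = 1 - (D + I + S) / (D + I + M + S)
    print(round(concordance, 3))      # 0.95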
Example #11
    def _parse_data(self):
        taxonomy = {}

        logger.info("Reading kraken data")
        columns = ["status", "taxon", "length"]
        # we select only columns 0, 2, 3 to save memory, which is required on very
        # large files
        try:
            # each call to concat in the for loop below
            # will take time and increase with chunk position.
            # for 15M reads, this has a big cost. So chunksize set to 1M
            # is better than 1000 and still reasonable in memory
            reader = pd.read_csv(self.filename,
                                 sep="\t",
                                 header=None,
                                 usecols=[0, 2, 3],
                                 chunksize=1000000)
        except pd.errors.ParserError:
            # This happens with only_classified_output when no classified read
            # was found. Not implemented yet; the intended handling would be:
            #     self.unclassified = N  # size of the input data set
            #     self.classified = 0
            #     self._df = pd.DataFrame([], columns=columns)
            #     self._taxons = self._df.taxon
            #     return
            raise NotImplementedError

        for chunk in reader:
            try:
                self._df
                self._df = pd.concat([self._df, chunk])
            except AttributeError:
                self._df = chunk

        self._df.columns = columns

        count = sum(self._df.taxon == 1)
        if count:
            logger.warning("Found %s taxons with root ID (1)" % count)

        # This gives the list of taxons as index and their amount
        # above, we selected only columns 0, 2, 3; the columns are still labelled
        # 0, 2, 3 in the df
        self._taxons = self._df.groupby("taxon").size()
        try:
            self._taxons.drop(0, inplace=True)
        except KeyError:
            pass  # 0 may not be there
        self._taxons.sort_values(ascending=False, inplace=True)

        category = self.df.groupby("status").size()

        if 'C' in category.index:
            self.classified = category['C']
        else:
            self.classified = 0

        if 'U' in category.index:
            self.unclassified = category['U']
        else:
            self.unclassified = 0
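
The chunked reading pattern above, demonstrated on an in-memory tab-separated table; note that modern pandas exposes the parser error as pandas.errors.ParserError::

    import io
    import pandas as pd

    text = "C\tr1\t562\t150\nU\tr2\t0\t151\nC\tr3\t1280\t149\n"
    reader = pd.read_csv(io.StringIO(text), sep="\t", header=None,
                         usecols=[0, 2, 3], chunksize=2)

    df = None
    for chunk in reader:
        df = chunk if df is None else pd.concat([df, chunk])

    df.columns = ["status", "taxon", "length"]
    print(df.groupby("status").size())   # counts of classified (C) / unclassified (U)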
Example #12
    def __init__(self, filename, force=False, **kwargs):
        """.. rubric:: constructor

        :param filename:
        :param force: even though the file format is not recognised,
            you can force the instanciation. Then, you can use your own
            filters.



        """
        vcf = VCFBase(filename, verbose=False, **kwargs)

        if vcf.version == "4.1":
            logger.info("Reading VCF v 4.1")
            self.vcf = VCF_mpileup_4dot1(filename, **kwargs)
        elif vcf.version == "4.2" and vcf.source.startswith("freeBayes"):
            logger.info("Reading VCF v 4.2 (freebayes)")
            from sequana.freebayes_vcf_filter import VCF_freebayes
            self.vcf = VCF_freebayes(filename, **kwargs)
        else:
            print(vcf.version)
            print(vcf.source)
            msg = """This VCF file is not recognised. So far we handle version
v4.1 with mpileup and v4.2 with freebayes. You may use the force option but not
all filters will be recognised"""
            if force is True:
                print("VCF version %s not tested" % vcf.version)
                self.vcf = vcf
            else:
                raise ValueError(msg)
Example #13
    def _add_db_in_config(self):
        """Add new annotation at the end of snpEff.config file."""
        logger.info("Updating configuration file")
        if not self._check_database(self.ref_name):
            with open("snpEff.config", "a") as fp:
                print(self.ref_name + ".genome : " + self.ref_name, file=fp)
Example #14
def splitter_mapped_unmapped(filename, prefix):
    logger.info("Creating 2 files (mapped and unmapped reads)")
    data = sniff(filename)

    count = 0
    flags = []
    match = 0
    unmatch = 0
    logger.info("Please wait while creating output files")
    with open("{}.unmapped.fastq".format(prefix), "w") as fnosirv:
        with open("{}.mapped.fastq".format(prefix), "w") as fsirv:
            for a in data:
                if a.flag & 256:
                    unmatch += 1
                elif a.flag & 4:
                    read = "@{}\n{}\n+\n{}\n".format(a.qname, a.query_sequence,
                                                     a.qual)
                    assert len(a.query_sequence) == len(a.qual)
                    fnosirv.write(read)
                    unmatch += 1
                else:
                    read = "@{}\n{}\n+\n{}\n".format(a.qname, a.query_sequence,
                                                     a.qual)
                    assert len(a.query_sequence) == len(a.qual)
                    fsirv.write(read)
                    match += 1
                flags.append(a.flag)
    return match, unmatch, flags
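
The two flag tests rely on standard SAM bit flags: 0x100 (256) marks secondary alignments and 0x4 (4) marks unmapped reads. A tiny sketch of the same dispatch on plain integers::

    def classify(flag):
        if flag & 256:
            return "secondary"       # skipped by the splitter above
        elif flag & 4:
            return "unmapped"
        return "mapped"

    for flag in (0, 4, 16, 256, 272):
        print(flag, classify(flag))  # 16 (reverse strand) is still "mapped"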
Example #15
    def df(self):
        # RG: ID read group ??
        # np: number of passes
        # rq ?
        # rs: list 6 numbers ?
        # za:
        # zm ID of the ZMW
        # sn: SNR how is this computed ?
        # zs
        # - sn: list of ACGT SNRs. A, C, G, T in that order
        if self._df is not None:
            return self._df

        logger.info("Scanning input file. Please wait")
        self.reset()
        N = 0

        all_results = []
        # This takes 60%  of the time...could use cython ?
        for read in self.data:
            tags = dict(read.tags) #11% of the time
            res = []
            # count reads
            N += 1
            if (N % 10000) == 0:
                logger.info("Read %d sequences" %N)

            # res[0] = read length
            res.append(read.query_length) # also stored in tags["qe"] - tags["qs"]

            # collections.counter is slow, let us do it ourself
            res.append( 100. / read.qlen * sum(
                [read.query_sequence.count(letter) if read.query_sequence
                    else 0 for letter in "CGcgSs"]))

            # res[1:4] contains SNR  stored in tags['sn'] in the order A, C, G, T
            try:
                snr = list(tags['sn'])
            except:
                snr = [None] * 4
            res = res + snr

            # res[6] = ZMW name, also stored in tags["zm"]
            res.append(int(tags['zm']))
            res.append(tags['np'])

            # aggregate results
            all_results.append(res)

        self._df = pd.DataFrame(all_results,
            columns=['read_length','GC_content','snr_A','snr_C','snr_G','snr_T','ZMW',
                     "nb_passes"])
        self._df.ZMW = self._df.ZMW.astype(int)

        if len(self._df.ZMW.unique()) != len(self._df):
            logger.warning("Found non unique ZMW. This may not be a CCS but "
                        "a subread file. Consider using PacbioSubreads class")

        self.reset()
        return self._df
Example #16
    def _get_data(self):
        # return list of lists
        # each list is made of 3 values: mapq, length, concordance
        from sequana import Cigar
        data = []
        self.reset()
        count = 0

        for align in self.data:
            mapq = align.mapq
            length = align.rlen
            if self.method in ["blasr", "minimap2"]:
                this = Cigar(align.cigarstring).stats()
                S, D, I, M = this[4], this[2], this[1], this[0]
                concordance = 1 - (D + I + S) / (D + I + M + S)
            else:
                this = align.get_cigar_stats()[0]
                error = this[-1]  # supposed to be I + D + X
                total = this[-1] + this[0]
                if total:
                    concordance = 1 - error / total
                else:
                    concordance = 0
            data.append([mapq, length, concordance])
            if count % 10000 == 0:
                logger.info("%s" % count)
            count += 1
        return data
Example #17
def test_analysis(krakendb):
    file1 = sequana_data("Hm2_GTGAAA_L005_R1_001.fastq.gz")
    file2 = sequana_data("Hm2_GTGAAA_L005_R2_001.fastq.gz")

    # Test that a database must be provided
    failed = False
    try:
        taxonomy.main([prog, '--file1', file1])
    except BaseException:
        failed = True
    assert failed, "running without a database should raise an error"

    from tempfile import TemporaryDirectory
    directory = TemporaryDirectory()

    # If on travis and we could not load the database, use the local one
    # that must have been downloaded
    try:
        df = taxonomy.main([
            prog, '--file1', file1, "--database", "toydb", "--file2", file2,
            "--level", "INFO", "--output-directory", directory.name,
            "--thread", "1"
        ])
    except:
        # For travis test
        HOME = os.getenv('HOME')
        database = os.sep.join([HOME, '.config', 'sequana', 'kraken_toydb'])
        df = taxonomy.main([
            prog, '--file1', file1, "--database", database, "--file2", file2,
            "--output-directory", directory.name, "--thread", "1"
        ])
    from sequana import logger
    logger.info(directory.name)
Example #18
    def download_accession_from_ncbi(self, accession):
        # accession can be a list, a single string, or a filename containing a
        # one-column list of accessions to retrieve
        if isinstance(accession, list):
            accessions = accession
        elif isinstance(accession, str):
            if os.path.exists(accession):
                with open(accession, "r") as fin:
                    accessions = fin.read().split()
            else:
                accessions = [accession]

        from easydev import Progress
        N = len(accessions)
        pb = Progress(N)
        logger.info("Fetching {} accession fasta files from NCBI".format(N))
        for i, accession in enumerate(accessions):
            data = self.eutils.EFetch("nucleotide",
                                      rettype="fasta",
                                      id=accession,
                                      retmode="text")
            if isinstance(data, int):
                logger.info(
                    "Could not fetch this accession: {}. continue".format(
                        accession))
                print("Could not fetch this accession: {}. continue".format(
                    accession))
            else:
                outname = "{}/library/{}.fa".format(self.dbname, accession)
                with open(outname, "wb") as fout:
                    fout.write(data)
            pb.animate(i + 1)
Example #19
    def create_taxonomy_file(self, filename="taxonomy.dat"):
        logger.info("Please wait while creating the output file. "
                    "This may take a few minutes")
        from easydev import Progress
        pb = Progress(len(self.df_nodes))
        count = 0
        df_names = self.df_names.query("key == 'scientific name'").copy()
        with open(filename, "w") as fout:

            for taxid in self.df_nodes.index:
                row = self.df_nodes.loc[taxid]
                fout.write("ID                        : {}\n".format(taxid))
                fout.write("PARENT ID                 : {}\n".format(
                    row.parent))
                fout.write("RANK                      : {}\n".format(
                    row['rank']))

                # Leftover from a previous version that handled several name
                # entries (scientific name, synonyms) per taxid:
                #     names = df_names.loc[taxid]
                #     for k, v in zip(names['key'], names['name']):
                #         if k.upper() in ['SCIENTIFIC NAME', 'SYNONYM']:
                #             fout.write("{:26s}: {}\n".format(k.upper(), v))
                fout.write("{:26s}: {}\n".format("SCIENTIFIC NAME",
                                                 df_names.loc[taxid, "name"]))
                fout.write("//\n")
                count += 1
                pb.animate(count)
Example #20
def main(args=None):
    if args is None:
        args = sys.argv[:]

    print("Welcome to sequana_vcf_filter")
    user_options = Options(prog="sequana_vcf_filter")

    if "--version" in args:
        import sequana
        print(sequana.version)
        sys.exit()
    elif len(args) == 1 or "--help" in args:
        user_options.parse_args(["prog", "--help"])
    elif len(args) == 2:

        class SimpleOpt():
            pass

        options = SimpleOpt()
        options.input_filename = args[1]
    else:
        options = user_options.parse_args(args[1:])

    # set the level
    logger.level = options.level

    vcf = VCF(options.input_filename)
    vcf.vcf.filter_dict['QUAL'] = options.quality

    vcf.vcf.apply_indel_filter = options.apply_indel_filter
    vcf.vcf.apply_dp4_filter = options.apply_dp4_filter
    vcf.vcf.apply_af1_filter = options.apply_af1_filter
    vcf.vcf.dp4_minimum_depth = options.minimum_depth
    vcf.vcf.dp4_minimum_depth_strand = options.minimum_depth_strand
    vcf.vcf.dp4_minimum_ratio = options.minimum_ratio
    vcf.vcf.minimum_af1 = options.minimum_af1
    vcf.vcf.filter_dict['INFO'] = {}
    vcf.vcf.filter_dict['QUAL'] = options.quality

    for this in options.filter:
        this = this[0]
        signs = [">", "<", ">=", "<="]
        for sign in signs:
            if sign in this:
                key, value = this.split(sign, 1)
                key = key.strip()
                value = sign.strip() + value.strip()
                vcf.vcf.filter_dict['INFO'][key] = value
                break

    logger.info(vcf.vcf.filter_dict)

    res = vcf.vcf.filter_vcf(options.output_filename,
                             output_filtered=options.output_filtered_filename)

    print()
    #print(res)
    return res
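
A standalone sketch of the filter-string parsing loop above; a string such as "DP4>=4" is split on ">" first, and the operator is glued back so the stored value keeps the full comparison::

    filters = ["DP>10", "AF1<0.05", "DP4>=4"]
    filter_dict = {"INFO": {}}

    for this in filters:
        for sign in [">", "<", ">=", "<="]:
            if sign in this:
                key, value = this.split(sign, 1)
                filter_dict["INFO"][key.strip()] = sign + value.strip()
                break

    print(filter_dict)   # {'INFO': {'DP': '>10', 'AF1': '<0.05', 'DP4': '>=4'}}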
Example #21
    def get_data(self, ontologies, include_negative_enrichment=True, fdr=0.05):

        if isinstance(ontologies, str):
            ontologies = [ontologies]
        else:
            assert isinstance(ontologies, list)
        # First, we select the required ontologies and build a common data set
        all_data = []
        for ontology in ontologies:
            data = self.enrichment[ontology]['result']
            if isinstance(data, dict):
                # there was only one hit; we expect a list, so wrap it
                data = [data]
            all_data.extend(data)
        data = all_data

        # remove unclassified GO terms
        unclassified = [
            x for x in data if x['term']['label'] == "UNCLASSIFIED"
        ]
        logger.info("Found {} unclassified".format(len(unclassified)))
        data = [x for x in data if x['term']['label'] != "UNCLASSIFIED"]

        df = pd.DataFrame(data)
        if len(df) == 0:
            return df
        else:
            logger.info("Found {} GO terms".format(len(df)))

        df = df.query("number_in_list!=0").copy()
        logger.info(
            "Found {} GO terms with at least 1 gene in reference".format(
                len(df)))

        # extract the ID and label
        df['id'] = [x['id'] for x in df['term']]
        df['label'] = [x['label'] for x in df['term']]

        # some extra information for convenience
        df["pct_diff_expr"] = df['number_in_list'] * 100 / df[
            'number_in_reference']
        df["log2_fold_enrichment"] = pylab.log2(df['fold_enrichment'])
        df["abs_log2_fold_enrichment"] = abs(pylab.log2(df['fold_enrichment']))

        # Some users may want to include GO terms with fold enrichment
        # significantly below 1, or not.
        if include_negative_enrichment is False:
            df = df.query("fold_enrichment>=1").copy()
            logger.info(
                "Found {} GO terms after keeping only positive enrichment".
                format(len(df)))

        # filter out FDR>0.05
        df = df.query("fdr<=@fdr").copy()
        logger.info("Found {} GO terms after keeping only FDR<{}".format(
            len(df), fdr))

        return df
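
A short sketch of the query calls used above, on a toy dataframe; the @fdr syntax refers to the local Python variable of the same name::

    import pandas as pd

    df = pd.DataFrame({"label": ["GO:A", "GO:B", "GO:C"],
                       "fold_enrichment": [2.5, 0.4, 1.8],
                       "fdr": [0.01, 0.2, 0.04],
                       "number_in_list": [5, 0, 3]})

    fdr = 0.05
    df = df.query("number_in_list != 0").copy()
    df = df.query("fold_enrichment >= 1").copy()   # drop negative enrichment
    df = df.query("fdr <= @fdr").copy()            # @fdr is the local variable above
    print(df["label"].tolist())                    # ['GO:A', 'GO:C']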
Example #22
    def get_taxonomy_biokit(self, ids):
        """Retrieve taxons given a list of taxons

        :param list ids: list of taxons as strings or integers. Could also
            be a single string or a single integer
        :return: a dataframe

        .. note:: the first call first loads all taxons in memory and takes a
            few seconds but subsequent calls are much faster
        """
        # filter the lineage to keep only information from one of the main ranks,
        # that is superkingdom, kingdom, phylum, class, order, family, genus and
        # species
        ranks = ('kingdom', 'phylum', 'class', 'order', 'family', 'genus',
                 'species')

        if isinstance(ids, int):
            ids = [ids]

        if len(ids) == 0:
            return pd.DataFrame()

        logger.info('Retrieving taxon using biokit.Taxonomy')

        if isinstance(ids, list) is False:
            ids = [ids]

        lineage = [self.tax.get_lineage_and_rank(x) for x in ids]
        # Now, we filter each lineage to keep only relevant ranks
        # We drop the 'no rank' and create a dictionary
        # Not nice but works for now
        results = []
        for i, this in enumerate(lineage):
            default = dict.fromkeys(ranks, ' ')
            for entry in this:
                if entry[1] in ranks:
                    default[entry[1]] = entry[0]
                elif entry[1] == "superkingdom":
                    default["kingdom"] = entry[0]
            # The scientific name is the last entry tagged as no_rank following
            # species. TODO: check this assumption.
            # e.g. 351680 and 151529 have the same 7 ranks, so to differentiate
            # them the scientific name should be used.
            # By default, we take the last one. If species or genus, we
            # repeat the term.
            try:
                default['name'] = this[-1][0]
            except IndexError:
                default['name'] = "root (ambiguous kingdom)"
            results.append(default)

        df = pd.DataFrame.from_records(results)
        df.index = ids
        df = df[list(ranks) + ['name']]
        df.index = df.index.astype(int)

        return df
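
A minimal sketch of the rank-filtering step, with a hand-written lineage in the (name, rank) format that get_lineage_and_rank is expected to return::

    ranks = ('kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species')

    # hypothetical lineage: list of (name, rank) pairs, 'no rank' entries included
    lineage = [("Bacteria", "superkingdom"), ("Proteobacteria", "phylum"),
               ("Gammaproteobacteria", "class"), ("Enterobacterales", "order"),
               ("Enterobacteriaceae", "family"), ("Escherichia", "genus"),
               ("Escherichia coli", "species"), ("E. coli K-12", "no rank")]

    default = dict.fromkeys(ranks, ' ')
    for name, rank in lineage:
        if rank in ranks:
            default[rank] = name
        elif rank == "superkingdom":
            default["kingdom"] = name      # superkingdom is reported as kingdom
    default['name'] = lineage[-1][0]       # scientific name: last entry

    print(default['kingdom'], default['species'], default['name'])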
Example #23
    def filter(self, identifiers_list=[], min_bp=None, max_bp=None,
        progressbar=True, output_filename='filtered.fastq'):
        """Save reads in a new file if there are not in the identifier_list

        :param int min_bp: ignore reads with length shorter than min_bp
        :param int max_bp: ignore reads with length above max_bp

        """
        # 7 seconds without identifiers to scan the file
        # on a 750000 reads

        if min_bp is None:
            min_bp = 0

        if max_bp is None:
            max_bp = 1e9

        # make sure we are at the beginning
        self.rewind()

        output_filename, tozip = self._istozip(output_filename)

        with open(output_filename, "w") as fout:
            pb = Progress(self.n_reads)
            buf = ""
            filtered = 0
            saved = 0 

            for count, lines in enumerate(grouper(self._fileobj)):
                identifier = lines[0].split()[0]
                if lines[0].split()[0].decode() in identifiers_list:
                    filtered += 1
                else:
                    N = len(lines[1])
                    if N <= max_bp and N >= min_bp:
                        buf += "{}{}+\n{}".format(
                            lines[0].decode("utf-8"),
                            lines[1].decode("utf-8"),
                            lines[3].decode("utf-8"))
                        saved += 1
                    else:
                        filtered += 1
                    if count % 100000 == 0:
                        fout.write(buf)
                        buf = ""
                if progressbar is True:
                    pb.animate(count+1)
            fout.write(buf)
            if filtered < len(identifiers_list):
                print("\nWARNING: not all identifiers were found in the fastq file to " +
                      "be filtered.")
        logger.info("\n{} reads were filtered out and {} saved in {}".format(
            filtered, saved, output_filename))

        if tozip is True: 
            logger.info("Compressing file")
            self._gzip(output_filename)
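
A simplified sketch of the grouping-and-filtering idea on an in-memory FASTQ (four lines per record), keeping only reads within the length bounds::

    import io

    fastq = io.StringIO("@r1\nACGTACGT\n+\nIIIIIIII\n@r2\nACG\n+\nIII\n")
    min_bp, max_bp = 5, 1000
    saved, filtered = 0, 0
    buf = ""

    while True:
        lines = [fastq.readline() for _ in range(4)]   # one FASTQ record
        if not lines[0]:
            break
        if min_bp <= len(lines[1].strip()) <= max_bp:
            buf += "".join(lines)
            saved += 1
        else:
            filtered += 1

    print(saved, filtered)   # 1 read kept, 1 read filtered out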
Example #24
    def _parse_data(self):
        taxonomy = {}

        logger.info("Reading kraken data")
        columns = ["status", "taxon", "length"]
        # we select only columns 0, 2, 3 to save memory, which is required on very
        # large files
        try:
            # each call to concat in the for loop below
            # will take time and increase with chunk position.
            # for 15M reads, this has a big cost. So chunksize set to 1M
            # is better than 1000 and still reasonable in memory
            reader = pd.read_csv(self.filename, sep="\t", header=None,
                                 usecols=[0, 2, 3], chunksize=1000000)
        except pd.errors.ParserError:
            # This happens with only_classified_output when no classified read
            # was found. Not implemented yet; the intended handling would be:
            #     self.unclassified = N  # size of the input data set
            #     self.classified = 0
            #     self._df = pd.DataFrame([], columns=columns)
            #     self._taxons = self._df.taxon
            #     return
            raise NotImplementedError

        for chunk in reader:
            try:
                self._df
                self._df = pd.concat([self._df, chunk])
            except AttributeError:
                self._df = chunk

        self._df.columns = columns

        count = sum(self._df.taxon == 1)
        if count:
            logger.warning("Found %s taxons with root ID (1)" % count)

        # This gives the list of taxons as index and their amount
        # above, we selected only columns 0, 2, 3; the columns are still labelled
        # 0, 2, 3 in the df
        self._taxons = self._df.groupby("taxon").size()
        try:
            self._taxons.drop(0, inplace=True)
        except KeyError:
            pass  # 0 may not be there
        self._taxons.sort_values(ascending=False, inplace=True)

        category = self.df.groupby("status").size()

        if 'C' in category.index:
            self.classified = category['C']
        else:
            self.classified = 0

        if 'U' in category.index:
            self.unclassified = category['U']
        else:
            self.unclassified = 0
Example #25
    def download_taxonomic_file(self, overwrite=False):
        """Loads entire flat file from EBI

        Do not overwrite the file by default.
        """
        import ftplib
        from sequana import sequana_config_path
        if os.path.exists(self.database) and overwrite is False:
            logger.info(
                "Found taxonomy.dat file in your sequana path {}".format(
                    sequana_config_path))
            return
        else:
            logger.info(
                "Downloading and extracting the taxonomy file from the web. Please be patient."
            )

        if self.source == "ena":
            url = 'ftp.ebi.ac.uk'
        else:
            url = 'ftp.ncbi.nlm.nih.gov'

        self.ftp = ftplib.FTP(url)
        self.ftp.login()
        if self.source == "ena":
            # for the EBI ftp only: self.ftp.cwd('databases')
            self.ftp.cwd('pub')
            self.ftp.cwd('databases')
            self.ftp.cwd('taxonomy')
            logger.warning(
                'Downloading and saving in %s. This is from ebi and may be behind the NCBI taxonomy'
                % self.database)
            self.ftp.retrbinary('RETR taxonomy.dat',
                                open(self.database, 'wb').write)
            self.ftp.close()
        else:
            self.ftp.cwd('pub')
            self.ftp.cwd('taxonomy')
            logger.warning('Downloading and saving in %s from ncbi ftp' %
                           self.database)
            import tempfile
            import shutil
            with tempfile.TemporaryDirectory() as tmpdir:
                filename = tmpdir + os.sep + "taxdump.tar.gz"
                self.ftp.retrbinary('RETR taxdump.tar.gz',
                                    open(filename, "wb").write)
                import tarfile
                tf = tarfile.open(filename)
                assert "nodes.dmp" in tf.getnames()
                assert "names.dmp" in tf.getnames()
                tf.extract("nodes.dmp", tmpdir)
                tf.extract("names.dmp", tmpdir)
                ncbi = NCBITaxonomy(tmpdir + os.sep + "names.dmp",
                                    tmpdir + os.sep + "nodes.dmp")
                ncbi.create_taxonomy_file(tmpdir + os.sep + "taxonomy.dat")
                shutil.move(tmpdir + os.sep + "taxonomy.dat", self.database)
            self.ftp.close()
Example #26
    def get_taxonomy_biokit(self, ids):
        """Retrieve taxons given a list of taxons

        :param list ids: list of taxons as strings or integers. Could also
            be a single string or a single integer
        :return: a dataframe

        .. note:: the first call first loads all taxons in memory and takes a
            few seconds but subsequent calls are much faster
        """
        # filter the lineage to keep only information from one of the main ranks,
        # that is superkingdom, kingdom, phylum, class, order, family, genus and
        # species
        ranks = ('kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species')

        if isinstance(ids, int):
            ids = [ids]

        if len(ids) == 0:
            return pd.DataFrame()

        logger.info('Retrieving taxon using biokit.Taxonomy')

        if isinstance(ids, list) is False:
            ids = [ids]

        lineage = [self.tax.get_lineage_and_rank(x) for x in ids]
        # Now, we filter each lineage to keep only relevant ranks
        # We drop the 'no rank' and create a dictionary
        # Not nice but works for now
        results = []
        for i, this in enumerate(lineage):
            default = dict.fromkeys(ranks, ' ')
            for entry in this:
                if entry[1] in ranks:
                    default[entry[1]] = entry[0]
                elif entry[1] == "superkingdom":
                    default["kingdom"] = entry[0]
            # The scientific name is the last entry tagged as no_rank following
            # species. TODO: check this assumption.
            # e.g. 351680 and 151529 have the same 7 ranks, so to differentiate
            # them the scientific name should be used.
            # By default, we take the last one. If species or genus, we
            # repeat the term.
            try:
                default['name'] = this[-1][0]
            except IndexError:
                default['name'] = "root (ambiguous kingdom)"
            results.append(default)

        df = pd.DataFrame.from_records(results)
        df.index = ids
        df = df[list(ranks) + ['name']]
        df.index = df.index.astype(int)

        return df
Example #27
def copy_config_from_sequana(module, source="config.yaml",
                             target="config.yaml"):
    # identify config name from the requested module
    user_config = module.path + os.sep + source
    if os.path.exists(user_config):
        shutil.copy(user_config, target)
        txt = "copied %s from sequana %s pipeline"
        logger.info(txt % (source, module.name))
    else:
        logger.warning(user_config + " not found")
Example #28
    def run(self, dbname="multiple", output_prefix="kraken_final"):
        """Run the hierachical analysis

        This method does not return anything but creates a set of files:

        - kraken_final.out
        - krona_final.html
        - kraken.png  (pie plot of the classified/unclassified reads)

        .. note:: the databases are run in the order provided in the constructor.
        """
        # list of all output to merge at the end
        self._list_kraken_output = []
        self._list_kraken_input = []

        # Iteration over the databases
        for iteration in range(len(self.databases)):
            status = self._run_one_analysis(iteration)
            last_unclassified = self._list_kraken_input[-1]
            stat = os.stat(last_unclassified)
            if stat.st_size == 0:
                break

        # concatenate all kraken output files
        file_output_final = self.output_directory + os.sep + "%s.out" % output_prefix
        with open(file_output_final, 'w') as outfile:
            for fname in self._list_kraken_output:
                with open(fname) as infile:
                    for line in infile:
                        outfile.write(line)

        # create html report
        logger.info("Analysing results")
        result = KrakenResults(file_output_final)

        # TODO: this looks similar to the code in KrakenPipeline. could be factorised
        result.to_js("%s%s%s.html" %
                     (self.output_directory, os.sep, output_prefix))
        result.plot(kind="pie")
        pylab.savefig(self.output_directory + os.sep + "kraken.png")
        prefix = self.output_directory + os.sep
        result.kraken_to_json(prefix + "kraken.json", dbname)
        result.kraken_to_csv(prefix + "kraken.csv", dbname)

        # remove kraken intermediate files (including unclassified files)
        if self.unclassified_output:
            # Just cp the last unclassified file
            import shutil
            shutil.copy2(self._list_kraken_input[-1], self.unclassified_output)

        if not self.keep_temp_files:
            for f_temp in self._list_kraken_output:
                os.remove(f_temp)
            for f_temp in self._list_kraken_input:
                os.remove(f_temp)
Example #29
    def _download_sequana_db1(self, verbose=True):
        dbname = "sequana_db1"
        from easydev import md5
        dir1 = sequana_config_path + os.sep + dbname
        dir2 = dir1 + os.sep + "taxonomy"
        self.dv.mkdir(dir1)
        self.dv.mkdir(dir2)

        logger.info("Downloading about 8Gb of data (if not already downloaded) from"
            " Synapse into %s" % dir1)

        from os.path import exists
        filename = dir1 + os.sep + "ena_list.txt"
        if exists(filename) and md5(filename) == "a9cc6268f3338d1632c4712a412593f2":
            pass
        else:
            self._download_from_synapse('syn6171700', dir1)

        # database.idx
        filename = dir1 + os.sep + "database.idx"
        if exists(filename) and md5(filename) == "2fa4a99a4f52f2f04c5a965adb1534ac":
            pass
        else:
            self._download_from_synapse('syn6171017', dir1)

        # database.kdb ; this one is large (8Gb)
        filename = dir1 + os.sep + "database.kdb"
        if exists(filename) and md5(filename) == "ff698696bfc88fe83bc201937cd9cbdf":
            pass
        else:
            self._download_from_synapse('syn6171107', dir1)

        # Then, the taxonomy directory (these files are downloaded into dir2)
        filename = dir2 + os.sep + "names.dmp"
        if exists(filename) and md5(filename) == "10bc7a63c579de02112d125a51fd65d0":
            pass
        else:
            self._download_from_synapse('syn6171286', dir2)

        filename = dir2 + os.sep + "nodes.dmp"
        if exists(filename) and md5(filename) == "a68af5a60434e2067c4a0a16df873980":
            pass
        else:
            self._download_from_synapse('syn6171289', dir2)

        filename = dir2 + os.sep + "taxons.txt"
        if exists(filename) and md5(filename) == "e78fbb43b3b41cbf4511d6af16c0287f":
            pass
        else:
            self._download_from_synapse('syn6171290', dir2)
        logger.info('done. You should have a kraken DB in %s' % dir1)

        # The annotations
        wget("https://github.com/sequana/data/raw/master/sequana_db1/annotations.csv",
            dir1 + os.sep + "annotations.csv")
Example #31
    def _get_qualities(self):
        from sequana import logger
        logger.info("Extracting qualities")
        qualities = []
        with FastqReader(self.filename) as f:
            for i, record in enumerate(f):
                if i < self.max_sample:
                    quality = [ord(x) - 33 for x in record.qualities]
                    qualities.append(quality)
                else:
                    break
        return qualities
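
The quality decoding assumes Phred+33 (Sanger) encoding, i.e. quality = ord(character) - 33::

    qualities = [ord(x) - 33 for x in "II?#5"]
    print(qualities)   # [40, 40, 30, 2, 20]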
Example #32
    def ena_id_to_gi_number(self, identifiers):

        # Now, let us convert the ENA accessions to NCBI GI numbers once and for all.
        # We can fetch at most 200 identifiers per request:
        logger.info("Fetching %s identifiers from NCBI" % len(identifiers))
        Nbaskets = int(math.ceil(len(identifiers) / 200.))
        results = {}
        from easydev import split_into_chunks
        for chunk in split_into_chunks(identifiers, Nbaskets):
            result = self.eutils.accession_to_info(",".join(chunk))
            results.update(result)
        return results
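
A sketch of the batching idea, splitting a list of identifiers into chunks of at most 200 without relying on easydev (the identifiers are made up)::

    import math

    identifiers = ["ID%04d" % i for i in range(450)]   # made-up accession list
    chunk_size = 200
    nchunks = int(math.ceil(len(identifiers) / chunk_size))

    chunks = [identifiers[i * chunk_size:(i + 1) * chunk_size] for i in range(nchunks)]
    print([len(c) for c in chunks])   # [200, 200, 50]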
Example #33
    def _download_assembly_report(self, category):
        assert category in self.category

        ftp = ftplib.FTP("ftp.ncbi.nlm.nih.gov")
        ftp.login("anonymous", "anonymous")
        ftp.cwd("genomes/refseq/{}".format(category))

        filename = "assembly_summary.txt"
        ftp.retrbinary(
            'RETR ' + filename,
            open(filename.replace(".txt", "_{}.txt".format(category)),
                 "wb").write)
        logger.info(filename)
Example #34
    def _download_minikraken(self, verbose=True):
        dv = DevTools()
        base = sequana_config_path + os.sep + ""
        taxondir = base + os.sep + "taxonomy"
        dv.mkdir(base)
        dv.mkdir(taxondir)

        logger.info("Downloading minikraken (4Gb)")

        filename = base + os.sep + "minikraken.tgz"
        if os.path.exists(filename) and md5(filename) == "30eab12118158d0b31718106785195e2":
            logger.warning("%s already present" % filename)
        else:
            wget("https://ccb.jhu.edu/software/kraken/dl/minikraken.tgz", filename)
Example #36
    def __init__(self, directory=".", prefix=""):
        self.prefix = prefix
        self.directory = directory
        self.sample_name = "undefined"

        # low quality isoforms
        filename = "all.polished_lq.fastq"
        self.lq_isoforms = self.get_file(filename)
        if self.lq_isoforms:
            logger.info("Reading {}".format(filename))
            self.lq_sequence = FastQ(self.lq_isoforms)

        # high quality isoforms
        filename = "all.polished_hq.fastq"
        self.hq_isoforms = self.get_file(filename)
        if self.hq_isoforms:
            logger.info("Reading {}".format(filename))
            self.hq_sequence = FastQ(self.hq_isoforms)

        # General info
        filename = "file.csv"
        self.csv = self.get_file(filename)
        if self.csv:
            logger.info("Reading {}".format(filename))
            self.data = pd.read_csv(self.csv)

        # CCS fasta sequence
        #self.ccs = self.get_file("-ccs.tar.gz")
        filename = "ccs.fasta"
        self.ccs = self.get_file(filename, noprefix=True)
        if self.ccs:
            logger.info("Reading {}".format(filename))
            self.ccs = FastA(self.ccs)
Example #37
    def _set_window(self, window):
        if (window > 0) and (window < 1):
            self._type_window = 'adapted to genome length: %.1f %% of total length' % (window * 100)
            self._window = int(round(self.__len__() * window))
        elif (window >= 1) and (window <= self.__len__()):
            self._type_window = 'fixed window length: %d' % window
            self._window = int(window)
        else:
            raise ValueError("Incorrect value for window: choose either a float in ]0,1] "
                             "(fraction of the genome) or an integer in [1, genome_length] "
                             "(window size)")

        logger.info("Computing GC skew")
        self._compute_skews()
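
A standalone sketch of the window interpretation: a float in ]0,1] is treated as a fraction of the genome length, an integer in [1, genome_length] as an absolute window size::

    def resolve_window(window, genome_length):
        if 0 < window < 1:
            return int(round(genome_length * window))   # fraction of the genome
        elif 1 <= window <= genome_length:
            return int(window)                          # absolute size in bases
        raise ValueError("window must be in ]0,1] or [1, genome_length]")

    print(resolve_window(0.1, 4_600_000))   # 460000
    print(resolve_window(1000, 4_600_000))  # 1000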
Example #38
    def export(self, filename='test.png'):
        if self.status is None:
            logger.error("Upload the tree first with upload() method")

        export = self.itol.get_itol_export()

        # Set the export format based on the filename extension
        for ext in ("png", "svg", "pdf", "eps"):
            if filename.endswith("." + ext):
                logger.info("Exporting in {} format".format(ext))
                export.params['format'] = ext
                break
        else:
            raise ValueError("filename must end in pdf, png, svg or eps")

        export.params.update(**self.params)

        export.export(filename)
Example #40
    def run(self, dbname="multiple", output_prefix="kraken_final"):
        """Run the hierachical analysis

        This method does not return anything but creates a set of files:

        - kraken_final.out
        - krona_final.html
        - kraken.png  (pie plot of the classified/unclassified reads)

        .. note:: the databases are run in the order provided in the constructor.
        """
        # list of all output to merge at the end
        self._list_kraken_output = []
        self._list_kraken_input = []

        # Iteration over the databases
        for iteration in range(len(self.databases)):
            self._run_one_analysis(iteration)

        # concatenate all kraken output files
        file_output_final = self.output_directory + os.sep + "%s.out" % output_prefix
        with open(file_output_final, 'w') as outfile:
            for fname in self._list_kraken_output:
                with open(fname) as infile:
                    for line in infile:
                        outfile.write(line)

        # create html report
        logger.info("Analysing results")
        result = KrakenResults(file_output_final)

        # TODO: this looks similar to the code in KrakenPipeline. could be factorised
        result.to_js("%s%s%s.html" % (self.output_directory, os.sep, output_prefix))
        result.plot(kind="pie")
        pylab.savefig(self.output_directory + os.sep + "kraken.png")
        prefix = self.output_directory + os.sep
        result.kraken_to_json(prefix + "kraken.json", dbname)
        result.kraken_to_csv(prefix + "kraken.csv", dbname)

        # remove kraken intermediate files (including unclassified files)
        if not self.keep_temp_files:
            for f_temp in self._list_kraken_output:
                os.remove(f_temp)
            for f_temp in self._list_kraken_input:
                os.remove(f_temp)
Example #41
    def _run_one_analysis(self, iteration):
        """ Run one analysis """
        db = self.databases[iteration]
        logger.info("Analysing data using database {}".format(db))

        # By default, the output contains only classified reads
        only_classified_output = True

        # a convenient alias
        _pathto = lambda x: self.output_directory + x

        # the output is saved in this file
        file_kraken_class = _pathto("kraken_%d.out" % iteration)
        output_filename_unclassified = _pathto("unclassified_%d.fastq" % iteration)
        file_fastq_unclass = _pathto("unclassified_%d.fastq" % iteration)

        if iteration == 0:
            inputs = self.inputs
        else:
            # previous results
            inputs = self._list_kraken_input[iteration-1]

        # if this is the last iteration (even if iteration is zero), save
        # classified and unclassified in the final kraken results.
        if iteration == len(self.databases) -1:
            only_classified_output = False

        analysis = KrakenAnalysis(inputs, db, self.threads)
        analysis.run(output_filename=file_kraken_class,
                     output_filename_unclassified=output_filename_unclassified,
                     only_classified_output=only_classified_output)

        self._list_kraken_input.append(file_fastq_unclass)
        self._list_kraken_output.append(file_kraken_class)

        if self.keep_temp_files:
            result = KrakenResults(file_kraken_class)
            result.to_js("%skrona_%d.html" %(self.output_directory, iteration))
Example #42
    def filter_mapq(self, output_filename, threshold_min=0,
        threshold_max=255):
        """Select and Write reads within a given range

        :param str output_filename: name of output file
        :param int threshold_min: minimum length of the reads to keep
        :param int threshold_max: maximum length of the reads to keep

        """
        assert threshold_min < threshold_max
        assert output_filename != self.filename, \
            "output filename should be different from the input filename"
        self.reset()
        count = 0
        with pysam.AlignmentFile(output_filename,  "wb", template=self.data) as fh:
            for read in self.data:
                if ((read.mapq < threshold_max) & (read.mapq > threshold_min)):
                    fh.write(read)
                else:
                    pass
                count += 1
                if count % 10000 == 0:
                    logger.info("%s sequences processed" % count)
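
A minimal pysam sketch of the same MAPQ-range filter, using hypothetical file names (input.bam, filtered.bam)::

    import pysam

    # Hypothetical file names; any BAM works the same way.
    with pysam.AlignmentFile("input.bam", "rb") as bam, \
            pysam.AlignmentFile("filtered.bam", "wb", template=bam) as out:
        for read in bam:
            # keep reads strictly inside the MAPQ range, as in filter_mapq above
            if 0 < read.mapping_quality < 255:
                out.write(read)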
Example #43
    def random_selection(self, output_filename, nreads=None,
            expected_coverage=None, reference_length=None, read_lengths=None):
        """Select random reads

        :param nreads: number of reads to select randomly. Must be less than
            the number of available reads in the original file.
        :param expected_coverage:
        :param reference_length:

        if expected_coverage and reference_length provided, nreads is replaced
        automatically.

        .. note:: to speed up computation (if you need to call random_selection
            many times), you can provide the mean read length manually
        """
        assert output_filename != self.filename, \
            "output filename should be different from the input filename"

        if read_lengths is None:
            self.reset()
            read_lengths = [read.query_length for i, read in enumerate(self.data)]

        N = len(read_lengths)

        if expected_coverage and reference_length:
            mu = pylab.mean(read_lengths)
            nreads = int(expected_coverage * reference_length / mu)

        assert nreads < N, "nreads parameter is larger than the actual number of reads"
        # use a set for fast membership tests in the loop below
        selector = set(random.sample(range(N), nreads))
        logger.info("Creating a pacbio BAM file with {} reads".format(nreads))

        with pysam.AlignmentFile(output_filename,"wb", template=self.data) as fh:
            self.reset()
            for i, read in enumerate(self.data):
                if i in selector:
                    fh.write(read)
Example #44
    def _download_kraken_toydb(self, verbose=True):
        """Download the kraken DB toy example from sequana_data into
        .config/sequana directory

        Checks the md5 checksums. About 32Mb of data
        """
        dv = DevTools()
        base = sequana_config_path + os.sep + "kraken_toydb"
        taxondir = base + os.sep + "taxonomy"
        dv.mkdir(base)
        dv.mkdir(taxondir)

        baseurl = "https://github.com/sequana/data/raw/master/"

        # download only if required
        logger.info("Downloading the database into %s" % base)

        md5sums = [
            "28661f8baf0514105b0c6957bec0fc6e",
            "97a39d44ed86cadea470352d6f69748d",
            "d91a0fcbbc0f4bbac918755b6400dea6",
            "c8bae69565af2170ece194925b5fdeb9"]
        filenames = [
            "database.idx",
            "database.kdb",
            "taxonomy/names.dmp",
            "taxonomy/nodes.dmp"]

        for filename, md5sum in zip(filenames, md5sums):
            url = baseurl + "kraken_toydb/%s" % filename
            filename = base + os.sep + filename
            if os.path.exists(filename) and md5(filename) == md5sum:
                logger.warning("%s already present" % filename)
            else:
                logger.info("Downloading %s" % url)
                wget(url, filename)
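The download step above only fetches a file when it is missing or when its MD5 checksum differs from the expected one. The md5 helper used above comes from sequana and is not shown here; a minimal standard-library sketch of an equivalent check could look like this:

    import hashlib
    import os

    def md5_matches(filename, expected_md5, chunk_size=65536):
        """Return True if filename exists and its MD5 digest equals expected_md5."""
        if not os.path.exists(filename):
            return False
        digest = hashlib.md5()
        with open(filename, "rb") as fin:
            for chunk in iter(lambda: fin.read(chunk_size), b""):
                digest.update(chunk)
        return digest.hexdigest() == expected_md5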
Exemplo n.º 45
0
    def _to_fastX(self, mode, output_filename, threads=2):
        """

        :param mode: fastq or fasta

        """
        # for now, we use samtools
        # can use bamtools as well but as long and output 10% larger (sequences
        # are split on 80-characters length)
        from snakemake import shell
        cmd = "samtools %s  -@ %s %s > %s" % (mode, threads,
            self.filename, output_filename)
        logger.info("Please be patient")
        logger.info("This may be long depending on your input data file: ")
        logger.info("typically, a minute per  500,000 reads")
        shell(cmd)
        logger.info("done")
Exemplo n.º 46
0
    def stats(self):
        results = {}
        if self.data is not None:
            logger.info("Reading strand")
            results['strand'] = {
                "+": sum(self.data.strand == "+"),
                "-": sum(self.data.strand == "-"),
                "?": sum(self.data.strand.isnull())
            }

            results['classification'] = {
                "total_ccs_reads" : len(self.data),
                "five_prime_reads" : int(self.data.fiveseen.sum()),
                "three_prime_reads" : int(self.data.threeseen.sum()),
                "chimera" : int(self.data.chimera.sum()),
                "polyA_reads" : int(self.data.polyAseen.sum()),
            }

        if self.lq_isoforms:
            logger.info("Reading LQ isoforms")
            results['lq_isoform'] = self.lq_sequence.stats() # number of polished LQ isoforms

        if self.hq_isoforms:
            logger.info("Reading HQ isoforms")
            results['hq_isoform'] = self.hq_sequence.stats() # number of polished HQ isoforms

        if self.ccs:
            seq = [ len(read.sequence) for read in self.ccs]
            results["CCS"] = {
                "mean_length" : pylab.mean(seq),
                "number_ccs_bases" : sum(seq),
                "number_ccs_reads" : len(seq)
            }

        self.idents_v = []
        self.full_v = []
        self.non_full_v = []
        self.isoform_lengths = []
        for read in self.lq_sequence:
            ident, full, non_full, length = read['identifier'].decode().split(";")
            self.idents_v.append(ident)
            self.full_v.append(int(full.split("=")[1]))
            self.non_full_v.append(int(non_full.split("=")[1]))
            self.isoform_lengths.append(int(length.split("=")[1]))

        return results
Exemplo n.º 47
0
    def get_df_concordance(self, max_align=-1):
        """This methods returns a dataframe with Insert, Deletion, Match,
        Substitution, read length, concordance (see below for a definition)


        Be aware that the SAM or BAM file must be created using minimap2 and the
        --cs option to store the CIGAR in a new CS format, which also contains
        the information about substitution. Other mapper are also handled (e.g.
        bwa) but the substitution are solely based on the NM tag if it exists.

        alignment that have no CS tag or CIGAR are ignored.


        """
        from sequana import Cigar
        count = 0
        I, D, M, L, mapq, flags, NM = [], [], [], [], [], [], []
        S = []
        for i, a in enumerate(self._data):
            # tags and cigar populated  if there is a match
            # if we use --cs cigar is not populated so we can only look at tags
            # tags can be an empty list
            if a.tags is None or len(a.tags) == 0:
                continue
            count += 1
            mapq.append(a.mapq)
            L.append(a.qlen)
            try:
                NM.append([x[1] for x in a.tags if x[0] == "NM"][0])
            except IndexError:  # no NM tag available
                NM.append(-1)

            flags.append(a.flag)

            if 'cs' in dict(a.tags):
                cs = CS(dict(a.tags)['cs'])
                S.append(cs['S'])
                I.append(cs['I'])
                D.append(cs['D'])
                M.append(cs['M'])
            elif a.cigarstring:
                cigar = Cigar(a.cigarstring).as_dict()
                I.append(cigar["I"])
                D.append(cigar['D'])
                M.append(cigar['M'])
                S.append(None)  # no info about substitutions in the cigar
            else:
                I.append(0)
                D.append(0)
                M.append(0)
                S.append(0)

            if max_align > 0 and count == max_align:
                break

            if count % 10000 == 0:
                logger.debug("Read {} alignments".format(count))

        I = np.array(I)
        D = np.array(D)
        M = np.array(M)
        NM = np.array(NM)

        try:
            S = np.array(S)
            C = 1 - (I + D + S) / (S + I + D + M)
            logger.info("computed concordance based on minimap2 --cs option")
        except TypeError:
            # S contains None values (no CS tag); estimate substitutions from the NM tag
            logger.info("computed concordance based on standard CIGAR information using INDEL and NM tag")
            computed_S = NM - D - I
            C = 1 - (I + D + computed_S) / (computed_S + I + D + M)

        df = pd.DataFrame([C, L, I, D, M, mapq, flags, NM, S])
        df = df.T
        df.columns = ["concordance", 'length', "I", "D", "M", "mapq", "flags", "NM", "mismatch"]
        return df
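The concordance computed above reduces to 1 minus the error fraction over the aligned bases, that is C = 1 - (I + D + S) / (I + D + S + M). A small sketch with made-up counts:

    def concordance(I, D, M, S):
        """Concordance as computed above: 1 - errors / (errors + matches)."""
        return 1 - (I + D + S) / (S + I + D + M)

    # hypothetical counts: 10 insertions, 5 deletions, 900 matches, 8 substitutions
    print(round(concordance(10, 5, 900, 8), 4))   # 0.9751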
Exemplo n.º 48
0
    def _get_df(self):
        # When scanning the BAM, we can extract the length, SNR of ACGT (still
        # need to know how to use it). The GC content (note there is no
        # ambiguity so no S character). The ZMW. Also, from the tags we could
        # get more

        # In each alignment, there is a lot of information to retrieve.
        # One could for instance introspect the tags.
        # - cx: subread local context flags
        # - ip: vector of length qlen from 0 to 250. This is the IPD (raw frames
        # or codec V1)
        # - np: number of passes (1 for subread, variable for CCS)
        # - pw: vector of length qlen from 0 to 128? This is the PulseWidth (raw
        # frames or codec V1)
        # - qs: 0-based start of query in the ZMW read (absent in CCS)
        # - qe: 0-based end of query in the ZMW read (absent in CCS)
        # - zm: position/ID of the ZMW
        # - sn: list of ACGT SNRs. A, C, G, T in that order
        # - rq: float encoding expected accuracy

        # - dq: DeletionQV
        # - dt: deletion Tag
        # - iq: insertionQV
        # - mq: mergeQV
        # - sq: substitutionQV
        # - st: substitution tag
        # - RG: ?

        # See http://pacbiofileformats.readthedocs.io/en/3.0/BAM.html
        if self._df is None:
            logger.info("Scanning input file. Please wait")
            self.reset()
            N = 0

            all_results = []
            # This takes 60%  of the time...could use cython ?
            for i, read in enumerate(self.data):
                tags = dict(read.tags)
                res = []
                # count reads
                N += 1
                if (N % 10000) == 0:
                    logger.info("Read %d sequences" %N)

                # res[0] = read length
                res.append(read.query_length) # also stored in tags["qe"] - tags["qs"]
                res.append(read.reference_length) # length of the aligned portion on the reference

                # collections.counter is slow, let us do it ourself
                if read.query_length and read.query_sequence:
                    res.append( 100. / read.query_length * sum(
                        [read.query_sequence.count(letter) for letter in "CGcgSs"]))
                else:
                    res.append(None)

                # res[3:7] contains the SNR stored in tags['sn'] in the order A, C, G, T
                try:
                    snr = list(tags['sn'])
                except KeyError:  # no SNR tag available
                    snr = [None] * 4
                res = res + snr

                # res[7] = ZMW name, also stored in tags["zm"]
                try:
                    res.append(int(read.qname.split('/')[1]))
                except (IndexError, ValueError):
                    # simulated data may not have the ZMW info, in which
                    # case we store just a unique ID
                    res.append(i)

                # aggregate results
                all_results.append(res)

                if self._sample and N >= self._sample:
                    break

            self._df = pd.DataFrame(all_results,
                columns=['read_length', "reference_length", 'GC_content',
                            'snr_A','snr_C','snr_G','snr_T','ZMW'])

            # populate the nb passes from the ZMW
            grouped = self._df.groupby("ZMW")
            agg = grouped.agg({"read_length": len})

            ZMW = self._df.ZMW.unique()
            aa = list(pylab.flatten([[agg.loc[this][0]] * agg.loc[this][0] for this in ZMW]))
            self._df['nb_passes'] = aa
            self._df['nb_passes'] -= 1 # nb passes starts at 0

            self.reset()
        return self._df
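The nb_passes column built at the end of the method counts, for each ZMW, how many subreads share that ZMW and subtracts one. An equivalent pandas computation on hypothetical data, shown only to clarify the intent:

    import pandas as pd

    # three subreads from ZMW 1, two from ZMW 2, one from ZMW 3
    df = pd.DataFrame({"ZMW": [1, 1, 1, 2, 2, 3],
                       "read_length": [100, 120, 90, 80, 85, 200]})
    df["nb_passes"] = df.groupby("ZMW")["ZMW"].transform("size") - 1
    print(df["nb_passes"].tolist())   # [2, 2, 2, 1, 1, 0]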
Exemplo n.º 49
0
def run_analysis(chrom, options, feature_dict):


    logger.info("Computing some metrics")
    if chrom.DOC < 8:
        logger.warning("The depth of coverage is below 8. sequana_coverage is"
                        " not optimised for such depth. You may want to "
                        " increase the threshold to avoid too many false detections")
    logger.info(chrom.__str__())

    if options.w_median > len(chrom.df) / 4:
        NW = int(len(chrom.df) / 4)
        if NW % 2 == 0:
            NW += 1
        logger.warning("median window length is too long. \n"
            "    Setting the window length automatically to a quarter of\n"
            "    the chromosome length ({})".format(NW))
        options.w_median = NW

    # compute the running median, zscore and ROIs for each chunk, summarizing
    # the results in a ChromosomeCovMultiChunk instance
    logger.info('Using running median (w=%s)' % options.w_median)
    logger.info("Number of mixture models %s " % options.k)
    results = chrom.run(options.w_median, options.k,
                        circular=options.circular, binning=options.binning,
                        cnv_delta=options.cnv_clustering)


    # Print some info related to the fitted mixture models
    try:
        mu = results.data[0][0].as_dict()['data']['fit_mu']
        sigma = results.data[0][0].as_dict()['data']['fit_sigma']
        pi = results.data[0][0].as_dict()['data']['fit_pi']
        logger.info("Fitted central distribution (first chunk): mu=%s, sigma=%s, pi=%s" %
              (round(mu,3), round(sigma,3), round(pi,3)))
    except:
        pass

    # some information about the ROIs found
    high = chrom.thresholds.high2
    low = chrom.thresholds.low2
    logger.info("Searching for ROIs (threshold=[{},{}] ; double =[{},{}])".format(
        chrom.thresholds.low, chrom.thresholds.high, low, high))
    ROIs = results.get_rois() # results is a ChromosomeCovMultiChunk instance
    logger.info("Number of ROIs found: {}".format(len(ROIs.df)))
    logger.info("    - below average: {}".format(len(ROIs.get_low_rois())))
    logger.info("    - above average: {}".format(len(ROIs.get_high_rois())))

    # Create directory and save ROIs
    directory = options.output_directory
    directory += os.sep + "coverage_reports"
    directory += os.sep + chrom.chrom_name
    mkdirs(directory)
    ROIs.df.to_csv("{}/rois.csv".format(directory))

    # save summary and metrics
    logger.info("Computing extra metrics")
    summary = results.get_summary()

    summary.to_json(directory + os.sep + "sequana_summary_coverage.json")
    logger.info("Evenness: {}".format(summary.data['evenness']))
    logger.info("Centralness (3 sigma): {}".format(summary.data['C3']))
    logger.info("Centralness (4 sigma): {}".format(summary.data['C4']))

    if options.skip_html:
        return

    logger.info("Creating report in %s. Please wait" % config.output_dir)
    if chrom._mode == "chunks":
        logger.warning(("This chromosome is large. " 
            "Plots in the HTML reports are skipped"))
    datatable = CoverageModule.init_roi_datatable(ROIs)
    ChromosomeCoverageModule(chrom, datatable,
                options={"W": options.w_median,
                         "k": options.k,
                         "ROIs": ROIs,
                         "circular": options.circular},
                command=" ".join(["sequana_coverage"] + sys.argv[1:]))
Exemplo n.º 50
0
def main(args=None):

    if args is None:
        args = sys.argv[:]

    user_options = Options(prog="sequana")

    # If --help or no options provided, show the help
    if len(args) == 1:
        user_options.parse_args(["prog", "--help"])
    else:
        options = user_options.parse_args(args[1:])

    logger.level = options.logging_level

    if options.download_reference:
        logger.info("Downloading reference %s from %s\n" %
            (options.download_reference, options.database))

        from bioservices.apps import download_fasta as df
        df.download_fasta(options.download_reference, method=options.database)
        if options.download_genbank is None:
            return

    if options.download_genbank:
        logger.info("Downloading genbank %s from %s\n" %
            (options.download_genbank, options.database))
        from sequana.snpeff import download_fasta_and_genbank
        download_fasta_and_genbank(options.download_genbank,
                                   options.download_genbank,
                                   genbank=True, fasta=False)
        return

    if options.genbank:
        assert os.path.exists(options.genbank), \
            "%s does not exists" % options.genbank

    logger.info("Reading %s. This may take time depending on "
        "your input file" % options.input)

    # Convert BAM to BED
    if options.input.endswith(".bam"):
        bedfile = options.input.replace(".bam", ".bed")
        logger.info("Converting BAM into BED file")
        shellcmd("bedtools genomecov -d -ibam %s > %s" % (options.input, bedfile))
    elif options.input.endswith(".bed"):
        bedfile = options.input
    else:
        raise ValueError("Input file must be a BAM or BED file")

    # Set the thresholds
    if options.low_threshold is None:
        options.low_threshold = -options.threshold

    if options.high_threshold is None:
        options.high_threshold = options.threshold

    # and output directory
    config.output_dir = options.output_directory
    config.sample_name = os.path.basename(options.input).split('.')[0]

    # Now we can create the instance of GenomeCoverage
    if options.chromosome == -1:
        chrom_list = []
    else: 
        chrom_list = [options.chromosome]
    gc = GenomeCov(bedfile, options.genbank, options.low_threshold,
                   options.high_threshold, options.double_threshold,
                   options.double_threshold, chunksize=options.chunksize,
                   chromosome_list=chrom_list)


    # if we have the reference, let us use it
    if options.reference:
        logger.info('Computing GC content')
        gc.compute_gc_content(options.reference, options.w_gc,
                              options.circular)

    # Now we scan the chromosomes,
    if len(gc.chrom_names) == 1:
        logger.warning("There is only one chromosome. Selected automatically.")
        run_analysis(gc.chr_list[0], options, gc.feature_dict)
    elif options.chromosome < -1 or options.chromosome > len(gc.chrom_names):
        msg = "invalid chromosome index; must be in [1;{}]".format(len(gc.chrom_names))
        logger.error(msg)
        sys.exit(1)
    else:
        if options.chromosome == -1:
            chromosomes = gc.chrom_names # take all chromosomes
        else:
            # For user, we start at position 1 but in python, we start at zero
            chromosomes = [gc.chrom_names[options.chromosome-1]]

        logger.info("There are %s chromosomes/contigs." % len(gc))
        for this in gc.chrom_names:
            data = (this, gc.positions[this]["start"], gc.positions[this]["end"])
            logger.info("    {} (starting pos: {}, ending pos: {})".format(*data))

        # here we read chromosome by chromosome to save memory
        # (not strictly required when the data set is small).
        for i, chrom in enumerate(chromosomes):
            logger.info("==================== analysing chrom/contig %s/%s (%s)"
                  % (i + 1, len(gc), gc.chrom_names[i]))
            # since we read just one contig/chromosome, the chr_list contains
            # only one contig, so we access it with index 0
            run_analysis(gc.chr_list[i], options, gc.feature_dict)

    if options.skip_multiqc is False:
        logger.info("=========================")
        logger.info("Creating multiqc report")
        pathtocfg = sequana_data("multiqc_config.yaml", "../multiqc/")
        cmd = 'multiqc . -m sequana_coverage -f -c {}'.format(pathtocfg)
        import subprocess
        proc = subprocess.Popen(cmd.split(), cwd=options.output_directory)
        proc.wait()
Exemplo n.º 51
0
    def _compute_skews(self):
        ### initialisation: compute GC skew and AT skew for the first window
        self._init_sliding_window()
        GC_content_slide, GC_skew_slide = self._init_list_results()
        AT_content_slide, AT_skew_slide = self._init_list_results()
        self._init_cumul_nuc()

        c = Counter(self._slide_window)
        dict_counts = {'G' : c['G'], 'C' : c['C'], 'A' : c['A'], 'T' : c['T']}
        i = 0

        # GC
        sumGC = float(dict_counts['G'] + dict_counts['C'])
        GC_content_slide[0][i] = sumGC
        if sumGC > 0:
            GC_skew_slide[0][i] = (dict_counts['G'] - dict_counts['C']) / sumGC
        # AT
        sumAT = float(dict_counts['A'] + dict_counts['T'])
        AT_content_slide[0][i] = sumAT
        if sumAT > 0:
            AT_skew_slide[0][i] = (dict_counts['A'] - dict_counts['T']) / sumAT

        ### Compute for all genome
        while(self._seq_right):
            out_nuc = self._slide_window.popleft()
            in_nuc = self._seq_right.popleft()
            self._slide_window.append(in_nuc)

            i += 1

            if i % 500000 == 0:
                logger.info("%d / %d" % (i, self.__len__()))
            # if in and out are the same: do nothing, append the same result
            if out_nuc != in_nuc:
                # remove the outgoing nucleotide from the counters and add the
                # incoming one
                if out_nuc in self._dict_nuc:
                    dict_counts[out_nuc] -= 1
                if in_nuc in self._dict_nuc:
                    dict_counts[in_nuc] += 1
                sumGC = float(dict_counts['G'] + dict_counts['C'])
                sumAT = float(dict_counts['A'] + dict_counts['T'])

            # fill results
            # GC
            GC_content_slide[0][i] = sumGC
            if sumGC > 0:
                GC_skew_slide[0][i] = (dict_counts['G'] - dict_counts['C'])/sumGC
            # AT
            AT_content_slide[0][i] = sumAT
            if sumAT > 0:
                AT_skew_slide[0][i] = (dict_counts['A'] - dict_counts['T'])/sumAT
            # cumul
            if in_nuc in self._dict_nuc:
                self._cumul[self._dict_nuc[in_nuc]][i + self._window - 1] += 1

        self._GC_content_slide = GC_content_slide/float(self._window)
        self._AT_content_slide = AT_content_slide/float(self._window)
        self._cumul = np.delete(self._cumul, range(self.__len__(), self._cumul.shape[1]), 1)
        self._cumul = np.cumsum(self._cumul, axis=1)

        ### save result for Z curve
        self._Xn = list((self._cumul[self._dict_nuc['A']] + self._cumul[self._dict_nuc['G']]) -\
         (self._cumul[self._dict_nuc['C']] + self._cumul[self._dict_nuc['T']]))

        self._Yn = list((self._cumul[self._dict_nuc['A']] + self._cumul[self._dict_nuc['C']]) -\
         (self._cumul[self._dict_nuc['G']] + self._cumul[self._dict_nuc['T']]))

        self._Zn = list((self._cumul[self._dict_nuc['A']] + self._cumul[self._dict_nuc['T']]) -\
         (self._cumul[self._dict_nuc['C']] + self._cumul[self._dict_nuc['G']]))

        self._AT_skew_slide = AT_skew_slide
        self._GC_skew_slide = GC_skew_slide

        ### check proportion of ignored nucleotides
        GC_content_total = (self._cumul[self._dict_nuc['G']][-1] +
            self._cumul[self._dict_nuc['C']][-1]) / float(self.__len__())
        AT_content_total = (self._cumul[self._dict_nuc['A']][-1] +
            self._cumul[self._dict_nuc['T']][-1]) / float(self.__len__())
        self._ignored_nuc = 1.0 - GC_content_total - AT_content_total
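The quantities maintained by the sliding window reduce to GC skew = (G - C) / (G + C) and AT skew = (A - T) / (A + T). A minimal per-window sketch, independent of the incremental bookkeeping above:

    from collections import Counter

    def window_skews(window):
        """GC and AT skew of a single window, as defined above."""
        c = Counter(window.upper())
        sumGC = c['G'] + c['C']
        sumAT = c['A'] + c['T']
        gc_skew = (c['G'] - c['C']) / sumGC if sumGC else 0.0
        at_skew = (c['A'] - c['T']) / sumAT if sumAT else 0.0
        return gc_skew, at_skew

    print(window_skews("GGGCATAT"))   # (0.5, 0.0)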
Exemplo n.º 52
0
def main(args=None):
    if args is None:
        args = sys.argv[:]


    print("Welcome to sequana_vcf_filter")
    user_options = Options(prog="sequana_vcf_filter")

    if "--version" in args:
        import sequana
        print(sequana.version)
        sys.exit()
    elif len(args) == 1 or "--help" in args:
        user_options.parse_args(["prog", "--help"])
    elif len(args) == 2:
        class SimpleOpt():
            pass
        options = SimpleOpt()
        options.input_filename = args[1]
    else:
        options = user_options.parse_args(args[1:])


    # set the level
    logger.level = options.level

    vcf = VCF(options.input_filename)
    vcf.vcf.filter_dict['QUAL'] =  options.quality

    vcf.vcf.apply_indel_filter = options.apply_indel_filter
    vcf.vcf.apply_dp4_filter = options.apply_dp4_filter
    vcf.vcf.apply_af1_filter = options.apply_af1_filter
    vcf.vcf.dp4_minimum_depth = options.minimum_depth
    vcf.vcf.dp4_minimum_depth_strand = options.minimum_depth_strand
    vcf.vcf.dp4_minimum_ratio = options.minimum_ratio
    vcf.vcf.minimum_af1 = options.minimum_af1
    vcf.vcf.filter_dict['INFO'] = {}
    vcf.vcf.filter_dict['QUAL'] = options.quality

    for this in options.filter:
        this = this[0]
        signs = [">", "<", ">=", "<="]
        for sign in signs:
            if sign in this:
                key, value = this.split(sign, 1)
                key = key.strip()
                value = sign.strip() + value.strip()
                vcf.vcf.filter_dict['INFO'][key] = value
                break


    logger.info(vcf.vcf.filter_dict)

    res = vcf.vcf.filter_vcf(options.output_filename,
                       output_filtered=options.output_filtered_filename)

    print()
    #print(res)
    return res
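For clarity, the filter-parsing loop above turns a string such as "DP>10" (the key and value here are made up) into an entry of filter_dict['INFO']. A trimmed-down illustration:

    this = "DP>10"
    filters = {}
    for sign in [">=", "<=", ">", "<"]:
        if sign in this:
            key, value = this.split(sign, 1)
            filters[key.strip()] = sign + value.strip()
            break
    print(filters)   # {'DP': '>10'}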