Example #1
    def __init__(self, filename):

        # .xls is not structured similarly to .narrowPeak
        if filename.endswith(".xls"):
            self.df = pd.read_csv(filename, comment="#", sep="\t")
            for x in [
                    'chr', 'start', 'end', 'abs_summit', '-log10(pvalue)',
                    'fold_enrichment', '-log10(qvalue)', 'name'
            ]:
                # a 'pileup' column may also be present depending on the
                # user's options
                assert x in self.df.columns
            self.df['stop'] = self.df['end']
        elif filename.endswith("narrowPeak"):
            self.df = pd.read_csv(filename, header=None, sep='\t')
            self.df.columns = [
                'chr', 'start', 'stop', 'name', 'score', 'NA',
                'fold_enrichment', '-log10(pvalue)', '-log10(qvalue)',
                'abs_summit_from_start'
            ]
            self.df['end'] = self.df['stop']
        elif filename.endswith("broadPeak"):
            self.df = pd.read_csv(filename, header=None, sep='\t')
            self.df.columns = [
                'chr', 'start', 'stop', 'name', 'score', 'NA',
                'fold_enrichment', '-log10(pvalue)', '-log10(qvalue)'
            ]
            self.df['end'] = self.df['stop']
        self.df['length'] = self.df['stop'] - self.df['start']
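
Usage note: whichever input format is given, the constructor above normalizes the columns so that both 'end' and 'stop' exist. A minimal usage sketch, assuming the class is named MACS2Peaks (the actual class name is not shown in this snippet) and a narrowPeak file is at hand:

peaks = MACS2Peaks("sample_peaks.narrowPeak")   # hypothetical class/file names
print(peaks.df[["chr", "start", "stop", "length"]].head())
print("median peak length:", peaks.df["length"].median())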
Example #2
    def __init__(self, names="names.dmp", nodes="nodes.dmp", verbose=True):
        if verbose:
            print("Reading %s" % names)
        self.df_name = pd.read_csv(names, sep='\t', header=None)
        self.df_name = self.df_name[[0, 2, 6]]
        self.df_name.columns = ["taxon", "name", "scname"]

        # This will provide a faster lookup table to search for scientific
        # names given a taxon. We can drop rows that are not scientific names
        # and set the taxons as index
        _subdf = self.df_name.query("'scientific name' in scname")
        self._subdf = _subdf.set_index("taxon")

        # This is for general purpose (slower if we were to use
        # this for the get_scientic_name method)
        self._group_name = self.df_name.groupby('taxon').groups

        if verbose:
            print("Reading %s" % nodes)
        self.df_nodes = pd.read_csv(nodes, sep='\t', header=None)
        self.df_nodes = self.df_nodes[[
            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24
        ]]
        self.df_nodes.columns = [
            'taxon', 'parent_taxon', 'rank', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9
        ]

        self._df_nodes_taxon = self.df_nodes.copy()
        self._df_nodes_taxon.set_index('taxon', inplace=True)
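
Lookup note: the taxon-indexed _subdf built above turns scientific-name queries into plain index lookups. A minimal sketch of that pattern, assuming `tax` is an instance of the class above and taxon 9606 appears in names.dmp:

scname = tax._subdf.loc[9606, "name"]          # scientific name for taxon 9606
rows = tax.df_name.loc[tax._group_name[9606]]  # all name rows for that taxon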
Example #3
File: mgi.py Project: sequana/sequana
    def __init__(self, filename):
        self.filename = filename
        self.df = pd.read_csv(self.filename, sep='\t', comment="#", header=None)

        self.metadata = {}
        with open(self.filename, "r") as fin:
            line = fin.readline()
            while line.startswith('#'):
                key, value = line.split(maxsplit=1)
                key = key[1:]  # skip the # character
                try:
                    self.metadata[key] = int(value.strip())
                except ValueError:
                    self.metadata[key] = value.strip()
                if key.endswith("%"):
                    try:
                        self.metadata[key] = float(value.strip())
                    except: #pragma: no cover
                        pass
                line = fin.readline()
        try:
            ncounts = self.metadata['N_Count']
            self.metadata['N_Count'] = int(ncounts.split('\t')[0])
            self.metadata['N_Count%'] = float(ncounts.split('\t')[1])
        except Exception as err:  # pragma: no cover
            print(err)
        self.df.columns = ['Pos'] + self.metadata['Pos'].split('\t')
        del self.metadata['Pos']
        self.df.set_index('Pos', inplace=True)
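
The header-parsing idiom above (collect '#key<TAB>value' lines into a dict, coercing numeric values where possible) can be shown self-contained. A minimal sketch with toy data:

import io

text = "#Len 100\n#GC% 42.5\n0\t25\t25\t25\t25\n"
metadata = {}
fin = io.StringIO(text)
line = fin.readline()
while line.startswith("#"):
    key, value = line.split(maxsplit=1)
    key = key[1:]          # skip the # character
    value = value.strip()
    try:
        metadata[key] = float(value) if key.endswith("%") else int(value)
    except ValueError:
        metadata[key] = value
    line = fin.readline()
print(metadata)  # {'Len': 100, 'GC%': 42.5}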
Example #4
    def __init__(self, data):
        """.. rubric:: constructor

        :param data: it can be a csv filename created by
        sequana.freebayes_vcf_filter or a
        :class:`freebayes_vcf_filter.Filtered_freebayes` object.
        """
        super().__init__()
        self.title = "Variant Calling Report"
        try:
            with open(data, "r") as fp:
                self.filename = data
                line = fp.readline()
                if line.startswith("# sequana_variant_calling"):
                    string_dict = line.split(";")[-1].strip()
                    try:
                        self.filter_dict = ast.literal_eval(string_dict)
                    except SyntaxError:
                        self.filter_dict = None
                    self.df = pd.read_csv(fp)
        except FileNotFoundError:
            msg = ("The csv file is not present. Please check that your"
                   " file exists.")
            raise FileNotFoundError(msg)
        except TypeError:
            self.df = data.df
            self.filter_dict = data.vcf.filters_params
        self.create_report_content()
        self.create_html("variant_calling.html")
Example #5
File: kraken.py Project: brwnj/sequana
    def _parse_data(self):
        taxonomy = {}

        logger.info("Reading kraken data")
        columns = ["status", "taxon", "length"]
        # we select only columns 0, 2, 3 to save memory, which is required
        # on very large files
        try:
            # each call to concat in the for loop below
            # will take time and increase with chunk position.
            # for 15M reads, this has a big cost. So chunksize set to 1M
            # is better than 1000 and still reasonable in memory
            reader = pd.read_csv(self.filename,
                                 sep="\t",
                                 header=None,
                                 usecols=[0, 2, 3],
                                 chunksize=1000000)
        except pd.errors.ParserError:  # formerly pd.parser.CParserError
            # This happens with only_classified_output when no classified
            # read is found. Not handled yet, so everything below the raise
            # is unreachable and kept only to document the intent.
            raise NotImplementedError
            # self.unclassified = N  # N would be the input data set size
            # self.classified = 0
            # self._df = pd.DataFrame([], columns=columns)
            # self._taxons = self._df.taxon
            # return

        for chunk in reader:
            try:
                self._df
                self._df = pd.concat([self._df, chunk])
            except AttributeError:
                self._df = chunk

        self._df.columns = columns

        count = sum(self._df.taxon == 1)
        if count:
            logger.warning("Found %s taxons with root ID (1)" % count)

        # This gives the list of taxons as index and their counts.
        # Above, we selected only columns 0, 2, 3; the columns are still
        # labelled 0, 2, 3 in the df
        self._taxons = self._df.groupby("taxon").size()
        try:
            self._taxons.drop(0, inplace=True)
        except KeyError:
            pass  # taxon 0 may not be there
        self._taxons.sort_values(ascending=False, inplace=True)

        category = self.df.groupby("status").size()

        if 'C' in category.index:
            self.classified = category['C']
        else:
            self.classified = 0

        if 'U' in category.index:
            self.unclassified = category['U']
        else:
            self.unclassified = 0
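
As the comments above note, calling pd.concat inside the loop gets slower as the accumulated frame grows. A minimal sketch of the usual alternative, collecting chunks in a list and concatenating once; "kraken.out" is a hypothetical path:

import pandas as pd

chunks = []
for chunk in pd.read_csv("kraken.out", sep="\t", header=None,
                         usecols=[0, 2, 3], chunksize=1_000_000):
    chunks.append(chunk)
df = pd.concat(chunks, ignore_index=True)   # single concatenation at the end
df.columns = ["status", "taxon", "length"]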
Example #6
File: isoseq.py Project: sequana/sequana
    def __init__(self, directory=".", prefix=""):
        self.prefix = prefix
        self.directory = directory
        self.sample_name = "undefined"

        # low quality isoforms
        filename = "all.polished_lq.fastq"
        self.lq_isoforms = self.get_file(filename)
        if self.lq_isoforms:
            logger.info("Reading {}".format(filename))
            self.lq_sequence = FastQ(self.lq_isoforms)

        # high quality isoforms
        filename = "all.polished_hq.fastq"
        self.hq_isoforms = self.get_file(filename)
        if self.hq_isoforms:
            logger.info("Reading {}".format(filename))
            self.hq_sequence = FastQ(self.hq_isoforms)

        # General info
        filename = "file.csv"
        self.csv = self.get_file(filename)
        if self.csv:
            logger.info("Reading {}".format(filename))
            self.data = pd.read_csv(self.csv)

        # CCS fasta sequence
        #self.ccs = self.get_file("-ccs.tar.gz")
        filename = "ccs.fasta"
        self.ccs = self.get_file(filename, noprefix=True)
        if self.ccs:
            logger.info("Reading {}".format(filename))
            self.ccs = FastA(self.ccs)
Example #7
File: rnadiff.py Project: sequana/sequana
    def check_and_save_input_tables(self, sep_counts, sep_design):
        try:
            self.outdir.mkdir()
        except FileExistsError:
            pass

        counts = pd.read_csv(self.usr_counts,
                             sep=sep_counts,
                             index_col="Geneid",
                             comment="#").sort_index(axis=1)

        design = RNADesign(self.usr_design, sep=sep_design)
        design = design.df.set_index("label").sort_index()

        if list(counts.columns) != list(design.index):
            logger.error(
                "Counts columns and design rows do not match (after sorting)."
            )
            logger.error(counts.columns)
            logger.error(design.index)
            sys.exit(1)

        counts.to_csv(self.counts_filename)
        design.to_csv(self.design_filename)

        return counts, design
Example #9
    def get_df(self):
        import pandas as pd
        data = {}
        for sample, filename in zip(self.sample_names, self.filenames):
            df = pd.read_csv(filename)
            df = df.groupby("kingdom")['percentage'].sum()
            # if a taxon is obsolete, the kingdom is empty.
            # We will set the kingdom as Unclassified and raise a warning
            # if the count is > 5%
            if " " in df.index:
                percent = df.loc[" "]
                if percent > 5:
                    logger.warning(
                        "Found {}% of taxons in obsolete category".format(
                            percent))
                if "Unclassified" in df.index:
                    df.loc['Unclassified'] += df.loc[' ']
                    df.drop(" ", inplace=True)
                else:
                    df.loc['Unclassified'] = df.loc[' ']
                    df.drop(" ", inplace=True)
            data[sample] = df

        df = pd.DataFrame(data)
        #df.to_json(output.data)
        df = df.sort_index(ascending=False)
        return df
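
The merge of the blank (obsolete) kingdom into "Unclassified" can be isolated. A minimal sketch with toy counts:

import pandas as pd

s = pd.Series({"Bacteria": 80.0, " ": 3.0, "Unclassified": 2.0})
if " " in s.index:
    s.loc["Unclassified"] = s.get("Unclassified", 0) + s.loc[" "]
    s = s.drop(" ")
print(s)   # the 3% filed under " " now counts as Unclassified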
Example #10
 def __init__(self, filename):
     self.df = pd.read_csv(filename, sep='\t', header=None)
     self.df.columns = [
         'filename', 'num_reads', 'estimated_fragment_length', 'corr',
         'phantom_peak', 'corr_phantom_peak', 'argmin_corr', 'min_corr',
         'NSC', 'RSC', 'quality_tag'
     ]
Example #11
    def read_align(self, readfile):

        self.df = pd.read_csv(readfile, sep='\t', header=None)
        self.df.columns = ['ref', 'start', 'end', 'dummy', 'quality', 'strand']
        from pylab import median
        self.read_length = round(median(self.df['end'] - self.df['start']))
        self.chromosomes = self.df['ref'].unique()
Example #12
File: assembly.py Project: sequana/sequana
    def __init__(self, filename="full_table_testbusco.tsv"):
        """.. rubric:: constructor

        :param filename: a valid BUSCO input file (full table). See example in sequana
            code source (testing)

        """
        self.df = pd.read_csv(filename, sep="\t", skiprows=4)
Example #14
File: kraken.py Project: ranjit58/sequana
 class Taxonomy(object):
     from sequana import sequana_data # must be local
     df = pd.read_csv(sequana_data("test_taxon_rtd.csv"),
             index_col=0)
     def get_lineage_and_rank(self, x):
         # Note that we add the name as well here
         ranks = ['kingdom', 'phylum', 'class', 'order',
                 'family', 'genus', 'species', 'name']
         return [(self.df.loc[x, rank], rank) for rank in ranks]  # .loc replaces the deprecated .ix
Example #16
 def _read_csv(self, input_filename):
     """ Read csv generated by :class:'GenomeCov' and create
     :class:'ChromosomeCov' list.
     """
     # set regex to get important information about previous analysis
     re_threshold = re.compile(r"thresholds:([\d,\.-]+)")
     re_window_size = re.compile(r"\swindow_size:(\d+)")
     re_circular = re.compile(r"circular:(\w+)")
     re_gc_window_size = re.compile(r"gc_window_size:(\d+)")
     re_genbank = re.compile(r"genbank:([\{0}\w\.\-]+)".format(os.sep))
     re_chrom = re.compile(r"^# ([\w\-\.]+):")
     re_gaussian = re.compile(r"(\[\{.+\}\])")
     with open(input_filename, "r") as fp:
         line = fp.readline()
         # check if file was generated by sequana_coverage
         if not line.startswith("# sequana_coverage"):
             return None
         # get thresholds
         thresholds = re_threshold.findall(line)[0]
         thresholds = [float(f) for f in thresholds.split(',')]
         self.thresholds = DoubleThresholds(*thresholds)
         # get window size
         self.window_size = int(re_window_size.search(line).group(1))
         # get circular
         circular = re_circular.search(line).group(1)
         self.circular = False if circular == "False" else True
         # get gc_window_size
         gc = re_gc_window_size.search(line)
         if gc:
             self.gc_window_size = int(gc.group(1))
         # get genbank
         gb = re_genbank.search(line)
         if gb and not self.genbank_filename:
             self.genbank_filename = gb.group(1)
         # get gaussians for each chromosome
         gaussians_dict = dict()
         for line in fp:
             chrom = re_chrom.search(line)
             if chrom:
                 gaussians = re_gaussian.search(line)
                 gaussians = ast.literal_eval(gaussians.group(1))
                 gaussians_dict[chrom.group(1)] = gaussians
             else:
                 break
         df = pd.read_csv(fp, header=None, names=line.strip().split(","))
     chr_list = self._set_chr_list(df)
     # Add gaussians and range information
     for chrom in chr_list:
         chrom.set_gaussians(gaussians_dict[chrom.chrom_name])
         if self.circular:
             chrom.range = [None, None]
         else:
             mid = int(self.window_size/2)
             chrom.range = [mid, -mid]
         chrom.mixture_fitting = mixture.EM(
             chrom.df['scale'][chrom.range[0]:chrom.range[1]])
     return chr_list
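
The key idiom above is consuming the header lines manually and then handing the still-open handle to pandas, which parses from the current file position. A minimal sketch with a toy buffer:

import io
import pandas as pd

buf = io.StringIO("# sequana_coverage v1\npos,cov\n1,10\n2,12\n")
line = buf.readline()                       # consume the '#' header line
assert line.startswith("# sequana_coverage")
df = pd.read_csv(buf)                       # parses the remaining lines only
print(df)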
Example #17
 def read(self, filename):
     df = pd.read_csv(filename, sep='\t', header=None)
     df.columns = [
         'filename', 'num_reads', 'estimated_fragment_length', 'corr',
         'phantom_peak', 'corr_phantom_peak', 'argmin_corr', 'min_corr',
         'NSC', 'RSC', 'quality_tag'
     ]
     self.df = pd.concat([self.df, df])  # DataFrame.append is deprecated
     self.df.reset_index(inplace=True, drop=True)
Example #18
    def scanner(self):
        data = {}
        # shlex removes blank lines and splits on newlines;
        # strip is applied as well
        rawdata = shlex.split(open(self.filename, "r"))
        for line in rawdata:
            # sometimes, IEM will store the ;;; at the end
            # so we can get [HEADER];;;;;;;;;;;
            if line.startswith('[') and "]" in line:
                line = line.strip(";").strip(",").strip()
                currentkey = line.replace("[", "").replace("]", "")
                data[currentkey] = []
            else:
                data[currentkey].append(line)

        for key in data.keys():
            data[key] = "\n".join(data[key])

        for this in ["Header", "Reads", "Settings", "Data"]:
            if this not in data.keys():
                logger.warning("%s not found in the DesignExpMiSeq file" %
                               this)

        self.data = data
        self.df = pd.read_csv(io.StringIO(data["Data"]))

        ncols = [8, 9, 10, 12]
        if self.df.shape[1] not in ncols:
            self.df = pd.read_csv(io.StringIO(data["Data"]), sep=";")
            if self.df.shape[1] not in ncols:
                logger.warning(
                    "Data section must have 8, 9, 10 or 12 columns. Check the samplesheet"
                )

        # Fixes https://github.com/sequana/sequana/issues/507
        self.df["Sample_ID"] = self.df["Sample_ID"].astype(str)

        self.df.rename(columns={
            "I7_Index_ID": "Index1_ID",
            "index": "Index1_Seq",
            "I5_Index_ID": "Index2_ID",
            "index2": "Index2_Seq"
        },
                       inplace=True)
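
The [Data] section handling boils down to re-parsing the joined text through io.StringIO and renaming the index columns. A minimal sketch with a toy samplesheet section:

import io
import pandas as pd

data_section = "Sample_ID,index,index2\nS1,ACGT,TTGA\nS2,GGCA,CCAT"
df = pd.read_csv(io.StringIO(data_section))
df["Sample_ID"] = df["Sample_ID"].astype(str)   # keep IDs as strings
df = df.rename(columns={"index": "Index1_Seq", "index2": "Index2_Seq"})
print(df)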
Example #19
File: isoseq.py Project: sequana/sequana
    def read(self, filename, tag):
        if filename.endswith(".csv"):
            data = pd.read_csv(filename)
            data = data.query("reference_name not in [-1, '-1']")
        else:
            sam = PacbioIsoSeqMappedIsoforms(filename)
            mapped, data = sam.plot_sirv_by_group(tag)

        self.rawdata.append(data)
        self.labels.append(tag)
Example #20
File: rnadiff.py Project: sequana/sequana
    def __init__(self, filename, sep=r"\s*,\s*", reference=None):
        self.filename = filename
        # the \s* parts strip surrounding white spaces
        self.df = pd.read_csv(filename,
                              sep=sep,
                              engine="python",
                              comment="#",
                              dtype={"label": str})

        self.reference = reference
Example #22
    def hist_read_length(self, bins=100, fontsize=16):
        """Histogram of the read lengths stored in the gkpStore reads table."""
        filename = self.getfile("correction/*gkpStore/reads.txt")

        df = pd.read_csv(filename, header=None, sep="\t")
        df.columns = ["ID", 1, "read_length", 3, 4]
        df["read_length"].hist(bins=bins)
        pylab.xlabel("Read length", fontsize=fontsize)
        pylab.ylabel("Number of reads", fontsize=fontsize)
        pylab.xlim([0, pylab.xlim()[1]])
        return df
Example #23
 def hist_read_length2(self, fontsize=16):
     df = pd.read_csv(
         self.getfile("correction/2-correction/*.original-expected-corrected-length.dat"),
         sep="\t",
         header=None)
     pylab.clf()
     df[1].hist(bins=100, alpha=0.5, density=True, label="original")
     df[2].hist(bins=100, alpha=0.5, density=True, label="expected")
     df[3].hist(bins=100, alpha=0.5, density=True, label="corrected")
     pylab.legend()
     pylab.xlabel("read length", fontsize=fontsize)
     pylab.ylabel("number of reads ", fontsize=fontsize)
     return df
Example #24
    def __init__(self, filename, sep="\t", **kwargs):
        self.df = pd.read_csv(filename, sep=sep, **kwargs)
        # If there is a header, strip it of surrounding spaces
        try:
            self.df.columns = [x.strip() for x in self.df.columns]
        except AttributeError:  # non-string column names
            pass

        # let us strip strings from spaces
        for x in self.df.columns:
            try:
                self.df[x] = self.df[x].str.strip()
            except AttributeError:  # non-string column
                pass
Example #25
    def _get_table(self, pattern):
        """Extract the complete (all comparisons) table from a RNADiff
        analysis, or the normCounts table, depending on the pattern
        specified.
        """
        if pattern:
            table_files = [f for f in self._table_folder.glob(pattern)]
            if len(table_files) == 0:
                raise ValueError("Found no file for your pattern {}".format(pattern))
            elif len(table_files) != 1:
                print(table_files)
                raise ValueError("Found more than 1 file with the pattern {}".format(pattern))
            return pd.read_csv(table_files[0], sep=self.SEP, index_col=0)
        else:
            table_files = [f for f in self._table_folder.glob("*.xls")]
            table = [f for f in table_files if re.match(pattern, str(f))]

            if len(table) == 1 and table[0].is_file():
                return pd.read_csv(table[0], sep=self.SEP, index_col=0)
            else:
                raise IOError(
                    f"Cannot find a single proper table with pattern: {pattern} from RNADiff: {table}"
                )
Example #26
    def __init__(self, filename):

        self.filename = filename

        self.df = pd.read_csv(filename, sep="\t", header=None)
        self.df.columns = [
            "type", "label", "size", "3", "4", "5", "6", "7", "8"
        ]
        name = self.df['label'].apply(lambda x: x.rsplit(":", 1)[0])
        start = self.df['label'].apply(
            lambda x: x.rsplit(":", 1)[1].split("-")[0])
        end = self.df['label'].apply(
            lambda x: x.rsplit(":", 1)[1].split("-")[1])
        self.df['start'] = start.astype(int)
        self.df['end'] = end.astype(int)
        self.df['name'] = name
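
The label parsing above relies on rsplit so that a ':' inside the contig name does not break the split. A minimal standalone sketch:

label = "chr1:100-200"
name, coords = label.rsplit(":", 1)          # split on the last ':'
start, end = (int(x) for x in coords.split("-"))
print(name, start, end)                      # chr1 100 200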
Example #27
File: kraken.py Project: brwnj/sequana
    def _get_df_with_taxon(self, dbname):

        # line 14500
        # >gi|331784|gb|K01711.1|MEANPCG[331784] Measles virus (strain Edmonston), complete genome

        df = self.get_taxonomy_biokit([int(x) for x in self.taxons.index])
        df['count'] = self.taxons.values
        df.reset_index(inplace=True)
        newrow = len(df)
        df.loc[newrow] = "Unclassified"  # .loc replaces the deprecated .ix
        df.loc[newrow, 'count'] = self.unclassified
        df.loc[newrow, 'index'] = -1
        df.rename(columns={"index": "taxon"}, inplace=True)
        df["percentage"] = df["count"] / df["count"].sum() * 100

        # Now get back all annotations from the database itself.
        filename = dbname + os.sep + "annotations.csv"
        if os.path.exists(filename):
            annotations = pd.read_csv(filename)
            annotations.set_index("taxon", inplace=True)

            # reindex tolerates missing taxa (NaN rows), like the old .ix did
            df2 = annotations.reindex(df.taxon)[['ena', 'gi', 'description']]
            # There are duplicates somehow; let us keep the first one for now
            df2 = df2.reset_index().drop_duplicates(
                subset="taxon", keep="first").set_index("taxon")
            self.df2 = df2
            self.df1 = df.set_index("taxon")
            df = pd.merge(self.df1, df2, left_index=True, right_index=True)
            df.reset_index(inplace=True)
            starter = ['percentage', 'name', 'ena', 'taxon', "gi", 'count']
            df = df[starter + [
                x
                for x in df.columns if x not in starter and x != "description"
            ] + ["description"]]

            df['gi'] = [int(x) for x in df['gi'].fillna(-1)]
            from easydev import precision
            df['percentage'] = [str(precision(x, 2)) for x in df['percentage']]
        else:
            starter = ['taxon', 'count', 'percentage']
            df = df[starter + [x for x in df.columns if x not in starter]]

        df.sort_values(by="percentage", inplace=True, ascending=False)
        return df
Example #28
    def plot_kmer(self, bins=100):
        filename = self.getfile("correction/0-mercounts/*.ms16.histogram")

        df = pd.read_csv(filename, header=None, sep="\t")
        df.columns = ["kmer", "count", "X", "Y"]

        # Save some data
        self.data['correction']['largest mercount'] = list(df['kmer'])[-1]
        self.data['correction']['unique mers'] = df['count'][0]
        self.data['correction']['distinct mers'] = df['count'].sum()
        # the key name 'total mers' is assumed; the original used an empty key
        self.data['correction']['total mers'] = sum(df.kmer * df['count'])

        # X is just df['count'].cumsum() / df['count'].sum() (distinct kmers)
        # Y is (df['kmer']*df['count']).cumsum() / (df['kmer']*df['count']).sum(),
        # that is, total kmers
        pylab.plot(df.X, df.Y, label="distinct vs total")
        pylab.grid()
        pylab.legend()
        return df
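
The X/Y relationship described in the comments can be reproduced directly. A minimal sketch with toy kmer counts:

import pandas as pd

df = pd.DataFrame({"kmer": [1, 2, 3], "count": [100, 10, 1]})
X = df["count"].cumsum() / df["count"].sum()                            # distinct fraction
Y = (df["kmer"] * df["count"]).cumsum() / (df["kmer"] * df["count"]).sum()  # total fraction
print(list(zip(X, Y)))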
Example #29
File: rnadiff.py Project: sequana/sequana
    def __init__(self,
                 path,
                 alpha=0.05,
                 log2_fc=0,
                 sep=",",
                 condition="condition"):
        """ A representation of the results of a single rnadiff comparison """
        self.path = Path(path)
        self.name = self.path.stem.replace("_degs_DESeq2",
                                           "").replace("-", "_")

        self._alpha = alpha
        self._log2_fc = log2_fc

        self.df = pd.read_csv(self.path, index_col=0, sep=sep)
        self.condition = condition

        self.filt_df = self.filter()
        self.set_gene_lists()
Example #31
    def __init__(self, filename, alpha=0.05, log2_fc=0, pattern="*complete*xls",
        sep="\t"):
        """.. rubric:: constructor

        :param rnadiff_results: can be a folder for a simple comparison, or an
            output file containing the rseults of a rnadiff analysis
        :param alpha:
        :param out_dir:
        :param log2_fc: the log2 fold change to apply

        """
        if os.path.isdir(filename):
            filenames = glob.glob(filename + "/tables/" + pattern)
            if len(filenames) == 1:
                self.filename = filenames[0]
            else:
                raise IOError("Found several file with the {} pattern. Please be"
                    "more restrictive using the pattern argument")
        elif os.path.exists(filename):
            # must be a 'complete' file from rnadiff analysis
            self.filename = filename
        else:
            raise TypeError("{} does not exist".format(filename))

        # Finally, read the data itself
        self.df = pd.read_csv(self.filename, sep=sep, index_col=0)

        # Just an alias to a subset of the dataframe
        normed = [x for x in self.df.columns if x.startswith('norm')]
        self.normcounts = self.df[normed]

        # some parameters/attributes
        self._alpha = alpha
        self._log2_fc = log2_fc

        # What are the sample names and condition names
        self.sample_names = [x.replace('norm.', '') for x in normed]
        self.condition_names = set([x[0:-1] for x in self.sample_names])
        self.set_colors()

        self._set_gene_lists_one_condition()
Example #32
    def get_taxons_from_gis(self, gis, filename="gi_taxid_nucl.dmp"):
        filename = self.taxon_path + os.sep + filename
        data = pd.read_csv(filename, chunksize=1000000, sep='\t', header=None)
        N = 560  # with time this number will be deprecated but good for now

        local_gis = gis[:]

        # We will find GIs in an order different from the input gis list, so
        # we need to keep track of the order
        found_gis = []
        taxons = [32644] * len(gis)  # 32644 means unidentified
        # we search for the unique gis. Once found, we remove them from the
        # vector and keep going until the vector is empty or there are no more
        # chunks. A good sanity check is that the final gis vector should be
        # empty, meaning all have been found. We do not care about the order
        # of the final taxons vector as compared to the GI vector

        print("Scanning %s to look for %s GI numbers" % (filename, len(gis)))
        pb = Progress(N)
        for i, chunk in enumerate(data):
            chunk.set_index(0, inplace=True)
            chunk = chunk.reindex(local_gis).dropna()  # modern replacement for the deprecated .ix

            # keep the GI and Taxon
            found_gis.extend([int(x) for x in list(chunk.index)])

            # update the remaining GIs and the taxons
            for gi, tax in zip(chunk.index, chunk.values):
                local_gis.remove(gi)
                index = gis.index(gi)
                taxons[index] = tax

            # no need to carry on if all GIs were found
            if len(local_gis) == 0:
                break
            pb.animate(i + 1)
        print("")

        taxons = [int(x) for x in taxons]
        return taxons
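
Per chunk, the lookup reduces to reindexing by the wanted GIs and dropping misses. A minimal sketch with a toy chunk (reindex is the modern stand-in for the deprecated .ix used in the original):

import pandas as pd

chunk = pd.DataFrame({1: [9606, 562]}, index=[100, 200])   # GI -> taxon
wanted = [200, 999]
hits = chunk.reindex(wanted).dropna()   # NaN rows for misses are removed
print(hits)                             # only GI 200 exists in this chunk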
Example #33
    def __init__(self, directory=".", prefix="job-*"):
        self.prefix = prefix
        self.directory = directory

        # low quality isoforms
        self.lq_isoforms = self.get_file("lq_isoforms.fastq")
        if self.lq_isoforms:
            self.lq_sequence = FastQ(self.lq_isoforms)

        # high quality isoforms
        self.hq_isoforms = self.get_file("hq_isoforms.fastq")
        if self.hq_isoforms:
            self.hq_sequence = FastQ(self.hq_isoforms)

        # General info
        self.csv = self.get_file("-file.csv")
        if self.csv:
            self.data = pd.read_csv(self.csv)

        # CCS fasta sequence
        #self.ccs = self.get_file("-ccs.tar.gz")
        self.ccs = self.get_file("ccs.fasta", noprefix=True)
        if self.ccs:
            self.ccs = FastA(self.ccs)
Example #34
File: cutadapt.py Project: sequana/sequana
    def _get_histogram_data(self):
        """In cutadapt logs, an adapter section contains
        a histogram of matches that starts with a header
        and ends with a blank line
        """
        header = 'length\tcount\texpect\tmax.err\terror counts\n'
        with open(self.input_filename, 'r') as fin:
            # not too large so let us read everything
            data = fin.readlines()
            scanning_histogram = False
            adapters = []
            current_hist = header
            dfs = {}

            if "variable 5'/3'" in "\n".join(data):
                cutadapt_mode = "b"
            else:
                cutadapt_mode = "other"

            for this in data:
                # while we have not found a new adapter histogram section,
                # we keep going
                # !! What about 5' / 3'
                if this.startswith("==="):
                    if 'read: Adapter' in this:
                        # We keep "read: Adapter" because it may be the first
                        # or second read, so to avoid confusion we keep the
                        # full name for now.
                        name = this.replace("First read: Adapter ", "R1_")
                        name = name.replace("Second read: Adapter ", "R2_")
                        name = name.strip().strip("===")
                        name = name.strip()
                    elif "=== Adapter" in this:
                        name = this.split("=== ")[1].split(" ===")[0]
                        name = name.strip()
                    else:
                        pass

                if scanning_histogram is False:
                    if this == header:
                        # we found the beginning of a histogram
                        scanning_histogram = True
                    else:
                        # we are somewhere in the log we do not care about
                        pass
                elif scanning_histogram is True and len(this.strip()) != 0:
                    # accumulate the histogram data
                    current_hist += this
                elif scanning_histogram is True and len(this.strip()) == 0:
                    # we found the end of the histogram.
                    # Could be a 5'/3' case? If so, another histogram is
                    # possible
                    df = pd.read_csv(io.StringIO(current_hist), sep='\t')
                    # reinitialize the variables
                    if cutadapt_mode != "b":
                        dfs[name] = df.set_index("length")
                        current_hist = header
                        scanning_histogram = False
                    else:
                        # there will be another histogram so keep scanning
                        current_hist = header
                        # If we have already found a histogram, this is
                        # therefore the second one here.
                        if name in dfs:
                            if len(df):
                                # DataFrame.append is deprecated; use concat
                                dfs[name] = pd.concat([dfs[name], df.set_index("length")])
                            scanning_histogram = False
                            dfs[name] = dfs[name].reset_index().groupby("length").aggregate(sum)
                        else:
                            dfs[name] = df.set_index("length")
                            scanning_histogram = True
                else:
                    pass
        return dfs
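
Each histogram is accumulated as text and parsed in one go. A minimal sketch of that idiom with a toy histogram:

import io
import pandas as pd

header = "length\tcount\texpect\tmax.err\terror counts\n"
hist = header + "10\t5\t0.1\t1\t5\n11\t3\t0.1\t1\t3\n"
df = pd.read_csv(io.StringIO(hist), sep="\t").set_index("length")
print(df["count"].sum())   # 8 matches in this toy histogram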
Example #35
################################ IMPORT ################################################################################################
import sys
from sequana.lazy import pandas as pd
from Bio import SeqIO

################################ PARAMETERS ################################################################################################

file_input = str(sys.argv[1])
col_name_pos = str(sys.argv[2])
file_output = str(sys.argv[3])
file_genbank = str(sys.argv[4])

################################ INPUT DATA ################################################################################################

df = pd.read_csv(file_input, sep=",")
df = df.sort_values(by=col_name_pos, ascending=True)

seq_records = SeqIO.parse(file_genbank, "genbank")
record = next(seq_records)

# for each position, check if there is CDS annotation
rec_i = 0
b = 0
e = 0

# init table all results
result_annot = []
header_df_results = [
    "CDS", " start", " end", " strand", " gene_ID", " gene_name", " product",
    " note"
]
Example #36
File: isoseq.py Project: sequana/sequana
 def __init__(self, filename):
     self.filename = filename
     self.df = pd.read_csv(filename, header=None, sep="\t")
     self.coverage_column = 12
Example #37
 def read(self):
     """Read a CSV file"""
     self.df = pd.read_csv(self.filename, sep=self.sep)
Example #38
alpha = 0.6
nb_bars = 15
figsize = (14, 8)
max_chars = 60
fontsize = 24

if save_output:
    title = filename_output.replace(".png",
                                    "").replace("_", " ").replace(".txt", "")
else:
    title = f_input.replace("fof_", "").replace(".txt", "")

################################ INPUT DATA ##############################################################################################

df = pd.read_csv(f_input)

################################ EXECUTE ##############################################################################################

names_prod = collections.Counter(df.loc[:, "product"])

list_names = []
list_count = []
for k in names_prod.keys():
    if (str(k) != "nan") & (k != "None"):
        list_names.append(str(k))
        list_count.append(int(names_prod[k]))
    # print(names_prod[k])

list_names_sort = [
    n for (c, n) in sorted(zip(list_count, list_names), reverse=True)
]
Example #39
    def __init__(self, filename):
        super(ExpDesignMiSeq, self).__init__(filename)
        self.name = "ExpDesignMiSeq"

        data = {}
        # shlex removes blank lines and splits on newlines;
        # strip is applied as well
        rawdata = shlex.split(open(filename, "r"))
        for line in rawdata:
            # sometimes, IEM will store the ;;; at the end
            # so we can get [HEADER];;;;;;;;;;;
            if line.startswith('[') and "]" in line:
                line = line.strip(";").strip(",").strip()
                currentkey = line.replace("[", "").replace("]", "")
                data[currentkey] = []
            else:
                data[currentkey].append(line)

        for key in data.keys():
            data[key] = "\n".join(data[key])

        for this in ["Header", "Reads", "Settings", "Data"]:
            if this not in data.keys():
                logger.warning("%s not found in the DesignExpMiSeq file" % this)

        self.data = data
        self.df = pd.read_csv(io.StringIO(data["Data"]))
        if self.df.shape[1] not in [8, 10]:
            self.df = pd.read_csv(io.StringIO(data["Data"]), sep=";")
            if self.df.shape[1] not in [8, 10]:
                logger.warning("Data section must have 8 or 10 columns. Check the samplesheet")

        # Fixes https://github.com/sequana/sequana/issues/507
        self.df["Sample_ID"] = self.df["Sample_ID"].astype(str)

        self.df.rename(columns={"I7_Index_ID":"Index1_ID", "index":"Index1_Seq",
            "I5_Index_ID": "Index2_ID", "index2":"Index2_Seq"},
                       inplace=True)

        # The name of the Index_ID is not standard.
        # It depends on the experimentalist because a prefix may be added.
        # One known prefix is NF. We agreed that future prefixes must end with
        # an underscore so that they can be removed. Since an ID may contain
        # letters (e.g. S501), it would otherwise be impossible to split the
        # prefix from the index.
        self.df["Index1_ID"] = self.df["Index1_ID"].apply(
                lambda x: x.replace("NF", ""))
        self.df["Index1_ID"] = self.df["Index1_ID"].apply(
                lambda x: x.split("_",1)[-1])
        try:
            self.df["Index1_ID"] = self.df["Index1_ID"].astype(int)
        except ValueError:
            pass

        if "Index2_ID" in self.df.columns:
            self.df["Index2_ID"] = self.df["Index2_ID"].apply(
                lambda x: x.replace("NF", ""))
            self.df["Index2_ID"] = self.df["Index2_ID"].apply(
                lambda x: x.split("_",1)[-1])
            try:
                self.df["Index2_ID"] = self.df["Index2_ID"].astype(int)
            except ValueError:
                pass

        # Figure out the type of adapters if possible
        try:
            header = self.data['Header']
            assay = [x for x in header.split('\n') if x.startswith("Assay")]
            assay = assay[0]
            items = assay.split(',')
            if items[0] == "Assay":
                self.adapter_type = assay.split(",")[1]
            else:
                items = assay.split(';')
                self.adapter_type = assay.split(";")[1]
        except (KeyError, IndexError):
            pass

        self.check()
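
The Index1_ID/Index2_ID normalization above can be shown in isolation. A minimal sketch with assumed toy IDs:

import pandas as pd

ids = pd.Series(["NF_S501", "7", "D_12"])
ids = ids.apply(lambda x: x.replace("NF", "")).apply(lambda x: x.split("_", 1)[-1])
try:
    ids = ids.astype(int)
except ValueError:
    pass   # mixed IDs such as "S501" stay as strings
print(ids.tolist())   # ['S501', '7', '12']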