Пример #1
0
 def score_sc2(self, prediction_file):
     """Score a sub-challenge 2 prediction with the Perl scoring script.

     Runs ``DREAM_Olfaction_scoring_Q2.pl`` (looked up in ``self.classpath``)
     with *prediction_file*, a temporary output file and the second item
     returned by ``self.download_gs()``, then reads the temporary file back
     as a tab-separated table.

     :param prediction_file: path of the prediction file to score.
     :return: the first row of the scoring table (pandas Series).
     """
     fh = TempFile()
     try:
         _, gs2 = self.download_gs()
         script = self.classpath + os.sep + "DREAM_Olfaction_scoring_Q2.pl"
         cmd = "perl %s %s %s %s" % (script, prediction_file, fh.name, gs2)
         shellcmd(cmd)
         # ``.ix`` was removed in pandas 1.0; the table has the default
         # integer index, so the first row is selected by position.
         df = pd.read_csv(fh.name, sep='\t', index_col=None).iloc[0]
     finally:
         # Remove the temporary file even when the script or parsing fails.
         fh.delete()
     return df
Пример #2
0
 def score_sc2(self, prediction_file):
     """Score a sub-challenge 2 prediction with the Perl scoring script.

     Runs ``DREAM_Olfaction_scoring_Q2.pl`` (looked up in ``self.classpath``)
     with *prediction_file*, a temporary output file and the second item
     returned by ``self.download_gs()``, then reads the temporary file back
     as a tab-separated table.

     :param prediction_file: path of the prediction file to score.
     :return: the first row of the scoring table (pandas Series).
     """
     fh = TempFile()
     try:
         _, gs2 = self.download_gs()
         script = self.classpath + os.sep + "DREAM_Olfaction_scoring_Q2.pl"
         cmd = "perl %s %s %s %s" % (script, prediction_file, fh.name, gs2)
         shellcmd(cmd)
         # ``.ix`` was removed in pandas 1.0; the table has the default
         # integer index, so the first row is selected by position.
         df = pd.read_csv(fh.name, sep='\t', index_col=None).iloc[0]
     finally:
         # Remove the temporary file even when the script or parsing fails.
         fh.delete()
     return df
Пример #3
0
    def score_A(self, filename):
        """Score a prediction with the weighted concordance index script.

        Runs ``weighted_average_concordance_index.pl`` (found in
        ``self.classpath``) on *filename*, reads the per-drug table it
        produces, and derives the averaged scores plus a p-value computed
        from the precomputed statistics stored in
        ``data/DREAM7_DrugSensitivity1_drug_zscores.txt``.

        :param filename: path of the prediction file to score.
        :return: ``{'Results': results}`` where ``results`` is a pandas
            Series holding the column means, the weighted average
            c-index and its p-value.

        The process exits with status 1 (after printing a hint) if the
        output of the Perl script cannot be read.
        """
        from easydev import TempFile
        fh = TempFile()
        script = self._pj(
            [self.classpath, 'weighted_average_concordance_index.pl'])
        datadir = self._pj([self.classpath, 'data'])
        cmd = "perl %s %s %s %s" % (script, filename, datadir, fh.name)

        try:
            shellcmd(cmd, verbose=True, ignore_errors=True)
            try:
                df = pd.read_csv(fh.name, sep='\t', header=None)
            except Exception:
                print("Something wrong in the Scoring while executing \n  %s. " %
                      cmd)
                print(
                    "\n The D7C4 challenge requires a Perl package to be installed"
                )
                print("See D7C4 documentation e.g., on dreamtools.readthedocs.org")
                import sys
                sys.exit(1)
        finally:
            # The temporary file is removed on every path, including the
            # sys.exit above.
            fh.delete()

        df.columns = [
            'DrugID', 'probabilistic c-index',
            'weighted probabilistic c-index', 'zscores'
        ]
        df = df.set_index('DrugID')

        # Column sums normalised by the summed z-scores (``.ix`` was removed
        # in pandas 1.0, label-based ``.loc`` is the equivalent here).
        totals = df.sum()
        ws = (totals / totals.loc['zscores']).loc[
            'weighted probabilistic c-index']

        results = df.mean()
        results['weight average probabilistic c-index'] = ws

        del results['zscores']

        # Finally compute pvalues based on precomputed scores
        precomp = pd.read_csv(self._pj([
            self.classpath, 'data', 'DREAM7_DrugSensitivity1_drug_zscores.txt'
        ]),
                              sep='\t',
                              skiprows=6,
                              header=None)

        # header=None gives integer row and column labels, so row 31 /
        # columns 1 and 2 are addressed by label.
        overall_mean = precomp.loc[31, 1]
        overall_var = precomp.loc[31, 2]

        # Upper tail of a normal distribution with the precomputed
        # mean and variance.
        pval = 1 - (.5 * (math.erf(
            (ws - overall_mean) / (math.sqrt(2 * overall_var))) + 1))

        results['weight average probabilistic c-index p-value'] = pval

        return {'Results': results}
Пример #4
0
    def score_sc1(self, prediction_file):
        """Compute all results and compare user prediction with all official participants

        Runs ``DREAM_Olfaction_scoring_Q1.pl`` (looked up in
        ``self.classpath``) with *prediction_file*, a temporary output file
        and the first item returned by ``self.download_gs()``, then reads
        the temporary file back as a tab-separated table.

        This scoring function can take a long time (about 5-10 minutes).

        :param prediction_file: path of the prediction file to score.
        :return: the first row of the scoring table (pandas Series).
        """
        fh = TempFile()
        try:
            gs1, _ = self.download_gs()
            script = self.classpath + os.sep + "DREAM_Olfaction_scoring_Q1.pl"
            cmd = "perl %s %s %s %s" % (script, prediction_file, fh.name, gs1)
            shellcmd(cmd)
            # ``.ix`` was removed in pandas 1.0; the table has the default
            # integer index, so the first row is selected by position.
            df = pd.read_csv(fh.name, sep='\t', index_col=None).iloc[0]
        finally:
            # Remove the temporary file even when the script or parsing fails.
            fh.delete()
        return df
Пример #5
0
    def score_sc1(self, prediction_file):
        """Compute all results and compare user prediction with all official participants

        Runs ``DREAM_Olfaction_scoring_Q1.pl`` (looked up in
        ``self.classpath``) with *prediction_file*, a temporary output file
        and the first item returned by ``self.download_gs()``, then reads
        the temporary file back as a tab-separated table.

        This scoring function can take a long time (about 5-10 minutes).

        :param prediction_file: path of the prediction file to score.
        :return: the first row of the scoring table (pandas Series).
        """
        fh = TempFile()
        try:
            gs1, _ = self.download_gs()
            script = self.classpath + os.sep + "DREAM_Olfaction_scoring_Q1.pl"
            cmd = "perl %s %s %s %s" % (script, prediction_file, fh.name, gs1)
            shellcmd(cmd)
            # ``.ix`` was removed in pandas 1.0; the table has the default
            # integer index, so the first row is selected by position.
            df = pd.read_csv(fh.name, sep='\t', index_col=None).iloc[0]
        finally:
            # Remove the temporary file even when the script or parsing fails.
            fh.delete()
        return df
Пример #6
0
    def _convert(self, filename):
        """Create a ``thumb_``-prefixed, resized copy next to *filename*.

        Calls the external ``convert`` command (presumably ImageMagick --
        confirm) through ``easydev.shellcmd`` with
        ``-resize <size>x<size>``, where the size is
        ``self.thumbnail_size``.

        :param filename: path of the image to convert.
        """
        size = self.thumbnail_size
        directory, basename = os.path.split(filename)
        # os.path.join copes with an empty directory part; gluing the pieces
        # with os.sep would turn "a.jpg" into "/a.jpg".
        source = os.path.join(directory, basename)
        thumb_filename = os.path.join(directory, "thumb_" + basename)

        easydev.shellcmd("convert \"%s\" -resize %sx%s \"%s\"" %
                (source, size, size, thumb_filename))
Пример #7
0
    def score_A(self, filename):
        """Score a prediction with the weighted concordance index script.

        Runs ``weighted_average_concordance_index.pl`` (found in
        ``self.classpath``) on *filename*, reads the per-drug table it
        produces, and derives the averaged scores plus a p-value computed
        from the precomputed statistics stored in
        ``data/DREAM7_DrugSensitivity1_drug_zscores.txt``.

        :param filename: path of the prediction file to score.
        :return: ``{'Results': results}`` where ``results`` is a pandas
            Series holding the column means, the weighted average
            c-index and its p-value.

        The process exits with status 1 (after printing a hint) if the
        output of the Perl script cannot be read.
        """
        from easydev import TempFile
        fh = TempFile()
        script = self._pj([self.classpath,
            'weighted_average_concordance_index.pl'])
        datadir = self._pj([self.classpath, 'data'])
        cmd = "perl %s %s %s %s" % (script, filename, datadir, fh.name)

        try:
            shellcmd(cmd, verbose=True, ignore_errors=True)
            try:
                df = pd.read_csv(fh.name, sep='\t', header=None)
            except Exception:
                print("Something wrong in the Scoring while executing \n  %s. " % cmd)
                print("\n The D7C4 challenge requires a Perl package to be installed")
                print("See D7C4 documentation e.g., on dreamtools.readthedocs.org")
                import sys
                sys.exit(1)
        finally:
            # The temporary file is removed on every path, including the
            # sys.exit above.
            fh.delete()

        df.columns = ['DrugID', 'probabilistic c-index',
        'weighted probabilistic c-index', 'zscores']
        df = df.set_index('DrugID')

        # Column sums normalised by the summed z-scores (``.ix`` was removed
        # in pandas 1.0, label-based ``.loc`` is the equivalent here).
        totals = df.sum()
        ws = (totals / totals.loc['zscores']).loc[
            'weighted probabilistic c-index']

        results = df.mean()
        results['weight average probabilistic c-index'] = ws

        del results['zscores']

        # Finally compute pvalues based on precomputed scores
        precomp = pd.read_csv(self._pj([self.classpath, 'data',
            'DREAM7_DrugSensitivity1_drug_zscores.txt']), sep='\t',
            skiprows=6,  header=None)

        # header=None gives integer row and column labels, so row 31 /
        # columns 1 and 2 are addressed by label.
        overall_mean = precomp.loc[31, 1]
        overall_var = precomp.loc[31, 2]

        # Upper tail of a normal distribution with the precomputed
        # mean and variance.
        pval = 1 - (.5 * (math.erf(
            (ws - overall_mean) / (math.sqrt(2 * overall_var))) + 1))

        results['weight average probabilistic c-index p-value'] = pval

        return {'Results': results}
Пример #8
0
def main(options):
    """Run the coverage analysis on the single chromosome of the input.

    A BAM input is first converted into a BED file (same path, ``.bed``
    extension) with ``bedtools genomecov``; a BED input is used as is.

    :param options: object exposing at least ``input`` (path of a ``.bam``
        or ``.bed`` file); it is also forwarded to ``run_analysis``.
    :raises ValueError: if the input is neither a BAM nor a BED file, or if
        the BED file does not contain exactly one chromosome.
    """
    if options.input.endswith(".bam"):
        # Swap only the trailing extension: str.replace would also rewrite
        # any ".bam" occurring earlier in the path.
        bedfile = options.input[:-len(".bam")] + ".bed"
        shellcmd("bedtools genomecov -d -ibam %s > %s" % (options.input, bedfile))
    elif options.input.endswith(".bed"):
        bedfile = options.input
    else:
        raise ValueError("Input file must be a BAM or BED file")

    gc = GenomeCov(bedfile, -4, 4, 0.5, 0.5)
    if len(gc.chr_list) != 1:
        raise ValueError("Error, more than 1 chr in the BED file")

    chrom = gc.chr_list[0]
    run_analysis(gc, chrom, 1, options)
Пример #9
0
    def build_thumbnails(self):
        """Builds all thumbnails.

        Every file returned by ``self._get_files()`` is passed to
        ``self._convert``. If no ``thumb.jpg`` exists afterwards (looked up
        in ``self.directory`` when it is not empty), a hard link named
        ``thumb.jpg`` is created with ``ln`` next to the first entry
        returned by ``self._get_thumbnails()``.

        .. warning:: overwrite them if they exists
        """
        filenames = self._get_files()
        # Parenthesised form is valid on both Python 2 and Python 3.
        print(filenames)
        for filename in filenames:
            self._convert(filename)

        # find thumb.jpg
        pattern = "thumb.jpg"
        if self.directory != "":
            pattern = self.directory + os.sep + pattern

        if glob.glob(pattern):
            return

        # if not found, create a link
        thumb_names = self._get_thumbnails()
        if len(thumb_names) == 0:
            return

        thumb_name = thumb_names[0]
        # os.path.join keeps the link relative when the thumbnail has no
        # directory part (plain concatenation would yield "/thumb.jpg").
        target = os.path.join(os.path.dirname(thumb_name), "thumb.jpg")
        cmd = "ln \"%s\" \"%s\"" % (thumb_name, target)
        easydev.shellcmd(cmd)
Пример #10
0
    def score_A(self, filename):
        """Score a prediction with the weighted concordance index script.

        Runs ``weighted_average_concordance_index.pl`` (found in
        ``self._path2data``) on *filename*, reads the per-drug table it
        produces, and derives the averaged scores plus a p-value computed
        from the precomputed statistics stored in
        ``data/DREAM7_DrugSensitivity1_drug_zscores.txt``.

        :param filename: path of the prediction file to score.
        :return: ``{'Results': results}`` where ``results`` is a pandas
            Series holding the column means, the weighted average
            c-index and its p-value.
        """
        from easydev import TempFile
        fh = TempFile()
        script = self._pj([self._path2data,
            'weighted_average_concordance_index.pl'])
        datadir = self._pj([self._path2data, 'data'])
        cmd = "perl %s %s %s %s" % (script, filename, datadir, fh.name)

        try:
            shellcmd(cmd, verbose=True, ignore_errors=True)
            df = pd.read_csv(fh.name, sep='\t', header=None)
        finally:
            # Remove the temporary file even when the output is unreadable.
            fh.delete()

        df.columns = ['DrugID', 'probabilistic c-index',
        'weighted probabilistic c-index', 'zscores']
        df = df.set_index('DrugID')

        # Column sums normalised by the summed z-scores (``.ix`` was removed
        # in pandas 1.0, label-based ``.loc`` is the equivalent here).
        totals = df.sum()
        ws = (totals / totals.loc['zscores']).loc[
            'weighted probabilistic c-index']

        results = df.mean()
        # NOTE(review): the "probabilitis" spelling is kept on purpose, the
        # key is part of the returned data.
        results['weight average probabilitis c-index'] = ws

        del results['zscores']

        # Finally compute pvalues based on precomputed scores
        precomp = pd.read_csv(self._pj([self._path2data, 'data',
            'DREAM7_DrugSensitivity1_drug_zscores.txt']), sep='\t',
            skiprows=6,  header=None)

        # header=None gives integer row and column labels, so row 31 /
        # columns 1 and 2 are addressed by label.
        overall_mean = precomp.loc[31, 1]
        overall_var = precomp.loc[31, 2]

        # Upper tail of a normal distribution with the precomputed
        # mean and variance.
        pval = 1 - (.5 * (math.erf(
            (ws - overall_mean) / (math.sqrt(2 * overall_var))) + 1))

        results['weight average probabilitis c-index p-value'] = pval

        return {'Results': results}
Пример #11
0
    def get_full_stats_as_df(self):
        """Return the summary numbers of ``samtools stats`` as a dataframe

        ``samtools stats`` is run on ``self._filename`` and only the lines
        starting with ``SN`` are kept. Such a line looks like::

            SN name: value #comment

        The trailing comment and the leading ``SN`` marker are dropped and
        the remainder is cut at the ``:`` characters.

        :return: a dataframe with two columns, *description* and *count*,
            sorted by *count*. The counts are kept as the strings printed
            by samtools.

        .. note:: uses samtools behind the scene
        """
        from easydev import shellcmd
        output = shellcmd("samtools stats %s" % self._filename)
        text = output.decode('utf-8')

        descriptions = []
        counts = []
        for line in text.split("\n"):
            # only the summary-number lines are of interest
            if not line.startswith('SN'):
                continue
            # strip the comment, then the 3 leading characters (SN + tab)
            fields = line.split('#')[0][3:].split(":")
            descriptions.append(fields[0])
            counts.append(fields[1].strip())

        table = pd.DataFrame({"description": descriptions, "count": counts},
                             columns=['description', 'count'])
        return table.sort_values(by='count')
Пример #12
0
def main(args=None):
    """Command-line entry point of the coverage analysis.

    Parses *args* (``sys.argv`` when omitted). When a reference or genbank
    download is requested, the download is performed and the function
    returns. Otherwise a ``GenomeCov`` instance is built from the BED input
    (a BAM input is first converted with ``bedtools genomecov``), the
    requested chromosome(s) are analysed with ``run_analysis`` and the HTML
    report is created, then optionally opened in a browser.

    :param args: full argument list, program name included.
    :raises ValueError: if the input is neither a BAM nor a BED file, or if
        the chromosome index is out of range.
    """
    if args is None:
        args = sys.argv[:]

    user_options = Options(prog="sequana")

    # If --help or no options provided, show the help
    if len(args) == 1:
        # NOTE(review): presumably exits after printing the help, otherwise
        # ``options`` is undefined below -- confirm.
        user_options.parse_args(["prog", "--help"])
    else:
        options = user_options.parse_args(args[1:])

    sequana_debug_level(options.logging_level)

    if options.download_reference:
        logger.info("Downloading reference %s from %s\n" %
            (options.download_reference, options.database))

        from bioservices.apps import download_fasta as df
        df.download_fasta(options.download_reference, method=options.database)
        if options.download_genbank is None:
            return

    if options.download_genbank:
        # Report the genbank identifier (the reference one was logged here).
        logger.info("Downloading genbank %s from %s\n" %
            (options.download_genbank, options.database))
        from sequana.snpeff import download_fasta_and_genbank
        download_fasta_and_genbank(options.download_genbank,
                                   options.download_genbank,
                                   genbank=True, fasta=False)
        return

    if options.genbank:
        assert os.path.exists(options.genbank), \
            "%s does not exists" % options.genbank

    if options.verbose:
        logger.info("Reading %s. This may take time depending on "
            "your input file" % options.input)

    # Convert BAM to BED
    if options.input.endswith(".bam"):
        # Swap only the trailing extension: str.replace would also rewrite
        # any ".bam" occurring earlier in the path.
        bedfile = options.input[:-len(".bam")] + ".bed"
        if options.verbose:
            logger.info("Converting BAM into BED file")
        shellcmd("bedtools genomecov -d -ibam %s > %s" % (options.input, bedfile))
    elif options.input.endswith(".bed"):
        bedfile = options.input
    else:
        raise ValueError("Input file must be a BAM or BED file")

    # Set the thresholds
    if options.low_threshold is None:
        options.low_threshold = -options.threshold

    if options.high_threshold is None:
        options.high_threshold = options.threshold

    # and output directory
    config.output_dir = options.output_directory
    config.sample_name = os.path.basename(options.input).split('.')[0]

    # Now we can create the instance of GenomeCoverage
    gc = GenomeCov(bedfile, options.genbank, options.low_threshold,
                   options.high_threshold, 0.5, 0.5)

    # if we have the reference, let us use it
    if options.reference:
        logger.info('Computing GC content')
        gc.compute_gc_content(options.reference, options.w_gc,
                              options.circular)

    # Now we scan the chromosomes,
    if len(gc.chr_list) == 1:
        if options.verbose:
            logger.warning("There is only one chromosome. Selected automatically.")
        chrom = gc.chr_list[0]
        chromosomes = [chrom]
        run_analysis(chrom, options, gc.feature_dict)
    elif options.chromosome <= -1 or options.chromosome > len(gc.chr_list):
        # The largest accepted index is the number of chromosomes.
        raise ValueError("invalid chromosome index ; must be in [1-{}]".format(
            len(gc.chr_list)))
    else:
        # A zero index means "all chromosomes".
        # For user, we start at position 1 but in python, we start at zero
        if options.chromosome:
            chromosomes = [gc[options.chromosome - 1]]
        else:
            chromosomes = gc

        if options.verbose:
            print("There are %s chromosomes/contigs." % len(gc))
            for this in gc.chr_list:
                print("    {}".format(this.chrom_name))

        for i, chrom in enumerate(chromosomes):
            if options.verbose:
                # 1-based position: the selected index, or the running
                # count when every chromosome is analysed.
                print("==================== analysing chrom/contig %s/%s (%s)"
                      % (options.chromosome or i + 1, len(gc),
                      chrom.chrom_name))
            run_analysis(chrom, options, gc.feature_dict)

    if options.verbose:
        logger.info("Creating report in %s. Please wait" % config.output_dir)

    if options.chromosome:
        cc = options.chromosome - 1
        datatable = CoverageModule.init_roi_datatable(gc[cc])
        ChromosomeCoverageModule(chromosomes[0], datatable, None)
        # Name the page after the analysed chromosome explicitly instead of
        # relying on the variable left over from the loop above.
        page = "{0}{1}{2}.cov.html".format(config.output_dir, os.sep,
                                           chromosomes[0].chrom_name)
    else:
        CoverageModule(gc)
        page = "{0}{1}coverage.html".format(config.output_dir, os.sep)

    if options.show_html:
        from easydev import onweb
        onweb(page)
Пример #13
0
def main(args=None):
    """Command-line entry point of the coverage analysis.

    Parses *args* (``sys.argv`` when omitted). When a reference or genbank
    download is requested, the download is performed and the function
    returns. Otherwise a ``GenomeCov`` instance is built from the BED input
    (a BAM input is first converted with ``bedtools genomecov``), the
    requested chromosome(s) are analysed with ``run_analysis`` and, unless
    ``skip_multiqc`` is set, ``multiqc`` is run in the output directory.

    :param args: full argument list, program name included.
    :raises ValueError: if the input is neither a BAM nor a BED file.

    The process exits with status 1 if the chromosome index is out of
    range.
    """
    if args is None:
        args = sys.argv[:]

    user_options = Options(prog="sequana")

    # If --help or no options provided, show the help
    if len(args) == 1:
        # NOTE(review): presumably exits after printing the help, otherwise
        # ``options`` is undefined below -- confirm.
        user_options.parse_args(["prog", "--help"])
    else:
        options = user_options.parse_args(args[1:])

    logger.level = options.logging_level

    if options.download_reference:
        logger.info("Downloading reference %s from %s\n" %
            (options.download_reference, options.database))

        from bioservices.apps import download_fasta as df
        df.download_fasta(options.download_reference, method=options.database)
        if options.download_genbank is None:
            return

    if options.download_genbank:
        logger.info("Downloading genbank %s from %s\n" %
            (options.download_genbank, options.database))
        from sequana.snpeff import download_fasta_and_genbank
        download_fasta_and_genbank(options.download_genbank,
                                   options.download_genbank,
                                   genbank=True, fasta=False)
        return

    if options.genbank:
        assert os.path.exists(options.genbank), \
            "%s does not exists" % options.genbank

    logger.info("Reading %s. This may take time depending on "
        "your input file" % options.input)

    # Convert BAM to BED
    if options.input.endswith(".bam"):
        # Swap only the trailing extension: str.replace would also rewrite
        # any ".bam" occurring earlier in the path.
        bedfile = options.input[:-len(".bam")] + ".bed"
        logger.info("Converting BAM into BED file")
        shellcmd("bedtools genomecov -d -ibam %s > %s" % (options.input, bedfile))
    elif options.input.endswith(".bed"):
        bedfile = options.input
    else:
        raise ValueError("Input file must be a BAM or BED file")

    # Set the thresholds
    if options.low_threshold is None:
        options.low_threshold = -options.threshold

    if options.high_threshold is None:
        options.high_threshold = options.threshold

    # and output directory
    config.output_dir = options.output_directory
    config.sample_name = os.path.basename(options.input).split('.')[0]

    # Now we can create the instance of GenomeCoverage
    # (-1 means that no specific chromosome was requested)
    if options.chromosome == -1:
        chrom_list = []
    else:
        chrom_list = [options.chromosome]
    gc = GenomeCov(bedfile, options.genbank, options.low_threshold,
                   options.high_threshold, options.double_threshold,
                   options.double_threshold, chunksize=options.chunksize,
                   chromosome_list=chrom_list)

    # if we have the reference, let us use it
    if options.reference:
        logger.info('Computing GC content')
        gc.compute_gc_content(options.reference, options.w_gc,
                              options.circular)

    # Now we scan the chromosomes,
    if len(gc.chrom_names) == 1:
        logger.warning("There is only one chromosome. Selected automatically.")
        run_analysis(gc.chr_list[0], options, gc.feature_dict)
    elif options.chromosome < -1 or options.chromosome > len(gc.chrom_names):
        # NOTE(review): an index of 0 passes this check and then selects
        # the last name through chrom_names[-1] -- confirm whether 0
        # should be rejected as well.
        msg = "invalid chromosome index; must be in [1;{}]".format(len(gc.chrom_names))
        logger.error(msg)
        sys.exit(1)
    else:
        if options.chromosome == -1:
            chromosomes = gc.chrom_names # take all chromosomes
        else:
            # For user, we start at position 1 but in python, we start at zero
            chromosomes = [gc.chrom_names[options.chromosome - 1]]

        logger.info("There are %s chromosomes/contigs." % len(gc))
        for this in gc.chrom_names:
            data = (this, gc.positions[this]["start"], gc.positions[this]["end"])
            logger.info("    {} (starting pos: {}, ending pos: {})".format(*data))

        # here we read chromosome by chromosome to save memory.
        # However, if the data is small.
        for i, chrom in enumerate(chromosomes):
            # Report the chromosome that was actually selected: with a
            # single requested chromosome, chrom_names[i] would always be
            # the first name of the file.
            if options.chromosome == -1:
                rank = i + 1
            else:
                rank = options.chromosome
            logger.info("==================== analysing chrom/contig %s/%s (%s)"
                  % (rank, len(gc), chrom))
            # since we read just one contig/chromosome, the chr_list contains
            # only one contig, so we access to it with index 0
            run_analysis(gc.chr_list[i], options, gc.feature_dict)

    if options.skip_multiqc is False:
        logger.info("=========================")
        logger.info("Creating multiqc report")
        pathtocfg = sequana_data("multiqc_config.yaml", "../multiqc/")
        # Build the argument list directly: splitting a formatted string
        # would break a configuration path that contains whitespace.
        cmd = ["multiqc", ".", "-m", "sequana_coverage", "-f", "-c",
               str(pathtocfg)]
        import subprocess
        proc = subprocess.Popen(cmd, cwd=options.output_directory)
        proc.wait()
Пример #14
0
def main(args=None):
    """Command-line entry point of the coverage analysis.

    Parses *args* (``sys.argv`` when omitted). When a reference or genbank
    download is requested, the download is performed and the function
    returns. Otherwise a ``GenomeCov`` instance is built from the BED input
    (a BAM input is first converted with ``bedtools genomecov``), the
    requested chromosome(s) are analysed with ``run_analysis`` and, unless
    ``skip_multiqc`` is set, ``multiqc`` is run in the output directory.

    :param args: full argument list, program name included.
    :raises ValueError: if the input is neither a BAM nor a BED file.

    The process exits with status 1 if the chromosome index is out of
    range.
    """
    if args is None:
        args = sys.argv[:]

    user_options = Options(prog="sequana")

    # If --help or no options provided, show the help
    if len(args) == 1:
        # NOTE(review): presumably exits after printing the help, otherwise
        # ``options`` is undefined below -- confirm.
        user_options.parse_args(["prog", "--help"])
    else:
        options = user_options.parse_args(args[1:])

    logger.level = options.logging_level

    if options.download_reference:
        logger.info("Downloading reference %s from %s\n" %
            (options.download_reference, options.database))

        from bioservices.apps import download_fasta as df
        df.download_fasta(options.download_reference, method=options.database)
        if options.download_genbank is None:
            return

    if options.download_genbank:
        logger.info("Downloading genbank %s from %s\n" %
            (options.download_genbank, options.database))
        from sequana.snpeff import download_fasta_and_genbank
        download_fasta_and_genbank(options.download_genbank,
                                   options.download_genbank,
                                   genbank=True, fasta=False)
        return

    if options.genbank:
        assert os.path.exists(options.genbank), \
            "%s does not exists" % options.genbank

    logger.info("Reading %s. This may take time depending on "
        "your input file" % options.input)

    # Convert BAM to BED
    if options.input.endswith(".bam"):
        # Swap only the trailing extension: str.replace would also rewrite
        # any ".bam" occurring earlier in the path.
        bedfile = options.input[:-len(".bam")] + ".bed"
        logger.info("Converting BAM into BED file")
        shellcmd("bedtools genomecov -d -ibam %s > %s" % (options.input, bedfile))
    elif options.input.endswith(".bed"):
        bedfile = options.input
    else:
        raise ValueError("Input file must be a BAM or BED file")

    # Set the thresholds
    if options.low_threshold is None:
        options.low_threshold = -options.threshold

    if options.high_threshold is None:
        options.high_threshold = options.threshold

    # Now we can create the instance of GenomeCoverage
    # (-1 means that no specific chromosome was requested)
    if options.chromosome == -1:
        chrom_list = []
    else:
        chrom_list = [options.chromosome]

    gc = GenomeCov(bedfile, options.genbank, options.low_threshold,
                   options.high_threshold, options.double_threshold,
                   options.double_threshold, chunksize=options.chunksize,
                   chromosome_list=chrom_list)

    # if we have the reference, let us use it
    if options.reference:
        logger.info('Computing GC content')
        gc.compute_gc_content(options.reference, options.w_gc,
                              options.circular)

    # Now we scan the chromosomes,
    if len(gc.chrom_names) == 1:
        logger.warning("There is only one chromosome. Selected automatically.")
        run_analysis(gc.chr_list[0], options, gc.feature_dict)
    elif options.chromosome < -1 or options.chromosome > len(gc.chrom_names):
        # NOTE(review): an index of 0 passes this check and then selects
        # the last name through chrom_names[-1] -- confirm whether 0
        # should be rejected as well.
        msg = "invalid chromosome index; must be in [1;{}]".format(len(gc.chrom_names))
        logger.error(msg)
        sys.exit(1)
    else:
        if options.chromosome == -1:
            chromosomes = gc.chrom_names # take all chromosomes
        else:
            # For user, we start at position 1 but in python, we start at zero
            chromosomes = [gc.chrom_names[options.chromosome - 1]]

        logger.info("There are %s chromosomes/contigs." % len(gc))
        for this in gc.chrom_names:
            start = gc.positions[this]["start"]
            end = gc.positions[this]["end"]
            data = (this, start, end, end - start)
            logger.info("    {} (starting pos: {}, ending pos: {}, length: {})".format(*data))

        # here we read chromosome by chromosome to save memory.
        # However, if the data is small.
        for i, chrom in enumerate(chromosomes):
            # Report the chromosome that was actually selected: with a
            # single requested chromosome, chrom_names[i] would always be
            # the first name of the file.
            if options.chromosome == -1:
                rank = i + 1
            else:
                rank = options.chromosome
            logger.info("==================== analysing chrom/contig %s/%s (%s)"
                  % (rank, len(gc), chrom))
            # since we read just one contig/chromosome, the chr_list contains
            # only one contig, so we access to it with index 0
            run_analysis(gc.chr_list[i], options, gc.feature_dict)
            # logging level seems to be reset to warning somewhere
            logger.level = options.logging_level

    if options.skip_multiqc is False:
        logger.info("Creating multiqc report")
        pathtocfg = sequana_data("multiqc_config.yaml", "../multiqc/")
        # Build the argument list directly: splitting a formatted string
        # would break a configuration path that contains whitespace.
        cmd = ["multiqc", ".", "-m", "sequana_coverage", "-f", "-c",
               str(pathtocfg)]
        import subprocess
        proc = subprocess.Popen(cmd, cwd=options.output_directory)
        proc.wait()
    logger.info("Done")
Пример #15
0
    def create_data_packages_for_companies(self, companies=None):
        """Build one report package per company.

        For every company and every genomic-feature file listed in
        ``self.gf_filenames``, the previously computed ANOVA results are
        restricted to the drugs that are either web-released
        (``WEBRELEASE=='Y'``) or owned by the company, and the HTML reports
        are generated in ``<company>/<tcga>``. When ``self.debug`` is
        False, the feature and association pages are created too and the
        per-drug HTML pages and volcano images are copied from the
        ``ALL/<tcga>`` analysis.

        :param companies: a company name, a list of names, or None to use
            ``self.companies``.
        """
        ##########################################################
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        #                                                        #
        # DRUG_DECODE and IC50 inputs must be filtered to keep   #
        # only WEBRELEASE=Y and owner                            #
        #                                                        #
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        ##########################################################
        if isinstance(companies, str):
            companies = [companies]

        if companies is None:
            companies = self.companies

        Ncomp = len(companies)
        for ii, company in enumerate(companies):
            print("\n\n========= Analysing company %s out of %s (%s)" %
                  (ii + 1, Ncomp, company))
            self.mkdir(company)
            for gf_filename in sorted(self.gf_filenames):
                # The TCGA label is the second "_"-separated token of the
                # file name, without its extension.
                tcga = gf_filename.split("_")[1].split('.')[0]
                print("---------------- for TCGA %s" % tcga)

                # Read the results previously computed
                try:
                    results_df = self.results[tcga].df.copy()
                except Exception:
                    # NOTE(review): this branch hands an ANOVAResults
                    # instance (not a dataframe) to ANOVAResults below --
                    # confirm that the constructor accepts it.
                    results_path = "ALL/%s/OUTPUT/results.csv" % tcga
                    print("Downloading results from %s" % results_path)
                    results_df = ANOVAResults(results_path)

                results = ANOVAResults(results_df)

                # Get a DrugDecode for that company
                drug_decode_company = self.drug_decode.df.query(
                    "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)
                # Transform into a proper DrugDecode class for safety
                drug_decode_company = DrugDecode(drug_decode_company)

                # filter results using the new drug decode
                drug_ids_in_results = get_drug_id(results.df.DRUG_ID)
                mask = [
                    x in drug_decode_company.df.index
                    for x in drug_ids_in_results
                ]
                # ``.ix`` was removed in pandas 1.0; a boolean mask is
                # handled identically by ``.loc``.
                results.df = results.df.loc[mask]

                # Just to create an instance with the subset of drug_decode
                # and correct settings. This is also used to store
                # the entire input data set. So, we must remove all drugs
                # not relevant for the analysis of this company
                an = ANOVA(self.ic50_filename, gf_filename,
                           drug_decode_company)

                # ``DataFrame.select`` was removed in pandas 1.0; keep the
                # columns whose drug identifier belongs to the company
                # drug decode with a boolean column mask instead.
                keep = [
                    get_drug_id(drug) in drug_decode_company.df.index
                    for drug in an.ic50.df.columns
                ]
                an.ic50.df = an.ic50.df.loc[:, keep]

                an.settings = ANOVASettings(**self.settings)
                an.init()
                an.settings.directory = company + os.sep + tcga
                an.settings.analysis_type = tcga
                self.report = ANOVAReport(an, results)
                self.report.settings.analysis_type = tcga
                self.report.create_html_main(False)
                self.report.create_html_manova(False)

                if self.debug is False:
                    self.report.create_html_features()
                    self.report.create_html_associations()

                    # For now, we just copy all DRUG images from
                    # the analysis made in ALL
                    from easydev import shellcmd, Progress
                    print("\nCopying drug files")
                    drug_ids = results.df.DRUG_ID.unique()
                    pb = Progress(len(drug_ids))

                    # Source and destination folders do not depend on the
                    # drug, so they are built once.
                    html_source = "ALL%s%s%s" % (os.sep, tcga, os.sep)
                    html_dest = "%s%s%s%s" % (company, os.sep, tcga, os.sep)
                    image_source = "ALL%s%s%simages%s" % (os.sep, tcga,
                                                          os.sep, os.sep)
                    image_dest = "%s%s%s%simages%s" % (company, os.sep, tcga,
                                                       os.sep, os.sep)

                    for i, drug_id in enumerate(drug_ids):
                        # copy the HTML
                        filename = "%s.html" % drug_id
                        cmd = "cp %s%s %s" % (html_source, filename, html_dest)
                        shellcmd(cmd, verbose=False)
                        # copy the images (the wildcard is expanded by
                        # the shell)
                        filename = "volcano_%s.*" % drug_id
                        cmd = "cp %s%s %s" % (image_source, filename,
                                              image_dest)
                        shellcmd(cmd, verbose=False)
                        pb.animate(i + 1)
Пример #16
0
 def execute(self, cmd):
     """Log *cmd* at INFO level, run it with ``shellcmd`` and return the result.

     :param cmd: the shell command line, as a string.
     :return: whatever ``shellcmd`` returns for that command.
     """
     message = "CMD> " + cmd
     logger.info(message)
     return shellcmd(cmd, verbose=False)
Пример #17
0
    def create_data_packages_for_companies(self, companies=None):
        """Build one report package per company.

        For every company and every genomic-feature file listed in
        ``self.gf_filenames``, the previously computed ANOVA results are
        restricted to the drugs that are either web-released
        (``WEBRELEASE=='Y'``) or owned by the company, and the HTML reports
        are generated in ``<company>/<tcga>``. When ``self.debug`` is
        False, the feature and association pages are created too and the
        per-drug HTML pages and volcano images are copied from the
        ``ALL/<tcga>`` analysis.

        :param companies: a company name, a list of names, or None to use
            ``self.companies``.
        """
        ##########################################################
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        #                                                        #
        # DRUG_DECODE and IC50 inputs must be filtered to keep   #
        # only WEBRELEASE=Y and owner                            #
        #                                                        #
        #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
        ##########################################################
        if isinstance(companies, str):
            companies = [companies]

        if companies is None:
            companies = self.companies

        Ncomp = len(companies)
        for ii, company in enumerate(companies):
            print("\n\n========= Analysing company %s out of %s (%s)" %
                    (ii+1, Ncomp, company))
            self.mkdir(company)
            for gf_filename in sorted(self.gf_filenames):
                # The TCGA label is the second "_"-separated token of the
                # file name, without its extension.
                tcga = gf_filename.split("_")[1].split('.')[0]
                print("---------------- for TCGA %s" % tcga)

                # Read the results previously computed
                try:
                    results_df = self.results[tcga].df.copy()
                except Exception:
                    # NOTE(review): this branch hands an ANOVAResults
                    # instance (not a dataframe) to ANOVAResults below --
                    # confirm that the constructor accepts it.
                    results_path = "ALL/%s/OUTPUT/results.csv" % tcga
                    print("Downloading results from %s" % results_path)
                    results_df = ANOVAResults(results_path)

                results = ANOVAResults(results_df)

                # Get a DrugDecode for that company
                drug_decode_company = self.drug_decode.df.query(
                        "WEBRELEASE=='Y' or OWNED_BY=='%s'" % company)
                # Transform into a proper DrugDecode class for safety
                drug_decode_company = DrugDecode(drug_decode_company)

                # filter results using the new drug decode
                drug_ids_in_results = get_drug_id(results.df.DRUG_ID)
                mask = [x in drug_decode_company.df.index
                        for x in drug_ids_in_results]
                # ``.ix`` was removed in pandas 1.0; a boolean mask is
                # handled identically by ``.loc``.
                results.df = results.df.loc[mask]

                # Just to create an instance with the subset of drug_decode
                # and correct settings. This is also used to store
                # the entire input data set. So, we must remove all drugs
                # not relevant for the analysis of this company
                an = ANOVA(self.ic50_filename, gf_filename, drug_decode_company)

                # ``DataFrame.select`` was removed in pandas 1.0; keep the
                # columns whose drug identifier belongs to the company
                # drug decode with a boolean column mask instead.
                keep = [get_drug_id(drug) in drug_decode_company.df.index
                        for drug in an.ic50.df.columns]
                an.ic50.df = an.ic50.df.loc[:, keep]

                an.settings = ANOVASettings(**self.settings)
                an.init()
                an.settings.directory = company + os.sep + tcga
                an.settings.analysis_type = tcga
                self.report = ANOVAReport(an, results)
                self.report.settings.analysis_type = tcga
                self.report.create_html_main(False)
                self.report.create_html_manova(False)

                if self.debug is False:
                    self.report.create_html_features()
                    self.report.create_html_associations()

                    # For now, we just copy all DRUG images from
                    # the analysis made in ALL
                    from easydev import shellcmd, Progress
                    print("\nCopying drug files")
                    drug_ids = results.df.DRUG_ID.unique()
                    pb = Progress(len(drug_ids))

                    # Source and destination folders do not depend on the
                    # drug, so they are built once.
                    html_source = "ALL%s%s%s" % (os.sep, tcga, os.sep)
                    html_dest = "%s%s%s%s" % (company, os.sep, tcga, os.sep)
                    image_source = "ALL%s%s%simages%s" % (os.sep, tcga,
                            os.sep, os.sep)
                    image_dest = "%s%s%s%simages%s" % (company, os.sep,
                            tcga, os.sep, os.sep)

                    for i, drug_id in enumerate(drug_ids):
                        # copy the HTML
                        filename = "%s.html" % drug_id
                        cmd = "cp %s%s %s" % (html_source, filename, html_dest)
                        shellcmd(cmd, verbose=False)
                        # copy the images (the wildcard is expanded by
                        # the shell)
                        filename = "volcano_%s.*" % drug_id
                        cmd = "cp %s%s %s" % (image_source, filename,
                                image_dest)
                        shellcmd(cmd, verbose=False)
                        pb.animate(i+1)