示例#1
0
def _joinpath(fpath: Union[S3Path, Path], other: str) -> Union[S3Path, Path]:
    """Join a path component onto a local or S3 base path.

    Parameters
    ----------
    fpath : Union[S3Path, Path]
        The base path, either a local ``Path`` or an ``S3Path``.
    other : str
        The path component to append.

    Returns
    -------
    Union[S3Path, Path]
        For a local ``Path``: the absolute joined path. For an ``S3Path``:
        a new ``S3Path`` with exactly one "/" between base and component.
    """
    if isinstance(fpath, Path):
        return fpath.joinpath(other).absolute()
    # S3 path: normalize so exactly one "/" separates the two parts. The
    # original if/elif chain enumerated all four trailing/leading slash
    # combinations and ended in an unreachable `raise`; stripping at most
    # one slash from each side covers the same four cases identically.
    base = fpath.to_string()
    if base.endswith("/"):
        base = base[:-1]
    if other.startswith("/"):
        other = other[1:]
    return S3Path.from_string(base + "/" + other)
示例#2
0
    def pg_restore(self, dump_file, **options):
        """Load content into the database from a dump file on s3.

        All keyword arguments are converted into flags/arguments of
        `pg_restore`; for documentation run `pg_restore --help`.

        Parameters
        ----------
        dump_file : S3Path or str
            The location on s3 of the dump to load.

        Returns
        -------
        S3Path
            The `dump_file` location, as an S3Path.

        Raises
        ------
        ValueError
            If `dump_file` is None or is neither a str nor an S3Path.
        """
        # Reject None up front: previously None slipped through validation
        # and crashed later with an AttributeError at `dump_file.to_string()`.
        if dump_file is None:
            raise ValueError("Argument `dump_file` must be appropriately "
                             "formatted string or S3Path object, not %s."
                             % type(dump_file))
        if isinstance(dump_file, str):
            dump_file = S3Path.from_string(dump_file)
        elif not isinstance(dump_file, S3Path):
            raise ValueError("Argument `dump_file` must be appropriately "
                             "formatted string or S3Path object, not %s."
                             % type(dump_file))

        from subprocess import run
        from os import environ

        # Make sure the session is fresh and any previous session is done.
        self.session.close()
        self.grab_session()

        # Add the password to the env
        my_env = environ.copy()
        my_env['PGPASSWORD'] = self.url.password

        # Pipe the database dump from s3 through this machine into the database
        logger.info("Dumping into the database.")
        # Boolean-true kwargs become bare flags; everything else --opt=val.
        option_list = [f'--{opt}' if isinstance(val, bool) and val
                       else f'--{opt}={val}' for opt, val in options.items()]
        # NOTE(review): built as a shell string (needed for the pipe); option
        # values are not quoted, so callers must pass trusted options only.
        run(' '.join(['aws', 's3', 'cp', dump_file.to_string(), '-', '|',
                      'pg_restore', *self._form_pg_args(), *option_list,
                      '--no-owner']),
            env=my_env, shell=True, check=True)
        self.session.close()
        self.grab_session()
        return dump_file
示例#3
0
def _main():
    """Entry point: parse CLI arguments and run the requested task."""
    args = _make_parser().parse_args()

    # Crank up logging verbosity when --debug is given.
    if args.debug:
        logger.setLevel(logging.DEBUG)
        from indra_db.databases import logger as db_logger
        db_logger.setLevel(logging.DEBUG)

    print("Getting %s database." % args.database)
    db = get_db(args.database)
    assert db is not None
    db.grab_session()

    preassembler = DbPreassembler(
        args.batch,
        S3Path.from_string(args.cache),
        stmt_type=args.stmt_type,
        yes_all=args.yes_all,
    )

    desc = 'Continuing' if args.continuing else 'Beginning'
    print("%s to %s preassembled corpus." % (desc, args.task))

    # Dispatch on the requested task.
    if args.task == 'create':
        preassembler.create_corpus(db, args.continuing)
    elif args.task == 'update':
        preassembler.supplement_corpus(db, args.continuing)
    else:
        raise IndraDBPreassemblyError('Unrecognized task: %s.' % args.task)
示例#4
0
    def get_s3_path(self) -> S3Path:
        """Return the saved s3 location as an S3Path object.

        Returns
        -------
        S3Path
            The parsed s3 location.

        Raises
        ------
        ValueError
            If no s3 location has been set on this object.
        """
        location = self.s3_location
        if location is None:
            raise ValueError("s3_location is not set")
        return S3Path.from_string(location)
示例#5
0
    def pg_dump(self, dump_file, **options):
        """Use the pg_dump command to dump part of the database onto s3.

        The `pg_dump` tool must be installed, and must be a compatible version
        with the database(s) being used.

        All keyword arguments are converted into flags/arguments of pg_dump. For
        documentation run `pg_dump --help`. This will also confirm you have
        `pg_dump` installed.

        By default, the "General" and "Connection" options are already set. The
        most likely specification you will want to use is `--table` or
        `--schema`, specifying either a particular table or schema to dump.

        Parameters
        ----------
        dump_file : S3Path or str
            The location on s3 where the content should be dumped.

        Returns
        -------
        S3Path
            The `dump_file` location, as an S3Path.

        Raises
        ------
        ValueError
            If `dump_file` is None or is neither a str nor an S3Path.
        """
        # Reject None up front: previously None slipped through validation
        # and crashed later with an AttributeError at `dump_file.to_string()`.
        if dump_file is None:
            raise ValueError("Argument `dump_file` must be appropriately "
                             "formatted string or S3Path object, not %s."
                             % type(dump_file))
        if isinstance(dump_file, str):
            dump_file = S3Path.from_string(dump_file)
        elif not isinstance(dump_file, S3Path):
            raise ValueError("Argument `dump_file` must be appropriately "
                             "formatted string or S3Path object, not %s."
                             % type(dump_file))

        from subprocess import check_call
        from os import environ

        # Make sure the session is fresh and any previous session are done.
        self.session.close()
        self.grab_session()

        # Add the password to the env
        my_env = environ.copy()
        my_env['PGPASSWORD'] = self.url.password

        # Dump the database onto s3, piping through this machine (errors if
        # anything went wrong).
        # Boolean-true kwargs become bare flags; everything else --opt=val.
        option_list = [f'--{opt}' if isinstance(val, bool) and val
                       else f'--{opt}={val}' for opt, val in options.items()]
        # NOTE(review): built as a shell string (needed for the pipe); option
        # values are not quoted, so callers must pass trusted options only.
        cmd = ' '.join(["pg_dump", *self._form_pg_args(), *option_list, '-Fc',
                        '|', 'aws', 's3', 'cp', '-', dump_file.to_string()])
        check_call(cmd, shell=True, env=my_env)
        return dump_file
示例#6
0
    def plot_interesting(
        self,
        outdir: str,
        z_corr: Optional[Union[str, pd.DataFrame]] = None,
        show_plot: Optional[bool] = False,
        max_proc: Optional[int] = None,
        index_counter: Optional[Union[Iterator, Generator]] = None,
        max_so_pairs_size: int = 10000,
        mp_pairs: bool = True,
        run_linear: bool = False,
        log_scale_y: bool = False,
    ):
        """Plots the same type of plot as plot_dists, but filters A, B

        A, B are filtered to those that fulfill the following:
            - No a-b or b-a explanations
            - Not explained by apriori explanations
            - Without common reactome pathways
            - With a-x-b, b-x-a or shared target explanation

        Parameters
        ----------
        outdir : str
            The output directory to save the plots in. If string starts with
            's3://' upload to s3. outdir must then have the form
            's3://<bucket>/<sub_dir>' where <bucket> must be specified and
            <sub_dir> is optional and may contain subdirectories.
        z_corr : Union[str, pd.DataFrame]
            A pd.DataFrame containing the correlation z scores used to
            create the statistics in this object. If not provided,
            an attempt will be made to load it from the file path present in
            script_settings.
        show_plot : bool
            If True also show plots
        max_proc : int > 0
            The maximum number of processes to run in the multiprocessing in
            get_corr_stats_mp. Default: multiprocessing.cpu_count()
        index_counter : Union[Iterator, Generator]
            An object which produces a new int by using 'next()' on it. The
            integers are used to separate the figures so as to not append
            new plots in the same figure.
        max_so_pairs_size : int
            The maximum number of correlation pairs to process. If the
            number of eligible pairs is larger than this number, a random
            sample of max_so_pairs_size is used. Default: 10000.
        mp_pairs : bool
            If True, get the pairs to process using multiprocessing if larger
            than 10 000. Default: True.
        run_linear : bool
            If True, gather the data without multiprocessing. This option is
            good when debugging or if the environment for some reason does
            not support multiprocessing. Default: False.
        log_scale_y : bool
            If True, plot the plots in this method with log10 scale on y-axis.
            Default: False.
        """
        # Local file or s3: an "s3://" outdir means the plot is rendered to
        # an in-memory buffer and uploaded; otherwise it is saved to disk.
        if outdir.startswith("s3://"):
            s3_path = S3Path.from_string(outdir)
            od = None
        else:
            s3_path = None
            od = Path(outdir)
            if not od.is_dir():
                od.mkdir(parents=True, exist_ok=True)

        # Get corr stats (may be served from cache; see get_corr_stats_axb)
        corr_stats: Results = self.get_corr_stats_axb(
            z_corr=z_corr,
            max_proc=max_proc,
            max_so_pairs_size=max_so_pairs_size,
            mp_pairs=mp_pairs,
            run_linear=run_linear,
        )
        # Use the caller's counter when provided; otherwise derive a figure
        # index from the current unix timestamp to avoid reusing a figure.
        fig_index = (next(index_counter) if index_counter else floor(
            datetime.timestamp(datetime.utcnow())))
        plt.figure(fig_index)
        # Blue histogram: "Filtered A-X-B for any X" (see legend below)
        plt.hist(
            corr_stats.azfb_avg_corrs,
            bins="auto",
            density=True,
            color="b",
            alpha=0.3,
            log=log_scale_y,
        )
        # Red histogram: "Filtered A-X-B for X in network"
        plt.hist(
            corr_stats.avg_x_filtered_corrs,
            bins="auto",
            density=True,
            color="r",
            alpha=0.3,
            log=log_scale_y,
        )
        legend = [
            "Filtered A-X-B for any X", "Filtered A-X-B for X in network"
        ]

        sd_str = self.get_sd_str()
        title = (f"avg X corrs, filtered {sd_str} "
                 f'({self.script_settings["graph_type"]})')
        plt.title(title)
        plt.ylabel("Norm. Density")
        plt.xlabel("mean(abs(corr(a,x)), abs(corr(x,b))) (SD)")
        plt.legend(legend)
        name = "%s_%s_axb_filtered_hist_comparison.pdf" % (
            sd_str,
            self.script_settings["graph_type"],
        )

        # Save to file or ByteIO and S3
        if od is None:
            fname = BytesIO()
        else:
            fname = od.joinpath(name).as_posix()
        plt.savefig(fname, format="pdf")
        if od is None:
            # Reset pointer so the upload reads from the start of the buffer
            fname.seek(0)
            # Upload to s3
            full_s3_path = _joinpath(s3_path, name)
            _upload_bytes_io_to_s3(bytes_io_obj=fname, s3p=full_s3_path)

        # Show plot
        if show_plot:
            plt.show()

        # Close figure to free matplotlib resources
        plt.close(fig_index)
示例#7
0
    def plot_corr_stats(
        self,
        outdir: str,
        z_corr: Optional[Union[str, pd.DataFrame]] = None,
        show_plot: bool = False,
        max_proc: Optional[int] = None,  # was annotated `bool`; docstring says int > 0
        index_counter: Optional[Union[Iterator, Generator]] = None,
        max_so_pairs_size: int = 10000,
        mp_pairs: bool = True,
        run_linear: bool = False,
        log_scale_y: bool = False,
    ):
        """Plot the results of running explainer.get_corr_stats_axb()

        Parameters
        ----------
        outdir : str
            The output directory to save the plots in. If string starts with
            's3://' upload to s3. outdir must then have the form
            's3://<bucket>/<sub_dir>' where <bucket> must be specified and
            <sub_dir> is optional and may contain subdirectories.
        z_corr : Union[str, pd.DataFrame]
            A pd.DataFrame containing the correlation z scores used to
            create the statistics in this object. If not provided,
            an attempt will be made to load it from the file path present in
            script_settings.
        show_plot : bool
            If True, also show plots after saving them. Default False.
        max_proc : int > 0
            The maximum number of processes to run in the multiprocessing in
            get_corr_stats_mp. Default: multiprocessing.cpu_count()
        index_counter : Union[Iterator, Generator]
            An object which produces a new int by using 'next()' on it. The
            integers are used to separate the figures so as to not append
            new plots in the same figure.
        max_so_pairs_size : int
            The maximum number of correlation pairs to process. If the
            number of eligible pairs is larger than this number, a random
            sample of max_so_pairs_size is used. Default: 10 000.
        mp_pairs : bool
            If True, get the pairs to process using multiprocessing if larger
            than 10 000. Default: True.
        run_linear : bool
            If True, gather the data without multiprocessing. This option is
            good when debugging or if the environment for some reason does
            not support multiprocessing. Default: False.
        log_scale_y : bool
            If True, plot the plots in this method with log10 scale on y-axis.
            Default: False.
        """
        # Local file or s3: an "s3://" outdir means plots are rendered to
        # in-memory buffers and uploaded; otherwise they are saved to disk.
        if outdir.startswith("s3://"):
            s3_path = S3Path.from_string(outdir)
            logger.info(f"Outdir path is on S3: {str(s3_path)}")
            od = None
        else:
            s3_path = None
            od = Path(outdir)
            if not od.is_dir():
                logger.info(f"Creating directory/ies for {od}")
                od.mkdir(parents=True, exist_ok=True)

        # Get corr stats (may be served from cache; see get_corr_stats_axb)
        corr_stats: Results = self.get_corr_stats_axb(
            z_corr=z_corr,
            max_proc=max_proc,
            max_so_pairs_size=max_so_pairs_size,
            mp_pairs=mp_pairs,
            run_linear=run_linear,
        )
        sd_str = self.get_sd_str()
        # One histogram figure per non-empty statistic in the Results model
        for m, (plot_type, data) in enumerate(corr_stats.dict().items()):
            if len(data) > 0:
                name = f'{plot_type}_{self.script_settings["graph_type"]}.pdf'
                logger.info(f"Using file name {name}")
                if od is None:
                    fname = BytesIO()
                else:
                    fname = od.joinpath(name).as_posix()
                # Entries may be tuples; only the last element is plotted
                if isinstance(data[0], tuple):
                    data = [t[-1] for t in data]

                # Caller-supplied counter keeps figures separate; fall back
                # to the enumeration index.
                fig_index = next(index_counter) if index_counter else m
                plt.figure(fig_index)
                plt.hist(x=data, bins="auto", log=log_scale_y)
                title = (f'{plot_type.replace("_", " ").capitalize()}; '
                         f'{sd_str} {self.script_settings["graph_type"]}')

                plt.title(title)
                plt.xlabel("combined z-score")
                plt.ylabel("count")

                # Save to file or ByteIO and S3
                plt.savefig(fname, format="pdf")
                if od is None:
                    # Reset pointer so the upload reads from the buffer start
                    fname.seek(0)
                    # Upload to s3
                    full_s3_path = _joinpath(s3_path, name)
                    _upload_bytes_io_to_s3(bytes_io_obj=fname,
                                           s3p=full_s3_path)

                # Show plot
                if show_plot:
                    plt.show()

                # Close figure to free matplotlib resources
                plt.close(fig_index)
            else:
                logger.warning(f"Empty result for {plot_type} in "
                               f"range {sd_str} for graph type "
                               f'{self.script_settings["graph_type"]}')
示例#8
0
    def get_corr_stats_axb(
        self,
        z_corr: Optional[Union[str, pd.DataFrame]] = None,
        max_proc: Optional[int] = None,
        max_so_pairs_size: int = 10000,
        mp_pairs: bool = True,
        run_linear: bool = False,
    ) -> Results:
        """Get statistics of the correlations from different explanation types

        Note: the provided options have no effect if the data is loaded
        from cache.

        Parameters
        ----------
        z_corr : Optional[Union[pd.DataFrame, str]]
            A pd.DataFrame containing the correlation z scores used to
            create the statistics in this object. If not provided, an
            attempt will be made to load it from the file path present in
            script_settings.
        max_proc : int > 0
            The maximum number of processes to run in the multiprocessing
            in get_corr_stats_mp. Default: multiprocessing.cpu_count()
        max_so_pairs_size : int
            The maximum number of correlation pairs to process. If the
            number of eligible pairs is larger than this number, a random
            sample of max_so_pairs_size is used. Default: 10 000. If the
            number of pairs to check is smaller than 10 000, no sampling is
            done.
        mp_pairs : bool
            If True, get the pairs to process using multiprocessing if larger
            than 10 000. Default: True.
        run_linear : bool
            If True, gather the data without multiprocessing. This option is
            good when debugging or if the environment for some reason does
            not support multiprocessing. Default: False.

        Returns
        -------
        Results
            A BaseModel containing correlation data for different explanations
        """
        if not self.corr_stats_axb:
            s3 = get_s3_client(unsigned=False)
            # First, try to load previously computed stats from s3.
            try:
                corr_stats_loc = self.get_s3_corr_stats_path()
                if S3Path.from_string(corr_stats_loc).exists(s3):
                    logger.info(f"Found corr stats data at {corr_stats_loc}")
                    corr_stats_json = file_opener(corr_stats_loc)
                    self.corr_stats_axb = Results(**corr_stats_json)
                else:
                    # (fixed doubled "at" in the original log message)
                    logger.info(f"No corr stats data found at "
                                f"{corr_stats_loc}")
            except ValueError as ve:
                # Raised when s3 location is not set
                logger.warning(ve)

            # If not found on s3 or ValueError was raised, compute fresh.
            if not self.corr_stats_axb:
                logger.info("Generating corr stats data")
                # Load correlation matrix
                if z_corr is None:
                    z_corr = self.load_z_corr()
                if isinstance(z_corr, str):
                    z_corr = self.load_z_corr(local_file_path=z_corr)
                # Load reactome if present
                try:
                    reactome = self.load_reactome()
                except FileNotFoundError:
                    logger.info("No reactome file used in script")
                    reactome = None
                self.corr_stats_axb: Results = axb_stats(
                    self.expl_df,
                    self.stats_df,
                    z_corr=z_corr,
                    reactome=reactome,
                    eval_str=False,
                    max_proc=max_proc,
                    max_corr_pairs=max_so_pairs_size,
                    do_mp_pairs=mp_pairs,
                    run_linear=run_linear,
                )
                # Best-effort upload of the freshly computed stats to s3; a
                # missing s3 location raises ValueError and is only warned.
                try:
                    corr_stats_loc = self.get_s3_corr_stats_path()
                    logger.info(f"Uploading corr stats to S3 at "
                                f"{corr_stats_loc}")
                    s3p_loc = S3Path.from_string(corr_stats_loc)
                    s3p_loc.put(s3=s3, body=self.corr_stats_axb.json())
                    logger.info("Finished uploading corr stats to S3")
                except ValueError:
                    logger.warning("Unable to upload corr stats to S3")
        else:
            logger.info("Data already present in corr_stats_axb")
        return self.corr_stats_axb