Example No. 1
def _joinpath(fpath: Union[S3Path, Path], other: str) -> Union[S3Path, Path]:
    if isinstance(fpath, Path):
        return fpath.joinpath(other).absolute()
    else:
        if fpath.to_string().endswith('/') and not other.startswith('/') or \
                not fpath.to_string().endswith('/') and other.startswith('/'):
            return S3Path.from_string(fpath.to_string() + other)
        elif fpath.to_string().endswith('/') and other.startswith('/'):
            return S3Path.from_string(fpath.to_string() + other[1:])
        elif not fpath.to_string().endswith('/') and not other.startswith('/'):
            return S3Path.from_string(fpath.to_string() + '/' + other)
        else:
            raise ValueError(f'Unable to join {fpath.to_string()} and '
                             f'{other} with "/"')
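
The branching above reduces to normalizing exactly one '/' between the two parts. A minimal sketch of that idea for well-formed inputs, using plain strings (the helper name is hypothetical and not part of the original module):

def join_s3_url(base: str, other: str) -> str:
    # Ensure exactly one '/' between the base url and the appended part.
    return base.rstrip('/') + '/' + other.lstrip('/')

assert join_s3_url('s3://bucket/prefix/', '/key.pkl') == 's3://bucket/prefix/key.pkl'
assert join_s3_url('s3://bucket/prefix', 'key.pkl') == 's3://bucket/prefix/key.pkl'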
Example No. 2
def get_dir_iter(path: str, file_ending: Optional[str] = None) -> List:
    """Takes a directory path and returns a list of files

    Parameters
    ----------
    path :
        The path to the directory to loop over
    file_ending :
        If provided, files in the returned list must be of this format,
        e.g. .pkl

    Returns
    -------
    :
        A list of files in the directory
    """
    if path.startswith('s3://'):
        s3 = get_s3_client(unsigned=False)
        s3_base_path = S3Path.from_string(path)
        input_iter = \
            [s3p.to_string() for s3p in s3_base_path.list_objects(s3)]
    else:
        local_base_path = Path(path)
        input_iter = [
            f.absolute().as_posix() for f in local_base_path.glob('*')
            if f.is_file()
        ]

    if file_ending:
        input_iter = [f for f in input_iter if f.endswith(file_ending)]

    return input_iter
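
For reference, a self-contained sketch of the S3 branch using plain boto3 pagination instead of S3Path.list_objects; the bucket and prefix arguments are placeholders:

import boto3

def list_s3_keys(bucket: str, prefix: str, file_ending: str = None) -> list:
    """List 's3://...' urls under a prefix, optionally filtered by file ending."""
    s3 = boto3.client('s3')
    keys = []
    for page in s3.get_paginator('list_objects_v2').paginate(Bucket=bucket,
                                                             Prefix=prefix):
        for obj in page.get('Contents', []):
            if file_ending is None or obj['Key'].endswith(file_ending):
                keys.append(f"s3://{bucket}/{obj['Key']}")
    return keys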
Example No. 3
def load_db_content(ns_list, pkl_filename=None, ro=None, reload=False):
    if isinstance(pkl_filename, str) and pkl_filename.startswith('s3:'):
        pkl_filename = S3Path.from_string(pkl_filename)
    # Get the raw data
    if reload or not pkl_filename:
        if not ro:
            ro = get_ro('primary')
        logger.info("Querying the database for statement metadata...")
        results = []
        for ns in ns_list:
            logger.info("Querying for {ns}".format(ns=ns))
            res = ro.select_all([
                ro.PaMeta.mk_hash, ro.PaMeta.db_name, ro.PaMeta.db_id,
                ro.PaMeta.ag_num, ro.PaMeta.ev_count, ro.PaMeta.type_num
            ], ro.PaMeta.db_name.like(ns))
            results.extend(res)
        results = {(h, dbn, dbi, ag_num, ev_cnt, ro_type_map.get_str(tn))
                   for h, dbn, dbi, ag_num, ev_cnt, tn in results}
        if pkl_filename:
            if isinstance(pkl_filename, S3Path):
                upload_pickle_to_s3(results, pkl_filename)
            else:
                with open(pkl_filename, 'wb') as f:
                    pickle.dump(results, f)
    # Get a cached pickle
    else:
        logger.info("Loading database content from %s" % pkl_filename)
        if isinstance(pkl_filename, S3Path):
            results = load_pickle_from_s3(pkl_filename)
        else:
            with open(pkl_filename, 'rb') as f:
                results = pickle.load(f)
    logger.info("{len} stmts loaded".format(len=len(results)))
    return results
Example No. 4
def dump_sif(df_file=None,
             db_res_file=None,
             csv_file=None,
             src_count_file=None,
             reload=False,
             reconvert=True,
             ro=None):
    if ro is None:
        ro = get_db('primary')

    # Get the db content from a new DB dump or from file
    db_content = load_db_content(reload=reload,
                                 ns_list=NS_LIST,
                                 pkl_filename=db_res_file,
                                 ro=ro)

    # Convert the database query result into a set of pairwise relationships
    df = make_dataframe(pkl_filename=df_file,
                        reconvert=reconvert,
                        db_content=db_content)

    if csv_file:
        if isinstance(csv_file, str) and csv_file.startswith('s3:'):
            csv_file = S3Path.from_string(csv_file)
        # Aggregate rows by genes and stmt type
        logger.info("Saving to CSV...")
        filt_df = df.filter(items=[
            'agA_ns', 'agA_id', 'agA_name', 'agB_ns', 'agB_id', 'agB_name',
            'stmt_type', 'evidence_count'
        ])
        type_counts = filt_df.groupby(by=[
            'agA_ns', 'agA_id', 'agA_name', 'agB_ns', 'agB_id', 'agB_name',
            'stmt_type'
        ]).sum()
        # This requires package s3fs under the hood. See:
        # https://pandas.pydata.org/pandas-docs/stable/whatsnew/v0.20.0.html#s3-file-handling
        if isinstance(csv_file, S3Path):
            try:
                type_counts.to_csv(csv_file.to_string())
            except Exception as e:
                try:
                    logger.warning('Failed to upload csv to s3 using direct '
                                   's3 url, trying boto3: %s.' % e)
                    s3 = get_s3_client(unsigned=False)
                    csv_buf = StringIO()
                    type_counts.to_csv(csv_buf)
                    s3.put_object(Body=csv_buf.getvalue(), **csv_file.kw())
                    logger.info('Uploaded CSV file to s3')
                except Exception as second_e:
                    logger.error('Failed to upload csv file with fallback '
                                 'method')
                    logger.exception(second_e)
        # save locally
        else:
            type_counts.to_csv(csv_file)

    if src_count_file:
        _ = get_source_counts(src_count_file, ro=ro)
    return
Example No. 5
    def get_s3_path(self) -> S3Path:
        """Return an S3Path object of the saved s3 location

        Returns
        -------
        S3Path
        """
        if self.s3_location is None:
            raise ValueError('s3_location is not set')
        return S3Path.from_string(self.s3_location)
Example No. 6
def is_dir(path: str):
    if path.startswith('s3://'):
        from indra_db.util.s3_path import S3Path
        from .aws import get_s3_client
        s3 = get_s3_client(False)
        s3dp = S3Path.from_string(path)
        if not s3dp.exists(s3):
            raise ValueError(f'Path {path} does not seem to exist')
    else:
        dp = Path(path)
        if not dp.is_dir():
            raise ValueError(f'Path {path} does not exist')
    return path
Example No. 7
def s3_file_opener(s3_url: str, unsigned: bool = False, **kwargs) -> \
        Union[object, pd.DataFrame, Dict]:
    """Open a file from s3 given a standard s3-path

    kwargs are only relevant for csv/tsv files and are used for pd.read_csv()

    Parameters
    ----------
    s3_url : str
        S3 url of the format 's3://<bucket>/<key>'. The key is assumed to
        also contain a file ending
    unsigned : bool
        If True, perform S3 calls unsigned. Default: False

    Returns
    -------
    Union[object, pd.DataFrame, Dict]
        Object stored on S3
    """
    from indra_db.util.s3_path import S3Path
    from .aws import load_pickle_from_s3, read_json_from_s3, get_s3_client
    logger.info(f'Loading {s3_url} from s3')
    s3_path = S3Path.from_string(s3_url)
    s3 = get_s3_client(unsigned=unsigned)
    bucket, key = s3_path.bucket, s3_path.key
    if key.endswith('.json'):
        return read_json_from_s3(s3=s3, key=key, bucket=bucket)
    elif key.endswith('.pkl'):
        return load_pickle_from_s3(s3=s3, key=key, bucket=bucket)
    elif key.endswith(('.csv', '.tsv')):
        fileio = s3_path.get(s3=s3)
        csv_str = fileio['Body'].read().decode('utf-8')
        raw_file = StringIO(csv_str)
        return pd.read_csv(raw_file, **kwargs)
    else:
        logger.warning(f'File type {key.split(".")[-1]} not recognized, '
                       f'returning S3 file stream handler (access from '
                       f'`res["Body"].read()`)')
        return s3_path.get(s3=s3)
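
A minimal, self-contained sketch of the csv/tsv branch above with plain boto3 and pandas; bucket and key are placeholders:

from io import StringIO

import boto3
import pandas as pd

def read_csv_from_s3(bucket: str, key: str, **kwargs) -> pd.DataFrame:
    s3 = boto3.client('s3')
    # Stream the object body and hand it to pandas as an in-memory buffer
    body = s3.get_object(Bucket=bucket, Key=key)['Body'].read().decode('utf-8')
    return pd.read_csv(StringIO(body), **kwargs)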
Example No. 8
def check_path(fpath: str):
    if fpath.startswith('s3://'):
        if file_ending and not fpath.endswith(file_ending):
            raise ValueError(f'Unrecognized file type '
                             f'{fpath.split("/")[-1]}')
        from indra_db.util.s3_path import S3Path
        from .aws import get_s3_client
        if not S3Path.from_string(fpath).exists(s3=get_s3_client(False)):
            raise ValueError(f'File {fpath} does not exist')
        return fpath
    p = Path(fpath)
    if not p.is_file():
        raise ValueError(f'File {fpath} does not exist')
    if file_ending and not p.name.endswith(file_ending):
        raise ValueError(f'Unrecognized file type {p.name.split(".")[-1]}')
    return fpath
Example No. 9
def _load_file(path):
    if isinstance(path, str) and path.startswith('s3:') or \
            isinstance(path, S3Path):
        if isinstance(path, str):
            s3path = S3Path.from_string(path)
        else:
            s3path = path
        if s3path.to_string().endswith('pkl'):
            return load_pickle_from_s3(s3path)
        elif s3path.to_string().endswith('json'):
            return load_json_from_s3(s3path)
        else:
            raise ValueError(f'Unknown file format of {path}')
    else:
        if path.endswith('pkl'):
            with open(path, 'rb') as f:
                return pickle.load(f)
        elif path.endswith('json'):
            with open(path, 'r') as f:
                return json.load(f)
Example No. 10
def get_source_counts(pkl_filename=None, ro=None):
    """Returns a dict of dicts with evidence count per source, per statement

    The dictionary is at the top level keyed by statement hash and each
    entry contains a dictionary keyed by the sources that support the
    statement, where the entries are the evidence counts per source."""
    logger.info('Getting source counts per statement')
    if isinstance(pkl_filename, str) and pkl_filename.startswith('s3:'):
        pkl_filename = S3Path.from_string(pkl_filename)
    if not ro:
        ro = get_ro('primary-ro')
    ev = {h: j for h, j in ro.select_all([ro.SourceMeta.mk_hash,
                                          ro.SourceMeta.src_json])}

    if pkl_filename:
        if isinstance(pkl_filename, S3Path):
            upload_pickle_to_s3(obj=ev, s3_path=pkl_filename)
        else:
            with open(pkl_filename, 'wb') as f:
                pickle.dump(ev, f)
    return ev
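
A sketch of what a pickle upload helper along the lines of upload_pickle_to_s3 could look like with plain boto3 (the real helper lives in the project's aws utilities; this version is only illustrative):

import pickle

import boto3

def upload_pickle(obj, bucket: str, key: str) -> None:
    # Serialize in memory and write the bytes directly to the s3 key
    boto3.client('s3').put_object(Bucket=bucket, Key=key,
                                  Body=pickle.dumps(obj))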
Example No. 11
def load_db_content(ns_list, pkl_filename=None, ro=None, reload=False):
    """Get preassembled stmt metadata from the DB for export.

    Queries the NameMeta, TextMeta, and OtherMeta tables as needed to get
    agent/stmt metadata for agents from the given namespaces.

    Parameters
    ----------
    ns_list : list of str
        List of agent namespaces to include in the metadata query.
    pkl_filename : str
        Name of pickle file to save to (if reloading) or load from (if not
        reloading). If an S3 path is given (i.e., pkl_filename starts with
        `s3:`), the file is saved to/loaded from S3. If not given,
        automatically reloads the content (overriding reload).
    ro : ReadonlyDatabaseManager
        Readonly database to load the content from. If not given, calls
        `get_ro('primary')` to get the primary readonly DB.
    reload : bool
        Whether to re-query the database for content or to load the content
        from `pkl_filename`. Note that even if `reload` is False,
        if no `pkl_filename` is given, data will be reloaded anyway.

    Returns
    -------
    set of tuples
        Set of tuples containing statement information organized
        by agent. Tuples contain (stmt_hash, agent_ns, agent_id, agent_num,
        evidence_count, stmt_type).
    """
    if isinstance(pkl_filename, str) and pkl_filename.startswith('s3:'):
        pkl_filename = S3Path.from_string(pkl_filename)
    # Get the raw data
    if reload or not pkl_filename:
        if not ro:
            ro = get_ro('primary')
        logger.info("Querying the database for statement metadata...")
        results = {}
        for ns in ns_list:
            logger.info("Querying for {ns}".format(ns=ns))
            filters = []
            if ns == 'NAME':
                tbl = ro.NameMeta
            elif ns == 'TEXT':
                tbl = ro.TextMeta
            else:
                tbl = ro.OtherMeta
                filters.append(tbl.db_name.like(ns))
            filters.append(tbl.is_complex_dup == False)
            res = ro.select_all([tbl.mk_hash, tbl.db_id, tbl.ag_num,
                                 tbl.ev_count, tbl.type_num], *filters)
            results[ns] = res
        results = {(h, dbn, dbi, ag_num, ev_cnt, ro_type_map.get_str(tn))
                   for dbn, value_list in results.items()
                   for h, dbi, ag_num, ev_cnt, tn in value_list}
        if pkl_filename:
            if isinstance(pkl_filename, S3Path):
                upload_pickle_to_s3(results, pkl_filename)
            else:
                with open(pkl_filename, 'wb') as f:
                    pickle.dump(results, f)
    # Get a cached pickle
    else:
        logger.info("Loading database content from %s" % pkl_filename)
        if isinstance(pkl_filename, S3Path):
            results = load_pickle_from_s3(pkl_filename)
        else:
            with open(pkl_filename, 'rb') as f:
                results = pickle.load(f)
    logger.info("{len} stmts loaded".format(len=len(results)))
    return results
Example No. 12
def dump_sif(src_count_file, res_pos_file, belief_file, df_file=None,
             db_res_file=None, csv_file=None, reload=True, reconvert=True,
             ro=None, normalize_names: bool = True):
    """Build and dump a sif dataframe of PA statements with grounded agents

    Parameters
    ----------
    src_count_file : Union[str, S3Path]
        A location to load the source count dict from. Can be local file
        path, an s3 url string or an S3Path instance.
    res_pos_file : Union[str, S3Path]
        A location to load the residue-position dict from. Can be local file
        path, an s3 url string or an S3Path instance.
    belief_file : Union[str, S3Path]
        A location to load the belief dict from. Can be local file path,
        an s3 url string or an S3Path instance.
    df_file : Optional[Union[str, S3Path]]
        If provided, dump the sif to this location. Can be local file path,
        an s3 url string or an S3Path instance.
    db_res_file : Optional[Union[str, S3Path]]
        If provided, save the db content to this location. Can be local file
        path, an s3 url string or an S3Path instance.
    csv_file : Optional[Union[str, S3Path]]
        If provided, calculate dataframe statistics and save to local file
        or s3. Can be local file path, an s3 url string or an S3Path instance.
    reconvert : bool
        Whether to generate a new DataFrame from the database content or
        to load and return a DataFrame from `df_file`. If False, `df_file`
        must be given. Default: True.
    reload : bool
        If True, load new content from the database and make a new
        dataframe. If False, content can be loaded from provided files.
        Default: True.
    ro : Optional[PrincipalDatabaseManager]
        Provide a DatabaseManager to load database content from. If not
        provided, `get_db('primary')` will be used.
    normalize_names : bool
        If True, detect and try to merge name duplicates (same entity with
        different names, e.g. Loratadin vs loratadin). Default: True
    """
    def _load_file(path):
        if isinstance(path, str) and path.startswith('s3:') or \
                isinstance(path, S3Path):
            if isinstance(path, str):
                s3path = S3Path.from_string(path)
            else:
                s3path = path
            if s3path.to_string().endswith('pkl'):
                return load_pickle_from_s3(s3path)
            elif s3path.to_string().endswith('json'):
                return load_json_from_s3(s3path)
            else:
                raise ValueError(f'Unknown file format of {path}')
        else:
            if path.endswith('pkl'):
                with open(path, 'rb') as f:
                    return pickle.load(f)
            elif path.endswith('json'):
                with open(path, 'r') as f:
                    return json.load(f)

    if ro is None:
        ro = get_db('primary')

    # Get the db content from a new DB dump or from file
    db_content = load_db_content(reload=reload, ns_list=NS_LIST,
                                 pkl_filename=db_res_file, ro=ro)

    # Load supporting files
    res_pos = _load_file(res_pos_file)
    src_count = _load_file(src_count_file)
    belief = _load_file(belief_file)

    # Convert the database query result into a set of pairwise relationships
    df = make_dataframe(pkl_filename=df_file, reconvert=reconvert,
                        db_content=db_content, src_count_dict=src_count,
                        res_pos_dict=res_pos, belief_dict=belief,
                        normalize_names=normalize_names)

    if csv_file:
        if isinstance(csv_file, str) and csv_file.startswith('s3:'):
            csv_file = S3Path.from_string(csv_file)
        # Aggregate rows by genes and stmt type
        logger.info("Saving to CSV...")
        filt_df = df.filter(items=['agA_ns', 'agA_id', 'agA_name',
                                   'agB_ns', 'agB_id', 'agB_name',
                                   'stmt_type', 'evidence_count'])
        type_counts = filt_df.groupby(by=['agA_ns', 'agA_id', 'agA_name',
                                          'agB_ns', 'agB_id', 'agB_name',
                                          'stmt_type']).sum()
        # This requires package s3fs under the hood. See:
        # https://pandas.pydata.org/pandas-docs/stable/whatsnew/v0.20.0.html#s3-file-handling
        if isinstance(csv_file, S3Path):
            try:
                type_counts.to_csv(csv_file.to_string())
            except Exception as e:
                try:
                    logger.warning('Failed to upload csv to s3 using direct '
                                   's3 url, trying boto3: %s.' % e)
                    s3 = get_s3_client(unsigned=False)
                    csv_buf = StringIO()
                    type_counts.to_csv(csv_buf)
                    csv_file.upload(s3, csv_buf)
                    logger.info('Uploaded CSV file to s3')
                except Exception as second_e:
                    logger.error('Failed to upload csv file with fallback '
                                 'method')
                    logger.exception(second_e)
        # save locally
        else:
            type_counts.to_csv(csv_file)
    return
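
The CSV upload above follows a "direct s3 url first, boto3 fallback" pattern. A compact sketch of the same idea, assuming s3fs may or may not be installed; bucket and key are placeholders:

from io import StringIO

import boto3
import pandas as pd

def df_to_s3_csv(df: pd.DataFrame, bucket: str, key: str) -> None:
    try:
        # pandas can write straight to an s3 url if s3fs is installed
        df.to_csv(f's3://{bucket}/{key}')
    except Exception:
        # Fall back to serializing in memory and uploading with boto3
        buf = StringIO()
        df.to_csv(buf)
        boto3.client('s3').put_object(Bucket=bucket, Key=key,
                                      Body=buf.getvalue())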
Example No. 13
def make_dataframe(reconvert, db_content, res_pos_dict, src_count_dict,
                   belief_dict, pkl_filename=None,
                   normalize_names: bool = False):
    """Make a pickled DataFrame of the db content, one row per stmt.

    Parameters
    ----------
    reconvert : bool
        Whether to generate a new DataFrame from the database content or
        to load and return a DataFrame from the given pickle file. If False,
        `pkl_filename` must be given.
    db_content : set of tuples
        Set of tuples of agent/stmt data as returned by `load_db_content`.
    res_pos_dict : Dict[str, Dict[str, str]]
        Dict with the keys 'residue' and 'position', each containing a dict
        keyed by statement hash.
    src_count_dict : Dict[str, Dict[str, int]]
        Dict of dicts containing source counts per source api keyed by hash.
    belief_dict : Dict[str, float]
        Dict of belief scores keyed by hash.
    pkl_filename : str
        Name of pickle file to save to (if reconverting) or load from (if not
        reconverting). If an S3 path is given (i.e., pkl_filename starts with
        `s3:`), the file is saved to/loaded from S3. If not given, the
        resulting DataFrame is not saved; `reconvert` must then be True.
    normalize_names :
        If True, detect and try to merge name duplicates (same entity with
        different names, e.g. Loratadin vs loratadin). Default: False

    Returns
    -------
    pandas.DataFrame
        DataFrame containing the content, with columns: 'agA_ns', 'agA_id',
        'agA_name', 'agB_ns', 'agB_id', 'agB_name', 'stmt_type',
        'evidence_count', 'stmt_hash'.
    """
    if isinstance(pkl_filename, str) and pkl_filename.startswith('s3:'):
        pkl_filename = S3Path.from_string(pkl_filename)
    if reconvert:
        # Content consists of tuples organized by agent, e.g.
        # (-11421523615931377, 'UP', 'P04792', 1, 1, 'Phosphorylation')
        #
        # First we need to organize by statement, collecting all agents
        # for each statement along with evidence count and type.
        # We also separately store the NAME attribute for each statement
        # agent (indexing by hash/agent_num).
        logger.info("Organizing by statement...")
        stmt_info = {} # Store statement info (agents, ev, type) by hash
        ag_name_by_hash_num = {} # Store name for each stmt agent
        for h, db_nm, db_id, num, n, t in tqdm(db_content):
            db_nm, db_id = fix_id(db_nm, db_id)
            # Populate the 'NAME' dictionary per agent
            if db_nm == 'NAME':
                ag_name_by_hash_num[(h, num)] = db_id
            if h not in stmt_info.keys():
                stmt_info[h] = {'agents': [], 'ev_count': n, 'type': t}
            stmt_info[h]['agents'].append((num, db_nm, db_id))
        # Turn into dataframe with geneA, geneB, type, indexed by hash;
        # expand out complexes to multiple rows

        # Organize by pairs of genes, counting evidence.
        nkey_errors = 0
        error_keys = []
        rows = []
        logger.info("Converting to pairwise entries...")
        # Iterate over each statement
        for hash, info_dict in tqdm(stmt_info.items()):
            # Get the priority grounding for the agents in each position
            agents_by_num = {}
            for num, db_nm, db_id in info_dict['agents']:
                # Agent name is handled separately so we skip it here
                if db_nm == 'NAME':
                    continue
                # For other namespaces, we get the top-priority namespace
                # given all namespaces for the agent
                else:
                    assert db_nm in NS_PRIORITY_LIST
                    db_rank = NS_PRIORITY_LIST.index(db_nm)
                    # If we don't already have an agent for this num, use the
                    # one we've found
                    if num not in agents_by_num:
                        agents_by_num[num] = (num, db_nm, db_id, db_rank)
                    # Otherwise, take the current agent if the identifier type
                    # has a higher rank
                    else:
                        cur_rank = agents_by_num[num][3]
                        if db_rank < cur_rank:
                            agents_by_num[num] = (num, db_nm, db_id, db_rank)
            # Make ordered list of agents for this statement, picking up
            # the agent name from the ag_name_by_hash_num dict that we
            # built earlier
            agents = []
            for num, db_nm, db_id, _ in sorted(agents_by_num.values()):
                # Try to get the agent name
                ag_name = ag_name_by_hash_num.get((hash, num), None)
                # If the name is not found, log it but allow the agent
                # to be included as None
                if ag_name is None:
                    nkey_errors += 1
                    error_keys.append((hash, num))
                    if nkey_errors < 11:
                        logger.warning('Missing key in agent name dict: '
                                       '(%s, %s)' % (hash, num))
                    elif nkey_errors == 11:
                        logger.warning('Got more than 10 key warnings: '
                                       'muting further warnings.')
                agents.append((db_nm, db_id, ag_name))

            # Need at least two agents.
            if len(agents) < 2:
                continue

            # If this is a complex, or there are more than two agents, permute!
            if info_dict['type'] == 'Complex':
                # Skip complexes with 4 or more members
                if len(agents) > 3:
                    continue
                pairs = permutations(agents, 2)
            else:
                pairs = [agents]

            # Add all the pairs, and count up total evidence.
            for pair in pairs:
                row = OrderedDict([
                    ('agA_ns', pair[0][0]),
                    ('agA_id', pair[0][1]),
                    ('agA_name', pair[0][2]),
                    ('agB_ns', pair[1][0]),
                    ('agB_id', pair[1][1]),
                    ('agB_name', pair[1][2]),
                    ('stmt_type', info_dict['type']),
                    ('evidence_count', info_dict['ev_count']),
                    ('stmt_hash', hash),
                    ('residue', res_pos_dict['residue'].get(hash)),
                    ('position', res_pos_dict['position'].get(hash)),
                    ('source_counts', src_count_dict.get(hash)),
                    ('belief', belief_dict.get(str(hash)))
                ])
                rows.append(row)
        if nkey_errors:
            ef = 'key_errors.csv'
            logger.warning('%d KeyErrors. Offending keys found in %s' %
                           (nkey_errors, ef))
            with open(ef, 'w') as f:
                f.write('hash,PaMeta.ag_num\n')
                for kn in error_keys:
                    f.write('%s,%s\n' % kn)
        df = pd.DataFrame.from_dict(rows)

        if pkl_filename:
            if isinstance(pkl_filename, S3Path):
                upload_pickle_to_s3(obj=df, s3_path=pkl_filename)
            else:
                with open(pkl_filename, 'wb') as f:
                    pickle.dump(df, f)
    else:
        if not pkl_filename:
            logger.error('Have to provide pickle file if not reconverting')
            raise FileExistsError
        else:
            if isinstance(pkl_filename, S3Path):
                df = load_pickle_from_s3(pkl_filename)
            else:
                with open(pkl_filename, 'rb') as f:
                    df = pickle.load(f)
    if normalize_names:
        normalize_sif_names(sif_df=df)
    return df
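
The per-agent grounding selection above keeps the namespace with the best rank in NS_PRIORITY_LIST. A small standalone illustration of that selection; the priority list below is a placeholder, not the project's actual NS_PRIORITY_LIST:

NS_PRIORITY = ['FPLX', 'HGNC', 'UP', 'CHEBI', 'GO', 'MESH']  # placeholder order

def pick_top_grounding(groundings):
    """Return the (namespace, identifier) pair with the highest priority."""
    ranked = [g for g in groundings if g[0] in NS_PRIORITY]
    return min(ranked, key=lambda g: NS_PRIORITY.index(g[0])) if ranked else None

# With the placeholder order, HGNC outranks UP:
# pick_top_grounding([('UP', 'P04792'), ('HGNC', '5246')]) -> ('HGNC', '5246')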
Example No. 14
        '--single-proc', action='store_true',
        help='Run all scripts on a single process. This option is good when '
             'debugging or if the environment for some reason does not '
             'support multiprocessing. Default: False.'
    )

    args = parser.parse_args()
    base_path: str = args.base_path
    outdir: str = args.outdir
    single_proc: bool = args.single_proc

    s3 = boto3.client('s3')

    # Set input dir
    if base_path.startswith('s3://'):
        s3_base_path = S3Path.from_string(base_path)
        input_iter = \
            [s3p.to_string() for s3p in s3_base_path.list_objects(s3)
             if s3p.to_string().endswith('.pkl')]
    else:
        local_base_path = Path(base_path)
        input_iter = [f.absolute().as_posix()
                      for f in local_base_path.glob('*.pkl')]

    # Set output dir
    if outdir.startswith('s3://'):
        output_dir = S3Path.from_string(outdir)
    else:
        output_dir = Path(outdir)

    dry = args.dry
Example No. 15
def _exists(fpath: str) -> bool:
    if fpath.startswith('s3://'):
        return S3Path.from_string(fpath).exists(s3)
    else:
        return Path(fpath).is_file()
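
For comparison, an existence check with plain boto3, analogous to S3Path.exists above; bucket and key are placeholders:

import boto3
from botocore.exceptions import ClientError

def s3_key_exists(bucket: str, key: str) -> bool:
    try:
        boto3.client('s3').head_object(Bucket=bucket, Key=key)
        return True
    except ClientError:
        # head_object raises ClientError (404) for missing keys
        return False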
Example No. 16
    def get_corr_stats_axb(self,
                           z_corr: Optional[Union[str, pd.DataFrame]] = None,
                           max_proc: Optional[int] = None,
                           max_so_pairs_size: int = 10000,
                           mp_pairs: bool = True,
                           run_linear: bool = False) -> Results:
        """Get statistics of the correlations from different explanation types

        Note: the provided options have no effect if the data is loaded
        from cache.

        Parameters
        ----------
        z_corr : Optional[Union[pd.DataFrame, str]]
            A pd.DataFrame containing the correlation z scores used to
            create the statistics in this object. If not provided,
            an attempt will be made to load it from the file path present in
            script_settings.
        max_proc : int > 0
            The maximum number of processes to run in the multiprocessing
            in get_corr_stats_mp. Default: multiprocessing.cpu_count()
        max_so_pairs_size : int
            The maximum number of correlation pairs to process. If the
            number of eligible pairs is larger than this number, a random
            sample of max_so_pairs_size is used. Default: 10 000. If the
            number of pairs to check is smaller than 10 000, no sampling is
            done.
        mp_pairs : bool
            If True, get the pairs to process using multiprocessing if larger
            than 10 000. Default: True.
        run_linear : bool
            If True, gather the data without multiprocessing. This option is
            good when debugging or if the environment for some reason does
            not support multiprocessing. Default: False.

        Returns
        -------
        Results
            A BaseModel containing correlation data for different explanations
        """
        if not self.corr_stats_axb:
            s3 = get_s3_client(unsigned=False)
            try:
                corr_stats_loc = self.get_s3_corr_stats_path()
                if S3Path.from_string(corr_stats_loc).exists(s3):
                    logger.info(f'Found corr stats data at {corr_stats_loc}')
                    corr_stats_json = file_opener(corr_stats_loc)
                    self.corr_stats_axb = Results(**corr_stats_json)
                else:
                    logger.info(f'No corr stats data at found at '
                                f'{corr_stats_loc}')
            except ValueError as ve:
                # Raised when s3 location is not set
                logger.warning(ve)

            # If not found on s3 or ValueError was raised
            if not self.corr_stats_axb:
                logger.info('Generating corr stats data')
                # Load correlation matrix
                if z_corr is None:
                    z_corr = self.load_z_corr()
                if isinstance(z_corr, str):
                    z_corr = self.load_z_corr(local_file_path=z_corr)
                # Load reactome if present
                try:
                    reactome = self.load_reactome()
                except FileNotFoundError:
                    logger.info('No reactome file used in script')
                    reactome = None
                self.corr_stats_axb: Results = axb_stats(
                    self.expl_df,
                    self.stats_df,
                    z_corr=z_corr,
                    reactome=reactome,
                    eval_str=False,
                    max_proc=max_proc,
                    max_corr_pairs=max_so_pairs_size,
                    do_mp_pairs=mp_pairs,
                    run_linear=run_linear)
                try:
                    corr_stats_loc = self.get_s3_corr_stats_path()
                    logger.info(f'Uploading corr stats to S3 at '
                                f'{corr_stats_loc}')
                    s3p_loc = S3Path.from_string(corr_stats_loc)
                    s3p_loc.put(s3=s3, body=self.corr_stats_axb.json())
                    logger.info('Finished uploading corr stats to S3')
                except ValueError:
                    logger.warning('Unable to upload corr stats to S3')
        else:
            logger.info('Data already present in corr_stats_axb')
        return self.corr_stats_axb
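
The method above follows a "look for a cached result on s3, otherwise compute and upload" pattern. A generic sketch of that pattern with plain boto3 and JSON; compute_fn, bucket and key are placeholders:

import json

import boto3
from botocore.exceptions import ClientError

def cached_json(bucket: str, key: str, compute_fn):
    s3 = boto3.client('s3')
    try:
        # Return the cached result if the key exists
        body = s3.get_object(Bucket=bucket, Key=key)['Body'].read()
        return json.loads(body)
    except ClientError:
        # Otherwise compute, upload for next time, and return
        result = compute_fn()
        s3.put_object(Bucket=bucket, Key=key, Body=json.dumps(result))
        return result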
Example No. 17
def main(indra_net: str,
         z_score: str,
         outname: str,
         graph_type: str,
         sd_range: Tuple[float, Union[None, float]],
         random: bool = False,
         raw_data: Optional[List[str]] = None,
         raw_corr: Optional[List[str]] = None,
         expl_funcs: Optional[List[str]] = None,
         pb_node_mapping: Optional[Dict[str, Set]] = None,
         n_chunks: Optional[int] = 256,
         is_a_part_of: Optional[List[str]] = None,
         immediate_only: Optional[bool] = False,
         return_unexplained: Optional[bool] = False,
         reactome_path: Optional[str] = None,
         subset_list: Optional[List[Union[str, int]]] = None,
         apriori_explained: Optional[Union[bool, str]] = False,
         allowed_ns: Optional[List[str]] = None,
         allowed_sources: Optional[List[str]] = None,
         info: Optional[Dict[Hashable, Any]] = None,
         indra_date: Optional[str] = None,
         depmap_date: Optional[str] = None,
         sample_size: Optional[int] = None,
         shuffle: Optional[bool] = False,
         overwrite: Optional[bool] = False,
         normalize_names: Optional[bool] = False,
         argparse_dict: Optional[Dict[str, Union[str, float, int,
                                                 List[str]]]] = None):
    """Set up correlation matching of depmap data with an indranet graph

    Parameters
    ----------
    indra_net : str
        Path to the graph representation of the indra network (loaded
        with `file_opener`). Each edge of the loaded graph should
        have an attribute named 'statements' containing a list of sources
        supporting that edge. If signed search, indranet is expected to be an
        nx.MultiDiGraph with edges keyed by (gene, gene, sign) tuples.
    outname : str
        A file path (can be an S3 url) to where to store the final pickle
        file containing the DepmapExplainer
    graph_type : str
        The graph type of the graph used for the explanations. Can be one of
        'unsigned', 'signed', 'pybel'.
    sd_range : Tuple[float, Union[float, None]]
        A tuple of the lower and optionally the upper bound of the z-score
        range to use when getting correlations
    random : bool
        Whether to do a random sampling or not. If True do a random sample
        instead of cutting the correlations off at the given SD range.
    z_score : str
        The path to the correlation DataFrame. If either raw data or raw
        corr are used, this filepath will be used to save the resulting
        DataFrame instead.
    raw_data : Optional[List[str]]
        File paths to CRISPR raw data and RNAi raw data from the DepMap Portal
    raw_corr : Optional[List[str]]
        File paths to raw correlation data (before z-score conversion)
        containing hdf compressed correlation data. These files contain the
        result of running `raw_df.corr()`.
    expl_funcs : Optional[List[str]]
        Provide a list of explanation functions to apply. Default: All
        functions are applied. Currently available functions:
        - 'expl_ab': Explain pair by checking for an edge between a and b
        - 'expl_ba': Explain pair by checking for an edge between b and a
        - 'expl_axb': Explain pair by looking for intermediate nodes
          connecting a to b
        - 'expl_bxa': Explain pair by looking for intermediate nodes
          connecting b to a
        - 'get_sr': Explain pair by finding common upstream nodes
        - 'get_st': Explain pair by finding common downstream nodes
        - 'get_sd': Explain pair by finding common downstream nodes two
          edges from s and o
        - 'find_cp': Explain pair by looking for ontological parents
        - 'apriori_explained': Map entities to a-priori explanations
        - 'common_reactome_paths': Explain pair by matching common reactome
          pathways
    pb_node_mapping : Optional[Union[Dict, Set[Any]]]
        If graph type is "pybel", use this argument to provide a mapping
        from HGNC symbols to pybel nodes in the pybel model
    n_chunks : Optional[int]
        How many chunks to split the data into in the multiprocessing part
        of the script
    is_a_part_of : Optional[Iterable]
        A set of identifiers to look for when applying the common parent
        explanation between a pair of correlating nodes.
    immediate_only : Optional[bool]
        Only look for immediate parents. This option might limit the number
        of results that are returned. Default: False.
    return_unexplained : Optional[bool]
        If True: return explanation data even if there is no set
        intersection of nodes up- or downstream of A, B for shared
        regulators and shared targets. Default: False.
    reactome_path : Optional[str]
        File path to reactome data.
    subset_list : Optional[List[Union[str, int]]]
        Provide a list of entities that defines a subset of the entities in
        the correlation data frame that will be picked as 'a' when the pairs
        (a, b) are generated
    apriori_explained : Optional[Union[bool, str]]
        A mapping from entity names to a string containing a short
        explanation of why the entity is explained. To use the default
        MitoCarta 3.0 file, run the following code:
        >>> from depmap_analysis.scripts.depmap_script2 import mito_file
        >>> from depmap_analysis.preprocessing import get_mitocarta_info
        >>> apriori_mapping = get_mitocarta_info(mito_file)
        then pass `apriori_mapping` as `apriori_explained` when calling this
        function:
        >>> main(apriori_explained=apriori_mapping, ...)
    allowed_ns : Optional[List[str]]
        A list of allowed name spaces for explanations involving
        intermediary nodes. Default: Any namespace.
    allowed_sources : Optional[List[str]]
        The allowed sources for edges. This will not affect subsequent edges
        in explanations involving 2 or more edges. Default: all sources are
        allowed.
    info : Optional[Dict[Hashable, Any]]
        An optional dict in which to save meta data about this run
    indra_date : Optional[str]
        The date of the sif dump used to create the graph
    depmap_date : Optional[str]
        The date (usually a quarter e.g. 19Q4) the depmap data was published
        on depmap.org
    sample_size : Optional[int]
        Number of correlation pairs to approximately get out of the
        correlation matrix after down sampling it
    shuffle : Optional[bool]
        If True, shuffle the correlation matrix. This is good to do in case
        the input data have some sort of structure that could lead to large
        discrepancies between compute times for the different processes.
        Default: False.
    overwrite : Optional[bool]
        If True, overwrite any output files. Default: False.
    normalize_names : Optional[bool]
        If True, try to normalize the names in the correlation matrix that
        are not found in the provided graph. Default: False.
    argparse_dict : Optional[Dict[str, Union[str, float, int, List[str]]]]
        Provide the argparse options from running this file as a script
    """
    global indranet, hgnc_node_mapping, output_list
    indranet = file_opener(indra_net)
    assert isinstance(indranet, nx.DiGraph)

    assert expl_funcs is None or isinstance(expl_funcs, (list, tuple, set))

    # 1 Check options
    sd_l, sd_u = sd_range if sd_range and len(sd_range) == 2 else \
        ((sd_range[0], None) if sd_range and len(sd_range) == 1 else
         (None, None))

    if not random and not sd_l and not sd_u:
        raise ValueError('Must specify at least a lower bound for the SD '
                         'range or flag run for random explanation')

    if graph_type == 'pybel' and not pb_node_mapping:
        raise ValueError('Must provide PyBEL node mapping with option '
                         'pb_node_mapping if graph type is "pybel"')

    if apriori_explained:
        if apriori_explained is True or mito_file_name in apriori_explained:
            # Run default
            apriori_explained = get_mitocarta_info(mito_file)
        else:
            # Hope it's a csv/tsv
            try:
                expl_df = pd.read_csv(apriori_explained)
                apriori_explained = {
                    e: d
                    for e, d in zip(expl_df.name, expl_df.description)
                }
            except Exception as err:
                raise ValueError('A-priori explained entities must be in a '
                                 'file that can be parsed as CSV/TSV with '
                                 'column names "name" for entity name and '
                                 '"description" for explanation why the '
                                 'entity is explained.') \
                    from err

        logger.info(f'Using explained set with '
                    f'{len(apriori_explained)} entities')

    outname = outname if outname.endswith('.pkl') else \
        outname + '.pkl'
    if not overwrite:
        if outname.startswith('s3://'):
            s3 = get_s3_client(unsigned=False)
            if S3Path.from_string(outname).exists(s3):
                raise FileExistsError(f'File {str(outname)} already exists!')
        elif Path(outname).is_file():
            raise FileExistsError(f'File {str(outname)} already exists!')

    if z_score is not None and Path(z_score).is_file():
        z_corr = pd.read_hdf(z_score)
    else:
        z_sc_options = {
            'crispr_raw': raw_data[0],
            'rnai_raw': raw_data[1],
            'crispr_corr': raw_corr[0],
            'rnai_corr': raw_corr[1],
            'z_corr_path': z_score
        }
        z_corr = run_corr_merge(**z_sc_options)

    if reactome_path:
        up2path, _, pathid2pathname = file_opener(reactome_path)
        reactome_dict = {
            'uniprot_mapping': up2path,
            'pathid_name_mapping': pathid2pathname
        }
    else:
        reactome_dict = None

    # Get mapping of correlation names to pybel nodes
    if graph_type == 'pybel':
        if isinstance(pb_node_mapping, dict):
            hgnc_node_mapping = pb_node_mapping
        elif isinstance(pb_node_mapping, str) and \
                Path(pb_node_mapping).is_file():
            hgnc_node_mapping = file_opener(pb_node_mapping)
        else:
            raise ValueError('Could not load pybel node mapping')

    # 2. Filter to SD range OR run random sampling
    if random:
        logger.info('Doing random sampling through df.sample')
        z_corr = z_corr.sample(142, axis=0)
        z_corr = z_corr.filter(list(z_corr.index), axis=1)
        # Remove correlation values to not confuse with real data
        z_corr.loc[:, :] = 0
    else:
        if sd_l and sd_u:
            logger.info(f'Filtering correlations to {sd_l} - {sd_u} SD')
            z_corr = z_corr[((z_corr > sd_l) & (z_corr < sd_u)) |
                            ((z_corr < -sd_l) & (z_corr > -sd_u))]
        elif isinstance(sd_l, (int, float)) and sd_l and not sd_u:
            logger.info(f'Filtering correlations to {sd_l}+ SD')
            z_corr = z_corr[(z_corr > sd_l) | (z_corr < -sd_l)]

    sd_range = (sd_l, sd_u) if sd_u else (sd_l, None)

    # Pick a sample
    if sample_size is not None and not random:
        logger.info(f'Reducing correlation matrix to a random approximately '
                    f'{sample_size} correlation pairs.')
        z_corr = down_sample_df(z_corr, sample_size)

    # Shuffle corr matrix without removing items
    elif shuffle and not random:
        logger.info('Shuffling correlation matrix...')
        z_corr = z_corr.sample(frac=1, axis=0)
        z_corr = z_corr.filter(list(z_corr.index), axis=1)

    if normalize_names:
        logger.info('Normalizing correlation matrix column names')
        z_corr = normalize_corr_names(z_corr, indranet)
    else:
        logger.info('Leaving correlation matrix column names as is')

    # 4. Add meta data
    info_dict = {}
    if info:
        info_dict['info'] = info

    # Set the script_settings
    script_settings = {
        'raw_data': raw_data,
        'raw_corr': raw_corr,
        'z_score': z_score,
        'random': random,
        'indranet': indra_net,
        'shuffle': shuffle,
        'sample_size': sample_size,
        'n_chunks': n_chunks,
        'outname': outname,
        'apriori_explained': apriori_explained
        if isinstance(apriori_explained, str) else 'no info',
        'graph_type': graph_type,
        'pybel_node_mapping': pb_node_mapping
        if isinstance(pb_node_mapping, str) else 'no info',
        'argparse_info': argparse_dict,
    }

    # Create output list in global scope
    output_list = []
    explanations = match_correlations(corr_z=z_corr,
                                      sd_range=sd_range,
                                      script_settings=script_settings,
                                      graph_filepath=indra_net,
                                      z_corr_filepath=z_score,
                                      apriori_explained=apriori_explained,
                                      graph_type=graph_type,
                                      allowed_ns=allowed_ns,
                                      allowed_sources=allowed_sources,
                                      is_a_part_of=is_a_part_of,
                                      expl_funcs=expl_funcs,
                                      reactome_filepath=reactome_path,
                                      indra_date=indra_date,
                                      info=info_dict,
                                      depmap_date=depmap_date,
                                      n_chunks=n_chunks,
                                      immediate_only=immediate_only,
                                      return_unexplained=return_unexplained,
                                      reactome_dict=reactome_dict,
                                      subset_list=subset_list)
    if outname.startswith('s3://'):
        try:
            logger.info(f'Uploading results to s3: {outname}')
            s3 = get_s3_client(unsigned=False)
            s3outpath = S3Path.from_string(outname)
            explanations.s3_location = s3outpath.to_string()
            s3outpath.upload(s3=s3, body=pickle.dumps(explanations))
            logger.info('Finished uploading results to s3')
        except Exception:
            new_path = Path(outname.replace('s3://', ''))
            logger.warning(f'Something went wrong in s3 upload, trying to '
                           f'save locally instead to {new_path}')
            new_path.parent.mkdir(parents=True, exist_ok=True)
            dump_it_to_pickle(fname=new_path.absolute().resolve().as_posix(),
                              pyobj=explanations,
                              overwrite=overwrite)

    else:
        # mkdir in case it doesn't exist
        outpath = Path(outname)
        logger.info(f'Dumping results to {outpath}')
        outpath.parent.mkdir(parents=True, exist_ok=True)
        dump_it_to_pickle(fname=outpath.absolute().resolve().as_posix(),
                          pyobj=explanations,
                          overwrite=overwrite)
    logger.info('Script finished')
    explanations.summarize()
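
The SD-range filter in step 2 masks every correlation whose absolute z-score falls outside the requested band. A tiny self-contained illustration of that masking on random data:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
z = pd.DataFrame(rng.normal(size=(5, 5)),
                 columns=list('ABCDE'), index=list('ABCDE'))
sd_l, sd_u = 1.0, 3.0
# Entries outside (sd_l, sd_u) in absolute value become NaN, as in main()
filtered = z[((z > sd_l) & (z < sd_u)) | ((z < -sd_l) & (z > -sd_u))]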
Example No. 18
    def plot_corr_stats(self,
                        outdir: str,
                        z_corr: Optional[Union[str, pd.DataFrame]] = None,
                        show_plot: bool = False,
                        max_proc: Optional[int] = None,
                        index_counter: Optional[Union[Iterator,
                                                      Generator]] = None,
                        max_so_pairs_size: int = 10000,
                        mp_pairs: bool = True,
                        run_linear: bool = False,
                        log_scale_y: bool = False):
        """Plot the results of running explainer.get_corr_stats_axb()

        Parameters
        ----------
        outdir : str
            The output directory to save the plots in. If string starts with
            's3://' upload to s3. outdir must then have the form
            's3://<bucket>/<sub_dir>' where <bucket> must be specified and
            <sub_dir> is optional and may contain subdirectories.
        z_corr : Union[str, pd.DataFrame]
            A pd.DataFrame containing the correlation z scores used to
            create the statistics in this object. If not provided,
            an attempt will be made to load it from the file path present in
            script_settings.
        show_plot : bool
            If True, also show plots after saving them. Default False.
        max_proc : int > 0
            The maximum number of processes to run in the multiprocessing in
            get_corr_stats_mp. Default: multiprocessing.cpu_count()
        index_counter : Union[Iterator, Generator]
            An object which produces a new int by using 'next()' on it. The
            integers are used to separate the figures so as to not append
            new plots in the same figure.
        max_so_pairs_size : int
            The maximum number of correlation pairs to process. If the
            number of eligible pairs is larger than this number, a random
            sample of max_so_pairs_size is used. Default: 10 000.
        mp_pairs : bool
            If True, get the pairs to process using multiprocessing if larger
            than 10 000. Default: True.
        run_linear : bool
            If True, gather the data without multiprocessing. This option is
            good when debugging or if the environment for some reason does
            not support multiprocessing. Default: False.
        log_scale_y : bool
            If True, plot the plots in this method with log10 scale on y-axis.
            Default: False.
        """
        # Local file or s3
        if outdir.startswith('s3://'):
            s3_path = S3Path.from_string(outdir)
            logger.info(f'Outdir path is on S3: {str(s3_path)}')
            od = None
        else:
            s3_path = None
            od = Path(outdir)
            if not od.is_dir():
                logger.info(f'Creating directory/ies for {od}')
                od.mkdir(parents=True, exist_ok=True)

        # Get corr stats
        corr_stats: Results = self.get_corr_stats_axb(
            z_corr=z_corr,
            max_proc=max_proc,
            max_so_pairs_size=max_so_pairs_size,
            mp_pairs=mp_pairs,
            run_linear=run_linear)
        sd_str = self.get_sd_str()
        for m, (plot_type, data) in enumerate(corr_stats.dict().items()):
            if len(data) > 0:
                name = f'{plot_type}_{self.script_settings["graph_type"]}.pdf'
                logger.info(f'Using file name {name}')
                if od is None:
                    fname = BytesIO()
                else:
                    fname = od.joinpath(name).as_posix()
                if isinstance(data[0], tuple):
                    data = [t[-1] for t in data]

                fig_index = next(index_counter) if index_counter else m
                plt.figure(fig_index)
                plt.hist(x=data, bins='auto', log=log_scale_y)
                title = f'{plot_type.replace("_", " ").capitalize()}; '\
                        f'{sd_str} {self.script_settings["graph_type"]}'

                plt.title(title)
                plt.xlabel('combined z-score')
                plt.ylabel('count')

                # Save to file or BytesIO and S3
                plt.savefig(fname, format='pdf')
                if od is None:
                    # Reset pointer
                    fname.seek(0)
                    # Upload to s3
                    full_s3_path = _joinpath(s3_path, name)
                    _upload_bytes_io_to_s3(bytes_io_obj=fname,
                                           s3p=full_s3_path)

                # Show plot
                if show_plot:
                    plt.show()

                # Close figure
                plt.close(fig_index)
            else:
                logger.warning(f'Empty result for {plot_type} in '
                               f'range {sd_str} for graph type '
                               f'{self.script_settings["graph_type"]}')
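
When outdir is an s3 url, the figure is first written to an in-memory buffer and then uploaded; a stripped-down sketch of that step with plain boto3 (bucket and key are placeholders):

from io import BytesIO

import boto3
import matplotlib.pyplot as plt

def save_current_figure_to_s3(bucket: str, key: str) -> None:
    buf = BytesIO()
    plt.savefig(buf, format='pdf')
    buf.seek(0)  # rewind before reading the buffer
    boto3.client('s3').put_object(Bucket=bucket, Key=key, Body=buf.getvalue())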
Example No. 19
    def plot_interesting(self,
                         outdir: str,
                         z_corr: Optional[Union[str, pd.DataFrame]] = None,
                         show_plot: Optional[bool] = False,
                         max_proc: Optional[int] = None,
                         index_counter: Optional[Union[Iterator,
                                                       Generator]] = None,
                         max_so_pairs_size: int = 10000,
                         mp_pairs: bool = True,
                         run_linear: bool = False,
                         log_scale_y: bool = False):
        """Plots the same type of plot as plot_dists, but filters A, B

        A, B are filtered to those that fulfill the following:
            - No a-b or b-a explanations
            - Not explained by apriori explanations
            - Without common reactome pathways
            - With a-x-b, b-x-a or shared target explanation

        Parameters
        ----------
        outdir : str
            The output directory to save the plots in. If string starts with
            's3://' upload to s3. outdir must then have the form
            's3://<bucket>/<sub_dir>' where <bucket> must be specified and
            <sub_dir> is optional and may contain subdirectories.
        z_corr : Union[str, pd.DataFrame]
            A pd.DataFrame containing the correlation z scores used to
            create the statistics in this object. If not provided,
            an attempt will be made to load it from the file path present in
            script_settings.
        show_plot : bool
            If True also show plots
        max_proc : int > 0
            The maximum number of processes to run in the multiprocessing in
            get_corr_stats_mp. Default: multiprocessing.cpu_count()
        index_counter : Union[Iterator, Generator]
            An object which produces a new int by using 'next()' on it. The
            integers are used to separate the figures so as to not append
            new plots in the same figure.
        max_so_pairs_size : int
            The maximum number of correlation pairs to process. If the
            number of eligible pairs is larger than this number, a random
            sample of max_so_pairs_size is used. Default: 10000.
        mp_pairs : bool
            If True, get the pairs to process using multiprocessing if larger
            than 10 000. Default: True.
        run_linear : bool
            If True, gather the data without multiprocessing. This option is
            good when debugging or if the environment for some reason does
            not support multiprocessing. Default: False.
        log_scale_y : bool
            If True, plot the plots in this method with log10 scale on y-axis.
            Default: False.
        """
        # Local file or s3
        if outdir.startswith('s3://'):
            s3_path = S3Path.from_string(outdir)
            od = None
        else:
            s3_path = None
            od = Path(outdir)
            if not od.is_dir():
                od.mkdir(parents=True, exist_ok=True)

        # Get corr stats
        corr_stats: Results = self.get_corr_stats_axb(
            z_corr=z_corr,
            max_proc=max_proc,
            max_so_pairs_size=max_so_pairs_size,
            mp_pairs=mp_pairs,
            run_linear=run_linear)
        fig_index = next(index_counter) if index_counter \
            else floor(datetime.timestamp(datetime.utcnow()))
        plt.figure(fig_index)
        plt.hist(corr_stats.azfb_avg_corrs,
                 bins='auto',
                 density=True,
                 color='b',
                 alpha=0.3,
                 log=log_scale_y)
        plt.hist(corr_stats.avg_x_filtered_corrs,
                 bins='auto',
                 density=True,
                 color='r',
                 alpha=0.3,
                 log=log_scale_y)
        legend = [
            'Filtered A-X-B for any X', 'Filtered A-X-B for X in network'
        ]

        sd_str = self.get_sd_str()
        title = f'avg X corrs, filtered {sd_str} ' \
                f'({self.script_settings["graph_type"]})'
        plt.title(title)
        plt.ylabel('Norm. Density')
        plt.xlabel('mean(abs(corr(a,x)), abs(corr(x,b))) (SD)')
        plt.legend(legend)
        name = '%s_%s_axb_filtered_hist_comparison.pdf' % \
               (sd_str, self.script_settings['graph_type'])

        # Save to file or BytesIO and S3
        if od is None:
            fname = BytesIO()
        else:
            fname = od.joinpath(name).as_posix()
        plt.savefig(fname, format='pdf')
        if od is None:
            # Reset pointer
            fname.seek(0)
            # Upload to s3
            full_s3_path = _joinpath(s3_path, name)
            _upload_bytes_io_to_s3(bytes_io_obj=fname, s3p=full_s3_path)

        # Show plot
        if show_plot:
            plt.show()

        # Close figure
        plt.close(fig_index)
Example No. 20
def make_dataframe(reconvert, db_content, pkl_filename=None):
    if isinstance(pkl_filename, str) and pkl_filename.startswith('s3:'):
        pkl_filename = S3Path.from_string(pkl_filename)
    if reconvert:
        # Organize by statement
        logger.info("Organizing by statement...")
        stmt_info = {}
        ag_name_by_hash_num = {}
        for h, db_nm, db_id, num, n, t in db_content:
            # Populate the 'NAME' dictionary per agent
            if db_nm == 'NAME':
                ag_name_by_hash_num[(h, num)] = db_id
            if h not in stmt_info.keys():
                stmt_info[h] = {'agents': [], 'ev_count': n, 'type': t}
            stmt_info[h]['agents'].append((num, db_nm, db_id))
        # Turn into dataframe with geneA, geneB, type, indexed by hash;
        # expand out complexes to multiple rows

        # Organize by pairs of genes, counting evidence.
        nkey_errors = 0
        error_keys = []
        rows = []
        logger.info("Converting to pairwise entries...")
        for hash, info_dict in stmt_info.items():
            # Find roles with more than one agent
            agents_by_num = {}
            for num, db_nm, db_id in info_dict['agents']:
                if db_nm == 'NAME':
                    continue
                else:
                    assert db_nm in NS_PRIORITY_LIST
                    db_rank = NS_PRIORITY_LIST.index(db_nm)
                    # If we don't already have an agent for this num, use the
                    # one we've found
                    if num not in agents_by_num:
                        agents_by_num[num] = (num, db_nm, db_id, db_rank)
                    # Otherwise, take the current agent if the identifier type
                    # has a higher rank
                    else:
                        cur_rank = agents_by_num[num][3]
                        if db_rank < cur_rank:
                            agents_by_num[num] = (num, db_nm, db_id, db_rank)

            agents = []
            for num, db_nm, db_id, _ in sorted(agents_by_num.values()):
                try:
                    agents.append(
                        (db_nm, db_id, ag_name_by_hash_num[(hash, num)]))
                except KeyError:
                    nkey_errors += 1
                    error_keys.append((hash, num))
                    if nkey_errors < 11:
                        logger.warning('Missing key in agent name dict: '
                                       '(%s, %s)' % (hash, num))
                    elif nkey_errors == 11:
                        logger.warning('Got more than 10 key warnings: '
                                       'muting further warnings.')
                    continue

            # Need at least two agents.
            if len(agents) < 2:
                continue

            # If this is a complex, or there are more than two agents, permute!
            if info_dict['type'] == 'Complex':
                # Skip complexes with 4 or more members
                if len(agents) > 3:
                    continue
                pairs = permutations(agents, 2)
            else:
                pairs = [agents]

            # Add all the pairs, and count up total evidence.
            for pair in pairs:
                row = OrderedDict([('agA_ns', pair[0][0]),
                                   ('agA_id', pair[0][1]),
                                   ('agA_name', pair[0][2]),
                                   ('agB_ns', pair[1][0]),
                                   ('agB_id', pair[1][1]),
                                   ('agB_name', pair[1][2]),
                                   ('stmt_type', info_dict['type']),
                                   ('evidence_count', info_dict['ev_count']),
                                   ('stmt_hash', hash)])
                rows.append(row)
        if nkey_errors:
            ef = 'key_errors.csv'
            logger.warning('%d KeyErrors. Offending keys found in %s' %
                           (nkey_errors, ef))
            with open(ef, 'w') as f:
                f.write('hash,PaMeta.ag_num\n')
                for kn in error_keys:
                    f.write('%s,%s\n' % kn)
        df = pd.DataFrame.from_dict(rows)

        if pkl_filename:
            if isinstance(pkl_filename, S3Path):
                upload_pickle_to_s3(obj=df, s3_path=pkl_filename)
            else:
                with open(pkl_filename, 'wb') as f:
                    pickle.dump(df, f)
    else:
        if not pkl_filename:
            logger.error('Have to provide pickle file if not reconverting')
            raise FileExistsError
        else:
            if isinstance(pkl_filename, S3Path):
                df = load_pickle_from_s3(pkl_filename)
            else:
                with open(pkl_filename, 'rb') as f:
                    df = pickle.load(f)
    return df