示例#1
0
 def __init__(self):
     """Load the Universal Sentence Encoder from TensorFlow Hub.

     Imports tensorflow and tensorflow_hub lazily, raising an informative
     ImportError when either is missing, then disables eager execution
     before constructing the hub module and records the encoder's output
     width (512) in both ``number_output_features`` and ``n``.
     """
     install_msg = "In order to use the UniversalSentenceEncoder primitive install 'nlp_primitives[complete]'"
     self.tf = import_or_raise("tensorflow", install_msg)
     tf_hub = import_or_raise("tensorflow_hub", install_msg)
     # Eager execution is turned off before the hub module is built.
     self.tf.compat.v1.disable_eager_execution()
     self.module_url = "https://tfhub.dev/google/universal-sentence-encoder/2"
     self.embed = tf_hub.Module(self.module_url)
     self.number_output_features = 512
     self.n = 512
 def save(self, location, profile_name):
     """Serialize this object's features to JSON and write them out.

     Args:
         location (str, file, None): ``None`` returns the JSON string; a
             string is treated as an S3 path or local file path; anything
             else is written to as an open file-like object.
         profile_name (str, bool): AWS profile for S3 writes. ``False``
             forces an anonymous s3fs connection.

     Returns:
         str or None: The JSON string when ``location`` is None.

     Raises:
         ValueError: If ``location`` is a URL.
     """
     features_dict = self.to_dict()
     if location is None:
         return json.dumps(features_dict)
     if not isinstance(location, str):
         # Treat as an open, writable file-like object.
         json.dump(features_dict, location)
         return
     if _is_url(location):
         raise ValueError("Writing to URLs is not supported")
     if _is_s3(location):
         boto3 = import_or_raise("boto3", BOTO3_ERR_MSG)
         default_session = boto3.Session()
         if isinstance(profile_name, str):
             # Named profile: route through smart_open with its own session.
             params = {'session': boto3.Session(profile_name=profile_name)}
             use_smartopen_features(location, features_dict, params, read=False)
         elif profile_name is False:
             # Explicit anonymous upload.
             use_s3fs_features(location, features_dict, read=False)
         elif default_session.get_credentials() is not None:
             use_smartopen_features(location, features_dict, read=False)
         else:
             use_s3fs_features(location, features_dict, read=False)
     else:
         with open(location, "w") as f:
             json.dump(features_dict, f)
示例#3
0
def use_s3fs_es(file_path, path, read=True):
    """Copy an entityset archive between local disk and S3 using s3fs.

    Connects anonymously. With ``read=True``, downloads ``path`` into
    ``file_path``; otherwise uploads ``file_path`` to ``path``.
    """
    s3fs = import_or_raise("s3fs", S3FS_ERR_MSG)
    filesystem = s3fs.S3FileSystem(anon=True)
    # get() downloads remote -> local; put() uploads local -> remote.
    transfer = filesystem.get if read else filesystem.put
    src, dst = (path, file_path) if read else (file_path, path)
    transfer(src, dst)
示例#4
0
def write_data_description(entityset, path, profile_name=None, **kwargs):
    '''Serialize an entityset to a data description and write it to disk or S3.

    Args:
        entityset (EntitySet) : Instance of :class:`.EntitySet`.
        path (str) : Local directory or S3 path that will receive
            `data_description.json` and the entity data.
        profile_name (str, bool): AWS profile used for S3 writes. Defaults to
            None, which searches for AWS credentials. Set to False to use an
            anonymous profile.
        kwargs (keywords) : Additional keyword arguments passed through to the
            underlying serialization method.

    Raises:
        ValueError: If ``path`` is a URL.
    '''
    if _is_s3(path):
        boto3 = import_or_raise("boto3", BOTO3_ERR_MSG)

        # Serialize into a temporary directory, archive it, then upload.
        with tempfile.TemporaryDirectory() as tmp:
            os.makedirs(os.path.join(tmp, 'data'))
            dump_data_description(entityset, tmp, **kwargs)
            archive_path = create_archive(tmp)

            default_session = boto3.Session()
            if isinstance(profile_name, str):
                # Named profile: upload via smart_open with that session.
                params = {'session': boto3.Session(profile_name=profile_name)}
                use_smartopen_es(archive_path, path, params, read=False)
            elif profile_name is False:
                # Explicit anonymous upload.
                use_s3fs_es(archive_path, path, read=False)
            elif default_session.get_credentials() is not None:
                use_smartopen_es(archive_path, path, read=False)
            else:
                use_s3fs_es(archive_path, path, read=False)
    elif _is_url(path):
        raise ValueError("Writing to URLs is not supported")
    else:
        out_dir = os.path.abspath(path)
        os.makedirs(os.path.join(out_dir, 'data'), exist_ok=True)
        dump_data_description(entityset, out_dir, **kwargs)
示例#5
0
def get_transport_params(profile_name):
    """Build smart_open transport parameters for the requested AWS profile.

    Returns a dict holding an S3 client — from the named profile's session,
    or an unsigned client when ``profile_name`` is False or no credentials
    are found — and ``None`` when smart_open should use its defaults.
    """
    boto3 = import_or_raise("boto3", BOTO3_ERR_MSG)
    UNSIGNED = import_or_raise("botocore", BOTOCORE_ERR_MSG).UNSIGNED
    Config = import_or_raise("botocore.config", BOTOCORE_ERR_MSG).Config

    if isinstance(profile_name, str):
        # Named profile: client built from that profile's session.
        return {'client': boto3.Session(profile_name=profile_name).client('s3')}
    if profile_name is False or boto3.Session().get_credentials() is None:
        # Anonymous access: send unsigned requests.
        anon_client = boto3.Session().client(
            's3', config=Config(signature_version=UNSIGNED))
        return {'client': anon_client}
    return None
示例#6
0
 def load(cls, features, profile_name):
     """Deserialize saved features from JSON text, a path, URL, or S3 URI.

     Args:
         features (str or file): Inline JSON, a location to read JSON from
             (URL, S3 URI, or local file path), or an open file-like object.
         profile_name (str, bool): AWS profile used for S3 reads. ``False``
             forces an anonymous s3fs connection.

     Returns:
         cls: Instance constructed from the deserialized feature dictionary.
     """
     if not isinstance(features, str):
         # Open file-like object.
         return cls(json.load(features))
     try:
         # First, try to parse the string as inline JSON.
         features_dict = json.loads(features)
     except ValueError:
         # Not inline JSON -- interpret as a URL, S3 URI, or local path.
         if _is_url(features):
             features_dict = use_smartopen_features(features)
         elif _is_s3(features):
             boto3 = import_or_raise("boto3", BOTO3_ERR_MSG)
             session = boto3.Session()
             if isinstance(profile_name, str):
                 transport_params = {
                     'session': boto3.Session(profile_name=profile_name)
                 }
                 # BUG FIX: transport_params was previously passed
                 # positionally, landing in the ``features_dict`` parameter
                 # of use_smartopen_features, so the named profile's session
                 # was silently ignored. Pass it by keyword instead.
                 features_dict = use_smartopen_features(
                     features, transport_params=transport_params)
             elif profile_name is False:
                 features_dict = use_s3fs_features(features)
             elif session.get_credentials() is not None:
                 features_dict = use_smartopen_features(features)
             else:
                 features_dict = use_s3fs_features(features)
         else:
             with open(features, 'r') as f:
                 features_dict = json.load(f)
     return cls(features_dict)
示例#7
0
def get_transport_params(profile_name):
    """Build smart_open transport parameters for the requested AWS profile.

    Returns a session-based dict for a named profile, unsigned resource
    kwargs when ``profile_name`` is False or no credentials are found, and
    ``None`` when smart_open should fall back to its defaults.
    """
    boto3 = import_or_raise("boto3", BOTO3_ERR_MSG)
    UNSIGNED = import_or_raise("botocore", BOTOCORE_ERR_MSG).UNSIGNED
    Config = import_or_raise("botocore.config", BOTOCORE_ERR_MSG).Config

    if isinstance(profile_name, str):
        # Named profile: hand smart_open a session for that profile.
        return {'session': boto3.Session(profile_name=profile_name)}
    if profile_name is False or boto3.Session().get_credentials() is None:
        # Anonymous access: unsigned S3 requests.
        return {'resource_kwargs': {'config': Config(signature_version=UNSIGNED)}}
    return None
示例#8
0
def use_smartopen_features(path, features_dict=None, transport_params=None, read=True):
    """Read or write a feature dictionary as JSON through smart_open.

    With ``read=True`` (default), parses and returns the JSON at ``path``.
    Otherwise serializes ``features_dict`` to ``path``. ``transport_params``
    is forwarded to smart_open (e.g. for S3 sessions).
    """
    smart_open = import_or_raise("smart_open", SMART_OPEN_ERR_MSG).open
    if read:
        with smart_open(path, 'r', encoding='utf-8', transport_params=transport_params) as f:
            return json.load(f)
    with smart_open(path, "w", transport_params=transport_params) as f:
        json.dump(features_dict, f)
示例#9
0
def use_smartopen_es(file_path, path, transport_params=None, read=True):
    """Stream an entityset archive between a local file and a remote path.

    With ``read=True``, downloads ``path`` into ``file_path``; otherwise
    uploads ``file_path`` to ``path``. Both ends are opened through
    smart_open (as in the original, which rebound ``open``); the remote end
    receives ``transport_params``.
    """
    smart_open = import_or_raise("smart_open", SMART_OPEN_ERR_MSG).open
    if read:
        with smart_open(path, "rb", transport_params=transport_params) as fin, \
                smart_open(file_path, 'wb') as fout:
            shutil.copyfileobj(fin, fout)
    else:
        with smart_open(file_path, 'rb') as fin, \
                smart_open(path, 'wb', transport_params=transport_params) as fout:
            shutil.copyfileobj(fin, fout)
示例#10
0
def use_s3fs_features(file_path, features_dict=None, read=True):
    """Read or write a feature dictionary as JSON on S3 via anonymous s3fs.

    With ``read=True`` (default), parses and returns the JSON at
    ``file_path``; otherwise serializes ``features_dict`` there.
    """
    s3fs = import_or_raise("s3fs", S3FS_ERR_MSG)
    filesystem = s3fs.S3FileSystem(anon=True)
    if read:
        with filesystem.open(file_path, "r", encoding='utf-8') as f:
            return json.load(f)
    with filesystem.open(file_path, "w", encoding='utf-8') as f:
        f.write(json.dumps(features_dict, ensure_ascii=False))
示例#11
0
def check_graphviz():
    """Import graphviz and verify that a rendering backend is available.

    Returns:
        module: The imported ``graphviz`` module.

    Raises:
        ImportError: If the graphviz Python package is not installed.
        RuntimeError: If no graphviz backend executable is found.
    """
    GRAPHVIZ_ERR_MSG = (
        'Please install graphviz to plot.'
        ' (See https://docs.featuretools.com/en/stable/getting_started/install.html#installing-graphviz for'
        ' details)'
    )
    graphviz = import_or_raise("graphviz", GRAPHVIZ_ERR_MSG)
    try:
        # Rendering an empty graph exercises the backend executable.
        graphviz.Digraph().pipe()
    except graphviz.backend.ExecutableNotFound:
        raise RuntimeError(
            "To plot entity sets, a graphviz backend is required.\n"
            "Install the backend using one of the following commands:\n"
            "  Mac OS: brew install graphviz\n"
            "  Linux (Ubuntu): sudo apt-get install graphviz\n"
            "  Windows: conda install python-graphviz\n"
            "  For more details visit: https://docs.featuretools.com/en/stable/getting_started/install.html"
        )
    return graphviz
示例#12
0
def check_graphviz():
    """Import graphviz and verify that a rendering backend is installed.

    Returns:
        module: The imported ``graphviz`` module.

    Raises:
        ImportError: If the graphviz Python package is not installed.
        RuntimeError: If no graphviz backend executable is found.
    """
    GRAPHVIZ_ERR_MSG = (
        "Please install graphviz to plot."
        + " (See https://featuretools.alteryx.com/en/stable/install.html#installing-graphviz for"
        + " details)"
    )
    graphviz = import_or_raise("graphviz", GRAPHVIZ_ERR_MSG)
    # Try rendering a dummy graph to see if a working backend is installed
    try:
        graphviz.Digraph().pipe()
    except graphviz.backend.ExecutableNotFound:
        raise RuntimeError(
            "To plot entity sets, a graphviz backend is required.\n"
            + "Install the backend using one of the following commands:\n"
            + "  Mac OS: brew install graphviz\n"
            + "  Linux (Ubuntu): $ sudo apt install graphviz\n"
            + "  Windows (conda): conda install -c conda-forge python-graphviz\n"
            + "  Windows (pip): pip install graphviz\n"
            # BUG FIX: this line previously lacked a trailing "\n", so the
            # download URL and the "For more details" line ran together.
            + "  Windows (EXE required if graphviz was installed via pip): https://graphviz.org/download/#windows\n"
            + "  For more details visit: https://featuretools.alteryx.com/en/stable/install.html#installing-graphviz"
        )
    return graphviz
示例#13
0
def read_entityset(path, profile_name=None, **kwargs):
    '''Read an entityset from disk, an S3 path, or a URL.

    Args:
        path (str): Directory on disk, S3 path, or URL containing
            `data_description.json`.
        profile_name (str, bool): AWS profile used when reading from S3.
            Defaults to None, which searches for AWS credentials. Set to
            False to use an anonymous profile.
        kwargs (keywords): Additional keyword arguments passed through to the
            underlying deserialization method.
    '''
    if not (_is_url(path) or _is_s3(path)):
        # Plain local directory: read the description in place.
        data_description = read_data_description(path)
        return description_to_entityset(data_description, **kwargs)

    boto3 = import_or_raise("boto3", BOTO3_ERR_MSG)
    with tempfile.TemporaryDirectory() as tmpdir:
        # Download the archive into the temp directory, then unpack it.
        local_archive = os.path.join(tmpdir, Path(path).name)
        default_session = boto3.Session()

        if _is_url(path):
            use_smartopen_es(local_archive, path)
        elif isinstance(profile_name, str):
            # Named profile: download via smart_open with that session.
            params = {'session': boto3.Session(profile_name=profile_name)}
            use_smartopen_es(local_archive, path, params)
        elif profile_name is False:
            # Explicit anonymous download.
            use_s3fs_es(local_archive, path)
        elif default_session.get_credentials() is not None:
            use_smartopen_es(local_archive, path)
        else:
            use_s3fs_es(local_archive, path)

        with tarfile.open(str(local_archive)) as tar:
            # NOTE(review): extractall trusts the archive's member paths;
            # confirm the source is trusted (tar path-traversal risk).
            tar.extractall(path=tmpdir)

        data_description = read_data_description(tmpdir)
        return description_to_entityset(data_description, **kwargs)
示例#14
0
def test_import_or_raise_errors():
    """import_or_raise surfaces the supplied message when the module is missing."""
    error_msg = "error message"
    with pytest.raises(ImportError, match=error_msg):
        import_or_raise("_featuretools", error_msg)
示例#15
0
def read_entity_data(description, path):
    '''Read description data from disk.

    Args:
        description (dict) : Description of :class:`.Entity`.
        path (str): Location on disk to read entity data.

    Returns:
        df (DataFrame) : Instance of dataframe.

    Raises:
        ValueError: If the saved format is not csv, parquet, or pickle.
    '''
    # Resolve the data file and the serialization options recorded at save time.
    file = os.path.join(path, description['loading_info']['location'])
    kwargs = description['loading_info'].get('params', {})
    load_format = description['loading_info']['type']
    entity_type = description['loading_info'].get('entity_type', 'pandas')
    read_kwargs = {}
    # Pick the dataframe library matching the saved entity type.
    if entity_type == 'dask':
        lib = dd
    elif entity_type == 'koalas':
        import_error = 'Cannot load Koalas entityset - unable to import Koalas. ' \
                       'Consider doing a pip install with featuretools[koalas] to install Koalas with pip'
        lib = import_or_raise('databricks.koalas', import_error)
        # Koalas-specific CSV read options; compression is coerced to a
        # string here -- presumably koalas rejects non-string values
        # (e.g. None). TODO(review): confirm.
        read_kwargs['multiline'] = True
        kwargs['compression'] = str(kwargs['compression'])
    else:
        lib = pd
    if load_format == 'csv':
        dataframe = lib.read_csv(file,
                                 engine=kwargs['engine'],
                                 compression=kwargs['compression'],
                                 encoding=kwargs['encoding'],
                                 **read_kwargs)
    elif load_format == 'parquet':
        dataframe = lib.read_parquet(file, engine=kwargs['engine'])
    elif load_format == 'pickle':
        # Pickle is always read through pandas, regardless of entity_type.
        dataframe = pd.read_pickle(file, **kwargs)
    else:
        error = 'must be one of the following formats: {}'
        raise ValueError(error.format(', '.join(FORMATS)))
    # Restore the column dtypes recorded in the description.
    dtypes = description['loading_info']['properties']['dtypes']
    if entity_type == 'koalas':
        # Map pandas-style dtypes onto ones accepted here for koalas:
        # 'object' -> 'str' and 'datetime64[ns]' -> np.datetime64.
        for col, dtype in dtypes.items():
            if dtype == 'object':
                dtypes[col] = 'str'
            if dtype == 'datetime64[ns]':
                dtypes[col] = np.datetime64
    dataframe = dataframe.astype(dtypes)

    if load_format in ['parquet', 'csv']:
        # csv/parquet store LatLong values as strings like "(1.0, 2.0)";
        # parse them back into numeric pairs.
        latlongs = []
        for var_description in description['variables']:
            if var_description['type']['value'] == LatLong.type_string:
                latlongs.append(var_description["id"])

        def parse_latlong_tuple(x):
            # "(1.0, 2.0)" -> (1.0, 2.0)
            return tuple(float(y) for y in x[1:-1].split(","))

        def parse_latlong_list(x):
            # "(1.0, 2.0)" -> [1.0, 2.0]; the list form is used for koalas.
            return list(float(y) for y in x[1:-1].split(","))

        for column in latlongs:
            if entity_type == 'dask':
                # dask's apply requires an explicit meta for the result.
                meta = (column, tuple([float, float]))
                dataframe[column] = dataframe[column].apply(
                    parse_latlong_tuple, meta=meta)
            elif entity_type == 'koalas':
                dataframe[column] = dataframe[column].apply(parse_latlong_list)

            else:
                dataframe[column] = dataframe[column].apply(
                    parse_latlong_tuple)

    return dataframe
示例#16
0
def test_import_or_raise_imports():
    """An importable module is returned and usable; the message is unused."""
    imported = import_or_raise("math", "error message")
    assert imported.ceil(0.1) == 1
示例#17
0
    def plot(self, to_file=None):
        """
        Create a UML diagram-ish graph of the EntitySet.

        Args:
            to_file (str, optional) : Path to where the plot should be saved.
                If set to None (as by default), the plot will not be saved.

        Returns:
            graphviz.Digraph : Graph object that can directly be displayed in
                Jupyter notebooks.

        Raises:
            RuntimeError: If no graphviz backend executable is available.
            ValueError: If ``to_file`` has no extension, or has one that
                graphviz does not support.
        """
        GRAPHVIZ_ERR_MSG = (
            'Please install graphviz to plot entity sets.' +
            ' (See https://docs.featuretools.com/en/stable/getting_started/install.html#installing-graphviz for'
            + ' details)')
        graphviz = import_or_raise("graphviz", GRAPHVIZ_ERR_MSG)
        # Try rendering a dummy graph to see if a working backend is installed
        try:
            graphviz.Digraph().pipe()
        except graphviz.backend.ExecutableNotFound:
            raise RuntimeError(
                "To plot entity sets, a graphviz backend is required.\n" +
                "Install the backend using one of the following commands:\n" +
                "  Mac OS: brew install graphviz\n" +
                "  Linux (Ubuntu): sudo apt-get install graphviz\n" +
                "  Windows: conda install python-graphviz\n" +
                "  For more details visit: https://docs.featuretools.com/en/stable/getting_started/install.html"
            )

        if to_file:
            # Explicitly cast to str in case a Path object was passed in
            to_file = str(to_file)

            # Infer the render format from the file extension and validate it
            # against the formats graphviz supports.
            split_path = to_file.split('.')
            if len(split_path) < 2:
                raise ValueError("Please use a file extension like '.pdf'" +
                                 " so that the format can be inferred")

            format = split_path[-1]
            valid_formats = graphviz.backend.FORMATS
            if format not in valid_formats:
                raise ValueError("Unknown format. Make sure your format is" +
                                 " amongst the following: %s" % valid_formats)
        else:
            format = None

        # Initialize a new directed graph
        graph = graphviz.Digraph(self.id,
                                 format=format,
                                 graph_attr={'splines': 'ortho'})

        # Draw entities
        for entity in self.entities:
            # '\l' is graphviz record-label syntax for a left-justified line
            # break, not a Python escape (hence the noqa W605 markers).
            variables_string = '\l'.join([
                var.id + ' : ' + var.type_string  # noqa: W605
                for var in entity.variables
            ])
            nrows = entity.shape[0]
            label = '{%s (%d row%s)|%s\l}' % (entity.id, nrows, 's' *
                                              (nrows > 1), variables_string
                                              )  # noqa: W605
            graph.node(entity.id, shape='record', label=label)

        # Draw relationships
        for rel in self.relationships:
            # Display the key only once if is the same for both related entities
            if rel._parent_variable_id == rel._child_variable_id:
                label = rel._parent_variable_id
            else:
                label = '%s -> %s' % (rel._parent_variable_id,
                                      rel._child_variable_id)

            graph.edge(rel._child_entity_id,
                       rel._parent_entity_id,
                       xlabel=label)

        if to_file:
            # Graphviz always appends the format to the file name, so we need to
            # remove it manually to avoid file names like 'file_name.pdf.pdf'
            offset = len(format) + 1  # Add 1 for the dot
            output_path = to_file[:-offset]
            graph.render(output_path, cleanup=True)

        return graph