Python SmvApp.getInstance примеры, smv.SmvApp.getInstance Python примеры использования

Пример #1

0

Показать файл

Файл: smvshell.py Проект: TresAmigosSD/SMV

def exportToHive(dsname):
    """Publish the running result of a dataset to a Hive table

        Delegates to ``publishModuleToHiveByName`` on the current SmvApp.

        Args:
            dsname (str): The name of an SmvModule
    """
    app = SmvApp.getInstance()
    app.publishModuleToHiveByName(dsname)

Пример #2

0

Показать файл

Файл: smvshell.py Проект: TresAmigosSD/SMV

def smvDiscoverSchemaToFile(path, n=100000, ca=None):
    """Best-effort discovery of a Schema from a raw Csv file

        A schema file with postfix ".toBeReviewed" is saved in the local directory.

        Args:
            path (str): Path to the CSV file
            n (int): Number of records to check for schema discovery, default 100k
            ca (CsvAttributes): Defaults to CsvWithHeader
    """
    helper = SmvApp.getInstance()._jvm.SmvPythonHelper
    # The app's default attributes are only requested when the caller passed
    # none (any falsy ``ca`` counts as "not given").
    csv_attrs = ca or SmvApp.getInstance().defaultCsvWithHeader()
    helper.smvDiscoverSchemaToFile(path, n, csv_attrs)

Пример #3

0

Показать файл

Файл: matcher.py Проект: shuangshuangwang/SMV

    def __init__(self, leftId, rightId, exactMatchFilter, groupCondition,
                 levelLogics):
        # The JVM-side matcher takes the level logics as a Java array, so the
        # Python sequence is copied element by element into a new array of
        # LevelLogic before the matcher is created.
        app = SmvApp.getInstance()
        level_array = app.sc._gateway.new_array(
            app._jvm.org.tresamigos.smv.matcher.LevelLogic,
            len(levelLogics))
        for idx, logic in enumerate(levelLogics):
            level_array[idx] = logic

        helper = app._jvm.org.tresamigos.smv.python.SmvPythonHelper
        self.jem = helper.createMatcher(
            leftId, rightId, exactMatchFilter, groupCondition, level_array)

Пример #4

0

Показать файл

def openCsv(path, validate=False):
    """Load a CSV file into a DataFrame

        Args:
            path (str): The path of the CSV file
            validate (bool): If true, validate the CSV before returning the DataFrame (raise error if malformatted)

        Returns:
            (DataFrame): The resulting DataFrame
    """
    # The JVM client opens the file; the Java result is then wrapped in a
    # Python DataFrame bound to the app's sqlContext.
    java_df = SmvApp.getInstance().j_smvPyClient.shellOpenCsv(path, validate)
    sql_ctx = SmvApp.getInstance().sqlContext
    return DataFrame(java_df, sql_ctx)

Пример #5

0

Показать файл

def get_graph_json():
    '''
    body: none
    function: return the json of the entire dependency graph, wrapped by
              jsonify under the "graph" key
    '''
    return jsonify(graph=SmvApp.getInstance().get_graph_json())

Пример #6

0

Показать файл

Файл: smvserver.py Проект: TresAmigosSD/SMV

def getStageFromFqn(fqn):
    '''returns the stage (UTF-8 encoded) given a dataset's fqn

    Raises:
        ValueError: if looking up the stage, or encoding the result, fails
    '''
    try:
        stage = SmvApp.getInstance().getStageFromModuleFqn(fqn).encode("utf-8")
    except Exception:
        # Only ordinary errors are translated; a bare ``except:`` would also
        # swallow KeyboardInterrupt/SystemExit and report them as ValueError.
        raise ValueError("Could not retrive stage with the given fqn: " + str(fqn))
    return stage

Пример #7

0

Показать файл

def smvStrCat(head, *others):
    """Concatenate multiple columns to a single string. Similar to `concat` and `concat_ws` functions in Spark but behaves differently
       when some columns are nulls.
       The Spark version will return null if any of the inputs is null.
       smvStrCat will return null if all of the inputs are nulls, otherwise it will coalesce null cols to blank.

       This function can take 2 forms:
       - smvStrCat(sep, col1, col2, ...)
       - smvStrCat(col1, col2, ...)

       Args:
           head (String or Column): either the separator for the concats, or
               the first column (in which case the separator is "")
           others (Column): the remaining columns to be concatenated

       Return:
           (col): a StringType column

       Raises:
           RuntimeError: if the first parameter is neither a String nor a Column
    """
    # The type of the first argument selects which of the two call forms is used.
    if isinstance(head, basestring):
        separator, columns = head, list(others)
    elif isinstance(head, Column):
        separator, columns = "", [head] + list(others)
    else:
        raise RuntimeError(
            "first parameter must be either a String or a Column")

    smv_app = SmvApp.getInstance()
    helper = smv_app._jvm.org.tresamigos.smv.python.SmvPythonHelper
    java_col = helper.smvStrCat(separator, smv_copy_array(smv_app.sc, *columns))
    return Column(java_col)

Пример #8

0

Показать файл

    def test_readSchemaWhenFileExist(self):
        # Writes a schema file into the temp input dir, then reads it back
        # through the path of the data file (.csv) with the same base name.
        smv_app = SmvApp.getInstance()
        schema_name = "schemaToBeRead1.schema"
        schema_path = os.path.join(self.__class__.tmpInputDir(), schema_name)
        schema_text = "".join([
            '@delimiter = ,\n',
            '@has-header = true\n',
            '@quote-char = "\n',
            'a: String\n',
            'b: Integer',
        ])

        self.createTempInputFile(schema_name, schema_text)

        csv_path = schema_path.replace(".schema", ".csv")
        smv_schema = smv_app.j_smvPyClient.readSchemaFromDataPathAsSmvSchema(csv_path)

        entries = smv_schema.getEntriesStr()
        csv_attrs = smv_schema.extractCsvAttributes()

        # Both declared columns come back, in order.
        self.assertEqual(len(entries), 2)
        self.assertEqual(entries[0], 'a: String')
        self.assertEqual(entries[1], 'b: Integer')

        # The "@" attribute lines are reflected in the CSV attributes.
        self.assertTrue(csv_attrs.hasHeader())
        self.assertEqual(csv_attrs.delimiter(), ',')
        self.assertEqual(csv_attrs.quotechar(), '"')

Пример #9

0

Показать файл

Файл: smvshell.py Проект: TresAmigosSD/SMV

def openCsv(path, validate=False):
    """Read in a CSV file as a DataFrame

        Implemented by defining a throwaway ``SmvCsvInputFile`` subclass whose
        overrides return this call's arguments, and running it once.

        Args:
            path (str): The path of the CSV file
            validate (bool): If true, validate the CSV before return DataFrame (raise error if malformatted)

        Returns:
            (DataFrame): The resulting DataFrame
    """
    app = SmvApp.getInstance()
    # Local input-file class: fileName() and failAtParsingError() close over
    # ``path`` and ``validate``; no connection name is given and
    # get_connection() returns SmvHdfsEmptyConn.
    class TmpCsv(SmvCsvInputFile):
        def connectionName(self):
            return None

        def get_connection(self):
            return SmvHdfsEmptyConn

        def fileName(self):
            return path

        def failAtParsingError(self):
            return validate

    # The result of doRun(None) is returned as is.
    return TmpCsv(app).doRun(None)

Пример #10

0

Показать файл

Файл: matcher.py Проект: shuangshuangwang/SMV

    def doMatch(self, df1, df2, keepOriginalCols=True):
        """Run this `SmvEntityMatcher` against 2 DataFrames

            Args:
                df1 (DataFrame): DataFrame 1 with an id column with name "id"
                df2 (DataFrame): DataFrame 2 with an id column with name "id"
                keepOriginalCols (boolean): whether to keep all input columns of df1 and df2, defaults to true

            Example:
                code::

                    SmvEntityMatcher("id", "_id",
                        ExactMatchPreFilter("Full_Name_Match", col("full_name") == col("_full_name")),
                        GroupCondition(soundex("first_name") == soundex("_first_name")),
                        [
                            ExactLogic("First_Name_Match", col("first_name") == col("_first_name")),
                            FuzzyLogic("Levenshtein_City", lit(True), normlevenshtein(col("city"),col("_city")), 0.9)
                        ]
                    ).doMatch(df1, df2, False)

            Returns:
                (DataFrame): a DataFrame with df1's id and df2's id and match flags of all the levels. For levels with fuzzy logic, the matching score is also provided. A column named "MatchBitmap" is also provided to summarize all the matching flags. When keepOriginalCols is true, input columns are also kept
        """
        # The JVM matcher works on the underlying Java DataFrames; its result
        # is wrapped back into a Python DataFrame.
        java_result = self.jem.doMatch(df1._jdf, df2._jdf, keepOriginalCols)
        sql_ctx = SmvApp.getInstance().sqlContext
        return DataFrame(java_result, sql_ctx)

Пример #11

0

Показать файл

Файл: smvshell.py Проект: TresAmigosSD/SMV

def props():
    """The current app properties used by SMV after the app, user, command-line
        and dynamic props are merged.

        Returns:
            (dict): The 'mergedProps' or final props used by SMV
    """
    app = SmvApp.getInstance()
    return app.getCurrentProperties()

Пример #12

0

Показать файл

Файл: smvserver.py Проект: shuangshuangwang/SMV

def getFqnsInApp():
    """returns all known module FQNs in app. Note: excluded links"""
    repo = DataSetRepoFactory(SmvApp.getInstance()).createRepo()

    # First collect the URN list of every stage (list-of-list) ...
    urns_per_stage = []
    for stage in getStagesInApp():
        urns_per_stage.append(repo.dataSetsForStage(stage))

    # ... then flatten it, keeping only the part after the "mod:" prefix.
    fqns = []
    for stage_urns in urns_per_stage:
        for urn in stage_urns:
            fqns.append(urn.split(":")[1])
    return fqns

Пример #13

0

Показать файл

def getStageFromFqn(fqn):
    '''returns the stage (UTF-8 encoded) given a dataset's fqn

    Raises:
        ValueError: if looking up the stage, or encoding the result, fails
    '''
    try:
        stage = SmvApp.getInstance().getStageFromModuleFqn(fqn).encode("utf-8")
    except Exception:
        # Only ordinary errors are translated; a bare ``except:`` would also
        # swallow KeyboardInterrupt/SystemExit and report them as ValueError.
        raise ValueError("Could not retrive stage with the given fqn: " +
                         str(fqn))
    return stage

Пример #14

0

Показать файл

Файл: utils.py Проект: DataSenseAnalytics/OpenHCDP

def getH2oContext():
    """
        Get or create the hc (H2OContext) from the current sparkSession.
        Use this instead of h2o.init()
    """
    session = SmvApp.getInstance().sparkSession
    # pysparkling is imported here, inside the function, rather than at
    # module import time.
    import pysparkling
    return pysparkling.H2OContext.getOrCreate(session)

Пример #15

0

Показать файл

Файл: smvshell.py Проект: Mallik-G/SMV

def dshash(name):
    """The current hashOfHash for the named module

        Args:
            name (str): The unique name of a module. Does not have to be the FQN.

        Returns:
            The hashOfHash of the named module, exactly as returned by
            ``SmvApp.getDsHash``.
            NOTE(review): the earlier docstring called this both "a hex
            string" and ``(int)`` - confirm the actual type.
    """
    app = SmvApp.getInstance()
    return app.getDsHash(name)

Пример #16

0

Показать файл

    def test_readSchemaWhenFileNotExist(self):
        # No schema file is created here, so reading the schema through the
        # matching data-file path is expected to yield None.
        smv_app = SmvApp.getInstance()
        schema_name = "schemaToBeRead2.schema"
        schema_path = os.path.join(self.__class__.tmpInputDir(), schema_name)
        csv_path = schema_path.replace(".schema", ".csv")

        smv_schema = smv_app.j_smvPyClient.readSchemaFromDataPathAsSmvSchema(csv_path)

        self.assertIsNone(smv_schema)

Пример #17

0

Показать файл

Файл: smvshell.py Проект: TresAmigosSD/SMV

def get_run_info(name, runConfig=None):
    """Get the SmvRunInfoCollector with full information about a module and its dependencies

        Args:
            name (str): name of the module whose information to collect
            runConfig (dict): runConfig to apply when collecting info. If module
                              was run with a config, the same config needs to be
                              specified here to retrieve the info.

        Returns:
            The result of ``SmvApp.getRunInfoByPartialName(name, runConfig)``
    """
    app = SmvApp.getInstance()
    return app.getRunInfoByPartialName(name, runConfig)

Пример #18

0

Показать файл

Файл: smvshell.py Проект: jacobdr/SMV

def openHive(tableName):
    """Load a Hive table into a DataFrame

        Args:
            tableName (str): The name of the Hive table

        Returns:
            (DataFrame): The resulting DataFrame
    """
    # The JVM shell command opens the table; the Java result is wrapped in a
    # Python DataFrame bound to the app's sqlContext.
    java_df = _jvmShellCmd().openHive(tableName)
    sql_ctx = SmvApp.getInstance().sqlContext
    return DataFrame(java_df, sql_ctx)

Пример #19

0

Показать файл

Файл: smvshell.py Проект: jacobdr/SMV

def openCsv(path):
    """Load a CSV file into a DataFrame

        Args:
            path (str): The path of the CSV file

        Returns:
            (DataFrame): The resulting DataFrame
    """
    # The JVM shell command opens the file; the Java result is wrapped in a
    # Python DataFrame bound to the app's sqlContext.
    java_df = _jvmShellCmd().openCsv(path)
    sql_ctx = SmvApp.getInstance().sqlContext
    return DataFrame(java_df, sql_ctx)

Пример #20

0

Показать файл

def openHive(tableName):
    """Load a Hive table into a DataFrame

        Args:
            tableName (str): The name of the Hive table

        Returns:
            (DataFrame): The resulting DataFrame
    """
    # The JVM shell command opens the table; the Java result is wrapped in a
    # Python DataFrame bound to the app's sqlContext.
    java_df = _jvmShellCmd().openHive(tableName)
    sql_ctx = SmvApp.getInstance().sqlContext
    return DataFrame(java_df, sql_ctx)

Пример #21

0

Показать файл

def openCsv(path):
    """Load a CSV file into a DataFrame

        Args:
            path (str): The path of the CSV file

        Returns:
            (DataFrame): The resulting DataFrame
    """
    # The JVM shell command opens the file; the Java result is wrapped in a
    # Python DataFrame bound to the app's sqlContext.
    java_df = _jvmShellCmd().openCsv(path)
    sql_ctx = SmvApp.getInstance().sqlContext
    return DataFrame(java_df, sql_ctx)

Пример #22

0

Показать файл

def smvCollectSet(col, datatype):
    """An aggregate function, which will collect all the values of the given column and create a set as an array typed column.
       Since Spark 1.6, a spark function collect_set was introduced, so after migrating to Spark 1.6 and later, this smvCollectSet
       will be deprecated.

       Args:
            col (Column): column to be aggregated on
            datatype (DataType): datatype of the input column

       Returns:
            (Column): the column wrapping the JVM helper's result
    """
    helper = SmvApp.getInstance()._jvm.org.tresamigos.smv.python.SmvPythonHelper
    # The JVM side receives the Java column and the datatype as its JSON form.
    java_col = helper.smvCollectSet(col._jc, datatype.json())
    return Column(java_col)

Пример #23

0

Показать файл

def nGram3(c1, c2):
    """3-gram UDF with formula (number of overlapped gramCnt)/max(s1.gramCnt, s2.gramCnt)

        Args:
            c1 (Column): first column
            c2 (Column): second column

        Returns:
            (Column): 3-gram
    """
    smvfuncs = SmvApp.getInstance()._jvm.org.tresamigos.smv.smvfuncs
    java_col = smvfuncs.nGram3(c1._jc, c2._jc)
    return Column(java_col)

Пример #24

0

Показать файл

def run_module():
    '''
    body: fqn = 'xxx' (fqn)
    function: run the module named by the "fqn" form field and return an
              ok response carrying the string form of the run result
    '''
    try:
        module_fqn = request.form['fqn'].encode("utf-8")
    except Exception:
        # Any ordinary failure to read/encode the form field is reported as
        # "module not provided". A bare ``except:`` would also intercept
        # KeyboardInterrupt/SystemExit, which must be allowed to propagate.
        raise err_res('MODULE_NOT_PROVIDED_ERR')

    # The app is addressed by URN: the fqn prefixed with "mod:".
    run_result = SmvApp.getInstance().runModule("mod:{}".format(module_fqn))
    return ok_res(str(run_result))

Пример #25

0

Показать файл

def df(name, forceRun=False, version=None):
    """The DataFrame result of running the named module

        Args:
            name (str): The name of a module. Does not have to be the FQN.
            forceRun (bool): True if the module should be forced to run even if it has persisted output. False otherwise.
            version (str): The name of the published version to load from

        Returns:
            (DataFrame): The result of running the named module.
    """
    app = SmvApp.getInstance()
    return app.runModuleByName(name, forceRun, version)

Пример #26

0

Показать файл

Файл: smvshell.py Проект: TresAmigosSD/SMV

def df(name, forceRun=False, quickRun=True):
    """The DataFrame result of running the named module

        Args:
            name (str): The unique name of a module. Does not have to be the FQN.
            forceRun (bool): True if the module should be forced to run even if it has persisted output. False otherwise.
            quickRun (bool): skip computing dqm+metadata and persisting csv

        Returns:
            (DataFrame): The result of running the named module.
    """
    run_result = SmvApp.getInstance().runModuleByName(name, forceRun, quickRun)
    # Only the first element of the run result is returned.
    return run_result[0]

Пример #27

0

Показать файл

def diceSorensen(c1, c2):
    """2-gram UDF with formula (2 * number of overlapped gramCnt)/(s1.gramCnt + s2.gramCnt)

        Args:
            c1 (Column): first column
            c2 (Column): second column

        Returns:
            (Column): 2-gram
    """
    smvfuncs = SmvApp.getInstance()._jvm.org.tresamigos.smv.smvfuncs
    java_col = smvfuncs.diceSorensen(c1._jc, c2._jc)
    return Column(java_col)

Пример #28

0

Показать файл

def smvArrayCat(sep, col):
    """For an array typed column, concat the elements to a string with the given separator.

       Args:
            sep: a Python string to separate the fields
            col: a Column with ArrayType

       Return:
            (col): a Column in StringType with array elements concatenated
    """
    smvfuncs = SmvApp.getInstance()._jvm.org.tresamigos.smv.smvfuncs
    java_col = smvfuncs.smvArrayCat(sep, col._jc)
    return Column(java_col)

Пример #29

0

Показать файл

Файл: smvshell.py Проект: TresAmigosSD/SMV

def dshash(name):
    """The current hashOfHash for the named module

        Args:
            name (str): The unique name of a module. Does not have to be the FQN.

        Returns:
            The hashOfHash of the named module, exactly as returned by
            ``SmvApp.getDsHash``.
            NOTE(review): the earlier docstring called this both "a hex
            string" and ``(int)`` - confirm the actual type. It also
            described a ``runConfig`` argument, which this function does not
            accept.
    """
    app = SmvApp.getInstance()
    return app.getDsHash(name)

Пример #30

0

Показать файл

def normlevenshtein(c1, c2):
    """Levenshtein edit distance metric UDF

        Args:
            c1 (Column): first column
            c2 (Column): second column

        Returns:
            (Column): distances
    """
    smvfuncs = SmvApp.getInstance()._jvm.org.tresamigos.smv.smvfuncs
    java_col = smvfuncs.normlevenshtein(c1._jc, c2._jc)
    return Column(java_col)

Пример #31

0

Показать файл

Файл: smvshell.py Проект: jacobdr/SMV

def df(name, forceRun = False, version = None, runConfig = None):
    """The DataFrame result of running the named module

        Args:
            name (str): The unique name of a module. Does not have to be the FQN.
            forceRun (bool): True if the module should be forced to run even if it has persisted output. False otherwise.
            version (str): The name of the published version to load from
            runConfig (dict): runtime configuration to use when running the module

        Returns:
            (DataFrame): The result of running the named module.
    """
    run_result = SmvApp.getInstance().runModuleByName(name, forceRun, version, runConfig)
    # Only the first element of the run result is returned.
    return run_result[0]

Пример #32

0

Показать файл

def jaroWinkler(c1, c2):
    """Jaro-Winkler edit distance metric UDF

        Args:
            c1 (Column): first column
            c2 (Column): second column

        Returns:
            (Column): distances
    """
    smvfuncs = SmvApp.getInstance()._jvm.org.tresamigos.smv.smvfuncs
    java_col = smvfuncs.jaroWinkler(c1._jc, c2._jc)
    return Column(java_col)

Пример #33

0

Показать файл

Файл: smvshell.py Проект: TresAmigosSD/SMV

def getModel(name, forceRun = False):
    """Get the result of running the named SmvModel module

        Args:
            name (str): The name of a module. Does not have to be the FQN.
            forceRun (bool): True if the module should be forced to run even if it has persisted output. False otherwise.

        Returns:
            (object): The result of running the named module

        NOTE(review): the earlier docstring also listed a ``version``
        argument, which this function does not accept.
    """
    smv_app = SmvApp.getInstance()
    # The (possibly partial) name is resolved to a full FQN before the
    # module result is requested.
    resolved_fqn = smv_app.dsm.inferFqn(name)
    return smv_app.getModuleResult(resolved_fqn, forceRun)

Пример #34

0

Показать файл

Файл: smvshell.py Проект: TresAmigosSD/SMV

def openHive(tableName):
    """Read in a Hive table as a DataFrame

        Implemented by defining a throwaway ``SmvHiveTable`` subclass whose
        ``tableName()`` returns this call's argument, and running it once.

        Args:
            tableName (str): The name of the Hive table

        Returns:
            (DataFrame): The resulting DataFrame
    """
    app = SmvApp.getInstance()
    # Local table class: tableName() closes over the ``tableName`` argument.
    class TmpHive(SmvHiveTable):
        def tableName(self):
            return tableName

    # The result of doRun(None) is wrapped in a DataFrame bound to the app's
    # sqlContext.
    return DataFrame(TmpHive(app).doRun(None), app.sqlContext)

Пример #35

0

Показать файл

Файл: matcher.py Проект: shuangshuangwang/SMV

def ExactLogic(colName, expr):
    """Level match with exact logic

        Args:
            colName (string): level name used in the output DF
            expr (Column): match logic condition

        Example:
            >>> ExactLogic("First_Name_Match", col("first_name") == col("_first_name"))

        Returns:
            (ExactLogic)
    """
    matcher_pkg = SmvApp.getInstance()._jvm.org.tresamigos.smv.matcher
    # The JVM constructor takes the underlying Java column of ``expr``.
    return matcher_pkg.ExactLogic(colName, expr._jc)

Пример #36

0

Показать файл

Файл: smvshell.py Проект: TresAmigosSD/SMV

def help():
    """Print a list of the SMV helper functions available in the shell
    """
    shell_module = sys.modules[__name__]

    # One bullet per public name in __all__, showing its call signature.
    parts = ["SMV shell commands:"]
    for func_name in __all__:
        target = getattr(shell_module, func_name)
        parts.append("\n* {}{}".format(func_name, formatargspec(*getargspec(target))))

    # The documentation link is specific to the running SMV version.
    doc_url = ("http://tresamigossd.github.io/SMV/pythondocs/{}/smv.html#module-smv.smvshell"
               .format(SmvApp.getInstance().smvVersion()))
    parts.append("\nDocumentation may be found at " + doc_url)

    print("".join(parts))

Пример #37

0

Показать файл

Файл: smvshell.py Проект: TresAmigosSD/SMV

def run_test(test_name):
    """Run a test with the given name without creating new Spark context

        First reloads SMV and the test from source, then runs the test.

        Args:
            test_name (str): Name of the test to run
    """
    # Ensure TestConfig has a canonical SmvApp (this will eventually be used
    # to restore the singleton SmvApp)
    TestConfig.setSmvApp(SmvApp.getInstance())

    # The root name is everything before the first "." - or the whole name
    # when it contains no dot.
    test_root_name = test_name.partition(".")[0]

    _clear_from_sys_modules(["smv", test_root_name])

    runner = SmvTestRunner("src/test/python")
    runner.run([test_name])

Пример #38

0

Показать файл

Файл: smvshell.py Проект: TresAmigosSD/SMV

def _appInfo():
    # Fresh SmvAppInfo built around the current SmvApp instance.
    app = SmvApp.getInstance()
    return SmvAppInfo(app)

Пример #39

0

Показать файл

Файл: smvserver.py Проект: TresAmigosSD/SMV

def runModule(fqn, run_config=None):
    '''runs module of given fqn and runtime configuration'''
    run_result = SmvApp.getInstance().runModule(fqn, runConfig=run_config)
    # Only the first element of the run result is returned.
    return run_result[0]

Пример #40

0

Показать файл

Файл: smvshell.py Проект: TresAmigosSD/SMV

def fullRun(name):
    """Run module and return result.
        Persist and run DQM if given
    """
    app = SmvApp.getInstance()
    run_result = app.runModuleByName(name, forceRun=False, quickRun=False)
    # Only the first element of the run result is returned.
    return run_result[0]

Пример #41

0

Показать файл

Файл: smvserver.py Проект: TresAmigosSD/SMV

def getDatasetInstance(fqn):
    '''returns dataset object given a fqn'''
    # A new repo is created from the current app on every call.
    repo = DataSetRepoFactory(SmvApp.getInstance()).createRepo()
    return repo.loadDataSet(fqn)

Пример #42

0

Показать файл

Файл: smvshell.py Проект: jacobdr/SMV

def _jvmShellCmd():
    # JVM-side ShellCmd object, reached through the app's py4j JVM view.
    jvm = SmvApp.getInstance()._jvm
    return jvm.org.tresamigos.smv.shell.ShellCmd

Пример #43

0

Показать файл

Файл: graph.py Проект: jacobdr/SMV

 def svg_graph(*stageNames):
     # With stage names, the graph is built with that list of stages;
     # without any, it is built from the app alone.
     if stageNames:
         return SmvDependencyGraph(SmvApp.getInstance(), list(stageNames))
     return SmvDependencyGraph(SmvApp.getInstance())

Пример #44

0

Показать файл

Файл: smvserver.py Проект: TresAmigosSD/SMV

def getStagesInApp():
    """returns list of all stages defined in app"""
    app = SmvApp.getInstance()
    # Materialize whatever iterable ``stages()`` yields into a Python list.
    return list(app.stages())

Пример #45

0

Показать файл

Файл: smvshell.py Проект: TresAmigosSD/SMV

def quickRun(name):
    """Run module and return result.
        No persist, but use existing persisted if possible.
        No DQM
    """
    app = SmvApp.getInstance()
    run_result = app.runModuleByName(name, forceRun=False, quickRun=True)
    # Only the first element of the run result is returned.
    return run_result[0]

Пример #46

0

Показать файл

Файл: smvserver.py Проект: TresAmigosSD/SMV

def getMetadataHistoryJson(fqn):
    '''returns metadata history given a fqn'''
    app = SmvApp.getInstance()
    return app.getMetadataHistoryJson(fqn)

Python SmvApp.getInstance примеры использования