class ExplainableObj(Transformer, HasSetInputCol, HasSetOutputCol):

    partitionKey = Param(
        Params._dummy(),
        "partitionKey",
        "The name of the column to partition by.")

    secondPartitionKey = Param(
        Params._dummy(),
        "secondPartitionKey",
        "The name of the second column to partition by.")

    def __init__(self):
        super().__init__()
        # ExplainBuilder generates getter/setter pairs for the declared params
        ExplainBuilder.build(self, inputCol='input', partitionKey=1)
        self.setSecondPartitionKey(2)
        self.setOutputCol('output')

    def _transform(self, dataset):
        pass

class RegexMatcher(AnnotatorTransformer):

    strategy = Param(Params._dummy(),
                     "strategy",
                     "MATCH_FIRST|MATCH_ALL|MATCH_COMPLETE",
                     typeConverter=TypeConverters.toString)

    rulesPath = Param(Params._dummy(),
                      "rulesPath",
                      "rules file path; each line must be a tuple of regex and identifier. Replaces config with this",
                      typeConverter=TypeConverters.toString)

    @keyword_only
    def __init__(self):
        super(RegexMatcher, self).__init__()
        self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.annotators.RegexMatcher", self.uid)

    def setStrategy(self, value):
        return self._set(strategy=value)

    def setRulesPath(self, value):
        return self._set(rulesPath=value)

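# Usage sketch (illustrative, not from the source): configure a RegexMatcher.
# The rules file path and its layout are hypothetical placeholders inferred from
# the rulesPath doc string; only setters defined above are used.
def _regex_matcher_example():
    return RegexMatcher() \
        .setStrategy("MATCH_ALL") \
        .setRulesPath("regex_rules.txt")  # hypothetical: regex/identifier tuples, one per line
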
class Stemmer(AnnotatorModel):

    language = Param(Params._dummy(),
                     "language",
                     "stemmer language",
                     typeConverter=TypeConverters.toString)

    name = "Stemmer"

    @keyword_only
    def __init__(self):
        super(Stemmer, self).__init__(classname="com.johnsnowlabs.nlp.annotators.Stemmer")
        self._setDefault(
            language="english"
        )

class DateMatcher(AnnotatorModel):

    dateFormat = Param(Params._dummy(),
                       "dateFormat",
                       "desired format for dates extracted",
                       typeConverter=TypeConverters.toString)

    @keyword_only
    def __init__(self):
        super(DateMatcher, self).__init__(classname="com.johnsnowlabs.nlp.annotators.DateMatcher")

    def setDateFormat(self, value):
        return self._set(dateFormat=value)

class Normalizer(AnnotatorTransformer):

    pattern = Param(Params._dummy(),
                    "pattern",
                    "normalization regex pattern; matches will be replaced with a space",
                    typeConverter=TypeConverters.toString)

    lowercase = Param(Params._dummy(),
                      "lowercase",
                      "whether to convert strings to lowercase",
                      typeConverter=TypeConverters.toBoolean)

    @keyword_only
    def __init__(self):
        super(Normalizer, self).__init__()
        self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.annotators.Normalizer", self.uid)

    def setPattern(self, value):
        return self._set(pattern=value)

    def setLowercase(self, value):
        return self._set(lowercase=value)

class HasSignatureDefKey(Params):

    signature_def_key = Param(Params._dummy(),
                              "signature_def_key",
                              "Identifier for a specific saved_model signature",
                              typeConverter=TypeConverters.toString)

    def __init__(self):
        super(HasSignatureDefKey, self).__init__()
        self._setDefault(signature_def_key=None)

    def setSignatureDefKey(self, value):
        return self._set(signature_def_key=value)

    def getSignatureDefKey(self):
        return self.getOrDefault(self.signature_def_key)

class Stemmer(AnnotatorTransformer):

    algorithm = Param(Params._dummy(),
                      "algorithm",
                      "stemmer algorithm",
                      typeConverter=TypeConverters.toString)

    @keyword_only
    def __init__(self):
        super(Stemmer, self).__init__()
        self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.annotators.Stemmer", self.uid)

class DocumentAssembler(AnnotatorTransformer):

    inputCol = Param(Params._dummy(), "inputCol", "input column name",
                     typeConverter=TypeConverters.toString)
    outputCol = Param(Params._dummy(), "outputCol", "output column name",
                      typeConverter=TypeConverters.toString)
    idCol = Param(Params._dummy(), "idCol", "column for setting an id to such string in row",
                  typeConverter=TypeConverters.toString)
    metadataCol = Param(Params._dummy(), "metadataCol", "String to String map column to use as metadata",
                        typeConverter=TypeConverters.toString)
    calculationsCol = Param(Params._dummy(), "calculationsCol",
                            "String to Float vector map column to use as embeddings and other representations",
                            typeConverter=TypeConverters.toString)
    trimAndClearNewLines = Param(Params._dummy(), "trimAndClearNewLines",
                                 "whether to clear out new lines and trim context to remove leading and trailing white spaces",
                                 typeConverter=TypeConverters.toBoolean)

    name = 'DocumentAssembler'

    @keyword_only
    def __init__(self):
        super(DocumentAssembler, self).__init__(classname="com.johnsnowlabs.nlp.DocumentAssembler")
        self._setDefault(outputCol="document", trimAndClearNewLines=True)

    @keyword_only
    def setParams(self):
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setInputCol(self, value):
        return self._set(inputCol=value)

    def setOutputCol(self, value):
        return self._set(outputCol=value)

    def setIdCol(self, value):
        return self._set(idCol=value)

    def setMetadataCol(self, value):
        return self._set(metadataCol=value)

    def setCalculationsCol(self, value):
        # fixed: previously set metadataCol by mistake
        return self._set(calculationsCol=value)

    def setTrimAndClearNewLines(self, value):
        return self._set(trimAndClearNewLines=value)

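# Usage sketch (illustrative, not from the source): DocumentAssembler is the usual
# pipeline entry point; outputCol already defaults to "document". The column names
# below are hypothetical.
def _document_assembler_example():
    return DocumentAssembler() \
        .setInputCol("text") \
        .setIdCol("doc_id") \
        .setTrimAndClearNewLines(True)
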
class SentimentDetectorModel(AnnotatorTransformer):

    dictPath = Param(Params._dummy(),
                     "dictPath",
                     "path for dictionary to sentiment analysis")

    @keyword_only
    def __init__(self):
        super(SentimentDetectorModel, self).__init__()
        self._java_obj = self._new_java_obj(
            "com.johnsnowlabs.nlp.annotators.sda.pragmatic.SentimentDetectorModel", self.uid)

    def setDictPath(self, value):
        return self._set(dictPath=value)

class TokenAssembler(JavaTransformer, JavaMLReadable, JavaMLWritable):

    inputCols = Param(Params._dummy(),
                      "inputCols",
                      "input token annotations",
                      typeConverter=TypeConverters.toListString)

    outputCol = Param(Params._dummy(),
                      "outputCol",
                      "output column name",
                      typeConverter=TypeConverters.toString)

    @keyword_only
    def __init__(self):
        super(TokenAssembler, self).__init__()
        self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.TokenAssembler", self.uid)
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self):
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setInputCols(self, value):
        return self._set(inputCols=value)

    def setOutputCol(self, value):
        return self._set(outputCol=value)

class IdIndexer(Estimator, HasSetInputCol, HasSetOutputCol):

    partitionKey = Param(
        Params._dummy(),
        "partitionKey",
        "The name of the column to partition by, i.e., make sure the indexing takes the partition "
        "into account. This is exemplified in reset_per_partition.")

    resetPerPartition = Param(
        Params._dummy(),
        "resetPerPartition",
        "When set to True then indexing is consecutive from [1..n] for each value of the partition column. "
        "When set to False then indexing is consecutive for all partition and column values.")

    def __init__(self, input_col: str, partition_key: str, output_col: str, reset_per_partition: bool):
        super().__init__()
        ExplainBuilder.build(self, inputCol=input_col, partitionKey=partition_key,
                             outputCol=output_col, resetPerPartition=reset_per_partition)

    def _make_vocab_df(self, df):
        ucols = [self.getPartitionKey(), self.getInputCol()]
        the_df = df.select(ucols).distinct().orderBy(ucols)
        if self.getResetPerPartition():
            return DataFrameUtils.zip_with_index(
                df=the_df,
                start_index=1,
                col_name=self.getOutputCol(),
                partition_col=self.getPartitionKey(),
                order_by_col=self.getInputCol())
        return DataFrameUtils.zip_with_index(
            df=the_df,
            start_index=1,
            col_name=self.getOutputCol())

    def _fit(self, df: DataFrame) -> IdIndexerModel:
        return IdIndexerModel(self.input_col, self.partition_key, self.output_col,
                              self._make_vocab_df(df).cache())

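# Usage sketch (illustrative, not from the source): index the distinct values of a
# hypothetical "name" column separately within each "group" partition. `df` is
# assumed to be a DataFrame with "group" and "name" columns.
def _id_indexer_example(df: DataFrame) -> IdIndexerModel:
    indexer = IdIndexer(input_col="name",
                        partition_key="group",
                        output_col="name_id",
                        reset_per_partition=True)
    return indexer.fit(df)  # standard Estimator API; dispatches to _fit
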
class EntityExtractor(AnnotatorTransformer):

    entitiesPath = Param(Params._dummy(),
                         "entitiesPath",
                         "Path to entities (phrases) to extract",
                         typeConverter=TypeConverters.toString)

    insideSentences = Param(Params._dummy(),
                            "insideSentences",
                            "Should the extractor search only within sentence borders?",
                            typeConverter=TypeConverters.toBoolean)

    @keyword_only
    def __init__(self):
        super(EntityExtractor, self).__init__()
        self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.annotators.EntityExtractor", self.uid)

    def setInsideSentences(self, value):
        return self._set(insideSentences=value)

    def setEntitiesPath(self, value):
        return self._set(entitiesPath=value)

class ViveknSentimentApproach(AnnotatorApproach):

    sentimentCol = Param(Params._dummy(),
                         "sentimentCol",
                         "column with the sentiment result of every row. Must be 'positive' or 'negative'",
                         typeConverter=TypeConverters.toString)

    positiveSource = Param(Params._dummy(),
                           "positiveSource",
                           "positive sentiment file or folder",
                           typeConverter=TypeConverters.identity)

    negativeSource = Param(Params._dummy(),
                           "negativeSource",
                           "negative sentiment file or folder",
                           typeConverter=TypeConverters.identity)

    pruneCorpus = Param(Params._dummy(),
                        "pruneCorpus",
                        "Removes infrequent scenarios from scope. The higher the better performance. Defaults to 1",
                        typeConverter=TypeConverters.toInt)

    @keyword_only
    def __init__(self):
        super(ViveknSentimentApproach, self).__init__(
            classname="com.johnsnowlabs.nlp.annotators.sda.vivekn.ViveknSentimentApproach")

    def setSentimentCol(self, value):
        return self._set(sentimentCol=value)

    def setPositiveSource(self, path, read_as="LINE_BY_LINE", options=None):
        # build the dict per call instead of using a shared mutable default
        opts = dict(options) if options is not None else {"format": "text", "tokenPattern": r"\S+"}
        return self._set(positiveSource=ExternalResource(path, read_as, opts))

    def setNegativeSource(self, path, read_as="LINE_BY_LINE", options=None):
        opts = dict(options) if options is not None else {"format": "text", "tokenPattern": r"\S+"}
        return self._set(negativeSource=ExternalResource(path, read_as, opts))

    def setPruneCorpus(self, value):
        return self._set(pruneCorpus=value)

    def _create_model(self, java_model):
        return ViveknSentimentModel(java_model)

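# Usage sketch (illustrative, not from the source): train Vivekn sentiment from
# labeled text folders. The paths and the label column name are hypothetical.
def _vivekn_example():
    return ViveknSentimentApproach() \
        .setSentimentCol("sentiment_label") \
        .setPositiveSource("sentiment/positive") \
        .setNegativeSource("sentiment/negative") \
        .setPruneCorpus(1)
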
class DateMatcher(AnnotatorTransformer):

    dateFormat = Param(Params._dummy(),
                       "dateFormat",
                       "desired format for dates extracted",
                       typeConverter=TypeConverters.toString)

    @keyword_only
    def __init__(self):
        super(DateMatcher, self).__init__()
        self._java_obj = self._new_java_obj("com.johnsnowlabs.nlp.annotators.DateMatcher", self.uid)

    def setDateFormat(self, value):
        return self._set(dateFormat=value)

class EmbeddingsFinisher(AnnotatorTransformer):

    inputCols = Param(Params._dummy(), "inputCols",
                      "name of input annotation cols containing embeddings",
                      typeConverter=TypeConverters.toListString)
    outputCols = Param(Params._dummy(), "outputCols",
                       "name of EmbeddingsFinisher output cols",
                       typeConverter=TypeConverters.toListString)
    cleanAnnotations = Param(Params._dummy(), "cleanAnnotations",
                             "whether to remove all the existing annotation columns",
                             typeConverter=TypeConverters.toBoolean)
    outputAsVector = Param(Params._dummy(), "outputAsVector",
                           "if enabled it will output the embeddings as Vectors instead of arrays",
                           typeConverter=TypeConverters.toBoolean)

    name = "EmbeddingsFinisher"

    @keyword_only
    def __init__(self):
        super(EmbeddingsFinisher, self).__init__(classname="com.johnsnowlabs.nlp.EmbeddingsFinisher")
        self._setDefault(
            cleanAnnotations=False,
            outputAsVector=False
        )

    @keyword_only
    def setParams(self):
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setInputCols(self, *value):
        # accept either a single list or a varargs sequence of column names
        if len(value) == 1 and isinstance(value[0], list):
            return self._set(inputCols=value[0])
        return self._set(inputCols=list(value))

    def setOutputCols(self, *value):
        if len(value) == 1 and isinstance(value[0], list):
            return self._set(outputCols=value[0])
        return self._set(outputCols=list(value))

    def setCleanAnnotations(self, value):
        return self._set(cleanAnnotations=value)

    def setOutputAsVector(self, value):
        return self._set(outputAsVector=value)

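# Usage sketch (illustrative, not from the source): expose embedding annotations as
# Spark ML Vectors so they can feed downstream ML stages. Column names are hypothetical.
def _embeddings_finisher_example():
    return EmbeddingsFinisher() \
        .setInputCols("embeddings") \
        .setOutputCols("finished_embeddings") \
        .setOutputAsVector(True) \
        .setCleanAnnotations(False)
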
class SentenceDetector(AnnotatorModel):

    useAbbreviations = Param(Params._dummy(),
                             "useAbbreviations",
                             "whether to apply abbreviations at sentence detection",
                             typeConverter=TypeConverters.toBoolean)

    customBounds = Param(Params._dummy(),
                         "customBounds",
                         "characters used to explicitly mark sentence bounds",
                         typeConverter=TypeConverters.toListString)

    useCustomBoundsOnly = Param(Params._dummy(),
                                "useCustomBoundsOnly",
                                "Only utilize custom bounds in sentence detection",
                                typeConverter=TypeConverters.toBoolean)

    name = 'SentenceDetector'

    def setCustomBounds(self, value):
        return self._set(customBounds=value)

    def setUseAbbreviations(self, value):
        return self._set(useAbbreviations=value)

    def setUseCustomBoundsOnly(self, value):
        return self._set(useCustomBoundsOnly=value)

    @keyword_only
    def __init__(self):
        super(SentenceDetector, self).__init__(
            classname="com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector")
        self._setDefault(inputCols=["document"],
                         useAbbreviations=False,
                         useCustomBoundsOnly=False,
                         customBounds=[])

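# Usage sketch (illustrative, not from the source): split on semicolons only,
# bypassing the built-in pragmatic rules.
def _sentence_detector_example():
    return SentenceDetector() \
        .setCustomBounds([";"]) \
        .setUseCustomBoundsOnly(True)
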
class SentimentDetector(AnnotatorApproach):

    dictionary = Param(Params._dummy(),
                       "dictionary",
                       "path for dictionary to sentiment analysis",
                       typeConverter=TypeConverters.identity)

    positiveMultiplier = Param(Params._dummy(),
                               "positiveMultiplier",
                               "multiplier for positive sentiments. Defaults to 1.0",
                               typeConverter=TypeConverters.toFloat)

    negativeMultiplier = Param(Params._dummy(),
                               "negativeMultiplier",
                               "multiplier for negative sentiments. Defaults to -1.0",
                               typeConverter=TypeConverters.toFloat)

    incrementMultiplier = Param(Params._dummy(),
                                "incrementMultiplier",
                                "multiplier for increment sentiments. Defaults to 2.0",
                                typeConverter=TypeConverters.toFloat)

    decrementMultiplier = Param(Params._dummy(),
                                "decrementMultiplier",
                                "multiplier for decrement sentiments. Defaults to -2.0",
                                typeConverter=TypeConverters.toFloat)

    reverseMultiplier = Param(Params._dummy(),
                              "reverseMultiplier",
                              "multiplier for revert sentiments. Defaults to -1.0",
                              typeConverter=TypeConverters.toFloat)

    def __init__(self):
        super(SentimentDetector, self).__init__(
            classname="com.johnsnowlabs.nlp.annotators.sda.pragmatic.SentimentDetector")
        self._setDefault(positiveMultiplier=1.0,
                         negativeMultiplier=-1.0,
                         incrementMultiplier=2.0,
                         decrementMultiplier=-2.0,
                         reverseMultiplier=-1.0)

    def setDictionary(self, path, delimiter, read_as=ReadAs.LINE_BY_LINE, options=None):
        # build the dict per call so neither the caller's dict nor a default is mutated
        opts = dict(options) if options is not None else {"format": "text"}
        if "delimiter" not in opts:
            opts["delimiter"] = delimiter
        return self._set(dictionary=ExternalResource(path, read_as, opts))

    def _create_model(self, java_model):
        return SentimentDetectorModel(java_model=java_model)

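# Usage sketch (illustrative, not from the source): load a comma-delimited
# word,sentiment dictionary. The path is a hypothetical placeholder.
def _sentiment_detector_example():
    return SentimentDetector() \
        .setDictionary("sentiment-dict.txt", delimiter=",")
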
class SymmetricDeleteApproach(AnnotatorApproach):

    corpus = Param(Params._dummy(),
                   "corpus",
                   "folder or file with text that teaches about the language",
                   typeConverter=TypeConverters.identity)

    dictionary = Param(Params._dummy(),
                       "dictionary",
                       "folder or file with text that teaches about the language",
                       typeConverter=TypeConverters.identity)

    maxEditDistance = Param(Params._dummy(),
                            "maxEditDistance",
                            "max edit distance characters to derive strings from a word",
                            typeConverter=TypeConverters.toInt)

    @keyword_only
    def __init__(self):
        super(SymmetricDeleteApproach, self).__init__(
            classname="com.johnsnowlabs.nlp.annotators.spell.symmetric.SymmetricDeleteApproach")
        self._setDefault(maxEditDistance=3)

    def setCorpus(self, path, token_pattern=r"\S+", read_as=ReadAs.LINE_BY_LINE, options=None):
        # build the dict per call instead of sharing a mutable default
        opts = dict(options) if options is not None else {"format": "text"}
        if "tokenPattern" not in opts:
            opts["tokenPattern"] = token_pattern
        return self._set(corpus=ExternalResource(path, read_as, opts))

    def setDictionary(self, path, token_pattern=r"\S+", read_as=ReadAs.LINE_BY_LINE, options=None):
        opts = dict(options) if options is not None else {"format": "text"}
        if "tokenPattern" not in opts:
            opts["tokenPattern"] = token_pattern
        return self._set(dictionary=ExternalResource(path, read_as, opts))

    def setMaxEditDistance(self, v):
        return self._set(maxEditDistance=v)

    def _create_model(self, java_model):
        return SymmetricDeleteModel(java_model)

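# Usage sketch (illustrative, not from the source): train the symmetric-delete
# spell checker from a raw text corpus. The path is a hypothetical placeholder.
def _symmetric_delete_example():
    return SymmetricDeleteApproach() \
        .setCorpus("corpus/language_sample.txt") \
        .setMaxEditDistance(3)
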
class Normalizer(AnnotatorApproach):

    patterns = Param(Params._dummy(),
                     "patterns",
                     "normalization regex patterns; matches will be replaced with a space",
                     typeConverter=TypeConverters.toListString)

    lowercase = Param(Params._dummy(),
                      "lowercase",
                      "whether to convert strings to lowercase",
                      typeConverter=TypeConverters.toBoolean)

    slangDictionary = Param(Params._dummy(),
                            "slangDictionary",
                            "slang dictionary is a delimited text. needs 'delimiter' in options",
                            typeConverter=TypeConverters.identity)

    @keyword_only
    def __init__(self):
        super(Normalizer, self).__init__(classname="com.johnsnowlabs.nlp.annotators.Normalizer")
        self._setDefault(
            patterns=["[^\\pL+]"],
            lowercase=True
        )

    def setPatterns(self, value):
        return self._set(patterns=value)

    def setLowercase(self, value):
        return self._set(lowercase=value)

    def setSlangDictionary(self, path, delimiter, read_as=ReadAs.LINE_BY_LINE, options=None):
        opts = dict(options) if options is not None else {"format": "text"}
        if "delimiter" not in opts:
            opts["delimiter"] = delimiter
        return self._set(slangDictionary=ExternalResource(path, read_as, opts))

    def _create_model(self, java_model):
        return NormalizerModel(java_model)

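# Usage sketch (illustrative, not from the source): lowercase, strip non-letters via
# the default pattern, and expand slang from a comma-delimited dictionary at a
# hypothetical path.
def _normalizer_example():
    return Normalizer() \
        .setLowercase(True) \
        .setSlangDictionary("slang-dict.txt", delimiter=",")
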
class AssertionDLModel(AnnotatorModel):

    name = "AssertionDLModel"

    startCol = Param(Params._dummy(),
                     "startCol",
                     "Column that contains the token number for the start of the target",
                     typeConverter=TypeConverters.toString)

    endCol = Param(Params._dummy(),
                   "endCol",
                   "Column that contains the token number for the end of the target",
                   typeConverter=TypeConverters.toString)

    nerCol = Param(Params._dummy(),
                   "nerCol",
                   "Column of NER Annotations to use instead of start and end columns",
                   typeConverter=TypeConverters.toString)

    targetNerLabels = Param(Params._dummy(),
                            "targetNerLabels",
                            "List of NER labels to mark as target for assertion, must match NER output",
                            typeConverter=TypeConverters.toListString)

    def __init__(self, java_model=None):
        if java_model:
            # wrap an existing JVM model, skipping AnnotatorModel's classname-based construction
            super(JavaModel, self).__init__(java_model)
        else:
            super(AssertionDLModel, self).__init__(
                classname="com.johnsnowlabs.nlp.annotators.assertion.dl.AssertionDLModel")

    @staticmethod
    def pretrained(name="as_fast_dl", language="en"):
        from sparknlp.pretrained import ResourceDownloader
        return ResourceDownloader.downloadModel(AssertionDLModel, name, language)

class NerDLApproach(AnnotatorApproach, AnnotatorWithEmbeddings, NerApproach):

    lr = Param(Params._dummy(), "lr", "Learning Rate", TypeConverters.toFloat)
    po = Param(Params._dummy(), "po",
               "Learning rate decay coefficient. Real Learning Rate = lr / (1 + po * epoch)",
               TypeConverters.toFloat)
    batchSize = Param(Params._dummy(), "batchSize", "Batch size", TypeConverters.toInt)
    dropout = Param(Params._dummy(), "dropout", "Dropout coefficient", TypeConverters.toFloat)
    minProba = Param(Params._dummy(), "minProba",
                     "Minimum probability. Used only if there is no CRF on top of LSTM layer",
                     TypeConverters.toFloat)
    validationDataset = Param(Params._dummy(), "validationDataset",
                              "Path to validation dataset. If set, used to calculate statistics on it during training.",
                              TypeConverters.identity)
    testDataset = Param(Params._dummy(), "testDataset",
                        "Path to test dataset. If set, used to calculate statistics on it during training.",
                        TypeConverters.identity)

    def setLr(self, v):
        return self._set(lr=v)

    def setPo(self, v):
        return self._set(po=v)

    def setBatchSize(self, v):
        return self._set(batchSize=v)

    def setDropout(self, v):
        return self._set(dropout=v)

    def setMinProbability(self, v):
        return self._set(minProba=v)

    def setValidationDataset(self, path, read_as=ReadAs.LINE_BY_LINE, options=None):
        # copy per call to avoid sharing a mutable default dict
        opts = dict(options) if options is not None else {"format": "text"}
        return self._set(validationDataset=ExternalResource(path, read_as, opts))

    def setTestDataset(self, path, read_as=ReadAs.LINE_BY_LINE, options=None):
        opts = dict(options) if options is not None else {"format": "text"}
        return self._set(testDataset=ExternalResource(path, read_as, opts))

    def _create_model(self, java_model):
        return NerDLModel(java_model)

    @keyword_only
    def __init__(self):
        super(NerDLApproach, self).__init__(classname="com.johnsnowlabs.nlp.annotators.ner.dl.NerDLApproach")
        self._setDefault(
            minEpochs=0,
            maxEpochs=50,
            lr=0.2,
            po=0.05,
            batchSize=9,
            dropout=0.5,
            verbose=4
        )

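# Usage sketch (illustrative, not from the source): configure training
# hyperparameters. fit() is assumed to come from the Estimator side of
# AnnotatorApproach, and `training_df` is an assumed annotated DataFrame.
def _ner_dl_example(training_df):
    approach = NerDLApproach() \
        .setLr(0.1) \
        .setPo(0.01) \
        .setBatchSize(32) \
        .setDropout(0.5)
    return approach.fit(training_df)
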
class HasEpochs(Params):

    epochs = Param(Params._dummy(), "epochs", "Number of epochs to train",
                   typeConverter=TypeConverters.toInt)

    def __init__(self):
        super(HasEpochs, self).__init__()

    def setEpochs(self, value):
        return self._set(epochs=value)

    def getEpochs(self):
        return self.getOrDefault(self.epochs)


class HasInputMapping(Params):

    input_mapping = Param(Params._dummy(), "input_mapping",
                          "Mapping of input DataFrame column to input tensor",
                          typeConverter=TFTypeConverters.toDict)

    def __init__(self):
        super(HasInputMapping, self).__init__()

    def setInputMapping(self, value):
        return self._set(input_mapping=value)

    def getInputMapping(self):
        return self.getOrDefault(self.input_mapping)


class HasBatchSize(Params):

    batch_size = Param(Params._dummy(), "batch_size", "Number of records per batch",
                       typeConverter=TypeConverters.toInt)

    def __init__(self):
        super(HasBatchSize, self).__init__()

    def setBatchSize(self, value):
        return self._set(batch_size=value)

    def getBatchSize(self):
        return self.getOrDefault(self.batch_size)


class HasClusterSize(Params):

    cluster_size = Param(Params._dummy(), "cluster_size", "Number of nodes in the cluster",
                         typeConverter=TypeConverters.toInt)

    def __init__(self):
        super(HasClusterSize, self).__init__()

    def setClusterSize(self, value):
        return self._set(cluster_size=value)

    def getClusterSize(self):
        return self.getOrDefault(self.cluster_size)


class HasExportDir(Params):

    export_dir = Param(Params._dummy(), "export_dir", "Directory to export saved_model",
                       typeConverter=TypeConverters.toString)

    def __init__(self):
        super(HasExportDir, self).__init__()

    def setExportDir(self, value):
        return self._set(export_dir=value)

    def getExportDir(self):
        return self.getOrDefault(self.export_dir)


class HasTensorboard(Params):

    tensorboard = Param(Params._dummy(), "tensorboard", "Launch tensorboard process",
                        typeConverter=TypeConverters.toBoolean)

    def __init__(self):
        super(HasTensorboard, self).__init__()

    def setTensorboard(self, value):
        return self._set(tensorboard=value)

    def getTensorboard(self):
        return self.getOrDefault(self.tensorboard)


class HasSteps(Params):

    steps = Param(Params._dummy(), "steps", "Maximum number of steps to train",
                  typeConverter=TypeConverters.toInt)

    def __init__(self):
        super(HasSteps, self).__init__()

    def setSteps(self, value):
        return self._set(steps=value)

    def getSteps(self):
        return self.getOrDefault(self.steps)


class HasReaders(Params):

    readers = Param(Params._dummy(), "readers", "number of reader/enqueue threads",
                    typeConverter=TypeConverters.toInt)

    def __init__(self):
        super(HasReaders, self).__init__()

    def setReaders(self, value):
        return self._set(readers=value)

    def getReaders(self):
        return self.getOrDefault(self.readers)


class HasProtocol(Params):

    protocol = Param(Params._dummy(), "protocol", "Network protocol for Tensorflow (grpc|rdma)",
                     typeConverter=TypeConverters.toString)

    def __init__(self):
        super(HasProtocol, self).__init__()

    def setProtocol(self, value):
        return self._set(protocol=value)

    def getProtocol(self):
        return self.getOrDefault(self.protocol)

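# Composition sketch (illustrative, not from the source): the Has* mixins above are
# designed to be stacked onto a Params-based class so every hyperparameter gets a
# uniform setter/getter pair; each mixin's __init__ cooperates via super() along the
# MRO. `SimpleTFParams` is a hypothetical name.
class SimpleTFParams(HasBatchSize, HasEpochs, HasSteps, HasExportDir, HasTensorboard):

    def __init__(self):
        super(SimpleTFParams, self).__init__()
        self._setDefault(batch_size=100, epochs=1, steps=1000,
                         export_dir=None, tensorboard=False)

# params = SimpleTFParams().setBatchSize(64).setEpochs(5)
# params.getBatchSize()  # -> 64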