예제 #1
0
 def setCorpus(self,
               path,
               token_pattern=r"\S+",
               read_as=ReadAs.LINE_BY_LINE,
               options={"format": "text"}):
     """Set the ``corpus`` param to an external resource located at ``path``.

     Args:
         path: Location of the corpus resource.
         token_pattern: Value stored under the ``"tokenPattern"`` option
             when ``options`` does not already define that key.
         read_as: Read strategy handed to ``ExternalResource``.
         options: Extra resource options. The dict is copied, so neither
             the caller's dict nor the shared default is modified here.

     Returns:
         Whatever ``self._set`` returns.
     """
     # The default pattern is a raw string: "\S" in a plain literal is an
     # invalid escape sequence that recent Python versions warn about.
     opts = options.copy()
     # A caller-supplied "tokenPattern" in ``options`` takes precedence.
     opts.setdefault("tokenPattern", token_pattern)
     return self._set(corpus=ExternalResource(path, read_as, opts))
예제 #2
0
 def setExternalRules(self, path, delimiter,
                      read_as=ReadAs.LINE_BY_LINE,
                      options={"format": "text"}):
     """Point the ``externalRules`` param at an external resource.

     A copy of ``options`` is used, so the caller's dict is left alone.
     ``delimiter`` is stored under the ``"delimiter"`` key only when the
     caller did not already supply that key in ``options``.

     Returns the result of ``self._set``.
     """
     resource_options = options.copy()
     # setdefault leaves a caller-provided "delimiter" untouched.
     resource_options.setdefault("delimiter", delimiter)
     rules = ExternalResource(path, read_as, resource_options)
     return self._set(externalRules=rules)
예제 #3
0
 def setDictionary(self, path, delimiter,
                   read_as=ReadAs.LINE_BY_LINE,
                   options={'format': 'text'}):
     """Set the ``dictionary`` param to an external resource at ``path``.

     Works on a copy of ``options``. The ``delimiter`` argument fills in
     the ``"delimiter"`` option only if ``options`` does not define it.

     Returns the result of ``self._set``.
     """
     merged = options.copy()
     # An explicit "delimiter" inside ``options`` wins over the argument.
     merged.setdefault("delimiter", delimiter)
     dictionary_resource = ExternalResource(path, read_as, merged)
     return self._set(dictionary=dictionary_resource)
예제 #4
0
    def readDataset(self, path, read_as=ReadAs.LINE_BY_LINE, opts={}):
        """Read the dataset at ``path`` and return it as a ``DataFrame``.

        Args:
            path: Location of the dataset; wrapped in an ``ExternalResource``.
            read_as: Read strategy handed to ``ExternalResource``.
            opts: Resource options. Copied before use so the shared default
                dict (and the caller's dict) is never handed out directly.

        Returns:
            A ``DataFrame`` wrapping the result of the Java-side
            ``readDataset`` call.
        """
        # Copy: the ``{}`` default is a single object shared by every call.
        resource = ExternalResource(path, read_as, dict(opts))

        # ToDo Replace with std pyspark
        session = SparkSession(self.sc)
        jSession = session._jsparkSession

        jdf = self._java_obj.readDataset(resource, jSession)
        return DataFrame(jdf, session._wrapped)
예제 #5
0
 def setCorpus(self, path, delimiter, read_as=ReadAs.SPARK_DATASET,
               options={"format": "text", "repartition": "8"}):
     """Set the ``corpus`` param to an external resource at ``path``.

     A copy of ``options`` is extended with ``delimiter`` under the
     ``"delimiter"`` key; any ``"delimiter"`` already present in
     ``options`` is overwritten.

     Returns the result of ``self._set``.
     """
     resource_options = options.copy()
     # The explicit argument always wins over a "delimiter" in options.
     resource_options.update({"delimiter": delimiter})
     corpus_resource = ExternalResource(path, read_as, resource_options)
     return self._set(corpus=corpus_resource)
예제 #6
0
 def __init__(self):
     """Initialise the approach and register its default param values.

     Defaults registered through ``_setDefault``: ``dictionary`` is the
     ``/spell/words.txt`` resource read with ``ReadAs.LINE_BY_LINE`` and a
     ``[a-zA-Z]+`` token pattern; ``caseSensitive``, ``doubleVariants``
     and ``shortCircuit`` are all ``False``.
     """
     java_class = (
         "com.johnsnowlabs.nlp.annotators.spell.norvig.NorvigSweetingApproach"
     )
     super(NorvigSweetingApproach, self).__init__(classname=java_class)

     # Built after the parent constructor has run, as in the original order.
     default_dictionary = ExternalResource(
         "/spell/words.txt",
         ReadAs.LINE_BY_LINE,
         {"tokenPattern": "[a-zA-Z]+"})
     self._setDefault(
         dictionary=default_dictionary,
         caseSensitive=False,
         doubleVariants=False,
         shortCircuit=False)
예제 #7
0
 def __init__(self):
     """Initialise the approach and register its default param values.

     Defaults registered through ``_setDefault``: ``corpus`` is the
     ``/anc-pos-corpus/`` resource read with ``ReadAs.LINE_BY_LINE``, a
     ``|`` delimiter and ``text`` format; ``nIterations`` is 5.
     """
     java_class = (
         "com.johnsnowlabs.nlp.annotators.pos.perceptron.PerceptronApproach"
     )
     super(PerceptronApproach, self).__init__(classname=java_class)

     # Built after the parent constructor has run, as in the original order.
     corpus_options = {"delimiter": "|", "format": "text"}
     default_corpus = ExternalResource(
         "/anc-pos-corpus/", ReadAs.LINE_BY_LINE, corpus_options)
     self._setDefault(corpus=default_corpus, nIterations=5)
예제 #8
0
 def setDictionary(self, path, key_delimiter, value_delimiter,
                   read_as=ReadAs.LINE_BY_LINE,
                   options={"format": "text"}):
     """Set the ``dictionary`` param to an external resource at ``path``.

     Works on a copy of ``options``. ``key_delimiter`` and
     ``value_delimiter`` fill in the ``"keyDelimiter"`` and
     ``"valueDelimiter"`` options respectively, but only for keys that
     ``options`` does not already define.

     Returns the result of ``self._set``.
     """
     merged = options.copy()
     # Caller-provided delimiters inside ``options`` take precedence.
     merged.setdefault("keyDelimiter", key_delimiter)
     merged.setdefault("valueDelimiter", value_delimiter)
     dictionary_resource = ExternalResource(path, read_as, merged)
     return self._set(dictionary=dictionary_resource)
예제 #9
0
 def setExternalDataset(self, path, read_as=ReadAs.LINE_BY_LINE,
                        options={"format": "text"}):
     """Set the ``externalDataset`` param to a resource at ``path``.

     The resource is built from ``path``, ``read_as`` and a copy of
     ``options``. Returns the result of ``self._set``.
     """
     options_copy = options.copy()
     dataset = ExternalResource(path, read_as, options_copy)
     return self._set(externalDataset=dataset)
예제 #10
0
 def setExternalDataset(self, path=None, read_as="LINE_BY_LINE",
                        options={"format": "text", "delimiter": ":"}):
     """Set the ``externalDataset`` param to a resource at ``path``.

     Args:
         path: Location of the dataset resource.
         read_as: Read strategy handed to ``ExternalResource``.
         options: Resource options. Copied on every call so the shared
             default dict and the caller's dict are never passed on.

     Returns:
         Whatever ``self._set`` returns.
     """
     # Copy per call: a ``.copy()`` in the signature would only run once,
     # at definition time, and would still leave one shared default dict.
     opts = dict(options)
     return self._set(externalDataset=ExternalResource(path, read_as, opts))
예제 #11
0
 def setSlangDictionary(self, path=None, read_as="LINE_BY_LINE",
                        options={"format": "text", "tokenPattern": r"\S+"}):
     """Set the ``slangDictionary`` param to a resource at ``path``.

     Args:
         path: Location of the slang dictionary resource.
         read_as: Read strategy handed to ``ExternalResource``.
         options: Resource options. Copied on every call so the shared
             default dict and the caller's dict are never passed on.

     Returns:
         Whatever ``self._set`` returns.
     """
     # The default token pattern is a raw string: "\S" in a plain literal
     # is an invalid escape sequence that recent Python versions warn about.
     # Copy per call: a ``.copy()`` in the signature would only run once,
     # at definition time, and would still leave one shared default dict.
     opts = dict(options)
     return self._set(slangDictionary=ExternalResource(path, read_as, opts))
예제 #12
0
 def setCorpus(self, path=None, read_as="LINE_BY_LINE",
               options={"format": "text", "tokenPattern": r"\S+"}):
     """Set the ``corpus`` param to an external resource at ``path``.

     Args:
         path: Location of the corpus resource.
         read_as: Read strategy handed to ``ExternalResource``.
         options: Resource options. Copied on every call so the shared
             default dict and the caller's dict are never passed on.

     Returns:
         Whatever ``self._set`` returns.
     """
     # The default token pattern is a raw string: "\S" in a plain literal
     # is an invalid escape sequence that recent Python versions warn about.
     # Copy per call: a ``.copy()`` in the signature would only run once,
     # at definition time, and would still leave one shared default dict.
     opts = dict(options)
     return self._set(corpus=ExternalResource(path, read_as, opts))
예제 #13
0
 def setDictionary(self, path=None, read_as="LINE_BY_LINE",
                   options={'format': 'text', 'delimiter': ','}):
     """Set the ``dictionary`` param to an external resource at ``path``.

     Args:
         path: Location of the dictionary resource.
         read_as: Read strategy handed to ``ExternalResource``.
         options: Resource options. Copied on every call so the shared
             default dict and the caller's dict are never passed on.

     Returns:
         Whatever ``self._set`` returns.
     """
     # Copy per call: a ``.copy()`` in the signature would only run once,
     # at definition time, and would still leave one shared default dict.
     opts = dict(options)
     return self._set(dictionary=ExternalResource(path, read_as, opts))
예제 #14
0
 def setEntities(self, path=None, read_as="LINE_BY_LINE",
                 options={"format": "text"}):
     """Set the ``entities`` param to an external resource at ``path``.

     Args:
         path: Location of the entities resource.
         read_as: Read strategy handed to ``ExternalResource``.
         options: Resource options. Copied on every call so the shared
             default dict and the caller's dict are never passed on.

     Returns:
         Whatever ``self._set`` returns.
     """
     # Copy per call: a ``.copy()`` in the signature would only run once,
     # at definition time, and would still leave one shared default dict.
     opts = dict(options)
     return self._set(entities=ExternalResource(path, read_as, opts))
예제 #15
0
 def setDictionary(self, path=None, read_as="LINE_BY_LINE",
                   options={"format": "text",
                            "keyDelimiter": "->",
                            "valueDelimiter": "\t"}):
     """Set the ``dictionary`` param to an external resource at ``path``.

     Args:
         path: Location of the dictionary resource.
         read_as: Read strategy handed to ``ExternalResource``.
         options: Resource options; by default a ``text`` format with
             ``->`` as key delimiter and a tab as value delimiter. Copied
             on every call so the shared default dict and the caller's
             dict are never passed on.

     Returns:
         Whatever ``self._set`` returns.
     """
     # Copy per call: a ``.copy()`` in the signature would only run once,
     # at definition time, and would still leave one shared default dict.
     opts = dict(options)
     return self._set(dictionary=ExternalResource(path, read_as, opts))
예제 #16
0
 def setCorpus(self, path, delimiter,
               read_as=ReadAs.LINE_BY_LINE,
               options={"format": "text"}):
     """Set the ``corpus`` param to an external resource at ``path``.

     A copy of ``options`` is extended with ``delimiter`` under the
     ``"delimiter"`` key; any ``"delimiter"`` already present in
     ``options`` is overwritten.

     Returns the result of ``self._set``.
     """
     resource_options = options.copy()
     # The explicit argument always wins over a "delimiter" in options.
     resource_options.update({"delimiter": delimiter})
     corpus_resource = ExternalResource(path, read_as, resource_options)
     return self._set(corpus=corpus_resource)