Example No. 1
 def tokenize_text_rdd(self, rdd_input):
     input_parser = ParserFactory.get_parser(self.config, **self.options)
     if input_parser:
         # TEXT (NEW): each parsed RDD element yields a tuple of strings to
         # be tokenized, and each string yields one list of tokens per input
         # column_path
         rdd_parsed = rdd_input.mapValues(lambda x: input_parser.parse_values(x))
         if self.options.get("emptylines") == 'False':
             rdd_parsed = rdd_parsed.filter(lambda x: self.filter_emptylines(x))
         return self._tokenize_rdd(rdd_parsed)
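For context, here is a minimal, self-contained sketch of the mapValues/filter pattern this example relies on. The parse step and the empty-line check are hypothetical stand-ins for input_parser.parse_values() and self.filter_emptylines(); only the pyspark calls themselves (parallelize, mapValues, filter, collect) are real API.

 from pyspark import SparkContext

 sc = SparkContext("local[*]", "tokenize-sketch")

 # Pair RDD of (record_id, raw line): the (key, value) shape mapValues() expects.
 rdd_input = sc.parallelize([
     (0, "Hello world"),
     (1, ""),             # an empty line the filter step can drop
     (2, "Spark RDDs"),
 ])

 # Hypothetical stand-in for input_parser.parse_values(): one string per column.
 def parse_values(line):
     return (line,)

 # Hypothetical stand-in for self.filter_emptylines(): keep non-empty tuples.
 def has_content(kv):
     return any(s.strip() for s in kv[1])

 rdd_parsed = rdd_input.mapValues(parse_values)  # transform values, keep keys
 rdd_parsed = rdd_parsed.filter(has_content)     # mirrors the emptylines option

 print(rdd_parsed.collect())  # [(0, ('Hello world',)), (2, ('Spark RDDs',))]
 sc.stop()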
Example No. 2
 def tokenize_seq_rdd(self, rdd_input):
     input_parser = ParserFactory.get_parser(self.config, **self.options)
     if input_parser:
         # SEQUENCE: each parsed RDD element yields a tuple of strings to
         # be tokenized, and each string yields one list of tokens per input
         # column_path
         rdd_parsed = rdd_input.mapValues(lambda x: input_parser.parse_values(x))
         if self.options.get("emptylines") == 'False':
             rdd_parsed = rdd_parsed.filter(lambda x: self.filter_emptylines(x))
         return self._tokenize_rdd(rdd_parsed)
     else:
         raise ValueError("No input_parser")
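The else branch above fails fast when the factory cannot supply a parser, rather than silently returning None as in Example No. 1. A minimal sketch of that pattern, assuming a toy ParserFactory (a hypothetical re-implementation, not the project's own class):

 class ParserFactory:
     _parsers = {"csv": lambda line: tuple(line.split(","))}

     @classmethod
     def get_parser(cls, data_type):
         # dict.get() returns None for unknown types.
         return cls._parsers.get(data_type)

 def tokenize(data_type, line):
     parser = ParserFactory.get_parser(data_type)
     if parser:
         return parser(line)
     # Surface the misconfiguration immediately instead of returning None.
     raise ValueError("No input_parser")

 print(tokenize("csv", "a,b,c"))  # ('a', 'b', 'c')
 # tokenize("xml", "<a/>")        # raises ValueError: No input_parser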
Example No. 3
 def tokenize_text_file(self, spark_context, filename, data_type):
     # Read the file as an RDD of lines, parse each line, then tokenize.
     raw_data = spark_context.textFile(filename)
     input_parser = ParserFactory.get_parser(data_type, self.config, self.options)
     if input_parser:
         data = raw_data.map(lambda x: input_parser.parse(x))
         return self.tokenize_rdd(data)
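Finally, a short usage sketch of SparkContext.textFile() as used in this example; the file name and the whitespace split are assumptions standing in for a real input file and input_parser.parse():

 from pyspark import SparkContext

 sc = SparkContext("local[*]", "textfile-sketch")

 # textFile() yields one RDD element per line of the input file.
 raw_data = sc.textFile("sample.txt")      # hypothetical input file

 # Stand-in parse step: split each line on whitespace.
 data = raw_data.map(lambda x: x.split())

 print(data.take(5))
 sc.stop()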