def process(self, element):
    """Split one line into words and route them to main and side outputs.

    One input element can produce several main outputs (the longer words of
    the line) and several side outputs. Side outputs do not have to share a
    type: the character count is an integer, while short words are strings
    just like the main output.

    Args:
      element: the line of text being processed.

    Yields:
      First the length of the line, tagged with
      SIDE_OUTPUT_TAG_CHARACTER_COUNT. Then, for each word in order of
      appearance, either the word tagged with SIDE_OUTPUT_TAG_SHORT_WORDS
      (three characters or fewer) or the bare word as main output.
    """
    # The per-line character count is an integer side output and is emitted
    # before any word.
    yield pvalue.SideOutputValue(
        self.SIDE_OUTPUT_TAG_CHARACTER_COUNT, len(element))

    for token in re.findall(r'[A-Za-z\']+', element):
        if len(token) > 3:
            # Longer words belong to the main collection.
            yield token
            continue
        # Words of at most three characters go to the short-words side
        # output instead of the main collection.
        yield pvalue.SideOutputValue(self.SIDE_OUTPUT_TAG_SHORT_WORDS, token)
def process(self, element, cutoff_length, marker):
    """Route an element by its length and by whether it starts with a marker.

    Args:
      element: the value being processed; it must support len() and
        startswith().
      cutoff_length: the largest length still sent to the main output.
      marker: prefix that selects elements for the 'marked strings' output.

    Yields:
      The element itself when its length is at most cutoff_length, otherwise
      its length tagged 'above_cutoff_lengths'. After that, independently of
      the length, the element tagged 'marked strings' when it starts with
      marker.
    """
    exceeds_cutoff = len(element) > cutoff_length
    if exceeds_cutoff:
        # Too long for the main output: report only its length on a side
        # output.
        yield pvalue.SideOutputValue('above_cutoff_lengths', len(element))
    else:
        # Short enough: the element itself goes to the main output.
        yield element

    # The marker check is separate from the length check, so one element can
    # produce two outputs.
    if element.startswith(marker):
        yield pvalue.SideOutputValue('marked strings', element)
def process(self, element):
    """Send negative elements to the 'tag_negative' side output.

    Args:
      element: the value being processed; it is compared against 0.

    Yields:
      Exactly one item: the element tagged 'tag_negative' when it is below
      zero, otherwise the bare element as main output.
    """
    yield (
        pvalue.SideOutputValue('tag_negative', element)
        if element < 0
        else element
    )
def even_odd(x):
    """Tag a number by parity and pass multiples of ten to the main output.

    Args:
      x: the number being processed.

    Yields:
      Always x tagged 'odd' or 'even' as a side output. Then, when x is
      divisible by 10, x itself as main output.
    """
    # A non-zero remainder modulo 2 means the value is odd.
    parity_tag = 'odd' if x % 2 else 'even'
    yield pvalue.SideOutputValue(parity_tag, x)

    if x % 10 == 0:
        yield x
def process(self, context):
    """Send negative context elements to the 'tag_negative' side output.

    Args:
      context: an object exposing the current value as its `element`
        attribute.

    Yields:
      Exactly one item: the element tagged 'tag_negative' when it is below
      zero, otherwise the bare element as main output.
    """
    value = context.element
    if value < 0:
        yield pvalue.SideOutputValue('tag_negative', value)
        return
    # Zero and positive values go to the main output untouched.
    yield value