Example #1

from hadoop.io import LongWritable

def writeData(writer):
    key = LongWritable()
    value = LongWritable()

    # Write 1000 records with descending keys and ascending values,
    # logging the writer's current length for each record.
    for i in range(1000):
        key.set(1000 - i)
        value.set(i)
        print('[%d] %s %s' % (writer.getLength(), key.toString(), value.toString()))
        writer.append(key, value)
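
This helper expects an already-open writer. A minimal usage sketch, assuming the python-hadoop package's hadoop.io.SequenceFile module (the output path 'test.seq' is a placeholder):

from hadoop.io import LongWritable
from hadoop.io import SequenceFile

if __name__ == '__main__':
    # 'test.seq' is a hypothetical output path.
    writer = SequenceFile.createWriter('test.seq', LongWritable, LongWritable)
    writeData(writer)
    writer.close()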
Example #2

from hadoop.io import LongWritable, Text

def write_text_data(writer):
    key = LongWritable()
    value = Text()

    # Write 1000 records with descending numeric keys and Text values.
    for i in range(1000):
        key.set(1000 - i)
        value.set('taro {}'.format(i))
        print('[%d] %s %s' % (writer.getLength(), key.toString(), value.toString()))
        writer.append(key, value)
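
The Text-valued variant is wired up the same way, just with Text as the value class. A minimal sketch under the same assumptions ('test_text.seq' is a placeholder path):

from hadoop.io import LongWritable, Text
from hadoop.io import SequenceFile

if __name__ == '__main__':
    # 'test_text.seq' is a hypothetical output path.
    writer = SequenceFile.createWriter('test_text.seq', LongWritable, Text)
    write_text_data(writer)
    writer.close()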
Example #4

from hadoop.io import LongWritable

def writeData(writer):
    key = LongWritable()
    value = LongWritable()

    # Same pattern as Example #1, but with only 10 records.
    for i in range(10):
        key.set(1000 - i)
        value.set(i)
        print('[%d] %s %s' % (writer.getLength(), key.toString(), value.toString()))
        writer.append(key, value)
Example #5

#!/usr/bin/env python
import sys

from hadoop.io import LongWritable, Text
from hadoop.io import SequenceFile

# The value column is free text, so Text (not LongWritable) is used as the value class.
writer = SequenceFile.createWriter('reddit_posts.seq', LongWritable, Text)

# Read the input rows from stdin.
for line in sys.stdin:
    # Use a try/except block so an improperly formatted row does not blow up the program.
    # Remove leading and trailing whitespace; assume the files are comma delimited and
    # only contain the columns described in assignment 4 part 1.
    try:
        fields = line.strip().split(',')
        _id, _text = fields[0], fields[5] or 'N/A'

        key = LongWritable()
        key.set(int(_id))

        value = Text()
        value.set(_text)

        writer.append(key, value)
    except (IndexError, ValueError):
        # Skip rows that are missing columns or have a non-numeric id.
        continue

writer.close()
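
To sanity-check the output, the sequence file can be read back. A minimal sketch, assuming the python-hadoop reader API (getKeyClass, getValueClass, and next, as used in that package's reader demo):

from hadoop.io import SequenceFile

reader = SequenceFile.Reader('reddit_posts.seq')

# Instantiate key/value holders from the classes recorded in the file header.
key = reader.getKeyClass()()
value = reader.getValueClass()()

# Iterate over all key/value pairs in the sequence file.
while reader.next(key, value):
    print('%s %s' % (key.toString(), value.toString()))

reader.close()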

