import csv
import sys


def mapper():
    """Mapper function.

    Input is read from sys.stdin and written to sys.stdout. Both streams can be
    overridden if needed.

    :returns: Nothing. Writes to standard output.
    """

    # The input file is saved as a tab-separated file. The data itself comes from
    # http://content.udacity-data.com/course/hadoop/forum_data.tar.gz -- file
    # "forum_nodes.tsv".
    reader = csv.reader(sys.stdin, delimiter='\t')

    for line in reader:
        # Basic data sanity check
        if not isValidNodeLine(line):
            continue

        author = getField(line, "author_id")
        date = parseDate(getField(line, "added_at"))

        if date is None or author is None:
            # Something's gone wrong. Ignore this line.
            continue

        print("{0}\t{1}".format(author, date.hour))
def mapper():
    """Mapper function.

    Input is read from sys.stdin and written to sys.stdout. Both streams can be
    overridden if needed.

    :returns: Nothing. Writes to standard output.
    """

    # The input file is saved as a tab-separated file. The data itself comes from
    # http://content.udacity-data.com/course/hadoop/forum_data.tar.gz -- file
    # "forum_nodes.tsv".
    reader = csv.reader(sys.stdin, delimiter='\t')

    for line in reader:
        # Basic data sanity check
        if not isValidNodeLine(line):
            continue

        # The fields we're interested in. For questions, we output their ids
        # along with their body lengths. For answers, we output the value of
        # "abs_parent_id" instead, so that answers get grouped together with
        # the questions they belong to. The node_type decides whether a line
        # is a question or an answer; comments are ignored.

        node = getField(line, 'id')
        nodeType = getField(line, 'node_type')
        parent = getField(line, 'abs_parent_id')
        body = getField(line, 'body')

        # If any of the fields we're interested in is None, then it is no good
        # for us. Drop this line altogether.
        if any(x is None for x in (node, nodeType, parent, body)):
            continue

        # Data output, as announced by the comments above.
        # NOTE: We assume neither questions nor answers can be empty, and hope
        # the forum software prevented that from happening. The reason this
        # comment exists is that, judging from the database dump, empty values
        # get filled with the string "\N", which has a length of two. That
        # case could be handled explicitly, but it seems unlikely enough to
        # ignore.
        if nodeType == 'question':
            print('{0}\t{1}\t{2}'.format(node, QUESTION, len(body)))
        elif nodeType == 'answer':
            print('{0}\t{1}\t{2}'.format(parent, ANSWER, len(body)))
        else:
            # We don't care about it.
            continue
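
# The original listing includes no reducer for the mapper above. One plausible
# counterpart, assuming Hadoop streaming delivers the lines sorted by question
# id and that the task is the average answer length per question (the task
# itself is not stated in the source):
def reducer():
    """Per question id, print the question length and mean answer length."""
    currentKey = None
    questionLen = 0
    answerLens = []

    def flush():
        # Emit the accumulated statistics for the key we just finished.
        if currentKey is None:
            return
        mean = sum(answerLens) / float(len(answerLens)) if answerLens else 0
        print('{0}\t{1}\t{2}'.format(currentKey, questionLen, mean))

    for rawLine in sys.stdin:
        key, marker, length = rawLine.strip().split('\t')
        if key != currentKey:
            flush()
            currentKey, questionLen, answerLens = key, 0, []
        if marker == QUESTION:
            questionLen = int(length)
        else:
            answerLens.append(int(length))

    # Don't forget the last key in the stream.
    flush()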
def mapper():
    """Mapper function.

    Input is read from sys.stdin and written to sys.stdout. Both streams can be
    overridden if needed.

    :returns: Nothing. Writes to standard output.
    """

    # The input file is saved as a tab-separated file. The data itself comes from
    # http://content.udacity-data.com/course/hadoop/forum_data.tar.gz -- file
    # "forum_nodes.tsv".
    reader = csv.reader(sys.stdin, delimiter='\t')

    for line in reader:
        # Basic data sanity check
        if not isValidNodeLine(line):
            continue

        # The fields we're interested in. Questions represent new threads;
        # comments and answers are posts to those threads. Hence, we need the
        # node id for questions and the parent id for answers and comments. We
        # also need the author id, so we can group by it.
        node = getField(line, 'id')
        nodeType = getField(line, 'node_type')
        parent = getField(line, 'abs_parent_id')
        author = getField(line, 'author_id')

        # If any of the fields we're interested in is None, then it is no good
        # for us. Drop this line altogether.
        if any(x is None for x in (node, nodeType, parent, author)):
            continue

        # Data output, as announced by the comments above
        if nodeType == 'question':
            print('{0}\t{1}\t{2}'.format(node, QUESTION, author))
        else:
            print('{0}\t{1}\t{2}'.format(parent, WHATEVER, author))
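
# Again, no reducer appears in the source. Assuming the goal is to list, per
# thread, every author who posted in it (a guess; the source does not say),
# a matching reducer could look like this:
def reducer():
    """Per thread id, print the comma-separated set of posting authors."""
    currentThread = None
    authors = set()

    def flush():
        if currentThread is not None:
            print('{0}\t{1}'.format(currentThread, ','.join(sorted(authors))))

    for rawLine in sys.stdin:
        thread, _marker, author = rawLine.strip().split('\t')
        if thread != currentThread:
            flush()
            currentThread = thread
            authors = set()
        authors.add(author)

    flush()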
def mapper():
    """Mapper function.

    Input is read from sys.stdin and written to sys.stdout. Both streams can be
    overridden if needed.

    :returns: Nothing. Writes to standard output.
    """

    # The input file is saved as a tab-separated file. The data itself comes
    # from http://content.udacity-data.com/course/hadoop/forum_data.tar.gz --
    # file "forum_nodes.tsv".
    reader = csv.reader(sys.stdin, delimiter='\t')

    # Dictionary that will hold the count of tags
    tagDict = {}
    for line in reader:
        # Basic data sanity check
        if not isValidNodeLine(line):
            continue

        tags = getField(line, 'tagnames')
        if tags is None:
            continue

        # Every tag gets added to the dictionary. If it doesn't exist yet, it
        # is added with value 1. Otherwise, its current value is incremented.
        tags = tags.split()
        for tag in tags:
            tagDict[tag] = tagDict.get(tag, 0) + 1

    # NOTE: Funny thing happening here. We *cannot* use the top-N pattern
    # explained in class! Suppose we want the top 1 tag, we have two mappers,
    # and there are only two tags in the data: tag1 and tag2. Suppose our
    # mappers see the following.
    #
    # Mapper 1 gets all instances of tag1, which in this example happen to
    # number 1000. BUT it also sees 400 instances of tag2. So, internally,
    # mapper 1 holds:
    #   tag1 = 1000
    #   tag2 = 400
    #
    # Mapper 2 sees only tag2, and its count is 900. So the internal list for
    # mapper 2 is:
    #   tag2 = 900
    #
    # Now, if we used the top-N pattern *as presented* in class, each mapper
    # would emit only its own top 1 tag, and we'd be lost. Our reducer would
    # see as its input:
    #   tag1 = 1000
    #   tag2 = 900
    #
    # And it would output the incorrect top-1 tag: tag1, with 1000
    # occurrences.
    #
    # If, instead, the mappers output every tag they saw, the reducer would
    # get the chance to see this list:
    #   tag1 = 1000
    #   tag2 = 400
    #   tag2 = 900
    # Now it would be able to reduce correctly and output the true top-1 tag:
    # tag2, with 1300 occurrences.

    # We print everything we got. The output can be out of order; Hadoop will
    # sort it for us.
    for tag, count in tagDict.items():
        print('{0}\t{1}'.format(tag, count))
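
# To close the argument in the NOTE above: a sketch of a reducer that merges
# the partial counts first and only then takes the top N. Neither the reducer
# nor the value of N comes from the original listing; both are assumptions.
TOP_N = 10  # assumed; the original exercise presumably fixes the real value


def reducer():
    """Merge the per-mapper tag counts, then emit the TOP_N most frequent."""
    totals = {}
    for rawLine in sys.stdin:
        tag, count = rawLine.strip().split('\t')
        totals[tag] = totals.get(tag, 0) + int(count)

    # Only after every partial count has been merged is a top-N cut safe;
    # cutting earlier is exactly the mistake described in the NOTE above.
    ranked = sorted(totals.items(), key=lambda item: item[1], reverse=True)
    for tag, total in ranked[:TOP_N]:
        print('{0}\t{1}'.format(tag, total))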