Example #1
import functools
import os
import shutil

from pyspark import rdd


def map_reduce_sentences_to_pos(words: rdd.RDD):
    """
    Run a map-reduce pipeline over a tokenized text RDD: filter the tokens by
    part of speech, extract, lemmatize, and count them, and write the final
    results to a separate output directory for each part of speech.

    Args:
        words: An RDD of nltk-tokenized words with indexing information.
    """

    pos = ["noun", "adjective", "verb", "adverb"]

    cwd = os.getcwd()

    for part in pos:
        # Look up the predicate for this part of speech, e.g. PosChecker.isNoun.
        funcName = "is" + part.capitalize()
        posFunc = getattr(PosChecker, funcName)
        part_words = words.filter(posFunc)
        # Map each matching token to a (word, count) pair and sum the counts.
        extract = functools.partial(extractWord, part)
        part_words = part_words.map(extract)
        word_counts = part_words.reduceByKey(countWord)
        np_counts = word_counts.map(toNpArray)
        # saveAsTextFile refuses to overwrite, so clear any previous output first.
        dirname = part + "_counts.txt"
        foldername = os.path.join(cwd, dirname)
        if os.path.isdir(foldername):
            shutil.rmtree(foldername)
        np_counts.saveAsTextFile(foldername)
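
# A minimal, hypothetical driver for map_reduce_sentences_to_pos. It assumes the
# helpers PosChecker, extractWord, countWord, and toNpArray are defined elsewhere,
# and that the input RDD holds nltk-tagged (token, tag, index) tuples; a sketch,
# not part of the original example.
import nltk
from pyspark import SparkContext

if __name__ == "__main__":
    sc = SparkContext(appName="pos-counts")
    text = "Spark makes distributed word counting reasonably simple."
    # word_tokenize needs the "punkt" NLTK data, pos_tag the "averaged_perceptron_tagger" data.
    tagged = nltk.pos_tag(nltk.word_tokenize(text))              # [(word, tag), ...]
    words = sc.parallelize([(w, t, i) for i, (w, t) in enumerate(tagged)])
    map_reduce_sentences_to_pos(words)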
Example #2
from pyspark import RDD


def occurrence_of_word(word: str, rdd: RDD):
    """
    Count the number of occurrences of a word in a dataset of articles.

    >>> rdd = sc.parallelize([Article("art1", "an example just to try"), Article("art2", "another example")])
    >>> occurrence_of_word("example", rdd)
    2
    >>> occurrence_of_word("another", rdd)
    1

    :param word: the word to look for
    :param rdd: dataset of articles
    :return: the number of occurrences of the word
    """
    # Keep only the articles whose content contains the word, then count them
    # (one per matching article).
    return rdd.filter(lambda a: word in a.content.split(" ")) \
        .aggregate(0, lambda x, y: x + 1, lambda x, y: x + y)
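
# A minimal sketch of the Article type used in the doctest above; the real class
# is not shown in this example, so the field names are assumptions.
from dataclasses import dataclass

@dataclass
class Article:
    title: str
    content: str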
Example #3
import math
from typing import Tuple

from pyspark import RDD


def standardize(data: RDD) -> Tuple[RDD, set]:
    """
    Standardize the input data.

    :param data: pair RDD; the key is the index of the point, the value is the
        list of features of the point
    :return: the standardized data and the set of ids of the outlier points
    """
    def compute_mean_std(data_) -> tuple:
        # Explode every feature vector into (feature_index, value) pairs.
        intermediate = data_. \
            values(). \
            flatMap(lambda features: [(i, feature) for i, feature in enumerate(features)]). \
            persist()

        count = data_.count()
        # Per-feature mean: sum the values for each feature index and divide by the count.
        mean = intermediate.reduceByKey(lambda x, y: x + y).mapValues(
            lambda value: value / count).persist()
        mean_dict = dict(mean.collect())

        # Per-feature standard deviation: sqrt(E[x^2] - E[x]^2).
        std = intermediate.\
            mapValues(lambda value: value ** 2).\
            reduceByKey(lambda x, y: x + y). \
            mapValues(lambda value: value / count). \
            map(lambda pair: (pair[0], pair[1] - mean_dict[pair[0]] ** 2)). \
            mapValues(lambda values: math.sqrt(values)). \
            persist()
        std_dict = dict(std.collect())
        return mean_dict, std_dict

    # First pass: flag points whose largest absolute z-score exceeds 3.
    mean_dict, std_dict = compute_mean_std(data)
    outliers_ = data. \
        mapValues(lambda values: [abs((value - mean_dict[i]) / std_dict[i]) for i, value in enumerate(values)]). \
        mapValues(max). \
        filter(lambda pair: pair[1] > 3). \
        keys().collect()
    outliers_ = set(outliers_)
    data = data.filter(lambda pair: pair[0] not in outliers_).persist()

    # Second pass: recompute the statistics without the outliers and z-score the rest.
    mean_dict, std_dict = compute_mean_std(data)
    standardized_data = data. \
        mapValues(lambda values: [(value - mean_dict[index]) / std_dict[index] for index, value in enumerate(values)]). \
        persist()
    return standardized_data, outliers_
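
# A hypothetical driver for standardize. It assumes a live SparkContext named sc
# and a pair RDD of (point_id, feature_vector) with numeric features; a sketch,
# not part of the original example.
import random

random.seed(0)
points = sc.parallelize(
    [(i, [random.gauss(0.0, 1.0), random.gauss(5.0, 2.0)]) for i in range(100)]
    + [(100, [40.0, 5.0])]        # extreme on the first feature, should exceed z > 3
)
clean, outliers = standardize(points)
print(outliers)                   # e.g. {100}
print(clean.count())              # remaining points, now z-scored per feature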
Example #4
from pyspark import RDD

def remove_stop_words(rdd_in: RDD) -> RDD:
    # Load the stop-word list once (and close the file), then filter it out of the RDD.
    with open(STOP_WORDS_FILENAME, "r") as file:
        sw_list: set = set(file.read().split("\n"))
    return rdd_in.filter(lambda x: x not in sw_list)
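
# A hypothetical driver for remove_stop_words; STOP_WORDS_FILENAME and its contents
# are assumptions made so the sketch is self-contained, and sc is a live SparkContext.
STOP_WORDS_FILENAME = "stopwords.txt"             # assumed path, one word per line
with open(STOP_WORDS_FILENAME, "w") as f:
    f.write("the\na\nan")
tokens = sc.parallelize(["the", "quick", "brown", "fox"])
print(remove_stop_words(tokens).collect())        # e.g. ['quick', 'brown', 'fox']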