Example #1
import functools
import os
import shutil

from pyspark import rdd


def map_reduce_sentences_to_pos(words: rdd.RDD):
    """
    Run a map-reduce pipeline over a tokenized text RDD: filter the tokens by
    part of speech, extract, lemmatize, and count them, and write the final
    results to a separate output directory for each part of speech.

    Args:
        words: An RDD of nltk-tokenized words with indexing information.
    """

    pos = ["noun", "adjective", "verb", "adverb"]

    cwd = os.getcwd()

    for part in pos:
        # Look up the predicate for this part of speech, e.g. PosChecker.isNoun.
        funcName = "is" + part.capitalize()
        posFunc = getattr(PosChecker, funcName)
        part_words = words.filter(posFunc)
        # Map each matching token to a (word, count) pair and sum the counts.
        extract = functools.partial(extractWord, part)
        part_words = part_words.map(extract)
        word_counts = part_words.reduceByKey(countWord)
        np_counts = word_counts.map(toNpArray)
        # saveAsTextFile refuses to overwrite, so clear any previous output first.
        dirname = part + "_counts.txt"
        foldername = os.path.join(cwd, dirname)
        if os.path.isdir(foldername):
            shutil.rmtree(foldername)
        np_counts.saveAsTextFile(foldername)
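
# A minimal, hypothetical driver for map_reduce_sentences_to_pos. It assumes the
# helpers PosChecker, extractWord, countWord, and toNpArray are defined elsewhere,
# and that the input RDD holds nltk-tagged (token, tag, index) tuples; a sketch,
# not part of the original example.
import nltk
from pyspark import SparkContext

if __name__ == "__main__":
    sc = SparkContext(appName="pos-counts")
    text = "Spark makes distributed word counting reasonably simple."
    # word_tokenize needs the "punkt" NLTK data, pos_tag the "averaged_perceptron_tagger" data.
    tagged = nltk.pos_tag(nltk.word_tokenize(text))              # [(word, tag), ...]
    words = sc.parallelize([(w, t, i) for i, (w, t) in enumerate(tagged)])
    map_reduce_sentences_to_pos(words)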
Example #2
from pyspark import RDD


def occurrence_of_word(word: str, rdd: RDD):
    """
    Count the number of occurrences of a word in a dataset of articles.

    >>> rdd = sc.parallelize([Article("art1", "an example just to try"), Article("art2", "another example")])
    >>> occurrence_of_word("example", rdd)
    2
    >>> occurrence_of_word("another", rdd)
    1

    :param word: the word to look for
    :param rdd: dataset of articles
    :return: the number of occurrences of the word
    """
    # Keep only the articles whose content contains the word, then count them
    # (one per matching article).
    return rdd.filter(lambda a: word in a.content.split(" ")) \
        .aggregate(0, lambda x, y: x + 1, lambda x, y: x + y)
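
# A minimal sketch of the Article type used in the doctest above; the real class
# is not shown in this example, so the field names are assumptions.
from dataclasses import dataclass

@dataclass
class Article:
    title: str
    content: str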
Example #3
import math
from typing import Tuple

from pyspark import RDD


def standardize(data: RDD) -> Tuple[RDD, set]:
    """
    Standardize the input data.

    :param data: pair RDD; the key is the index of the point, the value is the
        list of features of the point
    :return: the standardized data and the set of ids of the outlier points
    """
    def compute_mean_std(data_) -> tuple:
        # Explode every feature vector into (feature_index, value) pairs.
        intermediate = data_. \
            values(). \
            flatMap(lambda features: [(i, feature) for i, feature in enumerate(features)]). \
            persist()

        count = data_.count()
        # Per-feature mean: sum the values for each feature index and divide by the count.
        mean = intermediate.reduceByKey(lambda x, y: x + y).mapValues(
            lambda value: value / count).persist()
        mean_dict = dict(mean.collect())

        # Per-feature standard deviation: sqrt(E[x^2] - E[x]^2).
        std = intermediate.\
            mapValues(lambda value: value ** 2).\
            reduceByKey(lambda x, y: x + y). \
            mapValues(lambda value: value / count). \
            map(lambda pair: (pair[0], pair[1] - mean_dict[pair[0]] ** 2)). \
            mapValues(lambda values: math.sqrt(values)). \
            persist()
        std_dict = dict(std.collect())
        return mean_dict, std_dict

    # First pass: flag points whose largest absolute z-score exceeds 3.
    mean_dict, std_dict = compute_mean_std(data)
    outliers_ = data. \
        mapValues(lambda values: [abs((value - mean_dict[i]) / std_dict[i]) for i, value in enumerate(values)]). \
        mapValues(max). \
        filter(lambda pair: pair[1] > 3). \
        keys().collect()
    outliers_ = set(outliers_)
    data = data.filter(lambda pair: pair[0] not in outliers_).persist()

    # Second pass: recompute the statistics without the outliers and z-score the rest.
    mean_dict, std_dict = compute_mean_std(data)
    standardized_data = data. \
        mapValues(lambda values: [(value - mean_dict[index]) / std_dict[index] for index, value in enumerate(values)]). \
        persist()
    return standardized_data, outliers_
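
# A hypothetical driver for standardize. It assumes a live SparkContext named sc
# and a pair RDD of (point_id, feature_vector) with numeric features; a sketch,
# not part of the original example.
import random

random.seed(0)
points = sc.parallelize(
    [(i, [random.gauss(0.0, 1.0), random.gauss(5.0, 2.0)]) for i in range(100)]
    + [(100, [40.0, 5.0])]        # extreme on the first feature, should exceed z > 3
)
clean, outliers = standardize(points)
print(outliers)                   # e.g. {100}
print(clean.count())              # remaining points, now z-scored per feature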
Example #4
from pyspark import RDD

def remove_stop_words(rdd_in: RDD) -> RDD:
    # Load the stop-word list once (and close the file), then filter it out of the RDD.
    with open(STOP_WORDS_FILENAME, "r") as file:
        sw_list: set = set(file.read().split("\n"))
    return rdd_in.filter(lambda x: x not in sw_list)
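
# A hypothetical driver for remove_stop_words; STOP_WORDS_FILENAME and its contents
# are assumptions made so the sketch is self-contained, and sc is a live SparkContext.
STOP_WORDS_FILENAME = "stopwords.txt"             # assumed path, one word per line
with open(STOP_WORDS_FILENAME, "w") as f:
    f.write("the\na\nan")
tokens = sc.parallelize(["the", "quick", "brown", "fox"])
print(remove_stop_words(tokens).collect())        # e.g. ['quick', 'brown', 'fox']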