Example #1
from pyspark import StorageLevel
from pyspark.shell import sc
from pyspark.sql import Row
from pyspark.sql.functions import lit, split


def iqiyi2tag(data_path, iqiyi_tags):
    iqiyi2tag_df = sc.textFile(data_path).map(lambda x: Row(iqiyi=x)).toDF()
    iqiyi2tag_df = iqiyi2tag_df.withColumn(
        'movie',
        split(iqiyi2tag_df['iqiyi'], ',').getItem(0))
    for i in range(len(iqiyi_tags)):
        iqiyi2tag_df = iqiyi2tag_df.withColumn(
            iqiyi_tags[i],
            split(iqiyi2tag_df['iqiyi'], ',').getItem(i + 1))
    iqiyi2tag_df = iqiyi2tag_df.drop('iqiyi')
    # collect the movie names (first CSV field) on the driver
    iqy_movies = sc.textFile(data_path).map(
        lambda x: Row(iqiyi=x.split(",")[0])).collect()
    iqy = [item[0] for item in iqy_movies]
    # add a constant-0 column for each tag below and persist the DataFrame to disk
    tag_list = ['美容', '母婴育儿', '电影', '搞笑', '健康', '教育', '音乐', '资讯']
    for tag in tag_list:
        iqiyi2tag_df = iqiyi2tag_df.withColumn(tag, lit(0)).persist(
            StorageLevel.DISK_ONLY)
    return iqiyi2tag_df, iqy
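
# Hypothetical invocation: each input line is expected to be CSV with the
# movie name first and the values for iqiyi_tags after it, e.g.
#   df, movies = iqiyi2tag('/data/iqiyi_tag.csv', ['tag1', 'tag2', 'tag3'])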
Example #2
from pyspark.shell import sc
from pyspark.sql import Row
from pyspark.sql.functions import split


def iqiyi2tag(data_path, iqiyi_tags):
    '''
    Read iqy movies and their tags.
    :param data_path: path of the input CSV file
    :param iqiyi_tags: list of tag column names
    :return: (tag DataFrame, list of movie names)
    '''
    iqiyi2tag_df = sc.textFile(data_path).map(lambda x: Row(iqiyi=x)).toDF()
    iqiyi2tag_df = iqiyi2tag_df.withColumn(
        'iqytag',
        split(iqiyi2tag_df['iqiyi'], ',').getItem(0))
    for i in range(len(iqiyi_tags)):
        iqiyi2tag_df = iqiyi2tag_df.withColumn(
            iqiyi_tags[i],
            split(iqiyi2tag_df['iqiyi'], ',').getItem(i + 1))
    iqiyi2tag_df = iqiyi2tag_df.drop('iqiyi')
    iqy_movies = sc.textFile(data_path).map(
        lambda x: Row(iqiyi=x.split(",")[0])).collect()
    iqy = [item[0] for item in iqy_movies]
    return iqiyi2tag_df, iqy
Example #3
from pyspark.shell import sc

from Spark.settings import HDFS_HOST_ADDR, HDFS_HOST_PORT

if __name__ == '__main__':
    text_file = sc.textFile('hdfs://{}:{}{}'.format(
        HDFS_HOST_ADDR, HDFS_HOST_PORT, '/test_data/sparktest.txt'))

    filter_RDD = text_file.filter(lambda line: 'spark' in line)
    filter_RDD.cache()
    counter = filter_RDD.count()
    print('\n\n\n\n\ncounter: ', counter)
Example #4
import time
from pyspark.shell import spark, sc

testFile = "/mnt/sda/Spark/spark-3.0.1-bin-hadoop3.2/RDDTESTFILE.TXT"
print("Test input file = ", testFile)


def printfunc(x):
    print('Word {} occurs {} '.format(x[0], x[1]))


# Initial RDD (rdd_0) creation using API: pyspark.SparkContext.textFile
rdd_00 = sc.textFile(testFile)
rdd_0 = rdd_00.repartition(8)

# pyspark.RDD.flatMap: Return a new RDD (rdd_1) by applying a function to all elements of rdd_0 and then flattening the results
rdd_1 = rdd_0.flatMap(lambda x: x.split())

# pyspark.RDD.map: Return a new RDD (rdd_2) by applying a function to each element of rdd_1
rdd_2 = rdd_1.map(lambda x: (x, 1))

# pyspark.RDD.reduceByKey: Merge the values for each key using an associative and commutative reduce function
rdd_3 = rdd_2.reduceByKey(lambda x, y: x + y)

# pyspark.RDD.toDebugString: A description of this RDD and its recursive dependencies for debugging.
print(rdd_3.toDebugString())

# pyspark.RDD.foreach: Applies a function to all elements of this RDD
rdd_3.foreach(printfunc)
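# Note: foreach runs on the executors, so in cluster mode the printed lines
# show up in the executor logs rather than on the driver's stdout.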

time.sleep(1000)  # keep the application alive for a while (e.g. to inspect the Spark web UI)
Example #5
import sys

from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.regression import LabeledPoint
from pyspark.shell import sc

if __name__ == "__main__":
    input_file = sys.argv[1]
    num_features = int(sys.argv[2])
    # load input files
    print("Loading input file %s ..." % input_file)
    emails = sc.textFile(input_file)

    print("\tTotal number of emails: %i" % emails.count())

    spam_rdd = emails.filter(lambda x: int(x.split("\t")[0]) == 1)
    spam = spam_rdd.map(lambda x: x.split("\t")[1])
    ham_rdd = emails.filter(lambda x: int(x.split("\t")[0]) == 0)
    ham = ham_rdd.map(lambda x: x.split("\t")[1])

    print("\tTotal number of spam emails: %i" % spam.count())
    print("\tTotal number of ham emails: %i" % ham.count())

    # hash words
    print("Hashing words into features ...")
    tf = HashingTF(numFeatures=num_features)
    spam_features = spam.map(lambda email: tf.transform(email.split(" ")))
    ham_features = ham.map(lambda email: tf.transform(email.split(" ")))

    # label the features
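    # NOTE: the original snippet is truncated here; the lines below are a
    # plausible completion following the usual MLlib spam-classifier pattern.
    positive_examples = spam_features.map(
        lambda features: LabeledPoint(1, features))
    negative_examples = ham_features.map(
        lambda features: LabeledPoint(0, features))
    training_data = positive_examples.union(negative_examples)
    training_data.cache()

    # train a Naive Bayes classifier on the labeled data
    model = NaiveBayes.train(training_data, 1.0)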
Example #6
import datetime
import json
import os
import re
import time

# Assumption: OAuth1Session is the one from requests_oauthlib; settings,
# SEARCH_KEYWORD, SEARCH_URL, getLimitStatus and Trends come from the
# surrounding project and are not shown here.
from requests_oauthlib import OAuth1Session
from pyspark.shell import sc


def tweet_search_job():
    session = OAuth1Session(settings.TWITTER['CONSUMER_KEY'],
                            settings.TWITTER['CONSUMER_SECRET'],
                            settings.TWITTER['ACCESS_TOKEN'],
                            settings.TWITTER['ACCESS_TOKEN_SECRET'])

    yesterday = datetime.date.today() - datetime.timedelta(1)
    params = {
        'q': SEARCH_KEYWORD,
        'count': 200,
        'lang': 'ja',
        'until': yesterday.strftime("%Y-%m-%d")
    }

    if os.path.isfile('tweets.txt'):
        os.remove('tweets.txt')
    f = open('tweets.txt', 'w')

    try:
        error_count = 0
        too_many_requests_count = 0
        while True:
            response = session.get(SEARCH_URL, params=params)

            if response.status_code == 503:
                if error_count > 5:
                    raise Exception("Reached the error limit, aborting")

                error_count += 1
                time.sleep(30)
                continue

            if response.status_code == 429:
                if too_many_requests_count >= 7:
                    break

                too_many_requests_count += 1
                sec = int(
                    response.headers['X-Rate-Limit-Reset']) - time.mktime(
                        datetime.datetime.now().timetuple())
                print("{0}---{1} sec sleep".format(
                    datetime.datetime.now().strftime("%Y/%m/%d %H:%M:%S"),
                    sec))
                time.sleep(sec + 5)
                continue

            if response.status_code != 200:
                print("Twitter API Error: %d" % response.status_code)
                print("Twitter API Error: {}".format(response.text))

                time.sleep(30)
                continue

            reset = 0
            limit = response.headers.get('X-Rate-Limit-Remaining', None)
            if limit is None:
                limit, reset = getLimitStatus(session)

            # header values are strings, so cast before comparing
            if int(limit) == 0:
                sec = int(response.headers.get(
                    'X-Rate-Limit-Reset', reset)) - time.mktime(
                        datetime.datetime.now().timetuple())
                time.sleep(sec + 5)
                continue

            error_count = 0

            res_text = json.loads(response.text)
            if len(res_text['statuses']) == 0:
                break

            max_id = ""
            for tweet in res_text['statuses']:
                match = re.search(r'(全員|ふぁぼ|ファボ|定期|相互|RT)', tweet['text'],
                                  re.MULTILINE)
                if match is None:
                    f.write(tweet['text'] + '\n')
                max_id = tweet['id']

            params['max_id'] = max_id

    finally:
        f.close()

    try:
        textfile = sc.textFile("tweets.txt")
        print(textfile.count())

        words = textfile.flatMap(lambda line: line.split())
        words_filter = words.filter(lambda x: SEARCH_KEYWORD not in x)
        words_filter = words_filter.filter(lambda x: "#" not in x)
        words_filter = words_filter.filter(lambda x: len(x) >= 2)

        words_tuple = words_filter.map(lambda word: (word, 1))
        words_count = words_tuple.reduceByKey(lambda a, b: a + b)
        words_count_sorted = words_count.sortBy(lambda t: t[1], False)
        print(words_count_sorted.collect()[:10])

        for ranking in words_count_sorted.collect()[:10]:
            trends = Trends(target_date=yesterday.strftime("%Y-%m-%d"),
                            word=ranking[0],
                            count=ranking[1])
            trends.save()
    finally:
        os.remove('tweets.txt')
Example #7
from pyspark.shell import sc

txt_file = sc.textFile("README.md")

print(*txt_file.collect()[0])

s = txt_file.collect()
print(type(s))

broadcastVar = sc.broadcast(s)

print(broadcastVar.value)  # prints the list of README lines

# A broadcast variable lets the driver ship a small, read-only dataset
# (e.g. a user list) to every executor once, instead of re-sending it
# with each task.
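
# Minimal usage sketch (the word list below is hypothetical): referencing
# broadcastVar.value inside a transformation reuses the copy already shipped
# to each executor.
hits = sc.parallelize(["Spark", "license", "build"]) \
    .filter(lambda w: any(w in line for line in broadcastVar.value)) \
    .collect()
print(hits)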
Example #8
# NOTE: excerpt from inside the main block of a larger Apriori-based
# frequent-itemset script; the imports (sys, SparkSession) and the names
# sc, aprioriF and agg are defined elsewhere in the original file.
    spark = SparkSession\
        .builder\
        .appName("PythonWordCount")\
        .getOrCreate()

    # input part
    importPath = sys.argv[1]
    supportRate = float(sys.argv[2])
    outputPath = sys.argv[3]
    # print(importPath, supportRate, outputPath)

    chunk = 2
    # lines = spark.read.text(importPath).rdd.map(lambda r: r)
    # sc = SparkContext("ee")

    lines = sc.textFile(importPath).map(lambda r: r)
    totalNum = len(lines.collect())
    originallines = lines
    # lines = spark.read.text(sys.argv[1],2).rdd.map(lambda r: r[0])

    ratio = supportRate

    # Stage 1: run Apriori within each partition
    # (note: mapPartitions' second argument is preservesPartitioning, not a partition count)
    lines = lines.mapPartitions(aprioriF, 2)
    # print(type(lines))
    # print(lines.distinct().collect())
    lines = lines.reduce(agg)
    # print(type(lines))
    # print(lines)

    # print(doubtCandidate)
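
The calls above rely on aprioriF and agg, which the excerpt does not show. A minimal sketch of what such helpers might look like, assuming each input line is a comma-separated basket of items (both bodies are hypothetical, including the 0.5 local threshold):

from collections import defaultdict


def aprioriF(partition):
    # Count single items within one partition and yield the locally
    # frequent ones as a set (Apriori stage 1).
    baskets = [line.split(',') for line in partition]
    counts = defaultdict(int)
    for basket in baskets:
        for item in set(basket):
            counts[item] += 1
    threshold = 0.5 * max(len(baskets), 1)
    yield {item for item, c in counts.items() if c >= threshold}


def agg(a, b):
    # Merge two local candidate sets into one global candidate set.
    return a | b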
Example #9
import sys
from functools import reduce

sys.path.append("/home/spark/python/lib")
sys.path.append("/opt/spark/python/lib/py4j-0.10.8.1-src")

from pyspark.shell import sc

path = "/home/hpinto/Desktop/mySpark-data/DimCurrency.csv"
read_file = open(path, 'r')
days = read_file.read()
print(days)

pathw = "/home/hpinto/Desktop/mySpark-data/DimnewCurrency.csv"
wr_file = open(pathw, 'a+')
wr = wr_file.write(days)  # write() returns the number of characters written
print(wr)
wr_file.close()

#Exception: Python in worker has different version 2.7 than that in driver 3.7, PySpark cannot run with different minor versions.Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.

# NOTE: days is a single string here, so map/reduce run character by character
pairs = map(lambda s: (s, 1), days)
counts = reduce(lambda a, b: a + b, days)
print(counts)

# Distribute the data - create an RDD
lines = sc.textFile("/home/hpinto/Desktop/mySpark-data/DimAccounts.csv")

countX = (
    lines.flatMap(lambda x: x.split(' '))  # Create a list with all words
    .map(lambda x: (x, 1))  # Create tuple (word,1)
    .reduceByKey(lambda x, y: x + y))  # reduce by key i.e. the word
#output = countX.take(100)                                 # get the output on local
x1 = countX.take(100)

for (word, count) in x1:  # print output
    print("%s: %i" % (word, count))
Example #10
# -*- coding:utf-8 -*-
"""
    2019/4/15 11:04 by young
"""

from pyspark.shell import sc

lines = sc.textFile('./data.txt')
s = lines.count()
print(s)
Example #11
import pyspark

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.shell import spark, sc

from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors

# Load and parse the data
data = sc.textFile("data/mllib/sample_lda_data.txt")
parsedData = data.map(
    lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))
# Index documents with unique IDs
corpus = parsedData.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()

# Cluster the documents into three topics using LDA
ldaModel = LDA.train(corpus, k=3)

# Output topics. Each is a distribution over words (matching word count vectors)
print("Learned topics (as distributions over vocab of " +
      str(ldaModel.vocabSize()) + " words):")
topics = ldaModel.topicsMatrix()
for topic in range(3):
    print("Topic " + str(topic) + ":")
    for word in range(0, ldaModel.vocabSize()):
        print(" " + str(topics[word][topic]))

# Save and load model
ldaModel.save(