Example #1
 def compTF(self, rdd):
     tf = HashingTF(150000)
     return tf.transform(rdd)
Example #2
# coding=UTF-8

from pyspark.mllib.feature import HashingTF, IDF
from pyspark import SparkContext

sentence = "hello hello world"
words = sentence.split()  # split the sentence into a list of words
tf = HashingTF(10000)  # create a HashingTF whose vector size is S = 10,000
aa = tf.transform(words)
print(aa)

# Read several text files in as TF vectors
sc = SparkContext('local')
rdd = sc.wholeTextFiles('P51FeatureExtraction.py').map(
    lambda text: text[1].split())
tfVectors = tf.transform(rdd)  # transform the entire RDD
for v in tfVectors.collect():
    print(v)

# Using TF-IDF in Python
idf = IDF()
idfModel = idf.fit(tfVectors)
tfIdVectors = idfModel.transform(tfVectors)
print(tfIdVectors)
for v in tfIdVectors.collect():
    print(v)

# Scaling vectors in Python
print('-- Scaling vectors in Python --')
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.feature import StandardScaler
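The scaling example breaks off right after these imports; below is a minimal sketch of how StandardScaler is typically used (the sample vectors are an assumption, not part of the original snippet):

vectors = sc.parallelize([Vectors.dense([-2.0, 5.0, 1.0]),
                          Vectors.dense([2.0, 0.0, 1.0])])  # assumed example data
scaler = StandardScaler(withMean=True, withStd=True)  # scale to zero mean and unit variance
model = scaler.fit(vectors)  # first pass: compute per-column means and variances
for v in model.transform(vectors).collect():
    print(v)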
Example #3
def produce_tfidf(x):
    tf = HashingTF().transform(x)
    idf = IDF(minDocFreq=5).fit(tf)
    tfidf = idf.transform(tf)
    return tfidf
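A hedged usage sketch for produce_tfidf; the SparkContext sc, the file name, and the whitespace tokenization are assumptions for illustration:

# one document per line, whitespace-tokenized
docs = sc.textFile("docs.txt").map(lambda line: line.split(" "))
tfidf = produce_tfidf(docs)  # RDD of SparseVectors; terms seen in fewer than 5 documents get IDF 0
print(tfidf.take(1))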
Example #4
TschemaP = sqlContext.createDataFrame(people1, Tschema)
TschemaPeople = TschemaP.withColumn("label",
                                    TschemaP["label"].cast(DoubleType()))

regexTokenizer = RegexTokenizer(inputCol="FileContent",
                                outputCol="words",
                                pattern="\\W")
stopwordsRemover = StopWordsRemover(
    inputCol="words", outputCol="filtered").setStopWords(nltkstop)

regexer = regexTokenizer.transform(TschemaPeople)
stop = stopwordsRemover.transform(regexer)

#tokenizer = Tokenizer(inputCol="filtered", outputCol="words")
#wordsData = tokenizer.transform(stop)
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures")
#hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)

featurizedData = hashingTF.transform(stop)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
seenData = idfModel.transform(featurizedData)

(trainingData1, testData1) = seenData.randomSplit([0.6, 0.4], seed=100)

lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData1)
predictions1 = lrModel.transform(testData1)

predictions1.select("FileContent","label","prediction") \
Example #5
	Words = Row('label', 'words')
	words = reviews.map(lambda r: Words(*r))
	words_df = spark.createDataFrame(words)

	#review tokenization
	token = RegexTokenizer(minTokenLength=2, pattern="[^A-Za-z]+", inputCol="words", outputCol="token", toLowercase=True)
	token_filtered = token.transform(words_df)

	#stopwords elimination
	remover = StopWordsRemover(inputCol="token", outputCol="stopwords", caseSensitive=False)
	stopwords_filtered = remover.transform(token_filtered)

	prep_filtered = (stopwords_filtered.select('stopwords').rdd).map(lambda x: x[0])
	
	#tf-idf calculation
	tf = HashingTF(numFeatures=numFeatures).transform(prep_filtered.map(porter_stem, preservesPartitioning=True))
	idf = IDF().fit(tf)
	train_tfidf = idf.transform(tf)

	#set training data with label
	training = review_labels.zip(train_tfidf).map(lambda x: LabeledPoint(x[0], x[1]))

	#train model classifier
	model = DecisionTree.trainClassifier(training, numClasses=2, categoricalFeaturesInfo={}, impurity='gini', maxDepth=8, maxBins=32)

	#save model to HDFS
	output_dir = "hdfs://VM10-1-0-14:9000/classifier/"+model_name
	model.save(sc, output_dir)

	end = time.time()
	print("Total Records : ", reviews.count(), "  , Processing Time : ", (end - start))
Example #6
File: random2.py  Project: eds-uga/invasion
def main():
    #==============================================================================
    # Specify aws credentials
    #==============================================================================
    sc = SparkContext(conf=SparkConf().setAppName("Random Forest"))
    sqlContext = SQLContext(sc)
    sc._jsc.hadoopConfiguration().set('fs.s3n.awsAccessKeyId', sys.argv[5])
    sc._jsc.hadoopConfiguration().set('fs.s3n.awsSecretAccessKey', sys.argv[6])
    #==============================================================================
    # Specify file paths on s3
    #==============================================================================

    bytePath = sys.argv[1]
    namePath = sys.argv[2]
    nameTestPath = sys.argv[3]
    classPath = sys.argv[4]

    #==============================================================================
    # Section 1: GETTING TF OF .BYTE FILES, O/P OF SECTION :(FILE NAME, TF)
    #
    # Clean all the byte files (removes /r/n and removes the initial number token on each line)
    # Generate tf of all the cleaned byte files(includes both training and testing byte files)
    #==============================================================================

    #O/P: (FILENAME,TEXT)
    docData = sc.wholeTextFiles(
        bytePath,
        25).map(lambda (x, y): (x.encode("utf-8"), y.encode("utf-8")))

    #O/P: (FILENAME, CLEANED TEXT)
    cleanDocData = docData.map(lambda (x, y): (x, clean(y.split())))

    #O/P: HashingTF argument -> 256 + 1 (for "??") features
    x = 16**2 + 1
    hashingTF = HashingTF(x)

    #O/P: (FILENAME,TF)
    tfDocData = cleanDocData.map(lambda (x, y): (x, hashingTF.transform(y)))

    #cache or persist the output of section 1
    tfDocData.persist()

    print tfDocData.take(1)
    #==============================================================================
    # Section 2: GETTING LABELS OF TRAINING DATA , O/P OF SECTION:(FILE NAME,LABEL) OF TRAINING DATA
    #==============================================================================
    #O/P: (INDEX,FILENAME)
    nameData = sc.textFile(
        namePath,
        25).map(lambda x: bytePath + "/" + x + ".bytes").zipWithIndex().map(
            lambda (x, y): (y, x))

    #O/P: (INDEX,LABEL)
    labelData = sc.textFile(
        classPath, 25).zipWithIndex().map(lambda (x, y): (y, str(int(x) - 1)))

    #O/P: (FILENAME,LABEL)
    joinNameLabel = nameData.join(labelData).map(lambda (x, y): y)

    #Cache/persist output of section 2
    joinNameLabel.persist()

    #==============================================================================
    #  Section 3: Join the tf of byte files with labels for analysis and convert into Labelled Point to feed it into classifier.
    #             O/P of section: LabelledPoint(LABEL, TF)
    #==============================================================================
    #O/P: (LABEL,TF)
    joinCleanDocLabel = joinNameLabel.join(tfDocData).map(lambda (x, y): y)

    #O/P: LabelledPoint(LABEL,TF)
    hashData = joinCleanDocLabel.map(lambda
                                     (label, text): LabeledPoint(label, text))

    #Persist/cache the output of section 3
    hashData.persist()

    #==============================================================================
    # Section 4: Apply Random Forest Classifier and generate model on hashData using gini impurity.
    #            Determined heuristically that 50 trees with a depth of 8 give the best accuracy.
    #==============================================================================
    model = RandomForest.trainClassifier(hashData,
                                         numClasses=9,
                                         categoricalFeaturesInfo={},
                                         numTrees=50,
                                         featureSubsetStrategy="auto",
                                         impurity='gini',
                                         maxDepth=8,
                                         maxBins=32)

    #==============================================================================
    # Section 5: Generate test data in the format for prediction. O/P of section: (INDEX,(INDEX,FILENAME,TF))
    #==============================================================================

    #O/P: (INDEX,FILENAME)
    nameTestData = sc.textFile(
        nameTestPath,
        25).map(lambda x: bytePath + "/" + x + ".bytes").zipWithIndex()

    #O/P: (INDEX,FILENAME,TF)
    joinTestDocLabel = nameTestData.join(tfDocData).map(lambda (x, y):
                                                        (x, y[0], y[1]))

    #O/P: (INDEX,(INDEX,FILENAME,TF))
    joinTestDocLabel1 = joinTestDocLabel.zipWithIndex().map(lambda (x, y):
                                                            (y, x))

    #==============================================================================
    # Section 6: Prediction of Labels and saving the output in a RDD which is saved as text file on s3.
    #==============================================================================
    #O/P: Predictions
    prediction = model.predict(joinTestDocLabel1.map(lambda (x, (y, z, w)): w))
Example #7
    print x


# Boilerplate Spark stuff:
conf = SparkConf().setMaster("local[*]").setAppName("SparkTFIDF")
sc = SparkContext(conf=conf)

# Load documents (one per line).
rawData = sc.textFile("F:/Matakuliah/Semester 6/Big Data/Berita.csv")
fields = rawData.map(lambda x: x.split(";"))
documents = fields.map(lambda x: x[4].split(" "))

documentId = fields.map(lambda x: x[0])

# Creating Hash table and TF table
hashingTF = HashingTF(100000)
tf = hashingTF.transform(documents)

# Creating idf
tf.cache()
idf = IDF(minDocFreq=1).fit(tf)

# Calculate TF/IDF
tfidf = idf.transform(tf)

# Convert the search keyword to its hash value using the hash table above

keywordTF = hashingTF.transform(["pria"])
keywordHashValue = int(keywordTF.indices[0])

# Find its relevance against the TF-IDF table built above
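The example is cut off before the lookup itself; a minimal sketch of the step the comment describes, following the same pattern as Example #11 below:

keywordRelevance = tfidf.map(lambda x: x[keywordHashValue])  # TF-IDF weight of "pria" in each document
zippedResults = keywordRelevance.zip(documentId)  # pair each score with its document id
print(zippedResults.max())  # the document where the keyword scores highest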
Example #8
    obj1 = TweetPreProcessing()
    tweet = streamData.map(lambda x: x[1]) \
                      .map(decodeUnicode)\
                      .flatMap(obj1.TweetBuilder)

    #RETRIEVING TWEET's TEXT and LABEL
    # ZIPPING EACH TWEET WITH UNIQUE ID
    label = tweet.map(lambda tup: tup[0]) \
        .transform(lambda x: x.zipWithUniqueId()) \
        .map(lambda line: (line[1], int(line[0])))
    # int() casting string 'label' to int

    text = tweet.map(lambda tup: tup[1])

    #computing TF-IDF for each tweet and classifying it
    hashingTF = HashingTF(tf_val)
    tfidf_testing = text.map(lambda tup: hashingTF.transform(tup)) \
                    .transform(lambda tup: idf_training.transform(tup))

    tweet_classified = tfidf_testing.map(lambda p: int(NBM.predict(p)))\
                                            .transform(lambda p: p.zipWithUniqueId()) \
                                            .map(lambda line: (line[1], line[0]))
                                            # .pprint()

    # Here the ground truth and the predicted class are joined
    # so, for each tweet we have the following structure:
    # (class_predicted, ground truth) i.e. (4,0),(0,0)
    result = label.join(tweet_classified) \
            .map(lambda tup: tup[1]) \
            .foreachRDD(jdbcInsert)
Example #9
    # $example on$
    # Load documents (one per line).
    # documents = sc.textFile("*.txt").map(lambda line: line.split(" "))
    documents = spark.read.text("*.txt")
    documents = documents.withColumn(
        "doc_id",
        F.row_number().over(Window.orderBy('value')))
    documents.printSchema()

    # creating tokens/words from the sentence data
    tokenizer = Tokenizer(inputCol="value", outputCol="words")
    wordsData = tokenizer.transform(documents)

    # applying tf on the words data
    hashingTF = HashingTF(inputCol="words",
                          outputCol="rawFeatures",
                          numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)

    # calculating the IDF
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    # displaying the results
    rescaledData.select("doc_id", "features").show(truncate=False)

    # closing the spark session
    spark.stop()

    # hashingTF = HashingTF()
Example #10
def main(sc, db, tracking_word):
    print('>' * 30 + 'SPARK START' + '>' * 30)

    hashingTF = HashingTF()
    iDF = IDF()

    # Initialize sparksql context
    # Will be used to query the trends from the result.
    sqlContext = SQLContext(sc)
    # Initialize spark streaming context with a batch interval of 10 sec,
    # The messages would accumulate for 10 seconds and then get processed.
    ssc = StreamingContext(sc, batch_interval)

    # Receive the tweets
    host = socket.gethostbyname(socket.gethostname())
    # Create a DStream that represents streaming data from TCP source
    socket_stream = ssc.socketTextStream(STREAM_HOST, STREAM_PORT)
    lines = socket_stream.window(window_time)

    # Construct tables
    tmp = [('none', 0)]
    related_keywords_df = sqlContext.createDataFrame(tmp, ['Keyword', 'Count'])
    related_hashtags_df = sqlContext.createDataFrame(tmp, ['Hashtag', 'Count'])

    trans_table = str.maketrans('', '', ',.!?:;"@&()#.-\\/+')
    # reassign so the cleaned, lower-cased lines are actually used downstream
    lines = lines.map(lambda line: line.translate(trans_table).lower())

    # 1) Count the number of tweets
    tweet_cnt_li = tweet_count(lines)

    # 2) Count the number of users
    user_cnt_li = user_count(lines)

    # 3) Find the related keywords
    related_keywords(lines)

    # 4) Find the related hashtags
    related_hashtags(lines)

    # 5) Sentiment analysis
    pos_cnt_li = sentiment_analysis(lines, hashingTF, iDF)

    ###########################################################################
    # Start the streaming process
    ssc.start()

    process_cnt = 0
    start_time = [datetime.datetime.now()]
    # print("Here!!!", process_times, process_cnt)

    while process_cnt < process_times:
        time.sleep(window_time)
        start_time.append(datetime.datetime.now())
        # Find the top related keywords

        if len(sqlContext.tables().filter(
                "tableName LIKE 'related_keywords_tmp'").collect()) == 1:
            top_words = sqlContext.sql(
                'Select Keyword, Count from related_keywords_tmp')
            related_keywords_df = related_keywords_df.unionAll(top_words)
            related_keywords_tb = True
        else:
            related_keywords_tb = False

        # Find the top related hashtags
        if len(sqlContext.tables().filter(
                "tableName LIKE 'related_hashtags_tmp'").collect()) == 1:
            top_hashtags = sqlContext.sql(
                'Select Hashtag, Count from related_hashtags_tmp')
            related_hashtags_df = related_hashtags_df.unionAll(top_hashtags)

        process_cnt += 1

    # Final tables
    if related_keywords_tb:
        related_keywords_df = related_keywords_df.filter(
            related_keywords_df['Keyword'] != 'none')
        # Spark SQL to Pandas Dataframe
        related_keywords_pd = related_keywords_df.toPandas()
        related_keywords_pd = related_keywords_pd[
            related_keywords_pd['Keyword'] != tracking_word]
        related_keywords_pd = related_keywords_pd.groupby(
            related_keywords_pd['Keyword']).sum()
        related_keywords_pd = pd.DataFrame(related_keywords_pd)
        related_keywords_pd = related_keywords_pd.sort_values(
            "Count", ascending=0).iloc[0:min(9, related_keywords_pd.shape[0])]

    # Spark SQL to Pandas Dataframe
    related_hashtags_pd = related_hashtags_df.toPandas()
    related_hashtags_pd = related_hashtags_pd[
        related_hashtags_pd['Hashtag'] != '#' + tracking_word]
    related_hashtags_pd = related_hashtags_pd.groupby(
        related_hashtags_pd['Hashtag']).sum()
    related_hashtags_pd = pd.DataFrame(related_hashtags_pd)
    related_hashtags_pd = related_hashtags_pd.sort_values(
        "Count", ascending=0).iloc[0:min(9, related_hashtags_pd.shape[0])]

    ssc.stop()
    ###########################################################################

    print(">>>tweet_cnt_li:")
    print(tweet_cnt_li)
    print(">>>user_cnt_li:")
    user_cnt_len_li = len(user_cnt_li)
    print(user_cnt_len_li)
    print(">>>start_time:")
    print(start_time)
    print(">>>pos_cnt_li")
    print(pos_cnt_li)
    print(">>>related_keywords_tb")
    print(related_keywords_tb)
    # print(related_keywords_pd.head(10))
    # print(related_hashtags_pd.head(10))
    if related_keywords_tb:
        related_keywords_js = json.loads(
            related_keywords_pd.reset_index().to_json(orient='records'))
    else:
        related_keywords_js = None
    # print(related_keywords_js)
    related_hashtags_js = json.loads(
        related_hashtags_pd.reset_index().to_json(orient='records'))
    # print(related_hashtags_js)

    # Store the data to MongoDB
    data_to_db(db, start_time, tweet_cnt_li, user_cnt_len_li,
               related_keywords_js, related_hashtags_js, pos_cnt_li,
               tracking_word, related_keywords_tb)

    print('>' * 30 + 'SPARK STOP' + '>' * 30)
Example #11
# Databricks notebook source exported at Thu, 23 Jun 2016 07:23:39 UTC
from pyspark import SparkConf, SparkContext
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.feature import IDF
rawData = sc.textFile(
    "/FileStore/tables/dp736dao1466664806758/subset_small-50f68.tsv")
fields = rawData.map(lambda x: x.split("\t"))
documents = fields.map(lambda x: x[3].split(" "))

#Document names
documentNames = fields.map(lambda x: x[1])

#hash the word in document to their term frequencies
hashingtf = HashingTF(100000)  #to save memory
tf = hashingtf.transform(
    documents)  # each value ->term frequency of unique hash value

#calculating tf*idf score
idf = IDF(minDocFreq=2).fit(tf)
tfidf = idf.transform(
    tf)  # each value ->tf*idf of unique hash value of each document

#Test
gettysBurgTF = hashingtf.transform(["Gettysburg"])
gettysburgHashValue = int(gettysBurgTF.indices[0])

gettysburgRelevance = tfidf.map(lambda x: x[gettysburgHashValue])
zippedResults = gettysburgRelevance.zip(documentNames)

#print best result
print zippedResults.max()
Example #12
    def import_data(self):

        # meta df
        meta_df = pd.read_csv(self.metadata_path, dtype={
            'pubmed_id': str,
            'Microsoft Academic Paper ID': str,
            'doi': str
        })

        # json
        all_json = glob.glob(f"{self.DEFAULT_INPUT_PATH}/**/*.json", recursive=True)

        dict_ = {'paper_id': [], 'doi': [], 'abstract': [], 'body_text': [], 'authors': [], 'title': [], 'journal': [],
                 'abstract_summary': []}
        for idx, entry in enumerate(all_json):
            if idx % (len(all_json) // 10) == 0:
                print(f'Processing index: {idx} of {len(all_json)}')

            try:
                content = FileReader(entry)
            except Exception as e:
                continue  # invalid paper format, skip

            # get metadata information
            meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
            # no metadata, skip this paper
            if len(meta_data) == 0:
                continue

            dict_['abstract'].append(content.abstract)
            dict_['paper_id'].append(content.paper_id)
            dict_['body_text'].append(content.body_text)

            # also create a column for the summary of abstract to be used in a plot
            if len(content.abstract) == 0:
                # no abstract provided
                dict_['abstract_summary'].append("Not provided.")
            elif len(content.abstract.split(' ')) > 100:
                # abstract provided is too long for plot, take first 100 words append with ...
                info = content.abstract.split(' ')[:100]
                summary = self.get_breaks(' '.join(info), 40)
                dict_['abstract_summary'].append(summary + "...")
            else:
                # abstract is short enough
                summary = self.get_breaks(content.abstract, 40)
                dict_['abstract_summary'].append(summary)

            # get metadata information
            meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]

            try:
                # if more than one author
                authors = meta_data['authors'].values[0].split(';')
                if len(authors) > 2:
                    # if more than 2 authors, take them all with html tag breaks in between
                    dict_['authors'].append(self.get_breaks('. '.join(authors), 40))
                else:
                    # authors will fit in plot
                    dict_['authors'].append(". ".join(authors))
            except Exception as e:
                # if only one author - or Null value
                dict_['authors'].append(meta_data['authors'].values[0])

            # add the title information, add breaks when needed
            try:
                title = self.get_breaks(meta_data['title'].values[0], 40)
                dict_['title'].append(title)
            # if title was not provided
            except Exception as e:
                dict_['title'].append(meta_data['title'].values[0])

            # add the journal information
            dict_['journal'].append(meta_data['journal'].values[0])

            # add doi
            dict_['doi'].append(meta_data['doi'].values[0])

        df_covid = pd.DataFrame(dict_,
                                columns=['paper_id', 'doi', 'abstract', 'body_text', 'authors', 'title', 'journal',
                                         'abstract_summary'])
        df_covid['abstract_word_count'] = df_covid['abstract'].apply(
            lambda x: len(x.strip().split()))  # word count in abstract
        df_covid['body_word_count'] = df_covid['body_text'].apply(
            lambda x: len(x.strip().split()))  # word count in body
        df_covid['body_unique_words'] = df_covid['body_text'].apply(
            lambda x: len(set(str(x).split())))  # number of unique words in body

        # remove duplicates
        df_covid.drop_duplicates(['abstract', 'body_text'], inplace=True)
        df_covid['abstract'].describe(include='all')
        df_covid.dropna(inplace=True)

        # handle multiple languages
        # set seed
        DetectorFactory.seed = 0

        # hold label - language
        languages = []

        # go through each text
        for ii in tqdm(range(0, len(df_covid))):
            # split by space into a list, take the first x items, join with space
            text = df_covid.iloc[ii]['body_text'].split(" ")

            lang = "en"
            try:
                if len(text) > 50:
                    lang = detect(" ".join(text[:50]))
                elif len(text) > 0:
                    lang = detect(" ".join(text[:len(text)]))
            # ugh... the beginning of the document was not in a good format
            except Exception as e:
                all_words = set(text)
                try:
                    lang = detect(" ".join(all_words))
                # what!! :( let's see if we can find any text in abstract...
                except Exception as e:

                    try:
                        # let's try to label it through the abstract then
                        lang = detect(df_covid.iloc[ii]['abstract_summary'])
                    except Exception as e:
                        lang = "unknown"
                        pass

            # get the language
            languages.append(lang)

        languages_dict = {}
        for lang in set(languages):
            languages_dict[lang] = languages.count(lang)

        df_covid['language'] = languages
        # drop
        df_covid = df_covid[df_covid['language']=='en']


        # change to spark
        # Enable Arrow-based columnar data transfers
        spark = SparkSession \
            .builder \
            .appName("PySparkKMeans") \
            .config("spark.some.config.option", "some-value") \
            .getOrCreate()
        spark.conf.set("spark.sql.execution.arrow.enabled", "true")

        # Create a Spark DataFrame from a pandas DataFrame using Arrow
        df_english = spark.createDataFrame(df_covid)
        clean_text_df = df_english.withColumn("text", self.clean_text(col("body_text")))

        tokenizer = Tokenizer(inputCol="text", outputCol="vector")
        vector_df = tokenizer.transform(clean_text_df)


        # remove stopwords
        punctuations = string.punctuation
        stopwords = list(STOP_WORDS)
        stopwords[:10]

        custom_stop_words = [
            'doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure',
            'rights', 'reserved', 'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.',
            'al.', 'elsevier', 'pmc', 'czi', 'www', "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
            "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"
        ]

        for w in custom_stop_words:
            if w not in stopwords:
                stopwords.append(w)

        # Define a list of stop words or use default list
        remover = StopWordsRemover(stopWords=stopwords)

        # Specify input/output columns
        remover.setInputCol("vector")
        remover.setOutputCol("vector_no_stopw")

        # Transform existing dataframe with the StopWordsRemover
        vector_no_stopw_df = remover.transform(vector_df)



        # tf-idf
        hashingTF = HashingTF()
        # mllib's HashingTF expects an RDD of token lists, so extract the column from the DataFrame
        tf = hashingTF.transform(
            vector_no_stopw_df.select("vector_no_stopw").rdd.map(lambda row: row[0]))

        # While applying HashingTF only needs a single pass to the data, applying IDF needs two passes:
        # First to compute the IDF vector and second to scale the term frequencies by IDF.
        tf.cache()
        idf = IDF().fit(tf)
        tfidf = idf.transform(tf)

        # PCA
        mat = RowMatrix(tfidf)
        # Compute the top principal components (1,325 of them here).
        # Principal components are stored in a local dense matrix.
        pc = mat.computePrincipalComponents(1325)

        # Project the rows onto the linear space spanned by those principal components.
        projected = mat.multiply(pc)
        # RowMatrix has no toPandas(); collect the projected rows into a pandas DataFrame before writing
        pd.DataFrame(projected.rows.map(lambda v: v.toArray()).collect()).to_csv(f"{self.DEFAULT_OUTPUT_FILE}")

        return projected
Example #13
File: nbv1.py  Project: eds-uga/invasion
def main():
    #==============================================================================
    # Specifying paths
    #==============================================================================
    sc = SparkContext(conf=SparkConf().setAppName("Random Forest"))
    sqlContext = SQLContext(sc)
    bytePath = "/Users/priyanka/Desktop/project2files/train"
    byteTestPath = "/Users/priyanka/Desktop/project2files/test"
    namePath = "/Users/priyanka/Desktop/X_train_small.txt"
    nameTestPath = "/Users/priyanka/Desktop/X_test_small.txt"
    classPath = "/Users/priyanka/Desktop/y_train_small.txt"
    classTestPath = "/Users/priyanka/Desktop/y_test_small.txt"

    #==============================================================================
    # SECTION 1: GETTING TF OF .BYTE FILES, O/P OF SECTION :(FILE NAME, TF)
    #==============================================================================
    #O/P :(FILE NAME,TEXT)
    docData = sc.wholeTextFiles(
        bytePath,
        25).map(lambda (x, y): (x.encode("utf-8"), y.encode("utf-8")))
    print("docData done")
    docData.take(1)
    #clean docData here - remove 1st word from line and remove /r/n
    #O/P: (FILE NAME, CLEAN TEXT)
    cleanDocData = docData.map(lambda (x, y): (x, clean(y.split())))

    x = 16**2 + 1
    hashingTF = HashingTF(x)

    #O/P: (FILE NAME, TF)
    tfDocData = cleanDocData.map(lambda (x, y): (x, hashingTF.transform(y)))
    tfDocData.take(1)
    #==============================================================================
    # Section 2:GETTING LABELS , O/P OF SECTION:(FILE NAME,LABEL)
    #==============================================================================
    #Output format : (INDEX, FILE NAME)
    nameData = sc.textFile(
        namePath, 25).map(lambda x: "file:" + bytePath + "/" + x + ".bytes"
                          ).zipWithIndex().map(lambda (x, y): (y, x))

    #O/P:(INDEX,LABEL)
    labelData = sc.textFile(
        classPath, 25).zipWithIndex().map(lambda (x, y): (y, str(int(x) - 1)))

    #O/P :(FILE NAME,LABEL)
    joinNameLabel = nameData.join(labelData).map(lambda (x, y): y)

    #==============================================================================
    # Section 3:Get Data and generate Labelled Point O/P: (LABEL,TF)
    #==============================================================================
    #O/P:(LABEL,TF)
    joinCleanDocLabel = joinNameLabel.join(tfDocData).map(lambda (x, y): y)

    #O/P: Labelled Point(LABEL,TF)
    hashData = joinCleanDocLabel.map(lambda
                                     (label, text): LabeledPoint(label, text))

    #==============================================================================
    # Section 4: Build Classification Model:
    #            Applying Random Forest Classification Model on training data
    #==============================================================================
    #model = RandomForest.trainClassifier(hashData, numClasses=9, categoricalFeaturesInfo={},numTrees=50, featureSubsetStrategy="auto",impurity='gini', maxDepth=8, maxBins=32)

    model = NaiveBayes.train(hashData)
    #==============================================================================
    # Section 5: TEST : GETTING TF OF .BYTE FILES, O/P OF SECTION :(FILE NAME, TF)
    #==============================================================================

    docTestData = sc.wholeTextFiles(
        byteTestPath,
        25).map(lambda (x, y): (x.encode("utf-8"), y.encode("utf-8")))
    #docTestData.take(1)

    cleanDocTestData = docTestData.map(lambda (x, y): (x, clean(y.split())))
    tfDocTestData = cleanDocTestData.map(lambda (x, y):
                                         (x, hashingTF.transform(y)))

    #==============================================================================
    # Section 6: TEST: GETTING LABELS , O/P OF SECTION:(FILE NAME,LABEL)
    #==============================================================================
    nameTestData = sc.textFile(
        nameTestPath,
        25).map(lambda x: "file:" + byteTestPath + "/" + x + ".bytes"
                ).zipWithIndex().map(lambda (x, y): (y, x))
    labelTestData = sc.textFile(
        classTestPath,
        25).zipWithIndex().map(lambda (x, y): (y, str(int(x) - 1)))
    joinTestNameLabel = nameTestData.join(labelTestData).map(lambda (x, y): y)

    #==============================================================================
    # Section 7: TEST: O/P: (FILE, LABEL, TF)
    #==============================================================================
    #O/P:(FILE,LABEL,TF)
    joinTestDocLabel = joinTestNameLabel.join(tfDocTestData).map(
        lambda (x, y): (x, y[0], y[1]))

    #O/P:(INDEX,(FILE,LABEL,TF))
    joinTestDocLabel1 = joinTestDocLabel.zipWithIndex().map(lambda (x, y):
                                                            (y, x))
    #Predict on Sparse vector tf
    prediction = model.predict(joinTestDocLabel1.map(lambda (x, (y, z, w)): w))
Example #14
        else:
            return ''
    else:
        return ""
data = data.map(combine).filter(lambda x:len(x)>0) # returns records shaped like [index, content, title]; preprocessing complete

#####################################################
################# Step 2 TF-IDF #####################
#####################################################

from pyspark.mllib.feature import HashingTF, IDF

# TF part
tf = HashingTF(50000)   # use 50,000 dimensions
vectors = data.map(lambda line:(line[0],line[2],tf.transform(line[1]))) # shaped like [index, title, tf]

# IDF part
vec = vectors.map(lambda line: line[2]) # keep only the tf vectors
idf = IDF()
idfmodel = idf.fit(vec)
tfIdfVectors = idfmodel.transform(vec) # the tf-idf result
tfIdfVectors.cache()    # cache the tf-idf result; this completes the tf-idf step

#####################################################
################## Step 3 SVD #######################
#####################################################

# Compute the SVD
from pyspark.mllib.linalg.distributed import RowMatrix
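The example breaks off after the import; a minimal sketch of the SVD step announced by the Step 3 header (the rank k=10 is an assumption; the Python computeSVD API requires Spark 2.2+):

mat = RowMatrix(tfIdfVectors)            # distributed row matrix over the TF-IDF vectors
svd = mat.computeSVD(10, computeU=True)  # top-10 singular values and vectors
print(svd.s)  # singular values; svd.U is a RowMatrix, svd.V is a local dense matrix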
Example #15
spark = SparkSession(sc)

df = spark.read.csv('hdfs://192.168.100.6:9000/user/ubuntu/Dataset75.csv',
                    header=True)

data = df.rdd.map(list)
print(data.first())

score = data.map(lambda s: 1.0
                 if s[1].isdigit() and float(s[1]) == 1.0 else 0.0)
comment = data.map(lambda s: s[3])

print(score.count())
print(comment.count())

tf = HashingTF()
tfVectors = tf.transform(comment).cache()

idf = IDF()
idfModel = idf.fit(tfVectors)
tfIdfVectors = idfModel.transform(tfVectors)

#print(tfIdfVectors.take(3))
# Use the RDD zip operator to join these two parts and convert them into the LabeledPoint type used by the classification model
zip_score_comment = score.zip(tfIdfVectors)
final_data = zip_score_comment.map(lambda line: LabeledPoint(line[0], line[1]))
train_data, test_data = final_data.randomSplit([0.8, 0.2], seed=0)
print(train_data.take(1))

time_start = time.time()
#SVMModel = SVMWithSGD.train(train_data,iterations=100)
Example #16
def main(sc):

    data = sc.textFile('data/train.txt').map(parseLine)
    #print(data.take(10))

    # Train/Test split
    training, test = data.randomSplit([0.7, 0.3], seed=0)

    # TF-IDF
    # TF
    # Features will be hashed to indexes
    # And the feature(term) frequencies will be calculated
    hashingTF = HashingTF()
    # For each training example
    tf_training = training.map(lambda tup: hashingTF.transform(tup[1]))
    # IDF
    # Compute the IDF vector
    idf_training = IDF().fit(tf_training)
    # Scale the TF by IDF
    tfidf_training = idf_training.transform(tf_training)

    # (SparseVector(1048576, {110670: 1.5533, ...), 0)
    tfidf_idx = tfidf_training.zipWithIndex()
    # (['The', 'Da', 'Vinci', 'Code', 'book', 'is', 'just', 'awesome.'], 0)
    training_idx = training.zipWithIndex()

    # Reverse the index and the SparseVector
    idx_training = training_idx.map(lambda line: (line[1], line[0]))
    idx_tfidf = tfidf_idx.map(lambda l: (l[1], l[0]))
    #print(idx_training.take(10))

    # rdd.join: (K,V).join(K,W) -> (K, (V,W))
    # idx_tfidf has no info about labels (0/1)
    # but idx_training has
    joined_tfidf_training = idx_training.join(idx_tfidf)
    training_labeled = joined_tfidf_training.map(lambda tup: tup[1])
    training_labeled = training_labeled.map(
        lambda x: LabeledPoint(x[0][0], x[1]))
    #print(training_labeled.take(10))

    # Train a naive Bayes model
    model = NaiveBayes.train(training_labeled, 1.0)

    # Test the model
    tf_test = test.map(lambda tup: hashingTF.transform(tup[1]))
    idf_test = IDF().fit(tf_test)

    tfidf_test = idf_test.transform(tf_test)
    tfidf_idx = tfidf_test.zipWithIndex()
    test_idx = test.zipWithIndex()
    idx_test = test_idx.map(lambda line: (line[1], line[0]))
    idx_tfidf = tfidf_idx.map(lambda l: (l[1], l[0]))
    joined_tfidf_test = idx_test.join(idx_tfidf)

    test_labeled = joined_tfidf_test.map(lambda tup: tup[1])
    labeled_test_data = test_labeled.map(lambda k: LabeledPoint(k[0][0], k[1]))
    #print(labeled_test_data.take(2))
    # Apply the trained model on Test data
    predictionAndLabel = labeled_test_data.map(
        lambda p: (model.predict(p.features), p.label))
    #print(predictionAndLabel.take(10))

    # Calculate the accuracy
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda x: x[0] == x[1]).count() / labeled_test_data.count()

    print('>>> Accuracy')
    print(accuracy)

    #model.save(sc, '/model')
    output = open('src/model/model.ml', 'wb')
    pickle.dump(model, output)
Example #17
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel

from pyspark import SparkContext, SparkConf
conf = SparkConf().setMaster("local[*]").setAppName("Naive_Bayes")
sc = SparkContext(conf=conf)
print "Running Spark Version %s" % (sc.version)

#word to vector space converter, limit to 10000 words
htf = HashingTF(10000)

#let 1 - positive class, 0 - negative class
#tokenize sentences and transform them into vector space model

positiveData = sc.textFile("Positive.txt")
posdata = positiveData.map(lambda text: LabeledPoint(
    1,
    htf.transform(
        text.replace(',', '').replace('.', '').replace('-', '').replace(
            '?', '').replace('!', ' ').lower().split(" "))))
print "No. of Positive Sentences: " + str(posdata.count())
posdata.persist()

negativeData = sc.textFile("Negative.txt")
negdata = negativeData.map(lambda text: LabeledPoint(
    0,
    htf.transform(
        text.replace(',', '').replace('.', '').replace('-', '').replace(
            '?', '').replace('!', ' ').lower().split(" "))))
print "No. of Negative Sentences: " + str(negdata.count())
Example #18
from pyspark.mllib.feature import IDF

# Spark configuration
conf = SparkConf().setMaster("local").setAppName("SparkTFIDF")
sc = SparkContext(conf=conf)

# Load the documents
rawData = sc.textFile("e:/sundog-consult/Udemy/DataScience/subset-small.tsv")
fields = rawData.map(lambda x: x.split("\t"))
documents = fields.map(lambda x: x[3].split(" "))

# Extract the document names
documentNames = fields.map(lambda x: x[1])

# Compute the TF. Each word in each document is encoded as a hash value.
hashingTF = HashingTF(100000)  # upper bound on the number of hash buckets
tf = hashingTF.transform(documents)

# Compute the IDF
tf.cache()
idf = IDF(minDocFreq=2).fit(tf)

# Compute TF*IDF for each word in each document
tfidf = idf.transform(tf)

# We already have the RDD as sparse vectors (https://spark.apache.org/docs/latest/mllib-data-types.html).
# Each TFxIDF value is stored per document, keyed by its hash value.

# The source data contains the "Abraham Lincoln" article, so let's search for the word "Gettysburg" (the place where Lincoln gave his famous speech).

# Get the hash value of "Gettysburg".
Example #19
    file.write("- Positive : " + str(num_pos_entropy) + "\n")
    file.write("- Negative : " + str(num_neg_entropy) + "\n")

    ###########################################################################
    #########           Testing on Brexit Labeled Data                #########

    print("\n========= Test on Brexit labeled data ========= ")

    text_negative_brexit = sc.textFile("data/brexit_negatif_clean.csv")
    text_positive_brexit = sc.textFile("data/brexit_positif_clean.csv")

    test_text_brexit = text_negative_brexit.union(text_positive_brexit)
    test_tlabels_brexit = text_negative_brexit.map(lambda x: 0.0).union(
        text_positive_brexit.map(lambda x: 1.0))

    tf_test_brexit = HashingTF(numFeatures=100000).transform(
        test_text_brexit.map(lambda x: x))

    tfidf_test_brexit = idf.transform(tf_test_brexit)

    #decision tree entropy
    labeled_prediction_entropy = test_tlabels_brexit.zip(
        model_decision_tree_entropy.predict(tfidf_test_brexit)).map(
            lambda x: {
                "actual": x[0],
                "predicted": x[1]
            })
    accuracy_entropy = 1.0 * labeled_prediction_entropy.filter(
        lambda doc: doc["actual"] == doc['predicted']).count(
        ) / labeled_prediction_entropy.count()

    print('\n== ACCURACY DT ENTROPY : ', accuracy_entropy, '==')
Example #20
from pyspark.mllib.feature import IDF

# Boilerplate Spark stuff:
conf = SparkConf().setMaster("local").setAppName("SparkTFIDF")
sc = SparkContext(conf=conf)

# Loading documents (one per line).
rawData = sc.textFile("C:/Users/shatak/Desktop/shatak3rd/subset-small.tsv")
fields = rawData.map(lambda x: x.split("\t"))
documents = fields.map(lambda x: x[3].split(" "))

# Store the document names for later:
documentNames = fields.map(lambda x: x[1])

#hashing the words in each document to their term frequencies:
hashingTF = HashingTF(100000)  #100K hash buckets just to save some memory
tf = hashingTF.transform(documents)

# we have an RDD of sparse vectors representing each document,

# computing the TF*IDF of each term in each document:
tf.cache()
idf = IDF(minDocFreq=2).fit(tf)
tfidf = idf.transform(tf)

# we have an RDD of sparse vectors, where each value is the TFxIDF
# of each unique hash value for each document.

# the article for "Abraham Lincoln" is in our data
# set, so let's search for "Gettysburg" (the site of Lincoln's famous speech):
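The snippet breaks off here; a minimal sketch of the search step, again following the pattern of Example #11:

gettysburgTF = hashingTF.transform(["Gettysburg"])
gettysburgHashValue = int(gettysburgTF.indices[0])
print(tfidf.map(lambda x: x[gettysburgHashValue]).zip(documentNames).max())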
Example #21
        )
        #print(hashtag_counts_df.show())
        analysis_type = 'hashtag_analysis'
        send_df_to_dashboard(hashtag_counts_df, analysis_type)
    except:
        e = sys.exc_info()[0]
        print("There is an error: %s" % e)


conf = SparkConf()
conf.setAppName("TwitterStreamApp")
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
ssc = StreamingContext(sc, 3)
ssc.checkpoint("checkpoint_models")
htf = HashingTF(50000)
NB_output_dir = '/spark/NaiveBayes'
NB_load_model = NaiveBayesModel.load(sc, NB_output_dir)

# Sentiment Analysis #

## 01 read tweets from stream ##
dataStream = ssc.socketTextStream("localhost", 9009)
## 02 split the text into words ##
words = dataStream.map(lambda x: x.split(" "))
## 03 transform the words into features ##
features = words.map(lambda x: htf.transform(x))
## 04 predict the sentiment ##
prediction = features.map(lambda x: classify(x))
## 05 label the sentiments ##
label_sentiments = prediction.map(lambda x: ('positive', 1)
Example #22
File: text_rdd.py  Project: oliyura/UANLP
 def get_tfidf(idf, sentence: List[str]) -> SparseVector:
     tf = HashingTF().transform(sentence)
     return idf.transform(tf)
Example #23
    g = Goose()
    article = g.extract(url=url)
    a = article.cleaned_text
    html_dict = []
    tokenhtml = tokenize(a)
    print(tokenhtml)
    body = ''
    for i in range(0, len(tokenhtml)):
        body += tokenhtml[i] + ' '
    html_dict.append({"label": "0", "text": body})

    sc = SparkContext()
    htmldata = sc.parallelize(html_dict)
    labels = htmldata.map(lambda doc: doc["label"], preservesPartitioning=True)

    tf = HashingTF().transform(
        htmldata.map(lambda doc: doc["text"], preservesPartitioning=True))
    idf = IDF().fit(tf)
    tfidf = idf.transform(tf)
    end_tfidf = datetime.now()
    tfidf_time = format(end_tfidf - start_tfidf)

    dataset = labels.zip(tfidf).map(lambda x: LabeledPoint(x[0], x[1]))
    sameModel = NaiveBayesModel.load(
        sc, "/Users/apple/Dropbox/2016Spring/COSC526/MacHW1/mymodel")
    start_predict = datetime.now()
    predictionAndLabel = dataset.map(lambda p: (sameModel.predict(p.features), p.label))
    end_predict = datetime.now()

    predict_time = format(end_predict - start_predict)
    accuracy = 1.0 * predictionAndLabel.filter(
        lambda t: t[0] == t[1]).count() / dataset.count()
Example #24
File: text_rdd.py  Project: oliyura/UANLP
 def _compute_idf(texts: RDD) -> IDFModel:
     tf = HashingTF().transform(texts)
     tf.cache()
     idf = IDF().fit(tf)
     return idf
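A hedged sketch of how these text_rdd.py helpers (Examples #22 and #24) might be combined, treating them as free functions and assuming an existing SparkContext sc and a toy corpus:

corpus = sc.parallelize([["hello", "world"], ["hello", "spark"]])  # assumed tokenized corpus
idf_model = _compute_idf(corpus)  # fit the IDF once over the whole corpus
vec = get_tfidf(idf_model, ["hello", "world"])  # TF-IDF SparseVector for a single sentence
print(vec)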
Example #25
from pyspark import SparkConf, SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.classification import LogisticRegressionWithSGD
if __name__ == '__main__':
    conf = SparkConf().setMaster("local").setAppName("spamClassify")
    sc = SparkContext(conf=conf)
    spam = sc.textFile("input/spam.txt")
    normal = sc.textFile("input/normal.txt")

    # Create a HashingTF instance to map email text to vectors of 10,000 features.
    tf = HashingTF(numFeatures=10000)
    # Each email is split into words, and each word is mapped to one feature.
    spamFeatures = spam.map(lambda email: tf.transform(email.split(" ")))
    normalFeatures = normal.map(lambda email: tf.transform(email.split(" ")))

    # Create LabeledPoint datasets for positive (spam) and negative (normal) examples.

    positiveExamples = spamFeatures.map(
        lambda features: LabeledPoint(1, features))
    negativeExamples = normalFeatures.map(
        lambda features: LabeledPoint(0, features))
    trainingData = positiveExamples.union(negativeExamples)
    trainingData.cache(
    )  # Cache since Logistic Regression is an iterative algorithm.
    # Run Logistic Regression using the SGD algorithm.
    model = LogisticRegressionWithSGD.train(trainingData)
    # Test on a positive example (spam) and a negative one (normal). We first apply
    # the same HashingTF feature transformation to get vectors, then apply the model.
    posTest = tf.transform(
        "O M G GET cheap stuff by sending money to ...".split(" "))
Example #26
File: text_rdd.py  Project: oliyura/UANLP
 def get_tfidf(self, text_str) -> SparseVector:
     tf = HashingTF().transform(Text(text_str).process().words)
     return self.idf.transform(tf)
Example #27
# END OF IMPORTS
####################################################
# TODO: Change from random Split to normal split and CV
# TODO: try DT and SVM algorithms
# TODO: run ready libraries and compare results

# START OF GLOBAL VARIABLES
####################################################
PUNCTUATION = [i for i in string.punctuation]
STOPWORDS = set(stopwords.words('english'))
HEADER = []
PS = PorterStemmer()

conf = SparkConf().setAppName("myFirstApp").setMaster("local")
SC = SparkContext(conf=conf)
HTF = HashingTF(50000)

# END OF GLOBAL VARIABLES
####################################################


# Fix tweet if split by comma
def line_fixer(line, col_count):
    if len(line) > col_count:
        return line[:col_count - 1] + [",".join(line[col_count - 1:])]
    return line


# Ignore unwanted columns like the tweet ID (column 0) and SentimentSource (column 2)
def remove_unwanted_col(line, sentinement_index, sentinement_text_index):
    return line[sentinement_index], line[sentinement_text_index]
Example #28
def labeled_points(rdd,n,label):
	rdd_list=rdd.map(lambda x: x.split())
	hTF=HashingTF(n)
	rdd_h=rdd_list.map(lambda x: hTF.transform(x))
	rdd_h_l=rdd_h.map(lambda x: LabeledPoint(label,x))
	return rdd_h_l
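A hedged usage sketch for labeled_points; the file names, the 10,000-feature dimension, and the labels are assumptions, mirroring the spam/ham pattern of Example #25:

pos = labeled_points(sc.textFile("Positive.txt"), 10000, 1.0)  # assumed positive examples
neg = labeled_points(sc.textFile("Negative.txt"), 10000, 0.0)  # assumed negative examples
training_data = pos.union(neg).cache()  # LabeledPoint RDD ready for an MLlib classifier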
Example #29
#import sys
#import pyspark
from pyspark.mllib.feature import HashingTF, IDF
#from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
#from pyspark.sql.functions import explode

sc = SparkContext("local", "Simple App")

# Load documents (one per line).
documents = sc.textFile("reviews.txt").map(lambda line: line.split(" "))

hashingTF = HashingTF()
tf = hashingTF.transform(documents)

# While applying HashingTF only needs a single pass to the data, applying IDF needs two passes:
# First to compute the IDF vector and second to scale the term frequencies by IDF.
tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)

# spark.mllib's IDF implementation provides an option for ignoring terms
# which occur in less than a minimum number of documents.
# In such cases, the IDF for these terms is set to 0.
# This feature can be used by passing the minDocFreq value to the IDF constructor.
idfIgnore = IDF(minDocFreq=2).fit(tf)
tfidfIgnore = idfIgnore.transform(tf)

# save tf-idf
tfidfIgnore.saveAsTextFile('tfidf')
Example #30
def getFeatures(data):
    hashingTF = HashingTF()
    tfData = data.map(lambda tup: hashingTF.transform(tup))
    idfData = IDF().fit(tfData)
    tfidfData = idfData.transform(tfData)
    return tfidfData