Example #1
from pyspark.ml.feature import CountVectorizer


def vectorizeCV(fullDF, sampleDF, minDocFrec):

    vectorizer = CountVectorizer()
    cv = CountVectorizer(minDF=minDocFrec,
                         inputCol="raw",
                         outputCol="features",
                         binary=True)

    if sampleDF is None:
        model = cv.fit(fullDF)
    else:
        model = cv.fit(sampleDF)
    result = model.transform(fullDF)

    return result, model
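
A minimal usage sketch for the helper above; the SparkSession and toy DataFrame are illustrative assumptions, not part of the original example (the "raw" column must hold arrays of tokens).

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("vectorizeCVDemo").getOrCreate()
fullDF = spark.createDataFrame(
    [(0, "a b c".split(" ")), (1, "a b b c a".split(" "))], ["id", "raw"])
# Fit on the full data (no sample DataFrame) with a document-frequency floor of 2.
result, cvModel = vectorizeCV(fullDF, None, 2.0)
result.show(truncate=False)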
Example #2
from pyspark import StorageLevel
from pyspark.ml.feature import CountVectorizer, NGram
from pyspark.sql import Row


def train(allHex,labels,hashFiles,sc,sqlc,path):

    bytesFiles = hashFiles.map(lambda x: "gs://uga-dsp/project2/data/bytes/"+ x+".bytes")

    def fun(accum,x):
        return accum+','+x

    bytesFileString = bytesFiles.reduce(fun)
    rdd1= sc.wholeTextFiles(bytesFileString,20)

    bytesRdd = rdd1.map(lambda x: x[1].split()).map(lambda x: [word for word in x if word in allHex.value]).zipWithIndex().map(lambda x: (x[1],x[0]))

    ngramFrame = sqlc.createDataFrame(bytesRdd,["did","1grams"])

    twoGram = NGram(n=2, inputCol="1grams", outputCol="2grams")
    ngramFrame = twoGram.transform(ngramFrame)

    featuresDF = ngramFrame.rdd.map(lambda x: Row(did=x['did'], docFeatures=x['1grams']+x['2grams'])).toDF()

    cv = CountVectorizer(inputCol="docFeatures", outputCol="features",vocabSize=1000)

    featureFitModel = cv.fit(featuresDF)

    featuresCV = featureFitModel.transform(featuresDF)

    labelRdd = labels.zipWithIndex().map(lambda x: (x[1],x[0]))

    labelFrame = labelRdd.toDF(["did","label"])

    trainData = featuresCV.join(labelFrame, "did")
    trainData.persist(StorageLevel(True, True, False, False, 1))
    saveData(trainData, path)

    trainData.show()
    return featureFitModel
Example #3
    def test_count_vectorizer_with_binary(self):
        dataset = self.spark.createDataFrame(
            [
                (
                    0,
                    "a a a b b c".split(" "),
                    SparseVector(3, {0: 1.0, 1: 1.0, 2: 1.0}),
                ),
                (
                    1,
                    "a a".split(" "),
                    SparseVector(3, {0: 1.0}),
                ),
                (
                    2,
                    "a b".split(" "),
                    SparseVector(3, {0: 1.0, 1: 1.0}),
                ),
                (
                    3,
                    "c".split(" "),
                    SparseVector(3, {2: 1.0}),
                ),
            ],
            ["id", "words", "expected"],
        )
        cv = CountVectorizer(binary=True, inputCol="words", outputCol="features")
        model = cv.fit(dataset)

        transformedList = model.transform(dataset).select("features", "expected").collect()

        for r in transformedList:
            feature, expected = r
            self.assertEqual(feature, expected)
Example #4
from pyspark.ml.feature import CountVectorizer


def CountVectorizerModel(input_col, output_col, vocab_size, min_df, input_data):
    # minDF: the minimum number of documents a term must appear in to be kept
    # vocabSize: the maximum size of the vocabulary
    cv = CountVectorizer(inputCol=input_col, outputCol=output_col, vocabSize=vocab_size, minDF=min_df)
    model = cv.fit(input_data)
    result = model.transform(input_data)
    return result
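
A hedged usage sketch for the wrapper above; the SparkSession and toy data are assumptions for illustration. Note that this helper's name shadows pyspark.ml.feature.CountVectorizerModel, so the pyspark class should not be imported under the same name alongside it.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
docs = spark.createDataFrame(
    [(0, "a b c".split(" ")), (1, "a b b c a".split(" "))], ["id", "words"])
# Keep at most 3 terms; each term must appear in at least 2 documents.
vectorized = CountVectorizerModel("words", "features", 3, 2.0, docs)
vectorized.show(truncate=False)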
Example #5
def word2vec(df, inputcol, outputcol, vecsize):
    from pyspark.ml.feature import Word2Vec
    from pyspark.ml.feature import CountVectorizer, CountVectorizerModel, Tokenizer, RegexTokenizer, StopWordsRemover
    # Apply the custom segmentation UDF (segUDF is assumed to be defined elsewhere)
    df_seg = df.drop('seg').withColumn("seg", segUDF(inputcol))
    df_w = df_seg.drop('words')
    tokenizer = Tokenizer(inputCol=inputcol, outputCol='words')
    t_words = tokenizer.transform(df_w)
    t_words.select('words').head()
    # 4. Convert the text into a sparse numeric term-frequency vector
    cv = CountVectorizer(inputCol="words",
                         outputCol="features",
                         vocabSize=5,
                         minDF=2.0)
    df_f = t_words.drop("features")
    cv_model = cv.fit(df_f)
    cv_result = cv_model.transform(df_f)
    # 5. Turn the tokenizer output into dense numeric word vectors
    word2Vec = Word2Vec(vectorSize=vecsize,
                        minCount=0,
                        inputCol="words",
                        outputCol=outputcol)
    w2v_model = word2Vec.fit(cv_result)
    result = w2v_model.transform(cv_result)
    for feature in result.select(outputcol).take(3):
        print(feature)
    return t_words
Example #6
    def get_cv_model(self):
        if self.has_cv:
            from pyspark.ml.feature import CountVectorizerModel
            cv_model = CountVectorizerModel.load(
                os.path.join(model_pth, model_name))
        else:
            from pyspark.ml.feature import CountVectorizer
            data = self._fit()
            cv = CountVectorizer(inputCol='item_seq',
                                 outputCol='item_seq_enc',
                                 vocabSize=1 << 20,
                                 minTF=0,
                                 minDF=0)
            cv_model = cv.fit(data)
            cv_model.write().overwrite().save(
                os.path.join(model_pth, model_name))

        copora = cv_model.vocabulary  # 579012
        action_copora = [
            'clickout item', 'interaction item deals',
            'interaction item image', 'interaction item info',
            'search for item', 'interaction item rating'
        ]
        item2id = dict(zip(copora, range(1, len(copora) + 1)))
        action2id = dict(zip(action_copora, range(1, len(action_copora) + 1)))
        sc = self.sqlContext.sparkContext
        bitem2id = sc.broadcast(item2id)
        baction2id = sc.broadcast(action2id)
        print("Item size:", len(item2id))
        return bitem2id, baction2id
Example #7
def compute_labeled_sanitized_comments(comments, labels):
    # TASK 2
    labelled_comments = comments.join(labels, comments.id == labels.Input_id)

    # TASK 6B
    def check_positive(x):
        if x == "1":
            return 1
        return 0

    def check_negative(x):
        if x == "-1":
            return 1
        return 0

    check_negative_udf = udf(check_negative, IntegerType())
    check_positive_udf = udf(check_positive, IntegerType())

    # TASKS 4, 5
    sanitize_udf = udf(sanitize_wrapper, ArrayType(StringType()))

    labelled_sanitized_comments = labelled_comments.select(
        sanitize_udf("body").alias("features"),
        check_positive_udf("labeldjt").alias("trump_pos"),
        check_negative_udf("labeldjt").alias("trump_neg"),
        check_positive_udf("labeldem").alias("dem_pos"),
        check_negative_udf("labeldem").alias("dem_neg"),
        check_positive_udf("labelgop").alias("rep_pos"),
        check_negative_udf("labelgop").alias("rep_neg"))

    # TASK 6A
    cv = CountVectorizer(inputCol="features", outputCol="vectors", binary=True, minDF=5)
    model = cv.fit(labelled_sanitized_comments)
    sanitized_comments = model.transform(labelled_sanitized_comments)
    return sanitized_comments, model
Example #8
def perform_lda(documents, n_topics, n_words, beta, tokens_col):
    '''
    Performs LDA on a collection of documents (each document being a list of tokens).
    Assumes that documents is a DataFrame with a unique id column (uid).
    '''
    cv = CountVectorizer(inputCol=tokens_col, outputCol="raw_features")
    cvmodel = cv.fit(documents)
    result_cv = cvmodel.transform(documents)
    
    # We apply TF-IDF (term frequency-inverse document frequency) so that threads with many words do not dominate the topics.
    idf = IDF(inputCol="raw_features", outputCol="features")
    idfModel = idf.fit(result_cv)
    result_tfidf = idfModel.transform(result_cv) 
    
    # keep uid and features; the date column stays commented out for time-series use
    corpus = result_tfidf.select("uid", "features")#, "date")
    
    lda = LDA(k=n_topics, topicConcentration=beta)
    
    model = lda.fit(corpus)
    
    #retrieving topics, and the vocabulary constructed by the CountVectorizer
    topics = model.describeTopics(maxTermsPerTopic=n_words)
    vocab = cvmodel.vocabulary
    
    #getting topic distribution per document. 
    #topic_distribution = model.transform(corpus)[['topicDistribution', 'date']]
    
    # the topics are just numerical indices; convert them to words and associate them with their weights
    topics_with_weights = topics.rdd.map(lambda r: (r[0], ([(vocab[t],w) for t,w in zip(r[1], r[2])]), ' '.join([vocab[t] for t in r[1]]))).toDF().selectExpr("_1 as topic_number", "_2 as topic_weight", "_3 as topic")
    
    return topics_with_weights#, topic_distribution
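
A minimal sketch of how this function might be invoked, assuming the imports below match what the surrounding module already provides; the SparkSession and toy corpus are illustrative, not from the source.

from pyspark.sql import SparkSession
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.ml.clustering import LDA

spark = SparkSession.builder.getOrCreate()
documents = spark.createDataFrame(
    [(0, ["spark", "runs", "fast"]),
     (1, ["spark", "sql", "query", "fast"]),
     (2, ["lda", "topic", "model"])],
    ["uid", "tokens"])
topics = perform_lda(documents, n_topics=2, n_words=3, beta=1.1, tokens_col="tokens")
topics.show(truncate=False)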
Example #9
def indexing_pipeline(input_df, **kwargs):
    """ Runs a full text indexing pipeline on a collection of texts contained
    in a DataFrame.

    Parameters
    ----------
    input_df (DataFrame): a DataFrame that contains a field called 'text'

    Returns
    -------
    df : the same DataFrame with a column called 'features' for each document
    wordlist : the list of words in the vocabulary learned by the CountVectorizer
    """
    inputCol_ = kwargs.get("inputCol", "text")
    vocabSize_ = kwargs.get("vocabSize", 5000)
    minDF_ = kwargs.get("minDF", 2.0)

    tokenizer_udf = udf(extract_bow_from_raw_text, ArrayType(StringType()))
    df_tokens = input_df.withColumn("bow", tokenizer_udf(col(inputCol_)))

    cv = CountVectorizer(inputCol="bow",
                         outputCol="vector_tf",
                         vocabSize=vocabSize_,
                         minDF=minDF_)
    cv_model = cv.fit(df_tokens)
    df_features_tf = cv_model.transform(df_tokens)

    idf = IDF(inputCol="vector_tf", outputCol="features")
    idfModel = idf.fit(df_features_tf)
    df_features = idfModel.transform(df_features_tf)

    return (df_features, cv_model.vocabulary)
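
A usage sketch under stated assumptions: extract_bow_from_raw_text is defined elsewhere in the original project, so a simple stand-in tokenizer is substituted here, and the SparkSession and toy DataFrame are illustrative.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, StringType
from pyspark.ml.feature import CountVectorizer, IDF

# Hypothetical stand-in for the project's extract_bow_from_raw_text helper.
def extract_bow_from_raw_text(text):
    return text.lower().split() if text else []

spark = SparkSession.builder.getOrCreate()
raw_docs = spark.createDataFrame(
    [(0, "spark makes counting words easy"), (1, "counting words with spark")],
    ["id", "text"])
df_features, wordlist = indexing_pipeline(raw_docs, vocabSize=100, minDF=1.0)
df_features.select("features").show(truncate=False)
print(wordlist)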
Example #10
def loadData1(host, port, db_name, WEB_DATA1):
    '''
    Load data from collection 1.
    :param host:
    :param port:
    :param db_name: database name
    :param WEB_DATA1: collection 1, lda_sum_data, 15584 documents in total
    :return: dataframe
    '''
    df1 = sparkEntrance.spark.read.format("com.mongodb.spark.sql.DefaultSource") \
        .option("spark.mongodb.input.uri",
                "mongodb://" + host + ":" + str(port) + "/" + db_name + '.' + WEB_DATA1) \
        .load()
    df1 = df1.selectExpr('html as content', '_id as id').distinct()
    # Drop rows containing null values
    # If 'any', drop a row if it contains any nulls.
    # If 'all', drop a row only if all its values are null.
    df1 = df1.na.drop(how='any', subset='content')
    rdd1 = df1.rdd.map(tokenizer)
    # Build a DataFrame from the content and id fields
    df1 = sparkEntrance.spark.createDataFrame(rdd1, ['content', 'id']).cache()
    # CountVectorizer
    cv1 = CountVectorizer(inputCol="content", outputCol="features")
    model_cv1 = cv1.fit(df1)
    # Get the vocabulary
    # vocabulary1 = model_cv1.vocabulary
    df_cv1 = model_cv1.transform(df1).cache()
    # IDF
    idf1 = IDF(inputCol="features", outputCol="cv")
    model_idf1 = idf1.fit(df_cv1)
    df_idf1 = model_idf1.transform(df_cv1).cache()
    return df_idf1
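
The tokenizer function mapped over the RDD above is not shown in this snippet; a hypothetical stand-in illustrating the expected contract (a pair of token list and id, matching the ['content', 'id'] schema) might look like this.

# Hypothetical stand-in; the real project likely uses a Chinese segmenter (e.g. jieba).
def tokenizer(row):
    # Return (tokens, id) so that createDataFrame(rdd1, ['content', 'id'])
    # produces an array<string> 'content' column for CountVectorizer.
    return (row['content'].split(), row['id'])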
Example #11
def indexing_pipeline(input_df, **kwargs):
    """Runs a full text indexing pipeline on a collection of texts contained in a DataFrame.
    Parameters
    ----------
    input_df (DataFrame): a DataFrame that contains a text field (column 'reviews' by default)
    Returns
    -------
    df : the same DataFrame with a column called 'features' for each document
    wordlist : the list of words in the vocabulary learned by the CountVectorizer
    """
    inputCol_ = kwargs.get("inputCol", "reviews")
    vocabSize_ = kwargs.get("vocabSize", 5000)
    minDF_ = kwargs.get("minDF", 2.0)

    # ugly: appended so that the worker nodes find the bootstrapped nltk_data
    nltk.data.path.append('/home/hadoop/nltk_data')

    extract_bow_from_raw_text(
        "")  # ugly: called once to instantiate all dependencies of this function
    tokenizer_udf = udf(extract_bow_from_raw_text, ArrayType(StringType()))
    df_tokens = input_df.withColumn("bow", tokenizer_udf(col(inputCol_)))

    cv = CountVectorizer(inputCol="bow",
                         outputCol="vector_tf",
                         vocabSize=vocabSize_,
                         minDF=minDF_)
    cv_model = cv.fit(df_tokens)
    df_features_tf = cv_model.transform(df_tokens)

    idf = IDF(inputCol="vector_tf", outputCol="features")
    idfModel = idf.fit(df_features_tf)
    df_features = idfModel.transform(df_features_tf)

    return (df_features, cv_model.vocabulary)
Example #12
from pyspark.ml.feature import Tokenizer, NGram, CountVectorizer, IDF, Normalizer


def create_tfidf_model(sentenceDataFrame, ngrams=1, minDocFreq=0):

    tokenized = Tokenizer(inputCol="text",
                          outputCol="words").transform(sentenceDataFrame)

    ngramDataFrame = NGram(n=ngrams, inputCol="words",
                           outputCol="ngrams").transform(tokenized)

    countVect = CountVectorizer(inputCol="ngrams", outputCol="rawFeatures")

    countVectModel = countVect.fit(ngramDataFrame)

    featurizedData = countVectModel.transform(ngramDataFrame)

    idf = IDF(minDocFreq=minDocFreq,
              inputCol="rawFeatures",
              outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    rescaledData.select("label", "features")

    normalizer = Normalizer(inputCol="features", outputCol='scores')
    X = normalizer.transform(rescaledData)

    return X
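
A short usage sketch, with an illustrative SparkSession and toy data; the input DataFrame is assumed to carry the 'text' and 'label' columns that the function references.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sentenceDataFrame = spark.createDataFrame(
    [(0.0, "spark is fast"), (1.0, "spark handles big data"), (0.0, "pandas is fast too")],
    ["label", "text"])
scored = create_tfidf_model(sentenceDataFrame, ngrams=1, minDocFreq=0)
scored.select("label", "scores").show(truncate=False)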
Example #13
def dedup_min_hash(df, column, id_col, min_distance=0.1):
    """
    Deduplicates a dataset using MinHash on a token count basis.

    Removes all items with a distance smaller than min_distance.
    """
    @udf("long")
    def num_nonzeros(v):
        return v.numNonzeros()

    df.cache()
    tokenizer = RegexTokenizer(inputCol=column, outputCol="tokens")
    tokens = tokenizer.transform(df)
    cv = CountVectorizer(inputCol="tokens", outputCol="token_ids")
    vectorizer_model = cv.fit(tokens)
    with_token_ids = vectorizer_model.transform(tokens).drop("tokens", column)
    with_token_ids = with_token_ids.where(
        num_nonzeros(with_token_ids.token_ids) > 0).cache()
    mh = MinHashLSH(inputCol="token_ids",
                    outputCol="hashes",
                    seed=1,
                    numHashTables=10)
    dedup_model = mh.fit(with_token_ids)
    joined = dedup_model.approxSimilarityJoin(with_token_ids, with_token_ids, 1 - min_distance, distCol="dist")\
        .drop("token_ids", "hashes")\
        .filter(f"datasetA.{id_col} < datasetB.{id_col}")
    duplicate_ids = joined.rdd.flatMap(lambda row: (row.datasetA[id_col], row.datasetB[id_col]))\
        .distinct()\
        .map(lambda el: [el])\
        .toDF()
    return df.join(duplicate_ids, duplicate_ids._1 == df[id_col], "left")\
        .where(duplicate_ids._1.isNotNull())\
        .drop(duplicate_ids._1)
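
A usage sketch with illustrative toy data; the imports reflect what the function appears to rely on, and the exact rows returned depend on the final join filter above.

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.ml.feature import RegexTokenizer, CountVectorizer, MinHashLSH

spark = SparkSession.builder.getOrCreate()
docs = spark.createDataFrame(
    [(1, "the quick brown fox jumps over the lazy dog"),
     (2, "the quick brown fox jumps over a lazy dog"),
     (3, "completely unrelated sentence about spark")],
    ["doc_id", "body"])
result = dedup_min_hash(docs, column="body", id_col="doc_id", min_distance=0.1)
result.show(truncate=False)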
Example #14
def words_widely_used_and_short(df, input_col="stemmed", number_of_words=100):
    cv_tmp = CountVectorizer(inputCol=input_col, outputCol="tmp_vectors")
    cv_tmp_model = cv_tmp.fit(df)
    top_words = list(cv_tmp_model.vocabulary[0:number_of_words])
    less_then_3_charachters = [
        word for word in cv_tmp_model.vocabulary if len(word) <= 3
    ]
    return (top_words, less_then_3_charachters)
Example #15
 def transformDataToFeaturesVector(self, dataDF):
     print("Term frecuency and vocabulary extraction")
     #Count Term Frecuency, transform data into features vector
     vector = CountVectorizer(inputCol="words", outputCol="vector")
     model = vector.fit(dataDF)
     self.vocabulary = model.vocabulary
     result = model.transform(dataDF)
     return result
Example #16
def add_tf_and_vocab(df):
    cv = CountVectorizer(inputCol="tokens", outputCol="tf_vector", minDF=2.0)
    tf_model = cv.fit(df)
    df_tf = tf_model.transform(df)

    vocab = tf_model.vocabulary

    return df_tf, vocab
Example #17
 def calculate_count_vectorize_idf(self, df):
     count_vec = CountVectorizer(inputCol="words", outputCol="rawFeatures", vocabSize=262144)
     cv_model = count_vec.fit(df)
     featurized_data = cv_model.transform(df)
     vocab = cv_model.vocabulary
     idf = IDF(inputCol="rawFeatures", outputCol="features")
     idf_model = idf.fit(featurized_data)
     rescaled_data = idf_model.transform(featurized_data)
     return rescaled_data, vocab
Example #18
def calculate_vectors(data, n=2, binary=False):
    ngram = NGram(n=n, inputCol="sequence", outputCol="ngrams")
    ngramDataFrame = ngram.transform(data)
    ngrams = ngramDataFrame.select("ngrams")
    cvectorizer = CountVectorizer(
        inputCol="ngrams", outputCol="vec", binary=binary
    )
    model = cvectorizer.fit(ngrams)
    return model.transform(ngrams).select("vec")
Example #19
    def transform(self, token_col='words'):
        """ StopRemover, CountWords"""
        remover = StopWordsRemover(inputCol=token_col, outputCol="clean_words")
        clean_docs_ddf = remover.transform(self.docs_ddf)

        cv = CountVectorizer(inputCol="clean_words", outputCol="tf_vector")
        self.cv_model = cv.fit(clean_docs_ddf)
        self.word_counts_ddf = self.cv_model.transform(clean_docs_ddf). \
            persist(StorageLevel.DISK_ONLY)
Example #20
def tf_train(df):
    cv_train = CountVectorizer(inputCol="final",
                               outputCol="rawFeatures",
                               vocabSize=4000,
                               minDF=3,
                               minTF=2)
    cvmodel = cv_train.fit(df)
    cvDatasetTrain = cvmodel.transform(df)
    return (cvmodel, cvDatasetTrain)
Example #21
def task6a(sqlContext, data):
    cv = CountVectorizer(inputCol="grams",
                         outputCol="count_vectors",
                         minDF=10,
                         binary=True)
    model = cv.fit(data)
    result = model.transform(data)
    result.show(n=10)
    return result, model
Example #22
def vectorizeCV(DF):
    
    vectorizer = CountVectorizer()
    cv = CountVectorizer(minDF=.0001, inputCol="raw", outputCol="features")
    
    model = cv.fit(DF)
    result = model.transform(DF)
    
    return result, model
Example #23
def LDA_pipefit (data_ip, ipcol):
  text_col = ipcol
  from sparknlp.base import DocumentAssembler
  documentAssembler = DocumentAssembler().setInputCol(text_col).setOutputCol('document')
  from sparknlp.annotator import Tokenizer
  tokenizer = Tokenizer().setInputCols(['document']).setOutputCol('tokenized')
  from sparknlp.annotator import Normalizer
  normalizer = Normalizer().setInputCols(['tokenized']).setOutputCol('normalized').setLowercase(True)
  from sparknlp.annotator import LemmatizerModel
  lemmatizer = LemmatizerModel.pretrained().setInputCols(['normalized']).setOutputCol('lemmatized')
  from sparknlp.annotator import StopWordsCleaner
  stopwords_cleaner = StopWordsCleaner().setInputCols(['lemmatized']).setOutputCol('unigrams').setStopWords(eng_stopwords)
  from sparknlp.annotator import NGramGenerator
  ngrammer = NGramGenerator().setInputCols(['lemmatized']).setOutputCol('ngrams').setN(3).setEnableCumulative(True).setDelimiter('_')
  from sparknlp.annotator import PerceptronModel
  pos_tagger = PerceptronModel.pretrained('pos_anc').setInputCols(['document', 'lemmatized']).setOutputCol('pos')
  from sparknlp.base import Finisher
  finisher = Finisher().setInputCols(['unigrams', 'ngrams','pos'])
  from pyspark.ml import Pipeline
  pipeline = Pipeline().setStages([documentAssembler,
                                  tokenizer,
                                  normalizer,
                                  lemmatizer,
                                  stopwords_cleaner,
                                  pos_tagger,
                                  ngrammer,
                                  finisher])
  review_text_clean = ipcol
  processed_tweets = pipeline.fit(data_ip).transform(data_ip)
  from pyspark.sql import functions as F
  from pyspark.sql.functions import concat
  processed_tweets = processed_tweets.withColumn('final',concat(F.col('finished_unigrams'), F.col('finished_ngrams')))
  from pyspark.ml.feature import CountVectorizer
  tfizer = CountVectorizer(inputCol='final',outputCol='tf_features')
  tf_model = tfizer.fit(processed_tweets)
  tf_result = tf_model.transform(processed_tweets)
  from pyspark.ml.feature import IDF
  idfizer = IDF(inputCol='tf_features', outputCol='tf_idf_features')
  idf_model = idfizer.fit(tf_result)
  tfidf_result = idf_model.transform(tf_result)
  from pyspark.ml.clustering import LDA

  num_topics = 3
  max_iter = 10

  lda = LDA(k=num_topics, maxIter=max_iter, featuresCol='tf_idf_features')
  lda_model = lda.fit(tfidf_result)
  from pyspark.sql import types as T
  vocab = tf_model.vocabulary
  def get_words(token_list):
      return [vocab[token_id] for token_id in token_list]
  udf_to_words = F.udf(get_words, T.ArrayType(T.StringType()))

  num_top_words = 15
  topics = lda_model.describeTopics(num_top_words).withColumn('topicWords', udf_to_words(F.col('termIndices')))
  topics_p=topics.toPandas()
  return topics_p
Example #24
def count_vectorizer_generic(data_frame, vocab_size, input_col):
    print('Count Vectorizer Result with output column features')
    cv_generic = CountVectorizer(inputCol=input_col,
                                 outputCol="features",
                                 vocabSize=vocab_size)
    model_generic = cv_generic.fit(data_frame)
    result_generic = model_generic.transform(data_frame)
    result_generic.show()
    print('\n')
    return (result_generic, model_generic)
Example #25
def vectorizecol(df, incol, outcol, size=1<<18):
   """
   Vectorize a column of terms and add it to the dataframe
   return df and model
   """
   cv = CountVectorizer(inputCol=incol, outputCol=outcol, vocabSize=size)
   model = cv.fit(df)
   result = model.transform(df)

   return model, result
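
A brief usage sketch; the SparkSession and toy DataFrame are illustrative assumptions.

from pyspark.sql import SparkSession
from pyspark.ml.feature import CountVectorizer

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(0, ["a", "b", "c"]), (1, ["a", "a", "b"])], ["id", "terms"])
model, result = vectorizecol(df, "terms", "term_vec")
result.show(truncate=False)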
Example #26
    def transform(self):
        cv = CountVectorizer(inputCol='content', outputCol='raw_features')
        cv_model = cv.fit(self._mapped_data)
        tf_df = cv_model.transform(self._mapped_data)

        idf = IDF(minDocFreq=self._min_doc_freq, inputCol='raw_features', outputCol='features')
        tfidf_model = idf.fit(tf_df)
        tfidf_df = tfidf_model.transform(tf_df)

        return tfidf_df.drop('content').drop('raw_features')
Example #27
def sparsify(ngrams_df, model):
    if model is None:
        # TASK 6a: Binary CountVectorizer
        cv = CountVectorizer(minDF=10,
                             binary=True,
                             inputCol="split_ngrams",
                             outputCol="sparse_vector")
        model = cv.fit(ngrams_df)

    sparsified = model.transform(ngrams_df)
    return model, sparsified
Example #28
def functions_for_deal_with_texts_3(spark, resources_folder):
    df = spark.createDataFrame([(0, "a b c".split(" ")),
                                (1, "a b b c a".split(" "))], ["id", "words"])
    df.show()
    cv = CountVectorizer(inputCol='words',
                         outputCol='features',
                         vocabSize=3,
                         minDF=2.0)
    model = cv.fit(df)
    result = model.transform(df)
    result.show(truncate=False)
Example #29
def main():
    for tn in tablenames:
        data = spark.read.format("org.apache.spark.sql.cassandra")\
                    .options(table=tn, keyspace=keyspace).load().limit(1000)

        data = data.sort('imdb_score', ascending=False)

        desc = data.rdd.map(lambda x: x['description']).filter(
            lambda x: x is not None)

        StopWords = nltk.corpus.stopwords.words('english')
        StopWords.extend([" ...                See full summary"])

        tokenized = desc.map( lambda y: y.strip().lower()).map( lambda x: re.split(" ", x))\
            .map( lambda word: [x for x in word if x.isalpha()]).map( lambda word: [x for x in word if len(x) > 3] )\
            .map( lambda word: [x for x in word if x not in StopWords]).zipWithIndex()

        df_txts = spark.createDataFrame(tokenized, ["words", 'index'])
        countVec = CountVectorizer(inputCol="words",
                                   outputCol="raw_features",
                                   vocabSize=5000,
                                   minDF=10.0)
        CountVectMod = countVec.fit(df_txts)
        result = CountVectMod.transform(df_txts)
        idf = IDF(inputCol="raw_features", outputCol="features")
        idfModel = idf.fit(result)
        resultTFIdf = idfModel.transform(result)

        totalTopics = 10
        totalItr = 100
        LDAModel = MLlibLDA.train(resultTFIdf.select('index','features').rdd.mapValues(MLlibVectors.fromML).map(list),\
                        k=totalTopics, maxIterations=totalItr)

        maxwordsTopic = 5
        topicIndices = sc.parallelize(
            LDAModel.describeTopics(maxTermsPerTopic=5))
        VCarr = CountVectMod.vocabulary

        def finalTopic(topic):
            terms = topic[0]
            result = []
            for i in range(maxwordsTopic):
                term = VCarr[terms[i]]
                result.append(term)
            return result

        topics_final = topicIndices.map(
            lambda topic: finalTopic(topic)).collect()
        print(topics_final)
        for topic in range(len(topics_final)):
            print("Topic" + str(topic) + ":")
            for term in topics_final[topic]:
                print(term)
            print('\n')
Example #30
def pre_processing(df):
    # fit a CountVectorizerModel from the corpus.
    cv = CountVectorizer(inputCol="words",
                         outputCol="features",
                         vocabSize=3,
                         minDF=2.0)

    model = cv.fit(df)

    result = model.transform(df)
    result.show(truncate=False)
Example #31
    def test_count_vectorizer_with_binary(self):
        dataset = self.spark.createDataFrame([
            (0, "a a a b b c".split(' '), SparseVector(3, {0: 1.0, 1: 1.0, 2: 1.0}),),
            (1, "a a".split(' '), SparseVector(3, {0: 1.0}),),
            (2, "a b".split(' '), SparseVector(3, {0: 1.0, 1: 1.0}),),
            (3, "c".split(' '), SparseVector(3, {2: 1.0}),)], ["id", "words", "expected"])
        cv = CountVectorizer(binary=True, inputCol="words", outputCol="features")
        model = cv.fit(dataset)

        transformedList = model.transform(dataset).select("features", "expected").collect()

        for r in transformedList:
            feature, expected = r
            self.assertEqual(feature, expected)
Example #32
File: ml.py  Project: ribonj/lsir
def count(df, column):
    """
    Count the number of occurences of terms in documents.
    """
    # fit a CountVectorizerModel from the corpus.
    # vocabSize: top N words orderedby term frequency across the corpus
    # minDF: minimum number of documents a term must appear in to be 
    #   included in the vocabulary
    # e.g. vocabSize=10, minDF=2.0
    cv = CountVectorizer(inputCol=column, 
                         outputCol='_'+column)
    
    model = cv.fit(df)
    voc = model.vocabulary
    df = model.transform(df)
    
    df = replace(df, column, '_'+column)
    return (df, voc)
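
The replace() helper called above is not part of this snippet; below is a hedged sketch of its likely contract plus a toy invocation (both are assumptions for illustration).

from pyspark.sql import SparkSession
from pyspark.ml.feature import CountVectorizer

# Hypothetical stand-in: drop the original column and rename the vectorized one back.
def replace(df, column, new_column):
    return df.drop(column).withColumnRenamed(new_column, column)

spark = SparkSession.builder.getOrCreate()
docs = spark.createDataFrame(
    [(0, ["a", "b", "c"]), (1, ["a", "b", "b", "c", "a"])], ["id", "terms"])
docs_counted, voc = count(docs, "terms")
docs_counted.show(truncate=False)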
Example #33
def featurizeData(raw, gap, vocabFile, featFile):
    feats = raw.dropDuplicates(['cluster', 'series', 'date'])\
            .withColumn('day', datediff(col('date'), lit('1970-01-01')))\
            .na.drop(subset=['day'])\
            .rdd.groupBy(lambda r: r.cluster)\
            .flatMap(lambda c: clusterFeatures(c, gap))\
            .toDF()

    feats.cache()
    cv = CountVectorizer(inputCol='raw', outputCol='features', minDF=4.0)
    interner = cv.fit(feats)      # alternate possibility: grab features only from label==1 edges
    full = interner.transform(feats)
    # combiner = VectorAssembler(inputCols=realCols + ['categorial'], outputCol='features')
    # # I don't think a Pipeline will work here since we need to get the interner.vocabulary
    # full = combiner.transform(interner.transform(feats)).drop('categorial')

    full.write.parquet(featFile)
    np.savetxt(vocabFile, np.array(interner.vocabulary), fmt='%s')
    feats.unpersist()
Example #34
def get_top_words(dataset, signatures):
    # TODO: Use stemmers for the languages supported by http://www.nltk.org/api/nltk.stem.html#nltk.stem.snowball.SnowballStemmer
    # Or translate comments in other languages using the free Microsoft Translate API.
    sentenceData = dataset.filter(dataset['user_comments'].isNotNull() & (dataset['useragent_locale'].isNull() | (functions.instr(dataset['useragent_locale'], 'en') == 1)))

    if sentenceData.rdd.isEmpty():
        return dict()

    # Tokenize comments.
    tokenizer = Tokenizer(inputCol='user_comments', outputCol='words')
    wordsData = tokenizer.transform(sentenceData)

    # Remove duplicate words from comments.
    wordsData = wordsData.rdd.map(lambda p: (p['signature'], list(set(p['words'])))).reduceByKey(lambda x, y: x + y).toDF(['signature', 'words'])

    if wordsData.rdd.isEmpty():
        print("[WARNING]: wordsData is empty, sentenceData wasn't.")
        return dict()

    # Clean comment words by removing punctuation and stemming.
    def clean_word(w):
        return re.sub('\,|\.|\;|\:|\;|\?|\!|\[|\]|\}|\{|\/|\\\\', '', stem(w.lower()))

    wordsData = wordsData.rdd.map(lambda p: (p['signature'], [clean_word(w) for w in p['words']])).toDF(['signature', 'words'])

    # XXX: Useless with TF-IDF?
    remover = StopWordsRemover(inputCol='words', outputCol='filtered')
    cleanWordsData = remover.transform(wordsData)

    cv = CountVectorizer(inputCol='filtered', outputCol='features')
    model = cv.fit(cleanWordsData)
    featurizedData = model.transform(cleanWordsData)

    idf = IDF(inputCol='features', outputCol='tfidf_features')
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    bests_per_doc = rescaledData.filter(rescaledData.signature.isin(signatures)).rdd.map(lambda p: (p['signature'], sorted(zip(p['tfidf_features'].indices, p['tfidf_features'].values), key=lambda i: i[1], reverse=True)[:10])).collect()

    return dict([(signature, [model.vocabulary[best] for best, val in bests]) for signature, bests in bests_per_doc])
Example #35
def main():
	p = sys.argv[1]
	logFile = "data/" + p + "_cleaned.txt"
	sc = SparkContext("local", "simpleApp")
	sqlContext = SQLContext(sc)
	data = sc.textFile(logFile).zipWithIndex().map(lambda (words,idd): Row(idd= idd, words = words.split(" "))).cache()
	docDF = sqlContext.createDataFrame(data)
	Vector = CountVectorizer(inputCol="words", outputCol="vectors")
	model = Vector.fit(docDF)
	result = model.transform(docDF)
	corpus_size = result.count()

	corpus = result.select("idd", "vectors").map(lambda (x,y): [x,y]).cache()

	# Cluster the documents into three topics using LDA
	ldaModel = LDA.train(corpus, k=3,maxIterations=100,optimizer='online')
	topics = ldaModel.topicsMatrix()
	wordNumbers = 10
	topicIndices = sc.parallelize(ldaModel.describeTopics(maxTermsPerTopic = wordNumbers))
	vocabArray = model.vocabulary
	topics_final = topicIndices.map(lambda topic: topic_render(topic,wordNumbers,vocabArray)).collect()

	path = "data/" + p + "_results.txt"
	json = open(path, 'wb')
	json.close()

	for topic in topics_final:
		for term in topic:
			line = term[0] + " "

			try:
				string_for_output = line.encode('utf8', 'replace')
				if string_for_output != " ":
					os.system("python3 basic/codes/p3p.py " +  string_for_output + "  >> " + path)
			except: pass

		os.system("python3 basic/codes/p3p.py " +  "delmch" + "  >> " + path)
Example #36
from pyspark.sql import SQLContext, Row
from pyspark.ml.feature import CountVectorizer
from pyspark.mllib.clustering import LDA, LDAModel

sqlContext = SQLContext(sc)
path = ... # path of the txt file

data = sc.textFile(path).zipWithIndex().map(lambda (words,idd): Row(idd= idd, words = words.split(" ")))
docDF = sqlContext.createDataFrame(data)

Vector = CountVectorizer(inputCol="words", outputCol="vectors")
model = Vector.fit(docDF)
result = model.transform(docDF)

corpus_size = result.count()  # total number of documents
corpus = result.select("idd", "vectors").map(lambda (x,y): [x,y]).cache()

# Cluster the documents into three topics using LDA
ldaModel = LDA.train(corpus, k=3,maxIterations=100,optimizer='online')
topics = ldaModel.topicsMatrix()
vocabArray = model.vocabulary

wordNumbers = 10  # number of words per topic
topicIndices = sc.parallelize(ldaModel.describeTopics(maxTermsPerTopic = wordNumbers))

def topic_render(topic):  # specify vector id of words to actual words
    terms = topic[0]
    result = []
    for i in range(wordNumbers):
        term = vocabArray[terms[i]]
        result.append(term)
    return result
Example #37
def train_cv_model(modelDataframe):
    cv = CountVectorizer(inputCol="udf_results", outputCol="features", binary=True, minDF=5.0)
    model = cv.fit(modelDataframe)
    model.write().overwrite().save("models/cvModel")
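
A follow-up sketch showing how the persisted model could be reloaded and reused later; the DataFrame argument and path reuse are assumptions consistent with the snippet above.

from pyspark.ml.feature import CountVectorizerModel

def apply_cv_model(modelDataframe):
    # Reload the model saved by train_cv_model() and transform a DataFrame
    # that has the same "udf_results" column.
    model = CountVectorizerModel.load("models/cvModel")
    return model.transform(modelDataframe)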
Example #38
from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import CountVectorizer
# $example off$

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("CountVectorizerExample")\
        .getOrCreate()

    # $example on$
    # Input data: Each row is a bag of words with an ID.
    df = spark.createDataFrame([
        (0, "a b c".split(" ")),
        (1, "a b b c a".split(" "))
    ], ["id", "words"])

    # fit a CountVectorizerModel from the corpus.
    cv = CountVectorizer(inputCol="words", outputCol="features", vocabSize=3, minDF=2.0)

    model = cv.fit(df)

    result = model.transform(df)
    result.show(truncate=False)
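    # For this toy corpus the output matches the CountVectorizer example in the
    # Spark documentation (the a/b vocabulary order may vary, but the counts do not):
    # +---+---------------+-------------------------+
    # |id |words          |features                 |
    # +---+---------------+-------------------------+
    # |0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
    # |1  |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
    # +---+---------------+-------------------------+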
    # $example off$

    spark.stop()
Example #39
    # alltags=tags_users.map(lambda x:Counter(x.tags)).reduce(lambda a,b:a+b)
    # print(alltags.most_common(10))
        #.filter(lambda x:len(x.tags)>100) # filtering to get smaller dataset

    # print(tags_users.count())
    # print(tags_users.first())

    ## Filtered for testing

    tags_users_df=sqlContext.createDataFrame(tags_users)
    print(tags_users_df.take(2))
    #
    #
    # print('Indexing strings')
    cVec = CountVectorizer(inputCol='tags', outputCol="tag_features",minDF=10.)
    model=cVec.fit(tags_users_df)
    td=model.transform(tags_users_df)

    with open('/home/erlenda/data/konsum/countvec_vocabulary.pkl',mode='wb') as ff:
        pkl.dump(model.vocabulary,ff)



    normalizer=Normalizer(p=1.,inputCol='tag_features',outputCol='tags_normalized')
    tdNorm=normalizer.transform(td)
    print(tdNorm.take(5))

    tdNorm.write.save('/home/erlenda/data/konsum/tag_profiler_parquet')

    samples=tdNorm.filter(tdNorm.posts_with_tags>10).take(10)
    #pprint(samples)
Example #40
summary.probability.show()


# COMMAND ----------

from pyspark.ml.feature import Tokenizer, CountVectorizer
tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn.transform(sales.drop("features"))
cv = CountVectorizer()\
  .setInputCol("DescOut")\
  .setOutputCol("features")\
  .setVocabSize(500)\
  .setMinTF(0)\
  .setMinDF(0)\
  .setBinary(True)
cvFitted = cv.fit(tokenized)
prepped = cvFitted.transform(tokenized)


# COMMAND ----------

from pyspark.ml.clustering import LDA
lda = LDA().setK(10).setMaxIter(5)
print(lda.explainParams())
model = lda.fit(prepped)


# COMMAND ----------

model.describeTopics(3).show()
cvFitted.vocabulary
Example #41
#tokenizer = Tokenizer(inputCol="description", outputCol="words")
#wordsData = tokenizer.transform(text)

################################################################################################
#
#   Generate TFIDF
#
################################################################################################

# Term Frequency Vectorization  - Option 1 (Using hashingTF): 
#hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
#featurizedData = hashingTF.transform(clean_text)

# Term Frequency Vectorization  - Option 2 (CountVectorizer)    : 
cv = CountVectorizer(inputCol="words", outputCol="rawFeatures", vocabSize = 1000)
cvmodel = cv.fit(clean_text)
featurizedData = cvmodel.transform(clean_text)

vocab = cvmodel.vocabulary
vocab_broadcast = sc.broadcast(vocab)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

################################################################################################
#
#   LDA Clustering - Find Data-driven Topics
#
################################################################################################
Example #42
	def trainModel(self):
		
		logger.info("Training the model...")		

		query = '''select page_id, max(page_title) as page_title from cooladata where date_range(all) and page_id is not null group by page_id;'''

		def SQLtoURL(query):
    
    			data = query.replace('\n', ' ').replace('\t',' ').replace('   ',' ').replace('  ',' ')
    			return data


		def QueryXXXXX(query, file = None):
   
    			session = Session()
    			response = session.post(data = {'tq': query,}, url = 'https://app.XXXXXX.com/api/v2/projects/115659/cql/', headers = {'Authorization': 'Token dtQvPVejNcSebX1EkU0AqB2TJRXznIgZiDvDu3HR'},)
    			return response.content

		table = json.loads(codecs.decode(QueryXXXXX(SQLtoURL(query)),'utf-8'))['table']
		title_list = [x['c'] for x in table['rows']]
		table_cols = [d['label'] for d in table['cols']]  
		def convert_row(row):
    			rowlist = [d['v'] for d in row]
    			return rowlist

		rd = self.sc.parallelize(title_list).map(convert_row)
		titleData = self.spark.createDataFrame(rd, table_cols)
		titleData = titleData.dropna()
		
		hebrew_stopwords = stop_words()
		def rmv(words):
    			for punc in punctuation:
        			words = words.replace(punc,"")
    			for hword in hebrew_stopwords:
        			words = words.replace(hword, " ")
    			return words

		self.spark.udf.register("rmv", rmv, StringType())
		titleData.registerTempTable("wordstable")
		cleanedSentenceData = self.spark.sql("select page_id, page_title, rmv(page_title) as cleanedSentence from wordstable")
		tokenizer = Tokenizer(inputCol="cleanedSentence", outputCol="words")
		wordsData = tokenizer.transform(cleanedSentenceData)

		cv = CountVectorizer(inputCol="words", outputCol="rawFeatures", minDF = 2.0)
		cvModel = cv.fit(wordsData)
		featurizedData = cvModel.transform(wordsData)

		idf = IDF(inputCol="rawFeatures", outputCol="features")
		idfModel = idf.fit(featurizedData)
		rescaledData = idfModel.transform(featurizedData)

		lda = LDA(k=100)
		ldaModel = lda.fit(rescaledData)
		postFactorizedData = ldaModel.transform(rescaledData)

		norm = Normalizer(inputCol = "topicDistribution", outputCol="normTopicDist")
		scaledFactorizedNormalizedData = norm.transform(postFactorizedData)
		
		self.model = scaledFactorizedNormalizedData
		
		logger.info("model is built!")
Example #43
unigram = NGram().setInputCol("DescOut").setN(1)
bigram = NGram().setInputCol("DescOut").setN(2)
unigram.transform(tokenized.select("DescOut")).show(False)
bigram.transform(tokenized.select("DescOut")).show(False)


# COMMAND ----------

from pyspark.ml.feature import CountVectorizer
cv = CountVectorizer()\
  .setInputCol("DescOut")\
  .setOutputCol("countVec")\
  .setVocabSize(500)\
  .setMinTF(1)\
  .setMinDF(2)
fittedCV = cv.fit(tokenized)
fittedCV.transform(tokenized).show(False)


# COMMAND ----------

tfIdfIn = tokenized\
  .where("array_contains(DescOut, 'red')")\
  .select("DescOut")\
  .limit(10)
tfIdfIn.show(10, False)


# COMMAND ----------

from pyspark.ml.feature import HashingTF, IDF