Example #1
def main(*args):
    if len(args) != 2:
        print("Please provide one input and one output directories!")
        sys.exit(1)

    input_fn, output_fn = args[0],args[1]
    conf = SparkConf()
    conf.setAppName("grant")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    # Load the abstract content from the input directory into Spark,
    # clean the text, tokenize the corpus, and stem the words
    abstract = sc.textFile(input_fn)
    df_abs = (abstract.map(lambda doc: text_cleaning(doc))
                      .filter(lambda doc: len(doc) > 0)
                      .filter(lambda line: not line.startswith('app'))
                      .map(lambda doc: doc.split(' '))
                      .map(lambda word: [x for x in word if len(x)>0])
                      .map(lambda word: stem(word))
                      .map(lambda doc: (int(doc[0]), doc[1:]))
                      .filter(lambda doc: len(doc[1])>0)
                      .toDF(['Id','words']))
    # build the pipeline and lda model with online optimizer
    stop_words = StopWordsRemover(inputCol='words',
                             outputCol='clean')
    stop_words.setStopWords(stop_words.loadDefaultStopWords('english'))
    countv = CountVectorizer(inputCol=stop_words.getOutputCol(), 
                             outputCol="tokens")
    idf = IDF(inputCol=countv.getOutputCol(),outputCol="features")
    lda = LDA(maxIter=10,k=10,optimizer='online')
    pipeline = Pipeline(stages=[stop_words, countv, idf, lda])
    lda_model = pipeline.fit(df_abs)
    labels = lda_model.transform(df_abs)
    
    # identify the label as the topic with the max probability
    # save the label to file
    topic_labels = (labels.select('Id','topicDistribution')
                          .rdd
                          .map(lambda x: (x[0],np.argmax(x[1])))
                          .saveAsTextFile(os.path.join(output_fn,'labels')))
    # Get the topics
    wordnum = 5 # choose the number of topic words
    vocabulary = lda_model.stages[1].vocabulary
    voc_bv = sc.broadcast(vocabulary)
    topic_df = (lda_model.stages[3].describeTopics(wordnum)
                     .rdd
                     .map(lambda x: (x[0],[voc_bv.value[Id] for Id in x[1]],x[2]))
                     .saveAsTextFile(os.path.join(output_fn,'words')))
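The example assumes `text_cleaning` and `stem` helpers that are not shown; a minimal sketch of what they might look like (hypothetical, using NLTK's PorterStemmer):

import re
from nltk.stem.porter import PorterStemmer

_stemmer = PorterStemmer()

def text_cleaning(doc):
    # hypothetical: lowercase and keep only letters, digits and spaces
    return re.sub(r'[^a-z0-9 ]', ' ', doc.lower()).strip()

def stem(words):
    # hypothetical: stem every token in a list of tokens
    return [_stemmer.stem(w) for w in words]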
Example #2
def lr_train(data):
    #Logistic Regression using Count Vector Features
    label_stringIdx = StringIndexer(inputCol="_c0", outputCol="label")
    lsmodel=label_stringIdx.fit(data)
    data=lsmodel.transform(data)
    (trainingData, testData) = data.randomSplit([0.9, 0.1], seed=100)
    countVectors = CountVectorizer(inputCol="filtered", outputCol="cfeatures", vocabSize=10000, minDF=5)
    '''hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=1000)
    idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features",minDocFreq=5)'''

    lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0,featuresCol=countVectors.getOutputCol(), labelCol="label")
    pipeline = Pipeline(stages=[countVectors,lr])
    pipelineFit = pipeline.fit(trainingData)
    predictions = pipelineFit.transform(testData)
    #predictions.show(5)
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
    #evaluator.evaluate(predictions)
    return (evaluator.evaluate(predictions),lsmodel.labels,pipelineFit)
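A minimal usage sketch, assuming a DataFrame `df` that already carries the raw label in `_c0` and a tokenized, stopword-filtered column named `filtered`:

# hypothetical call site for lr_train
metric, label_names, fitted_pipeline = lr_train(df)
print("held-out evaluation metric: {:.3f}".format(metric))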
Example #3
def lr_train_tvs(data):
    #Logistic Regression using Count Vector Features
    label_stringIdx = StringIndexer(inputCol="_c0", outputCol="label")
    lsmodel=label_stringIdx.fit(data)
    data=lsmodel.transform(data)
    #(trainingData, testData) = data.randomSplit([0.9, 0.1], seed=100)
    countVectors = CountVectorizer(inputCol="filtered", outputCol="cfeatures", vocabSize=10000, minDF=5)
    '''hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=1000)
    idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features",minDocFreq=5)'''
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
    lr = LogisticRegression(regParam=0.3, elasticNetParam=0,featuresCol=countVectors.getOutputCol(), labelCol="label")
    pipeline = Pipeline(stages=[countVectors,lr])
    grid = ParamGridBuilder().addGrid(lr.maxIter, [10,15,20]).build()
    crossval = TrainValidationSplit(estimator=pipeline,
                              estimatorParamMaps=grid,
                              evaluator=evaluator,
                              trainRatio=0.9)
    cvmodel=crossval.fit(data)
    return (evaluator.evaluate(cvmodel.transform(data)),lsmodel.labels,cvmodel)
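A minimal usage sketch, again assuming a DataFrame `df` with `_c0` and `filtered` columns; the returned TrainValidationSplitModel keeps one validation metric per candidate parameter map:

metric, label_names, tvs_model = lr_train_tvs(df)
# inspect how each maxIter candidate performed on the 10% validation split
for params, m in zip(tvs_model.getEstimatorParamMaps(), tvs_model.validationMetrics):
    print({p.name: v for p, v in params.items()}, m)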
Example #4
def get_pipeline():

    # Hard Coded Labels (original texts only):
    auth_hard_lbl = AuthorLabeler(inputCol='author', outputCol='author_label')
    ttl_hard_lbl = TitleLabeler(inputCol='title', outputCol='title_label')

    # Labels
    author_labeler = StringIndexer(inputCol="author", outputCol="author_id")
    title_labeler = StringIndexer(inputCol="title", outputCol="title_id")
    vector_ider = VectorAssembler(
                  inputCols=["author_id", "title_id", "excerpt_number"],
                  outputCol="id_vector")

    tokenizer = SpacyTokenizer(inputCol='excerpt', outputCol='words')

    # TF-IDF
    countvec = CountVectorizer(inputCol=tokenizer.getOutputCol()
                              , outputCol='termfreq')
    idf = IDF(inputCol=countvec.getOutputCol(), outputCol='tfidf')

    # Word2Vec
    word2vec = Word2Vec(vectorSize=250, minCount=2
                        , inputCol=tokenizer.getOutputCol(), outputCol="w2v")
    w2v_2d = Word2Vec(vectorSize=2, minCount=2
                        , inputCol=tokenizer.getOutputCol(), outputCol="w2v_2d")

    # TODO: Include Metadata
    # char_count =
    # word_count =
    # sent_count =
    # para_count =

    # TODO: Play with n-grams
    # NGram(n=2, inputCol=tokenizer.getOutputCol(), outputCol="2_gram")
    # NGram(n=3, inputCol=tokenizer.getOutputCol(), outputCol="3_gram")
    # NGram(n=4, inputCol=tokenizer.getOutputCol(), outputCol="4_gram")
    # NGram(n=5, inputCol=tokenizer.getOutputCol(), outputCol="5_gram")

    pipeline = Pipeline(stages=[author_labeler, title_labeler, vector_ider,
                                tokenizer, countvec, idf, word2vec, w2v_2d])

    return pipeline
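A minimal usage sketch for the returned (unfitted) pipeline, assuming a hypothetical `excerpts_df` with `author`, `title`, `excerpt` and a numeric `excerpt_number` column:

# hypothetical call site for get_pipeline
feature_model = get_pipeline().fit(excerpts_df)
featurized = feature_model.transform(excerpts_df)
featurized.select("id_vector", "tfidf", "w2v", "w2v_2d").show(5)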
Example #5
def benchmark_body_pipeline(cleaned_dataframe, stopwordlist=None):
    """NLP pipeline. Tokenizes, removes stopwords, and computes TF-IDF
    Returns transformed data as 'features' and the vocabulary of words."""

    tokenizer = Tokenizer(inputCol="Text", outputCol="Text_tokens")
    if stopwordlist:
        stop_remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                        outputCol="Text_tokens_stopped",
                                        stopWords=stopwordlist)
    else:
        stop_remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                        outputCol="Text_tokens_stopped")

    count_vect = CountVectorizer(inputCol=stop_remover.getOutputCol(),
                                 outputCol="Text_counts_raw")
    idf = IDF(inputCol=count_vect.getOutputCol(), outputCol="features")

    pipeline = Pipeline(stages=[tokenizer, stop_remover, count_vect, idf])
    model = pipeline.fit(cleaned_dataframe)
    featurized_data = model.transform(cleaned_dataframe)

    return featurized_data, model.stages[-2].vocabulary
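A minimal usage sketch, assuming a cleaned DataFrame `clean_df` with a string column named `Text`; the extra stopwords are only illustrative:

# hypothetical call site for benchmark_body_pipeline
custom_stopwords = StopWordsRemover.loadDefaultStopWords("english") + ["rt", "amp"]
features_df, vocab = benchmark_body_pipeline(clean_df, stopwordlist=custom_stopwords)
print("vocabulary size:", len(vocab))
features_df.select("features").show(3, truncate=False)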
Example #6

# Import json objects from tar file
opinion_df = import_dataframe(spark, 'opinion')
docket_df = import_dataframe(spark, 'docket')
cluster_df = import_dataframe(spark, 'cluster')

# Setup pipeline for adding ML features - tokens, stems, n-grams, tf, tfidf, word2vec
# tokenizer = Tokenizer(inputCol='parsed_text', outputCol='tokens')
tokenizer = RegexTokenizer(inputCol="parsed_text", outputCol="raw_tokens", pattern="\\W", minTokenLength=3)
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='tokens_stop')
stemmer = Stemming_Transformer(inputCol=remover.getOutputCol(), outputCol='tokens')
bigram = NGram(inputCol=stemmer.getOutputCol(), outputCol='bigrams', n=2)
trigram = NGram(inputCol=stemmer.getOutputCol(), outputCol='trigrams', n=3)
cv = CountVectorizer(inputCol=stemmer.getOutputCol(), outputCol='token_countvector', minDF=10.0)
idf = IDF(inputCol=cv.getOutputCol(), outputCol='token_idf', minDocFreq=10)
w2v_2d = Word2Vec(vectorSize=2, minCount=2, inputCol=stemmer.getOutputCol(), outputCol='word2vec_2d')
w2v_large = Word2Vec(vectorSize=250, minCount=2, inputCol=stemmer.getOutputCol(), outputCol='word2vec_large')

pipe = Pipeline(stages=[tokenizer, remover, stemmer, cv, idf, w2v_2d, w2v_large])

# Use the pipeline to fit a model
model = pipe.fit(opinion_df)

# Use the model to transform the data
df_transformed = model.transform(opinion_df)

# retrieve the top 10 words for the document; assumes `row` holds one row of the dataframe
# and `opinion_cv_model` is the fitted CountVectorizerModel (model.stages[3] here)
np.array(opinion_cv_model.vocabulary)[row['token_idf'].indices[np.argsort(row['token_idf'].values)]][:-11:-1]

# save and retrieve dataframe
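One possible way to save and re-load the transformed DataFrame, sketched with Parquet (the path is illustrative only):

df_transformed.write.mode('overwrite').parquet('opinions_transformed.parquet')
df_restored = spark.read.parquet('opinions_transformed.parquet')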
Example #7
                  outputCol='body')
filterer = Filterer(key='subreddit',
                    val='body',
                    inputCol='subreddit',
                    outputCol='body',
                    minlength=args.minlength)
tokenizer = RegexTokenizer(inputCol=cleaner.getOutputCol(),
                           outputCol="tokens",
                           pattern="\\W")
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                           outputCol="swr_tokens")
cv = CountVectorizer(inputCol=remover.getOutputCol(),
                     outputCol="tf",
                     minDF=args.mindf,
                     vocabSize=args.vocabsize)
idf = IDF(inputCol=cv.getOutputCol(), outputCol="tfidf")
topkwords = TopKWords(inputCol=idf.getOutputCol(),
                      outputCol='top_words',
                      nwords=args.nwords)
cos_similarity = CosineSimilarity(inputCol='subreddit',
                                  outputCol='norm',
                                  spark=spark)
topksubreddits = TopKSubreddits(inputCol=cos_similarity.getOutputCol(),
                                outputCol='top_subreddits',
                                nsubreddits=args.nsubreddits)

pipeline = Pipeline(stages=[
    extractor, cleaner, filterer, tokenizer, remover, cv, idf, topkwords,
    cos_similarity, topksubreddits
])
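A minimal sketch of applying the pipeline, assuming a hypothetical `comments_df` DataFrame with the columns the extractor and cleaner stages expect:

model = pipeline.fit(comments_df)
result = model.transform(comments_df)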
Example #8
# list of stopwords to be removed from the posts
StopWords = list(set(stopwords.words('english')))

labelIndexer = StringIndexer(inputCol="tags", outputCol="label").fit(train)
bs_text_extractor = BsTextExtractor(inputCol="post", outputCol="untagged_post")
# lowercase instance names avoid shadowing the imported estimator classes
regex_tokenizer = RegexTokenizer(inputCol=bs_text_extractor.getOutputCol(),
                                 outputCol="words",
                                 pattern="[^0-9a-z#+_]+")
stopword_remover = StopWordsRemover(
    inputCol=regex_tokenizer.getOutputCol(),
    outputCol="filtered_words").setStopWords(StopWords)
count_vectorizer = CountVectorizer(inputCol=stopword_remover.getOutputCol(),
                                   outputCol="countFeatures",
                                   minDF=5)
idf = IDF(inputCol=count_vectorizer.getOutputCol(), outputCol="features")
rf = RandomForestClassifier(labelCol="label",
                            featuresCol=idf.getOutputCol(),
                            numTrees=100,
                            maxDepth=4)
idx_2_string = IndexToString(inputCol="prediction", outputCol="predictedValue")
idx_2_string.setLabels(labelIndexer.labels)

# creating the pipeline
pipeline = Pipeline(stages=[
    labelIndexer, bs_text_extractor, regex_tokenizer, stopword_remover,
    count_vectorizer, idf, rf, idx_2_string
])

# fitting the model
model = pipeline.fit(train)
Example #9
def update_text_with_key_ngrams(df, n, seed=42,
                                outputCol="ngram_text",
                                pattern=r"(?!(?<='))\w+"):
    def build_text(words):
        # Turn the bag of words back into a sentence and check whether any of
        # the key_ngrams occurs in it,
        # e.g. bag of words = ["hi", "i", "ralf"] and key_bigram = "i ralf" -->
        # sentence = "hi i ralf", so the key_bigram occurs in it.
        # If a bigram occurs, replace the two words in the sentence with the
        # underscore version of the bigram ("i_ralf")
        sentence = ' '.join(words)
        for ngram in key_ngrams:
            if ngram in sentence:
                sentence = sentence.replace(ngram, ngram.replace(" ", "_"))
        return sentence

    outputs = {
        "tokenizer": "words",
        "ngram": "ngrams",
        "cv": "tf",
        "idf": "tf_idf",
        "build_text_udf": outputCol
    }

    # Build pipeline
    tokenizer = RegexTokenizer(inputCol="text",
                               outputCol=outputs["tokenizer"],
                               pattern=pattern,
                               gaps=False)
    ngram = NGram(n=n,
                  inputCol=tokenizer.getOutputCol(),
                  outputCol=outputs["ngram"])
    cv = CountVectorizer(inputCol=ngram.getOutputCol(),
                         outputCol=outputs["cv"])
    idf = IDF(inputCol=cv.getOutputCol(),
              outputCol=outputs["idf"])
    pipe = Pipeline(stages=[
        tokenizer,  # transform
        ngram,  # transform
        cv,  # fit_transform
        idf  # fit
    ])

    print("\t Computing tf_idf matrix for {}-grams...".format(n))
    pipe_model = pipe.fit(df)  # calls transform on tokenizer & ngram,
    # fit_transform on cv and fit on idf
    vocabulary = np.array(pipe_model.stages[2].vocabulary)
    print("\t\t vocabulary size: {}".format(len(vocabulary)))
    df = pipe_model.transform(df)

    # train test split
    train, _ = df.randomSplit([0.8, 0.2], seed=seed)
    train.persist(StorageLevel.MEMORY_AND_DISK)

    # fit linear SVM
    svc = LinearSVC(maxIter=100,
                    regParam=0.1,
                    featuresCol="tf_idf")
    print("\t Estimating key {}-grams with SVC...".format(n))
    svc_model = svc.fit(train)

    # pick the n-grams with the lowest/highest weights
    print("\t Update text with key {}-grams...".format(n))
    coeffs = svc_model.coefficients.toArray()
    key_ngrams = get_n_extremes_of_a_in_b(coeffs, vocabulary, 50)

    build_text_udf = F.udf(build_text)

    df = df.withColumn(outputs["build_text_udf"],
                       build_text_udf(
                           F.col(tokenizer.getOutputCol())))
    print()
    return df
Example #10
                                           seed=42,
                                           outputCol=outputCol,
                                           pattern=pattern)
print("\n")

## PREDICT LABEL BASED ON TF-IDF OF UPDATED TEXT
print("Computing TF-IDF matrix for updated text...")
tokenizer = RegexTokenizer(inputCol=outputCol,
                           outputCol="words_with_ngrams",
                           pattern=pattern,
                           gaps=False)
stop_words_remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                      outputCol="filtered_words")
cv = CountVectorizer(inputCol=stop_words_remover.getOutputCol(),
                     outputCol="final_tf")
idf = IDF(inputCol=cv.getOutputCol(),
          outputCol="final_tf_idf")

pipe = Pipeline(stages=[
    tokenizer,
    stop_words_remover,
    cv,
    idf
])

reviews_mini = pipe.fit(reviews_mini).transform(reviews_mini)

## Train test split
train, test = reviews_mini.randomSplit([0.8, 0.2], seed=seed)
train.persist(StorageLevel.MEMORY_AND_DISK)
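A possible follow-up, not in the original snippet: fit a classifier on the updated-text TF-IDF features and score it on the held-out split (assumes a numeric `label` column and the usual pyspark.ml classification/evaluation imports):

svc = LinearSVC(maxIter=100, regParam=0.1, featuresCol="final_tf_idf", labelCol="label")
svc_model = svc.fit(train)
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label")
print("test areaUnderROC:", evaluator.evaluate(svc_model.transform(test)))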
Example #11
    para_train = (
        articles_by_paragraph.select('article_id', 'p_index',
                                     'paragraph')  # select unique identifiers
        .where(col('paragraph').isNotNull())  # ignore blank paragraphs
        .withColumn('paragraph', clean_udf('paragraph'))  # clean the text
        .withColumn('paragraph',
                    split(col('paragraph'),
                          ' '))  # split on blank space to tokenize words
        .withColumnRenamed('paragraph',
                           'text')  # rename column to text for pipeline
    )
    para_train.show(5)

    tf = CountVectorizer(inputCol='text',
                         outputCol='tf_result',
                         minDF=0.05,
                         maxDF=0.9)
    idf = IDF(inputCol=tf.getOutputCol(), outputCol='features')
    lda = LDA(k=20, maxIter=10)
    paragraph_pipe = Pipeline(stages=[tf, idf, lda])

    para_model = paragraph_pipe.fit(para_train)

    # models will not overwrite existing ones of the same name
    """import shutil, os
    if os.path.exists("../models/articles_LDA"):
        shutil.rmtree("../models/articles_LDA")"""

    para_model.save("../models/articles_LDA_")
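A hypothetical follow-up: reload the saved pipeline and inspect the learned topics (the stage indices follow the `[tf, idf, lda]` ordering above):

from pyspark.ml import PipelineModel

loaded = PipelineModel.load("../models/articles_LDA_")
vocab = loaded.stages[0].vocabulary           # CountVectorizerModel
topics = loaded.stages[2].describeTopics(5)   # LDAModel
topics.show(truncate=False)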
Example #12
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.ml.clustering import LDA

spark = SparkSession.builder.getOrCreate()

data = pd.read_csv('https://raw.githubusercontent.com/DaiZack/MLdatasets/master/imdb500.csv')
df = spark.createDataFrame(data)
textCol = 'review'
selfstopwords = ['br']
numOfTopics = 10
numOfKeywords = 5

tokenizer = RegexTokenizer(inputCol=textCol, outputCol='token', pattern='\\W+')
stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='clean0')
stopwords1 = StopWordsRemover(inputCol=stopwords.getOutputCol(), stopWords=selfstopwords,outputCol='clean')
cv = CountVectorizer(inputCol=stopwords1.getOutputCol(), outputCol='cv')
idf = IDF(inputCol=cv.getOutputCol(), outputCol='idf')
lda = LDA(featuresCol=idf.getOutputCol(), k=numOfTopics, maxIter=10)

pipe1 = Pipeline(stages=[tokenizer, stopwords,stopwords1,cv,idf, lda])

model = pipe1.fit(df)
output = model.transform(df)

def topicsTerms(vocab, termindices, leng=None):
  if not leng:
    return [vocab[t] for t in termindices]
  return [vocab[t] for t in termindices][:leng]

def topicsTerm_udf(vocab, leng=None):
  return udf(lambda x: topicsTerms(vocab,x, leng))
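A hypothetical use of the two helpers above: attach the top keywords to each topic (stage 3 is the CountVectorizerModel and stage 5 the LDAModel in the fitted pipeline):

vocab = model.stages[3].vocabulary
topics = model.stages[5].describeTopics(numOfKeywords)
topics = topics.withColumn('topicWords',
                           topicsTerm_udf(vocab, numOfKeywords)('termIndices'))
topics.show(truncate=False)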
Example #13
# %%
dataSet = dataSet.withColumn('class', dataSet['class'].cast(IntegerType()))
dataSet = dataSet.select('class', 'cleanReview').withColumnRenamed(
    'cleanReview', 'reviews')

# %%
trainDF, testDF = dataSet.randomSplit([0.8, 0.2])
trainDF.show()
testDF.show()

# %%
tokenizer = Tokenizer(inputCol="reviews", outputCol="tokens")
countVector = CountVectorizer(inputCol=tokenizer.getOutputCol(),
                              outputCol='features')
idf = IDF(inputCol=countVector.getOutputCol(), outputCol='idf')
pipeline = Pipeline(stages=[tokenizer, countVector, idf])
pipelineModel = pipeline.fit(trainDF)

# %%
pTrainDF = pipelineModel.transform(trainDF)
pTestDF = pipelineModel.transform(testDF)

# %%
evaluator = MulticlassClassificationEvaluator(labelCol="class",
                                              predictionCol="prediction",
                                              metricName="f1")
lr = LogisticRegression(featuresCol=idf.getOutputCol(), labelCol='class')
lrModel = lr.fit(pTrainDF)
predictionsLR = lrModel.transform(pTestDF)
evaluator.evaluate(predictionsLR)
Example #14
sampled.groupby('label').count().toPandas()


# # Data Ingestion and Vectorization

# In[18]:


# Tokenize the training data - parse the URL string into words
regexTokenizer = RegexTokenizer(inputCol="url", outputCol="Words", pattern="\\W")

# CountVectorizer converts the words into feature vectors - this is used as it gives better results
countVectors = CountVectorizer(inputCol=regexTokenizer.getOutputCol(), outputCol="rawfeatures", vocabSize=10000, minDF=5)

# IDF rescales the raw term counts by inverse document frequency
idf = IDF(inputCol=countVectors.getOutputCol(), outputCol="features") 

# create the pipeline
pipeline = Pipeline(stages=[regexTokenizer, countVectors, idf ])


# Fit the pipeline to the training documents.
# Pass 'sampled' here to use the balanced dataset
pipelineFit = pipeline.fit(sampled)

# Transform the dataset with the fitted pipeline
# Pass 'sampled' here to use the balanced dataset
dataset = pipelineFit.transform(sampled)

# randomly split the dataset into training (80%) and testing (20%) sets
(trainingData, testData) = dataset.randomSplit([0.8, 0.2], seed = 100)
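A possible next step, not part of the original cell: train and evaluate a classifier on the vectorized URLs (assumes a numeric `label` column and the usual pyspark.ml classification/evaluation imports):

lr = LogisticRegression(maxIter=20, featuresCol="features", labelCol="label")
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
evaluator = BinaryClassificationEvaluator(labelCol="label")
print("areaUnderROC:", evaluator.evaluate(predictions))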
Example #15
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.regression import LinearRegression, GBTRegressor
from pyspark.ml import Pipeline

tokenizer = Tokenizer(inputCol='reviewText', outputCol='reviewWords')
stop_words_remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='reviewWordsWithoutTrash')
vectorizer = CountVectorizer(inputCol=stop_words_remover.getOutputCol(), outputCol="word_vector", minDF=150)
lr = LinearRegression(featuresCol=vectorizer.getOutputCol(), labelCol='overall')

pipeline = Pipeline(stages=[tokenizer, stop_words_remover, vectorizer, lr])
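A minimal usage sketch, assuming a hypothetical `reviews_df` with a `reviewText` column and a numeric `overall` rating column:

from pyspark.ml.evaluation import RegressionEvaluator

train_df, test_df = reviews_df.randomSplit([0.8, 0.2], seed=42)
model = pipeline.fit(train_df)
rmse = RegressionEvaluator(labelCol="overall", predictionCol="prediction",
                           metricName="rmse").evaluate(model.transform(test_df))
print("test RMSE:", rmse)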
Example #16
remover = StopWordsRemover(inputCol='words', outputCol='filtered_words')
text = remover.transform(text)
text.show(5)

ngramer = NGram(n=2, inputCol='filtered_words', outputCol='ngrams')
text = ngramer.transform(text)
text.show(5)

count_vec = CountVectorizer(inputCol=ngramer.getOutputCol(),
                            outputCol='ft_features')
count_vec_model = count_vec.fit(text)
vocab = count_vec_model.vocabulary
text = count_vec_model.transform(text)
text.show(5)

idf = IDF(inputCol=count_vec.getOutputCol(), outputCol='features')
text = idf.fit(text).transform(text)

lda = LDA(featuresCol=idf.getOutputCol(), k=5, maxIter=10)
lda_model = lda.fit(text)

topics = lda_model.describeTopics()
# topics_words = topics.rdd.map(lambda x: x['termIndices']).map(lambda x:[vocab[i] for i in x]).collect()
get_topics_words = F.udf(lambda x: [vocab[i] for i in x],
                         ArrayType(StringType()))
topics = topics.withColumn('topic_words',
                           get_topics_words(F.col('termIndices')))
topics.show()

text = lda_model.transform(text)
text.show(5)