Example #1
def test_simplepipe():
    df = SPARK_SESSION.sparkContext.\
        parallelize([['this is a test'], ['this is another test']]).\
        toDF(schema=types.StructType().add('sentence', types.StringType()))

    pl = feature.Tokenizer().setInputCol('sentence') | \
        feature.CountVectorizer() | \
        feature.IDF()
    pl_model = pl.fit(df)
    pl_model.transform(df).count()
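A note on the example above: the `|` operator is the project's own pipeline-composition shorthand, not stock PySpark. As a rough point of reference, the same tokenize / count / IDF chain written with a plain pyspark.ml Pipeline might look like the sketch below (the local SparkSession and the intermediate column names are assumptions for illustration):

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, CountVectorizer, IDF

spark = SparkSession.builder.master('local[1]').getOrCreate()
df = spark.createDataFrame([('this is a test',), ('this is another test',)],
                           ['sentence'])

# The same three stages, wired together explicitly.
pipeline = Pipeline(stages=[
    Tokenizer(inputCol='sentence', outputCol='words'),
    CountVectorizer(inputCol='words', outputCol='tf'),
    IDF(inputCol='tf', outputCol='features'),
])
pipeline.fit(df).transform(df).select('sentence', 'features').show(truncate=False)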
Example #2
def test_ml_pipe():
    df = sc. \
         parallelize([Row(sentence='this is a test', label=0.),
                     Row(sentence='this is another test', label=1.)]). \
         toDF()

    pl = feature.Tokenizer().setInputCol('sentence') | feature.CountVectorizer()
    ml = pl | classification.LogisticRegression()

    ml_model = ml.fit(df)
    assert_equal(ml_model.transform(df).count(), 2)
Example #3
def test_stackedml_pipe():
    df = SPARK_SESSION.sparkContext. \
        parallelize([Row(sentence='this is a test', label=0.),
                     Row(sentence='this is another test', label=1.)]).\
        toDF()

    pl = feature.Tokenizer().setInputCol(
        'sentence') | feature.CountVectorizer()
    ml = pl | (classification.LogisticRegression(),) | feature.VectorAssembler() | \
        classification.\
        RandomForestClassifier()

    ml_model = ml.fit(df)
    assert_equal(ml_model.transform(df).count(), 2)
Example #4
File: ml.py  Project: vivshri/Optimus
def logistic_regression_text(df, input_col):
    """
    Runs a logistic regression on a text column of the input DataFrame.
    :param df: PySpark DataFrame to analyze.
    :param input_col: Text column used as the model input.
    :return: Tuple of (DataFrame with predictions appended, fitted pipeline model).
    """

    assert_spark_df(df)

    pl = feature.Tokenizer().setInputCol(input_col) | feature.CountVectorizer()
    ml = pl | classification.LogisticRegression()
    ml_model = ml.fit(df)
    df_model = ml_model.transform(df)
    return df_model, ml_model
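A hedged usage sketch for the helper above. The SparkSession and sample data are illustrative, the helper's pipes-style dependencies are assumed to be importable, and the 'prediction' column name follows the pyspark.ml LogisticRegression default:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[1]').getOrCreate()
reviews = spark.createDataFrame(
    [('I love this product', 1.0), ('terrible, would not buy again', 0.0)],
    ['review_text', 'label'])

# The returned pair is (transformed DataFrame, fitted pipeline model).
df_model, ml_model = logistic_regression_text(reviews, input_col='review_text')
df_model.select('review_text', 'label', 'prediction').show(truncate=False)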
Example #5
def canonicaltokens(df, inputColumn, outputColumn):
    """
    Turn an input column of strings into canonical form and add it to the
    DataFrame as an output column of tokens.
    """

    # Trim whitespace, then remove the spaces in and around runs of
    # space-separated single characters (e.g. spelled-out initials).
    newname = df.withColumn(
        "cleanname",
        f.regexp_replace(
            f.regexp_replace(f.rtrim(f.ltrim(f.col(inputColumn))),
                             r" (\w) (\w) ", "$1$2"),
            r"(\w) (\w) (\w)$", "$1$2$3"))

    # Tokenize the cleaned string, then strip stop words.
    newtokenizer = mlf.Tokenizer(inputCol="cleanname", outputCol="words")
    chtokenized = newtokenizer.transform(newname).drop("cleanname")

    stopwordremover = mlf.StopWordsRemover(inputCol="words", outputCol=outputColumn)
    canonicalname = stopwordremover.transform(chtokenized).drop("words")

    return canonicalname
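A small usage sketch for canonicaltokens, assuming the aliases it relies on are the usual ones (f = pyspark.sql.functions, mlf = pyspark.ml.feature) and a local SparkSession; the sample names are made up:

from pyspark.sql import SparkSession
import pyspark.sql.functions as f      # alias assumed by canonicaltokens
import pyspark.ml.feature as mlf       # alias assumed by canonicaltokens

spark = SparkSession.builder.master('local[1]').getOrCreate()
names = spark.createDataFrame(
    [('  The Home Depot  ',), ('International Business Machines Corp',)],
    ['raw_name'])

canonicaltokens(names, 'raw_name', 'name_tokens').show(truncate=False)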
Example #6
def test_multi_model_pipe():
    df = SPARK_SESSION.sparkContext. \
        parallelize([Row(sentence='this is a test', label=0.),
                     Row(sentence='this is another test', label=1.)]).\
        toDF()

    pl = feature.Tokenizer().setInputCol(
        'sentence') | feature.CountVectorizer()
    models = (classification.LogisticRegression(),
              classification.RandomForestClassifier(),
              classification.LogisticRegression().setElasticNetParam(0.2),
              classification.GBTClassifier())
    ml = pl | models | feature.VectorAssembler().setOutputCol('final_features') | \
        classification.LogisticRegression()

    ml_model = ml.fit(df)
    assert_equal(ml_model.transform(df).count(), 2)
Example #7
def n_gram(df, input_col, n=2):
    """
    Builds combined unigram and n-gram tf-idf features for a text column of a Spark DataFrame.
    :param df: PySpark DataFrame to analyze.
    :param input_col: Column to analyze.
    :param n: Number of elements per n-gram, >= 1.
    :return: Tuple of (DataFrame with the computed features, fitted pipeline model).
    """

    is_dataframe(df)

    tokenizer = feature.Tokenizer().setInputCol(input_col) | feature.StopWordsRemover()
    count = feature.CountVectorizer()
    gram = feature.NGram(n=n) | feature.CountVectorizer()
    tf = tokenizer | (count, gram) | feature.VectorAssembler()
    tfidf = tf | feature.IDF().setOutputCol('features')

    tfidf_model = tfidf.fit(df)
    df_model = tfidf_model.transform(df)
    return df_model, tfidf_model
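The (count, gram) tuple above fans the token stream into two branches whose count vectors are concatenated by VectorAssembler before the IDF stage. A sketch of the same layout using a plain pyspark.ml Pipeline (all column names are illustrative):

from pyspark.ml import Pipeline
from pyspark.ml.feature import (Tokenizer, StopWordsRemover, CountVectorizer,
                                NGram, VectorAssembler, IDF)

def ngram_tfidf_pipeline(input_col, n=2):
    # Shared token stream.
    tokenizer = Tokenizer(inputCol=input_col, outputCol='words')
    remover = StopWordsRemover(inputCol='words', outputCol='filtered')
    # Branch 1: unigram counts.  Branch 2: n-gram counts.
    unigram_cv = CountVectorizer(inputCol='filtered', outputCol='unigram_tf')
    ngrams = NGram(n=n, inputCol='filtered', outputCol='ngrams')
    ngram_cv = CountVectorizer(inputCol='ngrams', outputCol='ngram_tf')
    # Concatenate both count vectors, then apply IDF weighting.
    assembler = VectorAssembler(inputCols=['unigram_tf', 'ngram_tf'],
                                outputCol='tf')
    idf = IDF(inputCol='tf', outputCol='features')
    return Pipeline(stages=[tokenizer, remover, unigram_cv, ngrams, ngram_cv,
                            assembler, idf])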
Example #8
def getTFIDF(closest):
    grouped_clusters = closest.groupBy("prediction")\
        .agg(F.collect_list("split_aspect").alias("text"))\
        .withColumn("text", F.concat_ws(" ", "text"))

    tokenizer = feat.Tokenizer(inputCol="text", outputCol="words")
    wordsData = tokenizer.transform(grouped_clusters)

    # Get term frequencies. CountVectorizer (unlike HashingTF) keeps a vocabulary,
    # so feature indices can be mapped back to words.
    cv = feat.CountVectorizer(inputCol="words", outputCol="rawFeatures").fit(wordsData)
    featurizedData = cv.transform(wordsData)

    # save vocab object
    vocab = cv.vocabulary

    # compute idf
    idf = feat.IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    tfidf = idfModel.transform(featurizedData)

    return tfidf, vocab
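Because getTFIDF returns both the tf-idf DataFrame and the CountVectorizer vocabulary, the feature indices can be mapped back to words. A minimal sketch (column names match the function above; the helper name and `k` are arbitrary):

def top_terms_per_cluster(tfidf, vocab, k=10):
    # Each 'features' entry is a SparseVector whose indices line up with vocab.
    result = {}
    for row in tfidf.select("prediction", "features").collect():
        vec = row["features"]
        ranked = sorted(zip(vec.indices, vec.values), key=lambda t: -t[1])[:k]
        result[row["prediction"]] = [vocab[int(i)] for i, _ in ranked]
    return result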
Example #9
def test_unigram_and_bigram():
    df = SPARK_SESSION.sparkContext. \
        parallelize([['this is the best sentence ever'],
                     ['this is however the worst sentence available']]). \
        toDF(schema=types.StructType().add('sentence', types.StringType()))
    import requests
    stop_words = requests.get(
        'http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words'
    ).text.split()

    tokenizer = feature.Tokenizer().setInputCol(
        'sentence') | feature.StopWordsRemover(stopWords=stop_words)
    unigram = feature.CountVectorizer()
    bigram = feature.NGram() | feature.CountVectorizer()
    trigram = feature.NGram(n=3) | feature.CountVectorizer()
    tf = tokenizer | (unigram, bigram, trigram) | feature.VectorAssembler()
    tfidf = tf | feature.IDF().setOutputCol('features')

    tfidf_model = tfidf.fit(df)
    assert_equal(
        tfidf_model.transform(df).select('sentence', 'features').count(), 2)
Example #10
def create_vocab(df):
	"""Create a vocabulary from a dataframe.
	Also removes some special tokens.
	
	Args:
		df: A dataframe with columns'processed_abstract'
		 and 'processed_full_text'

	Return:
		vocab: A wordlist sorted by frequency
	"""
	concat_udf = F.udf(
		lambda cols: " ".join([col for col in cols]),
		spark_types.StringType())
	df = df.withColumn(
		'all_text',
		concat_udf(F.array(
			'processed_abstract',
			'processed_full_text')))
	tokenizer = ml_feature.Tokenizer(
		inputCol='all_text',
		outputCol='tokens')
	df = tokenizer.transform(df)
	cv = ml_feature.CountVectorizer(
		inputCol='tokens',
		outputCol='vectors',
		vocabSize=200000)
	cv_model = cv.fit(df)

	# cv_model.vocabulary is sorted by descending term frequency
	vocab = cv_model.vocabulary

	# Drop the sentence/section marker tokens (module-level constants).
	vocab.remove(SENT_START)
	vocab.remove(SENT_END)
	vocab.remove(SEC_START)
	vocab.remove(SEC_END)

	return vocab
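A rough way to exercise create_vocab, assuming the aliases it uses resolve to the usual modules (F = pyspark.sql.functions, spark_types = pyspark.sql.types, ml_feature = pyspark.ml.feature). The marker values below are placeholders, not the project's real constants, and the sample rows are made up:

from pyspark.sql import SparkSession
import pyspark.sql.functions as F          # alias assumed by create_vocab
import pyspark.sql.types as spark_types    # alias assumed by create_vocab
import pyspark.ml.feature as ml_feature    # alias assumed by create_vocab

# Placeholder marker tokens; the project's real constants are defined elsewhere.
SENT_START, SENT_END = '<s>', '</s>'
SEC_START, SEC_END = '<sec>', '</sec>'

spark = SparkSession.builder.master('local[1]').getOrCreate()
papers = spark.createDataFrame(
    [(f'{SEC_START} {SENT_START} graph neural networks {SENT_END} {SEC_END}',
      f'{SEC_START} {SENT_START} we study graph neural networks {SENT_END} {SEC_END}')],
    ['processed_abstract', 'processed_full_text'])

print(create_vocab(papers)[:10])   # most frequent tokens, markers removed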