def train(allHex, labels, hashFiles, sc, sqlc, path):
    bytesFiles = hashFiles.map(lambda x: "gs://uga-dsp/project2/data/bytes/" + x + ".bytes")

    def fun(accum, x):
        return accum + ',' + x

    bytesFileString = bytesFiles.reduce(fun)
    rdd1 = sc.wholeTextFiles(bytesFileString, 20)
    bytesRdd = rdd1.map(lambda x: x[1].split()) \
        .map(lambda x: [word for word in x if word in allHex.value]) \
        .zipWithIndex() \
        .map(lambda x: (x[1], x[0]))
    ngramFrame = sqlc.createDataFrame(bytesRdd, ["did", "1grams"])
    twoGram = NGram(n=2, inputCol="1grams", outputCol="2grams")
    ngramFrame = twoGram.transform(ngramFrame)
    # combine unigrams and bigrams into a single feature column
    featuresDF = ngramFrame.rdd.map(
        lambda x: Row(did=x['did'], docFeatures=x['1grams'] + x['2grams'])).toDF()
    cv = CountVectorizer(inputCol="docFeatures", outputCol="features", vocabSize=1000)
    featureFitModel = cv.fit(featuresDF)
    featuresCV = featureFitModel.transform(featuresDF)
    labelRdd = labels.zipWithIndex().map(lambda x: (x[1], x[0]))
    labelFrame = labelRdd.toDF(["did", "label"])
    trainData = featuresCV.join(labelFrame, "did")
    trainData.persist(StorageLevel(True, True, False, False, 1))
    saveData(trainData, path)
    trainData.show()
    return featureFitModel
def shringles(x, fileName):
    # tokenize and compute n-gram shingles; appends the result to a
    # module-level shringleList defined elsewhere
    tokenizer = RegexTokenizer(inputCol="value", outputCol="words", pattern="\\W")
    ngrams = NGram(n=x, inputCol="words", outputCol="kshringles")
    shringleList.append(ngrams.transform(tokenizer.transform(read(fileName))))
def test(allHex, hashFiles, sc, sqlc, path, featureFitModel):
    bytesFiles = hashFiles.map(lambda x: "gs://uga-dsp/project2/data/bytes/" + x + ".bytes")

    def fun(accum, x):
        return accum + ',' + x

    bytesFileString = bytesFiles.reduce(fun)
    rdd1 = sc.wholeTextFiles(bytesFileString, 20)
    bytesRdd = rdd1.map(lambda x: x[1].split()) \
        .map(lambda x: [str(int(word, 16)) for word in x if word in allHex.value]) \
        .zipWithIndex() \
        .map(lambda x: (x[1], x[0]))
    Vec = bytesRdd.map(lambda x: (x[0], createVector(x[1])))
    sparseVec = Vec.map(lambda x: (x[0], SparseVector(256, numpy.nonzero(x[1])[0], x[1][x[1] > 0])))
    # NGram needs an array<string> column, so the frame is built from the token
    # lists (bytesRdd); sparseVec is computed but not used by the n-gram pipeline
    ngramFrame = sqlc.createDataFrame(bytesRdd, ["did", "1grams"])
    twoGram = NGram(n=2, inputCol="1grams", outputCol="2grams")
    ngramFrame = twoGram.transform(ngramFrame)
    featuresDF = ngramFrame.rdd.map(
        lambda x: Row(did=x['did'], docFeatures=x['1grams'] + x['2grams'])).toDF()
    featuresCV = featureFitModel.transform(featuresDF)
    testData = featuresCV.drop('docFeatures')
    testData.persist(StorageLevel(True, True, False, False, 1))
    saveData(testData, path)
    testData.show()
def ngram(dataframe, column, x):
    tokens = Tokenizer(inputCol=column, outputCol='tokens')
    nn = NGram(n=x, inputCol='tokens', outputCol='ngrams')
    b = tokens.transform(dataframe)
    a = nn.transform(b)
    # show() returns None, so keep the selected frame and display it separately
    final = a.select(['tokens', 'ngrams'])
    final.show(4)
    return final
def extract_features(self, train_rdd=None, test_rdd=None):
    """
    train_rdd: type rdd, the raw rdd of train data (text content, label)
    test_rdd: type rdd, the raw rdd of test data (text content, doc_id)
    return: type data frame, a data frame where each record contains the extracted features
    """
    print('****************************')
    print('Feature Extraction: TF-IDF\n')
    train_raw_df = train_rdd.map(lambda row: (self.convert(row[0]), row[1])).toDF(
        ['words', 'label'])
    test_raw_df = test_rdd.map(lambda row: (self.convert(row[0]), row[1])).toDF(
        ['words', 'doc_id'])
    ngram = NGram(n=2, inputCol="words", outputCol="ngrams")
    train_ngram_df = ngram.transform(train_raw_df).drop('words')
    test_ngram_df = ngram.transform(test_raw_df).drop('words')
    hashing_tf = HashingTF(inputCol='ngrams', outputCol='raw_features')
    train_raw_featured_data = hashing_tf.transform(train_ngram_df).drop('ngrams')
    test_raw_featured_data = hashing_tf.transform(test_ngram_df).drop('ngrams')
    idf = IDF(inputCol='raw_features', outputCol='features')
    idf_model = idf.fit(train_raw_featured_data)
    train_df = idf_model.transform(train_raw_featured_data).drop('raw_features')
    test_df = idf_model.transform(test_raw_featured_data).drop('raw_features')
    return (train_df, test_df)
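# A minimal usage sketch for extract_features above, assuming a running
# SparkSession `spark`, an instance `extractor` of the enclosing class, and
# that self.convert tokenizes raw text into a word list. Toy data only.
train_rdd = spark.sparkContext.parallelize([("spark makes ngrams easy", 1.0)])
test_rdd = spark.sparkContext.parallelize([("ngrams make features", "doc-0")])
# train_df, test_df = extractor.extract_features(train_rdd=train_rdd, test_rdd=test_rdd)
# train_df has columns [label, features]; test_df has [doc_id, features]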
def extract_collocations(records, num_collocations, collocation_window):
    """Extracts the most common collocations present in the records.

    Params:
    - records (pyspark.rdd.RDD): The tokenized and lemmatized records from the JSON file
    - num_collocations (int): The number of collocations to show
    - collocation_window (int): The text window within which to search for collocations.

    Returns:
    - best_collocations (list<tuple<str, int>>): The highest scored collocations present
      in the records, with their frequency of occurrence in the dataset.
    """
    # @see: https://spark.apache.org/docs/2.2.0/ml-features.html#n-gram
    from pyspark.ml.feature import NGram
    data_frame = records.map(lambda record: Row(record[constants.VALUE])).toDF(['words'])
    ngram_model = NGram(n=2, inputCol='words', outputCol='ngrams')
    ngram_data_frame = ngram_model.transform(data_frame)
    ngram_rdd = ngram_data_frame.select('ngrams').rdd
    ngram_rdd = ngram_rdd.flatMap(lambda row: row['ngrams'])\
        .map(lambda ngram: (ngram.encode('utf-8'), 1))\
        .reduceByKey(add)\
        .sortBy(lambda bigram_with_count: bigram_with_count[1], ascending=False)
    rdd_show(ngram_rdd)
    frequent_collocations = ngram_rdd.take(num_collocations)
    return frequent_collocations
def build_pipeline(classifier='rf', max_depth=7):
    """ creates a pipeline of functionalities to be applied on the training set """
    # Training: tokenize, remove stop words, compute n-grams, compute frequencies
    tokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern='\w{8}|\s')
    remover = StopWordsRemover(inputCol='words', outputCol='filtered', stopWords=['??'])
    ngram_2 = NGram(n=2, inputCol='filtered', outputCol='ngrams')
    ngram_3 = NGram(n=3, inputCol='filtered', outputCol='ngrams')
    hashingTF = HashingTF(inputCol="ngrams", outputCol="features")
    word2vec = Word2Vec(inputCol='ngrams', outputCol='features')
    if classifier == 'rf':
        clf = RandomForestClassifier(maxDepth=max_depth)
        stages = [tokenizer, remover, ngram_2, hashingTF, clf]
    elif classifier == 'nb':
        clf = NaiveBayes(smoothing=1)
        stages = [tokenizer, remover, ngram_3, hashingTF, clf]
    elif classifier == 'lr':
        clf = LogisticRegression()
        stages = [tokenizer, remover, ngram_2, word2vec, clf]
    else:
        raise ValueError("classifier must be 'rf', 'nb', or 'lr'.")
    return stages
def Ngram_feature(N, feature_rdd):
    '''
    Extract and count N-grams. Keep only the top 1000 n-gram features if it's 2-gram or more.
    Input:
        feature_rdd : [(<hash1>,<feature1>), (<hash1>,<feature2>), ..., (<hashN>,<featureK>)]
    Output:
        freq_ngram_count_rdd : [((<hash>,<ngram feature>),cnt), ...]
    '''
    feature_rdd = feature_rdd.groupByKey().map(lambda x: (x[0], list(x[1])))
    df = spark.createDataFrame(feature_rdd).toDF("file_names", "features")
    ngram = NGram(n=N, inputCol="features", outputCol="ngrams")
    ngramDataFrame = ngram.transform(df)
    ngram_rdd = ngramDataFrame.rdd.map(tuple).map(lambda x: (x[0], x[2])).flatMapValues(lambda x: x)
    ngram_count_rdd = ngram_rdd.map(lambda x: ((x), 1)).reduceByKey(add)
    freq_ngram_count_rdd = ngram_count_rdd
    if not N == 1:
        # [(<ngram feature>,cnt), ...]
        topN_ngram_count_rdd = freq_ngram_count_rdd.map(lambda x: (x[0][1], x[1])).reduceByKey(add)
        # [((<ngram feature>,cnt),index), ...]
        topN_ngram_count_rdd = topN_ngram_count_rdd.sortBy(lambda x: x[1], ascending=False).zipWithIndex()
        length = topN_ngram_count_rdd.count()
        # top [(<ngram feature>,cntSum), ...]
        topN_ngram_count_rdd = topN_ngram_count_rdd.filter(lambda x: x[1] < 1000).map(lambda x: x[0])
        # freq [(<ngram feature>,(<hash>,cnt)), ...]
        freq_ngram_count_rdd = freq_ngram_count_rdd.map(lambda x: (x[0][1], (x[0][0], x[1])))
        # [(<ngram feature>,(cntSum,(<hash>,cnt))), ...]
        freq_ngram_count_rdd = topN_ngram_count_rdd.join(freq_ngram_count_rdd).map(
            lambda x: ((x[1][1][0], x[0]), x[1][1][1]))
    return freq_ngram_count_rdd
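# A minimal usage sketch for Ngram_feature, assuming a SparkSession bound to the
# module-level name `spark` (as the function expects) and `add` from operator.
# The toy pairs below are hypothetical.
pairs = spark.sparkContext.parallelize(
    [("hash1", "mov"), ("hash1", "push"), ("hash1", "pop")])
# bigram_counts = Ngram_feature(2, pairs)
# bigram_counts.collect() -> e.g. [(('hash1', 'mov push'), 1), (('hash1', 'push pop'), 1)]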
def feature_engineering(class_balancedDf):
    # N-Gram
    ngram = NGram(n=2, inputCol="lemmatized", outputCol="ngrams")
    ngramDataFrame = ngram.transform(class_balancedDf)
    # Hashing TF
    hashingTF = HashingTF(inputCol="ngrams", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(ngramDataFrame)
    # IDF
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    # K-Means
    kmeans = KMeans().setK(6).setSeed(1)
    kmodel = kmeans.fit(rescaledData).transform(rescaledData)
    # LDA
    lda = LDA(k=10, maxIter=10)
    ldamodel = lda.fit(kmodel).transform(kmodel)
    # change the label column to int
    data = ldamodel.withColumn(
        "label", ldamodel.label.cast("Integer")).drop("prediction")
    return data
def create_ngram(self, df, n, input_col, output_col='ngrams'):
    "Generate N-Gram -> https://spark.apache.org/docs/2.2.0/ml-features.html#n-gram"
    from pyspark.ml.feature import NGram
    ngram = NGram(n=n, inputCol=input_col, outputCol=output_col)
    ngram_df = ngram.transform(df)
    return ngram_df
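# Hypothetical usage sketch for create_ngram, assuming a SparkSession `spark`
# and that the method lives on some helper object `fe`.
tokens_df = spark.createDataFrame([(["a", "b", "c"],)], ["tokens"])
# bigrams_df = fe.create_ngram(tokens_df, n=2, input_col="tokens")
# bigrams_df.first()["ngrams"] -> ['a b', 'b c']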
def test_ngram(self):
    dataset = self.spark.createDataFrame([Row(input=["a", "b", "c", "d", "e"])])
    ngram0 = NGram(n=4, inputCol="input", outputCol="output")
    self.assertEqual(ngram0.getN(), 4)
    self.assertEqual(ngram0.getInputCol(), "input")
    self.assertEqual(ngram0.getOutputCol(), "output")
    transformedDF = ngram0.transform(dataset)
    self.assertEqual(transformedDF.head().output, ["a b c d", "b c d e"])
def calculate_vectors(data, n=2, binary=False):
    ngram = NGram(n=n, inputCol="sequence", outputCol="ngrams")
    ngramDataFrame = ngram.transform(data)
    ngrams = ngramDataFrame.select("ngrams")
    cvectorizer = CountVectorizer(inputCol="ngrams", outputCol="vec", binary=binary)
    model = cvectorizer.fit(ngrams)
    return model.transform(ngrams).select("vec")
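# Minimal sketch of calling calculate_vectors, assuming a SparkSession `spark`.
# The input DataFrame needs an array<string> column named "sequence".
seq_df = spark.createDataFrame([(["A", "T", "G", "C"],)], ["sequence"])
# vec_df = calculate_vectors(seq_df, n=2, binary=True)
# vec_df.show()  # one sparse count vector per row, built over the bigrams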
def opcode_ngram(df_opcode, N):
    """
    Generates opcode n-grams from the opcode data frame.
    Returns the opcode n-grams in RDD((filename, n-gram), total_counts)
    """
    ngrams = NGram(n=N, inputCol="opcode", outputCol="ngrams")
    df_ngrams = ngrams.transform(df_opcode)
    rdd_ngrams = df_ngrams.select("filename", "ngrams").rdd.map(tuple).flatMapValues(lambda x: x)\
        .map(lambda x: ((x[0], x[1]), 1)).reduceByKey(add)
    return rdd_ngrams
def bytes_ngram(df_bytes, n):
    """
    Generates byte n-grams from the bytes data frame.
    Returns the byte n-grams in RDD((hash, n-gram), total_counts)
    """
    ngrams = NGram(n=n, inputCol="bytes", outputCol="ngrams")
    df_ngrams = ngrams.transform(df_bytes)
    rdd_ngrams = df_ngrams.select("hash", "ngrams").rdd.map(tuple).flatMapValues(lambda x: x)\
        .map(lambda x: ((x[0], x[1]), 1)).reduceByKey(add)
    return rdd_ngrams
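# Minimal sketch of calling bytes_ngram, assuming a SparkSession `spark` and
# `add` imported from operator. Toy data only.
bytes_df = spark.createDataFrame([("hashA", ["8A", "00", "CC"])], ["hash", "bytes"])
# bytes_ngram(bytes_df, 2).collect() -> [(('hashA', '8A 00'), 1), (('hashA', '00 CC'), 1)]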
def make_ngrams(df, n=1):
    df = df.withColumn('normalized_text', processing(F.col('text')))
    tokenizer = Tokenizer(inputCol="normalized_text", outputCol="tokens")
    tokenized = tokenizer.transform(df).drop('normalized_text')
    ngram = NGram(n=n, inputCol="tokens", outputCol="n_gram")
    n_gram_df = ngram.transform(tokenized)
    n_gram_df = n_gram_df.withColumn('n_gram', F.explode('n_gram'))
    n_gram_df = n_gram_df.filter(F.length('n_gram') > 2)
    return n_gram_df
def ngram(dataframe, in_col, out_col, n):
    ngram = NGram(n=n, inputCol=in_col, outputCol=out_col)
    dataframe = ngram.transform(dataframe)
    # summarise top n-grams
    dataframe\
        .groupBy(out_col)\
        .count()\
        .sort(col("count").desc())\
        .show()
    return dataframe
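# Minimal usage sketch for the ngram helper above, assuming a SparkSession `spark`.
# Note the summary groups on the whole output column, so each row's full n-gram
# list is one group.
docs_df = spark.createDataFrame([(["big", "data", "big", "deal"],)], ["tokens"])
# result = ngram(docs_df, "tokens", "bigrams", 2)
# result.first()["bigrams"] -> ['big data', 'data big', 'big deal']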
def learn(self, text_df):
    """Spark transformation to learn the adjacent terms of a given ngram"""
    ngram = NGram(n=self.n, inputCol='tokenized_text', outputCol='ngram')
    ngram_df = ngram.transform(text_df)
    # create the ngram to adjacent term mappings
    ngram_list = ngram_df.select("ngram").rdd.map(lambda r: r['ngram']).collect()
    self.ngram_model = ngram_df.rdd \
        .map(lambda x: PreProcess.generate_adjacent_terms(x.asDict()['ngram'])) \
        .flatMap(lambda xs: [x for x in xs]) \
        .map(lambda y: (y[0], [y[1]])) \
        .reduceByKey(lambda a, b: a + b).collect()
    # create list of the keys in the model and store them
    # (ngram_model is a plain list after collect(), so use a comprehension, not rdd.map)
    self.model_keys = [x[0] for x in self.ngram_model]
def build_pipeline():
    tokenizer = [Tokenizer(inputCol='tweet', outputCol='words')]
    ngrams = [
        NGram(n=i, inputCol='words', outputCol='{0}_grams'.format(i))
        for i in range(1, 4)
    ]
    cv = [
        CountVectorizer(vocabSize=5460,
                        inputCol='{0}_grams'.format(i),
                        outputCol='{0}_tf'.format(i)) for i in range(1, 4)
    ]
    idf = [
        IDF(inputCol='{0}_tf'.format(i),
            outputCol='{0}_tfidf'.format(i),
            minDocFreq=5) for i in range(1, 4)
    ]
    assembler = [
        VectorAssembler(inputCols=['{0}_tfidf'.format(i) for i in range(1, 4)],
                        outputCol='features')
    ]
    label_stringIdx = [StringIndexer(inputCol='sentiment', outputCol='label')]
    lr = [LogisticRegression(maxIter=100)]
    pipeline = Pipeline(stages=tokenizer + ngrams + cv + idf + assembler +
                        label_stringIdx + lr)
    return pipeline
def create_tfidf_model(sentenceDataFrame, ngrams=1, minDocFreq=0):
    tokenized = Tokenizer(inputCol="text", outputCol="words").transform(sentenceDataFrame)
    ngramDataFrame = NGram(n=ngrams, inputCol="words", outputCol="ngrams").transform(tokenized)
    countVect = CountVectorizer(inputCol="ngrams", outputCol="rawFeatures")
    countVectModel = countVect.fit(ngramDataFrame)
    featurizedData = countVectModel.transform(ngramDataFrame)
    idf = IDF(minDocFreq=minDocFreq, inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    # select() returns a new frame, so keep the result rather than discarding it
    rescaledData = rescaledData.select("label", "features")
    normalizer = Normalizer(inputCol="features", outputCol='scores')
    X = normalizer.transform(rescaledData)
    return X
def LR_Model(train_dataframe, test_dataframe):
    '''
    Takes a train_dataframe and a test_dataframe, implements the pipeline of
    RegexTokenizer, NGram (n=3), HashingTF, IDF and LogisticRegression, and
    predicts the label based on the features of test_dataframe.

    The RegexTokenizer pattern is set to "\\W|\\b(00|CC)\\b" because it removes
    all non-words (extra spaces and punctuation); '??', '00' and 'CC' are removed
    because they are the most repeated words, and accuracy improves significantly.

    Args:
        train_dataframe: must contain the columns 'label' and 'text'.
        test_dataframe: must contain the column 'text'.

    Returns:
        DataFrame['prediction': double, given_order: bigint, label: string]
        if the data read initially is a small dataset, else
        DataFrame['prediction': double, given_order: bigint]
        if the data read initially is a big dataset.
    '''
    train_dataframe = train_dataframe.repartition(96)\
        .withColumn('label', train_dataframe['label'].cast(IntegerType()))
    # the \b word boundaries must be escaped, otherwise Python reads them as backspace chars
    regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words",
                                    pattern="\\W|\\b(00|CC)\\b")
    ngram = NGram(n=3, inputCol="words", outputCol="ngrams")
    hashingTF = HashingTF(inputCol="ngrams", outputCol="TF")
    idf = IDF(inputCol="TF", outputCol="features")
    lr = LogisticRegression(maxIter=20, regParam=0.001)
    pipeline = Pipeline(stages=[regexTokenizer, ngram, hashingTF, idf, lr])
    model = pipeline.fit(train_dataframe)
    predictions_df = model.transform(test_dataframe)
    return predictions_df.drop('ngrams', 'TF', 'text', 'words', 'features')
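# Hypothetical call sketch for LR_Model, assuming a SparkSession `spark`; real
# input frames would also carry the given_order column mentioned in the docstring.
train_df = spark.createDataFrame([("8A push mov", "1"), ("CC jmp jmp", "2")], ["text", "label"])
test_df = spark.createDataFrame([("push mov 8A",)], ["text"])
# predictions = LR_Model(train_df, test_df)
# predictions.select("prediction").show()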
def ngramFeatureExtractors(n, inputCol=["text", "target"]):
    tokenizer = [Tokenizer(inputCol="text", outputCol="words")]
    ngrams = [
        NGram(n=i, inputCol="words", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]
    count_vectorizer = [
        CountVectorizer(vocabSize=5460,
                        inputCol="{0}_grams".format(i),
                        outputCol="{0}_tf".format(i)) for i in range(1, n + 1)
    ]
    idf = [
        IDF(inputCol="{0}_tf".format(i),
            outputCol="{0}_tfidf".format(i),
            minDocFreq=5) for i in range(1, n + 1)
    ]
    assembler = [
        VectorAssembler(
            inputCols=["{0}_tfidf".format(i) for i in range(1, n + 1)],
            outputCol="features")
    ]
    label_stringIdx = [StringIndexer(inputCol="target", outputCol="label")]
    lr = [LogisticRegression(maxIter=100)]
    return Pipeline(stages=tokenizer + ngrams + count_vectorizer + idf +
                    assembler + label_stringIdx + lr)
def build_pipeline():
    tokenizer = [Tokenizer(inputCol='text', outputCol='words')]
    remover = [StopWordsRemover(inputCol="words", outputCol="stopped_words")]
    ngrams = [
        NGram(n=i, inputCol='stopped_words', outputCol='{0}_grams'.format(i))
        for i in range(1, 6)
    ]
    cv = [
        CountVectorizer(vocabSize=50000,
                        inputCol='{0}_grams'.format(i),
                        outputCol='{0}_tf'.format(i)) for i in range(1, 6)
    ]
    idf = [
        IDF(inputCol='{0}_tf'.format(i),
            outputCol='{0}_tfidf'.format(i),
            minDocFreq=5) for i in range(1, 6)
    ]
    tweetvect = [
        VectorAssembler(inputCols=["tweet_count"], outputCol="vec_tweet_count")
    ]
    ss = [
        StandardScaler(inputCol="vec_tweet_count", outputCol="ss_tweet_count")
    ]
    # input_cols is assumed to be defined at module level (the tf-idf columns
    # plus the scaled tweet count)
    assembler = [VectorAssembler(inputCols=input_cols, outputCol='features')]
    pipeline = Pipeline(stages=tokenizer + remover + ngrams + cv + idf +
                        tweetvect + ss + assembler)
    return pipeline
class SentimentalPipelineEngine(PipelineEngine):
    def __init__(self, cv):
        super(SentimentalPipelineEngine, self).__init__(cv)
        self.tokenizer_map = [TweetTokenizer()]
        self.ngram_map = [1]
        self.hashing_tf_map = [pow(2, 20)]
        self.clf_map = [0.1]
        self.stages = self._build_stages()
        self.pipeline = Pipeline(stages=self.stages)
        self.param_grid = self._build_param_grid()

    def _build_stages(self):
        self.bs_parser = BeautifulSoupParser(inputCol="review", outputCol="parsed")
        self.tokenizer = Tokenizzzer(inputCol=self.bs_parser.getOutputCol(), outputCol="words")
        self.stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered")
        self.porter = PorterStemmerTransformer(inputCol=self.stopwords_remover.getOutputCol(), outputCol="stemmed")
        self.ngram = NGram(inputCol=self.porter.getOutputCol(), outputCol="ngrams")
        self.hashing_tf = HashingTF(inputCol=self.ngram.getOutputCol(), outputCol="features")
        self.idf = IDF(inputCol="features", outputCol="idf_features")
        self.normalizer = Normalizer(inputCol="idf_features", outputCol="norm_features", p=1.0)
        self.clf = LogisticRegression(featuresCol='norm_features', regParam=0.1)
        # self.clf = MultilayerPerceptronClassifier(featuresCol="norm_features", maxIter=1000, layers=[self.hashing_tf.getNumFeatures(), 200, 100, 2])
        return [self.bs_parser, self.tokenizer, self.stopwords_remover, self.porter,
                self.ngram, self.hashing_tf, self.idf, self.normalizer, self.clf]

    def _build_param_grid(self):
        param_grid_builder = ParamGridBuilder()
        param_grid_builder.addGrid(self.tokenizer.tokenizer, self.tokenizer_map)
        param_grid_builder.addGrid(self.ngram.n, self.ngram_map)
        param_grid_builder.addGrid(self.hashing_tf.numFeatures, self.hashing_tf_map)
        param_grid_builder.addGrid(self.clf.regParam, self.clf_map)
        return param_grid_builder.build()
def convert_ngrams(df, column):
    # convert tokens to n-grams for n = 1..5
    n = 5
    for i in range(1, n + 1):
        ngram = NGram(n=i, inputCol=column, outputCol='{}_{}'.format(column, i))
        df = ngram.transform(df)
    # concatenate all n-gram columns back into the original column, then drop the temporaries
    return df.withColumn(
        column,
        concat(*['{}_{}'.format(column, i) for i in range(1, n + 1)])).drop(
            *['{}_{}'.format(column, i) for i in range(1, n + 1)])
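# Hypothetical call sketch for convert_ngrams, assuming a SparkSession `spark`;
# note concat() on array columns requires Spark 2.4+.
toks_df = spark.createDataFrame([(["a", "b", "c"],)], ["tokens"])
# wide_df = convert_ngrams(toks_df, "tokens")
# wide_df.first()["tokens"] now holds the 1- through 5-grams in one array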
def process_df(df):
    time_seq.append(['start process-df', time.time()])
    model = Pipeline(stages=[
        RegexTokenizer(pattern=" ", inputCol="instruments",
                       outputCol="instruments_tokenized", minTokenLength=1),
        NGram(n=1, inputCol="instruments_tokenized", outputCol="instruments_ngrams"),
        HashingTF(inputCol="instruments_ngrams", outputCol="instruments_vectors"),
        MinHashLSH(inputCol="instruments_vectors", outputCol="instruments_lsh", numHashTables=10)
    ]).fit(df)
    df_hashed = model.transform(df)
    df_matches = model.stages[-1].approxSimilarityJoin(df_hashed, df_hashed, 0.5, distCol="distance") \
        .filter("datasetA.filename != datasetB.filename AND datasetA.filename < datasetB.filename") \
        .select(f.col('datasetA.filename').alias('filename_A'),
                f.col('datasetB.filename').alias('filename_B'),
                f.col('distance'))
    time_seq.append(['process-df df_matches', time.time()])
    write_df_to_pgsql(df_matches, 'filepair_similarity_run3')
    time_seq.append(['write pgsql', time.time()])
    print('time_seq', time_seq)
def build_ngrams(n=3):
    tokenizer = [Tokenizer(inputCol="text", outputCol="tokens")]
    # note: the remover is built but not included in the pipeline stages, and
    # the n-grams read the raw 'tokens' column, so stop-word filtering is unused
    stopwordsRemover = [
        StopWordsRemover(inputCol='tokens', outputCol='tokens_filtered')
    ]
    ngrams = [
        NGram(n=i, inputCol="tokens", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]
    cv = [
        CountVectorizer(vocabSize=5460,
                        inputCol="{0}_grams".format(i),
                        outputCol="{0}_cv".format(i)) for i in range(1, n + 1)
    ]
    idf = [
        IDF(inputCol="{0}_cv".format(i),
            outputCol="{0}_idf".format(i),
            minDocFreq=5) for i in range(1, n + 1)
    ]
    assembler = [
        VectorAssembler(
            inputCols=["{0}_idf".format(i) for i in range(1, n + 1)],
            outputCol="features")
    ]
    stringIndexer = [StringIndexer(inputCol="class", outputCol="label")]
    lr = [LogisticRegression(maxIter=100)]
    return Pipeline(stages=tokenizer + ngrams + cv + idf + assembler +
                    stringIndexer + lr)
def preprocess(inputCol=["text", "label"], n=4):
    tokenizer = [Tokenizer(inputCol="text", outputCol="words")]
    remover = [StopWordsRemover(inputCol="words", outputCol="filtered")]
    ngrams = [
        NGram(n=i, inputCol="filtered", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]
    cv = [
        CountVectorizer(vocabSize=2**14,
                        inputCol="{0}_grams".format(i),
                        outputCol="{0}_tf".format(i)) for i in range(1, n + 1)
    ]
    idf = [
        IDF(inputCol="{0}_tf".format(i),
            outputCol="{0}_tfidf".format(i),
            minDocFreq=2) for i in range(1, n + 1)
    ]
    assembler = [
        VectorAssembler(
            inputCols=["{0}_tfidf".format(i) for i in range(1, n + 1)],
            outputCol="rawFeatures")
    ]
    label_stringIdx = [StringIndexer(inputCol="label", outputCol="labels")]
    selector = [
        ChiSqSelector(numTopFeatures=2**14,
                      featuresCol='rawFeatures',
                      outputCol="features")
    ]
    lr = [LogisticRegression(maxIter=1000)]
    return Pipeline(stages=tokenizer + remover + ngrams + cv + idf +
                    assembler + label_stringIdx + selector + lr)
def n_gram_fingerprint(df, input_cols, n_size=2):
    """
    Calculate the n-gram for a fingerprinted string
    :param df: Dataframe to be processed
    :param input_cols: Columns to be processed
    :param n_size: Size of the n-grams
    :return:
    """
    def remote_white_spaces_remove_sort_join(value, args):
        # remove white spaces
        value = [x.replace(" ", "") for x in value]
        # sort and remove duplicates
        value = sorted(set(value))
        # join the tokens back together
        value = "".join(value)
        return value

    input_cols = parse_columns(df, input_cols)
    for input_col in input_cols:
        ngram_col = name_col(input_col, NGRAM_COL)
        ngram_fingerprint_col = name_col(input_col, NGRAM_FINGERPRINT_COL)
        df = (df.cols.copy(input_col, name_col(input_col, NGRAM_COL))
              .cols.lower(ngram_col)
              .cols.remove_white_spaces(ngram_col)
              .cols.remove_special_chars(ngram_col)
              .cols.remove_accents(ngram_col)
              # to create n-grams we need an Array type column
              .cols.nest(input_cols=ngram_col, output_col=ngram_col, shape='array'))
        if Optimus.cache:
            df = df.cache()
        n_gram = NGram(n=n_size, inputCol=ngram_col, outputCol=ngram_fingerprint_col)
        df = n_gram.transform(df)
        df = df.cols.apply(ngram_fingerprint_col, remote_white_spaces_remove_sort_join, "string")
    return df
def test(opcodes, hashFiles, sc, sqlc, path, featureFitModel):
    asmFiles = hashFiles.map(lambda x: "gs://uga-dsp/project2/data/asm/" + x + ".asm")

    def fun(accum, x):
        return accum + ',' + x

    asmFileString = asmFiles.reduce(fun)
    rdd1 = sc.wholeTextFiles(asmFileString, 20)
    opcodesInDoc = rdd1.map(lambda x: x[1].split()) \
        .map(lambda x: [word for word in x if word in opcodes.value]) \
        .zipWithIndex() \
        .map(lambda x: (x[1], x[0]))
    ngramFrame = sqlc.createDataFrame(opcodesInDoc, ["docId", "opcodes"])
    twoGram = NGram(n=2, inputCol="opcodes", outputCol="2grams")
    ngramFrame = twoGram.transform(ngramFrame)
    threeGram = NGram(n=3, inputCol="opcodes", outputCol="3grams")
    ngramFrame = threeGram.transform(ngramFrame)
    fourGram = NGram(n=4, inputCol="opcodes", outputCol="4grams")
    ngramFrame = fourGram.transform(ngramFrame)

    def getSegment(x):
        templist = []
        for line in x:
            l = re.findall(r'\w+:?(?=:)', line)
            if l:
                templist.append(l[0])
        return templist

    segments = rdd1.zipWithIndex() \
        .map(lambda x: (x[1], x[0][1].splitlines())) \
        .map(lambda x: (x[0], getSegment(x[1]))) \
        .toDF(["docId", "segments"])
    featureFrame = ngramFrame.join(segments, "docId")
    featuresDF = featureFrame.rdd.map(
        lambda x: Row(did=x['docId'],
                      docFeatures=x['opcodes'] + x['2grams'] + x['3grams'] +
                                  x['4grams'] + x['segments'])).toDF()
    featuresCV = featureFitModel.transform(featuresDF)
    testData = featuresCV.drop('docFeatures')
    testData.persist(StorageLevel(True, True, False, False, 1))
    saveData(testData, path)
    testData.show()
def build_ngrams_part(inputCol="words", n=6):
    # builds the n-gram stages for orders 7..n; with the default n=6 the
    # pipeline is empty, so callers are expected to pass n >= 7
    ngrams = [
        NGram(n=i, inputCol="words", outputCol="ngrams_{0}".format(i))
        for i in range(7, n + 1)
    ]
    vectorizers = [
        CountVectorizer(inputCol="ngrams_{0}".format(i),
                        outputCol="ngramscounts_{0}".format(i))
        for i in range(7, n + 1)
    ]
    return Pipeline(stages=ngrams + vectorizers)
def get_ngrams(cases, region_path):
    if (debug):
        logging.info(region_path)
    for case_path in tqdm(cases):
        parsed = parse_file(case_path)
        text = get_case_text(parsed)
        date = get_decision_date(parsed).year
        state = parsed("case|court").attr('jurisdiction').strip()
        text = text.encode("ascii", "ignore")
        clean_word_list = alphanumeric.sub('', text).lower().split()
        text_df = spark.createDataFrame([Row(inputTokens=clean_word_list)])
        for n in range(1, 4):
            if n == 1:
                ngrams = clean_word_list
            else:
                ngram_prepared = NGram(n=n, inputCol="inputTokens", outputCol="nGrams")
                ngrams = ngram_prepared.transform(text_df).head().nGrams
            sc.parallelize(ngrams) \
                .map(lambda word: (word, 1)) \
                .reduceByKey(lambda v1, v2: v1 + v2) \
                .map(lambda word_tuple: write_to_file(
                    word_tuple, date, case_path, state, region_path, n=n)) \
                .collect()
def main(train_x, train_y, test_x, test_y=None, idf=False, ngram=1, base='gs', asm=False):
    # Load : DF[id, url, features, label?]
    # The DataFrames only have a labels column if labels are given.
    # We drop the text, since Naive Bayes doesn't use it and we already have all the tokens
    kind = 'asm' if asm else 'bytes'
    train = elizabeth.load(train_x, train_y, base=base, kind=kind).drop('text')
    test = elizabeth.load(test_x, test_y, base=base, kind=kind).drop('text')

    # convert the string labels to numeric indices
    # the handleInvalid param allows the label indexer to deal with labels that weren't seen during fitting
    label_indexer = StringIndexer(inputCol='label', outputCol='indexedLabel', handleInvalid="skip")
    label_indexer = label_indexer.fit(train)
    train = label_indexer.transform(train)
    # the test set won't always have labels
    if test_y is not None:
        test = label_indexer.transform(test)

    index_labeller = IndexToString(inputCol='prediction', outputCol='predictedClass',
                                   labels=label_indexer.labels)

    # Train the preprocessor and transform the data.
    prep = elizabeth.Preprocessor()
    prep.add(NGram(n=int(ngram)))
    prep.add(CountVectorizer())
    if idf:
        prep.add(IDF())
    train = prep.fit(train)
    test = prep.transform(test)

    # Naive Bayes : DF[id, url, text, features, label?, rawPrediction, probability, prediction]
    nb = NaiveBayes(labelCol='indexedLabel').fit(train)
    test = nb.transform(test)
    test = index_labeller.transform(test)  # DF[id, url, ... prediction, predictedClass]

    # If labels are given for the test set, print a score.
    if test_y:
        test = test.orderBy(test.id)
        test = test.withColumn('correct', (test.label == test.predictedClass).cast('double'))
        test = test.select(avg(test.correct))
        print(test.show())
    # If no labels are given for the test set, print predictions.
    else:
        test = test.orderBy(test.id).select(test.predictedClass)
        test = test.rdd.map(lambda row: int(row.predictedClass))
        test = test.toLocalIterator()
        print(*test, sep='\n')
def initialize():
    spark = SparkSession \
        .builder \
        .appName("search-flight-spark-ml-model") \
        .getOrCreate()
    sc = spark.sparkContext
    auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
    api = tweepy.API(auth)
    important_fields = ['id', 'text', 'user']
    schema = StructType([
        StructField('id', LongType(), False),
        StructField('text', StringType(), False),
        StructField('username', StringType(), False)
    ])
    tweetsDf = spark.createDataFrame(sc.emptyRDD(), schema)
    for tweet in tweepy.Cursor(api.search, q='barajas', rpp=100, lang='en').items(MAX_TWEETS):
        json_tweet = {k: tweet._json[k] for k in important_fields}
        json_tweet['text'] = json_tweet['text'].replace("'", "").replace("\"", "").replace("\n", "")
        tweetDf = spark.createDataFrame([
            (json_tweet['id'], json_tweet['text'], json_tweet['user']['name'])
        ], schema)
        tweetsDf = tweetsDf.union(tweetDf)
    tweets_df_splitted = tweetsDf.randomSplit([0.75, 0.25], MAX_TWEETS)
    training_set = tweets_df_splitted[0]
    test_set = tweets_df_splitted[1]
    username_indexed = StringIndexer(inputCol="username", outputCol="username_indexed")
    tokenizer = Tokenizer(inputCol="text", outputCol="token_raw")
    ngram = NGram(inputCol="token_raw", outputCol="ngram", n=2)
    hashing_tf = HashingTF(inputCol="ngram", outputCol="tf", numFeatures=20)
    idf = IDF(inputCol="tf", outputCol="idf", minDocFreq=2)
    lr = LogisticRegression(featuresCol="idf", labelCol="username_indexed")
    pipeline = Pipeline(stages=[username_indexed, tokenizer, ngram, hashing_tf, idf, lr])
    pipeline_model = pipeline.fit(training_set)
    pipeline_model.write().overwrite().save("tweet_traveling_partners_model")
    tweet_traveling_partners_prediction = pipeline_model.transform(test_set)
    selected = tweet_traveling_partners_prediction.select(
        "username", "text", "probability", "prediction")
    for row in selected.collect():
        print(row)
    spark.stop()
def transformData(df, parameter):
    '''
    Transform the dataframe based on the parameter
    Input:
        - parameter
    Output:
        - transformed dataframe
    '''
    ngram = NGram(n=parameter["n"], inputCol=parameter["inputCol"],
                  outputCol=parameter["outputCol"])
    temp = ''
    # look up the input column by name instead of assuming it is called 'inputTokens'
    if len(ngram.transform(df).head()[parameter["inputCol"]]) < ngram.getN():
        print('No element in ' + parameter["outputCol"])
    else:
        # show() returns None, so keep the transformed frame and display it separately
        temp = ngram.transform(df)
        temp.show()
    return temp
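# Hypothetical parameter dict for transformData, assuming a SparkSession `spark`.
tokens_df = spark.createDataFrame([(["to", "be", "or", "not"],)], ["tokens"])
params = {"n": 2, "inputCol": "tokens", "outputCol": "bigrams"}
# transformData(tokens_df, params)  # shows the frame with a new 'bigrams' column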
def _build_stages(self):
    self.bs_parser = BeautifulSoupParser(inputCol="review", outputCol="parsed")
    self.tokenizer = Tokenizzzer(inputCol=self.bs_parser.getOutputCol(), outputCol="words")
    self.stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    self.porter = PorterStemmerTransformer(inputCol=self.stopwords_remover.getOutputCol(), outputCol="stemmed")
    self.ngram = NGram(inputCol=self.porter.getOutputCol(), outputCol="ngrams")
    self.hashing_tf = HashingTF(inputCol=self.ngram.getOutputCol(), outputCol="features")
    self.idf = IDF(inputCol="features", outputCol="idf_features")
    self.normalizer = Normalizer(inputCol="idf_features", outputCol="norm_features", p=1.0)
    self.clf = LogisticRegression(featuresCol='norm_features', regParam=0.1)
    # self.clf = MultilayerPerceptronClassifier(featuresCol="norm_features", maxIter=1000, layers=[self.hashing_tf.getNumFeatures(), 200, 100, 2])
    return [self.bs_parser, self.tokenizer, self.stopwords_remover, self.porter,
            self.ngram, self.hashing_tf, self.idf, self.normalizer, self.clf]
print "Start preprocessing all data" t0 = time() def preProcess(doc): clean = doc.review.replace("<br /><br />"," ") tok = nltk.tokenize.wordpunct_tokenize(clean) tags = nltk.pos_tag(tok,tagset='universal') low = [word.lower() for word in tok] return low,zip(*tags)[1],doc.label schema = StructType([StructField('words',ArrayType(StringType()),True), StructField('tags',ArrayType(StringType()),True), StructField('label',DoubleType())]) dfPre=df.map(preProcess).toDF(schema).cache() trigram = NGram(n=3,inputCol="tags", outputCol="tagTrigrams") dfTriAux = trigram.transform(dfPre).cache() trigram.setInputCol("words") trigram.setOutputCol("wordTrigrams") dfTri = trigram.transform(dfTriAux).cache() dfTrain, dfValid = dfTri.randomSplit([0.8,0.2]) lists=dfTrain.map(lambda r : r.words).collect() dictUnigrams=list(set(itertools.chain(*lists))) dictionaryUni={} for i,word in enumerate(dictUnigrams): dictionaryUni[word]=i dict_broad = sc.broadcast(dictionaryUni)
from pyspark.ml.feature import NGram
import itertools
import nltk

print "Start preprocessing all data"
t0 = time()

def preProcess(doc):
    clean = doc[0].replace("<br /><br />", " ")
    tok = nltk.tokenize.wordpunct_tokenize(clean)
    low = [word.lower() for word in tok]
    return low, doc[1]

bigram = NGram(inputCol="words", outputCol="bigrams")
dfPre = df.map(preProcess).toDF(['words', 'label']).cache()
dfTrain, dfValid = bigram.transform(dfPre).randomSplit([0.8, 0.2])
dfTrain.cache()
dfValid.cache()
lists = dfTrain.map(lambda r: r.bigrams).collect()
dictBigrams = list(set(itertools.chain(*lists)))
dictionaryBigrams = {}
for i, word in enumerate(dictBigrams):
    dictionaryBigrams[word] = i
dict_broad = sc.broadcast(dictionaryBigrams)
revDict_broad = sc.broadcast(dictBigrams)
# COMMAND ----------

from pyspark.ml.feature import StopWordsRemover
englishStopWords = StopWordsRemover.loadDefaultStopWords("english")
stops = StopWordsRemover()\
    .setStopWords(englishStopWords)\
    .setInputCol("DescOut")
stops.transform(tokenized).show()

# COMMAND ----------

from pyspark.ml.feature import NGram
unigram = NGram().setInputCol("DescOut").setN(1)
bigram = NGram().setInputCol("DescOut").setN(2)
# in PySpark the truncate flag must be passed by keyword; show(False) would be
# read as the row count
unigram.transform(tokenized.select("DescOut")).show(truncate=False)
bigram.transform(tokenized.select("DescOut")).show(truncate=False)

# COMMAND ----------

from pyspark.ml.feature import CountVectorizer
cv = CountVectorizer()\
    .setInputCol("DescOut")\
    .setOutputCol("countVec")\
    .setVocabSize(500)\
    .setMinTF(1)\
    .setMinDF(2)
fittedCV = cv.fit(tokenized)
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
dfTrain, dfTest = df.randomSplit([0.8, 0.2])
print "Clean train and test set created"

from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer(inputCol='review', outputCol='words')
dfTrainTok = tokenizer.transform(dfTrain)
print "Tokens computed"

from pyspark.ml.feature import NGram
bigram = NGram(inputCol="words", outputCol="bigrams")
dfBigram = bigram.transform(dfTrainTok)
print "Bigrams computed"

import itertools
lists = dfBigram.map(lambda r: r.bigrams).collect()
dictBigrams = set(itertools.chain(*lists))
dictionaryBigrams = {}
for i, word in enumerate(dictBigrams):
    dictionaryBigrams[word] = i
print "Dictionary created"
get_ipython().magic(u'autoreload 2')
import transformers as tr

posTagger = tr.NLTKPosTagger(inputCol="words", outputCol="tags")
print "Compute tags"
t0 = time()
dfTags = posTagger.transform(df)
dfTags.show()
tt = time() - t0
print "Tags computed in {} second".format(round(tt, 3))

# In[7]:

from pyspark.ml.feature import NGram
trigram = NGram(n=3, inputCol="tags", outputCol="tagTrigrams")
t0 = time()
dfTriAux = trigram.transform(dfTags)
trigram.setInputCol("words")
trigram.setOutputCol("wordTrigrams")
dfTri = trigram.transform(dfTriAux)
dfTri.show()
tt = time() - t0
print "Trigrams created in {} second".format(round(tt, 3))

# In[8]:

dfTrain, dfTest = dfTri.randomSplit([0.8, 0.2])
from __future__ import print_function

from pyspark import SparkContext
from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import NGram
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="NGramExample")
    sqlContext = SQLContext(sc)

    # $example on$
    wordDataFrame = sqlContext.createDataFrame([
        (0, ["Hi", "I", "heard", "about", "Spark"]),
        (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
        (2, ["Logistic", "regression", "models", "are", "neat"]),
    ], ["label", "words"])
    ngram = NGram(inputCol="words", outputCol="ngrams")
    ngramDataFrame = ngram.transform(wordDataFrame)
    for ngrams_label in ngramDataFrame.select("ngrams", "label").take(3):
        print(ngrams_label)
    # $example off$

    sc.stop()
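# For reference: NGram defaults to n=2, so the loop above prints rows shaped like
# Row(ngrams=['Hi I', 'I heard', 'heard about', 'about Spark'], label=0)
# Row(ngrams=['I wish', 'wish Java', 'Java could', 'could use', 'use case', 'case classes'], label=1)
# Row(ngrams=['Logistic regression', 'regression models', 'models are', 'are neat'], label=2)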
tt = time() - t0
print
print "Dataframe created in {} second".format(round(tt, 3))

# In[314]:

from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer(inputCol='review', outputCol='words')
dfTok = tokenizer.transform(df)

# In[315]:

from pyspark.ml.feature import NGram
bigram = NGram(inputCol="words", outputCol="bigrams")
dfBigram = bigram.transform(dfTok)

# In[317]:

print "Start tokenizing, computing bigrams and splitting between test and train"
t0 = time()
dfTrain, dfTest = dfBigram.randomSplit([0.8, 0.2])
dfTrain.take(1)
dfTest.take(1)
tt = time() - t0
print "Done in {} second".format(round(tt, 3))

# In[318]:
    out_col = self.getOutputCol()
    in_col = dataset[self.getInputCol()]
    return dataset.withColumn(out_col, udf(f, t)(in_col))

posTagger = NLTKPosTagger(inputCol="words", outputCol="tagWords")
dfTagged = posTagger.transform(dfTrainTok)
#dfTagged.show()

#----------------------------------------------------------------------
#------------------------------Bigrams---------------------------------
#----------------------------------------------------------------------
from pyspark.ml.feature import NGram
bigram = NGram(inputCol="words", outputCol="bigrams")
dfBigram = bigram.transform(dfTrainTokNoSw)
print "Bigram DataFrame: "
dfBigram.show()

#**********************************************************************
#------------------------Feature selection-----------------------------
#**********************************************************************
# From here on we can choose between the encoding used by the professor (the
# word is present or not) and the apparently more informative tf-idf version.
# In practice tf-idf can be misleading, so I still build the unigram and bigram
# dictionaries so that the professor's sparse vectors can be computed.