Example #1
def train(allHex,labels,hashFiles,sc,sqlc,path):

    bytesFiles = hashFiles.map(lambda x: "gs://uga-dsp/project2/data/bytes/"+ x+".bytes")

    def fun(accum,x):
        return accum+','+x

    bytesFileString = bytesFiles.reduce(fun)
    rdd1= sc.wholeTextFiles(bytesFileString,20)

    bytesRdd = rdd1.map(lambda x: x[1].split()).map(lambda x: [word for word in x if word in allHex.value]).zipWithIndex().map(lambda x: (x[1],x[0]))

    ngramFrame = sqlc.createDataFrame(bytesRdd,["did","1grams"])

    twoGram = NGram(n=2, inputCol="1grams", outputCol="2grams")
    ngramFrame = twoGram.transform(ngramFrame)

    featuresDF = ngramFrame.rdd.map(lambda x: Row(did=x['did'],docFeatures=x['1grams']+x['2grams'])).toDF()

    cv = CountVectorizer(inputCol="docFeatures", outputCol="features",vocabSize=1000)

    featureFitModel = cv.fit(featuresDF)

    featuresCV = featureFitModel.transform(featuresDF)

    labelRdd = labels.zipWithIndex().map(lambda x: (x[1],x[0]))

    labelFrame = labelRdd.toDF(["did","label"])

    trainData = featuresCV.join(labelFrame,"did")
    trainData.persist(StorageLevel(True, True, False, False, 1))
    saveData(trainData,path)

    trainData.show()
    return featureFitModel
Example #2
def shringles(x, fileName):
    # tokenize and ngrams
    tokenizer = RegexTokenizer(inputCol="value",
                               outputCol="words",
                               pattern="\\W")
    ngrams = NGram(n=x, inputCol="words", outputCol="kshringles")
    # append this file's k-shingles to the module-level shringleList (defined elsewhere)
    shringleList.append(ngrams.transform(tokenizer.transform(read(fileName))))
Example #3
def test(allHex,hashFiles,sc,sqlc,path,featureFitModel):

    bytesFiles = hashFiles.map(lambda x: "gs://uga-dsp/project2/data/bytes/"+ x+".bytes")
    def fun(accum,x):

        return accum+','+x

    bytesFileString = bytesFiles.reduce(fun)
    rdd1= sc.wholeTextFiles(bytesFileString,20)

    bytesRdd = rdd1.map(lambda x: x[1].split()).map(lambda x: [str(int(word,16)) for word in x if word in allHex.value]).zipWithIndex().map(lambda x: (x[1],x[0]))
    Vec= bytesRdd.map(lambda x: (x[0],createVector(x[1])))
    sparseVec = Vec.map(lambda x: (x[0],SparseVector(256,numpy.nonzero(x[1])[0],x[1][x[1]>0])))

    ngramFrame = sqlc.createDataFrame(bytesRdd,["did","1grams"])

    twoGram = NGram(n=2, inputCol="1grams", outputCol="2grams")
    ngramFrame = twoGram.transform(ngramFrame)

    featuresDF = ngramFrame.rdd.map(lambda x: Row(did=x['did'],docFeatures=x['1grams']+x['2grams'])).toDF()

    featuresCV = featureFitModel.transform(featuresDF)

    testData = featuresCV.drop('docFeatures')
    testData.persist(StorageLevel(True, True, False, False, 1))
    saveData(testData,path)
    testData.show()
Example #4
def ngrram(dataframe, column, x):
    tokens = Tokenizer(inputCol=column, outputCol='tokens')
    nn = NGram(n=x, inputCol='tokens', outputCol='ngrams')
    b = tokens.transform(dataframe)
    a = nn.transform(b)
    final = a.select(['tokens', 'ngrams'])
    final.show(4)
    return final
Example #5
    def extract_featrues(self, train_rdd=None, test_rdd=None):
        """
        train_rdd: type rdd, the raw rdd of train data (text content, label)
        test_rdd: type rdd, the raw rdd of test data (text content, doc_id)
        return: type data frame, a data frame where each record contains the extracted features
        """
        print('****************************')
        print('Feature Extraction: TF-IDF\n')

        train_raw_df = train_rdd.map(lambda row:
                                     (self.convert(row[0]), row[1])).toDF(
                                         ['words', 'label'])
        test_raw_df = test_rdd.map(lambda row:
                                   (self.convert(row[0]), row[1])).toDF(
                                       ['words', 'doc_id'])

        ngram = NGram(n=2, inputCol="words", outputCol="ngrams")
        train_ngram_df = ngram.transform(train_raw_df).drop('words')
        test_ngram_df = ngram.transform(test_raw_df).drop('words')

        hashing_tf = HashingTF(inputCol='ngrams', outputCol='raw_features')
        train_raw_featured_data = hashing_tf.transform(train_ngram_df).drop(
            'ngrams')
        test_raw_featured_data = hashing_tf.transform(test_ngram_df).drop(
            'ngrams')

        idf = IDF(inputCol='raw_features', outputCol='features')
        idf_model = idf.fit(train_raw_featured_data)

        train_df = idf_model.transform(train_raw_featured_data).drop(
            'raw_features')
        test_df = idf_model.transform(test_raw_featured_data).drop(
            'raw_features')

        return (train_df, test_df)
def extract_collocations(records, num_collocations, collocation_window):
    """Extracts the most common collocations present in the records.

    Params:
    - records (pyspark.rdd.RDD): The tokenized and lemmatized records from the JSON file
    - num_collocations (int): The number of collocations to show
    - collocation_window (int): The text window within which to search for collocations.

    Returns:
    - best_collocations (list<tuple<str, int>>): The highest scored collocations present in the records, with their
                                                 frequency of occurrence in the dataset.
    """
    # @see: https://spark.apache.org/docs/2.2.0/ml-features.html#n-gram
    from pyspark.ml.feature import NGram

    data_frame = records.map(lambda record: Row(record[constants.VALUE])).toDF(
        ['words'])
    ngram_model = NGram(n=2, inputCol='words', outputCol='ngrams')
    ngram_data_frame = ngram_model.transform(data_frame)

    ngram_rdd = ngram_data_frame.select('ngrams').rdd
    ngram_rdd = ngram_rdd.flatMap(lambda row: row['ngrams'])\
        .map(lambda ngram: (ngram.encode('utf-8'), 1))\
        .reduceByKey(add)\
        .sortBy(lambda bigram_with_count: bigram_with_count[1], ascending=False)
    rdd_show(ngram_rdd)

    frequent_collocations = ngram_rdd.take(num_collocations)

    return frequent_collocations
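
For reference, a self-contained sketch of the same bigram-counting idea without the project helpers (constants, rdd_show); all names and sample tokens below are illustrative only.

from operator import add
from pyspark.sql import Row, SparkSession
from pyspark.ml.feature import NGram

spark = SparkSession.builder.getOrCreate()
records = spark.sparkContext.parallelize([["the", "quick", "brown", "fox"],
                                          ["the", "quick", "red", "fox"]])

# one row per record, each holding the token list
data_frame = records.map(lambda tokens: Row(words=tokens)).toDF()
bigram_df = NGram(n=2, inputCol='words', outputCol='ngrams').transform(data_frame)

# flatten, count and rank the bigrams
bigram_counts = (bigram_df.select('ngrams').rdd
                 .flatMap(lambda row: row['ngrams'])
                 .map(lambda ngram: (ngram, 1))
                 .reduceByKey(add)
                 .sortBy(lambda kv: kv[1], ascending=False))
print(bigram_counts.take(3))   # e.g. [('the quick', 2), ...]
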
Example #7
File: helper.py Project: saedr/sabayon-p1
def build_pipeline(classifier='rf', max_depth=7):
    """
	creates a pipeline of functionalities to be applied on the training set
	"""

    # Training: Tokenize, Removing stop words, calculating n-grams, calcuating frequencies
    tokenizer = RegexTokenizer(inputCol="text",
                               outputCol="words",
                               pattern='\w{8}|\s')
    remover = StopWordsRemover(inputCol='words',
                               outputCol='filtered',
                               stopWords=['??'])
    ngram_2 = NGram(n=2, inputCol='filtered', outputCol='ngrams')
    ngram_3 = NGram(n=3, inputCol='filtered', outputCol='ngrams')
    hashingTF = HashingTF(inputCol="ngrams", outputCol="features")
    word2vec = Word2Vec(inputCol='ngrams', outputCol='features')

    if classifier == 'rf':
        clf = RandomForestClassifier(maxDepth=max_depth)
        stages = [tokenizer, remover, ngram_2, hashingTF, clf]
    elif classifier == 'nb':
        clf = NaiveBayes(smoothing=1)
        stages = [tokenizer, remover, ngram_3, hashingTF, clf]
    elif classifier == 'lr':
        clf = LogisticRegression()
        stages = [tokenizer, remover, ngram_2, word2vec, clf]
    else:
        raise ValueError("classifier must be 'rf', 'nb', or 'lr'.")
    return stages
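
A minimal usage sketch, assuming a DataFrame train_df with a 'text' column and a numeric 'label' column (the names the stages above and the classifiers' defaults expect); train_df and test_df are hypothetical.

from pyspark.ml import Pipeline

# 'nb' selects the NGram(n=3) + HashingTF + NaiveBayes branch defined above
stages = build_pipeline(classifier='nb')
model = Pipeline(stages=stages).fit(train_df)   # train_df: columns 'text', 'label'
predictions = model.transform(test_df)          # test_df: column 'text'
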
def Ngram_feature(N, feature_rdd):
    '''
    Extract and count N-grams. Keep only the top 1000 n-gram features when n is 2 or more.

    Input:
        feature_rdd : [(<hash1>,<feature1>), (<hash1>,<feature2>), ..., (<hashN>,<featureK>)]

    Output:
        freq_ngram_count_rdd : [((<hash>,<ngram feature>),cnt), ...]
    '''
    feature_rdd = feature_rdd.groupByKey().map(lambda x: (x[0],list(x[1])))
    df = spark.createDataFrame(feature_rdd).toDF("file_names", "features")
    ngram = NGram(n=N, inputCol="features", outputCol="ngrams")
    ngramDataFrame = ngram.transform(df)
    ngram_rdd = ngramDataFrame.rdd.map(tuple).map(lambda x: (x[0],x[2])).flatMapValues(lambda x: x)
    ngram_count_rdd = ngram_rdd.map(lambda x: ((x),1)).reduceByKey(add)
    freq_ngram_count_rdd = ngram_count_rdd

    if not N == 1:
        #[(<ngram feature>,cnt), ...]
        topN_ngram_count_rdd = freq_ngram_count_rdd.map(lambda x: (x[0][1],x[1])).reduceByKey(add)
        #[((<ngram feature>,cnt),index), ...]
        topN_ngram_count_rdd = topN_ngram_count_rdd.sortBy(lambda x: x[1],ascending=False).zipWithIndex()
        length = topN_ngram_count_rdd.count()
        #top [(<ngram feature>,cntSum), ...]
        topN_ngram_count_rdd = topN_ngram_count_rdd.filter(lambda x: x[1]<1000).map(lambda x: x[0])
        #freq [(<ngram feature>,(<hash>,cnt)), ...]
        freq_ngram_count_rdd = freq_ngram_count_rdd.map(lambda x: (x[0][1],(x[0][0],x[1])))
        #[(<ngram feature>,(cntSum,(<hash>,cnt))), ...]
        freq_ngram_count_rdd = topN_ngram_count_rdd.join(freq_ngram_count_rdd).map(lambda x: ((x[1][1][0],x[0]),x[1][1][1]))
    
    return freq_ngram_count_rdd
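
A hedged usage sketch in the documented input shape; it assumes the global spark session and the operator.add import that the function body relies on, and the hashes/opcodes below are made up.

from operator import add   # Ngram_feature's reduceByKey calls expect this in scope

feature_rdd = spark.sparkContext.parallelize([
    ("hash1", "mov"), ("hash1", "push"), ("hash1", "call"),
    ("hash2", "mov"), ("hash2", "mov"),
])

# bigram counts per file: [((<hash>, <2-gram>), cnt), ...]
bigram_counts = Ngram_feature(2, feature_rdd)
print(bigram_counts.take(5))
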
Example #9
def feature_engineering(class_balancedDf):
    # N-Gram
    ngram = NGram(n=2, inputCol="lemmatized", outputCol="ngrams")
    ngramDataFrame = ngram.transform(class_balancedDf)

    # Hashing TF
    hashingTF = HashingTF(inputCol="ngrams",
                          outputCol="rawFeatures",
                          numFeatures=20)
    featurizedData = hashingTF.transform(ngramDataFrame)

    # IDF
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    # K-Means
    kmeans = KMeans().setK(6).setSeed(1)
    kmodel = kmeans.fit(rescaledData).transform(rescaledData)

    #LDA
    lda = LDA(k=10, maxIter=10)
    ldamodel = lda.fit(kmodel).transform(kmodel)

    # changing label column to int
    data = ldamodel.withColumn(
        "label", ldamodel.label.cast("Integer")).drop("prediction")

    return data
Example #10
  def create_ngram(self, df, n, input_col, output_col='ngrams'):
    "Generate N-Gram -> https://spark.apache.org/docs/2.2.0/ml-features.html#n-gram"
    from pyspark.ml.feature import NGram

    ngram = NGram(n=n, inputCol=input_col, outputCol=output_col)

    ngram_df = ngram.transform(df)
    return ngram_df
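
A hypothetical call, assuming the method lives on some feature-engineering helper instantiated as fe and that spark is an active SparkSession.

tokens_df = spark.createDataFrame(
    [(["spark", "ml", "ngram", "demo"],)], ["tokens"])

# 'ngrams' will hold ["spark ml", "ml ngram", "ngram demo"]
bigrams_df = fe.create_ngram(tokens_df, n=2, input_col="tokens")
bigrams_df.show(truncate=False)
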
Example #11
 def test_ngram(self):
     dataset = self.spark.createDataFrame([
         Row(input=["a", "b", "c", "d", "e"])])
     ngram0 = NGram(n=4, inputCol="input", outputCol="output")
     self.assertEqual(ngram0.getN(), 4)
     self.assertEqual(ngram0.getInputCol(), "input")
     self.assertEqual(ngram0.getOutputCol(), "output")
     transformedDF = ngram0.transform(dataset)
     self.assertEqual(transformedDF.head().output, ["a b c d", "b c d e"])
Example #12
def calculate_vectors(data, n=2, binary=False):
    ngram = NGram(n=n, inputCol="sequence", outputCol="ngrams")
    ngramDataFrame = ngram.transform(data)
    ngrams = ngramDataFrame.select("ngrams")
    cvectorizer = CountVectorizer(
        inputCol="ngrams", outputCol="vec", binary=binary
    )
    model = cvectorizer.fit(ngrams)
    return model.transform(ngrams).select("vec")
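
A small usage sketch, assuming spark is a SparkSession; the 'sequence' column must already be an array of string tokens.

data = spark.createDataFrame([(["A", "T", "G", "C"],),
                              (["A", "T", "T", "A"],)], ["sequence"])

# binary=True yields presence/absence vectors instead of raw counts
vectors = calculate_vectors(data, n=2, binary=True)
vectors.show(truncate=False)
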
Example #13
def opcode_ngram(df_opcode, N):
    """
    Generates n-grams opcode by opcode data frame.
    Returns n-grams opcode in RDD((filename, n-gram), total_counts)
    """
    ngrams = NGram(n=N, inputCol="opcode", outputCol="ngrams")
    df_ngrams = ngrams.transform(df_opcode)
    rdd_ngrams = df_ngrams.select("filename", "ngrams").rdd.map(tuple).flatMapValues(lambda x: x)\
                    .map(lambda x: ((x[0], x[1]), 1)).reduceByKey(add)
    return rdd_ngrams
Example #14
def bytes_ngram(df_bytes, n):
    """
    Generates n-grams bytes by bytes data frame.
    Returns n-grams bytes in RDD((hash, n-gram), total_counts)
    """
    ngrams = NGram(n=n, inputCol="bytes", outputCol="ngrams")
    df_ngrams = ngrams.transform(df_bytes)
    rdd_ngrams = df_ngrams.select("hash", "ngrams").rdd.map(tuple).flatMapValues(lambda x: x)\
                    .map(lambda x: ((x[0], x[1]), 1)).reduceByKey(add)
    return rdd_ngrams
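
A toy call in the shape this function expects (one row per file, 'bytes' holding the tokenized byte sequence); spark and operator.add are assumed to be in scope as in the original module.

from operator import add   # used by bytes_ngram's reduceByKey

df_bytes = spark.createDataFrame(
    [("hash1", ["00", "FF", "00", "FF"])], ["hash", "bytes"])

rdd = bytes_ngram(df_bytes, 2)
print(rdd.collect())   # e.g. [(('hash1', '00 FF'), 2), (('hash1', 'FF 00'), 1)]
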
def make_ngrams (df, n=1):

    df = df.withColumn('normalized_text', processing(F.col('text')))
    tokenizer = Tokenizer(inputCol="normalized_text", outputCol="tokens")
    tokenized = tokenizer.transform(df).drop('normalized_text')
    
    ngram = NGram(n=n, inputCol="tokens", outputCol="n_gram")
    n_gram_df = ngram.transform(tokenized)
    n_gram_df = n_gram_df.withColumn('n_gram', F.explode('n_gram'))
    n_gram_df = n_gram_df.filter(F.length('n_gram')>2)
    
    return n_gram_df
def ngram(dataframe, in_col, out_col, n):
    
    ngram = NGram(n=n, inputCol=in_col, outputCol=out_col)
    dataframe = ngram.transform(dataframe)
    
    # summarise top n-grams
    dataframe\
    .groupBy(out_col)\
    .count()\
    .sort(col("count").desc())\
    .show()
    
    return dataframe
Example #17
    def learn(self, text_df):
        """Spark transformation to learn the adjacent terms of a given ngram"""

        ngram = NGram(n=self.n, inputCol='tokenized_text', outputCol='ngram')
        ngram_df = ngram.transform(text_df)
        # create the ngram to adjacent term mappings
        ngram_list = ngram_df.select("ngram").rdd.map(lambda r: r['ngram']).collect()
        self.ngram_model = ngram_df.rdd \
            .map(lambda x: PreProcess.generate_adjacent_terms(x.asDict()['ngram'])) \
            .flatMap(lambda xs: [x for x in xs]) \
            .map(lambda y: (y[0], [y[1]])) \
            .reduceByKey(lambda a, b: a + b).collect()

        # create list of the keys in the model and store them
        # (ngram_model was already collected into a Python list above, so use a comprehension)
        self.model_keys = [pair[0] for pair in self.ngram_model]
Example #18
def build_pipeline():
    tokenizer = [Tokenizer(inputCol='tweet', outputCol='words')]
    ngrams = [
        NGram(n=i, inputCol='words', outputCol='{0}_grams'.format(i))
        for i in range(1, 4)
    ]
    cv = [
        CountVectorizer(vocabSize=5460,
                        inputCol='{0}_grams'.format(i),
                        outputCol='{0}_tf'.format(i)) for i in range(1, 4)
    ]
    idf = [
        IDF(inputCol='{0}_tf'.format(i),
            outputCol='{0}_tfidf'.format(i),
            minDocFreq=5) for i in range(1, 4)
    ]
    assembler = [
        VectorAssembler(inputCols=['{0}_tfidf'.format(i) for i in range(1, 4)],
                        outputCol='features')
    ]
    label_stringIdx = [StringIndexer(inputCol='sentiment', outputCol='label')]
    lr = [LogisticRegression(maxIter=100)]
    pipeline = Pipeline(stages=tokenizer + ngrams + cv + idf + assembler +
                        label_stringIdx + lr)
    return pipeline
Example #19
def create_tfidf_model(sentenceDataFrame, ngrams=1, minDocFreq=0):

    tokenized = Tokenizer(inputCol="text",
                          outputCol="words").transform(sentenceDataFrame)

    ngramDataFrame = NGram(n=ngrams, inputCol="words",
                           outputCol="ngrams").transform(tokenized)

    countVect = CountVectorizer(inputCol="ngrams", outputCol="rawFeatures")

    countVectModel = countVect.fit(ngramDataFrame)

    featurizedData = countVectModel.transform(ngramDataFrame)

    idf = IDF(minDocFreq=minDocFreq,
              inputCol="rawFeatures",
              outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    rescaledData.select("label", "features")

    normalizer = Normalizer(inputCol="features", outputCol='scores')
    X = normalizer.transform(rescaledData)

    return X
Example #20
def LR_Model(train_dataframe, test_dataframe):
    '''
    Takes a train_dataframe and a test_dataframe, builds a pipeline of
    RegexTokenizer, NGram (n=3), HashingTF, IDF and LogisticRegression,
    and predicts the label based on the features of test_dataframe.

    The RegexTokenizer pattern is set to "\\W|\\b(00|CC)\\b" because it removes
    all non-words (extra spaces and punctuation); '??', '00' and 'CC' are
    removed as well, since these are the most repeated words and removing
    them significantly improves accuracy.
    Args:
        dataframe:
            -The train_dataframe should consist of the columns 'label'
            and 'text'.
            -The test_dataframe should consist of the column 'text'.
    Returns:
        DataFrame['prediction': double, given_order: bigint, label: string]
        if the data read initially is a small dataset,
        else DataFrame['prediction': double, given_order: bigint]
        if the data read initially is a big dataset.
    '''
    train_dataframe = train_dataframe.repartition(96)\
        .withColumn('label', train_dataframe['label'].cast(IntegerType()))
    regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words",
                                    pattern="\\W|\\b(00|CC)\\b")
    ngram = NGram(n=3, inputCol="words", outputCol="ngrams")
    hashingTF = HashingTF(inputCol="ngrams", outputCol="TF")
    idf = IDF(inputCol="TF", outputCol="features")
    lr = LogisticRegression(maxIter=20, regParam=0.001)
    pipeline = Pipeline(stages=[regexTokenizer, ngram, hashingTF, idf, lr])
    model = pipeline.fit(train_dataframe)
    predictions_df = model.transform(test_dataframe)
    return predictions_df\
        .drop('rawPrediction', 'probability', 'ngrams', 'TF', 'text', 'words', 'features')
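
A hedged end-to-end call with tiny made-up DataFrames carrying the columns the docstring requires; spark is assumed to be an active SparkSession and IntegerType to be imported as in the original module.

train_dataframe = spark.createDataFrame(
    [("0", "00 00 CC AB 3F 00 00 12"),
     ("1", "FF EE 00 00 CC 7A 00 00")],
    ["label", "text"])
test_dataframe = spark.createDataFrame(
    [("AB 3F 00 12 00 00 CC FF",)], ["text"])

predictions = LR_Model(train_dataframe, test_dataframe)
predictions.show()
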
Example #21
def ngramFeatureExtractors(n, inputCol=["text", "target"]):
    tokenizer = [Tokenizer(inputCol="text", outputCol="words")]
    ngrams = [
        NGram(n=i, inputCol="words", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]

    count_vectorizer = [
        CountVectorizer(vocabSize=5460,
                        inputCol="{0}_grams".format(i),
                        outputCol="{0}_tf".format(i)) for i in range(1, n + 1)
    ]
    idf = [
        IDF(inputCol="{0}_tf".format(i),
            outputCol="{0}_tfidf".format(i),
            minDocFreq=5) for i in range(1, n + 1)
    ]

    assembler = [
        VectorAssembler(
            inputCols=["{0}_tfidf".format(i) for i in range(1, n + 1)],
            outputCol="features")
    ]
    label_stringIdx = [StringIndexer(inputCol="target", outputCol="label")]
    lr = [LogisticRegression(maxIter=100)]
    return Pipeline(stages=tokenizer + ngrams + count_vectorizer + idf +
                    assembler + label_stringIdx + lr)
Example #22
def build_pipeline():
    tokenizer = [Tokenizer(inputCol='text', outputCol='words')]
    remover = [StopWordsRemover(inputCol="words", outputCol="stopped_words")]
    ngrams = [
        NGram(n=i, inputCol='stopped_words', outputCol='{0}_grams'.format(i))
        for i in range(1, 6)
    ]
    cv = [
        CountVectorizer(vocabSize=50000,
                        inputCol='{0}_grams'.format(i),
                        outputCol='{0}_tf'.format(i)) for i in range(1, 6)
    ]
    idf = [
        IDF(inputCol='{0}_tf'.format(i),
            outputCol='{0}_tfidf'.format(i),
            minDocFreq=5) for i in range(1, 6)
    ]
    tweetvect = [
        VectorAssembler(inputCols=["tweet_count"], outputCol="vec_tweet_count")
    ]
    ss = [
        StandardScaler(inputCol="vec_tweet_count", outputCol="ss_tweet_count")
    ]
    assembler = [VectorAssembler(inputCols=input_cols, outputCol='features')]
    pipeline = Pipeline(stages=tokenizer + remover + ngrams + cv + idf +
                        tweetvect + ss + assembler)
    return pipeline
Example #23
class SentimentalPipelineEngine(PipelineEngine):
    def __init__(self, cv):
        super(SentimentalPipelineEngine, self).__init__(cv)
        self.tokenizer_map = [TweetTokenizer()]
        self.ngram_map = [1]
        self.hashing_tf_map = [pow(2, 20)]
        self.clf_map = [0.1]
        self.stages = self._build_stages()
        self.pipeline = Pipeline(stages=self.stages)
        self.param_grid = self._build_param_grid()

    def _build_stages(self):
        self.bs_parser = BeautifulSoupParser(inputCol="review", outputCol="parsed")
        self.tokenizer = Tokenizzzer(inputCol=self.bs_parser.getOutputCol(), outputCol="words")
        self.stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered")
        self.porter = PorterStemmerTransformer(inputCol=self.stopwords_remover.getOutputCol(), outputCol="stemmed")
        self.ngram = NGram(inputCol=self.porter.getOutputCol(), outputCol="ngrams")
        self.hashing_tf = HashingTF(inputCol=self.ngram.getOutputCol(), outputCol="features")
        self.idf = IDF(inputCol="features", outputCol="idf_features")
        self.normalizer = Normalizer(inputCol="idf_features", outputCol="norm_features", p=1.0)
        self.clf = LogisticRegression(featuresCol='norm_features', regParam=0.1)
        # self.clf = MultilayerPerceptronClassifier(featuresCol="norm_features", maxIter=1000, layers=[self.hashing_tf.getNumFeatures(), 200, 100, 2])
        return [self.bs_parser, self.tokenizer, self.stopwords_remover, self.porter, self.ngram, self.hashing_tf, self.idf, self.normalizer, self.clf]

    def _build_param_grid(self):
        param_grid_builder = ParamGridBuilder()
        param_grid_builder.addGrid(self.tokenizer.tokenizer, self.tokenizer_map)
        param_grid_builder.addGrid(self.ngram.n, self.ngram_map)
        param_grid_builder.addGrid(self.hashing_tf.numFeatures, self.hashing_tf_map)
        param_grid_builder.addGrid(self.clf.regParam, self.clf_map)
        return param_grid_builder.build()
Example #24
def convert_ngrams(df, column):
    # convert the token column into concatenated 1- to n-grams

    n = 5

    # build the 1-gram through n-gram columns, then concatenate them below
    for i in range(1, n + 1):
        ngram = NGram(n=i,
                      inputCol=column,
                      outputCol='{}_{}'.format(column, i))
        df = ngram.transform(df)

    return df.withColumn(
        column,
        concat(*['{}_{}'.format(column, i) for i in range(1, n + 1)])).drop(
            *['{}_{}'.format(column, i) for i in range(1, n + 1)])
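
A hypothetical call, assuming spark, NGram and concat are in scope as in the original module and Spark 2.4+ (array concat); the input column must be an array of string tokens.

df = spark.createDataFrame([(["a", "b", "c"],)], ["tokens"])

# 'tokens' is replaced by the concatenation of its 1- through 5-gram arrays
convert_ngrams(df, "tokens").show(truncate=False)
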
Example #25
def process_df(df):
    time_seq.append(['start process-df', time.time()])
    model = Pipeline(stages=[
        RegexTokenizer(pattern=" ",
                       inputCol="instruments",
                       outputCol="instruments_tokenized",
                       minTokenLength=1),
        NGram(n=1,
              inputCol="instruments_tokenized",
              outputCol="instruments_ngrams"),
        HashingTF(inputCol="instruments_ngrams",
                  outputCol="instruments_vectors"),
        MinHashLSH(inputCol="instruments_vectors",
                   outputCol="instruments_lsh",
                   numHashTables=10)
    ]).fit(df)

    df_hashed = model.transform(df)
    df_matches = model.stages[-1].approxSimilarityJoin(df_hashed, df_hashed, 0.5, distCol="distance") \
        .filter("datasetA.filename != datasetB.filename AND datasetA.filename < datasetB.filename") \
        .select(f.col('datasetA.filename').alias('filename_A'),
                f.col('datasetB.filename').alias('filename_B'),
                f.col('distance'))
    time_seq.append(['process-df df_matches', time.time()])
    write_df_to_pgsql(df_matches, 'filepair_similarity_run3')
    time_seq.append(['write pgsql', time.time()])
    print('time_seq', time_seq)
Example #26
def build_ngrams(n=3):
    tokenizer = [Tokenizer(inputCol="text", outputCol="tokens")]
    stopwordsRemover = [
        StopWordsRemover(inputCol='tokens', outputCol='tokens_filtered')
    ]
    ngrams = [
        NGram(n=i, inputCol="tokens", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]
    cv = [
        CountVectorizer(vocabSize=5460,
                        inputCol="{0}_grams".format(i),
                        outputCol="{0}_cv".format(i)) for i in range(1, n + 1)
    ]
    idf = [
        IDF(inputCol="{0}_cv".format(i),
            outputCol="{0}_idf".format(i),
            minDocFreq=5) for i in range(1, n + 1)
    ]
    assembler = [
        VectorAssembler(
            inputCols=["{0}_idf".format(i) for i in range(1, n + 1)],
            outputCol="features")
    ]

    stringIndexer = [StringIndexer(inputCol="class", outputCol="label")]
    lr = [LogisticRegression(maxIter=100)]

    return Pipeline(stages=tokenizer + ngrams + cv + idf + assembler +
                    stringIndexer + lr)
Example #27
def preprocess(inputCol=["text", "label"], n=4):
    tokenizer = [Tokenizer(inputCol="text", outputCol="words")]
    remover = [StopWordsRemover(inputCol="words", outputCol="filtered")]
    ngrams = [
        NGram(n=i, inputCol="filtered", outputCol="{0}_grams".format(i))
        for i in range(1, n + 1)
    ]

    cv = [
        CountVectorizer(vocabSize=2**14,
                        inputCol="{0}_grams".format(i),
                        outputCol="{0}_tf".format(i)) for i in range(1, n + 1)
    ]
    idf = [
        IDF(inputCol="{0}_tf".format(i),
            outputCol="{0}_tfidf".format(i),
            minDocFreq=2) for i in range(1, n + 1)
    ]

    assembler = [
        VectorAssembler(
            inputCols=["{0}_tfidf".format(i) for i in range(1, n + 1)],
            outputCol="rawFeatures")
    ]
    label_stringIdx = [StringIndexer(inputCol="label", outputCol="labels")]
    selector = [
        ChiSqSelector(numTopFeatures=2**14,
                      featuresCol='rawFeatures',
                      outputCol="features")
    ]
    lr = [LogisticRegression(maxIter=1000)]
    return Pipeline(stages=tokenizer + remover + ngrams + cv + idf +
                    assembler + label_stringIdx + selector + lr)
Example #28
def n_gram_fingerprint(df, input_cols, n_size=2):
    """
    Calculate the ngram for a fingerprinted string
    :param df: Dataframe to be processed
    :param input_cols: Columns to be processed
    :param n_size:
    :return:
    """
    def remote_white_spaces_remove_sort_join(value, args):
        # remove white spaces
        value = [x.replace(" ", "") for x in value]

        # sort and remove duplicated
        value = sorted(set(value))

        # join the tokens back together
        value = "".join(value)

        return value

    input_cols = parse_columns(df, input_cols)

    for input_col in input_cols:
        ngram_col = name_col(input_col, NGRAM_COL)
        ngram_fingerprint_col = name_col(input_col, NGRAM_FINGERPRINT_COL)

        df = (
            df.cols.copy(input_col, name_col(
                input_col,
                NGRAM_COL)).cols.lower(ngram_col).cols.remove_white_spaces(
                    ngram_col).cols.remove_special_chars(
                        ngram_col).cols.remove_accents(ngram_col)
            # For create n-grams we need an Array type column
            .cols.nest(input_cols=ngram_col,
                       output_col=ngram_col,
                       shape='array'))
        if Optimus.cache:
            df = df.cache()

        n_gram = NGram(n=n_size,
                       inputCol=ngram_col,
                       outputCol=ngram_fingerprint_col)
        df = n_gram.transform(df)
        df = df.cols.apply(ngram_fingerprint_col,
                           remote_white_spaces_remove_sort_join, "string")

    return df
Example #29
def test(opcodes, hashFiles, sc, sqlc, path, featureFitModel):

    asmFiles = hashFiles.map(
        lambda x: "gs://uga-dsp/project2/data/asm/" + x + ".asm")

    def fun(accum, x):
        return accum + ',' + x

    asmFileString = asmFiles.reduce(fun)

    rdd1 = sc.wholeTextFiles(asmFileString, 20)

    opcodesInDoc = rdd1.map(lambda x: x[1].split()).map(
        lambda x: [word for word in x if word in opcodes.value]).zipWithIndex(
        ).map(lambda x: (x[1], x[0]))

    ngramFrame = sqlc.createDataFrame(opcodesInDoc, ["docId", "opcodes"])

    twoGram = NGram(n=2, inputCol="opcodes", outputCol="2grams")
    ngramFrame = twoGram.transform(ngramFrame)

    threeGram = NGram(n=3, inputCol="opcodes", outputCol="3grams")
    ngramFrame = threeGram.transform(ngramFrame)

    fourGram = NGram(n=4, inputCol="opcodes", outputCol="4grams")
    ngramFrame = fourGram.transform(ngramFrame)

    def getSegment(x):
        templist = []
        for line in x:
            l = re.findall(r'\w+:?(?=:)', line)
            if l:
                templist.append(l[0])
        return templist

    segments = rdd1.zipWithIndex().map(lambda x: (x[1], x[0][1].splitlines(
    ))).map(lambda x: (x[0], getSegment(x[1]))).toDF(["docId", "segments"])

    featureFrame = ngramFrame.join(segments, "docId")

    featuresDF = featureFrame.rdd.map(
        lambda x: Row(did=x['docId'],
                      docFeatures=x['opcodes'] + x['2grams'] + x['3grams'] + x[
                          '4grams'] + x['segments'])).toDF()

    featuresCV = featureFitModel.transform(featuresDF)

    testData = featuresCV.drop('docFeatures')
    testData.persist(StorageLevel(True, True, False, False, 1))
    saveData(testData, path)
    testData.show()
Example #30
def build_ngrams_part(inputCol="words", n=6):
    ngrams = [ 
        NGram(n=i, inputCol="words", outputCol="ngrams_{0}".format(i)) 
        for i in range(7, n + 1) ]
    vectorizers = [ 
        CountVectorizer(inputCol="ngrams_{0}".format(i), outputCol="ngramscounts_{0}".format(i)) 
        for i in range(7, n + 1) ]
    return Pipeline(stages=ngrams + vectorizers)
Example #31
def get_ngrams(cases, region_path):
    if (debug): logging.info(region_path)
    for case_path in tqdm(cases):
        parsed = parse_file(case_path)
        text = get_case_text(parsed)
        date = get_decision_date(parsed).year
        state = parsed("case|court").attr('jurisdiction').strip()
        text = text.encode("ascii", "ignore")
        clean_word_list = alphanumeric.sub('', text).lower().split()
        text_df = spark.createDataFrame([Row(inputTokens=clean_word_list)])
        for n in range(1,4):
            if n==1:
                ngrams = clean_word_list
            else:
                ngram_prepared = NGram(n=n, inputCol="inputTokens", outputCol="nGrams")
                ngrams = ngram_prepared.transform(text_df).head().nGrams
            (sc.parallelize(ngrams)
             .map(lambda word: (word, 1))
             .reduceByKey(lambda v1, v2: v1 + v2)
             .map(lambda word_tuple: write_to_file(word_tuple, date, case_path, state, region_path, n=n))
             .collect())
Example #32
def main(train_x,
         train_y,
         test_x,
         test_y=None,
         idf=False,
         ngram=1,
         base='gs',
         asm=False):
    # Load : DF[id, url, features, label?]
    # The DataFrames only have a labels column if labels are given.
    # We drop the text, since Naive Bayes doesn't use it and we already have all the tokens
    kind = 'asm' if asm else 'bytes'
    train = elizabeth.load(train_x, train_y, base=base, kind=kind).drop('text')
    test = elizabeth.load(test_x, test_y, base=base, kind=kind).drop('text')

    # convert the string labels to numeric indices
    # the handleInvalid param allows the label indexer to deal with labels that weren't seen during fitting
    label_indexer = StringIndexer(inputCol='label',
                                  outputCol='indexedLabel',
                                  handleInvalid="skip")
    label_indexer = label_indexer.fit(train)
    train = label_indexer.transform(train)
    # the test set won't always have labels
    if test_y is not None:
        test = label_indexer.transform(test)

    index_labeller = IndexToString(inputCol='prediction',
                                   outputCol='predictedClass',
                                   labels=label_indexer.labels)

    # Train the preprocessor and transform the data.
    prep = elizabeth.Preprocessor()
    prep.add(NGram(n=int(ngram)))
    prep.add(CountVectorizer())
    if idf: prep.add(IDF())
    train = prep.fit(train)
    test = prep.transform(test)

    # Naive Bayes : DF[id, url, text, features, label?, rawPrediction, probability, prediction]
    nb = NaiveBayes(labelCol='indexedLabel').fit(train)
    test = nb.transform(test)
    test = index_labeller.transform(
        test)  # DF[id, url, ... prediction, predictedClass]

    # If labels are given for the test set, print a score.
    if test_y:
        test = test.orderBy(test.id)
        test = test.withColumn(
            'correct', (test.label == test.predictedClass).cast('double'))
        test = test.select(avg(test.correct))
        print(test.show())

    # If no labels are given for the test set, print predictions.
    else:
        test = test.orderBy(test.id).select(test.predictedClass)
        test = test.rdd.map(lambda row: int(row.predictedClass))
        test = test.toLocalIterator()
        print(*test, sep='\n')
Example #33
def initialize():

    spark = SparkSession \
        .builder \
        .appName("search-flight-spark-ml-model") \
        .getOrCreate()
    sc = spark.sparkContext

    auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)
    api = tweepy.API(auth)
    important_fields = ['id', 'text', 'user']

    schema = StructType([
        StructField('id', LongType(), False),
        StructField('text', StringType(), False),
        StructField('username', StringType(), False)
    ])

    tweetsDf = spark.createDataFrame(sc.emptyRDD(), schema)

    for tweet in tweepy.Cursor(api.search, q='barajas', rpp=100,
                               lang='en').items(MAX_TWEETS):
        json_tweet = {k: tweet._json[k] for k in important_fields}
        json_tweet['text'] = json_tweet['text'].replace("'", "").replace(
            "\"", "").replace("\n", "")
        tweetDf = spark.createDataFrame([
            (json_tweet['id'], json_tweet['text'], json_tweet['user']['name'])
        ], schema)
        tweetsDf = tweetsDf.union(tweetDf)

    tweets_df_splitted = tweetsDf.randomSplit([0.75, 0.25], MAX_TWEETS)
    training_set = tweets_df_splitted[0]
    test_set = tweets_df_splitted[1]

    username_indexed = StringIndexer(inputCol="username",
                                     outputCol="username_indexed")
    tokenizer = Tokenizer(inputCol="text", outputCol="token_raw")
    ngram = NGram(inputCol="token_raw", outputCol="ngram", n=2)
    hashing_tf = HashingTF(inputCol="ngram", outputCol="tf", numFeatures=20)
    idf = IDF(inputCol="tf", outputCol="idf", minDocFreq=2)
    lr = LogisticRegression(featuresCol="idf", labelCol="username_indexed")
    pipeline = Pipeline(
        stages=[username_indexed, tokenizer, ngram, hashing_tf, idf, lr])

    pipeline_model = pipeline.fit(training_set)
    pipeline_model.write().overwrite().save("tweet_traveling_partners_model")

    tweet_traveling_partners_prediction = pipeline_model.transform(test_set)

    selected = tweet_traveling_partners_prediction.select(
        "username", "text", "probability", "prediction")
    for row in selected.collect():
        print(row)

    spark.stop()
Example #34
def transformData(df, parameter):
    '''
    Transforms the dataframe based on the parameter dict.
        Input  : - df, parameter (dict with keys "n", "inputCol", "outputCol")
        Output : - transformed dataframe
    '''

    ngram = NGram(n=parameter["n"],
                  inputCol=parameter["inputCol"],
                  outputCol=parameter["outputCol"])

    transformed = ngram.transform(df)

    if len(transformed.head()[parameter["inputCol"]]) < ngram.getN():
        print('No element in ' + parameter["outputCol"])
        return ''

    transformed.show()
    return transformed
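
A hypothetical call showing the parameter-dict shape the function reads ("n", "inputCol", "outputCol"); spark is assumed to be a SparkSession and the token column is made up.

df = spark.createDataFrame([(["to", "be", "or", "not"],)], ["inputTokens"])
parameter = {"n": 2, "inputCol": "inputTokens", "outputCol": "bigrams"}

result = transformData(df, parameter)
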
Example #35
 def _build_stages(self):
     self.bs_parser = BeautifulSoupParser(inputCol="review", outputCol="parsed")
     self.tokenizer = Tokenizzzer(inputCol=self.bs_parser.getOutputCol(), outputCol="words")
     self.stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered")
     self.porter = PorterStemmerTransformer(inputCol=self.stopwords_remover.getOutputCol(), outputCol="stemmed")
     self.ngram = NGram(inputCol=self.porter.getOutputCol(), outputCol="ngrams")
     self.hashing_tf = HashingTF(inputCol=self.ngram.getOutputCol(), outputCol="features")
     self.idf = IDF(inputCol="features", outputCol="idf_features")
     self.normalizer = Normalizer(inputCol="idf_features", outputCol="norm_features", p=1.0)
     self.clf = LogisticRegression(featuresCol='norm_features', regParam=0.1)
     # self.clf = MultilayerPerceptronClassifier(featuresCol="norm_features", maxIter=1000, layers=[self.hashing_tf.getNumFeatures(), 200, 100, 2])
     return [self.bs_parser, self.tokenizer, self.stopwords_remover, self.porter, self.ngram, self.hashing_tf, self.idf, self.normalizer, self.clf]
Example #36

print "Start preprocessing all data"
t0 = time()

def preProcess(doc):
    clean = doc.review.replace("<br /><br />"," ")
    tok = nltk.tokenize.wordpunct_tokenize(clean)
    tags = nltk.pos_tag(tok,tagset='universal')
    low = [word.lower() for word in tok]
    return low,zip(*tags)[1],doc.label

schema = StructType([StructField('words',ArrayType(StringType()),True), StructField('tags',ArrayType(StringType()),True), StructField('label',DoubleType())])

dfPre=df.map(preProcess).toDF(schema).cache()
trigram = NGram(n=3,inputCol="tags", outputCol="tagTrigrams")
dfTriAux = trigram.transform(dfPre).cache()
trigram.setInputCol("words")
trigram.setOutputCol("wordTrigrams")
dfTri = trigram.transform(dfTriAux).cache()

dfTrain, dfValid = dfTri.randomSplit([0.8,0.2])


lists=dfTrain.map(lambda r : r.words).collect()
dictUnigrams=list(set(itertools.chain(*lists)))
dictionaryUni={}
for i,word in enumerate(dictUnigrams):
	dictionaryUni[word]=i

dict_broad = sc.broadcast(dictionaryUni)
Example #37
from pyspark.ml.feature import NGram
import itertools
import nltk


print "Start preprocessing all data"
t0 = time()

def preProcess(doc):
    clean = doc[0].replace("<br /><br />"," ")
    tok = nltk.tokenize.wordpunct_tokenize(clean)
    low = [word.lower() for word in tok]
    return low,doc[1]

bigram = NGram(inputCol="words", outputCol="bigrams")

dfPre=df.map(preProcess).toDF(['words','label']).cache()
dfTrain, dfValid = bigram.transform(dfPre).randomSplit([0.8,0.2])
dfTrain.cache()
dfValid.cache()

lists=dfTrain.map(lambda r : r.bigrams).collect()
dictBigrams=list(set(itertools.chain(*lists)))
dictionaryBigrams={}
for i,word in enumerate(dictBigrams):
	dictionaryBigrams[word]=i
    
dict_broad=sc.broadcast(dictionaryBigrams)
revDict_broad=sc.broadcast(dictBigrams)
Example #38
from pyspark.ml.feature import NGram
import itertools
import nltk


print "Start preprocessing all data"
t0 = time()

def preProcess(doc):
    clean = doc[0].replace("<br /><br />"," ")
    tok = nltk.tokenize.wordpunct_tokenize(clean)
    low = [word.lower() for word in tok]
    return low,doc[1]

bigram = NGram(inputCol="words", outputCol="bigrams")

dfPre=df.map(preProcess).toDF(['words','label']).cache()
dfTrain, dfValid = bigram.transform(dfPre).randomSplit([0.8,0.2])
dfTrain.cache()
dfValid.cache()

lists=dfTrain.map(lambda r : r.bigrams).collect()
dictBigrams=list(set(itertools.chain(*lists)))
dictionaryBigrams={}
for i,word in enumerate(dictBigrams):
	dictionaryBigrams[word]=i
    
dict_broad=sc.broadcast(dictionaryBigrams)
revDict_broad=sc.broadcast(dictBigrams)

# COMMAND ----------

from pyspark.ml.feature import StopWordsRemover
englishStopWords = StopWordsRemover.loadDefaultStopWords("english")
stops = StopWordsRemover()\
  .setStopWords(englishStopWords)\
  .setInputCol("DescOut")
stops.transform(tokenized).show()


# COMMAND ----------

from pyspark.ml.feature import NGram
unigram = NGram().setInputCol("DescOut").setN(1)
bigram = NGram().setInputCol("DescOut").setN(2)
unigram.transform(tokenized.select("DescOut")).show(False)
bigram.transform(tokenized.select("DescOut")).show(False)


# COMMAND ----------

from pyspark.ml.feature import CountVectorizer
cv = CountVectorizer()\
  .setInputCol("DescOut")\
  .setOutputCol("countVec")\
  .setVocabSize(500)\
  .setMinTF(1)\
  .setMinDF(2)
fittedCV = cv.fit(tokenized)
Example #40
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
dfTrain, dfTest = df.randomSplit([0.8,0.2])

print "Clean train and test set created"


from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer(inputCol='review', outputCol='words')
dfTrainTok = tokenizer.transform(dfTrain)

print "Tokens computed"


from pyspark.ml.feature import NGram
bigram = NGram(inputCol="words", outputCol="bigrams")
dfBigram = bigram.transform(dfTrainTok)

print "Bigrams computed"


import itertools
lists=dfBigram.map(lambda r : r.bigrams).collect()
dictBigrams=set(itertools.chain(*lists))
dictionaryBigrams={}
for i,word in enumerate(dictBigrams):
	dictionaryBigrams[word]=i

print "Dictionary created"

Example #41
get_ipython().magic(u'autoreload 2')
import transformers as tr
posTagger = tr.NLTKPosTagger(
    inputCol="words", outputCol="tags")
print "Compute tags"
t0 = time()
dfTags = posTagger.transform(df)
dfTags.show()
tt = time() - t0
print "Tags computed in {} second".format(round(tt,3))


# In[7]:

from pyspark.ml.feature import NGram
trigram = NGram(n=3,inputCol="tags", outputCol="tagTrigrams")
t0 = time()
dfTriAux = trigram.transform(dfTags)
trigram.setInputCol("words")
trigram.setOutputCol("wordTrigrams")
dfTri = trigram.transform(dfTriAux)
dfTri.show()
tt = time() - t0
print "Trigrams created in {} second".format(round(tt,3))


# In[8]:

dfTrain, dfTest = dfTri.randomSplit([0.8,0.2])

Example #42
from __future__ import print_function

from pyspark import SparkContext
from pyspark.sql import SQLContext

# $example on$
from pyspark.ml.feature import NGram

# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="NGramExample")
    sqlContext = SQLContext(sc)

    # $example on$
    wordDataFrame = sqlContext.createDataFrame(
        [
            (0, ["Hi", "I", "heard", "about", "Spark"]),
            (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
            (2, ["Logistic", "regression", "models", "are", "neat"]),
        ],
        ["label", "words"],
    )
    ngram = NGram(inputCol="words", outputCol="ngrams")
    ngramDataFrame = ngram.transform(wordDataFrame)
    for ngrams_label in ngramDataFrame.select("ngrams", "label").take(3):
        print(ngrams_label)
    # $example off$

    sc.stop()
Example #43
tt = time() - t0
print
print "Dataframe created in {} second".format(round(tt,3))


# In[314]:

from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer(inputCol='review', outputCol='words')
dfTok = tokenizer.transform(df)


# In[315]:

from pyspark.ml.feature import NGram
bigram = NGram(inputCol="words", outputCol="bigrams")
dfBigram = bigram.transform(dfTok)


# In[317]:

print "Start tokenizing, computing bigrams and splitting between test and train"
t0 = time()
dfTrain, dfTest = dfBigram.randomSplit([0.8,0.2])
dfTrain.take(1)
dfTest.take(1)
tt = time() - t0
print "Done in {} second".format(round(tt,3))


# In[318]:
Example #44
        out_col = self.getOutputCol()
        in_col = dataset[self.getInputCol()]
        return dataset.withColumn(out_col, udf(f, t)(in_col))


posTagger = NLTKPosTagger(inputCol="words", outputCol="tagWords")
dfTagged = posTagger.transform(dfTrainTok)
#dfTagged.show()



#----------------------------------------------------------------------
#------------------------------Bigrams---------------------------------
#----------------------------------------------------------------------
from pyspark.ml.feature import NGram
bigram = NGram(inputCol="words", outputCol="bigrams")
dfBigram = bigram.transform(dfTrainTokNoSw)
print "DataFrame des Bigram: "
dfBigram.show()




#**********************************************************************
#------------------------Feature selection-----------------------------
#**********************************************************************

# For what follows we can choose between the encoding used by the professor (the word is present or not)
# or the apparently more informative TF-IDF version. In practice, TF-IDF can be misleading,
# so I still build the unigram and bigram dictionaries so that I can
# compute the professor's sparse vectors.