Example #1
    def extract_features(self, train_rdd=None, test_rdd=None):
        """
        train_rdd: type rdd, the raw rdd of train data (text content, label)
        test_rdd: type rdd, the raw rdd of test data (text content, doc_id)
        return: type tuple, a pair of data frames (train_df, test_df) where each record contains the extracted features
        """
        print('****************************')
        print('Feature Extraction: TF-IDF\n')

        train_raw_df = train_rdd.map(lambda row:
                                     (self.convert(row[0]), row[1])).toDF(
                                         ['words', 'label'])
        test_raw_df = test_rdd.map(lambda row:
                                   (self.convert(row[0]), row[1])).toDF(
                                       ['words', 'doc_id'])

        ngram = NGram(n=2, inputCol="words", outputCol="ngrams")
        train_ngram_df = ngram.transform(train_raw_df).drop('words')
        test_ngram_df = ngram.transform(test_raw_df).drop('words')

        hashing_tf = HashingTF(inputCol='ngrams', outputCol='raw_features')
        train_raw_featured_data = hashing_tf.transform(train_ngram_df).drop(
            'ngrams')
        test_raw_featured_data = hashing_tf.transform(test_ngram_df).drop(
            'ngrams')

        idf = IDF(inputCol='raw_features', outputCol='features')
        idf_model = idf.fit(train_raw_featured_data)

        train_df = idf_model.transform(train_raw_featured_data).drop(
            'raw_features')
        test_df = idf_model.transform(test_raw_featured_data).drop(
            'raw_features')

        return (train_df, test_df)
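# A minimal usage sketch (not part of the original example), assuming an active
# SparkSession named `spark`, the pyspark.ml.feature imports used above (NGram,
# HashingTF, IDF), and an instance `extractor` of the surrounding class whose
# convert() method turns raw text into a list of tokens.
train_rdd = spark.sparkContext.parallelize([
    ("spark makes big data simple", 1.0),
    ("pandas is great for small data", 0.0)])
test_rdd = spark.sparkContext.parallelize([
    ("simple big data with spark", "doc_1")])
train_df, test_df = extractor.extract_features(train_rdd=train_rdd,
                                               test_rdd=test_rdd)
train_df.show(truncate=False)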
Example #2
def generate_nlp_columns(input_dataset,target):
    udf_remove_punc = udf(lambda s: removePunctuation(s) )
    # Remove Punctuation
    input_dataset = input_dataset.withColumn(target,udf_remove_punc(target))
    # Tokenize Title
    tokenizer = Tokenizer(inputCol=target, outputCol=target+"_words")
    input_dataset = tokenizer.transform(input_dataset)
    # Remove Stop Words
    remover = StopWordsRemover(inputCol=target+"_words", outputCol=target+"_cleanwords")
    input_dataset = remover.transform(input_dataset)
    # Generate N-Grams 
    ngram = NGram(n=2, inputCol=target+"_cleanwords", outputCol=target+"_bigrams")
    input_dataset = ngram.transform(input_dataset)
    trigram = NGram(n=3, inputCol=target+"_cleanwords", outputCol=target+"_trigrams")
    input_dataset = trigram.transform(input_dataset)
    # Drop Extra Columns - Leave ngrams only.
    input_dataset = input_dataset.drop(target+"_words")
    input_dataset = input_dataset.drop(target+"_cleanwords")
    # Perform TFIDF
    #hashingTF = HashingTF(inputCol=target+"_trigrams", outputCol=target+"_hashing", numFeatures=20)
    #input_dataset = hashingTF.transform(input_dataset)
    #idf = IDF(inputCol=target+"_hashing", outputCol=target+"_features")
    #idfModel = idf.fit(input_dataset)
    #input_dataset = idfModel.transform(input_dataset) 
    return input_dataset
Example #3
def test(opcodes, hashFiles, sc, sqlc, path, featureFitModel):

    asmFiles = hashFiles.map(
        lambda x: "gs://uga-dsp/project2/data/asm/" + x + ".asm")

    def fun(accum, x):
        return accum + ',' + x

    asmFileString = asmFiles.reduce(fun)

    rdd1 = sc.wholeTextFiles(asmFileString, 20)

    opcodesInDoc = rdd1.map(lambda x: x[1].split()).map(
        lambda x: [word for word in x if word in opcodes.value]).zipWithIndex(
        ).map(lambda x: (x[1], x[0]))

    ngramFrame = sqlc.createDataFrame(opcodesInDoc, ["docId", "opcodes"])

    twoGram = NGram(n=2, inputCol="opcodes", outputCol="2grams")
    ngramFrame = twoGram.transform(ngramFrame)

    threeGram = NGram(n=3, inputCol="opcodes", outputCol="3grams")
    ngramFrame = threeGram.transform(ngramFrame)

    fourGram = NGram(n=4, inputCol="opcodes", outputCol="4grams")
    ngramFrame = fourGram.transform(ngramFrame)

    def getSegment(x):
        templist = []
        for line in x:
            l = re.findall(r'\w+:?(?=:)', line)
            if l:
                templist.append(l[0])
        return templist

    segments = rdd1.zipWithIndex().map(lambda x: (x[1], x[0][1].splitlines(
    ))).map(lambda x: (x[0], getSegment(x[1]))).toDF(["docId", "segments"])

    featureFrame = ngramFrame.join(segments, "docId")

    featuresDF = featureFrame.rdd.map(
        lambda x: Row(did=x['docId'],
                      docFeatures=x['opcodes'] + x['2grams'] + x['3grams'] + x[
                          '4grams'] + x['segments'])).toDF()

    featuresCV = featureFitModel.transform(featuresDF)

    testData = featuresCV.drop('docFeatures')
    testData.persist(StorageLevel(True, True, False, False, 1))
    saveData(testData, path)
    testData.show()
Example #4
def ngrram(dataframe, column, x):
    tokens = Tokenizer(inputCol=column, outputCol='tokens')
    nn = NGram(n=x, inputCol='tokens', outputCol='ngrams')
    b = tokens.transform(dataframe)
    a = nn.transform(b)
    # show() returns None, so select first and return the DataFrame itself
    final = a.select(['tokens', 'ngrams'])
    final.show(4)
    return final
def Ngram_feature(N, feature_rdd):
    '''
        Extract and count N-gram. Leave top 1000 n-gram features if it's 2-gram or more.
        
        Input:
        feature_rdd : [(<hash1>,<feature1>), (<hash1>,<feature2>), ..., (<hashN>,<featureK>)]
        
        Output:
        freq_ngram_count_rdd : [((<hash>,<ngram feature>),cnt), ...]
        '''
    feature_rdd = feature_rdd.groupByKey().map(lambda x: (x[0],list(x[1])))
    df = spark.createDataFrame(feature_rdd).toDF("file_names", "features")
    ngram = NGram(n=N, inputCol="features", outputCol="ngrams")
    ngramDataFrame = ngram.transform(df)
    ngram_rdd = ngramDataFrame.rdd.map(tuple).map(lambda x: (x[0],x[2])).flatMapValues(lambda x: x)
    ngram_count_rdd = ngram_rdd.map(lambda x: ((x),1)).reduceByKey(add)
    freq_ngram_count_rdd = ngram_count_rdd

    if not N == 1:
        #[(<ngram feature>,cnt), ...]
        topN_ngram_count_rdd = freq_ngram_count_rdd.map(lambda x: (x[0][1],x[1])).reduceByKey(add)
        #[((<ngram feature>,cnt),index), ...]
        topN_ngram_count_rdd = topN_ngram_count_rdd.sortBy(lambda x: x[1],ascending=False).zipWithIndex()
        length = topN_ngram_count_rdd.count()
        #top [(<ngram feature>,cntSum), ...]
        topN_ngram_count_rdd = topN_ngram_count_rdd.filter(lambda x: x[1]<1000).map(lambda x: x[0])
        #freq [(<ngram feature>,(<hash>,cnt)), ...]
        freq_ngram_count_rdd = freq_ngram_count_rdd.map(lambda x: (x[0][1],(x[0][0],x[1])))
        #[(<ngram feature>,(cntSum,(<hash>,cnt))), ...]
        freq_ngram_count_rdd = topN_ngram_count_rdd.join(freq_ngram_count_rdd).map(lambda x: ((x[1][1][0],x[0]),x[1][1][1]))
    
    return freq_ngram_count_rdd
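# A minimal usage sketch (not part of the original example), assuming an active
# SparkSession bound to the module-level name `spark` that Ngram_feature uses,
# and `from operator import add` already in scope.
feature_rdd = spark.sparkContext.parallelize([
    ("hash1", "mov"), ("hash1", "push"), ("hash1", "mov"),
    ("hash2", "add"), ("hash2", "mov")])
bigram_counts = Ngram_feature(2, feature_rdd)
print(bigram_counts.take(5))  # [((<hash>, <2-gram feature>), count), ...]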
def extract_collocations(records, num_collocations, collocation_window):
    """Extracts the most common collocations present in the records.

    Params:
    - records (pyspark.rdd.RDD): The tokenized and lemmatized records from the JSON file
    - num_collocations (int): The number of collocations to show
    - collocation_window (int): The text window within which to search for collocations.

    Returns:
    - best_collocations (list<tuple<str, int>>): The highest scored collocations present in the records, with their
                                                 frequency of occurrence in the dataset.
    """
    # @see: https://spark.apache.org/docs/2.2.0/ml-features.html#n-gram
    from pyspark.ml.feature import NGram

    data_frame = records.map(lambda record: Row(record[constants.VALUE])).toDF(
        ['words'])
    ngram_model = NGram(n=2, inputCol='words', outputCol='ngrams')
    ngram_data_frame = ngram_model.transform(data_frame)

    ngram_rdd = ngram_data_frame.select('ngrams').rdd
    ngram_rdd = ngram_rdd.flatMap(lambda row: row['ngrams'])\
        .map(lambda ngram: (ngram.encode('utf-8'), 1))\
        .reduceByKey(add)\
        .sortBy(lambda bigram_with_count: bigram_with_count[1], ascending=False)
    rdd_show(ngram_rdd)

    frequent_collocations = ngram_rdd.take(num_collocations)

    return frequent_collocations
Example #7
def test(allHex,hashFiles,sc,sqlc,path,featureFitModel):

    bytesFiles = hashFiles.map(lambda x: "gs://uga-dsp/project2/data/bytes/"+ x+".bytes")
    def fun(accum,x):

        return accum+','+x

    bytesFileString = bytesFiles.reduce(fun)
    rdd1= sc.wholeTextFiles(bytesFileString,20)

    # Keep the raw hex tokens (as in train) so that NGram receives an
    # array<string> column and the vocabulary matches the fitted
    # CountVectorizer model.
    bytesRdd = rdd1.map(lambda x: x[1].split()).map(lambda x: [word for word in x if word in allHex.value]).zipWithIndex().map(lambda x: (x[1],x[0]))

    ngramFrame = sqlc.createDataFrame(bytesRdd,["did","1grams"])

    twoGram = NGram(n=2, inputCol="1grams", outputCol="2grams")
    ngramFrame = twoGram.transform(ngramFrame)

    featuresDF = ngramFrame.rdd.map(lambda x: Row(did=x['did'],docFeatures=x['1grams']+x['2grams'])).toDF()

    featuresCV = featureFitModel.transform(featuresDF)

    testData = featuresCV.drop('docFeatures')
    testData.persist(StorageLevel(True, True, False, False, 1))
    saveData(testData,path)
    testData.show()
Example #8
def shringles(x, fileName):
    # tokenize and ngrams
    tokenizer = RegexTokenizer(inputCol="value",
                               outputCol="words",
                               pattern="\\W")
    ngrams = NGram(n=x, inputCol="words", outputCol="kshringles")
    shringleList.append(ngrams.transform(tokenizer.transform(read(fileName))))
Example #9
def train(allHex,labels,hashFiles,sc,sqlc,path):

    bytesFiles = hashFiles.map(lambda x: "gs://uga-dsp/project2/data/bytes/"+ x+".bytes")

    def fun(accum,x):
        return accum+','+x

    bytesFileString = bytesFiles.reduce(fun)
    rdd1= sc.wholeTextFiles(bytesFileString,20)

    bytesRdd = rdd1.map(lambda x: x[1].split()).map(lambda x: [word for word in x if word in allHex.value]).zipWithIndex().map(lambda x: (x[1],x[0]))

    ngramFrame = sqlc.createDataFrame(bytesRdd,["did","1grams"])

    twoGram = NGram(n=2, inputCol="1grams", outputCol="2grams")
    ngramFrame = twoGram.transform(ngramFrame)

    featuresDF = ngramFrame.rdd.map(lambda x: Row(did=x['did'],docFeatures=x['1grams']+x['2grams'])).toDF()

    cv = CountVectorizer(inputCol="docFeatures", outputCol="features",vocabSize=1000)

    featureFitModel = cv.fit(featuresDF)

    featuresCV = featureFitModel.transform(featuresDF)

    labelRdd = labels.zipWithIndex().map(lambda x: (x[1],x[0]))

    labelFrame = labelRdd.toDF(["did","label"])

    trainData = featuresCV.join(labelFrame,"did")
    trainData.persist(StorageLevel(True, True, False, False, 1))
    saveData(trainData,path)

    trainData.show()
    return featureFitModel
Example #10
def feature_engineering(class_balancedDf):
    # N-Gram
    ngram = NGram(n=2, inputCol="lemmatized", outputCol="ngrams")
    ngramDataFrame = ngram.transform(class_balancedDf)

    # Hashing TF
    hashingTF = HashingTF(inputCol="ngrams",
                          outputCol="rawFeatures",
                          numFeatures=20)
    featurizedData = hashingTF.transform(ngramDataFrame)

    # IDF
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    # K-Means
    kmeans = KMeans().setK(6).setSeed(1)
    kmodel = kmeans.fit(rescaledData).transform(rescaledData)

    #LDA
    lda = LDA(k=10, maxIter=10)
    ldamodel = lda.fit(kmodel).transform(kmodel)

    # changing label column to int
    data = ldamodel.withColumn(
        "label", ldamodel.label.cast("Integer")).drop("prediction")

    return data
Example #11
  def create_ngram(self, df, n, input_col, output_col='ngrams'):
    "Generate N-Gram -> https://spark.apache.org/docs/2.2.0/ml-features.html#n-gram"
    from pyspark.ml.feature import NGram

    ngram = NGram(n=n, inputCol=input_col, outputCol=output_col)

    ngram_df = ngram.transform(df)
    return ngram_df
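# A minimal usage sketch (not part of the original example): `nlp` stands in for
# an instance of the surrounding class, and `tokens_df` for any DataFrame with an
# array<string> column named "words".
bigrams_df = nlp.create_ngram(tokens_df, n=2, input_col="words")
bigrams_df.select("ngrams").show(truncate=False)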
Example #12
 def test_ngram(self):
     dataset = self.spark.createDataFrame([Row(input=["a", "b", "c", "d", "e"])])
     ngram0 = NGram(n=4, inputCol="input", outputCol="output")
     self.assertEqual(ngram0.getN(), 4)
     self.assertEqual(ngram0.getInputCol(), "input")
     self.assertEqual(ngram0.getOutputCol(), "output")
     transformedDF = ngram0.transform(dataset)
     self.assertEqual(transformedDF.head().output, ["a b c d", "b c d e"])
def main():
    # basic cleaning and getting of files
    get_moby()
    sentences = get_sentences()

    # create spark app, for use in iPython notebook OR as a standalone.
    spark = SparkSession\
        .builder\
        .appName("NGramSample")\
        .getOrCreate()

    # build a distributed dataframe
    sentence_df = spark.createDataFrame(sentences, ['id', 'sentences'])

    # create a tokenizer and write a 'words' column to DF
    tokenizer = Tokenizer(inputCol='sentences', outputCol='words')
    words = tokenizer.transform(sentence_df)

    # create ngram generators for bi, tri, and quad grams
    bigram = NGram(n=2, inputCol='words', outputCol='bigrams')
    trigram = NGram(n=3, inputCol='words', outputCol='trigrams')
    quadgram = NGram(n=4, inputCol='words', outputCol='quadgrams')

    # add each one in turn to the df
    bigrams = bigram.transform(words)
    trigrams = trigram.transform(bigrams)
    final = quadgram.transform(trigrams)

    # write as traversable JSON
    if os.path.exists('ngrams'):
        shutil.rmtree('ngrams')
    final.coalesce(1).write.json('ngrams')

    # as an example, write out bigrams to CSV
    if os.path.exists('bigrams'):
        shutil.rmtree('bigrams')

    # This tricky bit selects bigrams, explodes it, and regroups by unique
    # bigram, then adds a count, after filtering out extremely uncommon bigrams
    # It finally writes to a CSV
    final.select('bigrams')\
        .withColumn('bigrams', explode('bigrams'))\
        .groupBy('bigrams').count().orderBy('count', ascending=False)\
        .filter('count > 10')\
        .coalesce(1).write.csv('bigrams')
Example #14
 def test_ngram(self):
     dataset = self.spark.createDataFrame([
         Row(input=["a", "b", "c", "d", "e"])])
     ngram0 = NGram(n=4, inputCol="input", outputCol="output")
     self.assertEqual(ngram0.getN(), 4)
     self.assertEqual(ngram0.getInputCol(), "input")
     self.assertEqual(ngram0.getOutputCol(), "output")
     transformedDF = ngram0.transform(dataset)
     self.assertEqual(transformedDF.head().output, ["a b c d", "b c d e"])
Example #15
def calculate_vectors(data, n=2, binary=False):
    ngram = NGram(n=n, inputCol="sequence", outputCol="ngrams")
    ngramDataFrame = ngram.transform(data)
    ngrams = ngramDataFrame.select("ngrams")
    cvectorizer = CountVectorizer(
        inputCol="ngrams", outputCol="vec", binary=binary
    )
    model = cvectorizer.fit(ngrams)
    return model.transform(ngrams).select("vec")
Example #16
def opcode_ngram(df_opcode, N):
    """
    Generates n-grams opcode by opcode data frame.
    Returns n-grams opcode in RDD((filename, n-gram), total_counts)
    """
    ngrams = NGram(n=N, inputCol="opcode", outputCol="ngrams")
    df_ngrams = ngrams.transform(df_opcode)
    rdd_ngrams = df_ngrams.select("filename", "ngrams").rdd.map(tuple).flatMapValues(lambda x: x)\
                    .map(lambda x: ((x[0], x[1]), 1)).reduceByKey(add)
    return rdd_ngrams
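# A minimal usage sketch (not part of the original example), assuming an active
# SparkSession `spark` and `from operator import add` (used inside opcode_ngram).
df_opcode = spark.createDataFrame(
    [("fileA", ["mov", "push", "mov", "push"])], ["filename", "opcode"])
print(opcode_ngram(df_opcode, 2).collect())
# e.g. [(('fileA', 'mov push'), 2), (('fileA', 'push mov'), 1)]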
Example #17
def transformData(df, parameter):
    '''
    Transforms the dataframe with an NGram stage configured by the parameter
        Input : - df, parameter (dict with keys "n", "inputCol", "outputCol")
        Output : - transformed dataframe, or '' if the input has fewer tokens than n
    '''

    ngram = NGram(n=parameter["n"],
                  inputCol=parameter["inputCol"],
                  outputCol=parameter["outputCol"])

    temp = ''

    # Guard against inputs shorter than n: look up the input column by name
    # rather than assuming it is called "inputTokens".
    if len(df.head()[parameter["inputCol"]]) < ngram.getN():
        print('No element in ' + parameter["outputCol"])
    else:
        temp = ngram.transform(df)
        temp.show()

    return temp
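# A minimal usage sketch (not part of the original example), assuming an active
# SparkSession `spark` and `from pyspark.sql import Row`; the parameter dict
# mirrors the keys read above.
df = spark.createDataFrame([Row(inputTokens=["a", "b", "c", "d"])])
parameter = {"n": 2, "inputCol": "inputTokens", "outputCol": "bigrams"}
transformData(df, parameter)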
Example #18
def bytes_ngram(df_bytes, n):
    """
    Generates n-grams bytes by bytes data frame.
    Returns n-grams bytes in RDD((hash, n-gram), total_counts)
    """
    ngrams = NGram(n=n, inputCol="bytes", outputCol="ngrams")
    df_ngrams = ngrams.transform(df_bytes)
    rdd_ngrams = df_ngrams.select("hash", "ngrams").rdd.map(tuple).flatMapValues(lambda x: x)\
                    .map(lambda x: ((x[0], x[1]), 1)).reduceByKey(add)
    return rdd_ngrams
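# A minimal usage sketch (not part of the original example), assuming an active
# SparkSession `spark` and `from operator import add` (used inside bytes_ngram).
df_bytes = spark.createDataFrame(
    [("hash1", ["0A", "1F", "0A", "1F"])], ["hash", "bytes"])
print(bytes_ngram(df_bytes, 2).collect())
# e.g. [(('hash1', '0A 1F'), 2), (('hash1', '1F 0A'), 1)]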
def make_ngrams (df, n=1):

    df = df.withColumn('normalized_text', processing(F.col('text')))
    tokenizer = Tokenizer(inputCol="normalized_text", outputCol="tokens")
    tokenized = tokenizer.transform(df).drop('normalized_text')
    
    ngram = NGram(n=n, inputCol="tokens", outputCol="n_gram")
    n_gram_df = ngram.transform(tokenized)
    n_gram_df = n_gram_df.withColumn('n_gram', F.explode('n_gram'))
    n_gram_df = n_gram_df.filter(F.length('n_gram')>2)
    
    return n_gram_df
def ngram(dataframe, in_col, out_col, n):
    
    ngram = NGram(n=n, inputCol=in_col, outputCol=out_col)
    dataframe = ngram.transform(dataframe)
    
    # summarise top n-grams
    dataframe\
    .groupBy(out_col)\
    .count()\
    .sort(col("count").desc())\
    .show()
    
    return dataframe
Example #21
    def learn(self, text_df):
        """Spark transformation to learn the adjacent terms of a given ngram"""

        ngram = NGram(n=self.n, inputCol='tokenized_text', outputCol='ngram')
        ngram_df = ngram.transform(text_df)
        # create the ngram to adjacent term mappings
        ngram_list = ngram_df.select("ngram").rdd.map(lambda r: r['ngram']).collect()
        self.ngram_model = ngram_df.rdd \
            .map(lambda x: PreProcess.generate_adjacent_terms(x.asDict()['ngram'])) \
            .flatMap(lambda xs: [x for x in xs]) \
            .map(lambda y: (y[0], [y[1]])) \
            .reduceByKey(lambda a, b: a + b).collect()

        # create list of the keys in the model and store them
        # (ngram_model is already a collected Python list, so no RDD .map here)
        self.model_keys = [x[0] for x in self.ngram_model]
Example #22
def convert_ngrams(df, column):
    # convert tokens to ngram

    n = 5

    # generate 1-gram through n-gram columns
    for i in range(1, n + 1):
        ngram = NGram(n=i,
                      inputCol=column,
                      outputCol='{}_{}'.format(column, i))
        df = ngram.transform(df)

    return df.withColumn(
        column,
        concat(*['{}_{}'.format(column, i) for i in range(1, n + 1)])).drop(
            *['{}_{}'.format(column, i) for i in range(1, n + 1)])
Example #23
def n_gram_fingerprint(df, input_cols, n_size=2):
    """
    Calculate the ngram for a fingerprinted string
    :param df: Dataframe to be processed
    :param input_cols: Columns to be processed
    :param n_size: Size of the n-grams to generate
    :return:
    """
    def remote_white_spaces_remove_sort_join(value, args):
        # remove white spaces
        value = [x.replace(" ", "") for x in value]

        # sort and remove duplicated
        value = sorted(set(value))

        # join the tokens back together
        value = "".join(value)

        return value

    input_cols = parse_columns(df, input_cols)

    for input_col in input_cols:
        ngram_col = name_col(input_col, NGRAM_COL)
        ngram_fingerprint_col = name_col(input_col, NGRAM_FINGERPRINT_COL)

        df = (
            df.cols.copy(input_col, name_col(
                input_col,
                NGRAM_COL)).cols.lower(ngram_col).cols.remove_white_spaces(
                    ngram_col).cols.remove_special_chars(
                        ngram_col).cols.remove_accents(ngram_col)
            # For create n-grams we need an Array type column
            .cols.nest(input_cols=ngram_col,
                       output_col=ngram_col,
                       shape='array'))
        if Optimus.cache:
            df = df.cache()

        n_gram = NGram(n=n_size,
                       inputCol=ngram_col,
                       outputCol=ngram_fingerprint_col)
        df = n_gram.transform(df)
        df = df.cols.apply(ngram_fingerprint_col,
                           remote_white_spaces_remove_sort_join, "string")

    return df
Example #24
def get_ngrams(cases, region_path):
    if (debug): logging.info(region_path)
    for case_path in tqdm(cases):
        parsed = parse_file(case_path)
        text = get_case_text(parsed)
        date = get_decision_date(parsed).year
        state = parsed("case|court").attr('jurisdiction').strip()
        text = text.encode("ascii", "ignore")
        clean_word_list = alphanumeric.sub('', text).lower().split()
        text_df = spark.createDataFrame([Row(inputTokens=clean_word_list)])
        for n in range(1,4):
            if n==1:
                ngrams = clean_word_list
            else:
                ngram_prepared = NGram(n=n, inputCol="inputTokens", outputCol="nGrams")
                ngrams = ngram_prepared.transform(text_df).head().nGrams
            sc.parallelize(ngrams).map(lambda word: (word, 1)).reduceByKey(
                lambda v1, v2: v1 + v2).map(lambda word_tuple: write_to_file(
                    word_tuple, date, case_path, state, region_path, n=n)).collect()
Example #25
def n_gram_fingerprint(df, columns, n_size):
    """
    Calculate the ngram for a fingerprinted string
    :param df:
    :param columns:
    :param n_size:
    :return:
    """

    def remote_white_spaces_remove_sort_join(value, args):
        # remove white spaces
        value = [x.replace(" ", "") for x in value]

        # sort and remove duplicated
        value = sorted(set(value))

        # join the tokens back together
        value = "".join(value)

        return value

    columns = parse_columns(df, columns)
    for col_name in columns:
        output_col = col_name + "_NGRAM"
        n_gram_col = col_name + "_NGRAM_FINGERPRINT"

        df = (df
              .withColumn(output_col, F.col(col_name))
              .cols.lower(output_col)
              .cols.remove_white_spaces(output_col)
              .cols.remove_special_chars(output_col)
              .cols.remove_accents(output_col)
              # For create n-grams we need an Array type column
              .cols.nest(output_col, output_col, 'array')
              .repartition(1)  # Needed for optimization in a single machine
              .cache()
              )
        n_gram = NGram(n=n_size, inputCol=output_col, outputCol=n_gram_col)
        df = n_gram.transform(df)
        df = df.cols.apply(n_gram_col, remote_white_spaces_remove_sort_join, "string")

    return df
def get_Ngram(text):
    # Split the sentence into individual words
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    wordsData = tokenizer.transform(text)
    # Remove stop words
    # remover = StopWordsRemover() \
    #         .setStopWords(mystopwords) \
    #         .setCaseSensitive(False) \
    #         .setInputCol("words") \
    #         .setOutputCol("filtered")
    # remover.transform(wordsData).show(truncate = 15)

    # Build word combinations using N-grams
    ngram = NGram(n=3, inputCol="words", outputCol="ngrams")
    ngramDataFrame = ngram.transform(wordsData)
    result = ngramDataFrame.select("ngrams")
    result.show(truncate=False)
    # flatMap over the DataFrame rows (ngram itself is a transformer, not an RDD)
    ngr = result.rdd.flatMap(lambda row: row["ngrams"]).collect()
    for i in ngr:
        print(i)
    return result
Example #27
def functions_for_deal_with_texts(spark, resources_folder):
    send_df = spark.createDataFrame([
        (0, 'Hi I heard about Spark'),
        (1, 'I wish java could use case classes'),
        (2, 'Logistic,regression,models,are,neat'),
    ], ['id', 'sentence'])

    tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
    regularTokenizer = RegexTokenizer(
        inputCol='sentence',
        outputCol='words',
        pattern='\\W')
    count_token = udf(lambda words: len(words), IntegerType())
    tokenize = tokenizer.transform(send_df)
    tokenize.show()
    tokenize.withColumn('tokens', count_token(col('words'))).show()

    rg_tokenize = regularTokenizer.transform(send_df)
    rg_tokenize.show()
    rg_tokenize.withColumn('tokens', count_token(col('words'))).show()

    # remove common (stop) words
    sentenceData = spark.createDataFrame([
        (0, ["I", "saw", "the", "red", "balloon"]),
        (1, ["Mary", "had", "a", "little", "lamb"])
    ], ["id", "raw"])

    remover = StopWordsRemover(inputCol="raw", outputCol="filtered")
    remover.transform(sentenceData).show(truncate=False)

    wordDataFrame = spark.createDataFrame([
        (0, ["Hi", "I", "heard", "about", "Spark"]),
        (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
        (2, ["Logistic", "regression", "models", "are", "neat"])
    ], ["id", "words"])

    ngram = NGram(n=2, inputCol="words", outputCol="ngrams")

    ngramDataFrame = ngram.transform(wordDataFrame)
    ngramDataFrame.select("ngrams").show(truncate=False)
Example #28
def LimpiarTextoTweets(df, Busqueda):

    #spark = SparkSession.builder.master('spark://192.168.55.3:7077').appName('LimpiaDatos').getOrCreate()
    spark = SparkSession.builder.appName('LimpiaDatos').getOrCreate()

    sdf = spark.createDataFrame(df)

    stopword_unidecode = [
        unidecode.unidecode(word) for word in stopwords.words('spanish')
    ]
    numeros = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0']

    stopwordList = list(numeros + stopword_unidecode +
                        stopwords.words('spanish') +
                        ['rt', 'https', 'co', 'http', 't', 'q', 'l', 'c'] +
                        Busqueda.lower().split())

    #dataTweet = spark.createDataFrame([(0, unidecode.unidecode(Texto))],['id','sentence'])

    tokenizer = RegexTokenizer(inputCol='text',
                               outputCol='tokens',
                               pattern='\W+')
    tokenized = tokenizer.transform(sdf)
    #tokenized.show(truncate=False)

    remover = StopWordsRemover(inputCol='tokens',
                               outputCol='removed',
                               stopWords=stopwordList)
    removered = remover.transform(tokenized)
    #removered.show(truncate=False)

    ngram = NGram(n=2, inputCol='removed', outputCol='grams')
    ngramd = ngram.transform(removered)

    Tweets_Limpios = ngramd.toPandas()

    spark.stop()

    return Tweets_Limpios
Example #29
def get_ngrams(cases, region_path):
    if (debug): logging.info(region_path)
    for case_path in tqdm(cases):
        parsed = parse_file(case_path)
        text = get_case_text(parsed)
        date = get_decision_date(parsed).year
        state = parsed("case|court").attr('jurisdiction').strip()
        text = text.encode("ascii", "ignore")
        clean_word_list = alphanumeric.sub('', text).lower().split()
        text_df = spark.createDataFrame([Row(inputTokens=clean_word_list)])
        for n in range(1, 4):
            if n == 1:
                ngrams = clean_word_list
            else:
                ngram_prepared = NGram(n=n,
                                       inputCol="inputTokens",
                                       outputCol="nGrams")
                ngrams = ngram_prepared.transform(text_df).head().nGrams
            sc.parallelize(ngrams).map(lambda word: (word, 1)).reduceByKey(
                lambda v1, v2: v1 + v2).map(lambda word_tuple: write_to_file(
                    word_tuple, date, case_path, state, region_path, n=n)
                                            ).collect()
Example #30
File: ngram.py Project: hayj/SparkTools
def toNgramDF(df,
              nbGrams,
              inputColName,
              addNbGramsToOutputCol=False,
              removeInputCol=True,
              logger=None,
              verbose=True):
    """
        This function convert a dataframe to a ngramDF on the given inputColName
    """
    if addNbGramsToOutputCol:
        columnName = str(nbGrams) + "grams"
    else:
        columnName = "ngrams"
    ngram = NGram(n=nbGrams, inputCol=inputColName, outputCol=columnName)
    ngramDF = ngram.transform(df)
    # We drop the inputCol column:
    if removeInputCol:
        try:
            ngramDF = ngramDF.drop(inputColName)
        except Exception as e:
            logException(e, logger, verbose=verbose)
    return ngramDF
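# A minimal usage sketch (not part of the original file), assuming an active
# SparkSession `spark`.
df = spark.createDataFrame([(["to", "be", "or", "not", "to", "be"],)], ["tokens"])
trigram_df = toNgramDF(df, 3, "tokens", addNbGramsToOutputCol=True)
trigram_df.show(truncate=False)  # one "3grams" column; the input column is dropped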
Example #31
import nltk


print "Start preprocessing all data"
t0 = time()

def preProcess(doc):
    clean = doc[0].replace("<br /><br />"," ")
    tok = nltk.tokenize.wordpunct_tokenize(clean)
    low = [word.lower() for word in tok]
    return low,doc[1]

bigram = NGram(inputCol="words", outputCol="bigrams")

dfPre=df.map(preProcess).toDF(['words','label']).cache()
dfTrain, dfValid = bigram.transform(dfPre).randomSplit([0.8,0.2])
dfTrain.cache()
dfValid.cache()

lists=dfTrain.map(lambda r : r.bigrams).collect()
dictBigrams=list(set(itertools.chain(*lists)))
dictionaryBigrams={}
for i,word in enumerate(dictBigrams):
	dictionaryBigrams[word]=i
    
dict_broad=sc.broadcast(dictionaryBigrams)
revDict_broad=sc.broadcast(dictBigrams)

tt = time() - t0
print "Data preprocessed in {} second".format(round(tt,3))
Example #32
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
dfTrain, dfTest = df.randomSplit([0.8,0.2])

print "Clean train and test set created"


from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer(inputCol='review', outputCol='words')
dfTrainTok = tokenizer.transform(dfTrain)

print "Tokens computed"


from pyspark.ml.feature import NGram
bigram = NGram(inputCol="words", outputCol="bigrams")
dfBigram = bigram.transform(dfTrainTok)

print "Bigrams computed"


import itertools
lists=dfBigram.map(lambda r : r.bigrams).collect()
dictBigrams=set(itertools.chain(*lists))
dictionaryBigrams={}
for i,word in enumerate(dictBigrams):
	dictionaryBigrams[word]=i

print "Dictionary created"


dict_broad=sc.broadcast(dictionaryBigrams)
# COMMAND ----------

from pyspark.ml.feature import StopWordsRemover
englishStopWords = StopWordsRemover.loadDefaultStopWords("english")
stops = StopWordsRemover()\
  .setStopWords(englishStopWords)\
  .setInputCol("DescOut")
stops.transform(tokenized).show()


# COMMAND ----------

from pyspark.ml.feature import NGram
unigram = NGram().setInputCol("DescOut").setN(1)
bigram = NGram().setInputCol("DescOut").setN(2)
unigram.transform(tokenized.select("DescOut")).show(False)
bigram.transform(tokenized.select("DescOut")).show(False)


# COMMAND ----------

from pyspark.ml.feature import CountVectorizer
cv = CountVectorizer()\
  .setInputCol("DescOut")\
  .setOutputCol("countVec")\
  .setVocabSize(500)\
  .setMinTF(1)\
  .setMinDF(2)
fittedCV = cv.fit(tokenized)
fittedCV.transform(tokenized).show(False)
Example #34
    text2 = text.map(rm_junks).collect()

    rawLabelTweetDataFrame = spark.createDataFrame(text2, ["label", "tweets"])

    regexTokenizer = RegexTokenizer(inputCol="tweets",
                                    outputCol="words",
                                    pattern="\\W")
    tokenized = regexTokenizer.transform(rawLabelTweetDataFrame)

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    filteredDataFrame = remover.transform(tokenized).select(
        "label", "filtered")

    uningram = NGram(n=2, inputCol="filtered", outputCol="ngrams")
    uningramDataFrame = uningram.transform(filteredDataFrame)
    uningramDataFrame.select("label", "ngrams").show(truncate=False)

    uningramData = uningramDataFrame.select("label", "ngrams")

    hashingTF = HashingTF(inputCol="ngrams", outputCol="rawFeatures", numFeatures=20)

    featurizedData = hashingTF.transform(uningramData)
    # alternatively, CountVectorizer can also be used to get term frequency vectors

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    rescaledData.select("label", "features").show()
Example #35
print "Start preprocessing all data"
t0 = time()

def preProcess(doc):
    clean = doc.review.replace("<br /><br />"," ")
    tok = nltk.tokenize.wordpunct_tokenize(clean)
    tags = nltk.pos_tag(tok,tagset='universal')
    low = [word.lower() for word in tok]
    return low,zip(*tags)[1],doc.label

schema = StructType([StructField('words',ArrayType(StringType()),True), StructField('tags',ArrayType(StringType()),True), StructField('label',DoubleType())])

dfPre=df.map(preProcess).toDF(schema).cache()
trigram = NGram(n=3,inputCol="tags", outputCol="tagTrigrams")
dfTriAux = trigram.transform(dfPre).cache()
trigram.setInputCol("words")
trigram.setOutputCol("wordTrigrams")
dfTri = trigram.transform(dfTriAux).cache()

dfTrain, dfValid = dfTri.randomSplit([0.8,0.2])


lists=dfTrain.map(lambda r : r.words).collect()
dictUnigrams=list(set(itertools.chain(*lists)))
dictionaryUni={}
for i,word in enumerate(dictUnigrams):
	dictionaryUni[word]=i

dict_broad = sc.broadcast(dictionaryUni)
Example #37
from __future__ import print_function

from pyspark import SparkContext
from pyspark.sql import SQLContext

# $example on$
from pyspark.ml.feature import NGram

# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="NGramExample")
    sqlContext = SQLContext(sc)

    # $example on$
    wordDataFrame = sqlContext.createDataFrame(
        [
            (0, ["Hi", "I", "heard", "about", "Spark"]),
            (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
            (2, ["Logistic", "regression", "models", "are", "neat"]),
        ],
        ["label", "words"],
    )
    ngram = NGram(inputCol="words", outputCol="ngrams")
    ngramDataFrame = ngram.transform(wordDataFrame)
    for ngrams_label in ngramDataFrame.select("ngrams", "label").take(3):
        print(ngrams_label)
    # $example off$

    sc.stop()
Example #38
posTagger = tr.NLTKPosTagger(
    inputCol="words", outputCol="tags")
print "Compute tags"
t0 = time()
dfTags = posTagger.transform(df)
dfTags.show()
tt = time() - t0
print "Tags computed in {} second".format(round(tt,3))


# In[7]:

from pyspark.ml.feature import NGram
trigram = NGram(n=3,inputCol="tags", outputCol="tagTrigrams")
t0 = time()
dfTriAux = trigram.transform(dfTags)
trigram.setInputCol("words")
trigram.setOutputCol("wordTrigrams")
dfTri = trigram.transform(dfTriAux)
dfTri.show()
tt = time() - t0
print "Trigrams created in {} second".format(round(tt,3))


# In[8]:

dfTrain, dfTest = dfTri.randomSplit([0.8,0.2])


# In[9]:
Example #39
remover = StopWordsRemover(inputCol="raw", outputCol="filtered")
remover.transform(sentenceData).show(truncate=False)

# ## n-grams
from pyspark.ml.feature import NGram

wordDataFrame = spark.createDataFrame(
    [(0, ["Hi", "I", "heard", "about", "Spark"]),
     (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
     (2, ["Logistic", "regression", "models", "are", "neat"])],
    ["id", "words"])

ngram = NGram(n=2, inputCol="words", outputCol="ngrams")

ngramDataFrame = ngram.transform(wordDataFrame)
ngramDataFrame.select("ngrams").show(truncate=False)

# _______
# # Feature Extractors
# _______

# ## TF-IDF
#
# Term frequency-inverse document frequency (TF-IDF, http://en.wikipedia.org/wiki/Tf%E2%80%93idf)
# is a feature vectorization method widely used in text mining to reflect the importance of a term
# to a document in the corpus. Denote a term by t, a document by d, and the corpus by D.
# Term frequency TF(t, d) is the number of times that term t appears in document d, while
# document frequency DF(t, D) is the number of documents that contain term t. If we only use
# term frequency to measure the importance, it is very easy to over-emphasize terms that appear very
# often but carry little information about the document, e.g. "a", "the", and "of". If a term appears
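# A minimal TF-IDF sketch to accompany the explanation above (not part of the
# original walkthrough): HashingTF hashes the n-gram tokens from ngramDataFrame
# into a fixed-size term-frequency vector and IDF rescales it. The column names
# and numFeatures value are illustrative.
from pyspark.ml.feature import HashingTF, IDF

hashingTF = HashingTF(inputCol="ngrams", outputCol="rawFeatures", numFeatures=1024)
featurizedData = hashingTF.transform(ngramDataFrame)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
idfModel.transform(featurizedData).select("features").show(truncate=False)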
Example #40
print
print "Dataframe created in {} second".format(round(tt,3))


# In[314]:

from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer(inputCol='review', outputCol='words')
dfTok = tokenizer.transform(df)


# In[315]:

from pyspark.ml.feature import NGram
bigram = NGram(inputCol="words", outputCol="bigrams")
dfBigram = bigram.transform(dfTok)


# In[317]:

print "Start tokenizing, computing bigrams and splitting between test and train"
t0 = time()
dfTrain, dfTest = dfBigram.randomSplit([0.8,0.2])
dfTrain.take(1)
dfTest.take(1)
tt = time() - t0
print "Done in {} second".format(round(tt,3))


# In[318]:
Example #41
    def concat_(*args):
        return list(chain(*args))

    return udf(concat_, ArrayType(type))


concat_string_arrays = concat(StringType())

df = df.withColumn(
    'joined_tokens',
    concat_string_arrays(col('filtered_title_tokens'),
                         col('filtered_sterm_tokens'),
                         col('filtered_attr_tokens')))
joined_ngram = NGram(n=2, inputCol="joined_tokens", outputCol="joined_ngrams")

df = joined_ngram.transform(df)
'''
stemmingUdf = udf(stemming, ArrayType(StringType()))
df = df.withColumn('stemmed_tokens', stemmingUdf('joined_tokens'))
'''
joined_hashingTF = HashingTF(inputCol="joined_ngrams",
                             outputCol="joined_rawFeatures",
                             numFeatures=30000)

df = joined_hashingTF.transform(df)

joined_idf = IDF(inputCol="joined_rawFeatures", outputCol="features")

joined_idfModel = joined_idf.fit(df)

df = joined_idfModel.transform(df)
Example #42
        in_col = dataset[self.getInputCol()]
        return dataset.withColumn(out_col, udf(f, t)(in_col))


posTagger = NLTKPosTagger(inputCol="words", outputCol="tagWords")
dfTagged = posTagger.transform(dfTrainTok)
#dfTagged.show()



#----------------------------------------------------------------------
#------------------------------Bigrams---------------------------------
#----------------------------------------------------------------------
from pyspark.ml.feature import NGram
bigram = NGram(inputCol="words", outputCol="bigrams")
dfBigram = bigram.transform(dfTrainTokNoSw)
print "DataFrame des Bigram: "
dfBigram.show()




#**********************************************************************
#------------------------Feature selection-----------------------------
#**********************************************************************

# For what follows we can choose between the encoding used by the professor (the word is
# present or it is not) or the apparently more informative tf-idf version. In practice tf-idf
# can be misleading, so I still build the unigram and bigram dictionaries so that I can
# compute the professor's sparse vectors, as sketched below.
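# A minimal sketch of that presence/absence encoding (an assumption, not code
# from the original notebook): each row becomes a SparseVector with a 1.0 at the
# index of every bigram found in a broadcast bigram->index dictionary, here
# assumed to be available as dict_broad, with a 'label' column on dfBigram.
from pyspark.mllib.linalg import SparseVector

def encode_presence(bigrams, dictionary):
    indices = sorted({dictionary[b] for b in bigrams if b in dictionary})
    return SparseVector(len(dictionary), indices, [1.0] * len(indices))

# On Spark 2+, use dfBigram.rdd.map instead of dfBigram.map.
encoded = dfBigram.map(lambda r: (encode_presence(r.bigrams, dict_broad.value), r.label))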