def get_column_spec(self, source_df: Optional[DataFrame], current_column: Optional[Column]) -> Column:
    column_spec = array_join(
        self.column.get_column_spec(source_df=source_df, current_column=current_column),
        self.delimiter,
    )
    return column_spec
def process_item_sequence(rating_data):
    # Keep only positive ratings, then collect each user's movies in watch order
    # and also materialize the sequence as a space-delimited string.
    sequences = rating_data.filter(F.col("rating") >= 3.5).groupBy("userId") \
        .agg(udf_combine_movies_by_timeline(
            F.collect_list("movieId"), F.collect_list("timestamp")).alias("movieIds")) \
        .withColumn("movieIdStr", F.array_join(F.col("movieIds"), " "))
    print_info(sequences, message="after build movieIdStr: ")
    sequences = sequences.select("movieIds")
    return sequences
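# udf_combine_movies_by_timeline is referenced above but not defined in this
# snippet. A minimal sketch of what it presumably does (order each user's
# movie ids chronologically by timestamp); the string element type is an
# assumption, not the original implementation.
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType

@F.udf(returnType=ArrayType(StringType()))
def udf_combine_movies_by_timeline(movie_ids, timestamps):
    # Pair each movie with its timestamp, sort chronologically,
    # and return the ordered list of movie ids.
    pairs = sorted(zip(movie_ids, timestamps), key=lambda p: p[1])
    return [movie_id for movie_id, _ in pairs]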
def transform_abstracts_words(dataframe):
    udf_function_clean = udf(generate_cleaned_abstracts, StringType())
    udf_function_sentiment = udf(generate_sentiment, DoubleType())
    df_abstracts = (
        dataframe
        # Explode the abstract array, keeping each element's position.
        .select("paper_id", func.posexplode("abstract").alias("pos", "value"))
        .select("paper_id", "pos", "value.text")
        # Rebuild the sentences in their original order.
        .withColumn(
            "ordered_text",
            func.collect_list("text").over(
                Window.partitionBy("paper_id").orderBy("pos")))
        .groupBy("paper_id")
        .agg(func.max("ordered_text").alias("sentences"))
        .select("paper_id", func.array_join("sentences", " ").alias("abstract"))
        .withColumn("words", func.size(func.split("abstract", r"\s+"))))
    df_abstracts = df_abstracts.withColumn("clean_abstract",
                                           udf_function_clean("abstract"))
    df_abstracts = df_abstracts.withColumn("sentiment_abstract",
                                           udf_function_sentiment("clean_abstract"))
    return df_abstracts
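# generate_cleaned_abstracts and generate_sentiment are assumed helpers that
# are not included above. A minimal sketch, assuming TextBlob for polarity;
# the cleaning rules here are illustrative, not the original implementation.
import re
from textblob import TextBlob

def generate_cleaned_abstracts(text):
    # Lowercase and strip everything except letters and whitespace.
    return re.sub(r"[^a-z\s]", "", text.lower()) if text else ""

def generate_sentiment(text):
    # Polarity in [-1.0, 1.0]; 0.0 for empty input.
    return float(TextBlob(text).sentiment.polarity) if text else 0.0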
def viveknSentimentAnalysis(self, infoData):
    dataset = infoData.get(pc.DATASET)
    labelCol = infoData.get(pc.LABELCOLM)
    dataset = self.changeSentimentVal(dataset, labelCol)
    (trainDataset, testDataset) = dataset.randomSplit([0.80, 0.20], seed=0)
    viveknSentiment = ViveknSentimentApproach() \
        .setInputCols(["document", pc.DMXSTOPWORDS]) \
        .setOutputCol("viveknSentiment") \
        .setSentimentCol("original_sentiment")
    viveknSentimentModel = viveknSentiment.fit(trainDataset)
    testDatasetPrediction = viveknSentimentModel.transform(testDataset)

    # Store the model for future use when predicting sentiment; the path
    # mapping keeps the list of all trained models and pretrained pipelines
    # available for sentiment prediction.
    storagePath = infoData.get(pc.STORAGELOCATION)
    modelName = "testViveknSentiment"  # sahil - temporary only
    modelPath = storagePath + modelName
    viveknSentimentModel.write().overwrite().save(modelPath)
    infoData.get(pc.SPARKNLPPATHMAPPING).update({pc.SENTIMENTMODEL: modelPath})

    # Convert the annotation column back to a plain string.
    testDatasetPrediction = testDatasetPrediction.withColumn(
        "viveknSentiment", array_join("viveknSentiment.result", ""))
    infoData.update({pc.TESTDATA: testDatasetPrediction})
    # Both the original and predicted sentiment columns need indexing for evaluation.
    infoData = self.evaluation(self.stringToIndex(infoData))
    """
    --> First check whether the indexing matches the label/original sentiment;
    if not, swap the indices with the method below:
    finalDatasetTest = finalDatasetTest.withColumn(
        "finalDataset_indexed",
        when(finalDatasetTest["finalDataset_indexed"] == 0.0, 1.0)
        .when(finalDatasetTest["finalDataset_indexed"] == 1.0, 0.0))
    """
    return infoData
def main(input_dir, output_dir):
    # main logic starts here
    df_schema = types.StructType([
        types.StructField('title_clean', types.StringType()),
        types.StructField('title', types.StringType()),
        types.StructField('created_utc_iso', types.DateType()),
        types.StructField('polarity_subjectivity', types.ArrayType(types.FloatType()))
    ])
    headlines_df = spark.read.json(input_dir, encoding='utf-8',
                                   schema=df_schema).repartition(80)
    # element_at is 1-based: index 1 is polarity, index 2 is subjectivity.
    split_sentiment_df = headlines_df.withColumn(
        'polarity', functions.element_at(headlines_df['polarity_subjectivity'], 1)
    ).withColumn(
        'subjectivity', functions.element_at(headlines_df['polarity_subjectivity'], 2)
    ).cache()

    for year_int in range(2008, 2020):
        print('Plotting for ' + str(year_int))
        headlines_year = split_sentiment_df.where(
            functions.year(split_sentiment_df['created_utc_iso']) == year_int
        ).withColumn('year', functions.year(split_sentiment_df['created_utc_iso']))
        headlines_grouped = headlines_year.groupBy(headlines_year['year']).agg(
            functions.collect_set(headlines_year['title_clean']).alias('titles_group')
        )
        headlines_joined = headlines_grouped.select(
            functions.array_join(headlines_grouped['titles_group'], ' ').alias('joined')
        )
        # Only one row remains: the concatenated headlines for the year.
        string_to_plot = headlines_joined.collect()[0]['joined']
        wordcloud = WordCloud(background_color='white', stopwords=stopwords,
                              width=1000, height=500).generate(string_to_plot)
        wordcloud.to_file(output_dir + '/' + str(year_int) + '_words.png')
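# The stopwords object used above is not defined in this snippet. A plausible
# setup, assuming the wordcloud package's built-in list (an assumption, not
# the original source):
from wordcloud import STOPWORDS

stopwords = set(STOPWORDS)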
def performSentiment(self, infoData):
    dataset = infoData.get(pc.DATASET)
    sentimentModelPath = (infoData.get(pc.SPARKNLPPATHMAPPING)).get(pc.SENTIMENTMODEL)
    viveknSentimentModel = ViveknSentimentModel.load(sentimentModelPath)
    dataset = viveknSentimentModel.transform(dataset)
    dataset = dataset.withColumn("viveknSentiment",
                                 array_join("viveknSentiment.result", ""))
    infoData.update({pc.DATASET: dataset})
    return infoData
def column_revalue(vcf):
    # The INFO value still needs to be fixed.
    name_list = ["ID", "REF", "ALT", "INFO", "FORMAT"]
    for name in name_list:
        if name == "FORMAT":
            # Deduplicate and sort the per-record FORMAT keys, then rebuild
            # the colon-delimited FORMAT string with GT first.
            vcf = vcf.withColumn(
                name, F.array_sort(F.array_distinct(F.flatten(F.col(name)))))
            vcf = vcf.withColumn(
                name, F.concat(F.lit("GT:"), F.array_join(F.col(name), ":")))
        else:
            vcf = vcf.withColumn(name, F.array_max(F.col(name)))
    return vcf
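# A minimal, illustrative check of the FORMAT branch above on made-up data
# (assumes an active SparkSession named spark): an array of per-record
# FORMAT key arrays collapses to a single "GT:..." string.
df = spark.createDataFrame([([["DP", "AD"], ["AD"]],)], ["FORMAT"])
df = df.withColumn("FORMAT", F.array_sort(F.array_distinct(F.flatten(F.col("FORMAT")))))
df = df.withColumn("FORMAT", F.concat(F.lit("GT:"), F.array_join(F.col("FORMAT"), ":")))
df.show()  # FORMAT == "GT:AD:DP"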
def vivekSentimentPretrained(self, infoData):
    applySentimentOn = infoData.get(vc.APPLY_SENTIMENT_ON)
    dataset = infoData.get(mc.DATASET)
    viveknPretrainedModelPath = infoData.get(vc.VIVEKNPRETRAINEDPATH)
    predictionCol = infoData.get(mc.PREDICTIONCOL)
    # Download the model once; afterwards load it from local storage to
    # avoid depending on the online downloader.
    viveknSentiment = ViveknSentimentModel.load(viveknPretrainedModelPath) \
        .setInputCols(["document", applySentimentOn]).setOutputCol(predictionCol)
    dataset = viveknSentiment.transform(dataset)
    dataset = dataset.withColumn(predictionCol,
                                 array_join(predictionCol + ".result", ""))
    dataset = dataset.select(mc.PA_INDEX, predictionCol)
    return dataset
def variant_SPDI(
        contig="contigName",
        start="start",
        # end="end",
        ref="referenceAllele",
        alt="alternateAllele",
        join_alternate_alleles=False):
    # If the alternate-allele column is an array, concatenate it with ','.
    if join_alternate_alleles:
        alt = f.array_join(alt, ",")
    return f.concat(contig, f.lit(":"), start, f.lit(":"), ref, f.lit(":"), alt)
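# A minimal usage sketch, assuming a Glow-style variant DataFrame named
# variants_df that carries the default column names used above (the
# DataFrame name is an assumption for illustration):
spdi_df = variants_df.select(
    variant_SPDI(join_alternate_alleles=True).alias("spdi"))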
def parquet_revalue(vcf, indel_com):
    temp = indel_com.join(vcf, ["#CHROM", "POS"], "full")
    sample_name = temp.columns[-1]
    # Forward-fill the sample column within each chromosome, ordered by position.
    sample_w = Window.partitionBy(F.col("#CHROM")).orderBy(F.col("POS")) \
        .rangeBetween(Window.unboundedPreceding, Window.currentRow)
    temp = temp.withColumn(
        sample_name,
        F.last(sample_name, ignorenulls=True).over(sample_w)
    ).withColumnRenamed("#CHROM", "CHROM")

    # Rows whose sample map keys do not match FORMAT go through the Scala UDF
    # index2dict to realign the values to the FORMAT keys.
    null_not_value = temp.filter(F.map_keys(F.col(sample_name)) != F.col("FORMAT")) \
        .selectExpr("CHROM", "POS",
                    "index2dict({}, FORMAT) as {}".format(sample_name, sample_name)) \
        .withColumn(sample_name,
                    F.concat(F.lit("./.:"), F.array_join(F.col(sample_name), ":")))
    null_value = temp.filter(F.map_keys(F.col(sample_name)) == F.col("FORMAT")).drop("FORMAT") \
        .withColumn(sample_name,
                    F.concat(F.lit("./.:"),
                             F.array_join(F.map_values(F.col(sample_name)), ":")))
    value_union = null_not_value.union(null_value).withColumnRenamed("CHROM", "#CHROM")
    return value_union
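# index2dict above is a registered Scala UDF whose source is not shown. A
# rough Python analogue of what the call site implies (realigning a sample's
# key->value map to the FORMAT key order, filling gaps with "."), assuming
# FORMAT is an array of keys; this is a guess at its semantics, not the
# original implementation.
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType

@F.udf(returnType=ArrayType(StringType()))
def index2dict_py(sample_map, format_keys):
    return [sample_map.get(key, ".") for key in format_keys]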
def get_column_spec(
    self,
    source_df: Optional[DataFrame],
    current_column: Optional[Column],
    parent_columns: Optional[List[Column]],
) -> Column:
    column_spec = array_join(
        self.column.get_column_spec(
            source_df=source_df,
            current_column=current_column,
            parent_columns=parent_columns,
        ),
        self.delimiter,
    )
    return column_spec
def process_tweet_text(df):
    """Removes punctuation and stop words from the text column.

    Args:
        df (DataFrame): A DataFrame with the column from which stop words
            need to be removed.

    Returns:
        DataFrame: The result of applying StopWordsRemover with text as the
            input column and filtered as the output column, with the filtered
            tokens rejoined into a tweet column.
    """
    df = df.withColumn('text', split(removePunctuation(df['text']), ' ').alias('text'))
    stopWordList = list(string.punctuation) \
        + ['http', 'https', 'rt', 'via', '...', '…', '’', '—', '—:', '“'] \
        + StopWordsRemover.loadDefaultStopWords('english')
    remover = StopWordsRemover(inputCol="text", outputCol="filtered",
                               stopWords=stopWordList)
    df = remover.transform(df)
    df = df.withColumn('tweet', array_join(df['filtered'], ' '))
    return df.select('date', 'tweet', 'hashtags')
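# removePunctuation is referenced above but not defined in this snippet.
# A minimal sketch of a typical implementation (an assumption, not the
# original helper):
from pyspark.sql.functions import lower, regexp_replace, trim

def removePunctuation(column):
    # Strip non-alphanumeric characters, lowercase, and trim whitespace.
    return trim(lower(regexp_replace(column, r"[^a-zA-Z0-9\s]", "")))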
def vivekSentimentPretrained(self, infoData):
    dataset = infoData.get(pc.DATASET)
    viveknPretrainedModelPath = infoData.get(pc.VIVEKNPRETRAINEDMODEL)
    predictionCol = infoData.get(pc.PREDICTIONCOLM)
    # Download the model once; afterwards load it from local storage to
    # avoid depending on the online downloader.
    viveknSentiment = ViveknSentimentModel.load(viveknPretrainedModelPath) \
        .setInputCols(["document", pc.DMXSTOPWORDS]).setOutputCol(predictionCol)
    dataset = viveknSentiment.transform(dataset)
    dataset = dataset.withColumn(predictionCol,
                                 array_join(predictionCol + ".result", ""))
    dataset = dataset.select(pc.DMXINDEX, predictionCol)
    # Map the raw labels onto the project's sentiment constants.
    dataset = dataset.withColumn(
        predictionCol,
        when(dataset[predictionCol] == "negative", pc.NEGATIVE)
        .when(dataset[predictionCol] == "positive", pc.POSITIVE)
        .otherwise(pc.NEUTRAL))
    return dataset
def test_auto_mapper_join_using_delimiter(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (123456789, "Gagan", "Chawla", ["MD", "PhD"]),
        ],
        ["npi", "first_name", "last_name", "suffix"],
    ).createOrReplaceTempView("practitioners")
    source_df: DataFrame = spark_session.table("practitioners")
    df = source_df.select("npi")
    df.createOrReplaceTempView("physicians")

    # Act
    mapper = AutoMapper(
        view="physicians",
        source_view="practitioners",
        keys=["npi"],
    ).columns(my_column=A.join_using_delimiter(A.column("suffix"), ", "))
    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")
    assert_compare_expressions(
        sql_expressions["my_column"],
        array_join(col("b.suffix"), ", ").alias("my_column"),
    )
    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()
    assert (result_df.where("npi == 123456789").select("my_column").collect()[0][0]
            == "MD, PhD")
def stringify(self, result,
              recommendation_info_item_delimiter="\001",
              recommendation_info_field_delimiter="\004",
              item_embedding_value_delimiter="\003",
              user_embedding_value_delimiter="\003"):
    import pyspark.sql.functions as F
    from pyspark.sql.functions import pandas_udf
    output_item_embeddings = self.output_item_embeddings

    @pandas_udf('string')
    def format_rec_info(rec_info):
        import pandas as pd
        output = []
        for record in rec_info:
            string = ''
            for item in record:
                if string:
                    string += recommendation_info_item_delimiter
                string += item['name']
                string += recommendation_info_field_delimiter
                string += str(item['distance'])
                if output_item_embeddings:
                    string += recommendation_info_field_delimiter
                    string += item_embedding_value_delimiter.join(
                        map(str, item['item_embedding']))
            output.append(string)
        return pd.Series(output)

    result = result.withColumn(
        self.recommendation_info_column_name,
        format_rec_info(self.recommendation_info_column_name))
    if self.output_user_embeddings:
        result = result.withColumn(
            self.user_embedding_column_name,
            F.array_join(F.col(self.user_embedding_column_name),
                         user_embedding_value_delimiter))
    return result
def sparkLemmatizer(self, dataset, colName, lemmatizedModelPath):
    # Collapse the token array into a single space-delimited string.
    dataset = dataset.select(
        "SA_index", concat_ws(",", dataset[colName]).alias(colName))
    dataset = dataset.withColumn(colName, regexp_replace(col(colName), ",", " "))
    dataset = dataset.drop('lemma', 'document')
    documentAssembler = DocumentAssembler() \
        .setInputCol(colName) \
        .setOutputCol("document")
    tokenizer = Tokenizer() \
        .setInputCols(["document"]) \
        .setOutputCol("token")
    lemmaModel = LemmatizerModel.load(lemmatizedModelPath) \
        .setInputCols(["document", "token"]).setOutputCol("lemma")
    finisher = Finisher() \
        .setInputCols(["lemma"]) \
        .setOutputCols(["ntokens"]) \
        .setOutputAsArray(True) \
        .setCleanAnnotations(True)
    lemmatizerPipeline = Pipeline(
        stages=[documentAssembler, tokenizer, lemmaModel])
    dataset = lemmatizerPipeline.fit(dataset).transform(dataset)
    dataset = dataset.withColumn("lemma_result", array_join("lemma.result", ""))
    sentimentTokenizer = RegexTokenizer(inputCol="lemma_result",
                                        outputCol="SA_lemma",
                                        toLowercase=True,
                                        pattern="\\W")
    dataset = sentimentTokenizer.transform(dataset)
    return dataset
.registerTempTable("input") #load dictionary as csv to reduce proccessing required to pivot json fields dictSchema = [ StructField('colID', StringType(), True), StructField('colValue', IntegerType(), True) ] finalStruct = StructType(fields=dictSchema) df2 = sqlContext.read.csv(path='inputs/freq_dict_mini.csv', header=True, schema=finalStruct, ignoreLeadingWhiteSpace=True, ignoreTrailingWhiteSpace=True) newDf = df2\ .withColumn("sortedID",F.array_join(F.sort_array(F.split(df2["colID"],"")),"",""))\ .withColumn("wordLen", F.length('colID')) newDf.registerTempTable("dictionary") #join on sorted characters to get unscrambled possibilities sqlContext.sql("""select puzzle_id, letters, keyPositions, colID, colValue, getKeyLetters_udf(colID,keyPositions) as keyLetters, answerLengths from input i inner join dictionary d on d.sortedID = i.sortedLetters """).registerTempTable("unscrambled") #aggregate to posibilities into list unscrambled = sqlContext.sql("""select puzzle_id,
def text_formatting(spark):
    """Extract formatting features from the text of a post

    Args:
        spark (SparkSession): used to run queries and commands

    Returns:
        DataFrame: With columns [
            (post)_Id, #codelines, #html_blocks, #headings, #referencelist,
            #quotes, #codeblocks, #themebreaks, #codespans, #references,
            #links, #inline_images, #mail_addresses, #emphasis, #strong
        ]
    """
    # Replaces formatted text that has already been processed
    FILLER = 'x'
    # Parser helper column
    COLNAME = 'processed_text'
    COL = col(COLNAME)

    # Data loading
    post_history_df = spark.read.parquet("/user/***REMOVED***/StackOverflow/PostHistory.parquet") \
        .select(['_PostId', '_Text', '_PostHistoryTypeId']) \
        .filter(col('_PostHistoryTypeId') == 2) \
        .drop('_PostHistoryTypeId')
    post_df = spark.read.parquet('/user/***REMOVED***/StackOverflow/Posts.parquet') \
        .select(['_Id', '_PostTypeId']) \
        .filter(col('_PostTypeId') == 1) \
        .drop("_PostTypeId")
    df = post_history_df.join(post_df, post_df['_Id'] == post_history_df['_PostId'])

    # Count lines and words of the formatted text
    df = df.withColumn('#lines', size(split(col('_Text'), r'\n'))) \
        .withColumn('#words', size(split(col('_Text'), r'\s+')))

    # BLOCK ELEMENTS
    # Count code lines
    df = df.withColumn(COLNAME, split(col('_Text'), regex.CODE_BLOCK_RE)) \
        .withColumn('#codelines', size(COL) - 1) \
        .withColumn('codeline_ratio', col('#codelines') / col('#lines')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count HTML blocks
    df = df.withColumn(COLNAME, split(COL, regex.HTML_BLOCK_RE)) \
        .withColumn('#html_blocks', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count headings (1/2)
    df = df.withColumn(COLNAME, split(COL, regex.SETEXT_HEADING_RE)) \
        .withColumn('#headings', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count reference list
    df = df.withColumn(COLNAME, split(COL, regex.REFERENCE_LIST_RE)) \
        .withColumn('#referencelist', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count quotes
    df = df.withColumn(COLNAME, split(COL, regex.QUOTE_RE)) \
        .withColumn('#quotes', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count headings (2/2)
    df = df.withColumn(COLNAME, split(COL, regex.HEADING_RE)) \
        .withColumn('#headings', size(COL) - 1 + col('#headings')) \
        .withColumn('heading_ratio', col('#headings') / col('#lines')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count code blocks
    df = df.withColumn(COLNAME, split(COL, regex.FENCED_CODE_RE)) \
        .withColumn('#codeblocks', size(COL) - 1) \
        .withColumn('codeblock_ratio', col('#codeblocks') / col('#lines')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count thematic breaks
    df = df.withColumn(COLNAME, split(COL, regex.THEME_BREAK_RE)) \
        .withColumn('#themebreaks', size(COL) - 1) \
        .withColumn('themebreak_ratio', col('#themebreaks') / col('#lines')) \
        .withColumn(COLNAME, array_join(COL, FILLER))

    # INLINE ELEMENTS
    # Count codespans
    df = df.withColumn(COLNAME, split(COL, regex.CODESPAN_RE)) \
        .withColumn('#codespans', size(COL) - 1) \
        .withColumn('codespan_ratio', col('#codespans') / col('#words')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Remove markdown escapes
    df = df.withColumn(COLNAME, regexp_replace(COL, regex.ESCAPE_RE, FILLER))
    # Count references (1/2)
    df = df.withColumn(COLNAME, split(COL, regex.REFERENCE_RE)) \
        .withColumn('#references', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count links (1/2)
    df = df.withColumn(COLNAME, split(COL, regex.LINK_RE)) \
        .withColumn('#links', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count inline images
    df = df.withColumn(COLNAME, split(COL, regex.INLINE_IMAGE_RE)) \
        .withColumn('#inline_images', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count references (2/2)
    df = df.withColumn(COLNAME, split(COL, regex.SHORT_REFERENCE_RE)) \
        .withColumn('#references', size(COL) - 1 + col('#references')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count links (2/2)
    df = df.withColumn(COLNAME, split(COL, regex.AUTOLINK_RE)) \
        .withColumn('#links', size(COL) - 1 + col('#links')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count mails
    df = df.withColumn(COLNAME, split(COL, regex.AUTOMAIL_RE)) \
        .withColumn('#mail_addresses', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))

    # Remove line breaks, html, stand-alone * or _
    df = df.withColumn(COLNAME, regexp_replace(COL, regex.LINE_BREAK_RE, FILLER))
    df = df.withColumn(COLNAME, regexp_replace(COL, regex.HTML_RE, FILLER))
    df = df.withColumn(COLNAME, regexp_replace(COL, regex.NOT_STRONG_RE, FILLER))

    # Count strong & emphasis
    df = df.withColumn(COLNAME, split(COL, regex.EM_STRONG_RE)) \
        .withColumn('#emphasis', size(COL) - 1) \
        .withColumn('#strong', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    df = df.withColumn(COLNAME, split(COL, regex.STRONG_EM_RE)) \
        .withColumn('#emphasis', size(COL) - 1 + col('#emphasis')) \
        .withColumn('#strong', size(COL) - 1 + col('#strong')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    df = df.withColumn(COLNAME, split(COL, regex.STRONG_EM3_RE)) \
        .withColumn('#emphasis', size(COL) - 1 + col('#emphasis')) \
        .withColumn('#strong', size(COL) - 1 + col('#strong')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    df = df.withColumn(COLNAME, split(COL, regex.STRONG_RE)) \
        .withColumn('#strong', size(COL) - 1 + col('#strong')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    df = df.withColumn(COLNAME, split(COL, regex.EMPHASIS_RE)) \
        .withColumn('#emphasis', size(COL) - 1 + col('#emphasis')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    df = df.withColumn(COLNAME, split(COL, regex.EM_STRONG2_RE)) \
        .withColumn('#emphasis', size(COL) - 1 + col('#emphasis')) \
        .withColumn('#strong', size(COL) - 1 + col('#strong')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    df = df.withColumn(COLNAME, split(COL, regex.STRONG2_RE)) \
        .withColumn('#strong', size(COL) - 1 + col('#strong')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    df = df.withColumn(COLNAME, split(COL, regex.EMPHASIS2_RE)) \
        .withColumn('#emphasis', size(COL) - 1 + col('#emphasis')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    df = df.withColumn('emphasis_ratio', col('#emphasis') / col('#words')) \
        .withColumn('strong_ratio', col('#strong') / col('#words'))

    # Remove unnecessary columns, including the parser helper column
    df = df.drop('_Text', '_PostHistoryTypeId', '_PostId', '#lines', '#words', COLNAME)
    return df
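# The counting idiom above relies on split: splitting a string on a pattern
# yields one more piece than there are matches, so size(split(...)) - 1
# counts occurrences while array_join(COL, FILLER) stitches the remainder
# back together for the next pass. A self-contained illustration (assumes an
# active SparkSession named spark; the pattern is illustrative):
from pyspark.sql.functions import col, size, split

demo = spark.createDataFrame([("a `x` b `y` c",)], ["text"])
demo.select((size(split(col("text"), "`[^`]*`")) - 1).alias("n_codespans")).show()
# n_codespans == 2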
        func.col('Au.auid').cast('long'))
    # Select affiliations per author based on the most dominant affiliation
    # in the last year: keep only records from the final year...
    .withColumn(
        'yearRank',
        func.rank().over(
            Window.partitionBy('auid').orderBy(func.desc(sort_pub_year)))
    ).filter('yearRank=1')
    # ...then count the afid occurrences per author.
    .groupBy('auid', func.col('affiliation.afid').alias('affil_id')).agg(
        func.count('*').alias('aff_occurences'),
        func.first(
            func.array_join('affiliation.affiliation_organization', ", ")
        ).alias('affil_name'),
        func.first('affiliation.affiliation_tag_country').alias('cntry'),
        func.max('datesort').alias('max_datesort'),
    ).withColumn(
        'affRank',
        func.rank().over(
            Window.partitionBy('auid').orderBy(
                func.desc('aff_occurences'),
                func.desc('max_datesort'),
                func.asc('affil_id')))
    ).filter('affRank=1').drop('affRank').drop('aff_occurences')
    # Get the name from the ipr record (preferred name); otherwise default
    # to the name as printed on the paper.
    .join(
def export_vector_df(vector_df, vocab_df, output_folder):
    vector_df = vector_df.join(vocab_df, 'id').select('standard_concept_id', 'vector')
    vector_df.withColumn('vector', F.array_join(F.col('vector'), ',')) \
        .select('standard_concept_id', 'vector').repartition(1) \
        .write.option('header', 'true').mode('overwrite') \
        .csv(create_file_path(output_folder, 'embedding_csv'))
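# create_file_path is not defined in this snippet; a minimal sketch of the
# obvious implementation (an assumption, not the original helper):
import os

def create_file_path(folder, name):
    return os.path.join(folder, name)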