def get_column_spec(self, source_df: Optional[DataFrame],
                     current_column: Optional[Column]) -> Column:
     column_spec = array_join(
         self.column.get_column_spec(source_df=source_df,
                                     current_column=current_column),
         self.delimiter)
     return column_spec
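For reference, a minimal standalone sketch of what pyspark.sql.functions.array_join does on a plain DataFrame (the session and column names here are illustrative, not taken from the examples):

from pyspark.sql import SparkSession
from pyspark.sql.functions import array_join

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(["MD", "PhD"],)], ["suffix"])

# array_join concatenates the array elements with the given delimiter;
# null elements are skipped unless a null_replacement is passed as a third argument
df.select(array_join("suffix", ", ").alias("joined")).show()
# -> "MD, PhD"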
Example #2
def process_item_sequence(rating_data):
    sequences = rating_data.filter(F.col("rating") >= 3.5).groupBy("userId")\
        .agg(udf_combine_movies_by_timeline(F.collect_list("movieId"), F.collect_list("timestamp")).alias("movieIds"))\
        .withColumn("movieIdStr", F.array_join(F.col("movieIds"), " "))
    print_info(sequences, message="after build movieIdStr: ")
    sequences = sequences.select("movieIds")
    return sequences
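The helper udf_combine_movies_by_timeline is not shown in this example; a minimal sketch of what such a UDF could look like, assuming it only orders the collected movieIds by their timestamps:

from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType

@F.udf(returnType=ArrayType(StringType()))
def udf_combine_movies_by_timeline(movie_ids, timestamps):
    # pair each movieId with its timestamp, sort chronologically,
    # and return the ordered movieIds as strings
    pairs = sorted(zip(movie_ids, timestamps), key=lambda pair: pair[1])
    return [str(movie_id) for movie_id, _ in pairs]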
Example #3
def transform_abstracts_words(dataframe):
    udf_function_clean = udf(generate_cleaned_abstracts, StringType())
    udf_function_sentiment = udf(generate_sentiment, DoubleType())

    df_abstracts = (
        dataframe
        .select("paper_id", func.posexplode("abstract").alias("pos", "value"))
        .select("paper_id", "pos", "value.text")
        .withColumn(
            "ordered_text",
            func.collect_list("text").over(
                Window.partitionBy("paper_id").orderBy("pos")))
        .groupBy("paper_id")
        .agg(func.max("ordered_text").alias("sentences"))
        .select("paper_id",
                func.array_join("sentences", " ").alias("abstract"))
        .withColumn("words", func.size(func.split("abstract", r"\s+"))))

    df_abstracts = df_abstracts.withColumn("clean_abstract",
                                           udf_function_clean("abstract"))
    df_abstracts = df_abstracts.withColumn(
        "sentiment_abstract", udf_function_sentiment("clean_abstract"))

    return df_abstracts
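generate_cleaned_abstracts and generate_sentiment are external helpers that are not part of this snippet; a minimal illustrative pair, where the cleaning regex and the use of TextBlob for polarity are assumptions:

import re
from textblob import TextBlob

def generate_cleaned_abstracts(text):
    # lowercase and keep only word characters and whitespace
    return re.sub(r"[^\w\s]", " ", (text or "").lower()).strip()

def generate_sentiment(text):
    # polarity in [-1.0, 1.0]; TextBlob is only an illustrative choice here
    return float(TextBlob(text or "").sentiment.polarity)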
Example #4
    def viveknSentimentAnalysis(self, infoData):
        dataset = infoData.get(pc.DATASET)
        labelCol = infoData.get(pc.LABELCOLM)
        dataset = self.changeSentimentVal(dataset, labelCol)
        (trainDataset, testDataset) = dataset.randomSplit([0.80, 0.20], seed=0)
        viveknSentiment = ViveknSentimentApproach().setInputCols(["document", pc.DMXSTOPWORDS])\
            .setOutputCol("viveknSentiment").setSentimentCol("original_sentiment")
        viveknSentimentModel = viveknSentiment.fit(trainDataset)
        testDatasetPrediction = viveknSentimentModel.transform(testDataset)

        # store the model at a location for future use when predicting sentiment;
        # this gives the list of all trained models and pretrained pipelines to use for sentiment prediction.
        storagePath = infoData.get(pc.STORAGELOCATION)
        modelName = "testViveknSentiment"  # sahil - temporary only
        modelPath = storagePath + modelName
        viveknSentimentModel.write().overwrite().save(modelPath)
        infoData.get(pc.SPARKNLPPATHMAPPING).update(
            {pc.SENTIMENTMODEL: modelPath})

        #convert back the column type to the string format
        testDatasetPrediction = testDatasetPrediction.withColumn(
            "viveknSentiment", array_join("viveknSentiment.result", ""))
        infoData.update({pc.TESTDATA: testDatasetPrediction})

        # need to convert both columns (original sentiment and predicted sentiment) for evaluation.
        infoData = self.evaluation(self.stringToIndex(infoData))
        """
        --> first check whether the indexing matches the label/original sentiment; if not, remap it with the method below.
        finalDatasetTest = finalDatasetTest.withColumn("finalDataset_indexed",
                 when(finalDatasetTest["finalDataset_indexed"] == 0.0, 1.0)
                 .when(finalDatasetTest["finalDataset_indexed"] == 1.0, 0.0))
        """
        return infoData
Example #5
def main(input_dir,output_dir):
    # main logic starts here
    df_schema = types.StructType([
        types.StructField('title_clean', types.StringType()),
        types.StructField('title', types.StringType()),
        types.StructField('created_utc_iso', types.DateType()),
        types.StructField('polarity_subjectivity', types.ArrayType(types.FloatType()))
    ])

    headlines_df = spark.read.json(input_dir,encoding='utf-8',schema=df_schema).repartition(80)
    split_sentiment_df = headlines_df.withColumn(
        'polarity', functions.element_at(headlines_df['polarity_subjectivity'],1)
    ).withColumn(
        'subjectivity', functions.element_at(headlines_df['polarity_subjectivity'],2)
    ).cache()

    for year_int in range(2008,2020):
        print('Plotting for '+str(year_int))
        headlines_year = split_sentiment_df.where(
            functions.year(split_sentiment_df['created_utc_iso']) == year_int
        ).withColumn('year',functions.year(split_sentiment_df['created_utc_iso']))

        headlines_grouped = headlines_year.groupBy(headlines_year['year']).agg(
            functions.collect_set(headlines_year['title_clean']).alias('titles_group')
        )
        headlines_joined = headlines_grouped.select( functions.array_join(headlines_grouped['titles_group'],' ').alias('joined') )
        string_to_plot = headlines_joined.collect()[0]['joined'] #only one row remaining of concatenated headlines

        wordcloud = WordCloud(background_color='white', stopwords=stopwords, width=1000, height=500).generate(string_to_plot)
        wordcloud.to_file(output_dir + '/'+str(year_int)+'_words.png')
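The stopwords variable used by WordCloud above is not defined in this snippet; a minimal assumption is the stop word set shipped with the wordcloud package:

from wordcloud import STOPWORDS

stopwords = set(STOPWORDS)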
Example #6
    def performSentiment(self, infoData):
        dataset = infoData.get(pc.DATASET)
        sentimentModelPath = (infoData.get(pc.SPARKNLPPATHMAPPING)).get(
            pc.SENTIMENTMODEL)
        viveknSentimentModel = ViveknSentimentModel.load(sentimentModelPath)
        dataset = viveknSentimentModel.transform(dataset)
        dataset = dataset.withColumn("viveknSentiment",
                                     array_join("viveknSentiment.result", ""))
        infoData.update({pc.DATASET: dataset})

        return infoData
Example #7
def column_revalue(vcf):
    # the INFO values still need to be revised
    name_list = ["ID", "REF", "ALT", "INFO", "FORMAT"]
    for name in name_list:
        if name == "FORMAT":
            vcf = vcf.withColumn(
                name, F.array_sort(F.array_distinct(F.flatten(F.col(name)))))
            vcf = vcf.withColumn(
                name, F.concat(F.lit("GT:"), F.array_join(F.col(name), ":")))
        else:
            vcf = vcf.withColumn(name, F.array_max(F.col(name)))
    return vcf
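For the FORMAT branch above, a tiny illustration of what the flatten/array_distinct/array_sort/array_join chain produces (the sample values are made up; spark is an existing session):

from pyspark.sql import functions as F

demo = spark.createDataFrame([([["DP", "GQ"], ["GQ", "AD"]],)], ["FORMAT"])
demo.select(
    F.concat(
        F.lit("GT:"),
        F.array_join(F.array_sort(F.array_distinct(F.flatten(F.col("FORMAT")))), ":"))
    .alias("FORMAT")).show(truncate=False)
# -> GT:AD:DP:GQ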
Example #8
 def vivekSentimentPretrained(self, infoData):
     applySentimentOn = infoData.get(vc.APPLY_SENTIMENT_ON)
     dataset = infoData.get(mc.DATASET)
     viveknPretrainedModelPath = infoData.get(vc.VIVEKNPRETRAINEDPATH)
     predictionCol = infoData.get(mc.PREDICTIONCOL)
     """use to download it once later we need to load it from the local to avoid dependency on online downloader."""
     viveknSentiment = ViveknSentimentModel.load(viveknPretrainedModelPath).setInputCols(
         ["document", applySentimentOn]).setOutputCol(predictionCol)
     dataset = viveknSentiment.transform(dataset)
     dataset = dataset.withColumn(predictionCol, array_join(predictionCol + ".result", ""))
     dataset = dataset.select(mc.PA_INDEX, predictionCol)
     return dataset
Example #9
def variant_SPDI(
        contig="contigName",
        start="start",
        # end="end",
        ref="referenceAllele",
        alt="alternateAllele",
        join_alternate_alleles=False):
    # if alt allele column is an array, concat with ','
    if join_alternate_alleles:
        alt = f.array_join(alt, ",")

    return f.concat(contig, f.lit(":"), start, f.lit(":"), ref, f.lit(":"),
                    alt)
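A hedged usage sketch for the function above: variants_df is a hypothetical DataFrame whose alternateAlleles column is an array of strings (the other column names follow the defaults):

# join the alt alleles with ',' before building the contig:start:ref:alt string
spdi_df = variants_df.withColumn(
    "SPDI",
    variant_SPDI(alt="alternateAlleles", join_alternate_alleles=True))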
Example #10
def parquet_revalue(vcf, indel_com):
    temp = indel_com.join(vcf, ["#CHROM", "POS"], "full")
    sample_name = temp.columns[-1]

    sample_w = Window.partitionBy(F.col("#CHROM")).orderBy(
        F.col("POS")).rangeBetween(Window.unboundedPreceding,
                                   Window.currentRow)
    temp = temp.withColumn(
        sample_name,
        F.last(sample_name,
               ignorenulls=True).over(sample_w)).withColumnRenamed(
                   "#CHROM", "CHROM")

    # scala UDF
    null_not_value = temp.filter(F.map_keys(F.col(sample_name)) != F.col("FORMAT"))\
                         .selectExpr("CHROM", "POS","index2dict({}, FORMAT) as {}".format(sample_name, sample_name))\
                         .withColumn(sample_name,  F.concat(F.lit("./.:"), F.array_join(F.col(sample_name), ":")))

    null_value = temp.filter(F.map_keys(F.col(sample_name)) == F.col("FORMAT")).drop("FORMAT")\
                     .withColumn(sample_name, F.concat(F.lit("./.:"), F.array_join(F.map_values(F.col(sample_name)), ":")))

    value_union = null_not_value.union(null_value).withColumnRenamed(
        "CHROM", "#CHROM")
    return value_union
Example #11
 def get_column_spec(
     self,
     source_df: Optional[DataFrame],
     current_column: Optional[Column],
     parent_columns: Optional[List[Column]],
 ) -> Column:
     column_spec = array_join(
         self.column.get_column_spec(
             source_df=source_df,
             current_column=current_column,
             parent_columns=parent_columns,
         ),
         self.delimiter,
     )
     return column_spec
Example #12
def process_tweet_text(df):
    """Removes punctuation, stop words from inputCol and the output is in the outputCol Column.

    Args:
        df (DataFrame): A DataFrame with the column from which Stop Words need to be removed.

    Returns:
        DataFrame: Applying StopWordsRemover with text_clean as the input column and filtered as the output column.
    """
    df = df.withColumn('text', split(removePunctuation(df['text']), ' ').alias('text'))
    stopWordList = list(string.punctuation) + ['http', 'https', 'rt','via','...','…','’','—','—:','“'] + StopWordsRemover.loadDefaultStopWords('english')
    remover = StopWordsRemover(inputCol="text", outputCol="filtered", stopWords = stopWordList)
    df = remover.transform(df)
    df = df.withColumn('tweet', array_join(df['filtered'], ' '))
    return df.select('date', 'tweet', 'hashtags')
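removePunctuation is an external helper that is not shown here; a minimal sketch of such a function, assuming it lowercases, strips punctuation and trims the column:

from pyspark.sql.functions import lower, regexp_replace, trim

def removePunctuation(column):
    # lowercase, drop everything that is not a word character or whitespace, then trim
    return trim(lower(regexp_replace(column, r"[^\w\s]", "")))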
Example #13
 def vivekSentimentPretrained(self, infoData):
     dataset = infoData.get(pc.DATASET)
     viveknPretrainedModelPath = infoData.get(pc.VIVEKNPRETRAINEDMODEL)
     predictionCol = infoData.get(pc.PREDICTIONCOLM)
     """use to download it once later we need to load it from the local to avoid dependency on online downloader."""
     viveknSentiment = ViveknSentimentModel.load(
         viveknPretrainedModelPath).setInputCols(
             ["document", pc.DMXSTOPWORDS]).setOutputCol(predictionCol)
     dataset = viveknSentiment.transform(dataset)
     dataset = dataset.withColumn(predictionCol,
                                  array_join(predictionCol + ".result", ""))
     dataset = dataset.select(pc.DMXINDEX, predictionCol)
     dataset = dataset.withColumn(
         predictionCol,
         when(dataset[predictionCol] == "negative",
              pc.NEGATIVE).when(dataset[predictionCol] == "positive",
                                pc.POSITIVE).otherwise(pc.NEUTRAL))
     return dataset
Example #14
def test_auto_mapper_join_using_delimiter(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (123456789, "Gagan", "Chawla", ["MD", "PhD"]),
        ],
        ["npi", "first_name", "last_name", "suffix"],
    ).createOrReplaceTempView("practitioners")

    source_df: DataFrame = spark_session.table("practitioners")

    df = source_df.select("npi")
    df.createOrReplaceTempView("physicians")

    # Act
    mapper = AutoMapper(
        view="physicians", source_view="practitioners", keys=[
            "npi"
        ]).columns(my_column=A.join_using_delimiter(A.column("suffix"), ", "))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["my_column"],
        array_join(col("b.suffix"), ", ").alias("my_column"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()
    assert (result_df.where("npi == 123456789").select("my_column").collect()
            [0][0] == "MD, PhD")
Example #15
    def stringify(self,
                  result,
                  recommendation_info_item_delimiter="\001",
                  recommendation_info_field_delimiter="\004",
                  item_embedding_value_delimiter="\003",
                  user_embedding_value_delimiter="\003"):
        import pyspark.sql.functions as F
        from pyspark.sql.functions import pandas_udf
        output_item_embeddings = self.output_item_embeddings

        @pandas_udf('string')
        def format_rec_info(rec_info):
            import pandas as pd
            output = []
            for record in rec_info:
                string = ''
                for item in record:
                    if string:
                        string += recommendation_info_item_delimiter
                    string += item['name']
                    string += recommendation_info_field_delimiter
                    string += str(item['distance'])
                    if output_item_embeddings:
                        string += recommendation_info_field_delimiter
                        string += item_embedding_value_delimiter.join(
                            map(str, item['item_embedding']))
                output.append(string)
            return pd.Series(output)

        result = result.withColumn(
            self.recommendation_info_column_name,
            format_rec_info(self.recommendation_info_column_name))
        if self.output_user_embeddings:
            result = result.withColumn(
                self.user_embedding_column_name,
                F.array_join(F.col(self.user_embedding_column_name),
                             user_embedding_value_delimiter))
        return result
Example #16
    def sparkLemmatizer(self, dataset, colName, lemmatizedModelPath):
        dataset = dataset.select(
            "SA_index",
            concat_ws(",", dataset[colName]).alias(colName))
        dataset = dataset.withColumn(colName,
                                     regexp_replace(col(colName), ",", " "))
        dataset = dataset.drop('lemma', 'document')

        documentAssembler = DocumentAssembler() \
            .setInputCol(colName) \
            .setOutputCol("document")

        tokenizer = Tokenizer() \
            .setInputCols(["document"]) \
            .setOutputCol("token")

        lemmaModel = LemmatizerModel.load(lemmatizedModelPath) \
            .setInputCols(["document", "token"]).setOutputCol("lemma")

        finisher = Finisher() \
            .setInputCols(["lemma"]) \
            .setOutputCols(["ntokens"]) \
            .setOutputAsArray(True) \
            .setCleanAnnotations(True)

        lemmatizerPipeline = Pipeline(
            stages=[documentAssembler, tokenizer, lemmaModel])
        dataset = lemmatizerPipeline.fit(dataset).transform(dataset)
        dataset = dataset.withColumn("lemma_result",
                                     array_join("lemma.result", ""))
        sentimentTokenizer = RegexTokenizer(inputCol="lemma_result",
                                            outputCol="SA_lemma",
                                            toLowercase=True,
                                            pattern="\\W")
        dataset = sentimentTokenizer.transform(dataset)

        return dataset
Example #17
  .registerTempTable("input")

# load dictionary as csv to reduce the processing required to pivot json fields
dictSchema = [
    StructField('colID', StringType(), True),
    StructField('colValue', IntegerType(), True)
]
finalStruct = StructType(fields=dictSchema)
df2 = sqlContext.read.csv(path='inputs/freq_dict_mini.csv',
                          header=True,
                          schema=finalStruct,
                          ignoreLeadingWhiteSpace=True,
                          ignoreTrailingWhiteSpace=True)

newDf = df2\
  .withColumn("sortedID",F.array_join(F.sort_array(F.split(df2["colID"],"")),"",""))\
  .withColumn("wordLen", F.length('colID'))
newDf.registerTempTable("dictionary")

#join on sorted characters to get unscrambled possibilities
sqlContext.sql("""select puzzle_id,
                         letters, 
                         keyPositions, 
                         colID, 
                         colValue, 
                         getKeyLetters_udf(colID,keyPositions) as keyLetters, 
                         answerLengths 
                  from input i inner join dictionary d on d.sortedID = i.sortedLetters
""").registerTempTable("unscrambled")
# aggregate the possibilities into a list
unscrambled = sqlContext.sql("""select puzzle_id, 
Example #18
def text_formatting(spark):
    """ Extract formatting features from the text of a post

    Args:
        spark (SparkSession): used to run queries and commands

    Returns:
        DataFrame: With columns [
            (post)_Id,
            #codelines,
            #html_blocks,
            #headings,
            #referencelist,
            #quotes,
            #codeblocks,
            #themebreaks,
            #codespans,
            #references,
            #links,
            #inline_images,
            #mail_addresses,
            #emphasis,
            #strong
        ]
    """
    # Replaces formatted text that has already been processed
    FILLER = 'x'
    # Parser helper column
    COLNAME = 'processed_text'
    COL = col(COLNAME)

    # Data loading
    post_history_df = spark.read.parquet("/user/***REMOVED***/StackOverflow/PostHistory.parquet") \
        .select(['_PostId', '_Text', '_PostHistoryTypeId']) \
        .filter(col('_PostHistoryTypeId') == 2) \
        .drop('_PostHistoryTypeId')
    post_df = spark.read.parquet('/user/***REMOVED***/StackOverflow/Posts.parquet') \
        .select(['_Id', '_PostTypeId']) \
        .filter(col('_PostTypeId') == 1) \
        .drop("_PostTypeId")
    df = post_history_df.join(post_df,
                              post_df['_Id'] == post_history_df['_PostId'])

    # Count lines and words of the formatted text
    df = df.withColumn('#lines', size(split(col('_Text'), r'\n'))) \
        .withColumn('#words', size(split(col('_Text'), r'\s+')))

    # BLOCK ELEMENTS
    # Count code lines
    df = df.withColumn(COLNAME, split(col('_Text'), regex.CODE_BLOCK_RE)) \
        .withColumn('#codelines', size(COL) - 1) \
        .withColumn('codeline_ratio', col('#codelines') / col('#lines')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count HTML blocks
    df = df.withColumn(COLNAME, split(COL, regex.HTML_BLOCK_RE)) \
        .withColumn('#html_blocks', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count headings (1/2)
    df = df.withColumn(COLNAME, split(COL, regex.SETEXT_HEADING_RE)) \
        .withColumn('#headings', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count reference list
    df = df.withColumn(COLNAME, split(COL, regex.REFERENCE_LIST_RE)) \
        .withColumn('#referencelist', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count quotes
    df = df.withColumn(COLNAME, split(COL, regex.QUOTE_RE)) \
        .withColumn('#quotes', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count headings (2/2)
    df = df.withColumn(COLNAME, split(COL, regex.HEADING_RE)) \
        .withColumn('#headings', size(COL) - 1 + col('#headings')) \
        .withColumn('heading_ratio', col('#headings') / col('#lines')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count code blocks
    df = df.withColumn(COLNAME, split(COL, regex.FENCED_CODE_RE)) \
        .withColumn('#codeblocks', size(COL) - 1) \
        .withColumn('codeblock_ratio', col('#codeblocks') / col('#lines')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count thematic break
    df = df.withColumn(COLNAME, split(COL, regex.THEME_BREAK_RE)) \
        .withColumn('#themebreaks', size(COL) - 1) \
        .withColumn('themebreak_ratio', col('#themebreaks') / col('#lines')) \
        .withColumn(COLNAME, array_join(COL, FILLER))

    # INLINE ELEMENTS
    # Count codespans
    df = df.withColumn(COLNAME, split(COL, regex.CODESPAN_RE)) \
        .withColumn('#codespans', size(COL) - 1) \
        .withColumn('codespan_ratio', col('#codespans') / col('#words')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Remove markdown escapes
    df = df.withColumn(COLNAME, regexp_replace(COL, regex.ESCAPE_RE, FILLER))
    # Count references (1/2)
    df = df.withColumn(COLNAME, split(COL, regex.REFERENCE_RE)) \
        .withColumn('#references', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count links (1/2)
    df = df.withColumn(COLNAME, split(COL, regex.LINK_RE)) \
        .withColumn('#links', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count inline images
    df = df.withColumn(COLNAME, split(COL, regex.INLINE_IMAGE_RE)) \
        .withColumn('#inline_images', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count references (2/2)
    df = df.withColumn(COLNAME, split(COL, regex.SHORT_REFERENCE_RE)) \
        .withColumn('#references', size(COL) - 1 + col('#references')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count links (2/2)
    df = df.withColumn(COLNAME, split(COL, regex.AUTOLINK_RE)) \
        .withColumn('#links', size(COL) - 1 + col('#links')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count mails
    df = df.withColumn(COLNAME, split(COL, regex.AUTOMAIL_RE)) \
        .withColumn('#mail_addresses', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))

    # Remove line breaks, html, stand-alone * or _
    df = df.withColumn(COLNAME, regexp_replace(COL, regex.LINE_BREAK_RE,
                                               FILLER))
    df = df.withColumn(COLNAME, regexp_replace(COL, regex.HTML_RE, FILLER))
    df = df.withColumn(COLNAME, regexp_replace(COL, regex.NOT_STRONG_RE,
                                               FILLER))
    # Count strong & emphasis
    df = df.withColumn(COLNAME, split(COL, regex.EM_STRONG_RE)) \
        .withColumn('#emphasis', size(COL) - 1) \
        .withColumn('#strong', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    df = df.withColumn(COLNAME, split(COL, regex.STRONG_EM_RE)) \
        .withColumn('#emphasis', size(COL) - 1 + col('#emphasis')) \
        .withColumn('#strong', size(COL) - 1 + col('#strong')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    df = df.withColumn(COLNAME, split(COL, regex.STRONG_EM3_RE)) \
        .withColumn('#emphasis', size(COL) - 1 + col('#emphasis')) \
        .withColumn('#strong', size(COL) - 1 + col('#strong')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    df = df.withColumn(COLNAME, split(COL, regex.STRONG_RE)) \
        .withColumn('#strong', size(COL) - 1 + col('#strong')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    df = df.withColumn(COLNAME, split(COL, regex.EMPHASIS_RE)) \
        .withColumn('#emphasis', size(COL) - 1 + col('#emphasis')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    df = df.withColumn(COLNAME, split(COL, regex.EM_STRONG2_RE)) \
        .withColumn('#emphasis', size(COL) - 1 + col('#emphasis')) \
        .withColumn('#strong', size(COL) - 1 + col('#strong')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    df = df.withColumn(COLNAME, split(COL, regex.STRONG2_RE)) \
        .withColumn('#strong', size(COL) - 1 + col('#strong')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    df = df.withColumn(COLNAME, split(COL, regex.EMPHASIS2_RE)) \
        .withColumn('#emphasis', size(COL) - 1 + col('#emphasis')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    df = df.withColumn('emphasis_ratio', col('#emphasis') / col('#words')) \
        .withColumn('strong_ratio', col('#strong') / col('#words'))

    # Remove unnecessary columns, including parser helper column
    df = df.drop('_Text', '_PostHistoryTypeId', '_PostId', '#lines', '#words',
                 COLNAME)
    return df
Example #19
                    func.col('Au.auid').cast('long'))

        # select affiliations per author based on the most dominant affiliation in the last year.
        # keep only records from the final year
        .withColumn(
            'yearRank',
            func.rank().over(
                Window.partitionBy('auid').orderBy(
                    func.desc(sort_pub_year)))).filter('yearRank=1')
        # now count the afid occurrences per author.
        .groupBy('auid',
                 func.col('affiliation.afid').alias('affil_id')).agg(
                     func.count('*').alias('aff_occurences'),
                     func.first(
                         func.array_join(
                             'affiliation.affiliation_organization',
                             ", ")).alias('affil_name'),
                     func.first('affiliation.affiliation_tag_country').alias(
                         'cntry'),
                     func.max('datesort').alias('max_datesort'),
                 ).withColumn(
                     'affRank',
                     func.rank().over(
                         Window.partitionBy('auid').orderBy(
                             func.desc('aff_occurences'),
                             func.desc('max_datesort'),
                             func.asc('affil_id')))).filter('affRank=1').drop(
                                 'affRank').drop('aff_occurences')

        # get name from ipr record (preferred name), otherwise default to the name as printed on the paper.
        .join(
Example #20
def export_vector_df(vector_df, vocab_df, output_folder):
    vector_df = vector_df.join(vocab_df, 'id').select('standard_concept_id',
                                                      'vector')
    vector_df.withColumn('vector', F.array_join(F.col('vector'), ',')) \
        .select('standard_concept_id', 'vector').repartition(1) \
        .write.option('header', 'true').mode('overwrite').csv(create_file_path(output_folder, 'embedding_csv'))
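As a follow-up sketch (not part of the original example), the exported CSV could be read back and the comma-joined vector split into an array again; spark, output_folder and create_file_path are assumed to be the same objects used above:

from pyspark.sql import functions as F

embedding_df = (spark.read.option('header', 'true')
                .csv(create_file_path(output_folder, 'embedding_csv'))
                .withColumn('vector',
                            F.split(F.col('vector'), ',').cast('array<float>')))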